From 3548e974e1a8d7cbeaf8789b461cfc6f648f3135 Mon Sep 17 00:00:00 2001 From: Youenn Lebras Date: Fri, 13 Aug 2021 17:26:47 +0200 Subject: [PATCH 001/239] Implement HasAll specialization for SSE and AVX2 --- src/Functions/GatherUtils/Algorithms.h | 893 +++++++++++++++++++++-- src/Functions/GatherUtils/CMakeLists.txt | 4 + src/Functions/tests/gtest_hasAll.cpp | 104 +++ 3 files changed, 921 insertions(+), 80 deletions(-) create mode 100644 src/Functions/tests/gtest_hasAll.cpp diff --git a/src/Functions/GatherUtils/Algorithms.h b/src/Functions/GatherUtils/Algorithms.h index fc54eaf88ab..245794da976 100644 --- a/src/Functions/GatherUtils/Algorithms.h +++ b/src/Functions/GatherUtils/Algorithms.h @@ -7,7 +7,9 @@ #include #include #include "GatherUtils.h" - +#if defined(__AVX2__) + #include +#endif namespace DB::ErrorCodes { @@ -418,41 +420,55 @@ void NO_INLINE conditional(SourceA && src_a, SourceB && src_b, Sink && sink, con } -/// Methods to check if first array has elements from second array, overloaded for various combinations of types. -template < - ArraySearchType search_type, - typename FirstSliceType, - typename SecondSliceType, - bool (*isEqual)(const FirstSliceType &, const SecondSliceType &, size_t, size_t)> -bool sliceHasImplAnyAll(const FirstSliceType & first, const SecondSliceType & second, const UInt8 * first_null_map, const UInt8 * second_null_map) + +template +bool sliceEqualElements(const NumericArraySlice & first [[maybe_unused]], + const NumericArraySlice & second [[maybe_unused]], + size_t first_ind [[maybe_unused]], + size_t second_ind [[maybe_unused]]) { - const bool has_first_null_map = first_null_map != nullptr; - const bool has_second_null_map = second_null_map != nullptr; - - for (size_t i = 0; i < second.size; ++i) - { - bool has = false; - for (size_t j = 0; j < first.size && !has; ++j) - { - const bool is_first_null = has_first_null_map && first_null_map[j]; - const bool is_second_null = has_second_null_map && second_null_map[i]; - - if (is_first_null && is_second_null) - has = true; - - if (!is_first_null && !is_second_null && isEqual(first, second, j, i)) - has = true; - } - - if (has && search_type == ArraySearchType::Any) - return true; - - if (!has && search_type == ArraySearchType::All) - return false; - } - return search_type == ArraySearchType::All; + /// TODO: Decimal scale + if constexpr (is_decimal && is_decimal) + return accurate::equalsOp(first.data[first_ind].value, second.data[second_ind].value); + else if constexpr (is_decimal || is_decimal) + return false; + else + return accurate::equalsOp(first.data[first_ind], second.data[second_ind]); } +template +bool sliceEqualElements(const NumericArraySlice &, const GenericArraySlice &, size_t, size_t) +{ + return false; +} + +template +bool sliceEqualElements(const GenericArraySlice &, const NumericArraySlice &, size_t, size_t) +{ + return false; +} + +inline ALWAYS_INLINE bool sliceEqualElements(const GenericArraySlice & first, const GenericArraySlice & second, size_t first_ind, size_t second_ind) +{ + return first.elements->compareAt(first_ind + first.begin, second_ind + second.begin, *second.elements, -1) == 0; +} + +template +bool insliceEqualElements(const NumericArraySlice & first [[maybe_unused]], + size_t first_ind [[maybe_unused]], + size_t second_ind [[maybe_unused]]) +{ + if constexpr (is_decimal) + return accurate::equalsOp(first.data[first_ind].value, first.data[second_ind].value); + else + return accurate::equalsOp(first.data[first_ind], first.data[second_ind]); +} +inline ALWAYS_INLINE bool 
insliceEqualElements(const GenericArraySlice & first, size_t first_ind, size_t second_ind) +{ + return first.elements->compareAt(first_ind + first.begin, second_ind + first.begin, *first.elements, -1) == 0; +} + + /// For details of Knuth-Morris-Pratt string matching algorithm see /// https://en.wikipedia.org/wiki/Knuth%E2%80%93Morris%E2%80%93Pratt_algorithm. @@ -481,6 +497,770 @@ std::vector buildKMPPrefixFunction(const SliceType & pattern, const Equa } +/// Methods to check if first array has elements from second array, overloaded for various combinations of types. +template < + ArraySearchType search_type, + typename FirstSliceType, + typename SecondSliceType, + bool (*isEqual)(const FirstSliceType &, const SecondSliceType &, size_t, size_t)> +bool sliceHasImplAnyAll(const FirstSliceType & first, const SecondSliceType & second, const UInt8 * first_null_map, const UInt8 * second_null_map) +{ + const bool has_first_null_map = first_null_map != nullptr; + const bool has_second_null_map = second_null_map != nullptr; + + for (size_t i = 0; i < second.size; ++i) + { + bool has = false; + for (unsigned j = 0; j < first.size && !has; ++j) + { + const bool is_first_null = has_first_null_map && first_null_map[j]; + const bool is_second_null = has_second_null_map && second_null_map[i]; + + if (is_first_null && is_second_null) + has = true; + + if (!is_first_null && !is_second_null && isEqual(first, second, j, i)) + has = true; + } + + if (has && search_type == ArraySearchType::Any) + return true; + + if (!has && search_type == ArraySearchType::All) + return false; + } + return search_type == ArraySearchType::All; +} + + +#if defined(__AVX2__) +// AVX2 - Int specialization +template <> +inline ALWAYS_INLINE bool sliceHasImplAnyAll, NumericArraySlice, sliceEqualElements >(const NumericArraySlice & first, const NumericArraySlice & second, const UInt8 * first_null_map, const UInt8 * second_null_map) +{ + if (first.size == 0) return true; + + const bool has_first_null_map = first_null_map != nullptr; + const bool has_second_null_map = second_null_map != nullptr; + if (has_second_null_map != has_first_null_map && has_first_null_map) return false; + + unsigned j = 0; + short has_mask = 1; + const int full = -1, none = 0; + const __m256i ones = _mm256_set1_epi32(full); + const __m256i zeros = _mm256_setzero_si256(); + if (first.size > 7 && second.size > 7) + { + for (; j < first.size-7 && has_mask; j += 8) + { + has_mask = 0; + const __m256i f_data = _mm256_lddqu_si256(reinterpret_cast(first.data+j)); + // bitmask is fulfilled with ones for ones which are considered as null in the corresponding null map, 0 otherwise; + __m256i bitmask = has_first_null_map ? _mm256_set_epi32((first_null_map[j+7])? full: none, + (first_null_map[j+6])? full: none, + (first_null_map[j+5])? full: none, + (first_null_map[j+4])? full: none, + (first_null_map[j+3])? full: none, + (first_null_map[j+2])? full: none, + (first_null_map[j+1])? full: none, + (first_null_map[j]) ? 
full: none + ) + :zeros; + + size_t i = 0; + // Browse second array to try to match ell first elements + for (; i < second.size-7 && !has_mask; has_mask = _mm256_testc_si256(bitmask, ones), i += 8) + { + const __m256i s_data = _mm256_lddqu_si256(reinterpret_cast(second.data+i)); + // Create a mask to avoid to compare null elements + // set_m128i takes two arguments: (high segment, low segment) that are two __m128i convert from 8bits to 32bits to fit to our following operations + const __m256i second_nm_mask = _mm256_set_m128i(_mm_cvtepi8_epi32(_mm_lddqu_si128(reinterpret_cast(second_null_map+i+4))), + _mm_cvtepi8_epi32(_mm_lddqu_si128(reinterpret_cast(second_null_map+i)))); + bitmask = + _mm256_or_si256( + _mm256_or_si256( + _mm256_or_si256( + _mm256_or_si256( + _mm256_andnot_si256( + second_nm_mask, + _mm256_cmpeq_epi32(f_data, s_data)), + _mm256_andnot_si256( + _mm256_permutevar8x32_epi32(second_nm_mask, _mm256_set_epi32(6,5,4,3,2,1,0,7)), + _mm256_cmpeq_epi32(f_data, _mm256_permutevar8x32_epi32(s_data, _mm256_set_epi32(6,5,4,3,2,1,0,7))))), + _mm256_or_si256( + _mm256_andnot_si256( + _mm256_permutevar8x32_epi32(second_nm_mask, _mm256_set_epi32(5,4,3,2,1,0,7,6)), + _mm256_cmpeq_epi32(f_data, _mm256_permutevar8x32_epi32(s_data, _mm256_set_epi32(5,4,3,2,1,0,7,6)))), + _mm256_andnot_si256( + _mm256_permutevar8x32_epi32(second_nm_mask, _mm256_set_epi32(4,3,2,1,0,7,6,5)), + _mm256_cmpeq_epi32(f_data, _mm256_permutevar8x32_epi32(s_data, _mm256_set_epi32(4,3,2,1,0,7,6,5))))) + ), + _mm256_or_si256( + _mm256_or_si256( + _mm256_andnot_si256( + _mm256_permutevar8x32_epi32(second_nm_mask, _mm256_set_epi32(3,2,1,0,7,6,5,4)), + _mm256_cmpeq_epi32(f_data, _mm256_permutevar8x32_epi32(s_data, _mm256_set_epi32(3,2,1,0,7,6,5,4)))), + _mm256_andnot_si256( + _mm256_permutevar8x32_epi32(second_nm_mask, _mm256_set_epi32(2,1,0,7,6,5,4,3)), + _mm256_cmpeq_epi32(f_data, _mm256_permutevar8x32_epi32(s_data, _mm256_set_epi32(2,1,0,7,6,5,4,3))))), + _mm256_or_si256( + _mm256_andnot_si256( + _mm256_permutevar8x32_epi32(second_nm_mask, _mm256_set_epi32(1,0,7,6,5,4,3,2)), + _mm256_cmpeq_epi32(f_data, _mm256_permutevar8x32_epi32(s_data, _mm256_set_epi32(1,0,7,6,5,4,3,2)))), + _mm256_andnot_si256( + _mm256_permutevar8x32_epi32(second_nm_mask, _mm256_set_epi32(0,7,6,5,4,3,2,1)), + _mm256_cmpeq_epi32(f_data, _mm256_permutevar8x32_epi32(s_data, _mm256_set_epi32(0,7,6,5,4,3,2,1))))))) + ,bitmask); + } + + if (i < second.size) + { + // Loop(i)-jam + for (; i < second.size && !has_mask; i++) + { + if (second_null_map[i]) continue; + __m256i v_i = _mm256_set1_epi32(second.data[i]); + bitmask = _mm256_or_si256(bitmask, _mm256_cmpeq_epi32(f_data, v_i)); + has_mask = _mm256_testc_si256 (bitmask, ones); + } + } + } + } + + bool found = false; + // Loop(j)-jam + for (; j < first.size && has_mask; j++) + { + found = (has_first_null_map && first_null_map[j])? true: false; + for (unsigned i = 0; i < second.size && !found; i ++) + { + if (has_second_null_map && second_null_map[i]) continue; + found = (second.data[i] == first.data[j]); + } + if (!found) + return false; + } + return has_mask || found; +} + +// TODO: Discuss about +// raise an error : "error: no viable conversion from 'const NumericArraySlice' to 'const NumericArraySlice'" +// How should we do, copy past each function ?? I haven't found a way to specialize a same function body for two different types. 
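// A minimal sketch of one way to resolve the TODO above, not code from this commit:
// UInt32 and Int32 are the same width and lane-wise equality (_mm256_cmpeq_epi32) is
// sign-agnostic, so the unsigned case could forward to the Int32 specialization instead of
// duplicating the intrinsics. The helper name below is hypothetical, and the {data, size}
// aggregate layout of NumericArraySlice is assumed from the gtest added by this patch.
inline ALWAYS_INLINE bool sliceHasAllUInt32ViaInt32Sketch(
    const NumericArraySlice<UInt32> & first, const NumericArraySlice<UInt32> & second,
    const UInt8 * first_null_map, const UInt8 * second_null_map)
{
    // Reinterpreting the 32-bit payload does not change the bytes the SIMD comparison sees.
    const NumericArraySlice<Int32> first_as_int = {reinterpret_cast<const Int32 *>(first.data), first.size};
    const NumericArraySlice<Int32> second_as_int = {reinterpret_cast<const Int32 *>(second.data), second.size};
    return sliceHasImplAnyAll<ArraySearchType::All, NumericArraySlice<Int32>, NumericArraySlice<Int32>,
        sliceEqualElements<Int32, Int32>>(first_as_int, second_as_int, first_null_map, second_null_map);
}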
+// AVX2 UInt specialization +// template <> +// inline ALWAYS_INLINE bool sliceHasImplAnyAll, NumericArraySlice, sliceEqualElements >(const NumericArraySlice & first, const NumericArraySlice & second, const UInt8 * first_null_map, const UInt8 * second_null_map) +// { +// return sliceHasImplAnyAll, NumericArraySlice, sliceEqualElements > (static_cast &>(first), static_cast &>(second), first_null_map, second_null_map); +// } + +// AVX2 Int64 specialization +template <> +inline ALWAYS_INLINE bool sliceHasImplAnyAll, NumericArraySlice, sliceEqualElements >(const NumericArraySlice & first, const NumericArraySlice & second, const UInt8 * first_null_map, const UInt8 * second_null_map) +{ + if (first.size == 0) return true; + + const bool has_first_null_map = first_null_map != nullptr; + const bool has_second_null_map = second_null_map != nullptr; + if (has_second_null_map != has_first_null_map && has_first_null_map) return false; + + unsigned j = 0; + short has_mask = 1; + const int full = -1, none = 0; + const __m256i ones = _mm256_set1_epi64x(full); + const __m256i zeros = _mm256_setzero_si256(); + if (first.size > 3 && second.size > 3) + { + for (; j < first.size-3 && has_mask; j += 4) + { + has_mask = 0; + const __m256i f_data = _mm256_lddqu_si256(reinterpret_cast(first.data+j)); + __m256i bitmask = has_first_null_map ? _mm256_set_epi64x((first_null_map[j+3])? full: none, + (first_null_map[j+2])? full: none, + (first_null_map[j+1])? full: none, + (first_null_map[j]) ? full: none + ) + :zeros; + + unsigned i = 0; + for (; i < second.size-3 && !has_mask; has_mask = _mm256_testc_si256 (bitmask, ones), i += 4) + { + const __m256i s_data = _mm256_lddqu_si256(reinterpret_cast(second.data+i)); + const __m256i second_nm_mask = _mm256_set_m128i(_mm_cvtepi8_epi64(_mm_lddqu_si128(reinterpret_cast(second_null_map+i+2))), + _mm_cvtepi8_epi64(_mm_lddqu_si128(reinterpret_cast(second_null_map+i)))); + bitmask = + _mm256_or_si256( + _mm256_or_si256( + _mm256_or_si256( + _mm256_andnot_si256( + second_nm_mask, + _mm256_cmpeq_epi64(f_data, s_data)), + _mm256_andnot_si256( + _mm256_permutevar8x32_epi32(second_nm_mask, _mm256_set_epi32(5,4,3,2,1,0,7,6)), + _mm256_cmpeq_epi64(f_data, _mm256_permutevar8x32_epi32(s_data, _mm256_set_epi32(5,4,3,2,1,0,7,6))))), + + _mm256_or_si256( + _mm256_andnot_si256( + _mm256_permutevar8x32_epi32(second_nm_mask, _mm256_set_epi32(3,2,1,0,7,6,5,4)), + _mm256_cmpeq_epi64(f_data, _mm256_permutevar8x32_epi32(s_data, _mm256_set_epi32(3,2,1,0,7,6,5,4)))), + _mm256_andnot_si256( + _mm256_permutevar8x32_epi32(second_nm_mask, _mm256_set_epi32(1,0,7,6,5,4,3,2)), + _mm256_cmpeq_epi64(f_data, _mm256_permutevar8x32_epi32(s_data, _mm256_set_epi32(1,0,7,6,5,4,3,2))))) + ), + bitmask); + } + + if (i < second.size) + { + for (; i < second.size && !has_mask; i++) + { + if (second_null_map[i]) continue; + __m256i v_i = _mm256_set1_epi64x(second.data[i]); + bitmask = _mm256_or_si256(bitmask, _mm256_cmpeq_epi64(f_data, v_i)); + has_mask = _mm256_testc_si256 (bitmask, ones); + } + } + } + } + + bool found = false; + for (; j < first.size && (has_mask || first.size <= 2); j++) + { + found = (has_first_null_map && first_null_map[j])? 
true: false; + for (unsigned i = 0; i < second.size && !found; i ++) + { + if (has_second_null_map && second_null_map[i]) continue; + found = (second.data[i] == first.data[j]); + } + if (!found) + return false; + } + return has_mask || found; +} + +// AVX2 Int16_t specialization +template <> +inline ALWAYS_INLINE bool sliceHasImplAnyAll, NumericArraySlice, sliceEqualElements >(const NumericArraySlice & first, const NumericArraySlice & second, const UInt8 * first_null_map, const UInt8 * second_null_map) +{ + if (first.size == 0) return true; + + const bool has_second_null_map = second_null_map != nullptr; + const bool has_first_null_map = first_null_map != nullptr; + if (has_second_null_map != has_first_null_map && has_first_null_map) return false; + + unsigned j = 0; + short has_mask = 1; + const int full = -1, none = 0; + const __m256i ones = _mm256_set1_epi16(full); + const __m256i zeros = _mm256_setzero_si256(); + if (first.size > 15 && second.size > 15) + { + for (; j < first.size-15 && has_mask; j += 16) + { + has_mask = 0; + const __m256i f_data = _mm256_lddqu_si256(reinterpret_cast(first.data+j)); + __m256i bitmask = has_first_null_map ? _mm256_set_epi16((first_null_map[j+15])? full: none, (first_null_map[j+14])? full: none, + (first_null_map[j+13])? full: none, (first_null_map[j+12])? full: none, + (first_null_map[j+11])? full: none, (first_null_map[j+10])? full: none, + (first_null_map[j+9])? full: none, (first_null_map[j+8])? full: none, + (first_null_map[j+7])? full: none, (first_null_map[j+6])? full: none, + (first_null_map[j+5])? full: none, (first_null_map[j+4])? full: none, + (first_null_map[j+3])? full: none, (first_null_map[j+2])? full: none, + (first_null_map[j+1])? full: none, (first_null_map[j]) ? full: none + ) + :zeros; + unsigned i = 0; + for (; i < second.size-15 && !has_mask; has_mask = _mm256_testc_si256 (bitmask, ones), i += 16) + { + const __m256i s_data = _mm256_lddqu_si256(reinterpret_cast(second.data+i)); + const __m256i second_nm_mask = _mm256_set_m128i(_mm_cvtepi8_epi16(_mm_lddqu_si128(reinterpret_cast(second_null_map+i+8))), + _mm_cvtepi8_epi16(_mm_lddqu_si128(reinterpret_cast(second_null_map+i)))); + bitmask = + _mm256_or_si256( + _mm256_or_si256( + _mm256_or_si256( + _mm256_or_si256( + _mm256_or_si256( + _mm256_andnot_si256( + second_nm_mask, + _mm256_cmpeq_epi16(f_data, s_data)), + _mm256_andnot_si256( + _mm256_shuffle_epi8(second_nm_mask, _mm256_set_epi8(29,28,27,26,25,24,23,22,21,20,19,18,17,16,15,14,13,12,11,10,9,8,7,6,5,4,3,2,1,0,31,30)), + _mm256_cmpeq_epi16(f_data, _mm256_shuffle_epi8(s_data, _mm256_set_epi8(29,28,27,26,25,24,23,22,21,20,19,18,17,16,15,14,13,12,11,10,9,8,7,6,5,4,3,2,1,0,31,30))))), + _mm256_or_si256( + _mm256_andnot_si256( + _mm256_shuffle_epi8(second_nm_mask, _mm256_set_epi8(27,26,25,24,23,22,21,20,19,18,17,16,15,14,13,12,11,10,9,8,7,6,5,4,3,2,1,0,31,30,29,28)), + _mm256_cmpeq_epi16(f_data, _mm256_shuffle_epi8(s_data, _mm256_set_epi8(27,26,25,24,23,22,21,20,19,18,17,16,15,14,13,12,11,10,9,8,7,6,5,4,3,2,1,0,31,30,29,28)))), + _mm256_andnot_si256( + _mm256_shuffle_epi8(second_nm_mask, _mm256_set_epi8(25,24,23,22,21,20,19,18,17,16,15,14,13,12,11,10,9,8,7,6,5,4,3,2,1,0,31,30,29,28,27,26)), + _mm256_cmpeq_epi16(f_data, _mm256_shuffle_epi8(s_data, _mm256_set_epi8(25,24,23,22,21,20,19,18,17,16,15,14,13,12,11,10,9,8,7,6,5,4,3,2,1,0,31,30,29,28,27,26))))) + ), + _mm256_or_si256( + _mm256_or_si256( + _mm256_andnot_si256( + _mm256_shuffle_epi8(second_nm_mask, 
_mm256_set_epi8(23,22,21,20,19,18,17,16,15,14,13,12,11,10,9,8,7,6,5,4,3,2,1,0,31,30,29,28,27,26,25,24)), + _mm256_cmpeq_epi16(f_data, _mm256_shuffle_epi8(s_data, _mm256_set_epi8(23,22,21,20,19,18,17,16,15,14,13,12,11,10,9,8,7,6,5,4,3,2,1,0,31,30,29,28,27,26,25,24)))), + _mm256_andnot_si256( + _mm256_shuffle_epi8(second_nm_mask, _mm256_set_epi8(21,20,19,18,17,16,15,14,13,12,11,10,9,8,7,6,5,4,3,2,1,0,31,30,29,28,27,26,25,24,23,22)), + _mm256_cmpeq_epi16(f_data, _mm256_shuffle_epi8(s_data, _mm256_set_epi8(21,20,19,18,17,16,15,14,13,12,11,10,9,8,7,6,5,4,3,2,1,0,31,30,29,28,27,26,25,24,23,22))))), + _mm256_or_si256( + _mm256_andnot_si256( + _mm256_shuffle_epi8(second_nm_mask, _mm256_set_epi8(19,18,17,16,15,14,13,12,11,10,9,8,7,6,5,4,3,2,1,0,31,30,29,28,27,26,25,24,23,22,21,20)), + _mm256_cmpeq_epi16(f_data, _mm256_shuffle_epi8(s_data, _mm256_set_epi8(19,18,17,16,15,14,13,12,11,10,9,8,7,6,5,4,3,2,1,0,31,30,29,28,27,26,25,24,23,22,21,20)))), + _mm256_andnot_si256( + _mm256_shuffle_epi8(second_nm_mask, _mm256_set_epi8(17,16,15,14,13,12,11,10,9,8,7,6,5,4,3,2,1,0,31,30,29,28,27,26,25,24,23,22,21,20,19,18)), + _mm256_cmpeq_epi16(f_data, _mm256_shuffle_epi8(s_data, _mm256_set_epi8(17,16,15,14,13,12,11,10,9,8,7,6,5,4,3,2,1,0,31,30,29,28,27,26,25,24,23,22,21,20,19,18)))))) + ), + _mm256_or_si256( + _mm256_or_si256( + _mm256_or_si256( + _mm256_andnot_si256( + _mm256_permute2x128_si256(second_nm_mask,second_nm_mask,1), + _mm256_cmpeq_epi16(f_data, _mm256_permute2x128_si256(s_data,s_data,1))), + _mm256_andnot_si256( + _mm256_shuffle_epi8(_mm256_permute2x128_si256(second_nm_mask,second_nm_mask,1), _mm256_set_epi8(13,12,11,10,9,8,7,6,5,4,3,2,1,0,31,30,29,28,27,26,25,24,23,22,21,20,19,18,17,16,15,14)), + _mm256_cmpeq_epi16(f_data, _mm256_shuffle_epi8(_mm256_permute2x128_si256(s_data,s_data,1), _mm256_set_epi8(13,12,11,10,9,8,7,6,5,4,3,2,1,0,31,30,29,28,27,26,25,24,23,22,21,20,19,18,17,16,15,14))))), + _mm256_or_si256( + _mm256_andnot_si256( + _mm256_shuffle_epi8(_mm256_permute2x128_si256(second_nm_mask,second_nm_mask,1), _mm256_set_epi8(11,10,9,8,7,6,5,4,3,2,1,0,31,30,29,28,27,26,25,24,23,22,21,20,19,18,17,16,15,14,13,12)), + _mm256_cmpeq_epi16(f_data, _mm256_shuffle_epi8(_mm256_permute2x128_si256(s_data,s_data,1), _mm256_set_epi8(11,10,9,8,7,6,5,4,3,2,1,0,31,30,29,28,27,26,25,24,23,22,21,20,19,18,17,16,15,14,13,12)))), + _mm256_andnot_si256( + _mm256_shuffle_epi8(_mm256_permute2x128_si256(second_nm_mask,second_nm_mask,1), _mm256_set_epi8(9,8,7,6,5,4,3,2,1,0,31,30,29,28,27,26,25,24,23,22,21,20,19,18,17,16,15,14,13,12,11,10)), + _mm256_cmpeq_epi16(f_data, _mm256_shuffle_epi8(_mm256_permute2x128_si256(s_data,s_data,1), _mm256_set_epi8(9,8,7,6,5,4,3,2,1,0,31,30,29,28,27,26,25,24,23,22,21,20,19,18,17,16,15,14,13,12,11,10))))) + ), + _mm256_or_si256( + _mm256_or_si256( + _mm256_andnot_si256( + _mm256_shuffle_epi8(_mm256_permute2x128_si256(second_nm_mask,second_nm_mask,1), _mm256_set_epi8(7,6,5,4,3,2,1,0,31,30,29,28,27,26,25,24,23,22,21,20,19,18,17,16,15,14,13,12,11,10,9,8)), + _mm256_cmpeq_epi16(f_data, _mm256_shuffle_epi8(_mm256_permute2x128_si256(s_data,s_data,1), _mm256_set_epi8(7,6,5,4,3,2,1,0,31,30,29,28,27,26,25,24,23,22,21,20,19,18,17,16,15,14,13,12,11,10,9,8)))), + _mm256_andnot_si256( + _mm256_shuffle_epi8(_mm256_permute2x128_si256(second_nm_mask,second_nm_mask,1), _mm256_set_epi8(5,4,3,2,1,0,31,30,29,28,27,26,25,24,23,22,21,20,19,18,17,16,15,14,13,12,11,10,9,8,7,6)), + _mm256_cmpeq_epi16(f_data, _mm256_shuffle_epi8(_mm256_permute2x128_si256(s_data,s_data,1), 
_mm256_set_epi8(5,4,3,2,1,0,31,30,29,28,27,26,25,24,23,22,21,20,19,18,17,16,15,14,13,12,11,10,9,8,7,6))))), + _mm256_or_si256( + _mm256_andnot_si256( + _mm256_shuffle_epi8(_mm256_permute2x128_si256(second_nm_mask,second_nm_mask,1), _mm256_set_epi8(3,2,1,0,31,30,29,28,27,26,25,24,23,22,21,20,19,18,17,16,15,14,13,12,11,10,9,8,7,6,5,4)), + _mm256_cmpeq_epi16(f_data, _mm256_shuffle_epi8(_mm256_permute2x128_si256(s_data,s_data,1), _mm256_set_epi8(3,2,1,0,31,30,29,28,27,26,25,24,23,22,21,20,19,18,17,16,15,14,13,12,11,10,9,8,7,6,5,4)))), + _mm256_andnot_si256( + _mm256_shuffle_epi8(_mm256_permute2x128_si256(second_nm_mask,second_nm_mask,1), _mm256_set_epi8(1,0,31,30,29,28,27,26,25,24,23,22,21,20,19,18,17,16,15,14,13,12,11,10,9,8,7,6,5,4,3,2)), + _mm256_cmpeq_epi16(f_data, _mm256_shuffle_epi8(_mm256_permute2x128_si256(s_data,s_data,1), _mm256_set_epi8(1,0,31,30,29,28,27,26,25,24,23,22,21,20,19,18,17,16,15,14,13,12,11,10,9,8,7,6,5,4,3,2)))))) + ) + ), + bitmask); + } + + if (i < second.size) + { + for (; i < second.size && !has_mask; i++) + { + if (second_null_map[i]) continue; + __m256i v_i = _mm256_set1_epi16(second.data[i]); + bitmask = _mm256_or_si256(bitmask, _mm256_cmpeq_epi16(f_data, v_i)); + has_mask = _mm256_testc_si256 (bitmask, ones); + } + } + } + } + + bool found = false; + for (; j < first.size && (has_mask || first.size <= 2); j++) + { + found = (has_first_null_map && first_null_map[j])? true: false; + for (unsigned i = 0; i < second.size && !found; i ++) + { + if (has_second_null_map && second_null_map[i]) continue; + found = (second.data[i] == first.data[j]); + } + if (!found) + return false; + } + return has_mask || found; +} + +#else + +// SSE4.2 Int specialization +template <> +inline ALWAYS_INLINE bool sliceHasImplAnyAll, NumericArraySlice, sliceEqualElements >(const NumericArraySlice & first, const NumericArraySlice & second, const UInt8 * first_null_map, const UInt8 * second_null_map) +{ + if (first.size == 0) return true; + + const bool has_first_null_map = first_null_map != nullptr; + const bool has_second_null_map = second_null_map != nullptr; + if (has_second_null_map != has_first_null_map && has_first_null_map) return false; + + unsigned j = 0; + short has_mask = 1; + const __m128i zeros = _mm_setzero_si128(); + if (first.size > 3 && second.size > 2) + { + const int full = -1, none = 0; + for (; j < first.size-3 && has_mask; j += 4) + { + has_mask = 0; + const __m128i f_data = _mm_lddqu_si128(reinterpret_cast(first.data+j)); + __m128i bitmask = has_first_null_map ? _mm_set_epi32((first_null_map[j+3])? full: none, + (first_null_map[j+2])? full: none, + (first_null_map[j+1])? full: none, + (first_null_map[j]) ? full: none + ) + :zeros; + + unsigned i = 0; + for (; i < second.size-3 && !has_mask; has_mask = _mm_test_all_ones(bitmask), i += 4) + { + const __m128i s_data = _mm_lddqu_si128(reinterpret_cast(second.data+i)); + const __m128i second_nm_mask = (has_second_null_map)? 
_mm_cvtepi8_epi32(_mm_lddqu_si128(reinterpret_cast(second_null_map+i))) + : zeros; + + bitmask = + _mm_or_si128( + _mm_or_si128( + _mm_or_si128( + _mm_andnot_si128( + second_nm_mask, + _mm_cmpeq_epi32(f_data, s_data)), + _mm_andnot_si128( + _mm_shuffle_epi32(second_nm_mask, _MM_SHUFFLE(2,1,0,3)), + _mm_cmpeq_epi32(f_data, _mm_shuffle_epi32(s_data, _MM_SHUFFLE(2,1,0,3))))), + _mm_or_si128( + _mm_andnot_si128( + _mm_shuffle_epi32(second_nm_mask, _MM_SHUFFLE(1,0,3,2)), + _mm_cmpeq_epi32(f_data, _mm_shuffle_epi32(s_data, _MM_SHUFFLE(1,0,3,2)))), + _mm_andnot_si128( + _mm_shuffle_epi32(second_nm_mask, _MM_SHUFFLE(0,3,2,1)), + _mm_cmpeq_epi32(f_data, _mm_shuffle_epi32(s_data, _MM_SHUFFLE(0,3,2,1))))) + ), + bitmask); + } + + if (i < second.size) + { + for (; i < second.size && !has_mask; i++) + { + if (has_second_null_map && second_null_map[i]) continue; + __m128i r_i = _mm_set1_epi32(second.data[i]); + bitmask = _mm_or_si128(bitmask, _mm_cmpeq_epi32(f_data, r_i)); + has_mask = _mm_test_all_ones(bitmask); + } + } + } + } + + bool found = false; + for (; j < first.size && has_mask; j++) + { + found = (has_first_null_map && first_null_map[j])? true: false; + for (unsigned i = 0; i < second.size && !found; i ++) + { + if (has_second_null_map && second_null_map[i]) continue; + found = (second.data[i] == first.data[j]); + } + if (!found) + return false; + } + return has_mask || found; +} + +// SSE4.2 Int64 specialization +template <> +inline ALWAYS_INLINE bool sliceHasImplAnyAll, NumericArraySlice, sliceEqualElements >(const NumericArraySlice & first, const NumericArraySlice & second, const UInt8 * first_null_map, const UInt8 * second_null_map) +{ + if (first.size == 0) return true; + + const bool has_first_null_map = first_null_map != nullptr; + const bool has_second_null_map = second_null_map != nullptr; + if (has_first_null_map != has_second_null_map && has_first_null_map) return false; + + unsigned j = 0; + short has_mask = 1; + const Int64 full = -1, none = 0; + const __m128i zeros = _mm_setzero_si128(); + for (; j < first.size-1 && has_mask; j += 2) + { + has_mask = 0; + const __m128i f_data = _mm_lddqu_si128(reinterpret_cast(first.data+j)); + __m128i bitmask = has_first_null_map ? _mm_set_epi64x((first_null_map[j+1])? full: none, + (first_null_map[j]) ? full: none + ) + : zeros; + unsigned i = 0; + for (; i < second.size-1 && !has_mask; has_mask = _mm_test_all_ones(bitmask), i += 2) + { + const __m128i s_data = _mm_lddqu_si128(reinterpret_cast(second.data+i)); + const __m128i second_nm_mask = (has_second_null_map)? _mm_cvtepi8_epi64(_mm_lddqu_si128(reinterpret_cast(second_null_map+i))) + : zeros; + bitmask = + _mm_or_si128( + _mm_or_si128( + _mm_andnot_si128( + second_nm_mask, + _mm_cmpeq_epi32(f_data, s_data)), + _mm_andnot_si128( + _mm_shuffle_epi32(second_nm_mask, _MM_SHUFFLE(1,0,3,2)), + _mm_cmpeq_epi64(f_data, _mm_shuffle_epi32(s_data, _MM_SHUFFLE(1,0,3,2))))) + ,bitmask); + } + + if (i < second.size) + { + for (; i < second.size && !has_mask; i++) + { + if (has_second_null_map && second_null_map[i]) continue; + __m128i v_i = _mm_set1_epi64x(second.data[i]); + bitmask = _mm_or_si128(bitmask, _mm_cmpeq_epi64(f_data, v_i)); + has_mask = _mm_test_all_ones(bitmask); + } + } + } + + bool found = false; + for (; j < first.size && has_mask; j++) + { + // skip null elements since both have at least one + found = (has_first_null_map && first_null_map[j])? 
true: false; + for (unsigned i = 0; i < second.size && !found; i ++) + { + if (has_second_null_map && second_null_map[i]) continue; + found = (second.data[i] == first.data[j]); + } + if (!found) + return false; + } + return has_mask || found; +} + +// SSE4.2 Int16_t specialization +template <> +inline ALWAYS_INLINE bool sliceHasImplAnyAll, NumericArraySlice, sliceEqualElements >(const NumericArraySlice & first, const NumericArraySlice & second, const UInt8 * first_null_map, const UInt8 * second_null_map) +{ + if (first.size == 0) return true; + const bool has_first_null_map = first_null_map != nullptr; + const bool has_second_null_map = second_null_map != nullptr; + if (has_first_null_map != has_second_null_map && has_first_null_map) return false; + + unsigned j = 0; + short has_mask = 1; + const int16_t full = -1, none = 0; + const __m128i zeros = _mm_setzero_si128(); + if (first.size > 6 && second.size > 6) + { + for (; j < first.size-7 && has_mask; j += 8) + { + has_mask = 0; + const __m128i f_data = _mm_lddqu_si128(reinterpret_cast(first.data+j)); + __m128i bitmask = has_first_null_map ? _mm_set_epi16((first_null_map[j+7])? full: none, (first_null_map[j+6])? full: none, + (first_null_map[j+5])? full: none, (first_null_map[j+4])? full: none, + (first_null_map[j+3])? full: none, (first_null_map[j+2])? full: none, + (first_null_map[j+1])? full: none, (first_null_map[j]) ? full: none + ) + :zeros; + unsigned i = 0; + for (; i < second.size-7 && !has_mask; has_mask = _mm_test_all_ones(bitmask), i += 8) + { + const __m128i s_data = _mm_lddqu_si128(reinterpret_cast(second.data+i)); + const __m128i second_nm_mask = (has_second_null_map)? _mm_cvtepi8_epi16(_mm_lddqu_si128(reinterpret_cast(second_null_map+i))) + : zeros; + bitmask = + _mm_or_si128( + _mm_or_si128( + _mm_or_si128( + _mm_or_si128( + _mm_andnot_si128( + second_nm_mask, + _mm_cmpeq_epi16(f_data, s_data)), + _mm_andnot_si128( + _mm_shuffle_epi8(second_nm_mask, _mm_set_epi8(13,12,11,10,9,8,7,6,5,4,3,2,1,0,15,14)), + _mm_cmpeq_epi16(f_data, _mm_shuffle_epi8(s_data, _mm_set_epi8(13,12,11,10,9,8,7,6,5,4,3,2,1,0,15,14))))), + _mm_or_si128( + _mm_andnot_si128( + _mm_shuffle_epi8(second_nm_mask, _mm_set_epi8(11,10,9,8,7,6,5,4,3,2,1,0,15,14,13,12)), + _mm_cmpeq_epi16(f_data, _mm_shuffle_epi8(s_data, _mm_set_epi8(11,10,9,8,7,6,5,4,3,2,1,0,15,14,13,12)))), + _mm_andnot_si128( + _mm_shuffle_epi8(second_nm_mask, _mm_set_epi8(9,8,7,6,5,4,3,2,1,0,15,14,13,12,11,10)), + _mm_cmpeq_epi16(f_data, _mm_shuffle_epi8(s_data, _mm_set_epi8(9,8,7,6,5,4,3,2,1,0,15,14,13,12,11,10))))) + ), + _mm_or_si128( + _mm_or_si128( + _mm_andnot_si128( + _mm_shuffle_epi8(second_nm_mask, _mm_set_epi8(7,6,5,4,3,2,1,0,15,14,13,12,11,10,9,8)), + _mm_cmpeq_epi16(f_data, _mm_shuffle_epi8(s_data, _mm_set_epi8(7,6,5,4,3,2,1,0,15,14,13,12,11,10,9,8)))), + _mm_andnot_si128( + _mm_shuffle_epi8(second_nm_mask, _mm_set_epi8(5,4,3,2,1,0,15,14,13,12,11,10,9,8,7,6)), + _mm_cmpeq_epi16(f_data, _mm_shuffle_epi8(s_data, _mm_set_epi8(5,4,3,2,1,0,15,14,13,12,11,10,9,8,7,6))))), + _mm_or_si128( + _mm_andnot_si128( + _mm_shuffle_epi8(second_nm_mask, _mm_set_epi8(3,2,1,0,15,14,13,12,11,10,9,8,7,6,5,4)), + _mm_cmpeq_epi16(f_data, _mm_shuffle_epi8(s_data, _mm_set_epi8(3,2,1,0,15,14,13,12,11,10,9,8,7,6,5,4)))), + _mm_andnot_si128( + _mm_shuffle_epi8(second_nm_mask, _mm_set_epi8(1,0,15,14,13,12,11,10,9,8,7,6,5,4,3,2)), + _mm_cmpeq_epi16(f_data, _mm_shuffle_epi8(s_data, _mm_set_epi8(1,0,15,14,13,12,11,10,9,8,7,6,5,4,3,2)))))) + ), + bitmask); + } + + if (i < second.size) + { + for (; i < second.size 
&& !has_mask; i++) + { + if (has_second_null_map && second_null_map[i]) continue; + __m128i v_i = _mm_set1_epi16(second.data[i]); + bitmask = _mm_or_si128(bitmask, _mm_cmpeq_epi16(f_data, v_i)); + has_mask = _mm_test_all_ones(bitmask); + } + } + } + } + + bool found = false; + for (; j < first.size && (has_mask || first.size <= 2); j++) + { + found = (has_first_null_map && first_null_map[j])? true: false; + for (unsigned i = 0; i < second.size && !found; i ++) + { + if (has_second_null_map && second_null_map[i]) continue; + found = (second.data[i] == first.data[j]); + } + if (!found) + return false; + } + return has_mask || found; +} +#endif + +// SSE4.2 Int8_t specialization +template <> +inline ALWAYS_INLINE bool sliceHasImplAnyAll, NumericArraySlice, sliceEqualElements >(const NumericArraySlice & first, const NumericArraySlice & second, const UInt8 * first_null_map, const UInt8 * second_null_map) +{ + if (first.size == 0) return true; + + const bool has_first_null_map = first_null_map != nullptr; + const bool has_second_null_map = second_null_map != nullptr; + if (has_first_null_map != has_second_null_map && has_first_null_map) return false; + + unsigned j = 0; + short has_mask = 1; + const int full = -1, none = 0; + const __m128i zeros = _mm_setzero_si128(); + if (first.size > 15) +{ + for (; j < first.size-15 && has_mask; j += 16) + { + has_mask = 0; + const __m128i f_data = _mm_lddqu_si128(reinterpret_cast(first.data+j)); + __m128i bitmask = has_first_null_map ? _mm_set_epi8((first_null_map[j+15])? full: none, (first_null_map[j+14])? full: none, + (first_null_map[j+13])? full: none, (first_null_map[j+12])? full: none, + (first_null_map[j+11])? full: none, (first_null_map[j+10])? full: none, + (first_null_map[j+9]) ? full: none, (first_null_map[j+8]) ? full: none, + (first_null_map[j+7]) ? full: none, (first_null_map[j+6]) ? full: none, + (first_null_map[j+5]) ? full: none, (first_null_map[j+4]) ? full: none, + (first_null_map[j+3]) ? full: none, (first_null_map[j+2]) ? full: none, + (first_null_map[j+1]) ? full: none, (first_null_map[j]) ? full: none + ) + : zeros; + unsigned i = 0; + for (; i < second.size-15 && !has_mask; has_mask = _mm_test_all_ones(bitmask), i += 16) + { + const __m128i s_data = _mm_lddqu_si128(reinterpret_cast(second.data+i)); + const __m128i second_nm_mask = (has_second_null_map)? 
_mm_lddqu_si128(reinterpret_cast(second_null_map+i)) + : zeros; + bitmask = + _mm_or_si128( + _mm_or_si128( + _mm_or_si128( + _mm_or_si128( + _mm_or_si128( + _mm_andnot_si128( + second_nm_mask, + _mm_cmpeq_epi8(f_data, s_data)), + _mm_andnot_si128( + _mm_shuffle_epi8(second_nm_mask, _mm_set_epi8(14,13,12,11,10,9,8,7,6,5,4,3,2,1,0,15)), + _mm_cmpeq_epi8(f_data, _mm_shuffle_epi8(s_data, _mm_set_epi8(14,13,12,11,10,9,8,7,6,5,4,3,2,1,0,15))))), + _mm_or_si128( + _mm_andnot_si128( + _mm_shuffle_epi8(second_nm_mask, _mm_set_epi8(13,12,11,10,9,8,7,6,5,4,3,2,1,0,15,14)), + _mm_cmpeq_epi8(f_data, _mm_shuffle_epi8(s_data, _mm_set_epi8(13,12,11,10,9,8,7,6,5,4,3,2,1,0,15,14)))), + _mm_andnot_si128( + _mm_shuffle_epi8(second_nm_mask, _mm_set_epi8(12,11,10,9,8,7,6,5,4,3,2,1,0,15,14,13)), + _mm_cmpeq_epi8(f_data, _mm_shuffle_epi8(s_data, _mm_set_epi8(12,11,10,9,8,7,6,5,4,3,2,1,0,15,14,13))))) + ), + _mm_or_si128( + _mm_or_si128( + _mm_andnot_si128( + _mm_shuffle_epi8(second_nm_mask, _mm_set_epi8(11,10,9,8,7,6,5,4,3,2,1,0,15,14,13,12)), + _mm_cmpeq_epi8(f_data, _mm_shuffle_epi8(s_data, _mm_set_epi8(11,10,9,8,7,6,5,4,3,2,1,0,15,14,13,12)))), + _mm_andnot_si128( + _mm_shuffle_epi8(second_nm_mask, _mm_set_epi8(10,9,8,7,6,5,4,3,2,1,0,15,14,13,12,11)), + _mm_cmpeq_epi8(f_data, _mm_shuffle_epi8(s_data, _mm_set_epi8(10,9,8,7,6,5,4,3,2,1,0,15,14,13,12,11))))), + _mm_or_si128( + _mm_andnot_si128( + _mm_shuffle_epi8(second_nm_mask, _mm_set_epi8(9,8,7,6,5,4,3,2,1,0,15,14,13,12,11,10)), + _mm_cmpeq_epi8(f_data, _mm_shuffle_epi8(s_data, _mm_set_epi8(9,8,7,6,5,4,3,2,1,0,15,14,13,12,11,10)))), + _mm_andnot_si128( + _mm_shuffle_epi8(second_nm_mask, _mm_set_epi8(8,7,6,5,4,3,2,1,0,15,14,13,12,11,10,9)), + _mm_cmpeq_epi8(f_data, _mm_shuffle_epi8(s_data, _mm_set_epi8(8,7,6,5,4,3,2,1,0,15,14,13,12,11,10,9))))))), + _mm_or_si128( + _mm_or_si128( + _mm_or_si128( + _mm_andnot_si128( + _mm_shuffle_epi8(second_nm_mask, _mm_set_epi8(7,6,5,4,3,2,1,0,15,14,13,12,11,10,9,8)), + _mm_cmpeq_epi8(f_data, _mm_shuffle_epi8(s_data, _mm_set_epi8(7,6,5,4,3,2,1,0,15,14,13,12,11,10,9,8)))), + _mm_andnot_si128( + _mm_shuffle_epi8(second_nm_mask, _mm_set_epi8(6,5,4,3,2,1,0,15,14,13,12,11,10,9,8,7)), + _mm_cmpeq_epi8(f_data, _mm_shuffle_epi8(s_data, _mm_set_epi8(6,5,4,3,2,1,0,15,14,13,12,11,10,9,8,7))))), + _mm_or_si128( + _mm_andnot_si128( + _mm_shuffle_epi8(second_nm_mask, _mm_set_epi8(5,4,3,2,1,0,15,14,13,12,11,10,9,8,7,6)), + _mm_cmpeq_epi8(f_data, _mm_shuffle_epi8(s_data, _mm_set_epi8(5,4,3,2,1,0,15,14,13,12,11,10,9,8,7,6)))), + _mm_andnot_si128( + _mm_shuffle_epi8(second_nm_mask, _mm_set_epi8(4,3,2,1,0,15,14,13,12,11,10,9,8,7,6,5)), + _mm_cmpeq_epi8(f_data, _mm_shuffle_epi8(s_data, _mm_set_epi8(4,3,2,1,0,15,14,13,12,11,10,9,8,7,6,5)))))), + _mm_or_si128( + _mm_or_si128( + _mm_andnot_si128( + _mm_shuffle_epi8(second_nm_mask, _mm_set_epi8(3,2,1,0,15,14,13,12,11,10,9,8,7,6,5,4)), + _mm_cmpeq_epi8(f_data, _mm_shuffle_epi8(s_data, _mm_set_epi8(3,2,1,0,15,14,13,12,11,10,9,8,7,6,5,4)))), + _mm_andnot_si128( + _mm_shuffle_epi8(second_nm_mask, _mm_set_epi8(2,1,0,15,14,13,12,11,10,9,8,7,6,5,4,3)), + _mm_cmpeq_epi8(f_data, _mm_shuffle_epi8(s_data, _mm_set_epi8(2,1,0,15,14,13,12,11,10,9,8,7,6,5,4,3))))), + _mm_or_si128( + _mm_andnot_si128( + _mm_shuffle_epi8(second_nm_mask, _mm_set_epi8(1,0,15,14,13,12,11,10,9,8,7,6,5,4,3,2)), + _mm_cmpeq_epi8(f_data, _mm_shuffle_epi8(s_data, _mm_set_epi8(1,0,15,14,13,12,11,10,9,8,7,6,5,4,3,2)))), + _mm_andnot_si128( + _mm_shuffle_epi8(second_nm_mask, _mm_set_epi8(0,15,14,13,12,11,10,9,8,7,6,5,4,3,2,1)), + 
_mm_cmpeq_epi8(f_data, _mm_shuffle_epi8(s_data, _mm_set_epi8(0,15,14,13,12,11,10,9,8,7,6,5,4,3,2,1)))))))), + bitmask); + } + + if (i < second.size) + { + for (; i < second.size && !has_mask; i++) + { + if (has_second_null_map && second_null_map[i]) continue; + __m128i v_i = _mm_set1_epi8(second.data[i]); + bitmask = _mm_or_si128(bitmask, _mm_cmpeq_epi8(f_data, v_i)); + has_mask = _mm_test_all_ones(bitmask); + } + } + } + } + + bool found = false; + for (; j < first.size && has_mask; j++) + { + found = (has_first_null_map && first_null_map[j])? true: false; + for (unsigned i = 0; i < second.size && !found; i ++) + { + if (has_second_null_map && second_null_map[i]) continue; + found = (second.data[i] == first.data[j]); + } + if (!found) + return false; + } + + return has_mask || found; +} + + template < typename FirstSliceType, typename SecondSliceType, bool (*isEqual)(const FirstSliceType &, const SecondSliceType &, size_t, size_t), @@ -551,53 +1331,6 @@ bool sliceHasImpl(const FirstSliceType & first, const SecondSliceType & second, } -template -bool sliceEqualElements(const NumericArraySlice & first [[maybe_unused]], - const NumericArraySlice & second [[maybe_unused]], - size_t first_ind [[maybe_unused]], - size_t second_ind [[maybe_unused]]) -{ - /// TODO: Decimal scale - if constexpr (is_decimal && is_decimal) - return accurate::equalsOp(first.data[first_ind].value, second.data[second_ind].value); - else if constexpr (is_decimal || is_decimal) - return false; - else - return accurate::equalsOp(first.data[first_ind], second.data[second_ind]); -} - -template -bool sliceEqualElements(const NumericArraySlice &, const GenericArraySlice &, size_t, size_t) -{ - return false; -} - -template -bool sliceEqualElements(const GenericArraySlice &, const NumericArraySlice &, size_t, size_t) -{ - return false; -} - -inline ALWAYS_INLINE bool sliceEqualElements(const GenericArraySlice & first, const GenericArraySlice & second, size_t first_ind, size_t second_ind) -{ - return first.elements->compareAt(first_ind + first.begin, second_ind + second.begin, *second.elements, -1) == 0; -} - -template -bool insliceEqualElements(const NumericArraySlice & first [[maybe_unused]], - size_t first_ind [[maybe_unused]], - size_t second_ind [[maybe_unused]]) -{ - if constexpr (is_decimal) - return accurate::equalsOp(first.data[first_ind].value, first.data[second_ind].value); - else - return accurate::equalsOp(first.data[first_ind], first.data[second_ind]); -} -inline ALWAYS_INLINE bool insliceEqualElements(const GenericArraySlice & first, size_t first_ind, size_t second_ind) -{ - return first.elements->compareAt(first_ind + first.begin, second_ind + first.begin, *first.elements, -1) == 0; -} - template bool sliceHas(const NumericArraySlice & first, const NumericArraySlice & second) { diff --git a/src/Functions/GatherUtils/CMakeLists.txt b/src/Functions/GatherUtils/CMakeLists.txt index f30527c2a46..731407e774c 100644 --- a/src/Functions/GatherUtils/CMakeLists.txt +++ b/src/Functions/GatherUtils/CMakeLists.txt @@ -11,6 +11,10 @@ if (HAS_SUGGEST_DESTRUCTOR_OVERRIDE) target_compile_definitions(clickhouse_functions_gatherutils PUBLIC HAS_SUGGEST_DESTRUCTOR_OVERRIDE) endif() +if (HAVE_AVX2) + target_compile_options(clickhouse_functions_gatherutils PRIVATE -mavx2 -DNAMESPACE=AVX2) +endif() + if (STRIP_DEBUG_SYMBOLS_FUNCTIONS) target_compile_options(clickhouse_functions_gatherutils PRIVATE "-g0") endif() diff --git a/src/Functions/tests/gtest_hasAll.cpp b/src/Functions/tests/gtest_hasAll.cpp new file mode 100644 index 
00000000000..bbc841e7605 --- /dev/null +++ b/src/Functions/tests/gtest_hasAll.cpp @@ -0,0 +1,104 @@ +#include + +#include + +using namespace DB::GatherUtils; + + + +template +void array_init(T* first, size_t first_size, T* second, size_t second_size, bool expected_return) { + T i = 0; + for (; i < second_size; i++) { + second[i] = i; + } + for (i=0; i < first_size; i++) { + first[i] = second[std::rand()%second_size]; + } + // Set one element different from + if (!expected_return) { + first[first_size-1] = second_size+1; + } +} + +void null_map_init(UInt8 * null_map, size_t null_map_size, size_t nb_elem) { + for (int i =0; i < null_map_size-1 && i < nb_elem; i++) { + null_map[std::rand()%null_map_size-1] = 1; + } +} + +template +bool test_hasAll(size_t first_size, size_t second_size, bool have_null_map, bool expected_output) { + T* first_data = new T [first_size]; + T* second_data = new T [second_size]; + + UInt8 *first_nm = nullptr, *second_nm = nullptr; + if (have_null_map) { + first_nm = new UInt8 [first_size]; + second_nm = new UInt8 [second_size]; + null_map_init(first_nm, first_size, 5); + null_map_init(second_nm, second_size, 2); + } + + array_init(first_data, first_size, second_data, second_size, expected_output); + + NumericArraySlice first = {first_data, first_size}; + NumericArraySlice second = {second_data, second_size}; + + // Test + /// Check if all first array are elements from second array, overloaded for various combinations of types. + return sliceHasImplAnyAll, NumericArraySlice, sliceEqualElements >(first, second, nullptr, nullptr); +} + +TEST(HasAll, integer) +{ + bool test1 = test_hasAll(4, 100, false, true); + bool test2 = test_hasAll(4, 100, false, false); + bool test3 = test_hasAll(100, 4096, false, true); + bool test4 = test_hasAll(100, 4096, false, false); + + ASSERT_EQ(test1, true); + ASSERT_EQ(test2, false); + ASSERT_EQ(test3, true); + ASSERT_EQ(test4, false); +} + + +TEST(HasAll, int64) +{ + bool test1 = test_hasAll(2, 100, false, true); + bool test2 = test_hasAll(2, 100, false, false); + bool test3 = test_hasAll(100, 4096, false, true); + bool test4 = test_hasAll(100, 4096, false, false); + + ASSERT_EQ(test1, true); + ASSERT_EQ(test2, false); + ASSERT_EQ(test3, true); + ASSERT_EQ(test4, false); +} + +TEST(HasAll, int16) +{ + bool test1 = test_hasAll(2, 100, false, true); + bool test2 = test_hasAll(2, 100, false, false); + bool test3 = test_hasAll(100, 4096, false, true); + bool test4 = test_hasAll(100, 4096, false, false); + + ASSERT_EQ(test1, true); + ASSERT_EQ(test2, false); + ASSERT_EQ(test3, true); + ASSERT_EQ(test4, false); +} + +TEST(HasAll, int8) +{ + bool test1 = test_hasAll(2, 100, false, true); + bool test2 = test_hasAll(2, 100, false, false); + bool test3 = test_hasAll(50, 125, false, true); + bool test4 = test_hasAll(50, 125, false, false); + + ASSERT_EQ(test1, true); + ASSERT_EQ(test2, false); + ASSERT_EQ(test3, true); + ASSERT_EQ(test4, false); +} From abecb8114f470aa2b00122ee6079a32d6b17d62a Mon Sep 17 00:00:00 2001 From: Jakub Kuklis Date: Wed, 18 Aug 2021 15:19:31 +0200 Subject: [PATCH 002/239] Refactoring the hasAll gtest so that it works with the original hasAll --- src/Functions/tests/gtest_hasAll.cpp | 97 +++++++++++++++------------- 1 file changed, 53 insertions(+), 44 deletions(-) diff --git a/src/Functions/tests/gtest_hasAll.cpp b/src/Functions/tests/gtest_hasAll.cpp index bbc841e7605..310c059bbbc 100644 --- a/src/Functions/tests/gtest_hasAll.cpp +++ b/src/Functions/tests/gtest_hasAll.cpp @@ -5,57 +5,66 @@ using namespace 
DB::GatherUtils; - template -void array_init(T* first, size_t first_size, T* second, size_t second_size, bool expected_return) { - T i = 0; - for (; i < second_size; i++) { - second[i] = i; +void array_init(T* elements_to_have, size_t elements_to_have_count, T* set_elements, size_t set_size, bool expected_output) { + for (T i = 0; i < set_size; ++i) + { + set_elements[i] = i; } - for (i=0; i < first_size; i++) { - first[i] = second[std::rand()%second_size]; + for (T i = 0; i < elements_to_have_count; ++i) + { + elements_to_have[i] = set_elements[std::rand() % set_size]; } - // Set one element different from - if (!expected_return) { - first[first_size-1] = second_size+1; + if (!expected_output) + { + // make one element to be searched for missing from the target set + elements_to_have[elements_to_have_count - 1] = set_size + 1; } } -void null_map_init(UInt8 * null_map, size_t null_map_size, size_t nb_elem) { - for (int i =0; i < null_map_size-1 && i < nb_elem; i++) { - null_map[std::rand()%null_map_size-1] = 1; +void null_map_init(UInt8 * null_map, size_t null_map_size, size_t null_elements_count) +{ + for (int i = 0; i < null_map_size; ++i) + { + null_map[i] = 0; + } + for (int i = 0; i < null_map_size - 1 && i < null_elements_count; ++i) + { + null_map[std::rand() % null_map_size - 1] = 1; } } template -bool test_hasAll(size_t first_size, size_t second_size, bool have_null_map, bool expected_output) { - T* first_data = new T [first_size]; - T* second_data = new T [second_size]; +bool testHasAll(size_t elements_to_have_count, size_t set_size, bool have_null_map, bool expected_output) +{ + T * set_elements = new T[set_size]; + T * elements_to_have = new T[elements_to_have_count]; - UInt8 *first_nm = nullptr, *second_nm = nullptr; - if (have_null_map) { - first_nm = new UInt8 [first_size]; - second_nm = new UInt8 [second_size]; - null_map_init(first_nm, first_size, 5); - null_map_init(second_nm, second_size, 2); + UInt8 * first_nm = nullptr, * second_nm = nullptr; + if (have_null_map) + { + first_nm = new UInt8[set_size]; + second_nm = new UInt8[elements_to_have_count]; + null_map_init(first_nm, set_size, 5); + null_map_init(second_nm, elements_to_have_count, 2); } - array_init(first_data, first_size, second_data, second_size, expected_output); + array_init(elements_to_have, elements_to_have_count, set_elements, set_size, expected_output); - NumericArraySlice first = {first_data, first_size}; - NumericArraySlice second = {second_data, second_size}; + NumericArraySlice first = {set_elements, set_size}; + NumericArraySlice second = {elements_to_have, elements_to_have_count}; - // Test - /// Check if all first array are elements from second array, overloaded for various combinations of types. - return sliceHasImplAnyAll, NumericArraySlice, sliceEqualElements >(first, second, nullptr, nullptr); + /// Check whether all elements of the second array are also elements of the first array, overloaded for various combinations of types. 
+ return sliceHasImplAnyAll, NumericArraySlice, sliceEqualElements >( + first, second, first_nm, second_nm); } TEST(HasAll, integer) { - bool test1 = test_hasAll(4, 100, false, true); - bool test2 = test_hasAll(4, 100, false, false); - bool test3 = test_hasAll(100, 4096, false, true); - bool test4 = test_hasAll(100, 4096, false, false); + bool test1 = testHasAll(4, 100, false, true); + bool test2 = testHasAll(4, 100, false, false); + bool test3 = testHasAll(100, 4096, false, true); + bool test4 = testHasAll(100, 4096, false, false); ASSERT_EQ(test1, true); ASSERT_EQ(test2, false); @@ -66,10 +75,10 @@ TEST(HasAll, integer) TEST(HasAll, int64) { - bool test1 = test_hasAll(2, 100, false, true); - bool test2 = test_hasAll(2, 100, false, false); - bool test3 = test_hasAll(100, 4096, false, true); - bool test4 = test_hasAll(100, 4096, false, false); + bool test1 = testHasAll(2, 100, false, true); + bool test2 = testHasAll(2, 100, false, false); + bool test3 = testHasAll(100, 4096, false, true); + bool test4 = testHasAll(100, 4096, false, false); ASSERT_EQ(test1, true); ASSERT_EQ(test2, false); @@ -79,10 +88,10 @@ TEST(HasAll, int64) TEST(HasAll, int16) { - bool test1 = test_hasAll(2, 100, false, true); - bool test2 = test_hasAll(2, 100, false, false); - bool test3 = test_hasAll(100, 4096, false, true); - bool test4 = test_hasAll(100, 4096, false, false); + bool test1 = testHasAll(2, 100, false, true); + bool test2 = testHasAll(2, 100, false, false); + bool test3 = testHasAll(100, 4096, false, true); + bool test4 = testHasAll(100, 4096, false, false); ASSERT_EQ(test1, true); ASSERT_EQ(test2, false); @@ -92,10 +101,10 @@ TEST(HasAll, int16) TEST(HasAll, int8) { - bool test1 = test_hasAll(2, 100, false, true); - bool test2 = test_hasAll(2, 100, false, false); - bool test3 = test_hasAll(50, 125, false, true); - bool test4 = test_hasAll(50, 125, false, false); + bool test1 = testHasAll(2, 100, false, true); + bool test2 = testHasAll(2, 100, false, false); + bool test3 = testHasAll(50, 125, false, true); + bool test4 = testHasAll(50, 125, false, false); ASSERT_EQ(test1, true); ASSERT_EQ(test2, false); From 92ec28a87b42653d7d1abef2369e4e6e3407a2cb Mon Sep 17 00:00:00 2001 From: Jakub Kuklis Date: Wed, 18 Aug 2021 16:52:14 +0200 Subject: [PATCH 003/239] Refactoring the new SIMD hasAll implementation to comply with current hasAll implementation (swapping 'first' and 'second' arguments meaning) and with ClickHouse C++ guidelines --- src/Functions/GatherUtils/Algorithms.h | 622 +++++++++++++------------ 1 file changed, 336 insertions(+), 286 deletions(-) diff --git a/src/Functions/GatherUtils/Algorithms.h b/src/Functions/GatherUtils/Algorithms.h index 245794da976..2812821e339 100644 --- a/src/Functions/GatherUtils/Algorithms.h +++ b/src/Functions/GatherUtils/Algorithms.h @@ -7,7 +7,7 @@ #include #include #include "GatherUtils.h" -#if defined(__AVX2__) +#if defined(__AVX2__) || defined(__SSE4_2__) #include #endif @@ -420,7 +420,6 @@ void NO_INLINE conditional(SourceA && src_a, SourceB && src_b, Sink && sink, con } - template bool sliceEqualElements(const NumericArraySlice & first [[maybe_unused]], const NumericArraySlice & second [[maybe_unused]], @@ -469,7 +468,6 @@ inline ALWAYS_INLINE bool insliceEqualElements(const GenericArraySlice & first, } - /// For details of Knuth-Morris-Pratt string matching algorithm see /// https://en.wikipedia.org/wiki/Knuth%E2%80%93Morris%E2%80%93Pratt_algorithm. 
/// A "prefix-function" is defined as: i-th element is the length of the longest of all prefixes that end in i-th position @@ -536,91 +534,97 @@ bool sliceHasImplAnyAll(const FirstSliceType & first, const SecondSliceType & se #if defined(__AVX2__) // AVX2 - Int specialization template <> -inline ALWAYS_INLINE bool sliceHasImplAnyAll, NumericArraySlice, sliceEqualElements >(const NumericArraySlice & first, const NumericArraySlice & second, const UInt8 * first_null_map, const UInt8 * second_null_map) +inline ALWAYS_INLINE bool sliceHasImplAnyAll, NumericArraySlice, sliceEqualElements >( + const NumericArraySlice & first, const NumericArraySlice & second, const UInt8 * second_null_map, const UInt8 * first_null_map) { - if (first.size == 0) return true; + if (second.size == 0) + return true; - const bool has_first_null_map = first_null_map != nullptr; + const bool has_first_null_map = first_null_map != nullptr; const bool has_second_null_map = second_null_map != nullptr; - if (has_second_null_map != has_first_null_map && has_first_null_map) return false; + if (!has_first_null_map && has_second_null_map) + return false; unsigned j = 0; short has_mask = 1; const int full = -1, none = 0; const __m256i ones = _mm256_set1_epi32(full); const __m256i zeros = _mm256_setzero_si256(); - if (first.size > 7 && second.size > 7) + if (second.size > 7 && first.size > 7) { - for (; j < first.size-7 && has_mask; j += 8) + for (; j < second.size - 7 && has_mask; j += 8) { has_mask = 0; - const __m256i f_data = _mm256_lddqu_si256(reinterpret_cast(first.data+j)); - // bitmask is fulfilled with ones for ones which are considered as null in the corresponding null map, 0 otherwise; - __m256i bitmask = has_first_null_map ? _mm256_set_epi32((first_null_map[j+7])? full: none, - (first_null_map[j+6])? full: none, - (first_null_map[j+5])? full: none, - (first_null_map[j+4])? full: none, - (first_null_map[j+3])? full: none, - (first_null_map[j+2])? full: none, - (first_null_map[j+1])? full: none, - (first_null_map[j]) ? full: none - ) - :zeros; + const __m256i f_data = _mm256_lddqu_si256(reinterpret_cast(second.data + j)); + // bitmask is filled with minus ones for ones which are considered as null in the corresponding null map, 0 otherwise; + __m256i bitmask = has_second_null_map ? + _mm256_set_epi32( + (second_null_map[j + 7]) ? full : none, + (second_null_map[j + 6]) ? full : none, + (second_null_map[j + 5]) ? full : none, + (second_null_map[j + 4]) ? full : none, + (second_null_map[j + 3]) ? full : none, + (second_null_map[j + 2]) ? full : none, + (second_null_map[j + 1]) ? full : none, + (second_null_map[j]) ? 
full : none) + : zeros; size_t i = 0; - // Browse second array to try to match ell first elements - for (; i < second.size-7 && !has_mask; has_mask = _mm256_testc_si256(bitmask, ones), i += 8) + // Search first array to try to match all second elements + for (; i < first.size - 7 && !has_mask; has_mask = _mm256_testc_si256(bitmask, ones), i += 8) { - const __m256i s_data = _mm256_lddqu_si256(reinterpret_cast(second.data+i)); + const __m256i s_data = _mm256_lddqu_si256(reinterpret_cast(first.data + i)); // Create a mask to avoid to compare null elements // set_m128i takes two arguments: (high segment, low segment) that are two __m128i convert from 8bits to 32bits to fit to our following operations - const __m256i second_nm_mask = _mm256_set_m128i(_mm_cvtepi8_epi32(_mm_lddqu_si128(reinterpret_cast(second_null_map+i+4))), - _mm_cvtepi8_epi32(_mm_lddqu_si128(reinterpret_cast(second_null_map+i)))); + const __m256i first_nm_mask = _mm256_set_m128i( + _mm_cvtepi8_epi32(_mm_lddqu_si128(reinterpret_cast(first_null_map + i + 4))), + _mm_cvtepi8_epi32(_mm_lddqu_si128(reinterpret_cast(first_null_map + i)))); bitmask = _mm256_or_si256( _mm256_or_si256( _mm256_or_si256( _mm256_or_si256( _mm256_andnot_si256( - second_nm_mask, + first_nm_mask, _mm256_cmpeq_epi32(f_data, s_data)), _mm256_andnot_si256( - _mm256_permutevar8x32_epi32(second_nm_mask, _mm256_set_epi32(6,5,4,3,2,1,0,7)), + _mm256_permutevar8x32_epi32(first_nm_mask, _mm256_set_epi32(6,5,4,3,2,1,0,7)), _mm256_cmpeq_epi32(f_data, _mm256_permutevar8x32_epi32(s_data, _mm256_set_epi32(6,5,4,3,2,1,0,7))))), _mm256_or_si256( _mm256_andnot_si256( - _mm256_permutevar8x32_epi32(second_nm_mask, _mm256_set_epi32(5,4,3,2,1,0,7,6)), + _mm256_permutevar8x32_epi32(first_nm_mask, _mm256_set_epi32(5,4,3,2,1,0,7,6)), _mm256_cmpeq_epi32(f_data, _mm256_permutevar8x32_epi32(s_data, _mm256_set_epi32(5,4,3,2,1,0,7,6)))), _mm256_andnot_si256( - _mm256_permutevar8x32_epi32(second_nm_mask, _mm256_set_epi32(4,3,2,1,0,7,6,5)), + _mm256_permutevar8x32_epi32(first_nm_mask, _mm256_set_epi32(4,3,2,1,0,7,6,5)), _mm256_cmpeq_epi32(f_data, _mm256_permutevar8x32_epi32(s_data, _mm256_set_epi32(4,3,2,1,0,7,6,5))))) ), _mm256_or_si256( _mm256_or_si256( _mm256_andnot_si256( - _mm256_permutevar8x32_epi32(second_nm_mask, _mm256_set_epi32(3,2,1,0,7,6,5,4)), + _mm256_permutevar8x32_epi32(first_nm_mask, _mm256_set_epi32(3,2,1,0,7,6,5,4)), _mm256_cmpeq_epi32(f_data, _mm256_permutevar8x32_epi32(s_data, _mm256_set_epi32(3,2,1,0,7,6,5,4)))), _mm256_andnot_si256( - _mm256_permutevar8x32_epi32(second_nm_mask, _mm256_set_epi32(2,1,0,7,6,5,4,3)), + _mm256_permutevar8x32_epi32(first_nm_mask, _mm256_set_epi32(2,1,0,7,6,5,4,3)), _mm256_cmpeq_epi32(f_data, _mm256_permutevar8x32_epi32(s_data, _mm256_set_epi32(2,1,0,7,6,5,4,3))))), _mm256_or_si256( _mm256_andnot_si256( - _mm256_permutevar8x32_epi32(second_nm_mask, _mm256_set_epi32(1,0,7,6,5,4,3,2)), + _mm256_permutevar8x32_epi32(first_nm_mask, _mm256_set_epi32(1,0,7,6,5,4,3,2)), _mm256_cmpeq_epi32(f_data, _mm256_permutevar8x32_epi32(s_data, _mm256_set_epi32(1,0,7,6,5,4,3,2)))), _mm256_andnot_si256( - _mm256_permutevar8x32_epi32(second_nm_mask, _mm256_set_epi32(0,7,6,5,4,3,2,1)), - _mm256_cmpeq_epi32(f_data, _mm256_permutevar8x32_epi32(s_data, _mm256_set_epi32(0,7,6,5,4,3,2,1))))))) - ,bitmask); + _mm256_permutevar8x32_epi32(first_nm_mask, _mm256_set_epi32(0,7,6,5,4,3,2,1)), + _mm256_cmpeq_epi32(f_data, _mm256_permutevar8x32_epi32(s_data, _mm256_set_epi32(0,7,6,5,4,3,2,1))))))), + bitmask); } - if (i < second.size) + if (i < first.size) { // Loop(i)-jam - 
for (; i < second.size && !has_mask; i++) + for (; i < first.size && !has_mask; ++i) { - if (second_null_map[i]) continue; - __m256i v_i = _mm256_set1_epi32(second.data[i]); - bitmask = _mm256_or_si256(bitmask, _mm256_cmpeq_epi32(f_data, v_i)); + if (has_first_null_map && first_null_map[i]) + continue; + __m256i v_i = _mm256_set1_epi32(first.data[i]); + bitmask = _mm256_or_si256(bitmask, _mm256_cmpeq_epi32(f_data, v_i)); has_mask = _mm256_testc_si256 (bitmask, ones); } } @@ -629,13 +633,15 @@ inline ALWAYS_INLINE bool sliceHasImplAnyAll -// inline ALWAYS_INLINE bool sliceHasImplAnyAll, NumericArraySlice, sliceEqualElements >(const NumericArraySlice & first, const NumericArraySlice & second, const UInt8 * first_null_map, const UInt8 * second_null_map) +// inline ALWAYS_INLINE bool sliceHasImplAnyAll, NumericArraySlice, sliceEqualElements >( +// const NumericArraySlice & second, const NumericArraySlice & first, const UInt8 * second_null_map, const UInt8 * first_null_map) // { -// return sliceHasImplAnyAll, NumericArraySlice, sliceEqualElements > (static_cast &>(first), static_cast &>(second), first_null_map, second_null_map); +// return sliceHasImplAnyAll, NumericArraySlice, sliceEqualElements > ( +// static_cast &>(second), static_cast &>(first), second_null_map, first_null_map); // } // AVX2 Int64 specialization template <> -inline ALWAYS_INLINE bool sliceHasImplAnyAll, NumericArraySlice, sliceEqualElements >(const NumericArraySlice & first, const NumericArraySlice & second, const UInt8 * first_null_map, const UInt8 * second_null_map) +inline ALWAYS_INLINE bool sliceHasImplAnyAll, NumericArraySlice, sliceEqualElements >( + const NumericArraySlice & first, const NumericArraySlice & second, const UInt8 * second_null_map, const UInt8 * first_null_map) { - if (first.size == 0) return true; + if (second.size == 0) + return true; - const bool has_first_null_map = first_null_map != nullptr; - const bool has_second_null_map = second_null_map != nullptr; - if (has_second_null_map != has_first_null_map && has_first_null_map) return false; + const bool has_first_null_map = first_null_map != nullptr; + const bool has_second_null_map = second_null_map != nullptr; + if (!has_first_null_map && has_second_null_map) + return false; unsigned j = 0; short has_mask = 1; const int full = -1, none = 0; const __m256i ones = _mm256_set1_epi64x(full); const __m256i zeros = _mm256_setzero_si256(); - if (first.size > 3 && second.size > 3) + if (second.size > 3 && first.size > 3) { - for (; j < first.size-3 && has_mask; j += 4) + for (; j < second.size - 3 && has_mask; j += 4) { has_mask = 0; - const __m256i f_data = _mm256_lddqu_si256(reinterpret_cast(first.data+j)); - __m256i bitmask = has_first_null_map ? _mm256_set_epi64x((first_null_map[j+3])? full: none, - (first_null_map[j+2])? full: none, - (first_null_map[j+1])? full: none, - (first_null_map[j]) ? full: none - ) - :zeros; + const __m256i f_data = _mm256_lddqu_si256(reinterpret_cast(second.data + j)); + __m256i bitmask = has_second_null_map ? + _mm256_set_epi64x( + (second_null_map[j + 3])? full : none, + (second_null_map[j + 2])? full : none, + (second_null_map[j + 1])? full : none, + (second_null_map[j]) ? 
full : none) + : zeros; unsigned i = 0; - for (; i < second.size-3 && !has_mask; has_mask = _mm256_testc_si256 (bitmask, ones), i += 4) + for (; i < first.size - 3 && !has_mask; has_mask = _mm256_testc_si256 (bitmask, ones), i += 4) { - const __m256i s_data = _mm256_lddqu_si256(reinterpret_cast(second.data+i)); - const __m256i second_nm_mask = _mm256_set_m128i(_mm_cvtepi8_epi64(_mm_lddqu_si128(reinterpret_cast(second_null_map+i+2))), - _mm_cvtepi8_epi64(_mm_lddqu_si128(reinterpret_cast(second_null_map+i)))); + const __m256i s_data = _mm256_lddqu_si256(reinterpret_cast(first.data + i)); + const __m256i first_nm_mask = _mm256_set_m128i( + _mm_cvtepi8_epi64(_mm_lddqu_si128(reinterpret_cast(first_null_map + i + 2))), + _mm_cvtepi8_epi64(_mm_lddqu_si128(reinterpret_cast(first_null_map + i)))); bitmask = _mm256_or_si256( _mm256_or_si256( _mm256_or_si256( _mm256_andnot_si256( - second_nm_mask, + first_nm_mask, _mm256_cmpeq_epi64(f_data, s_data)), _mm256_andnot_si256( - _mm256_permutevar8x32_epi32(second_nm_mask, _mm256_set_epi32(5,4,3,2,1,0,7,6)), + _mm256_permutevar8x32_epi32(first_nm_mask, _mm256_set_epi32(5,4,3,2,1,0,7,6)), _mm256_cmpeq_epi64(f_data, _mm256_permutevar8x32_epi32(s_data, _mm256_set_epi32(5,4,3,2,1,0,7,6))))), _mm256_or_si256( _mm256_andnot_si256( - _mm256_permutevar8x32_epi32(second_nm_mask, _mm256_set_epi32(3,2,1,0,7,6,5,4)), + _mm256_permutevar8x32_epi32(first_nm_mask, _mm256_set_epi32(3,2,1,0,7,6,5,4)), _mm256_cmpeq_epi64(f_data, _mm256_permutevar8x32_epi32(s_data, _mm256_set_epi32(3,2,1,0,7,6,5,4)))), _mm256_andnot_si256( - _mm256_permutevar8x32_epi32(second_nm_mask, _mm256_set_epi32(1,0,7,6,5,4,3,2)), - _mm256_cmpeq_epi64(f_data, _mm256_permutevar8x32_epi32(s_data, _mm256_set_epi32(1,0,7,6,5,4,3,2))))) - ), + _mm256_permutevar8x32_epi32(first_nm_mask, _mm256_set_epi32(1,0,7,6,5,4,3,2)), + _mm256_cmpeq_epi64(f_data, _mm256_permutevar8x32_epi32(s_data, _mm256_set_epi32(1,0,7,6,5,4,3,2)))))), bitmask); } - if (i < second.size) + if (i < first.size) { - for (; i < second.size && !has_mask; i++) + for (; i < first.size && !has_mask; ++i) { - if (second_null_map[i]) continue; - __m256i v_i = _mm256_set1_epi64x(second.data[i]); - bitmask = _mm256_or_si256(bitmask, _mm256_cmpeq_epi64(f_data, v_i)); + if (has_first_null_map && first_null_map[i]) + continue; + __m256i v_i = _mm256_set1_epi64x(first.data[i]); + bitmask = _mm256_or_si256(bitmask, _mm256_cmpeq_epi64(f_data, v_i)); has_mask = _mm256_testc_si256 (bitmask, ones); } } @@ -723,13 +736,14 @@ inline ALWAYS_INLINE bool sliceHasImplAnyAll -inline ALWAYS_INLINE bool sliceHasImplAnyAll, NumericArraySlice, sliceEqualElements >(const NumericArraySlice & first, const NumericArraySlice & second, const UInt8 * first_null_map, const UInt8 * second_null_map) +inline ALWAYS_INLINE bool sliceHasImplAnyAll, NumericArraySlice, sliceEqualElements >( + const NumericArraySlice & first, const NumericArraySlice & second, const UInt8 * second_null_map, const UInt8 * first_null_map) { - if (first.size == 0) return true; + if (second.size == 0) + return true; - const bool has_second_null_map = second_null_map != nullptr; const bool has_first_null_map = first_null_map != nullptr; - if (has_second_null_map != has_first_null_map && has_first_null_map) return false; + const bool has_second_null_map = second_null_map != nullptr; + if (!has_first_null_map && has_second_null_map) + return false; unsigned j = 0; short has_mask = 1; const int full = -1, none = 0; const __m256i ones = _mm256_set1_epi16(full); const __m256i zeros = _mm256_setzero_si256(); - if 
(first.size > 15 && second.size > 15) + if (second.size > 15 && first.size > 15) { - for (; j < first.size-15 && has_mask; j += 16) + for (; j < second.size - 15 && has_mask; j += 16) { has_mask = 0; - const __m256i f_data = _mm256_lddqu_si256(reinterpret_cast(first.data+j)); - __m256i bitmask = has_first_null_map ? _mm256_set_epi16((first_null_map[j+15])? full: none, (first_null_map[j+14])? full: none, - (first_null_map[j+13])? full: none, (first_null_map[j+12])? full: none, - (first_null_map[j+11])? full: none, (first_null_map[j+10])? full: none, - (first_null_map[j+9])? full: none, (first_null_map[j+8])? full: none, - (first_null_map[j+7])? full: none, (first_null_map[j+6])? full: none, - (first_null_map[j+5])? full: none, (first_null_map[j+4])? full: none, - (first_null_map[j+3])? full: none, (first_null_map[j+2])? full: none, - (first_null_map[j+1])? full: none, (first_null_map[j]) ? full: none - ) - :zeros; + const __m256i f_data = _mm256_lddqu_si256(reinterpret_cast(second.data + j)); + __m256i bitmask = has_second_null_map ? + _mm256_set_epi16( + (second_null_map[j + 15]) ? full : none, (second_null_map[j + 14]) ? full : none, + (second_null_map[j + 13]) ? full : none, (second_null_map[j + 12]) ? full : none, + (second_null_map[j + 11]) ? full : none, (second_null_map[j + 10]) ? full : none, + (second_null_map[j + 9]) ? full : none, (second_null_map[j + 8])? full : none, + (second_null_map[j + 7]) ? full : none, (second_null_map[j + 6])? full : none, + (second_null_map[j + 5]) ? full : none, (second_null_map[j + 4])? full : none, + (second_null_map[j + 3]) ? full : none, (second_null_map[j + 2])? full : none, + (second_null_map[j + 1]) ? full : none, (second_null_map[j]) ? full : none) + : zeros; unsigned i = 0; - for (; i < second.size-15 && !has_mask; has_mask = _mm256_testc_si256 (bitmask, ones), i += 16) + for (; i < first.size - 15 && !has_mask; has_mask = _mm256_testc_si256 (bitmask, ones), i += 16) { - const __m256i s_data = _mm256_lddqu_si256(reinterpret_cast(second.data+i)); - const __m256i second_nm_mask = _mm256_set_m128i(_mm_cvtepi8_epi16(_mm_lddqu_si128(reinterpret_cast(second_null_map+i+8))), - _mm_cvtepi8_epi16(_mm_lddqu_si128(reinterpret_cast(second_null_map+i)))); + const __m256i s_data = _mm256_lddqu_si256(reinterpret_cast(first.data + i)); + const __m256i first_nm_mask = _mm256_set_m128i( + _mm_cvtepi8_epi16(_mm_lddqu_si128(reinterpret_cast(first_null_map+i+8))), + _mm_cvtepi8_epi16(_mm_lddqu_si128(reinterpret_cast(first_null_map+i)))); bitmask = _mm256_or_si256( _mm256_or_si256( @@ -781,79 +800,80 @@ inline ALWAYS_INLINE bool sliceHasImplAnyAll -inline ALWAYS_INLINE bool sliceHasImplAnyAll, NumericArraySlice, sliceEqualElements >(const NumericArraySlice & first, const NumericArraySlice & second, const UInt8 * first_null_map, const UInt8 * second_null_map) +inline ALWAYS_INLINE bool sliceHasImplAnyAll, NumericArraySlice, sliceEqualElements >( + const NumericArraySlice & first, const NumericArraySlice & second, const UInt8 * second_null_map, const UInt8 * first_null_map) { - if (first.size == 0) return true; + if (second.size == 0) + return true; const bool has_first_null_map = first_null_map != nullptr; - const bool has_second_null_map = second_null_map != nullptr; - if (has_second_null_map != has_first_null_map && has_first_null_map) return false; + const bool has_second_null_map = second_null_map != nullptr; + if (!has_first_null_map && has_second_null_map) + return false; unsigned j = 0; short has_mask = 1; const __m128i zeros = _mm_setzero_si128(); - if 
(first.size > 3 && second.size > 2) + if (second.size > 3 && first.size > 2) { const int full = -1, none = 0; - for (; j < first.size-3 && has_mask; j += 4) + for (; j < second.size - 3 && has_mask; j += 4) { has_mask = 0; - const __m128i f_data = _mm_lddqu_si128(reinterpret_cast(first.data+j)); - __m128i bitmask = has_first_null_map ? _mm_set_epi32((first_null_map[j+3])? full: none, - (first_null_map[j+2])? full: none, - (first_null_map[j+1])? full: none, - (first_null_map[j]) ? full: none - ) - :zeros; + const __m128i f_data = _mm_lddqu_si128(reinterpret_cast(second.data + j)); + __m128i bitmask = has_second_null_map ? + _mm_set_epi32( + (second_null_map[j + 3]) ? full : none, + (second_null_map[j + 2]) ? full : none, + (second_null_map[j + 1]) ? full : none, + (second_null_map[j]) ? full : none) + : zeros; unsigned i = 0; - for (; i < second.size-3 && !has_mask; has_mask = _mm_test_all_ones(bitmask), i += 4) + for (; i < first.size - 3 && !has_mask; has_mask = _mm_test_all_ones(bitmask), i += 4) { - const __m128i s_data = _mm_lddqu_si128(reinterpret_cast(second.data+i)); - const __m128i second_nm_mask = (has_second_null_map)? _mm_cvtepi8_epi32(_mm_lddqu_si128(reinterpret_cast(second_null_map+i))) - : zeros; + const __m128i s_data = _mm_lddqu_si128(reinterpret_cast(first.data + i)); + const __m128i first_nm_mask = has_first_null_map ? + _mm_cvtepi8_epi32(_mm_lddqu_si128(reinterpret_cast(first_null_map + i))) + : zeros; bitmask = _mm_or_si128( _mm_or_si128( _mm_or_si128( _mm_andnot_si128( - second_nm_mask, + first_nm_mask, _mm_cmpeq_epi32(f_data, s_data)), _mm_andnot_si128( - _mm_shuffle_epi32(second_nm_mask, _MM_SHUFFLE(2,1,0,3)), + _mm_shuffle_epi32(first_nm_mask, _MM_SHUFFLE(2,1,0,3)), _mm_cmpeq_epi32(f_data, _mm_shuffle_epi32(s_data, _MM_SHUFFLE(2,1,0,3))))), _mm_or_si128( _mm_andnot_si128( - _mm_shuffle_epi32(second_nm_mask, _MM_SHUFFLE(1,0,3,2)), + _mm_shuffle_epi32(first_nm_mask, _MM_SHUFFLE(1,0,3,2)), _mm_cmpeq_epi32(f_data, _mm_shuffle_epi32(s_data, _MM_SHUFFLE(1,0,3,2)))), _mm_andnot_si128( - _mm_shuffle_epi32(second_nm_mask, _MM_SHUFFLE(0,3,2,1)), + _mm_shuffle_epi32(first_nm_mask, _MM_SHUFFLE(0,3,2,1)), _mm_cmpeq_epi32(f_data, _mm_shuffle_epi32(s_data, _MM_SHUFFLE(0,3,2,1))))) ), bitmask); } - if (i < second.size) + if (i < first.size) { - for (; i < second.size && !has_mask; i++) + for (; i < first.size && !has_mask; ++i) { - if (has_second_null_map && second_null_map[i]) continue; - __m128i r_i = _mm_set1_epi32(second.data[i]); - bitmask = _mm_or_si128(bitmask, _mm_cmpeq_epi32(f_data, r_i)); + if (has_first_null_map && first_null_map[i]) + continue; + __m128i r_i = _mm_set1_epi32(first.data[i]); + bitmask = _mm_or_si128(bitmask, _mm_cmpeq_epi32(f_data, r_i)); has_mask = _mm_test_all_ones(bitmask); } } @@ -946,13 +973,14 @@ inline ALWAYS_INLINE bool sliceHasImplAnyAll -inline ALWAYS_INLINE bool sliceHasImplAnyAll, NumericArraySlice, sliceEqualElements >(const NumericArraySlice & first, const NumericArraySlice & second, const UInt8 * first_null_map, const UInt8 * second_null_map) +inline ALWAYS_INLINE bool sliceHasImplAnyAll, NumericArraySlice, sliceEqualElements >( + const NumericArraySlice & first, const NumericArraySlice & second, const UInt8 * second_null_map, const UInt8 * first_null_map) { - if (first.size == 0) return true; + if (second.size == 0) + return true; - const bool has_first_null_map = first_null_map != nullptr; - const bool has_second_null_map = second_null_map != nullptr; - if (has_first_null_map != has_second_null_map && has_first_null_map) return false; + 
const bool has_first_null_map = first_null_map != nullptr; + const bool has_second_null_map = second_null_map != nullptr; + if (!has_first_null_map && has_second_null_map) + return false; unsigned j = 0; short has_mask = 1; const Int64 full = -1, none = 0; const __m128i zeros = _mm_setzero_si128(); - for (; j < first.size-1 && has_mask; j += 2) + for (; j < second.size - 1 && has_mask; j += 2) { has_mask = 0; - const __m128i f_data = _mm_lddqu_si128(reinterpret_cast(first.data+j)); - __m128i bitmask = has_first_null_map ? _mm_set_epi64x((first_null_map[j+1])? full: none, - (first_null_map[j]) ? full: none - ) - : zeros; + const __m128i f_data = _mm_lddqu_si128(reinterpret_cast(second.data + j)); + __m128i bitmask = has_second_null_map ? + _mm_set_epi64x( + (second_null_map[j + 1]) ? full : none, + (second_null_map[j]) ? full : none) + : zeros; unsigned i = 0; - for (; i < second.size-1 && !has_mask; has_mask = _mm_test_all_ones(bitmask), i += 2) + for (; i < first.size - 1 && !has_mask; has_mask = _mm_test_all_ones(bitmask), i += 2) { - const __m128i s_data = _mm_lddqu_si128(reinterpret_cast(second.data+i)); - const __m128i second_nm_mask = (has_second_null_map)? _mm_cvtepi8_epi64(_mm_lddqu_si128(reinterpret_cast(second_null_map+i))) - : zeros; + const __m128i s_data = _mm_lddqu_si128(reinterpret_cast(first.data + i)); + const __m128i first_nm_mask = has_first_null_map ? + _mm_cvtepi8_epi64(_mm_lddqu_si128(reinterpret_cast(first_null_map + i))) + : zeros; bitmask = _mm_or_si128( _mm_or_si128( _mm_andnot_si128( - second_nm_mask, + first_nm_mask, _mm_cmpeq_epi32(f_data, s_data)), _mm_andnot_si128( - _mm_shuffle_epi32(second_nm_mask, _MM_SHUFFLE(1,0,3,2)), - _mm_cmpeq_epi64(f_data, _mm_shuffle_epi32(s_data, _MM_SHUFFLE(1,0,3,2))))) - ,bitmask); + _mm_shuffle_epi32(first_nm_mask, _MM_SHUFFLE(1,0,3,2)), + _mm_cmpeq_epi64(f_data, _mm_shuffle_epi32(s_data, _MM_SHUFFLE(1,0,3,2))))), + bitmask); } - if (i < second.size) + if (i < first.size) { - for (; i < second.size && !has_mask; i++) + for (; i < first.size && !has_mask; ++i) { - if (has_second_null_map && second_null_map[i]) continue; - __m128i v_i = _mm_set1_epi64x(second.data[i]); - bitmask = _mm_or_si128(bitmask, _mm_cmpeq_epi64(f_data, v_i)); + if (has_first_null_map && first_null_map[i]) + continue; + __m128i v_i = _mm_set1_epi64x(first.data[i]); + bitmask = _mm_or_si128(bitmask, _mm_cmpeq_epi64(f_data, v_i)); has_mask = _mm_test_all_ones(bitmask); } } } bool found = false; - for (; j < first.size && has_mask; j++) + for (; j < second.size && has_mask; j++) { - // skip null elements since both have at least one - found = (has_first_null_map && first_null_map[j])? true: false; - for (unsigned i = 0; i < second.size && !found; i ++) + found = (has_second_null_map && second_null_map[j]) ? 
true : false; + for (unsigned i = 0; i < first.size && !found; ++i) { - if (has_second_null_map && second_null_map[i]) continue; - found = (second.data[i] == first.data[j]); + if (has_first_null_map && first_null_map[i]) + continue; + found = (first.data[i] == second.data[j]); } if (!found) return false; @@ -1030,80 +1064,87 @@ inline ALWAYS_INLINE bool sliceHasImplAnyAll -inline ALWAYS_INLINE bool sliceHasImplAnyAll, NumericArraySlice, sliceEqualElements >(const NumericArraySlice & first, const NumericArraySlice & second, const UInt8 * first_null_map, const UInt8 * second_null_map) +inline ALWAYS_INLINE bool sliceHasImplAnyAll, NumericArraySlice, sliceEqualElements >( + const NumericArraySlice & first, const NumericArraySlice & second, const UInt8 * second_null_map, const UInt8 * first_null_map) { - if (first.size == 0) return true; - const bool has_first_null_map = first_null_map != nullptr; - const bool has_second_null_map = second_null_map != nullptr; - if (has_first_null_map != has_second_null_map && has_first_null_map) return false; + if (second.size == 0) + return true; + + const bool has_first_null_map = first_null_map != nullptr; + const bool has_second_null_map = second_null_map != nullptr; + if (!has_first_null_map && has_second_null_map) + return false; unsigned j = 0; short has_mask = 1; const int16_t full = -1, none = 0; const __m128i zeros = _mm_setzero_si128(); - if (first.size > 6 && second.size > 6) + if (second.size > 6 && first.size > 6) { - for (; j < first.size-7 && has_mask; j += 8) + for (; j < second.size - 7 && has_mask; j += 8) { has_mask = 0; - const __m128i f_data = _mm_lddqu_si128(reinterpret_cast(first.data+j)); - __m128i bitmask = has_first_null_map ? _mm_set_epi16((first_null_map[j+7])? full: none, (first_null_map[j+6])? full: none, - (first_null_map[j+5])? full: none, (first_null_map[j+4])? full: none, - (first_null_map[j+3])? full: none, (first_null_map[j+2])? full: none, - (first_null_map[j+1])? full: none, (first_null_map[j]) ? full: none - ) - :zeros; + const __m128i f_data = _mm_lddqu_si128(reinterpret_cast(second.data+j)); + __m128i bitmask = has_second_null_map ? + _mm_set_epi16( + (second_null_map[j + 7]) ? full : none, (second_null_map[j + 6]) ? full : none, + (second_null_map[j + 5]) ? full : none, (second_null_map[j + 4]) ? full : none, + (second_null_map[j + 3]) ? full : none, (second_null_map[j + 2]) ? full : none, + (second_null_map[j + 1]) ? full : none, (second_null_map[j]) ? full: none) + : zeros; unsigned i = 0; - for (; i < second.size-7 && !has_mask; has_mask = _mm_test_all_ones(bitmask), i += 8) + for (; i < first.size-7 && !has_mask; has_mask = _mm_test_all_ones(bitmask), i += 8) { - const __m128i s_data = _mm_lddqu_si128(reinterpret_cast(second.data+i)); - const __m128i second_nm_mask = (has_second_null_map)? _mm_cvtepi8_epi16(_mm_lddqu_si128(reinterpret_cast(second_null_map+i))) - : zeros; + const __m128i s_data = _mm_lddqu_si128(reinterpret_cast(first.data + i)); + const __m128i first_nm_mask = has_first_null_map ? 
+ _mm_cvtepi8_epi16(_mm_lddqu_si128(reinterpret_cast(first_null_map+i))) + : zeros; bitmask = _mm_or_si128( _mm_or_si128( _mm_or_si128( _mm_or_si128( _mm_andnot_si128( - second_nm_mask, + first_nm_mask, _mm_cmpeq_epi16(f_data, s_data)), _mm_andnot_si128( - _mm_shuffle_epi8(second_nm_mask, _mm_set_epi8(13,12,11,10,9,8,7,6,5,4,3,2,1,0,15,14)), + _mm_shuffle_epi8(first_nm_mask, _mm_set_epi8(13,12,11,10,9,8,7,6,5,4,3,2,1,0,15,14)), _mm_cmpeq_epi16(f_data, _mm_shuffle_epi8(s_data, _mm_set_epi8(13,12,11,10,9,8,7,6,5,4,3,2,1,0,15,14))))), _mm_or_si128( _mm_andnot_si128( - _mm_shuffle_epi8(second_nm_mask, _mm_set_epi8(11,10,9,8,7,6,5,4,3,2,1,0,15,14,13,12)), + _mm_shuffle_epi8(first_nm_mask, _mm_set_epi8(11,10,9,8,7,6,5,4,3,2,1,0,15,14,13,12)), _mm_cmpeq_epi16(f_data, _mm_shuffle_epi8(s_data, _mm_set_epi8(11,10,9,8,7,6,5,4,3,2,1,0,15,14,13,12)))), _mm_andnot_si128( - _mm_shuffle_epi8(second_nm_mask, _mm_set_epi8(9,8,7,6,5,4,3,2,1,0,15,14,13,12,11,10)), + _mm_shuffle_epi8(first_nm_mask, _mm_set_epi8(9,8,7,6,5,4,3,2,1,0,15,14,13,12,11,10)), _mm_cmpeq_epi16(f_data, _mm_shuffle_epi8(s_data, _mm_set_epi8(9,8,7,6,5,4,3,2,1,0,15,14,13,12,11,10))))) ), _mm_or_si128( _mm_or_si128( _mm_andnot_si128( - _mm_shuffle_epi8(second_nm_mask, _mm_set_epi8(7,6,5,4,3,2,1,0,15,14,13,12,11,10,9,8)), + _mm_shuffle_epi8(first_nm_mask, _mm_set_epi8(7,6,5,4,3,2,1,0,15,14,13,12,11,10,9,8)), _mm_cmpeq_epi16(f_data, _mm_shuffle_epi8(s_data, _mm_set_epi8(7,6,5,4,3,2,1,0,15,14,13,12,11,10,9,8)))), _mm_andnot_si128( - _mm_shuffle_epi8(second_nm_mask, _mm_set_epi8(5,4,3,2,1,0,15,14,13,12,11,10,9,8,7,6)), + _mm_shuffle_epi8(first_nm_mask, _mm_set_epi8(5,4,3,2,1,0,15,14,13,12,11,10,9,8,7,6)), _mm_cmpeq_epi16(f_data, _mm_shuffle_epi8(s_data, _mm_set_epi8(5,4,3,2,1,0,15,14,13,12,11,10,9,8,7,6))))), _mm_or_si128( _mm_andnot_si128( - _mm_shuffle_epi8(second_nm_mask, _mm_set_epi8(3,2,1,0,15,14,13,12,11,10,9,8,7,6,5,4)), + _mm_shuffle_epi8(first_nm_mask, _mm_set_epi8(3,2,1,0,15,14,13,12,11,10,9,8,7,6,5,4)), _mm_cmpeq_epi16(f_data, _mm_shuffle_epi8(s_data, _mm_set_epi8(3,2,1,0,15,14,13,12,11,10,9,8,7,6,5,4)))), _mm_andnot_si128( - _mm_shuffle_epi8(second_nm_mask, _mm_set_epi8(1,0,15,14,13,12,11,10,9,8,7,6,5,4,3,2)), + _mm_shuffle_epi8(first_nm_mask, _mm_set_epi8(1,0,15,14,13,12,11,10,9,8,7,6,5,4,3,2)), _mm_cmpeq_epi16(f_data, _mm_shuffle_epi8(s_data, _mm_set_epi8(1,0,15,14,13,12,11,10,9,8,7,6,5,4,3,2)))))) ), bitmask); } - if (i < second.size) + if (i < first.size) { - for (; i < second.size && !has_mask; i++) + for (; i < first.size && !has_mask; ++i) { - if (has_second_null_map && second_null_map[i]) continue; - __m128i v_i = _mm_set1_epi16(second.data[i]); - bitmask = _mm_or_si128(bitmask, _mm_cmpeq_epi16(f_data, v_i)); + if (has_first_null_map && first_null_map[i]) + continue; + __m128i v_i = _mm_set1_epi16(first.data[i]); + bitmask = _mm_or_si128(bitmask, _mm_cmpeq_epi16(f_data, v_i)); has_mask = _mm_test_all_ones(bitmask); } } @@ -1111,57 +1152,62 @@ inline ALWAYS_INLINE bool sliceHasImplAnyAll -inline ALWAYS_INLINE bool sliceHasImplAnyAll, NumericArraySlice, sliceEqualElements >(const NumericArraySlice & first, const NumericArraySlice & second, const UInt8 * first_null_map, const UInt8 * second_null_map) +inline ALWAYS_INLINE bool sliceHasImplAnyAll, NumericArraySlice, sliceEqualElements >( + const NumericArraySlice & first, const NumericArraySlice & second, const UInt8 * second_null_map, const UInt8 * first_null_map) { - if (first.size == 0) return true; + if (second.size == 0) + return true; - const bool has_first_null_map = 
first_null_map != nullptr; - const bool has_second_null_map = second_null_map != nullptr; - if (has_first_null_map != has_second_null_map && has_first_null_map) return false; + const bool has_first_null_map = first_null_map != nullptr; + const bool has_second_null_map = second_null_map != nullptr; + if (!has_first_null_map && has_second_null_map) + return false; unsigned j = 0; short has_mask = 1; const int full = -1, none = 0; const __m128i zeros = _mm_setzero_si128(); - if (first.size > 15) -{ - for (; j < first.size-15 && has_mask; j += 16) + if (second.size > 15) + { + for (; j < second.size - 15 && has_mask; j += 16) { has_mask = 0; - const __m128i f_data = _mm_lddqu_si128(reinterpret_cast(first.data+j)); - __m128i bitmask = has_first_null_map ? _mm_set_epi8((first_null_map[j+15])? full: none, (first_null_map[j+14])? full: none, - (first_null_map[j+13])? full: none, (first_null_map[j+12])? full: none, - (first_null_map[j+11])? full: none, (first_null_map[j+10])? full: none, - (first_null_map[j+9]) ? full: none, (first_null_map[j+8]) ? full: none, - (first_null_map[j+7]) ? full: none, (first_null_map[j+6]) ? full: none, - (first_null_map[j+5]) ? full: none, (first_null_map[j+4]) ? full: none, - (first_null_map[j+3]) ? full: none, (first_null_map[j+2]) ? full: none, - (first_null_map[j+1]) ? full: none, (first_null_map[j]) ? full: none - ) - : zeros; + const __m128i f_data = _mm_lddqu_si128(reinterpret_cast(second.data+j)); + __m128i bitmask = has_second_null_map ? + _mm_set_epi8( + (second_null_map[j + 15]) ? full : none, (second_null_map[j + 14]) ? full : none, + (second_null_map[j + 13]) ? full : none, (second_null_map[j + 12]) ? full : none, + (second_null_map[j + 11]) ? full : none, (second_null_map[j + 10]) ? full : none, + (second_null_map[j + 9]) ? full : none, (second_null_map[j + 8]) ? full : none, + (second_null_map[j + 7]) ? full : none, (second_null_map[j + 6]) ? full : none, + (second_null_map[j + 5]) ? full : none, (second_null_map[j + 4]) ? full : none, + (second_null_map[j + 3]) ? full : none, (second_null_map[j + 2]) ? full : none, + (second_null_map[j + 1]) ? full : none, (second_null_map[j]) ? full : none) + : zeros; unsigned i = 0; - for (; i < second.size-15 && !has_mask; has_mask = _mm_test_all_ones(bitmask), i += 16) + for (; i < first.size - 15 && !has_mask; has_mask = _mm_test_all_ones(bitmask), i += 16) { - const __m128i s_data = _mm_lddqu_si128(reinterpret_cast(second.data+i)); - const __m128i second_nm_mask = (has_second_null_map)? _mm_lddqu_si128(reinterpret_cast(second_null_map+i)) - : zeros; + const __m128i s_data = _mm_lddqu_si128(reinterpret_cast(first.data+i)); + const __m128i first_nm_mask = has_first_null_map ? 
+ _mm_lddqu_si128(reinterpret_cast(first_null_map+i)) + : zeros; bitmask = _mm_or_si128( _mm_or_si128( @@ -1169,89 +1215,91 @@ inline ALWAYS_INLINE bool sliceHasImplAnyAll Date: Thu, 19 Aug 2021 11:25:14 +0200 Subject: [PATCH 004/239] Adding a null map negative test, the new hasAll implementation needs correction for that case --- src/Functions/tests/gtest_hasAll.cpp | 64 +++++++++++++++++----------- 1 file changed, 39 insertions(+), 25 deletions(-) diff --git a/src/Functions/tests/gtest_hasAll.cpp b/src/Functions/tests/gtest_hasAll.cpp index 310c059bbbc..b7ba59f91c7 100644 --- a/src/Functions/tests/gtest_hasAll.cpp +++ b/src/Functions/tests/gtest_hasAll.cpp @@ -6,57 +6,58 @@ using namespace DB::GatherUtils; template -void array_init(T* elements_to_have, size_t elements_to_have_count, T* set_elements, size_t set_size, bool expected_output) { - for (T i = 0; i < set_size; ++i) +void arrayInit(T* elements_to_have, size_t nb_elements_to_have, T* array_elements, size_t array_size, bool all_elements_present) { + for (T i = 0; i < array_size; ++i) { - set_elements[i] = i; + array_elements[i] = i; } - for (T i = 0; i < elements_to_have_count; ++i) + for (T i = 0; i < nb_elements_to_have; ++i) { - elements_to_have[i] = set_elements[std::rand() % set_size]; + elements_to_have[i] = array_elements[std::rand() % array_size]; } - if (!expected_output) + if (!all_elements_present) { - // make one element to be searched for missing from the target set - elements_to_have[elements_to_have_count - 1] = set_size + 1; + /// make one element to be searched for missing from the target array + elements_to_have[nb_elements_to_have - 1] = array_size + 1; } } -void null_map_init(UInt8 * null_map, size_t null_map_size, size_t null_elements_count) +void nullMapInit(UInt8 * null_map, size_t null_map_size, size_t nb_null_elements) { for (int i = 0; i < null_map_size; ++i) { null_map[i] = 0; } - for (int i = 0; i < null_map_size - 1 && i < null_elements_count; ++i) + for (int i = 0; i < null_map_size - 1 && i < nb_null_elements; ++i) { - null_map[std::rand() % null_map_size - 1] = 1; + null_map[std::rand() % null_map_size] = 1; } } template -bool testHasAll(size_t elements_to_have_count, size_t set_size, bool have_null_map, bool expected_output) +bool testHasAll(size_t nb_elements_to_have, size_t array_size, bool with_null_maps, bool all_elements_present) { - T * set_elements = new T[set_size]; - T * elements_to_have = new T[elements_to_have_count]; + auto array_elements = std::make_unique(array_size); + auto elements_to_have = std::make_unique(nb_elements_to_have); - UInt8 * first_nm = nullptr, * second_nm = nullptr; - if (have_null_map) + std::unique_ptr first_nm = nullptr, second_nm = nullptr; + if (with_null_maps) { - first_nm = new UInt8[set_size]; - second_nm = new UInt8[elements_to_have_count]; - null_map_init(first_nm, set_size, 5); - null_map_init(second_nm, elements_to_have_count, 2); + first_nm = std::make_unique(array_size); + second_nm = std::make_unique(nb_elements_to_have); + /// add a null to elements to have, but not to the target array, making the answer negative + nullMapInit(first_nm.get(), array_size, 0); + nullMapInit(second_nm.get(), nb_elements_to_have, 1); } - array_init(elements_to_have, elements_to_have_count, set_elements, set_size, expected_output); + arrayInit(elements_to_have.get(), nb_elements_to_have, array_elements.get(), array_size, all_elements_present); - NumericArraySlice first = {set_elements, set_size}; - NumericArraySlice second = {elements_to_have, elements_to_have_count}; + 
NumericArraySlice first = {array_elements.get(), array_size}; + NumericArraySlice second = {elements_to_have.get(), nb_elements_to_have}; - /// Check whether all elements of the second array are also elements of the first array, overloaded for various combinations of types. + /// check whether all elements of the second array are also elements of the first array, overloaded for various combinations of types. return sliceHasImplAnyAll, NumericArraySlice, sliceEqualElements >( - first, second, first_nm, second_nm); + first, second, first_nm.get(), second_nm.get()); } TEST(HasAll, integer) @@ -111,3 +112,16 @@ TEST(HasAll, int8) ASSERT_EQ(test3, true); ASSERT_EQ(test4, false); } + +TEST(HasAllSingleNullElement, all) +{ + bool test1 = testHasAll(4, 100, true, true); + bool test2 = testHasAll(4, 100, true, true); + bool test3 = testHasAll(4, 100, true, true); + bool test4 = testHasAll(4, 100, true, true); + + ASSERT_EQ(test1, false); + ASSERT_EQ(test2, false); + ASSERT_EQ(test3, false); + ASSERT_EQ(test4, false); +} From 763bd006a75be454f5c109ef306a0e4e538726b1 Mon Sep 17 00:00:00 2001 From: Jakub Kuklis Date: Thu, 19 Aug 2021 14:13:30 +0200 Subject: [PATCH 005/239] Correcting new hasAll implementation for the case with null elements present in 'second' and absent in 'first', refactoring the outer loop remainder into a separate function, improving null checking in the default implementation --- src/Functions/GatherUtils/Algorithms.h | 307 +++++++++++++------------ 1 file changed, 155 insertions(+), 152 deletions(-) diff --git a/src/Functions/GatherUtils/Algorithms.h b/src/Functions/GatherUtils/Algorithms.h index 2812821e339..d37341b0f81 100644 --- a/src/Functions/GatherUtils/Algorithms.h +++ b/src/Functions/GatherUtils/Algorithms.h @@ -7,10 +7,12 @@ #include #include #include "GatherUtils.h" + #if defined(__AVX2__) || defined(__SSE4_2__) - #include +#include #endif + namespace DB::ErrorCodes { extern const int LOGICAL_ERROR; @@ -495,6 +497,20 @@ std::vector buildKMPPrefixFunction(const SliceType & pattern, const Equa } +inline ALWAYS_INLINE bool hasNull(const UInt8 * null_map, size_t null_map_size) +{ + if (null_map != nullptr) + { + for (size_t i = 0; i < null_map_size; ++i) + { + if (null_map[i]) + return true; + } + } + return false; +} + + /// Methods to check if first array has elements from second array, overloaded for various combinations of types. 
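// The behaviour this patch pins down, in terms of the SQL-level hasAll (values shown are an
// illustration of the intent, matching the HasAllSingleNullElement test above, not generated output):
//   hasAll([1, 2, NULL], [1, NULL]) -> 1   (a null element to find is matched by a null in the target)
//   hasAll([1, 2],       [1, NULL]) -> 0   (nothing in a null-free target can match the null)
// A standalone sketch of the resulting short-circuit for the All case, reusing hasNull() from above
// (the helper name is this sketch's own):
inline bool allSearchDecidedByNulls(
    const UInt8 * first_null_map, size_t first_size,
    const UInt8 * second_null_map, size_t second_size, bool & result)
{
    if (hasNull(second_null_map, second_size) && !hasNull(first_null_map, first_size))
    {
        result = false;   // some element to find is null, but the target array has no null
        return true;      // decided before any element comparison
    }
    return false;         // otherwise the element-wise scan decides
}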
template < ArraySearchType search_type, @@ -506,19 +522,35 @@ bool sliceHasImplAnyAll(const FirstSliceType & first, const SecondSliceType & se const bool has_first_null_map = first_null_map != nullptr; const bool has_second_null_map = second_null_map != nullptr; + const bool has_second_null = hasNull(second_null_map, second.size); + if (has_second_null) + { + const bool has_first_null = hasNull(first_null_map, first.size); + + if (has_first_null && search_type == ArraySearchType::Any) + return true; + + if (!has_first_null && search_type == ArraySearchType::All) + return false; + } + for (size_t i = 0; i < second.size; ++i) { + if (has_second_null_map && second_null_map[i]) + continue; + bool has = false; - for (unsigned j = 0; j < first.size && !has; ++j) + + for (size_t j = 0; j < first.size && !has; ++j) { - const bool is_first_null = has_first_null_map && first_null_map[j]; - const bool is_second_null = has_second_null_map && second_null_map[i]; + if (has_first_null_map && first_null_map[j]) + continue; - if (is_first_null && is_second_null) - has = true; - - if (!is_first_null && !is_second_null && isEqual(first, second, j, i)) + if (isEqual(first, second, j, i)) + { has = true; + break; + } } if (has && search_type == ArraySearchType::Any) @@ -531,21 +563,60 @@ bool sliceHasImplAnyAll(const FirstSliceType & first, const SecondSliceType & se } +#if defined(__AVX2__) || defined(__SSE4_2__) + +template +inline ALWAYS_INLINE bool hasAllIntegralLoopRemainder( + size_t j, const NumericArraySlice & first, const NumericArraySlice & second, const UInt8 * first_null_map, const UInt8 * second_null_map) +{ + const bool has_first_null_map = first_null_map != nullptr; + const bool has_second_null_map = second_null_map != nullptr; + + for (; j < second.size; ++j) + { + // skip null elements since both have at least one - assuming it was checked earlier that at least one element in 'first' is null + if (has_second_null_map && second_null_map[j]) + continue; + + bool found = false; + + for (size_t i = 0; i < first.size; ++i) + { + if (has_first_null_map && first_null_map[i]) + continue; + + if (first.data[i] == second.data[j]) + { + found = true; + break; + } + } + + if (!found) + return false; + } + return true; +} + +#endif + + #if defined(__AVX2__) -// AVX2 - Int specialization +// AVX2 Int specialization template <> inline ALWAYS_INLINE bool sliceHasImplAnyAll, NumericArraySlice, sliceEqualElements >( - const NumericArraySlice & first, const NumericArraySlice & second, const UInt8 * second_null_map, const UInt8 * first_null_map) + const NumericArraySlice & first, const NumericArraySlice & second, const UInt8 * first_null_map, const UInt8 * second_null_map) { if (second.size == 0) return true; - const bool has_first_null_map = first_null_map != nullptr; - const bool has_second_null_map = second_null_map != nullptr; - if (!has_first_null_map && has_second_null_map) + if (!hasNull(first_null_map, first.size) && hasNull(second_null_map, second.size)) return false; - unsigned j = 0; + const bool has_first_null_map = first_null_map != nullptr; + const bool has_second_null_map = second_null_map != nullptr; + + size_t j = 0; short has_mask = 1; const int full = -1, none = 0; const __m256i ones = _mm256_set1_epi32(full); @@ -625,28 +696,16 @@ inline ALWAYS_INLINE bool sliceHasImplAnyAll // inline ALWAYS_INLINE bool sliceHasImplAnyAll, NumericArraySlice, sliceEqualElements >( -// const NumericArraySlice & second, const NumericArraySlice & first, const UInt8 * second_null_map, const UInt8 * first_null_map) 
+// const NumericArraySlice & second, const NumericArraySlice & first, const UInt8 * first_null_map, const UInt8 * second_null_map) // { -// return sliceHasImplAnyAll, NumericArraySlice, sliceEqualElements > ( +// return sliceHasImplAnyAll, NumericArraySlice, sliceEqualElements > ( // static_cast &>(second), static_cast &>(first), second_null_map, first_null_map); // } // AVX2 Int64 specialization template <> inline ALWAYS_INLINE bool sliceHasImplAnyAll, NumericArraySlice, sliceEqualElements >( - const NumericArraySlice & first, const NumericArraySlice & second, const UInt8 * second_null_map, const UInt8 * first_null_map) + const NumericArraySlice & first, const NumericArraySlice & second, const UInt8 * first_null_map, const UInt8 * second_null_map) { if (second.size == 0) return true; - const bool has_first_null_map = first_null_map != nullptr; - const bool has_second_null_map = second_null_map != nullptr; - if (!has_first_null_map && has_second_null_map) + if (!hasNull(first_null_map, first.size) && hasNull(second_null_map, second.size)) return false; - unsigned j = 0; + const bool has_first_null_map = first_null_map != nullptr; + const bool has_second_null_map = second_null_map != nullptr; + + size_t j = 0; short has_mask = 1; const int full = -1, none = 0; const __m256i ones = _mm256_set1_epi64x(full); @@ -694,7 +754,7 @@ inline ALWAYS_INLINE bool sliceHasImplAnyAll(first.data + i)); const __m256i first_nm_mask = _mm256_set_m128i( @@ -729,42 +789,33 @@ inline ALWAYS_INLINE bool sliceHasImplAnyAll 2) + return false; + + return hasAllIntegralLoopRemainder(j, first, second, first_null_map, second_null_map); } // AVX2 Int16_t specialization template <> inline ALWAYS_INLINE bool sliceHasImplAnyAll, NumericArraySlice, sliceEqualElements >( - const NumericArraySlice & first, const NumericArraySlice & second, const UInt8 * second_null_map, const UInt8 * first_null_map) + const NumericArraySlice & first, const NumericArraySlice & second, const UInt8 * first_null_map, const UInt8 * second_null_map) { if (second.size == 0) return true; - const bool has_first_null_map = first_null_map != nullptr; - const bool has_second_null_map = second_null_map != nullptr; - if (!has_first_null_map && has_second_null_map) + if (!hasNull(first_null_map, first.size) && hasNull(second_null_map, second.size)) return false; - unsigned j = 0; + const bool has_first_null_map = first_null_map != nullptr; + const bool has_second_null_map = second_null_map != nullptr; + + size_t j = 0; short has_mask = 1; const int full = -1, none = 0; const __m256i ones = _mm256_set1_epi16(full); @@ -787,7 +838,7 @@ inline ALWAYS_INLINE bool sliceHasImplAnyAll(first.data + i)); const __m256i first_nm_mask = _mm256_set_m128i( @@ -874,26 +925,16 @@ inline ALWAYS_INLINE bool sliceHasImplAnyAll 2) + return false; + + return hasAllIntegralLoopRemainder(j, first, second, first_null_map, second_null_map); } #elif defined(__SSE4_2__) @@ -901,17 +942,18 @@ inline ALWAYS_INLINE bool sliceHasImplAnyAll inline ALWAYS_INLINE bool sliceHasImplAnyAll, NumericArraySlice, sliceEqualElements >( - const NumericArraySlice & first, const NumericArraySlice & second, const UInt8 * second_null_map, const UInt8 * first_null_map) + const NumericArraySlice & first, const NumericArraySlice & second, const UInt8 * first_null_map, const UInt8 * second_null_map) { if (second.size == 0) return true; - const bool has_first_null_map = first_null_map != nullptr; - const bool has_second_null_map = second_null_map != nullptr; - if (!has_first_null_map && 
has_second_null_map) + if (!hasNull(first_null_map, first.size) && hasNull(second_null_map, second.size)) return false; - unsigned j = 0; + const bool has_first_null_map = first_null_map != nullptr; + const bool has_second_null_map = second_null_map != nullptr; + + size_t j = 0; short has_mask = 1; const __m128i zeros = _mm_setzero_si128(); if (second.size > 3 && first.size > 2) @@ -972,36 +1014,27 @@ inline ALWAYS_INLINE bool sliceHasImplAnyAll inline ALWAYS_INLINE bool sliceHasImplAnyAll, NumericArraySlice, sliceEqualElements >( - const NumericArraySlice & first, const NumericArraySlice & second, const UInt8 * second_null_map, const UInt8 * first_null_map) + const NumericArraySlice & first, const NumericArraySlice & second, const UInt8 * first_null_map, const UInt8 * second_null_map) { if (second.size == 0) return true; - const bool has_first_null_map = first_null_map != nullptr; - const bool has_second_null_map = second_null_map != nullptr; - if (!has_first_null_map && has_second_null_map) + if (!hasNull(first_null_map, first.size) && hasNull(second_null_map, second.size)) return false; - unsigned j = 0; + const bool has_first_null_map = first_null_map != nullptr; + const bool has_second_null_map = second_null_map != nullptr; + + size_t j = 0; short has_mask = 1; const Int64 full = -1, none = 0; const __m128i zeros = _mm_setzero_si128(); @@ -1046,36 +1079,27 @@ inline ALWAYS_INLINE bool sliceHasImplAnyAll inline ALWAYS_INLINE bool sliceHasImplAnyAll, NumericArraySlice, sliceEqualElements >( - const NumericArraySlice & first, const NumericArraySlice & second, const UInt8 * second_null_map, const UInt8 * first_null_map) + const NumericArraySlice & first, const NumericArraySlice & second, const UInt8 * first_null_map, const UInt8 * second_null_map) { if (second.size == 0) return true; - const bool has_first_null_map = first_null_map != nullptr; - const bool has_second_null_map = second_null_map != nullptr; - if (!has_first_null_map && has_second_null_map) + if (!hasNull(first_null_map, first.size) && hasNull(second_null_map, second.size)) return false; - unsigned j = 0; + const bool has_first_null_map = first_null_map != nullptr; + const bool has_second_null_map = second_null_map != nullptr; + + size_t j = 0; short has_mask = 1; const int16_t full = -1, none = 0; const __m128i zeros = _mm_setzero_si128(); @@ -1151,36 +1175,27 @@ inline ALWAYS_INLINE bool sliceHasImplAnyAll 2) + return false; + + return hasAllIntegralLoopRemainder(j, first, second, first_null_map, second_null_map); } // SSE4.2 Int8_t specialization template <> inline ALWAYS_INLINE bool sliceHasImplAnyAll, NumericArraySlice, sliceEqualElements >( - const NumericArraySlice & first, const NumericArraySlice & second, const UInt8 * second_null_map, const UInt8 * first_null_map) + const NumericArraySlice & first, const NumericArraySlice & second, const UInt8 * first_null_map, const UInt8 * second_null_map) { if (second.size == 0) return true; - const bool has_first_null_map = first_null_map != nullptr; - const bool has_second_null_map = second_null_map != nullptr; - if (!has_first_null_map && has_second_null_map) + if (!hasNull(first_null_map, first.size) && hasNull(second_null_map, second.size)) return false; - unsigned j = 0; + const bool has_first_null_map = first_null_map != nullptr; + const bool has_second_null_map = second_null_map != nullptr; + + size_t j = 0; short has_mask = 1; const int full = -1, none = 0; const __m128i zeros = _mm_setzero_si128(); @@ -1291,21 +1306,10 @@ inline ALWAYS_INLINE bool sliceHasImplAnyAll 
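After this change each SIMD specialization ends the same way: if the last full block of `second` never reached an all-ones bitmask the answer is false, otherwise the leftover elements go through hasAllIntegralLoopRemainder. A minimal sketch of the lane test the loops rely on (illustrative; this code is only compiled under __AVX2__ in the patch, and avx2BlockDone is this sketch's own name):

#include <immintrin.h>

/// CF of vptest is set iff (~bitmask & ones) == 0, i.e. every lane of the
/// current block of `second` is already matched or marked null.
static inline bool avx2BlockDone(__m256i bitmask)
{
    const __m256i ones = _mm256_set1_epi32(-1);
    return _mm256_testc_si256(bitmask, ones) != 0;
}

/// Typical tail of a specialization after this patch (some also guard on second.size):
///     if (!has_mask)
///         return false;
///     return hasAllIntegralLoopRemainder(j, first, second, first_null_map, second_null_map);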
Date: Thu, 19 Aug 2021 15:29:26 +0200 Subject: [PATCH 006/239] Correcting { placement --- src/Functions/tests/gtest_hasAll.cpp | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/src/Functions/tests/gtest_hasAll.cpp b/src/Functions/tests/gtest_hasAll.cpp index b7ba59f91c7..89f011cd7f1 100644 --- a/src/Functions/tests/gtest_hasAll.cpp +++ b/src/Functions/tests/gtest_hasAll.cpp @@ -6,7 +6,8 @@ using namespace DB::GatherUtils; template -void arrayInit(T* elements_to_have, size_t nb_elements_to_have, T* array_elements, size_t array_size, bool all_elements_present) { +void arrayInit(T* elements_to_have, size_t nb_elements_to_have, T* array_elements, size_t array_size, bool all_elements_present) +{ for (T i = 0; i < array_size; ++i) { array_elements[i] = i; From 169c49c58378a6be1729b1ac2eadaf991d01b1ac Mon Sep 17 00:00:00 2001 From: Jakub Kuklis Date: Fri, 20 Aug 2021 13:00:40 +0200 Subject: [PATCH 007/239] Correcting style and resolving warnings --- src/Functions/tests/gtest_hasAll.cpp | 25 ++++++++++++++++++------- 1 file changed, 18 insertions(+), 7 deletions(-) diff --git a/src/Functions/tests/gtest_hasAll.cpp b/src/Functions/tests/gtest_hasAll.cpp index 89f011cd7f1..ca7bc80b4aa 100644 --- a/src/Functions/tests/gtest_hasAll.cpp +++ b/src/Functions/tests/gtest_hasAll.cpp @@ -1,20 +1,29 @@ +#include #include - #include using namespace DB::GatherUtils; +auto uni_int_dist(int min, int max) +{ + std::random_device rd; + std::mt19937 mt(rd()); + std::uniform_int_distribution<> dist(min, max); + return std::make_pair(dist, mt); +} + template void arrayInit(T* elements_to_have, size_t nb_elements_to_have, T* array_elements, size_t array_size, bool all_elements_present) { - for (T i = 0; i < array_size; ++i) + for (size_t i = 0; i < array_size; ++i) { array_elements[i] = i; } - for (T i = 0; i < nb_elements_to_have; ++i) + auto [dist, gen] = uni_int_dist(0, array_size - 1); + for (size_t i = 0; i < nb_elements_to_have; ++i) { - elements_to_have[i] = array_elements[std::rand() % array_size]; + elements_to_have[i] = array_elements[dist(gen)]; } if (!all_elements_present) { @@ -25,13 +34,15 @@ void arrayInit(T* elements_to_have, size_t nb_elements_to_have, T* array_element void nullMapInit(UInt8 * null_map, size_t null_map_size, size_t nb_null_elements) { - for (int i = 0; i < null_map_size; ++i) + /// -2 to keep the last element of the array non-null + auto [dist, gen] = uni_int_dist(0, null_map_size - 2); + for (size_t i = 0; i < null_map_size; ++i) { null_map[i] = 0; } - for (int i = 0; i < null_map_size - 1 && i < nb_null_elements; ++i) + for (size_t i = 0; i < null_map_size - 1 && i < nb_null_elements; ++i) { - null_map[std::rand() % null_map_size] = 1; + null_map[dist(gen)] = 1; } } From a3c08acac3d3ca3239b19dc09cf4bfb3730c37d3 Mon Sep 17 00:00:00 2001 From: Jakub Kuklis Date: Thu, 26 Aug 2021 12:07:56 +0200 Subject: [PATCH 008/239] Moving sliceHasImplAnyAll and sliceEqualElements to separate header files to avoid SIMD instructions bloat in Algorithms.h --- src/Functions/GatherUtils/Algorithms.h | 856 +----------------- .../GatherUtils/sliceEqualElements.h | 41 + .../GatherUtils/sliceHasImplAnyAll.h | 839 +++++++++++++++++ 3 files changed, 882 insertions(+), 854 deletions(-) create mode 100644 src/Functions/GatherUtils/sliceEqualElements.h create mode 100644 src/Functions/GatherUtils/sliceHasImplAnyAll.h diff --git a/src/Functions/GatherUtils/Algorithms.h b/src/Functions/GatherUtils/Algorithms.h index d37341b0f81..4bab415f199 100644 --- a/src/Functions/GatherUtils/Algorithms.h +++ 
b/src/Functions/GatherUtils/Algorithms.h @@ -7,10 +7,8 @@ #include #include #include "GatherUtils.h" - -#if defined(__AVX2__) || defined(__SSE4_2__) -#include -#endif +#include "sliceEqualElements.h" +#include "sliceHasImplAnyAll.h" namespace DB::ErrorCodes @@ -422,38 +420,6 @@ void NO_INLINE conditional(SourceA && src_a, SourceB && src_b, Sink && sink, con } -template -bool sliceEqualElements(const NumericArraySlice & first [[maybe_unused]], - const NumericArraySlice & second [[maybe_unused]], - size_t first_ind [[maybe_unused]], - size_t second_ind [[maybe_unused]]) -{ - /// TODO: Decimal scale - if constexpr (is_decimal && is_decimal) - return accurate::equalsOp(first.data[first_ind].value, second.data[second_ind].value); - else if constexpr (is_decimal || is_decimal) - return false; - else - return accurate::equalsOp(first.data[first_ind], second.data[second_ind]); -} - -template -bool sliceEqualElements(const NumericArraySlice &, const GenericArraySlice &, size_t, size_t) -{ - return false; -} - -template -bool sliceEqualElements(const GenericArraySlice &, const NumericArraySlice &, size_t, size_t) -{ - return false; -} - -inline ALWAYS_INLINE bool sliceEqualElements(const GenericArraySlice & first, const GenericArraySlice & second, size_t first_ind, size_t second_ind) -{ - return first.elements->compareAt(first_ind + first.begin, second_ind + second.begin, *second.elements, -1) == 0; -} - template bool insliceEqualElements(const NumericArraySlice & first [[maybe_unused]], size_t first_ind [[maybe_unused]], @@ -497,824 +463,6 @@ std::vector buildKMPPrefixFunction(const SliceType & pattern, const Equa } -inline ALWAYS_INLINE bool hasNull(const UInt8 * null_map, size_t null_map_size) -{ - if (null_map != nullptr) - { - for (size_t i = 0; i < null_map_size; ++i) - { - if (null_map[i]) - return true; - } - } - return false; -} - - -/// Methods to check if first array has elements from second array, overloaded for various combinations of types. 
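// The block removed below moves into the new sliceHasImplAnyAll.h (and the comparison helpers into
// sliceEqualElements.h), which Algorithms.h now includes as shown above. A plausible skeleton of the
// new header, so the intent of the split is visible here (a sketch, not the literal new file):
//
//     #pragma once
//
//     #include "GatherUtils.h"            // presumably, for the slice types and ArraySearchType
//
//     #if defined(__AVX2__) || defined(__SSE4_2__)
//     #include <immintrin.h>              // SIMD intrinsics now stay out of Algorithms.h
//     #endif
//
//     namespace DB::GatherUtils
//     {
//         /// generic sliceHasImplAnyAll + the SSE4.2 / AVX2 hasAll specializations
//     }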
-template < - ArraySearchType search_type, - typename FirstSliceType, - typename SecondSliceType, - bool (*isEqual)(const FirstSliceType &, const SecondSliceType &, size_t, size_t)> -bool sliceHasImplAnyAll(const FirstSliceType & first, const SecondSliceType & second, const UInt8 * first_null_map, const UInt8 * second_null_map) -{ - const bool has_first_null_map = first_null_map != nullptr; - const bool has_second_null_map = second_null_map != nullptr; - - const bool has_second_null = hasNull(second_null_map, second.size); - if (has_second_null) - { - const bool has_first_null = hasNull(first_null_map, first.size); - - if (has_first_null && search_type == ArraySearchType::Any) - return true; - - if (!has_first_null && search_type == ArraySearchType::All) - return false; - } - - for (size_t i = 0; i < second.size; ++i) - { - if (has_second_null_map && second_null_map[i]) - continue; - - bool has = false; - - for (size_t j = 0; j < first.size && !has; ++j) - { - if (has_first_null_map && first_null_map[j]) - continue; - - if (isEqual(first, second, j, i)) - { - has = true; - break; - } - } - - if (has && search_type == ArraySearchType::Any) - return true; - - if (!has && search_type == ArraySearchType::All) - return false; - } - return search_type == ArraySearchType::All; -} - - -#if defined(__AVX2__) || defined(__SSE4_2__) - -template -inline ALWAYS_INLINE bool hasAllIntegralLoopRemainder( - size_t j, const NumericArraySlice & first, const NumericArraySlice & second, const UInt8 * first_null_map, const UInt8 * second_null_map) -{ - const bool has_first_null_map = first_null_map != nullptr; - const bool has_second_null_map = second_null_map != nullptr; - - for (; j < second.size; ++j) - { - // skip null elements since both have at least one - assuming it was checked earlier that at least one element in 'first' is null - if (has_second_null_map && second_null_map[j]) - continue; - - bool found = false; - - for (size_t i = 0; i < first.size; ++i) - { - if (has_first_null_map && first_null_map[i]) - continue; - - if (first.data[i] == second.data[j]) - { - found = true; - break; - } - } - - if (!found) - return false; - } - return true; -} - -#endif - - -#if defined(__AVX2__) -// AVX2 Int specialization -template <> -inline ALWAYS_INLINE bool sliceHasImplAnyAll, NumericArraySlice, sliceEqualElements >( - const NumericArraySlice & first, const NumericArraySlice & second, const UInt8 * first_null_map, const UInt8 * second_null_map) -{ - if (second.size == 0) - return true; - - if (!hasNull(first_null_map, first.size) && hasNull(second_null_map, second.size)) - return false; - - const bool has_first_null_map = first_null_map != nullptr; - const bool has_second_null_map = second_null_map != nullptr; - - size_t j = 0; - short has_mask = 1; - const int full = -1, none = 0; - const __m256i ones = _mm256_set1_epi32(full); - const __m256i zeros = _mm256_setzero_si256(); - if (second.size > 7 && first.size > 7) - { - for (; j < second.size - 7 && has_mask; j += 8) - { - has_mask = 0; - const __m256i f_data = _mm256_lddqu_si256(reinterpret_cast(second.data + j)); - // bitmask is filled with minus ones for ones which are considered as null in the corresponding null map, 0 otherwise; - __m256i bitmask = has_second_null_map ? - _mm256_set_epi32( - (second_null_map[j + 7]) ? full : none, - (second_null_map[j + 6]) ? full : none, - (second_null_map[j + 5]) ? full : none, - (second_null_map[j + 4]) ? full : none, - (second_null_map[j + 3]) ? full : none, - (second_null_map[j + 2]) ? 
full : none, - (second_null_map[j + 1]) ? full : none, - (second_null_map[j]) ? full : none) - : zeros; - - size_t i = 0; - // Search first array to try to match all second elements - for (; i < first.size - 7 && !has_mask; has_mask = _mm256_testc_si256(bitmask, ones), i += 8) - { - const __m256i s_data = _mm256_lddqu_si256(reinterpret_cast(first.data + i)); - // Create a mask to avoid to compare null elements - // set_m128i takes two arguments: (high segment, low segment) that are two __m128i convert from 8bits to 32bits to fit to our following operations - const __m256i first_nm_mask = _mm256_set_m128i( - _mm_cvtepi8_epi32(_mm_lddqu_si128(reinterpret_cast(first_null_map + i + 4))), - _mm_cvtepi8_epi32(_mm_lddqu_si128(reinterpret_cast(first_null_map + i)))); - bitmask = - _mm256_or_si256( - _mm256_or_si256( - _mm256_or_si256( - _mm256_or_si256( - _mm256_andnot_si256( - first_nm_mask, - _mm256_cmpeq_epi32(f_data, s_data)), - _mm256_andnot_si256( - _mm256_permutevar8x32_epi32(first_nm_mask, _mm256_set_epi32(6,5,4,3,2,1,0,7)), - _mm256_cmpeq_epi32(f_data, _mm256_permutevar8x32_epi32(s_data, _mm256_set_epi32(6,5,4,3,2,1,0,7))))), - _mm256_or_si256( - _mm256_andnot_si256( - _mm256_permutevar8x32_epi32(first_nm_mask, _mm256_set_epi32(5,4,3,2,1,0,7,6)), - _mm256_cmpeq_epi32(f_data, _mm256_permutevar8x32_epi32(s_data, _mm256_set_epi32(5,4,3,2,1,0,7,6)))), - _mm256_andnot_si256( - _mm256_permutevar8x32_epi32(first_nm_mask, _mm256_set_epi32(4,3,2,1,0,7,6,5)), - _mm256_cmpeq_epi32(f_data, _mm256_permutevar8x32_epi32(s_data, _mm256_set_epi32(4,3,2,1,0,7,6,5))))) - ), - _mm256_or_si256( - _mm256_or_si256( - _mm256_andnot_si256( - _mm256_permutevar8x32_epi32(first_nm_mask, _mm256_set_epi32(3,2,1,0,7,6,5,4)), - _mm256_cmpeq_epi32(f_data, _mm256_permutevar8x32_epi32(s_data, _mm256_set_epi32(3,2,1,0,7,6,5,4)))), - _mm256_andnot_si256( - _mm256_permutevar8x32_epi32(first_nm_mask, _mm256_set_epi32(2,1,0,7,6,5,4,3)), - _mm256_cmpeq_epi32(f_data, _mm256_permutevar8x32_epi32(s_data, _mm256_set_epi32(2,1,0,7,6,5,4,3))))), - _mm256_or_si256( - _mm256_andnot_si256( - _mm256_permutevar8x32_epi32(first_nm_mask, _mm256_set_epi32(1,0,7,6,5,4,3,2)), - _mm256_cmpeq_epi32(f_data, _mm256_permutevar8x32_epi32(s_data, _mm256_set_epi32(1,0,7,6,5,4,3,2)))), - _mm256_andnot_si256( - _mm256_permutevar8x32_epi32(first_nm_mask, _mm256_set_epi32(0,7,6,5,4,3,2,1)), - _mm256_cmpeq_epi32(f_data, _mm256_permutevar8x32_epi32(s_data, _mm256_set_epi32(0,7,6,5,4,3,2,1))))))), - bitmask); - } - - if (i < first.size) - { - // Loop(i)-jam - for (; i < first.size && !has_mask; ++i) - { - if (has_first_null_map && first_null_map[i]) - continue; - __m256i v_i = _mm256_set1_epi32(first.data[i]); - bitmask = _mm256_or_si256(bitmask, _mm256_cmpeq_epi32(f_data, v_i)); - has_mask = _mm256_testc_si256(bitmask, ones); - } - } - } - } - - if (!has_mask) - return false; - - return hasAllIntegralLoopRemainder(j, first, second, first_null_map, second_null_map); -} - -// TODO: Discuss about -// raise an error : "error: no viable conversion from 'const NumericArraySlice' to 'const NumericArraySlice'" -// How should we do, copy past each function ?? I haven't found a way to specialize a same function body for two different types. 
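-// One conventional answer to the TODO above (a sketch, not what this patch does): funnel the signed
-// and unsigned specializations into one helper templated on the element type, so the intrinsics body
-// exists once and each explicit specialization of sliceHasImplAnyAll only forwards to it, e.g.:
-//
-//     template <typename T>
-//     bool hasAllIntegral32Impl(
-//         const NumericArraySlice<T> & first, const NumericArraySlice<T> & second,
-//         const UInt8 * first_null_map, const UInt8 * second_null_map);   // one AVX2 body for Int32/UInt32
-//
-//     // inside the UInt32 specialization:
-//     //     return hasAllIntegral32Impl<UInt32>(first, second, first_null_map, second_null_map);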
-// AVX2 UInt specialization -// template <> -// inline ALWAYS_INLINE bool sliceHasImplAnyAll, NumericArraySlice, sliceEqualElements >( -// const NumericArraySlice & second, const NumericArraySlice & first, const UInt8 * first_null_map, const UInt8 * second_null_map) -// { -// return sliceHasImplAnyAll, NumericArraySlice, sliceEqualElements > ( -// static_cast &>(second), static_cast &>(first), second_null_map, first_null_map); -// } - -// AVX2 Int64 specialization -template <> -inline ALWAYS_INLINE bool sliceHasImplAnyAll, NumericArraySlice, sliceEqualElements >( - const NumericArraySlice & first, const NumericArraySlice & second, const UInt8 * first_null_map, const UInt8 * second_null_map) -{ - if (second.size == 0) - return true; - - if (!hasNull(first_null_map, first.size) && hasNull(second_null_map, second.size)) - return false; - - const bool has_first_null_map = first_null_map != nullptr; - const bool has_second_null_map = second_null_map != nullptr; - - size_t j = 0; - short has_mask = 1; - const int full = -1, none = 0; - const __m256i ones = _mm256_set1_epi64x(full); - const __m256i zeros = _mm256_setzero_si256(); - if (second.size > 3 && first.size > 3) - { - for (; j < second.size - 3 && has_mask; j += 4) - { - has_mask = 0; - const __m256i f_data = _mm256_lddqu_si256(reinterpret_cast(second.data + j)); - __m256i bitmask = has_second_null_map ? - _mm256_set_epi64x( - (second_null_map[j + 3])? full : none, - (second_null_map[j + 2])? full : none, - (second_null_map[j + 1])? full : none, - (second_null_map[j]) ? full : none) - : zeros; - - unsigned i = 0; - for (; i < first.size - 3 && !has_mask; has_mask = _mm256_testc_si256(bitmask, ones), i += 4) - { - const __m256i s_data = _mm256_lddqu_si256(reinterpret_cast(first.data + i)); - const __m256i first_nm_mask = _mm256_set_m128i( - _mm_cvtepi8_epi64(_mm_lddqu_si128(reinterpret_cast(first_null_map + i + 2))), - _mm_cvtepi8_epi64(_mm_lddqu_si128(reinterpret_cast(first_null_map + i)))); - bitmask = - _mm256_or_si256( - _mm256_or_si256( - _mm256_or_si256( - _mm256_andnot_si256( - first_nm_mask, - _mm256_cmpeq_epi64(f_data, s_data)), - _mm256_andnot_si256( - _mm256_permutevar8x32_epi32(first_nm_mask, _mm256_set_epi32(5,4,3,2,1,0,7,6)), - _mm256_cmpeq_epi64(f_data, _mm256_permutevar8x32_epi32(s_data, _mm256_set_epi32(5,4,3,2,1,0,7,6))))), - - _mm256_or_si256( - _mm256_andnot_si256( - _mm256_permutevar8x32_epi32(first_nm_mask, _mm256_set_epi32(3,2,1,0,7,6,5,4)), - _mm256_cmpeq_epi64(f_data, _mm256_permutevar8x32_epi32(s_data, _mm256_set_epi32(3,2,1,0,7,6,5,4)))), - _mm256_andnot_si256( - _mm256_permutevar8x32_epi32(first_nm_mask, _mm256_set_epi32(1,0,7,6,5,4,3,2)), - _mm256_cmpeq_epi64(f_data, _mm256_permutevar8x32_epi32(s_data, _mm256_set_epi32(1,0,7,6,5,4,3,2)))))), - bitmask); - } - - if (i < first.size) - { - for (; i < first.size && !has_mask; ++i) - { - if (has_first_null_map && first_null_map[i]) - continue; - __m256i v_i = _mm256_set1_epi64x(first.data[i]); - bitmask = _mm256_or_si256(bitmask, _mm256_cmpeq_epi64(f_data, v_i)); - has_mask = _mm256_testc_si256(bitmask, ones); - } - } - } - } - - if (!has_mask && second.size > 2) - return false; - - return hasAllIntegralLoopRemainder(j, first, second, first_null_map, second_null_map); -} - -// AVX2 Int16_t specialization -template <> -inline ALWAYS_INLINE bool sliceHasImplAnyAll, NumericArraySlice, sliceEqualElements >( - const NumericArraySlice & first, const NumericArraySlice & second, const UInt8 * first_null_map, const UInt8 * second_null_map) -{ - if (second.size == 0) - 
return true; - - if (!hasNull(first_null_map, first.size) && hasNull(second_null_map, second.size)) - return false; - - const bool has_first_null_map = first_null_map != nullptr; - const bool has_second_null_map = second_null_map != nullptr; - - size_t j = 0; - short has_mask = 1; - const int full = -1, none = 0; - const __m256i ones = _mm256_set1_epi16(full); - const __m256i zeros = _mm256_setzero_si256(); - if (second.size > 15 && first.size > 15) - { - for (; j < second.size - 15 && has_mask; j += 16) - { - has_mask = 0; - const __m256i f_data = _mm256_lddqu_si256(reinterpret_cast(second.data + j)); - __m256i bitmask = has_second_null_map ? - _mm256_set_epi16( - (second_null_map[j + 15]) ? full : none, (second_null_map[j + 14]) ? full : none, - (second_null_map[j + 13]) ? full : none, (second_null_map[j + 12]) ? full : none, - (second_null_map[j + 11]) ? full : none, (second_null_map[j + 10]) ? full : none, - (second_null_map[j + 9]) ? full : none, (second_null_map[j + 8])? full : none, - (second_null_map[j + 7]) ? full : none, (second_null_map[j + 6])? full : none, - (second_null_map[j + 5]) ? full : none, (second_null_map[j + 4])? full : none, - (second_null_map[j + 3]) ? full : none, (second_null_map[j + 2])? full : none, - (second_null_map[j + 1]) ? full : none, (second_null_map[j]) ? full : none) - : zeros; - unsigned i = 0; - for (; i < first.size - 15 && !has_mask; has_mask = _mm256_testc_si256(bitmask, ones), i += 16) - { - const __m256i s_data = _mm256_lddqu_si256(reinterpret_cast(first.data + i)); - const __m256i first_nm_mask = _mm256_set_m128i( - _mm_cvtepi8_epi16(_mm_lddqu_si128(reinterpret_cast(first_null_map+i+8))), - _mm_cvtepi8_epi16(_mm_lddqu_si128(reinterpret_cast(first_null_map+i)))); - bitmask = - _mm256_or_si256( - _mm256_or_si256( - _mm256_or_si256( - _mm256_or_si256( - _mm256_or_si256( - _mm256_andnot_si256( - first_nm_mask, - _mm256_cmpeq_epi16(f_data, s_data)), - _mm256_andnot_si256( - _mm256_shuffle_epi8(first_nm_mask, _mm256_set_epi8(29,28,27,26,25,24,23,22,21,20,19,18,17,16,15,14,13,12,11,10,9,8,7,6,5,4,3,2,1,0,31,30)), - _mm256_cmpeq_epi16(f_data, _mm256_shuffle_epi8(s_data, _mm256_set_epi8(29,28,27,26,25,24,23,22,21,20,19,18,17,16,15,14,13,12,11,10,9,8,7,6,5,4,3,2,1,0,31,30))))), - _mm256_or_si256( - _mm256_andnot_si256( - _mm256_shuffle_epi8(first_nm_mask, _mm256_set_epi8(27,26,25,24,23,22,21,20,19,18,17,16,15,14,13,12,11,10,9,8,7,6,5,4,3,2,1,0,31,30,29,28)), - _mm256_cmpeq_epi16(f_data, _mm256_shuffle_epi8(s_data, _mm256_set_epi8(27,26,25,24,23,22,21,20,19,18,17,16,15,14,13,12,11,10,9,8,7,6,5,4,3,2,1,0,31,30,29,28)))), - _mm256_andnot_si256( - _mm256_shuffle_epi8(first_nm_mask, _mm256_set_epi8(25,24,23,22,21,20,19,18,17,16,15,14,13,12,11,10,9,8,7,6,5,4,3,2,1,0,31,30,29,28,27,26)), - _mm256_cmpeq_epi16(f_data, _mm256_shuffle_epi8(s_data, _mm256_set_epi8(25,24,23,22,21,20,19,18,17,16,15,14,13,12,11,10,9,8,7,6,5,4,3,2,1,0,31,30,29,28,27,26))))) - ), - _mm256_or_si256( - _mm256_or_si256( - _mm256_andnot_si256( - _mm256_shuffle_epi8(first_nm_mask, _mm256_set_epi8(23,22,21,20,19,18,17,16,15,14,13,12,11,10,9,8,7,6,5,4,3,2,1,0,31,30,29,28,27,26,25,24)), - _mm256_cmpeq_epi16(f_data, _mm256_shuffle_epi8(s_data, _mm256_set_epi8(23,22,21,20,19,18,17,16,15,14,13,12,11,10,9,8,7,6,5,4,3,2,1,0,31,30,29,28,27,26,25,24)))), - _mm256_andnot_si256( - _mm256_shuffle_epi8(first_nm_mask, _mm256_set_epi8(21,20,19,18,17,16,15,14,13,12,11,10,9,8,7,6,5,4,3,2,1,0,31,30,29,28,27,26,25,24,23,22)), - _mm256_cmpeq_epi16(f_data, _mm256_shuffle_epi8(s_data, 
_mm256_set_epi8(21,20,19,18,17,16,15,14,13,12,11,10,9,8,7,6,5,4,3,2,1,0,31,30,29,28,27,26,25,24,23,22))))), - _mm256_or_si256( - _mm256_andnot_si256( - _mm256_shuffle_epi8(first_nm_mask, _mm256_set_epi8(19,18,17,16,15,14,13,12,11,10,9,8,7,6,5,4,3,2,1,0,31,30,29,28,27,26,25,24,23,22,21,20)), - _mm256_cmpeq_epi16(f_data, _mm256_shuffle_epi8(s_data, _mm256_set_epi8(19,18,17,16,15,14,13,12,11,10,9,8,7,6,5,4,3,2,1,0,31,30,29,28,27,26,25,24,23,22,21,20)))), - _mm256_andnot_si256( - _mm256_shuffle_epi8(first_nm_mask, _mm256_set_epi8(17,16,15,14,13,12,11,10,9,8,7,6,5,4,3,2,1,0,31,30,29,28,27,26,25,24,23,22,21,20,19,18)), - _mm256_cmpeq_epi16(f_data, _mm256_shuffle_epi8(s_data, _mm256_set_epi8(17,16,15,14,13,12,11,10,9,8,7,6,5,4,3,2,1,0,31,30,29,28,27,26,25,24,23,22,21,20,19,18)))))) - ), - _mm256_or_si256( - _mm256_or_si256( - _mm256_or_si256( - _mm256_andnot_si256( - _mm256_permute2x128_si256(first_nm_mask, first_nm_mask,1), - _mm256_cmpeq_epi16(f_data, _mm256_permute2x128_si256(s_data, s_data, 1))), - _mm256_andnot_si256( - _mm256_shuffle_epi8(_mm256_permute2x128_si256(first_nm_mask, first_nm_mask, 1), _mm256_set_epi8(13,12,11,10,9,8,7,6,5,4,3,2,1,0,31,30,29,28,27,26,25,24,23,22,21,20,19,18,17,16,15,14)), - _mm256_cmpeq_epi16(f_data, _mm256_shuffle_epi8(_mm256_permute2x128_si256(s_data, s_data, 1), _mm256_set_epi8(13,12,11,10,9,8,7,6,5,4,3,2,1,0,31,30,29,28,27,26,25,24,23,22,21,20,19,18,17,16,15,14))))), - _mm256_or_si256( - _mm256_andnot_si256( - _mm256_shuffle_epi8(_mm256_permute2x128_si256(first_nm_mask, first_nm_mask, 1), _mm256_set_epi8(11,10,9,8,7,6,5,4,3,2,1,0,31,30,29,28,27,26,25,24,23,22,21,20,19,18,17,16,15,14,13,12)), - _mm256_cmpeq_epi16(f_data, _mm256_shuffle_epi8(_mm256_permute2x128_si256(s_data, s_data, 1), _mm256_set_epi8(11,10,9,8,7,6,5,4,3,2,1,0,31,30,29,28,27,26,25,24,23,22,21,20,19,18,17,16,15,14,13,12)))), - _mm256_andnot_si256( - _mm256_shuffle_epi8(_mm256_permute2x128_si256(first_nm_mask, first_nm_mask, 1), _mm256_set_epi8(9,8,7,6,5,4,3,2,1,0,31,30,29,28,27,26,25,24,23,22,21,20,19,18,17,16,15,14,13,12,11,10)), - _mm256_cmpeq_epi16(f_data, _mm256_shuffle_epi8(_mm256_permute2x128_si256(s_data, s_data, 1), _mm256_set_epi8(9,8,7,6,5,4,3,2,1,0,31,30,29,28,27,26,25,24,23,22,21,20,19,18,17,16,15,14,13,12,11,10))))) - ), - _mm256_or_si256( - _mm256_or_si256( - _mm256_andnot_si256( - _mm256_shuffle_epi8(_mm256_permute2x128_si256(first_nm_mask, first_nm_mask, 1), _mm256_set_epi8(7,6,5,4,3,2,1,0,31,30,29,28,27,26,25,24,23,22,21,20,19,18,17,16,15,14,13,12,11,10,9,8)), - _mm256_cmpeq_epi16(f_data, _mm256_shuffle_epi8(_mm256_permute2x128_si256(s_data ,s_data, 1), _mm256_set_epi8(7,6,5,4,3,2,1,0,31,30,29,28,27,26,25,24,23,22,21,20,19,18,17,16,15,14,13,12,11,10,9,8)))), - _mm256_andnot_si256( - _mm256_shuffle_epi8(_mm256_permute2x128_si256(first_nm_mask, first_nm_mask, 1), _mm256_set_epi8(5,4,3,2,1,0,31,30,29,28,27,26,25,24,23,22,21,20,19,18,17,16,15,14,13,12,11,10,9,8,7,6)), - _mm256_cmpeq_epi16(f_data, _mm256_shuffle_epi8(_mm256_permute2x128_si256(s_data, s_data, 1), _mm256_set_epi8(5,4,3,2,1,0,31,30,29,28,27,26,25,24,23,22,21,20,19,18,17,16,15,14,13,12,11,10,9,8,7,6))))), - _mm256_or_si256( - _mm256_andnot_si256( - _mm256_shuffle_epi8(_mm256_permute2x128_si256(first_nm_mask, first_nm_mask, 1), _mm256_set_epi8(3,2,1,0,31,30,29,28,27,26,25,24,23,22,21,20,19,18,17,16,15,14,13,12,11,10,9,8,7,6,5,4)), - _mm256_cmpeq_epi16(f_data, _mm256_shuffle_epi8(_mm256_permute2x128_si256(s_data ,s_data ,1), 
_mm256_set_epi8(3,2,1,0,31,30,29,28,27,26,25,24,23,22,21,20,19,18,17,16,15,14,13,12,11,10,9,8,7,6,5,4)))), - _mm256_andnot_si256( - _mm256_shuffle_epi8(_mm256_permute2x128_si256(first_nm_mask, first_nm_mask, 1), _mm256_set_epi8(1,0,31,30,29,28,27,26,25,24,23,22,21,20,19,18,17,16,15,14,13,12,11,10,9,8,7,6,5,4,3,2)), - _mm256_cmpeq_epi16(f_data, _mm256_shuffle_epi8(_mm256_permute2x128_si256(s_data, s_data, 1), _mm256_set_epi8(1,0,31,30,29,28,27,26,25,24,23,22,21,20,19,18,17,16,15,14,13,12,11,10,9,8,7,6,5,4,3,2)))))) - ) - ), - bitmask); - } - - if (i < first.size) - { - for (; i < first.size && !has_mask; ++i) - { - if (has_first_null_map && first_null_map[i]) - continue; - __m256i v_i = _mm256_set1_epi16(first.data[i]); - bitmask = _mm256_or_si256(bitmask, _mm256_cmpeq_epi16(f_data, v_i)); - has_mask = _mm256_testc_si256(bitmask, ones); - } - } - } - } - - if (!has_mask && second.size > 2) - return false; - - return hasAllIntegralLoopRemainder(j, first, second, first_null_map, second_null_map); -} - -#elif defined(__SSE4_2__) - -// SSE4.2 Int specialization -template <> -inline ALWAYS_INLINE bool sliceHasImplAnyAll, NumericArraySlice, sliceEqualElements >( - const NumericArraySlice & first, const NumericArraySlice & second, const UInt8 * first_null_map, const UInt8 * second_null_map) -{ - if (second.size == 0) - return true; - - if (!hasNull(first_null_map, first.size) && hasNull(second_null_map, second.size)) - return false; - - const bool has_first_null_map = first_null_map != nullptr; - const bool has_second_null_map = second_null_map != nullptr; - - size_t j = 0; - short has_mask = 1; - const __m128i zeros = _mm_setzero_si128(); - if (second.size > 3 && first.size > 2) - { - const int full = -1, none = 0; - for (; j < second.size - 3 && has_mask; j += 4) - { - has_mask = 0; - const __m128i f_data = _mm_lddqu_si128(reinterpret_cast(second.data + j)); - __m128i bitmask = has_second_null_map ? - _mm_set_epi32( - (second_null_map[j + 3]) ? full : none, - (second_null_map[j + 2]) ? full : none, - (second_null_map[j + 1]) ? full : none, - (second_null_map[j]) ? full : none) - : zeros; - - unsigned i = 0; - for (; i < first.size - 3 && !has_mask; has_mask = _mm_test_all_ones(bitmask), i += 4) - { - const __m128i s_data = _mm_lddqu_si128(reinterpret_cast(first.data + i)); - const __m128i first_nm_mask = has_first_null_map ? 
- _mm_cvtepi8_epi32(_mm_lddqu_si128(reinterpret_cast(first_null_map + i))) - : zeros; - - bitmask = - _mm_or_si128( - _mm_or_si128( - _mm_or_si128( - _mm_andnot_si128( - first_nm_mask, - _mm_cmpeq_epi32(f_data, s_data)), - _mm_andnot_si128( - _mm_shuffle_epi32(first_nm_mask, _MM_SHUFFLE(2,1,0,3)), - _mm_cmpeq_epi32(f_data, _mm_shuffle_epi32(s_data, _MM_SHUFFLE(2,1,0,3))))), - _mm_or_si128( - _mm_andnot_si128( - _mm_shuffle_epi32(first_nm_mask, _MM_SHUFFLE(1,0,3,2)), - _mm_cmpeq_epi32(f_data, _mm_shuffle_epi32(s_data, _MM_SHUFFLE(1,0,3,2)))), - _mm_andnot_si128( - _mm_shuffle_epi32(first_nm_mask, _MM_SHUFFLE(0,3,2,1)), - _mm_cmpeq_epi32(f_data, _mm_shuffle_epi32(s_data, _MM_SHUFFLE(0,3,2,1))))) - ), - bitmask); - } - - if (i < first.size) - { - for (; i < first.size && !has_mask; ++i) - { - if (has_first_null_map && first_null_map[i]) - continue; - __m128i r_i = _mm_set1_epi32(first.data[i]); - bitmask = _mm_or_si128(bitmask, _mm_cmpeq_epi32(f_data, r_i)); - has_mask = _mm_test_all_ones(bitmask); - } - } - } - } - - if (!has_mask) - return false; - - return hasAllIntegralLoopRemainder(j, first, second, first_null_map, second_null_map); -} - -// SSE4.2 Int64 specialization -template <> -inline ALWAYS_INLINE bool sliceHasImplAnyAll, NumericArraySlice, sliceEqualElements >( - const NumericArraySlice & first, const NumericArraySlice & second, const UInt8 * first_null_map, const UInt8 * second_null_map) -{ - if (second.size == 0) - return true; - - if (!hasNull(first_null_map, first.size) && hasNull(second_null_map, second.size)) - return false; - - const bool has_first_null_map = first_null_map != nullptr; - const bool has_second_null_map = second_null_map != nullptr; - - size_t j = 0; - short has_mask = 1; - const Int64 full = -1, none = 0; - const __m128i zeros = _mm_setzero_si128(); - for (; j < second.size - 1 && has_mask; j += 2) - { - has_mask = 0; - const __m128i f_data = _mm_lddqu_si128(reinterpret_cast(second.data + j)); - __m128i bitmask = has_second_null_map ? - _mm_set_epi64x( - (second_null_map[j + 1]) ? full : none, - (second_null_map[j]) ? full : none) - : zeros; - unsigned i = 0; - for (; i < first.size - 1 && !has_mask; has_mask = _mm_test_all_ones(bitmask), i += 2) - { - const __m128i s_data = _mm_lddqu_si128(reinterpret_cast(first.data + i)); - const __m128i first_nm_mask = has_first_null_map ? 
- _mm_cvtepi8_epi64(_mm_lddqu_si128(reinterpret_cast(first_null_map + i))) - : zeros; - bitmask = - _mm_or_si128( - _mm_or_si128( - _mm_andnot_si128( - first_nm_mask, - _mm_cmpeq_epi32(f_data, s_data)), - _mm_andnot_si128( - _mm_shuffle_epi32(first_nm_mask, _MM_SHUFFLE(1,0,3,2)), - _mm_cmpeq_epi64(f_data, _mm_shuffle_epi32(s_data, _MM_SHUFFLE(1,0,3,2))))), - bitmask); - } - - if (i < first.size) - { - for (; i < first.size && !has_mask; ++i) - { - if (has_first_null_map && first_null_map[i]) - continue; - __m128i v_i = _mm_set1_epi64x(first.data[i]); - bitmask = _mm_or_si128(bitmask, _mm_cmpeq_epi64(f_data, v_i)); - has_mask = _mm_test_all_ones(bitmask); - } - } - } - - if (!has_mask) - return false; - - return hasAllIntegralLoopRemainder(j, first, second, first_null_map, second_null_map); -} - -// SSE4.2 Int16_t specialization -template <> -inline ALWAYS_INLINE bool sliceHasImplAnyAll, NumericArraySlice, sliceEqualElements >( - const NumericArraySlice & first, const NumericArraySlice & second, const UInt8 * first_null_map, const UInt8 * second_null_map) -{ - if (second.size == 0) - return true; - - if (!hasNull(first_null_map, first.size) && hasNull(second_null_map, second.size)) - return false; - - const bool has_first_null_map = first_null_map != nullptr; - const bool has_second_null_map = second_null_map != nullptr; - - size_t j = 0; - short has_mask = 1; - const int16_t full = -1, none = 0; - const __m128i zeros = _mm_setzero_si128(); - if (second.size > 6 && first.size > 6) - { - for (; j < second.size - 7 && has_mask; j += 8) - { - has_mask = 0; - const __m128i f_data = _mm_lddqu_si128(reinterpret_cast(second.data+j)); - __m128i bitmask = has_second_null_map ? - _mm_set_epi16( - (second_null_map[j + 7]) ? full : none, (second_null_map[j + 6]) ? full : none, - (second_null_map[j + 5]) ? full : none, (second_null_map[j + 4]) ? full : none, - (second_null_map[j + 3]) ? full : none, (second_null_map[j + 2]) ? full : none, - (second_null_map[j + 1]) ? full : none, (second_null_map[j]) ? full: none) - : zeros; - unsigned i = 0; - for (; i < first.size-7 && !has_mask; has_mask = _mm_test_all_ones(bitmask), i += 8) - { - const __m128i s_data = _mm_lddqu_si128(reinterpret_cast(first.data + i)); - const __m128i first_nm_mask = has_first_null_map ? 
- _mm_cvtepi8_epi16(_mm_lddqu_si128(reinterpret_cast(first_null_map+i))) - : zeros; - bitmask = - _mm_or_si128( - _mm_or_si128( - _mm_or_si128( - _mm_or_si128( - _mm_andnot_si128( - first_nm_mask, - _mm_cmpeq_epi16(f_data, s_data)), - _mm_andnot_si128( - _mm_shuffle_epi8(first_nm_mask, _mm_set_epi8(13,12,11,10,9,8,7,6,5,4,3,2,1,0,15,14)), - _mm_cmpeq_epi16(f_data, _mm_shuffle_epi8(s_data, _mm_set_epi8(13,12,11,10,9,8,7,6,5,4,3,2,1,0,15,14))))), - _mm_or_si128( - _mm_andnot_si128( - _mm_shuffle_epi8(first_nm_mask, _mm_set_epi8(11,10,9,8,7,6,5,4,3,2,1,0,15,14,13,12)), - _mm_cmpeq_epi16(f_data, _mm_shuffle_epi8(s_data, _mm_set_epi8(11,10,9,8,7,6,5,4,3,2,1,0,15,14,13,12)))), - _mm_andnot_si128( - _mm_shuffle_epi8(first_nm_mask, _mm_set_epi8(9,8,7,6,5,4,3,2,1,0,15,14,13,12,11,10)), - _mm_cmpeq_epi16(f_data, _mm_shuffle_epi8(s_data, _mm_set_epi8(9,8,7,6,5,4,3,2,1,0,15,14,13,12,11,10))))) - ), - _mm_or_si128( - _mm_or_si128( - _mm_andnot_si128( - _mm_shuffle_epi8(first_nm_mask, _mm_set_epi8(7,6,5,4,3,2,1,0,15,14,13,12,11,10,9,8)), - _mm_cmpeq_epi16(f_data, _mm_shuffle_epi8(s_data, _mm_set_epi8(7,6,5,4,3,2,1,0,15,14,13,12,11,10,9,8)))), - _mm_andnot_si128( - _mm_shuffle_epi8(first_nm_mask, _mm_set_epi8(5,4,3,2,1,0,15,14,13,12,11,10,9,8,7,6)), - _mm_cmpeq_epi16(f_data, _mm_shuffle_epi8(s_data, _mm_set_epi8(5,4,3,2,1,0,15,14,13,12,11,10,9,8,7,6))))), - _mm_or_si128( - _mm_andnot_si128( - _mm_shuffle_epi8(first_nm_mask, _mm_set_epi8(3,2,1,0,15,14,13,12,11,10,9,8,7,6,5,4)), - _mm_cmpeq_epi16(f_data, _mm_shuffle_epi8(s_data, _mm_set_epi8(3,2,1,0,15,14,13,12,11,10,9,8,7,6,5,4)))), - _mm_andnot_si128( - _mm_shuffle_epi8(first_nm_mask, _mm_set_epi8(1,0,15,14,13,12,11,10,9,8,7,6,5,4,3,2)), - _mm_cmpeq_epi16(f_data, _mm_shuffle_epi8(s_data, _mm_set_epi8(1,0,15,14,13,12,11,10,9,8,7,6,5,4,3,2)))))) - ), - bitmask); - } - - if (i < first.size) - { - for (; i < first.size && !has_mask; ++i) - { - if (has_first_null_map && first_null_map[i]) - continue; - __m128i v_i = _mm_set1_epi16(first.data[i]); - bitmask = _mm_or_si128(bitmask, _mm_cmpeq_epi16(f_data, v_i)); - has_mask = _mm_test_all_ones(bitmask); - } - } - } - } - - if (!has_mask && second.size > 2) - return false; - - return hasAllIntegralLoopRemainder(j, first, second, first_null_map, second_null_map); -} - -// SSE4.2 Int8_t specialization -template <> -inline ALWAYS_INLINE bool sliceHasImplAnyAll, NumericArraySlice, sliceEqualElements >( - const NumericArraySlice & first, const NumericArraySlice & second, const UInt8 * first_null_map, const UInt8 * second_null_map) -{ - if (second.size == 0) - return true; - - if (!hasNull(first_null_map, first.size) && hasNull(second_null_map, second.size)) - return false; - - const bool has_first_null_map = first_null_map != nullptr; - const bool has_second_null_map = second_null_map != nullptr; - - size_t j = 0; - short has_mask = 1; - const int full = -1, none = 0; - const __m128i zeros = _mm_setzero_si128(); - if (second.size > 15) - { - for (; j < second.size - 15 && has_mask; j += 16) - { - has_mask = 0; - const __m128i f_data = _mm_lddqu_si128(reinterpret_cast(second.data+j)); - __m128i bitmask = has_second_null_map ? - _mm_set_epi8( - (second_null_map[j + 15]) ? full : none, (second_null_map[j + 14]) ? full : none, - (second_null_map[j + 13]) ? full : none, (second_null_map[j + 12]) ? full : none, - (second_null_map[j + 11]) ? full : none, (second_null_map[j + 10]) ? full : none, - (second_null_map[j + 9]) ? full : none, (second_null_map[j + 8]) ? full : none, - (second_null_map[j + 7]) ? 
full : none, (second_null_map[j + 6]) ? full : none, - (second_null_map[j + 5]) ? full : none, (second_null_map[j + 4]) ? full : none, - (second_null_map[j + 3]) ? full : none, (second_null_map[j + 2]) ? full : none, - (second_null_map[j + 1]) ? full : none, (second_null_map[j]) ? full : none) - : zeros; - unsigned i = 0; - for (; i < first.size - 15 && !has_mask; has_mask = _mm_test_all_ones(bitmask), i += 16) - { - const __m128i s_data = _mm_lddqu_si128(reinterpret_cast(first.data+i)); - const __m128i first_nm_mask = has_first_null_map ? - _mm_lddqu_si128(reinterpret_cast(first_null_map+i)) - : zeros; - bitmask = - _mm_or_si128( - _mm_or_si128( - _mm_or_si128( - _mm_or_si128( - _mm_or_si128( - _mm_andnot_si128( - first_nm_mask, - _mm_cmpeq_epi8(f_data, s_data)), - _mm_andnot_si128( - _mm_shuffle_epi8(first_nm_mask, _mm_set_epi8(14,13,12,11,10,9,8,7,6,5,4,3,2,1,0,15)), - _mm_cmpeq_epi8(f_data, _mm_shuffle_epi8(s_data, _mm_set_epi8(14,13,12,11,10,9,8,7,6,5,4,3,2,1,0,15))))), - _mm_or_si128( - _mm_andnot_si128( - _mm_shuffle_epi8(first_nm_mask, _mm_set_epi8(13,12,11,10,9,8,7,6,5,4,3,2,1,0,15,14)), - _mm_cmpeq_epi8(f_data, _mm_shuffle_epi8(s_data, _mm_set_epi8(13,12,11,10,9,8,7,6,5,4,3,2,1,0,15,14)))), - _mm_andnot_si128( - _mm_shuffle_epi8(first_nm_mask, _mm_set_epi8(12,11,10,9,8,7,6,5,4,3,2,1,0,15,14,13)), - _mm_cmpeq_epi8(f_data, _mm_shuffle_epi8(s_data, _mm_set_epi8(12,11,10,9,8,7,6,5,4,3,2,1,0,15,14,13))))) - ), - _mm_or_si128( - _mm_or_si128( - _mm_andnot_si128( - _mm_shuffle_epi8(first_nm_mask, _mm_set_epi8(11,10,9,8,7,6,5,4,3,2,1,0,15,14,13,12)), - _mm_cmpeq_epi8(f_data, _mm_shuffle_epi8(s_data, _mm_set_epi8(11,10,9,8,7,6,5,4,3,2,1,0,15,14,13,12)))), - _mm_andnot_si128( - _mm_shuffle_epi8(first_nm_mask, _mm_set_epi8(10,9,8,7,6,5,4,3,2,1,0,15,14,13,12,11)), - _mm_cmpeq_epi8(f_data, _mm_shuffle_epi8(s_data, _mm_set_epi8(10,9,8,7,6,5,4,3,2,1,0,15,14,13,12,11))))), - _mm_or_si128( - _mm_andnot_si128( - _mm_shuffle_epi8(first_nm_mask, _mm_set_epi8(9,8,7,6,5,4,3,2,1,0,15,14,13,12,11,10)), - _mm_cmpeq_epi8(f_data, _mm_shuffle_epi8(s_data, _mm_set_epi8(9,8,7,6,5,4,3,2,1,0,15,14,13,12,11,10)))), - _mm_andnot_si128( - _mm_shuffle_epi8(first_nm_mask, _mm_set_epi8(8,7,6,5,4,3,2,1,0,15,14,13,12,11,10,9)), - _mm_cmpeq_epi8(f_data, _mm_shuffle_epi8(s_data, _mm_set_epi8(8,7,6,5,4,3,2,1,0,15,14,13,12,11,10,9))))))), - _mm_or_si128( - _mm_or_si128( - _mm_or_si128( - _mm_andnot_si128( - _mm_shuffle_epi8(first_nm_mask, _mm_set_epi8(7,6,5,4,3,2,1,0,15,14,13,12,11,10,9,8)), - _mm_cmpeq_epi8(f_data, _mm_shuffle_epi8(s_data, _mm_set_epi8(7,6,5,4,3,2,1,0,15,14,13,12,11,10,9,8)))), - _mm_andnot_si128( - _mm_shuffle_epi8(first_nm_mask, _mm_set_epi8(6,5,4,3,2,1,0,15,14,13,12,11,10,9,8,7)), - _mm_cmpeq_epi8(f_data, _mm_shuffle_epi8(s_data, _mm_set_epi8(6,5,4,3,2,1,0,15,14,13,12,11,10,9,8,7))))), - _mm_or_si128( - _mm_andnot_si128( - _mm_shuffle_epi8(first_nm_mask, _mm_set_epi8(5,4,3,2,1,0,15,14,13,12,11,10,9,8,7,6)), - _mm_cmpeq_epi8(f_data, _mm_shuffle_epi8(s_data, _mm_set_epi8(5,4,3,2,1,0,15,14,13,12,11,10,9,8,7,6)))), - _mm_andnot_si128( - _mm_shuffle_epi8(first_nm_mask, _mm_set_epi8(4,3,2,1,0,15,14,13,12,11,10,9,8,7,6,5)), - _mm_cmpeq_epi8(f_data, _mm_shuffle_epi8(s_data, _mm_set_epi8(4,3,2,1,0,15,14,13,12,11,10,9,8,7,6,5)))))), - _mm_or_si128( - _mm_or_si128( - _mm_andnot_si128( - _mm_shuffle_epi8(first_nm_mask, _mm_set_epi8(3,2,1,0,15,14,13,12,11,10,9,8,7,6,5,4)), - _mm_cmpeq_epi8(f_data, _mm_shuffle_epi8(s_data, _mm_set_epi8(3,2,1,0,15,14,13,12,11,10,9,8,7,6,5,4)))), - _mm_andnot_si128( - 
_mm_shuffle_epi8(first_nm_mask, _mm_set_epi8(2,1,0,15,14,13,12,11,10,9,8,7,6,5,4,3)), - _mm_cmpeq_epi8(f_data, _mm_shuffle_epi8(s_data, _mm_set_epi8(2,1,0,15,14,13,12,11,10,9,8,7,6,5,4,3))))), - _mm_or_si128( - _mm_andnot_si128( - _mm_shuffle_epi8(first_nm_mask, _mm_set_epi8(1,0,15,14,13,12,11,10,9,8,7,6,5,4,3,2)), - _mm_cmpeq_epi8(f_data, _mm_shuffle_epi8(s_data, _mm_set_epi8(1,0,15,14,13,12,11,10,9,8,7,6,5,4,3,2)))), - _mm_andnot_si128( - _mm_shuffle_epi8(first_nm_mask, _mm_set_epi8(0,15,14,13,12,11,10,9,8,7,6,5,4,3,2,1)), - _mm_cmpeq_epi8(f_data, _mm_shuffle_epi8(s_data, _mm_set_epi8(0,15,14,13,12,11,10,9,8,7,6,5,4,3,2,1)))))))), - bitmask); - } - - if (i < first.size) - { - for (; i < first.size && !has_mask; ++i) - { - if (has_first_null_map && first_null_map[i]) - continue; - __m128i v_i = _mm_set1_epi8(first.data[i]); - bitmask = _mm_or_si128(bitmask, _mm_cmpeq_epi8(f_data, v_i)); - has_mask = _mm_test_all_ones(bitmask); - } - } - } - } - - if (!has_mask) - return false; - - return hasAllIntegralLoopRemainder(j, first, second, first_null_map, second_null_map); -} - -#endif - - template < typename FirstSliceType, typename SecondSliceType, bool (*isEqual)(const FirstSliceType &, const SecondSliceType &, size_t, size_t), diff --git a/src/Functions/GatherUtils/sliceEqualElements.h b/src/Functions/GatherUtils/sliceEqualElements.h new file mode 100644 index 00000000000..f219d51c56a --- /dev/null +++ b/src/Functions/GatherUtils/sliceEqualElements.h @@ -0,0 +1,41 @@ +#pragma once + +#include +#include "Slices.h" + +namespace DB::GatherUtils +{ + +template +bool sliceEqualElements(const NumericArraySlice & first [[maybe_unused]], + const NumericArraySlice & second [[maybe_unused]], + size_t first_ind [[maybe_unused]], + size_t second_ind [[maybe_unused]]) +{ + /// TODO: Decimal scale + if constexpr (is_decimal && is_decimal) + return accurate::equalsOp(first.data[first_ind].value, second.data[second_ind].value); + else if constexpr (is_decimal || is_decimal) + return false; + else + return accurate::equalsOp(first.data[first_ind], second.data[second_ind]); +} + +template +bool sliceEqualElements(const NumericArraySlice &, const GenericArraySlice &, size_t, size_t) +{ + return false; +} + +template +bool sliceEqualElements(const GenericArraySlice &, const NumericArraySlice &, size_t, size_t) +{ + return false; +} + +inline ALWAYS_INLINE bool sliceEqualElements(const GenericArraySlice & first, const GenericArraySlice & second, size_t first_ind, size_t second_ind) +{ + return first.elements->compareAt(first_ind + first.begin, second_ind + second.begin, *second.elements, -1) == 0; +} + +} diff --git a/src/Functions/GatherUtils/sliceHasImplAnyAll.h b/src/Functions/GatherUtils/sliceHasImplAnyAll.h new file mode 100644 index 00000000000..59d37473e42 --- /dev/null +++ b/src/Functions/GatherUtils/sliceHasImplAnyAll.h @@ -0,0 +1,839 @@ +#pragma once + +#include "GatherUtils.h" +#include "Slices.h" +#include "sliceEqualElements.h" + +#if defined(__AVX2__) || defined(__SSE4_2__) +#include +#endif + +namespace DB::GatherUtils +{ + +namespace +{ + +inline ALWAYS_INLINE bool hasNull(const UInt8 * null_map, size_t null_map_size) +{ + if (null_map != nullptr) + { + for (size_t i = 0; i < null_map_size; ++i) + { + if (null_map[i]) + return true; + } + } + return false; +} + +} + +/// Methods to check if first array has elements from second array, overloaded for various combinations of types. 
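Before the generic fallback that follows, a concrete illustration of the semantics it encodes may help: the "All" search asks whether every element of the second slice occurs in the first one, an empty second slice is always contained, and a null element is only satisfied when the other side also contains a null. The snippet below is an illustrative sketch, not part of the patch; it assumes NumericArraySlice<T> is an aggregate of a data pointer and a size, as its uses in this header suggest.

/// Illustrative sketch of the "All" semantics implemented below (not part of the patch).
/// Assumes NumericArraySlice<T> is an aggregate {data, size}.
#include <cassert>

void exampleHasAllSemantics()
{
    const int set_data[] = {1, 2, 3};
    const int subset_data[] = {3, 1};
    const NumericArraySlice<int> set{set_data, 3};
    const NumericArraySlice<int> subset{subset_data, 2};

    /// Every element of `subset` occurs in `set`, so the result is true.
    assert((sliceHasImplAnyAll<ArraySearchType::All,
            NumericArraySlice<int>, NumericArraySlice<int>,
            sliceEqualElements<int, int>>(set, subset, nullptr, nullptr)));

    /// With null maps, a null in the subset is only satisfied when the set also
    /// contains a null: here set[1] is null and subset[0] is null, so this still holds.
    const UInt8 set_nulls[] = {0, 1, 0};
    const UInt8 subset_nulls[] = {1, 0};
    assert((sliceHasImplAnyAll<ArraySearchType::All,
            NumericArraySlice<int>, NumericArraySlice<int>,
            sliceEqualElements<int, int>>(set, subset, set_nulls, subset_nulls)));
}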
+template < + ArraySearchType search_type, + typename FirstSliceType, + typename SecondSliceType, + bool (*isEqual)(const FirstSliceType &, const SecondSliceType &, size_t, size_t)> +bool sliceHasImplAnyAll(const FirstSliceType & first, const SecondSliceType & second, const UInt8 * first_null_map, const UInt8 * second_null_map) +{ + const bool has_first_null_map = first_null_map != nullptr; + const bool has_second_null_map = second_null_map != nullptr; + + const bool has_second_null = hasNull(second_null_map, second.size); + if (has_second_null) + { + const bool has_first_null = hasNull(first_null_map, first.size); + + if (has_first_null && search_type == ArraySearchType::Any) + return true; + + if (!has_first_null && search_type == ArraySearchType::All) + return false; + } + + for (size_t i = 0; i < second.size; ++i) + { + if (has_second_null_map && second_null_map[i]) + continue; + + bool has = false; + + for (size_t j = 0; j < first.size && !has; ++j) + { + if (has_first_null_map && first_null_map[j]) + continue; + + if (isEqual(first, second, j, i)) + { + has = true; + break; + } + } + + if (has && search_type == ArraySearchType::Any) + return true; + + if (!has && search_type == ArraySearchType::All) + return false; + } + return search_type == ArraySearchType::All; +} + + +#if defined(__AVX2__) || defined(__SSE4_2__) + +namespace +{ + +template +inline ALWAYS_INLINE bool hasAllIntegralLoopRemainder( + size_t j, const NumericArraySlice & first, const NumericArraySlice & second, const UInt8 * first_null_map, const UInt8 * second_null_map) +{ + const bool has_first_null_map = first_null_map != nullptr; + const bool has_second_null_map = second_null_map != nullptr; + + for (; j < second.size; ++j) + { + // skip null elements since both have at least one - assuming it was checked earlier that at least one element in 'first' is null + if (has_second_null_map && second_null_map[j]) + continue; + + bool found = false; + + for (size_t i = 0; i < first.size; ++i) + { + if (has_first_null_map && first_null_map[i]) + continue; + + if (first.data[i] == second.data[j]) + { + found = true; + break; + } + } + + if (!found) + return false; + } + return true; +} + +} + +#endif + +#if defined(__AVX2__) +// AVX2 Int specialization +template <> +inline ALWAYS_INLINE bool sliceHasImplAnyAll, NumericArraySlice, sliceEqualElements >( + const NumericArraySlice & first, const NumericArraySlice & second, const UInt8 * first_null_map, const UInt8 * second_null_map) +{ + if (second.size == 0) + return true; + + if (!hasNull(first_null_map, first.size) && hasNull(second_null_map, second.size)) + return false; + + const bool has_first_null_map = first_null_map != nullptr; + const bool has_second_null_map = second_null_map != nullptr; + + size_t j = 0; + short has_mask = 1; + const int full = -1, none = 0; + const __m256i ones = _mm256_set1_epi32(full); + const __m256i zeros = _mm256_setzero_si256(); + if (second.size > 7 && first.size > 7) + { + for (; j < second.size - 7 && has_mask; j += 8) + { + has_mask = 0; + const __m256i f_data = _mm256_lddqu_si256(reinterpret_cast(second.data + j)); + // bitmask is filled with minus ones for ones which are considered as null in the corresponding null map, 0 otherwise; + __m256i bitmask = has_second_null_map ? + _mm256_set_epi32( + (second_null_map[j + 7]) ? full : none, + (second_null_map[j + 6]) ? full : none, + (second_null_map[j + 5]) ? full : none, + (second_null_map[j + 4]) ? full : none, + (second_null_map[j + 3]) ? full : none, + (second_null_map[j + 2]) ? 
full : none, + (second_null_map[j + 1]) ? full : none, + (second_null_map[j]) ? full : none) + : zeros; + + size_t i = 0; + // Search first array to try to match all second elements + for (; i < first.size - 7 && !has_mask; has_mask = _mm256_testc_si256(bitmask, ones), i += 8) + { + const __m256i s_data = _mm256_lddqu_si256(reinterpret_cast(first.data + i)); + // Create a mask to avoid to compare null elements + // set_m128i takes two arguments: (high segment, low segment) that are two __m128i convert from 8bits to 32bits to fit to our following operations + const __m256i first_nm_mask = _mm256_set_m128i( + _mm_cvtepi8_epi32(_mm_lddqu_si128(reinterpret_cast(first_null_map + i + 4))), + _mm_cvtepi8_epi32(_mm_lddqu_si128(reinterpret_cast(first_null_map + i)))); + bitmask = + _mm256_or_si256( + _mm256_or_si256( + _mm256_or_si256( + _mm256_or_si256( + _mm256_andnot_si256( + first_nm_mask, + _mm256_cmpeq_epi32(f_data, s_data)), + _mm256_andnot_si256( + _mm256_permutevar8x32_epi32(first_nm_mask, _mm256_set_epi32(6,5,4,3,2,1,0,7)), + _mm256_cmpeq_epi32(f_data, _mm256_permutevar8x32_epi32(s_data, _mm256_set_epi32(6,5,4,3,2,1,0,7))))), + _mm256_or_si256( + _mm256_andnot_si256( + _mm256_permutevar8x32_epi32(first_nm_mask, _mm256_set_epi32(5,4,3,2,1,0,7,6)), + _mm256_cmpeq_epi32(f_data, _mm256_permutevar8x32_epi32(s_data, _mm256_set_epi32(5,4,3,2,1,0,7,6)))), + _mm256_andnot_si256( + _mm256_permutevar8x32_epi32(first_nm_mask, _mm256_set_epi32(4,3,2,1,0,7,6,5)), + _mm256_cmpeq_epi32(f_data, _mm256_permutevar8x32_epi32(s_data, _mm256_set_epi32(4,3,2,1,0,7,6,5))))) + ), + _mm256_or_si256( + _mm256_or_si256( + _mm256_andnot_si256( + _mm256_permutevar8x32_epi32(first_nm_mask, _mm256_set_epi32(3,2,1,0,7,6,5,4)), + _mm256_cmpeq_epi32(f_data, _mm256_permutevar8x32_epi32(s_data, _mm256_set_epi32(3,2,1,0,7,6,5,4)))), + _mm256_andnot_si256( + _mm256_permutevar8x32_epi32(first_nm_mask, _mm256_set_epi32(2,1,0,7,6,5,4,3)), + _mm256_cmpeq_epi32(f_data, _mm256_permutevar8x32_epi32(s_data, _mm256_set_epi32(2,1,0,7,6,5,4,3))))), + _mm256_or_si256( + _mm256_andnot_si256( + _mm256_permutevar8x32_epi32(first_nm_mask, _mm256_set_epi32(1,0,7,6,5,4,3,2)), + _mm256_cmpeq_epi32(f_data, _mm256_permutevar8x32_epi32(s_data, _mm256_set_epi32(1,0,7,6,5,4,3,2)))), + _mm256_andnot_si256( + _mm256_permutevar8x32_epi32(first_nm_mask, _mm256_set_epi32(0,7,6,5,4,3,2,1)), + _mm256_cmpeq_epi32(f_data, _mm256_permutevar8x32_epi32(s_data, _mm256_set_epi32(0,7,6,5,4,3,2,1))))))), + bitmask); + } + + if (i < first.size) + { + // Loop(i)-jam + for (; i < first.size && !has_mask; ++i) + { + if (has_first_null_map && first_null_map[i]) + continue; + __m256i v_i = _mm256_set1_epi32(first.data[i]); + bitmask = _mm256_or_si256(bitmask, _mm256_cmpeq_epi32(f_data, v_i)); + has_mask = _mm256_testc_si256(bitmask, ones); + } + } + } + } + + if (!has_mask) + return false; + + return hasAllIntegralLoopRemainder(j, first, second, first_null_map, second_null_map); +} + +// TODO: Discuss about +// raise an error : "error: no viable conversion from 'const NumericArraySlice' to 'const NumericArraySlice'" +// How should we do, copy past each function ?? I haven't found a way to specialize a same function body for two different types. 
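A possible answer to the TODO above, sketched under assumptions rather than proposed as the final fix: since int and unsigned have the same width and compare equal exactly when their bit patterns are equal, the unsigned case can forward to the signed kernel by rebuilding the slices over reinterpreted element pointers, instead of casting one NumericArraySlice instantiation to another, which is what the commented-out attempt below tries and what raises the "no viable conversion" error. This again assumes NumericArraySlice<T> is an aggregate of a data pointer and a size.

// Hypothetical sketch only, not part of the patch: forward the UInt case to the Int
// specialization by reinterpreting the element pointers rather than the slice objects.
// NumericArraySlice<T> is assumed to be an aggregate {data, size}.
template <>
inline ALWAYS_INLINE bool sliceHasImplAnyAll<ArraySearchType::All,
    NumericArraySlice<unsigned>, NumericArraySlice<unsigned>, sliceEqualElements<unsigned, unsigned>>(
    const NumericArraySlice<unsigned> & first, const NumericArraySlice<unsigned> & second,
    const UInt8 * first_null_map, const UInt8 * second_null_map)
{
    // Equality of unsigned values coincides with equality of their bit patterns,
    // so the signed comparison kernel gives the same answer.
    const NumericArraySlice<int> first_signed{reinterpret_cast<const int *>(first.data), first.size};
    const NumericArraySlice<int> second_signed{reinterpret_cast<const int *>(second.data), second.size};
    return sliceHasImplAnyAll<ArraySearchType::All,
        NumericArraySlice<int>, NumericArraySlice<int>, sliceEqualElements<int, int>>(
        first_signed, second_signed, first_null_map, second_null_map);
}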
+// AVX2 UInt specialization +// template <> +// inline ALWAYS_INLINE bool sliceHasImplAnyAll, NumericArraySlice, sliceEqualElements >( +// const NumericArraySlice & second, const NumericArraySlice & first, const UInt8 * first_null_map, const UInt8 * second_null_map) +// { +// return sliceHasImplAnyAll, NumericArraySlice, sliceEqualElements > ( +// static_cast &>(second), static_cast &>(first), second_null_map, first_null_map); +// } + +// AVX2 Int64 specialization +template <> +inline ALWAYS_INLINE bool sliceHasImplAnyAll, NumericArraySlice, sliceEqualElements >( + const NumericArraySlice & first, const NumericArraySlice & second, const UInt8 * first_null_map, const UInt8 * second_null_map) +{ + if (second.size == 0) + return true; + + if (!hasNull(first_null_map, first.size) && hasNull(second_null_map, second.size)) + return false; + + const bool has_first_null_map = first_null_map != nullptr; + const bool has_second_null_map = second_null_map != nullptr; + + size_t j = 0; + short has_mask = 1; + const int full = -1, none = 0; + const __m256i ones = _mm256_set1_epi64x(full); + const __m256i zeros = _mm256_setzero_si256(); + if (second.size > 3 && first.size > 3) + { + for (; j < second.size - 3 && has_mask; j += 4) + { + has_mask = 0; + const __m256i f_data = _mm256_lddqu_si256(reinterpret_cast(second.data + j)); + __m256i bitmask = has_second_null_map ? + _mm256_set_epi64x( + (second_null_map[j + 3])? full : none, + (second_null_map[j + 2])? full : none, + (second_null_map[j + 1])? full : none, + (second_null_map[j]) ? full : none) + : zeros; + + unsigned i = 0; + for (; i < first.size - 3 && !has_mask; has_mask = _mm256_testc_si256(bitmask, ones), i += 4) + { + const __m256i s_data = _mm256_lddqu_si256(reinterpret_cast(first.data + i)); + const __m256i first_nm_mask = _mm256_set_m128i( + _mm_cvtepi8_epi64(_mm_lddqu_si128(reinterpret_cast(first_null_map + i + 2))), + _mm_cvtepi8_epi64(_mm_lddqu_si128(reinterpret_cast(first_null_map + i)))); + bitmask = + _mm256_or_si256( + _mm256_or_si256( + _mm256_or_si256( + _mm256_andnot_si256( + first_nm_mask, + _mm256_cmpeq_epi64(f_data, s_data)), + _mm256_andnot_si256( + _mm256_permutevar8x32_epi32(first_nm_mask, _mm256_set_epi32(5,4,3,2,1,0,7,6)), + _mm256_cmpeq_epi64(f_data, _mm256_permutevar8x32_epi32(s_data, _mm256_set_epi32(5,4,3,2,1,0,7,6))))), + + _mm256_or_si256( + _mm256_andnot_si256( + _mm256_permutevar8x32_epi32(first_nm_mask, _mm256_set_epi32(3,2,1,0,7,6,5,4)), + _mm256_cmpeq_epi64(f_data, _mm256_permutevar8x32_epi32(s_data, _mm256_set_epi32(3,2,1,0,7,6,5,4)))), + _mm256_andnot_si256( + _mm256_permutevar8x32_epi32(first_nm_mask, _mm256_set_epi32(1,0,7,6,5,4,3,2)), + _mm256_cmpeq_epi64(f_data, _mm256_permutevar8x32_epi32(s_data, _mm256_set_epi32(1,0,7,6,5,4,3,2)))))), + bitmask); + } + + if (i < first.size) + { + for (; i < first.size && !has_mask; ++i) + { + if (has_first_null_map && first_null_map[i]) + continue; + __m256i v_i = _mm256_set1_epi64x(first.data[i]); + bitmask = _mm256_or_si256(bitmask, _mm256_cmpeq_epi64(f_data, v_i)); + has_mask = _mm256_testc_si256(bitmask, ones); + } + } + } + } + + if (!has_mask && second.size > 2) + return false; + + return hasAllIntegralLoopRemainder(j, first, second, first_null_map, second_null_map); +} + +// AVX2 Int16_t specialization +template <> +inline ALWAYS_INLINE bool sliceHasImplAnyAll, NumericArraySlice, sliceEqualElements >( + const NumericArraySlice & first, const NumericArraySlice & second, const UInt8 * first_null_map, const UInt8 * second_null_map) +{ + if (second.size == 0) + 
return true; + + if (!hasNull(first_null_map, first.size) && hasNull(second_null_map, second.size)) + return false; + + const bool has_first_null_map = first_null_map != nullptr; + const bool has_second_null_map = second_null_map != nullptr; + + size_t j = 0; + short has_mask = 1; + const int full = -1, none = 0; + const __m256i ones = _mm256_set1_epi16(full); + const __m256i zeros = _mm256_setzero_si256(); + if (second.size > 15 && first.size > 15) + { + for (; j < second.size - 15 && has_mask; j += 16) + { + has_mask = 0; + const __m256i f_data = _mm256_lddqu_si256(reinterpret_cast(second.data + j)); + __m256i bitmask = has_second_null_map ? + _mm256_set_epi16( + (second_null_map[j + 15]) ? full : none, (second_null_map[j + 14]) ? full : none, + (second_null_map[j + 13]) ? full : none, (second_null_map[j + 12]) ? full : none, + (second_null_map[j + 11]) ? full : none, (second_null_map[j + 10]) ? full : none, + (second_null_map[j + 9]) ? full : none, (second_null_map[j + 8])? full : none, + (second_null_map[j + 7]) ? full : none, (second_null_map[j + 6])? full : none, + (second_null_map[j + 5]) ? full : none, (second_null_map[j + 4])? full : none, + (second_null_map[j + 3]) ? full : none, (second_null_map[j + 2])? full : none, + (second_null_map[j + 1]) ? full : none, (second_null_map[j]) ? full : none) + : zeros; + unsigned i = 0; + for (; i < first.size - 15 && !has_mask; has_mask = _mm256_testc_si256(bitmask, ones), i += 16) + { + const __m256i s_data = _mm256_lddqu_si256(reinterpret_cast(first.data + i)); + const __m256i first_nm_mask = _mm256_set_m128i( + _mm_cvtepi8_epi16(_mm_lddqu_si128(reinterpret_cast(first_null_map+i+8))), + _mm_cvtepi8_epi16(_mm_lddqu_si128(reinterpret_cast(first_null_map+i)))); + bitmask = + _mm256_or_si256( + _mm256_or_si256( + _mm256_or_si256( + _mm256_or_si256( + _mm256_or_si256( + _mm256_andnot_si256( + first_nm_mask, + _mm256_cmpeq_epi16(f_data, s_data)), + _mm256_andnot_si256( + _mm256_shuffle_epi8(first_nm_mask, _mm256_set_epi8(29,28,27,26,25,24,23,22,21,20,19,18,17,16,15,14,13,12,11,10,9,8,7,6,5,4,3,2,1,0,31,30)), + _mm256_cmpeq_epi16(f_data, _mm256_shuffle_epi8(s_data, _mm256_set_epi8(29,28,27,26,25,24,23,22,21,20,19,18,17,16,15,14,13,12,11,10,9,8,7,6,5,4,3,2,1,0,31,30))))), + _mm256_or_si256( + _mm256_andnot_si256( + _mm256_shuffle_epi8(first_nm_mask, _mm256_set_epi8(27,26,25,24,23,22,21,20,19,18,17,16,15,14,13,12,11,10,9,8,7,6,5,4,3,2,1,0,31,30,29,28)), + _mm256_cmpeq_epi16(f_data, _mm256_shuffle_epi8(s_data, _mm256_set_epi8(27,26,25,24,23,22,21,20,19,18,17,16,15,14,13,12,11,10,9,8,7,6,5,4,3,2,1,0,31,30,29,28)))), + _mm256_andnot_si256( + _mm256_shuffle_epi8(first_nm_mask, _mm256_set_epi8(25,24,23,22,21,20,19,18,17,16,15,14,13,12,11,10,9,8,7,6,5,4,3,2,1,0,31,30,29,28,27,26)), + _mm256_cmpeq_epi16(f_data, _mm256_shuffle_epi8(s_data, _mm256_set_epi8(25,24,23,22,21,20,19,18,17,16,15,14,13,12,11,10,9,8,7,6,5,4,3,2,1,0,31,30,29,28,27,26))))) + ), + _mm256_or_si256( + _mm256_or_si256( + _mm256_andnot_si256( + _mm256_shuffle_epi8(first_nm_mask, _mm256_set_epi8(23,22,21,20,19,18,17,16,15,14,13,12,11,10,9,8,7,6,5,4,3,2,1,0,31,30,29,28,27,26,25,24)), + _mm256_cmpeq_epi16(f_data, _mm256_shuffle_epi8(s_data, _mm256_set_epi8(23,22,21,20,19,18,17,16,15,14,13,12,11,10,9,8,7,6,5,4,3,2,1,0,31,30,29,28,27,26,25,24)))), + _mm256_andnot_si256( + _mm256_shuffle_epi8(first_nm_mask, _mm256_set_epi8(21,20,19,18,17,16,15,14,13,12,11,10,9,8,7,6,5,4,3,2,1,0,31,30,29,28,27,26,25,24,23,22)), + _mm256_cmpeq_epi16(f_data, _mm256_shuffle_epi8(s_data, 
_mm256_set_epi8(21,20,19,18,17,16,15,14,13,12,11,10,9,8,7,6,5,4,3,2,1,0,31,30,29,28,27,26,25,24,23,22))))), + _mm256_or_si256( + _mm256_andnot_si256( + _mm256_shuffle_epi8(first_nm_mask, _mm256_set_epi8(19,18,17,16,15,14,13,12,11,10,9,8,7,6,5,4,3,2,1,0,31,30,29,28,27,26,25,24,23,22,21,20)), + _mm256_cmpeq_epi16(f_data, _mm256_shuffle_epi8(s_data, _mm256_set_epi8(19,18,17,16,15,14,13,12,11,10,9,8,7,6,5,4,3,2,1,0,31,30,29,28,27,26,25,24,23,22,21,20)))), + _mm256_andnot_si256( + _mm256_shuffle_epi8(first_nm_mask, _mm256_set_epi8(17,16,15,14,13,12,11,10,9,8,7,6,5,4,3,2,1,0,31,30,29,28,27,26,25,24,23,22,21,20,19,18)), + _mm256_cmpeq_epi16(f_data, _mm256_shuffle_epi8(s_data, _mm256_set_epi8(17,16,15,14,13,12,11,10,9,8,7,6,5,4,3,2,1,0,31,30,29,28,27,26,25,24,23,22,21,20,19,18)))))) + ), + _mm256_or_si256( + _mm256_or_si256( + _mm256_or_si256( + _mm256_andnot_si256( + _mm256_permute2x128_si256(first_nm_mask, first_nm_mask,1), + _mm256_cmpeq_epi16(f_data, _mm256_permute2x128_si256(s_data, s_data, 1))), + _mm256_andnot_si256( + _mm256_shuffle_epi8(_mm256_permute2x128_si256(first_nm_mask, first_nm_mask, 1), _mm256_set_epi8(13,12,11,10,9,8,7,6,5,4,3,2,1,0,31,30,29,28,27,26,25,24,23,22,21,20,19,18,17,16,15,14)), + _mm256_cmpeq_epi16(f_data, _mm256_shuffle_epi8(_mm256_permute2x128_si256(s_data, s_data, 1), _mm256_set_epi8(13,12,11,10,9,8,7,6,5,4,3,2,1,0,31,30,29,28,27,26,25,24,23,22,21,20,19,18,17,16,15,14))))), + _mm256_or_si256( + _mm256_andnot_si256( + _mm256_shuffle_epi8(_mm256_permute2x128_si256(first_nm_mask, first_nm_mask, 1), _mm256_set_epi8(11,10,9,8,7,6,5,4,3,2,1,0,31,30,29,28,27,26,25,24,23,22,21,20,19,18,17,16,15,14,13,12)), + _mm256_cmpeq_epi16(f_data, _mm256_shuffle_epi8(_mm256_permute2x128_si256(s_data, s_data, 1), _mm256_set_epi8(11,10,9,8,7,6,5,4,3,2,1,0,31,30,29,28,27,26,25,24,23,22,21,20,19,18,17,16,15,14,13,12)))), + _mm256_andnot_si256( + _mm256_shuffle_epi8(_mm256_permute2x128_si256(first_nm_mask, first_nm_mask, 1), _mm256_set_epi8(9,8,7,6,5,4,3,2,1,0,31,30,29,28,27,26,25,24,23,22,21,20,19,18,17,16,15,14,13,12,11,10)), + _mm256_cmpeq_epi16(f_data, _mm256_shuffle_epi8(_mm256_permute2x128_si256(s_data, s_data, 1), _mm256_set_epi8(9,8,7,6,5,4,3,2,1,0,31,30,29,28,27,26,25,24,23,22,21,20,19,18,17,16,15,14,13,12,11,10))))) + ), + _mm256_or_si256( + _mm256_or_si256( + _mm256_andnot_si256( + _mm256_shuffle_epi8(_mm256_permute2x128_si256(first_nm_mask, first_nm_mask, 1), _mm256_set_epi8(7,6,5,4,3,2,1,0,31,30,29,28,27,26,25,24,23,22,21,20,19,18,17,16,15,14,13,12,11,10,9,8)), + _mm256_cmpeq_epi16(f_data, _mm256_shuffle_epi8(_mm256_permute2x128_si256(s_data ,s_data, 1), _mm256_set_epi8(7,6,5,4,3,2,1,0,31,30,29,28,27,26,25,24,23,22,21,20,19,18,17,16,15,14,13,12,11,10,9,8)))), + _mm256_andnot_si256( + _mm256_shuffle_epi8(_mm256_permute2x128_si256(first_nm_mask, first_nm_mask, 1), _mm256_set_epi8(5,4,3,2,1,0,31,30,29,28,27,26,25,24,23,22,21,20,19,18,17,16,15,14,13,12,11,10,9,8,7,6)), + _mm256_cmpeq_epi16(f_data, _mm256_shuffle_epi8(_mm256_permute2x128_si256(s_data, s_data, 1), _mm256_set_epi8(5,4,3,2,1,0,31,30,29,28,27,26,25,24,23,22,21,20,19,18,17,16,15,14,13,12,11,10,9,8,7,6))))), + _mm256_or_si256( + _mm256_andnot_si256( + _mm256_shuffle_epi8(_mm256_permute2x128_si256(first_nm_mask, first_nm_mask, 1), _mm256_set_epi8(3,2,1,0,31,30,29,28,27,26,25,24,23,22,21,20,19,18,17,16,15,14,13,12,11,10,9,8,7,6,5,4)), + _mm256_cmpeq_epi16(f_data, _mm256_shuffle_epi8(_mm256_permute2x128_si256(s_data ,s_data ,1), 
_mm256_set_epi8(3,2,1,0,31,30,29,28,27,26,25,24,23,22,21,20,19,18,17,16,15,14,13,12,11,10,9,8,7,6,5,4)))), + _mm256_andnot_si256( + _mm256_shuffle_epi8(_mm256_permute2x128_si256(first_nm_mask, first_nm_mask, 1), _mm256_set_epi8(1,0,31,30,29,28,27,26,25,24,23,22,21,20,19,18,17,16,15,14,13,12,11,10,9,8,7,6,5,4,3,2)), + _mm256_cmpeq_epi16(f_data, _mm256_shuffle_epi8(_mm256_permute2x128_si256(s_data, s_data, 1), _mm256_set_epi8(1,0,31,30,29,28,27,26,25,24,23,22,21,20,19,18,17,16,15,14,13,12,11,10,9,8,7,6,5,4,3,2)))))) + ) + ), + bitmask); + } + + if (i < first.size) + { + for (; i < first.size && !has_mask; ++i) + { + if (has_first_null_map && first_null_map[i]) + continue; + __m256i v_i = _mm256_set1_epi16(first.data[i]); + bitmask = _mm256_or_si256(bitmask, _mm256_cmpeq_epi16(f_data, v_i)); + has_mask = _mm256_testc_si256(bitmask, ones); + } + } + } + } + + if (!has_mask && second.size > 2) + return false; + + return hasAllIntegralLoopRemainder(j, first, second, first_null_map, second_null_map); +} + +#elif defined(__SSE4_2__) + +// SSE4.2 Int specialization +template <> +inline ALWAYS_INLINE bool sliceHasImplAnyAll, NumericArraySlice, sliceEqualElements >( + const NumericArraySlice & first, const NumericArraySlice & second, const UInt8 * first_null_map, const UInt8 * second_null_map) +{ + if (second.size == 0) + return true; + + if (!hasNull(first_null_map, first.size) && hasNull(second_null_map, second.size)) + return false; + + const bool has_first_null_map = first_null_map != nullptr; + const bool has_second_null_map = second_null_map != nullptr; + + size_t j = 0; + short has_mask = 1; + const __m128i zeros = _mm_setzero_si128(); + if (second.size > 3 && first.size > 2) + { + const int full = -1, none = 0; + for (; j < second.size - 3 && has_mask; j += 4) + { + has_mask = 0; + const __m128i f_data = _mm_lddqu_si128(reinterpret_cast(second.data + j)); + __m128i bitmask = has_second_null_map ? + _mm_set_epi32( + (second_null_map[j + 3]) ? full : none, + (second_null_map[j + 2]) ? full : none, + (second_null_map[j + 1]) ? full : none, + (second_null_map[j]) ? full : none) + : zeros; + + unsigned i = 0; + for (; i < first.size - 3 && !has_mask; has_mask = _mm_test_all_ones(bitmask), i += 4) + { + const __m128i s_data = _mm_lddqu_si128(reinterpret_cast(first.data + i)); + const __m128i first_nm_mask = has_first_null_map ? 
+ _mm_cvtepi8_epi32(_mm_lddqu_si128(reinterpret_cast(first_null_map + i))) + : zeros; + + bitmask = + _mm_or_si128( + _mm_or_si128( + _mm_or_si128( + _mm_andnot_si128( + first_nm_mask, + _mm_cmpeq_epi32(f_data, s_data)), + _mm_andnot_si128( + _mm_shuffle_epi32(first_nm_mask, _MM_SHUFFLE(2,1,0,3)), + _mm_cmpeq_epi32(f_data, _mm_shuffle_epi32(s_data, _MM_SHUFFLE(2,1,0,3))))), + _mm_or_si128( + _mm_andnot_si128( + _mm_shuffle_epi32(first_nm_mask, _MM_SHUFFLE(1,0,3,2)), + _mm_cmpeq_epi32(f_data, _mm_shuffle_epi32(s_data, _MM_SHUFFLE(1,0,3,2)))), + _mm_andnot_si128( + _mm_shuffle_epi32(first_nm_mask, _MM_SHUFFLE(0,3,2,1)), + _mm_cmpeq_epi32(f_data, _mm_shuffle_epi32(s_data, _MM_SHUFFLE(0,3,2,1))))) + ), + bitmask); + } + + if (i < first.size) + { + for (; i < first.size && !has_mask; ++i) + { + if (has_first_null_map && first_null_map[i]) + continue; + __m128i r_i = _mm_set1_epi32(first.data[i]); + bitmask = _mm_or_si128(bitmask, _mm_cmpeq_epi32(f_data, r_i)); + has_mask = _mm_test_all_ones(bitmask); + } + } + } + } + + if (!has_mask) + return false; + + return hasAllIntegralLoopRemainder(j, first, second, first_null_map, second_null_map); +} + +// SSE4.2 Int64 specialization +template <> +inline ALWAYS_INLINE bool sliceHasImplAnyAll, NumericArraySlice, sliceEqualElements >( + const NumericArraySlice & first, const NumericArraySlice & second, const UInt8 * first_null_map, const UInt8 * second_null_map) +{ + if (second.size == 0) + return true; + + if (!hasNull(first_null_map, first.size) && hasNull(second_null_map, second.size)) + return false; + + const bool has_first_null_map = first_null_map != nullptr; + const bool has_second_null_map = second_null_map != nullptr; + + size_t j = 0; + short has_mask = 1; + const Int64 full = -1, none = 0; + const __m128i zeros = _mm_setzero_si128(); + for (; j < second.size - 1 && has_mask; j += 2) + { + has_mask = 0; + const __m128i f_data = _mm_lddqu_si128(reinterpret_cast(second.data + j)); + __m128i bitmask = has_second_null_map ? + _mm_set_epi64x( + (second_null_map[j + 1]) ? full : none, + (second_null_map[j]) ? full : none) + : zeros; + unsigned i = 0; + for (; i < first.size - 1 && !has_mask; has_mask = _mm_test_all_ones(bitmask), i += 2) + { + const __m128i s_data = _mm_lddqu_si128(reinterpret_cast(first.data + i)); + const __m128i first_nm_mask = has_first_null_map ? 
+ _mm_cvtepi8_epi64(_mm_lddqu_si128(reinterpret_cast(first_null_map + i))) + : zeros; + bitmask = + _mm_or_si128( + _mm_or_si128( + _mm_andnot_si128( + first_nm_mask, + _mm_cmpeq_epi32(f_data, s_data)), + _mm_andnot_si128( + _mm_shuffle_epi32(first_nm_mask, _MM_SHUFFLE(1,0,3,2)), + _mm_cmpeq_epi64(f_data, _mm_shuffle_epi32(s_data, _MM_SHUFFLE(1,0,3,2))))), + bitmask); + } + + if (i < first.size) + { + for (; i < first.size && !has_mask; ++i) + { + if (has_first_null_map && first_null_map[i]) + continue; + __m128i v_i = _mm_set1_epi64x(first.data[i]); + bitmask = _mm_or_si128(bitmask, _mm_cmpeq_epi64(f_data, v_i)); + has_mask = _mm_test_all_ones(bitmask); + } + } + } + + if (!has_mask) + return false; + + return hasAllIntegralLoopRemainder(j, first, second, first_null_map, second_null_map); +} + +// SSE4.2 Int16_t specialization +template <> +inline ALWAYS_INLINE bool sliceHasImplAnyAll, NumericArraySlice, sliceEqualElements >( + const NumericArraySlice & first, const NumericArraySlice & second, const UInt8 * first_null_map, const UInt8 * second_null_map) +{ + if (second.size == 0) + return true; + + if (!hasNull(first_null_map, first.size) && hasNull(second_null_map, second.size)) + return false; + + const bool has_first_null_map = first_null_map != nullptr; + const bool has_second_null_map = second_null_map != nullptr; + + size_t j = 0; + short has_mask = 1; + const int16_t full = -1, none = 0; + const __m128i zeros = _mm_setzero_si128(); + if (second.size > 6 && first.size > 6) + { + for (; j < second.size - 7 && has_mask; j += 8) + { + has_mask = 0; + const __m128i f_data = _mm_lddqu_si128(reinterpret_cast(second.data+j)); + __m128i bitmask = has_second_null_map ? + _mm_set_epi16( + (second_null_map[j + 7]) ? full : none, (second_null_map[j + 6]) ? full : none, + (second_null_map[j + 5]) ? full : none, (second_null_map[j + 4]) ? full : none, + (second_null_map[j + 3]) ? full : none, (second_null_map[j + 2]) ? full : none, + (second_null_map[j + 1]) ? full : none, (second_null_map[j]) ? full: none) + : zeros; + unsigned i = 0; + for (; i < first.size-7 && !has_mask; has_mask = _mm_test_all_ones(bitmask), i += 8) + { + const __m128i s_data = _mm_lddqu_si128(reinterpret_cast(first.data + i)); + const __m128i first_nm_mask = has_first_null_map ? 
+ _mm_cvtepi8_epi16(_mm_lddqu_si128(reinterpret_cast(first_null_map+i))) + : zeros; + bitmask = + _mm_or_si128( + _mm_or_si128( + _mm_or_si128( + _mm_or_si128( + _mm_andnot_si128( + first_nm_mask, + _mm_cmpeq_epi16(f_data, s_data)), + _mm_andnot_si128( + _mm_shuffle_epi8(first_nm_mask, _mm_set_epi8(13,12,11,10,9,8,7,6,5,4,3,2,1,0,15,14)), + _mm_cmpeq_epi16(f_data, _mm_shuffle_epi8(s_data, _mm_set_epi8(13,12,11,10,9,8,7,6,5,4,3,2,1,0,15,14))))), + _mm_or_si128( + _mm_andnot_si128( + _mm_shuffle_epi8(first_nm_mask, _mm_set_epi8(11,10,9,8,7,6,5,4,3,2,1,0,15,14,13,12)), + _mm_cmpeq_epi16(f_data, _mm_shuffle_epi8(s_data, _mm_set_epi8(11,10,9,8,7,6,5,4,3,2,1,0,15,14,13,12)))), + _mm_andnot_si128( + _mm_shuffle_epi8(first_nm_mask, _mm_set_epi8(9,8,7,6,5,4,3,2,1,0,15,14,13,12,11,10)), + _mm_cmpeq_epi16(f_data, _mm_shuffle_epi8(s_data, _mm_set_epi8(9,8,7,6,5,4,3,2,1,0,15,14,13,12,11,10))))) + ), + _mm_or_si128( + _mm_or_si128( + _mm_andnot_si128( + _mm_shuffle_epi8(first_nm_mask, _mm_set_epi8(7,6,5,4,3,2,1,0,15,14,13,12,11,10,9,8)), + _mm_cmpeq_epi16(f_data, _mm_shuffle_epi8(s_data, _mm_set_epi8(7,6,5,4,3,2,1,0,15,14,13,12,11,10,9,8)))), + _mm_andnot_si128( + _mm_shuffle_epi8(first_nm_mask, _mm_set_epi8(5,4,3,2,1,0,15,14,13,12,11,10,9,8,7,6)), + _mm_cmpeq_epi16(f_data, _mm_shuffle_epi8(s_data, _mm_set_epi8(5,4,3,2,1,0,15,14,13,12,11,10,9,8,7,6))))), + _mm_or_si128( + _mm_andnot_si128( + _mm_shuffle_epi8(first_nm_mask, _mm_set_epi8(3,2,1,0,15,14,13,12,11,10,9,8,7,6,5,4)), + _mm_cmpeq_epi16(f_data, _mm_shuffle_epi8(s_data, _mm_set_epi8(3,2,1,0,15,14,13,12,11,10,9,8,7,6,5,4)))), + _mm_andnot_si128( + _mm_shuffle_epi8(first_nm_mask, _mm_set_epi8(1,0,15,14,13,12,11,10,9,8,7,6,5,4,3,2)), + _mm_cmpeq_epi16(f_data, _mm_shuffle_epi8(s_data, _mm_set_epi8(1,0,15,14,13,12,11,10,9,8,7,6,5,4,3,2)))))) + ), + bitmask); + } + + if (i < first.size) + { + for (; i < first.size && !has_mask; ++i) + { + if (has_first_null_map && first_null_map[i]) + continue; + __m128i v_i = _mm_set1_epi16(first.data[i]); + bitmask = _mm_or_si128(bitmask, _mm_cmpeq_epi16(f_data, v_i)); + has_mask = _mm_test_all_ones(bitmask); + } + } + } + } + + if (!has_mask && second.size > 2) + return false; + + return hasAllIntegralLoopRemainder(j, first, second, first_null_map, second_null_map); +} + +// SSE4.2 Int8_t specialization +template <> +inline ALWAYS_INLINE bool sliceHasImplAnyAll, NumericArraySlice, sliceEqualElements >( + const NumericArraySlice & first, const NumericArraySlice & second, const UInt8 * first_null_map, const UInt8 * second_null_map) +{ + if (second.size == 0) + return true; + + if (!hasNull(first_null_map, first.size) && hasNull(second_null_map, second.size)) + return false; + + const bool has_first_null_map = first_null_map != nullptr; + const bool has_second_null_map = second_null_map != nullptr; + + size_t j = 0; + short has_mask = 1; + const int full = -1, none = 0; + const __m128i zeros = _mm_setzero_si128(); + if (second.size > 15) + { + for (; j < second.size - 15 && has_mask; j += 16) + { + has_mask = 0; + const __m128i f_data = _mm_lddqu_si128(reinterpret_cast(second.data+j)); + __m128i bitmask = has_second_null_map ? + _mm_set_epi8( + (second_null_map[j + 15]) ? full : none, (second_null_map[j + 14]) ? full : none, + (second_null_map[j + 13]) ? full : none, (second_null_map[j + 12]) ? full : none, + (second_null_map[j + 11]) ? full : none, (second_null_map[j + 10]) ? full : none, + (second_null_map[j + 9]) ? full : none, (second_null_map[j + 8]) ? full : none, + (second_null_map[j + 7]) ? 
full : none, (second_null_map[j + 6]) ? full : none, + (second_null_map[j + 5]) ? full : none, (second_null_map[j + 4]) ? full : none, + (second_null_map[j + 3]) ? full : none, (second_null_map[j + 2]) ? full : none, + (second_null_map[j + 1]) ? full : none, (second_null_map[j]) ? full : none) + : zeros; + unsigned i = 0; + for (; i < first.size - 15 && !has_mask; has_mask = _mm_test_all_ones(bitmask), i += 16) + { + const __m128i s_data = _mm_lddqu_si128(reinterpret_cast(first.data+i)); + const __m128i first_nm_mask = has_first_null_map ? + _mm_lddqu_si128(reinterpret_cast(first_null_map+i)) + : zeros; + bitmask = + _mm_or_si128( + _mm_or_si128( + _mm_or_si128( + _mm_or_si128( + _mm_or_si128( + _mm_andnot_si128( + first_nm_mask, + _mm_cmpeq_epi8(f_data, s_data)), + _mm_andnot_si128( + _mm_shuffle_epi8(first_nm_mask, _mm_set_epi8(14,13,12,11,10,9,8,7,6,5,4,3,2,1,0,15)), + _mm_cmpeq_epi8(f_data, _mm_shuffle_epi8(s_data, _mm_set_epi8(14,13,12,11,10,9,8,7,6,5,4,3,2,1,0,15))))), + _mm_or_si128( + _mm_andnot_si128( + _mm_shuffle_epi8(first_nm_mask, _mm_set_epi8(13,12,11,10,9,8,7,6,5,4,3,2,1,0,15,14)), + _mm_cmpeq_epi8(f_data, _mm_shuffle_epi8(s_data, _mm_set_epi8(13,12,11,10,9,8,7,6,5,4,3,2,1,0,15,14)))), + _mm_andnot_si128( + _mm_shuffle_epi8(first_nm_mask, _mm_set_epi8(12,11,10,9,8,7,6,5,4,3,2,1,0,15,14,13)), + _mm_cmpeq_epi8(f_data, _mm_shuffle_epi8(s_data, _mm_set_epi8(12,11,10,9,8,7,6,5,4,3,2,1,0,15,14,13))))) + ), + _mm_or_si128( + _mm_or_si128( + _mm_andnot_si128( + _mm_shuffle_epi8(first_nm_mask, _mm_set_epi8(11,10,9,8,7,6,5,4,3,2,1,0,15,14,13,12)), + _mm_cmpeq_epi8(f_data, _mm_shuffle_epi8(s_data, _mm_set_epi8(11,10,9,8,7,6,5,4,3,2,1,0,15,14,13,12)))), + _mm_andnot_si128( + _mm_shuffle_epi8(first_nm_mask, _mm_set_epi8(10,9,8,7,6,5,4,3,2,1,0,15,14,13,12,11)), + _mm_cmpeq_epi8(f_data, _mm_shuffle_epi8(s_data, _mm_set_epi8(10,9,8,7,6,5,4,3,2,1,0,15,14,13,12,11))))), + _mm_or_si128( + _mm_andnot_si128( + _mm_shuffle_epi8(first_nm_mask, _mm_set_epi8(9,8,7,6,5,4,3,2,1,0,15,14,13,12,11,10)), + _mm_cmpeq_epi8(f_data, _mm_shuffle_epi8(s_data, _mm_set_epi8(9,8,7,6,5,4,3,2,1,0,15,14,13,12,11,10)))), + _mm_andnot_si128( + _mm_shuffle_epi8(first_nm_mask, _mm_set_epi8(8,7,6,5,4,3,2,1,0,15,14,13,12,11,10,9)), + _mm_cmpeq_epi8(f_data, _mm_shuffle_epi8(s_data, _mm_set_epi8(8,7,6,5,4,3,2,1,0,15,14,13,12,11,10,9))))))), + _mm_or_si128( + _mm_or_si128( + _mm_or_si128( + _mm_andnot_si128( + _mm_shuffle_epi8(first_nm_mask, _mm_set_epi8(7,6,5,4,3,2,1,0,15,14,13,12,11,10,9,8)), + _mm_cmpeq_epi8(f_data, _mm_shuffle_epi8(s_data, _mm_set_epi8(7,6,5,4,3,2,1,0,15,14,13,12,11,10,9,8)))), + _mm_andnot_si128( + _mm_shuffle_epi8(first_nm_mask, _mm_set_epi8(6,5,4,3,2,1,0,15,14,13,12,11,10,9,8,7)), + _mm_cmpeq_epi8(f_data, _mm_shuffle_epi8(s_data, _mm_set_epi8(6,5,4,3,2,1,0,15,14,13,12,11,10,9,8,7))))), + _mm_or_si128( + _mm_andnot_si128( + _mm_shuffle_epi8(first_nm_mask, _mm_set_epi8(5,4,3,2,1,0,15,14,13,12,11,10,9,8,7,6)), + _mm_cmpeq_epi8(f_data, _mm_shuffle_epi8(s_data, _mm_set_epi8(5,4,3,2,1,0,15,14,13,12,11,10,9,8,7,6)))), + _mm_andnot_si128( + _mm_shuffle_epi8(first_nm_mask, _mm_set_epi8(4,3,2,1,0,15,14,13,12,11,10,9,8,7,6,5)), + _mm_cmpeq_epi8(f_data, _mm_shuffle_epi8(s_data, _mm_set_epi8(4,3,2,1,0,15,14,13,12,11,10,9,8,7,6,5)))))), + _mm_or_si128( + _mm_or_si128( + _mm_andnot_si128( + _mm_shuffle_epi8(first_nm_mask, _mm_set_epi8(3,2,1,0,15,14,13,12,11,10,9,8,7,6,5,4)), + _mm_cmpeq_epi8(f_data, _mm_shuffle_epi8(s_data, _mm_set_epi8(3,2,1,0,15,14,13,12,11,10,9,8,7,6,5,4)))), + _mm_andnot_si128( + 
_mm_shuffle_epi8(first_nm_mask, _mm_set_epi8(2,1,0,15,14,13,12,11,10,9,8,7,6,5,4,3)), + _mm_cmpeq_epi8(f_data, _mm_shuffle_epi8(s_data, _mm_set_epi8(2,1,0,15,14,13,12,11,10,9,8,7,6,5,4,3))))), + _mm_or_si128( + _mm_andnot_si128( + _mm_shuffle_epi8(first_nm_mask, _mm_set_epi8(1,0,15,14,13,12,11,10,9,8,7,6,5,4,3,2)), + _mm_cmpeq_epi8(f_data, _mm_shuffle_epi8(s_data, _mm_set_epi8(1,0,15,14,13,12,11,10,9,8,7,6,5,4,3,2)))), + _mm_andnot_si128( + _mm_shuffle_epi8(first_nm_mask, _mm_set_epi8(0,15,14,13,12,11,10,9,8,7,6,5,4,3,2,1)), + _mm_cmpeq_epi8(f_data, _mm_shuffle_epi8(s_data, _mm_set_epi8(0,15,14,13,12,11,10,9,8,7,6,5,4,3,2,1)))))))), + bitmask); + } + + if (i < first.size) + { + for (; i < first.size && !has_mask; ++i) + { + if (has_first_null_map && first_null_map[i]) + continue; + __m128i v_i = _mm_set1_epi8(first.data[i]); + bitmask = _mm_or_si128(bitmask, _mm_cmpeq_epi8(f_data, v_i)); + has_mask = _mm_test_all_ones(bitmask); + } + } + } + } + + if (!has_mask) + return false; + + return hasAllIntegralLoopRemainder(j, first, second, first_null_map, second_null_map); +} + +#endif + +} From 13878d261850d190dfec91c10d1d8846df7e8035 Mon Sep 17 00:00:00 2001 From: Youenn Lebras Date: Tue, 31 Aug 2021 14:04:15 +0200 Subject: [PATCH 009/239] Modify include files according to the processors capabilities --- src/Functions/GatherUtils/sliceHasImplAnyAll.h | 10 +++++++--- 1 file changed, 7 insertions(+), 3 deletions(-) diff --git a/src/Functions/GatherUtils/sliceHasImplAnyAll.h b/src/Functions/GatherUtils/sliceHasImplAnyAll.h index 59d37473e42..5603a802e7a 100644 --- a/src/Functions/GatherUtils/sliceHasImplAnyAll.h +++ b/src/Functions/GatherUtils/sliceHasImplAnyAll.h @@ -4,8 +4,12 @@ #include "Slices.h" #include "sliceEqualElements.h" -#if defined(__AVX2__) || defined(__SSE4_2__) -#include +#if defined(__SSE4_2__) + #include + #include +#endif +#if defined(__AVX2__) + #include #endif namespace DB::GatherUtils @@ -124,7 +128,7 @@ inline ALWAYS_INLINE bool hasAllIntegralLoopRemainder( #endif #if defined(__AVX2__) -// AVX2 Int specialization +// AVX2 Int specialization of sliceHasImplAnyAll template <> inline ALWAYS_INLINE bool sliceHasImplAnyAll, NumericArraySlice, sliceEqualElements >( const NumericArraySlice & first, const NumericArraySlice & second, const UInt8 * first_null_map, const UInt8 * second_null_map) From ade754d444f668eb0534e36f998bba341ba047e4 Mon Sep 17 00:00:00 2001 From: Youenn Lebras Date: Thu, 2 Sep 2021 18:28:25 +0200 Subject: [PATCH 010/239] Fix a bug for avx2 and add performance tests for HasAll --- tests/performance/hasAll.xml | 113 +++++++++++++++++++++++++++++++++++ 1 file changed, 113 insertions(+) create mode 100644 tests/performance/hasAll.xml diff --git a/tests/performance/hasAll.xml b/tests/performance/hasAll.xml new file mode 100644 index 00000000000..a6ceb915bd5 --- /dev/null +++ b/tests/performance/hasAll.xml @@ -0,0 +1,113 @@ + + CREATE TABLE test_table_small (`set` Array(Int32), `subset` Array (Int32)) ENGINE = MergeTree ORDER BY set + CREATE TABLE test_table_small2 (`set` Array(Int32), `subset` Array (Int32)) ENGINE = MergeTree ORDER BY set + CREATE TABLE test_table_smallf (`set` Array(Int32), `subset` Array (Int32)) ENGINE = MergeTree ORDER BY set + + CREATE TABLE test_table_medium (`set` Array(Int32), `subset` Array (Int32)) ENGINE = MergeTree ORDER BY set + CREATE TABLE test_table_medium2 (`set` Array(Int32), `subset` Array (Int32)) ENGINE = MergeTree ORDER BY set + CREATE TABLE test_table_mediumf (`set` Array(Int32), `subset` Array (Int32)) ENGINE = MergeTree 
ORDER BY set + + CREATE TABLE test_table_large (`set` Array(Int32), `subset` Array (Int32)) ENGINE = MergeTree ORDER BY set + CREATE TABLE test_table_large2 (`set` Array(Int32), `subset` Array (Int32)) ENGINE = MergeTree ORDER BY set + CREATE TABLE test_table_largef (`set` Array(Int32), `subset` Array (Int32)) ENGINE = MergeTree ORDER BY set + + + + INSERT INTO test_table_small SELECT + groupArraySample(500)(number) AS set, + groupArraySample(10)(number) AS subset + FROM (SELECT * FROM numbers(500)) + + INSERT INTO test_table_small2 SELECT + groupArraySample(500)(number) AS set, + groupArraySample(400)(number) AS subset + FROM (SELECT * FROM numbers(500)) + + INSERT INTO test_table_smallf SELECT + groupArraySample(500)(number) AS set, + groupArraySample(10)(number) AS subset + FROM (SELECT * FROM numbers(5000000)) + + + + INSERT INTO test_table_medium SELECT + groupArraySample(50000)(number) AS set, + groupArraySample(10)(number) AS subset + FROM + ( + SELECT * + FROM numbers(50000) + ) + + INSERT INTO test_table_medium2 SELECT + groupArraySample(50000)(number) AS set, + groupArraySample(4000)(number) AS subset + FROM + ( + SELECT * + FROM numbers(50000) + ) + + INSERT INTO test_table_mediumf SELECT + groupArraySample(50000)(number) AS set, + groupArraySample(10)(number) AS subset + FROM + ( + SELECT * + FROM numbers(5000000) + ) + + + + INSERT INTO test_table_large SELECT + groupArraySample(5000000)(number) AS set, + groupArraySample(10)(number) AS subset + FROM + ( + SELECT * + FROM numbers(5000000) + ) + + INSERT INTO test_table_large2 SELECT + groupArraySample(5000000)(number) AS set, + groupArraySample(4000)(number) AS subset + FROM + ( + SELECT * + FROM numbers(5000000) + ) + + INSERT INTO test_table_largef SELECT + groupArraySample(5000000)(number) AS set, + groupArraySample(10)(number) AS subset + FROM + ( + SELECT * + FROM numbers(100000000) + ) + + + select hasAll(set, subset) from test_table_small + select hasAll(set, subset) from test_table_small2 + select hasAll(set, subset) from test_table_smallf + + select hasAll(set, subset) from test_table_medium + select hasAll(set, subset) from test_table_medium2 + select hasAll(set, subset) from test_table_mediumf + + select hasAll(set, subset) from test_table_large + select hasAll(set, subset) from test_table_large2 + select hasAll(set, subset) from test_table_largef + + DROP TABLE IF EXISTS test_table_small + DROP TABLE IF EXISTS test_table_small2 + DROP TABLE IF EXISTS test_table_smallf + + DROP TABLE IF EXISTS test_table_medium + DROP TABLE IF EXISTS test_table_medium2 + DROP TABLE IF EXISTS test_table_mediumf + + DROP TABLE IF EXISTS test_table_large + DROP TABLE IF EXISTS test_table_large2 + DROP TABLE IF EXISTS test_table_largef + From a71944d11ddcbc2d277692d8175fea5d91220aed Mon Sep 17 00:00:00 2001 From: Youenn Lebras Date: Fri, 3 Sep 2021 12:19:42 +0200 Subject: [PATCH 011/239] Add performance tests for HasAll for int{64,16,8} --- tests/performance/hasAll.xml | 113 ------------------------ tests/performance/hasAll_simd_int16.xml | 52 +++++++++++ tests/performance/hasAll_simd_int32.xml | 52 +++++++++++ tests/performance/hasAll_simd_int64.xml | 52 +++++++++++ tests/performance/hasAll_simd_int8.xml | 52 +++++++++++ 5 files changed, 208 insertions(+), 113 deletions(-) delete mode 100644 tests/performance/hasAll.xml create mode 100644 tests/performance/hasAll_simd_int16.xml create mode 100644 tests/performance/hasAll_simd_int32.xml create mode 100644 tests/performance/hasAll_simd_int64.xml create mode 100644 
tests/performance/hasAll_simd_int8.xml diff --git a/tests/performance/hasAll.xml b/tests/performance/hasAll.xml deleted file mode 100644 index a6ceb915bd5..00000000000 --- a/tests/performance/hasAll.xml +++ /dev/null @@ -1,113 +0,0 @@ - - CREATE TABLE test_table_small (`set` Array(Int32), `subset` Array (Int32)) ENGINE = MergeTree ORDER BY set - CREATE TABLE test_table_small2 (`set` Array(Int32), `subset` Array (Int32)) ENGINE = MergeTree ORDER BY set - CREATE TABLE test_table_smallf (`set` Array(Int32), `subset` Array (Int32)) ENGINE = MergeTree ORDER BY set - - CREATE TABLE test_table_medium (`set` Array(Int32), `subset` Array (Int32)) ENGINE = MergeTree ORDER BY set - CREATE TABLE test_table_medium2 (`set` Array(Int32), `subset` Array (Int32)) ENGINE = MergeTree ORDER BY set - CREATE TABLE test_table_mediumf (`set` Array(Int32), `subset` Array (Int32)) ENGINE = MergeTree ORDER BY set - - CREATE TABLE test_table_large (`set` Array(Int32), `subset` Array (Int32)) ENGINE = MergeTree ORDER BY set - CREATE TABLE test_table_large2 (`set` Array(Int32), `subset` Array (Int32)) ENGINE = MergeTree ORDER BY set - CREATE TABLE test_table_largef (`set` Array(Int32), `subset` Array (Int32)) ENGINE = MergeTree ORDER BY set - - - - INSERT INTO test_table_small SELECT - groupArraySample(500)(number) AS set, - groupArraySample(10)(number) AS subset - FROM (SELECT * FROM numbers(500)) - - INSERT INTO test_table_small2 SELECT - groupArraySample(500)(number) AS set, - groupArraySample(400)(number) AS subset - FROM (SELECT * FROM numbers(500)) - - INSERT INTO test_table_smallf SELECT - groupArraySample(500)(number) AS set, - groupArraySample(10)(number) AS subset - FROM (SELECT * FROM numbers(5000000)) - - - - INSERT INTO test_table_medium SELECT - groupArraySample(50000)(number) AS set, - groupArraySample(10)(number) AS subset - FROM - ( - SELECT * - FROM numbers(50000) - ) - - INSERT INTO test_table_medium2 SELECT - groupArraySample(50000)(number) AS set, - groupArraySample(4000)(number) AS subset - FROM - ( - SELECT * - FROM numbers(50000) - ) - - INSERT INTO test_table_mediumf SELECT - groupArraySample(50000)(number) AS set, - groupArraySample(10)(number) AS subset - FROM - ( - SELECT * - FROM numbers(5000000) - ) - - - - INSERT INTO test_table_large SELECT - groupArraySample(5000000)(number) AS set, - groupArraySample(10)(number) AS subset - FROM - ( - SELECT * - FROM numbers(5000000) - ) - - INSERT INTO test_table_large2 SELECT - groupArraySample(5000000)(number) AS set, - groupArraySample(4000)(number) AS subset - FROM - ( - SELECT * - FROM numbers(5000000) - ) - - INSERT INTO test_table_largef SELECT - groupArraySample(5000000)(number) AS set, - groupArraySample(10)(number) AS subset - FROM - ( - SELECT * - FROM numbers(100000000) - ) - - - select hasAll(set, subset) from test_table_small - select hasAll(set, subset) from test_table_small2 - select hasAll(set, subset) from test_table_smallf - - select hasAll(set, subset) from test_table_medium - select hasAll(set, subset) from test_table_medium2 - select hasAll(set, subset) from test_table_mediumf - - select hasAll(set, subset) from test_table_large - select hasAll(set, subset) from test_table_large2 - select hasAll(set, subset) from test_table_largef - - DROP TABLE IF EXISTS test_table_small - DROP TABLE IF EXISTS test_table_small2 - DROP TABLE IF EXISTS test_table_smallf - - DROP TABLE IF EXISTS test_table_medium - DROP TABLE IF EXISTS test_table_medium2 - DROP TABLE IF EXISTS test_table_mediumf - - DROP TABLE IF EXISTS test_table_large - DROP 
TABLE IF EXISTS test_table_large2 - DROP TABLE IF EXISTS test_table_largef - diff --git a/tests/performance/hasAll_simd_int16.xml b/tests/performance/hasAll_simd_int16.xml new file mode 100644 index 00000000000..c2ce4eec77f --- /dev/null +++ b/tests/performance/hasAll_simd_int16.xml @@ -0,0 +1,52 @@ + + CREATE TABLE test_table_small (`set` Array(Int16), `subset` Array (Int16)) ENGINE = MergeTree ORDER BY set + CREATE TABLE test_table_small2 (`set` Array(Int16), `subset` Array (Int16)) ENGINE = MergeTree ORDER BY set + CREATE TABLE test_table_smallf (`set` Array(Int16), `subset` Array (Int16)) ENGINE = MergeTree ORDER BY set + + CREATE TABLE test_table_medium (`set` Array(Int16), `subset` Array (Int16)) ENGINE = MergeTree ORDER BY set + CREATE TABLE test_table_medium2 (`set` Array(Int16), `subset` Array (Int16)) ENGINE = MergeTree ORDER BY set + CREATE TABLE test_table_mediumf (`set` Array(Int16), `subset` Array (Int16)) ENGINE = MergeTree ORDER BY set + + CREATE TABLE test_table_large (`set` Array(Int16), `subset` Array (Int16)) ENGINE = MergeTree ORDER BY set + CREATE TABLE test_table_large2 (`set` Array(Int16), `subset` Array (Int16)) ENGINE = MergeTree ORDER BY set + CREATE TABLE test_table_largef (`set` Array(Int16), `subset` Array (Int16)) ENGINE = MergeTree ORDER BY set + + + INSERT INTO test_table_small SELECT groupArraySample(500)(number) AS set, groupArraySample(10)(number) AS subset FROM (SELECT * FROM numbers(500)) + INSERT INTO test_table_small2 SELECT groupArraySample(500)(number) AS set, groupArraySample(400)(number) AS subset FROM (SELECT * FROM numbers(500)) + INSERT INTO test_table_smallf SELECT groupArraySample(500)(number) AS set, groupArraySample(10)(number) AS subset FROM (SELECT * FROM numbers(5000000)) + + + INSERT INTO test_table_medium SELECT groupArraySample(1000000)(number) AS set, groupArraySample(10)(number) AS subset FROM (SELECT * FROM numbers(1000000)) + INSERT INTO test_table_medium2 SELECT groupArraySample(1000000)(number) AS set, groupArraySample(4000)(number) AS subset FROM (SELECT * FROM numbers(1000000)) + INSERT INTO test_table_mediumf SELECT groupArraySample(1000000)(number) AS set, groupArraySample(10)(number) AS subset FROM (SELECT * FROM numbers(100000000)) + + + INSERT INTO test_table_large SELECT groupArraySample(50000000)(number) AS set, groupArraySample(10)(number) AS subset FROM (SELECT * FROM numbers(50000000)) + INSERT INTO test_table_large2 SELECT groupArraySample(50000000)(number) AS set, groupArraySample(8000)(number) AS subset FROM (SELECT * FROM numbers(50000000)) + INSERT INTO test_table_largef SELECT groupArraySample(50000000)(number) AS set, groupArraySample(10)(number) AS subset FROM (SELECT * FROM numbers(1000000000)) + + select hasAll(set, subset) from test_table_small + select hasAll(set, subset) from test_table_small2 + select hasAll(set, subset) from test_table_smallf + + select hasAll(set, subset) from test_table_medium + select hasAll(set, subset) from test_table_medium2 + select hasAll(set, subset) from test_table_mediumf + + select hasAll(set, subset) from test_table_large + select hasAll(set, subset) from test_table_large2 Settings max_execution_time=300 + select hasAll(set, subset) from test_table_largef + + DROP TABLE IF EXISTS test_table_small + DROP TABLE IF EXISTS test_table_small2 + DROP TABLE IF EXISTS test_table_smallf + + DROP TABLE IF EXISTS test_table_medium + DROP TABLE IF EXISTS test_table_medium2 + DROP TABLE IF EXISTS test_table_mediumf + + DROP TABLE IF EXISTS test_table_large + DROP TABLE IF EXISTS 
test_table_large2 + DROP TABLE IF EXISTS test_table_largef + diff --git a/tests/performance/hasAll_simd_int32.xml b/tests/performance/hasAll_simd_int32.xml new file mode 100644 index 00000000000..4543dea161b --- /dev/null +++ b/tests/performance/hasAll_simd_int32.xml @@ -0,0 +1,52 @@ + + CREATE TABLE test_table_small (`set` Array(Int32), `subset` Array (Int32)) ENGINE = MergeTree ORDER BY set + CREATE TABLE test_table_small2 (`set` Array(Int32), `subset` Array (Int32)) ENGINE = MergeTree ORDER BY set + CREATE TABLE test_table_smallf (`set` Array(Int32), `subset` Array (Int32)) ENGINE = MergeTree ORDER BY set + + CREATE TABLE test_table_medium (`set` Array(Int32), `subset` Array (Int32)) ENGINE = MergeTree ORDER BY set + CREATE TABLE test_table_medium2 (`set` Array(Int32), `subset` Array (Int32)) ENGINE = MergeTree ORDER BY set + CREATE TABLE test_table_mediumf (`set` Array(Int32), `subset` Array (Int32)) ENGINE = MergeTree ORDER BY set + + CREATE TABLE test_table_large (`set` Array(Int32), `subset` Array (Int32)) ENGINE = MergeTree ORDER BY set + CREATE TABLE test_table_large2 (`set` Array(Int32), `subset` Array (Int32)) ENGINE = MergeTree ORDER BY set + CREATE TABLE test_table_largef (`set` Array(Int32), `subset` Array (Int32)) ENGINE = MergeTree ORDER BY set + + + INSERT INTO test_table_small SELECT groupArraySample(500)(number) AS set, groupArraySample(10)(number) AS subset FROM (SELECT * FROM numbers(500)) + INSERT INTO test_table_small2 SELECT groupArraySample(500)(number) AS set, groupArraySample(400)(number) AS subset FROM (SELECT * FROM numbers(500)) + INSERT INTO test_table_smallf SELECT groupArraySample(500)(number) AS set, groupArraySample(10)(number) AS subset FROM (SELECT * FROM numbers(5000000)) + + + INSERT INTO test_table_medium SELECT groupArraySample(1000000)(number) AS set, groupArraySample(10)(number) AS subset FROM (SELECT * FROM numbers(1000000)) + INSERT INTO test_table_medium2 SELECT groupArraySample(1000000)(number) AS set, groupArraySample(4000)(number) AS subset FROM (SELECT * FROM numbers(1000000)) + INSERT INTO test_table_mediumf SELECT groupArraySample(1000000)(number) AS set, groupArraySample(10)(number) AS subset FROM (SELECT * FROM numbers(100000000)) + + + INSERT INTO test_table_large SELECT groupArraySample(50000000)(number) AS set, groupArraySample(10)(number) AS subset FROM (SELECT * FROM numbers(50000000)) + INSERT INTO test_table_large2 SELECT groupArraySample(50000000)(number) AS set, groupArraySample(4000)(number) AS subset FROM (SELECT * FROM numbers(50000000)) Settings max_execution_time=30 + INSERT INTO test_table_largef SELECT groupArraySample(50000000)(number) AS set, groupArraySample(10)(number) AS subset FROM (SELECT * FROM numbers(1000000000)) + + select hasAll(set, subset) from test_table_small + select hasAll(set, subset) from test_table_small2 + select hasAll(set, subset) from test_table_smallf + + select hasAll(set, subset) from test_table_medium + select hasAll(set, subset) from test_table_medium2 + select hasAll(set, subset) from test_table_mediumf + + select hasAll(set, subset) from test_table_large + select hasAll(set, subset) from test_table_large2 Settings max_execution_time=300 + select hasAll(set, subset) from test_table_largef + + DROP TABLE IF EXISTS test_table_small + DROP TABLE IF EXISTS test_table_small2 + DROP TABLE IF EXISTS test_table_smallf + + DROP TABLE IF EXISTS test_table_medium + DROP TABLE IF EXISTS test_table_medium2 + DROP TABLE IF EXISTS test_table_mediumf + + DROP TABLE IF EXISTS test_table_large + DROP TABLE 
IF EXISTS test_table_large2 + DROP TABLE IF EXISTS test_table_largef + diff --git a/tests/performance/hasAll_simd_int64.xml b/tests/performance/hasAll_simd_int64.xml new file mode 100644 index 00000000000..07e52483bb1 --- /dev/null +++ b/tests/performance/hasAll_simd_int64.xml @@ -0,0 +1,52 @@ + + CREATE TABLE test_table_small (`set` Array(Int64), `subset` Array (Int64)) ENGINE = MergeTree ORDER BY set + CREATE TABLE test_table_small2 (`set` Array(Int64), `subset` Array (Int64)) ENGINE = MergeTree ORDER BY set + CREATE TABLE test_table_smallf (`set` Array(Int64), `subset` Array (Int64)) ENGINE = MergeTree ORDER BY set + + CREATE TABLE test_table_medium (`set` Array(Int64), `subset` Array (Int64)) ENGINE = MergeTree ORDER BY set + CREATE TABLE test_table_medium2 (`set` Array(Int64), `subset` Array (Int64)) ENGINE = MergeTree ORDER BY set + CREATE TABLE test_table_mediumf (`set` Array(Int64), `subset` Array (Int64)) ENGINE = MergeTree ORDER BY set + + CREATE TABLE test_table_large (`set` Array(Int64), `subset` Array (Int64)) ENGINE = MergeTree ORDER BY set + CREATE TABLE test_table_large2 (`set` Array(Int64), `subset` Array (Int64)) ENGINE = MergeTree ORDER BY set + CREATE TABLE test_table_largef (`set` Array(Int64), `subset` Array (Int64)) ENGINE = MergeTree ORDER BY set + + + INSERT INTO test_table_small SELECT groupArraySample(500)(number) AS set, groupArraySample(10)(number) AS subset FROM (SELECT * FROM numbers(500)) + INSERT INTO test_table_small2 SELECT groupArraySample(500)(number) AS set, groupArraySample(400)(number) AS subset FROM (SELECT * FROM numbers(500)) + INSERT INTO test_table_smallf SELECT groupArraySample(500)(number) AS set, groupArraySample(10)(number) AS subset FROM (SELECT * FROM numbers(5000000)) + + + INSERT INTO test_table_medium SELECT groupArraySample(1000000)(number) AS set, groupArraySample(10)(number) AS subset FROM (SELECT * FROM numbers(1000000)) + INSERT INTO test_table_medium2 SELECT groupArraySample(1000000)(number) AS set, groupArraySample(4000)(number) AS subset FROM (SELECT * FROM numbers(1000000)) + INSERT INTO test_table_mediumf SELECT groupArraySample(1000000)(number) AS set, groupArraySample(10)(number) AS subset FROM (SELECT * FROM numbers(100000000)) + + + INSERT INTO test_table_large SELECT groupArraySample(50000000)(number) AS set, groupArraySample(10)(number) AS subset FROM (SELECT * FROM numbers(50000000)) + INSERT INTO test_table_large2 SELECT groupArraySample(50000000)(number) AS set, groupArraySample(2000)(number) AS subset FROM (SELECT * FROM numbers(50000000)) Settings max_execution_time=30 + INSERT INTO test_table_largef SELECT groupArraySample(50000000)(number) AS set, groupArraySample(10)(number) AS subset FROM (SELECT * FROM numbers(1000000000)) + + select hasAll(set, subset) from test_table_small + select hasAll(set, subset) from test_table_small2 + select hasAll(set, subset) from test_table_smallf + + select hasAll(set, subset) from test_table_medium + select hasAll(set, subset) from test_table_medium2 + select hasAll(set, subset) from test_table_mediumf + + select hasAll(set, subset) from test_table_large + select hasAll(set, subset) from test_table_large2 Settings max_execution_time=300 + select hasAll(set, subset) from test_table_largef + + DROP TABLE IF EXISTS test_table_small + DROP TABLE IF EXISTS test_table_small2 + DROP TABLE IF EXISTS test_table_smallf + + DROP TABLE IF EXISTS test_table_medium + DROP TABLE IF EXISTS test_table_medium2 + DROP TABLE IF EXISTS test_table_mediumf + + DROP TABLE IF EXISTS test_table_large + 
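A reading aid for these fixtures: in every size class the plain table fills `set` with effectively all numbers of the sampled range, so hasAll(set, subset) returns 1 even though `subset` is tiny; the `*2` variant keeps that property but uses a much larger subset; and the `*f` variant draws both columns from a far wider range, so the subset is almost never fully contained and hasAll returns 0 (for example, hasAll([1, 2, 3], [2, 4]) = 0), which exercises the unsuccessful path.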
DROP TABLE IF EXISTS test_table_large2 + DROP TABLE IF EXISTS test_table_largef + diff --git a/tests/performance/hasAll_simd_int8.xml b/tests/performance/hasAll_simd_int8.xml new file mode 100644 index 00000000000..5ddc84aa5bd --- /dev/null +++ b/tests/performance/hasAll_simd_int8.xml @@ -0,0 +1,52 @@ + + CREATE TABLE test_table_small (`set` Array(Int8), `subset` Array (Int8)) ENGINE = MergeTree ORDER BY set + CREATE TABLE test_table_small2 (`set` Array(Int8), `subset` Array (Int8)) ENGINE = MergeTree ORDER BY set + CREATE TABLE test_table_smallf (`set` Array(Int8), `subset` Array (Int8)) ENGINE = MergeTree ORDER BY set + + CREATE TABLE test_table_medium (`set` Array(Int8), `subset` Array (Int8)) ENGINE = MergeTree ORDER BY set + CREATE TABLE test_table_medium2 (`set` Array(Int8), `subset` Array (Int8)) ENGINE = MergeTree ORDER BY set + CREATE TABLE test_table_mediumf (`set` Array(Int8), `subset` Array (Int8)) ENGINE = MergeTree ORDER BY set + + CREATE TABLE test_table_large (`set` Array(Int8), `subset` Array (Int8)) ENGINE = MergeTree ORDER BY set + CREATE TABLE test_table_large2 (`set` Array(Int8), `subset` Array (Int8)) ENGINE = MergeTree ORDER BY set + CREATE TABLE test_table_largef (`set` Array(Int8), `subset` Array (Int8)) ENGINE = MergeTree ORDER BY set + + + INSERT INTO test_table_small SELECT groupArraySample(500)(number) AS set, groupArraySample(10)(number) AS subset FROM (SELECT * FROM numbers(500)) + INSERT INTO test_table_small2 SELECT groupArraySample(500)(number) AS set, groupArraySample(400)(number) AS subset FROM (SELECT * FROM numbers(500)) + INSERT INTO test_table_smallf SELECT groupArraySample(500)(number) AS set, groupArraySample(10)(number) AS subset FROM (SELECT * FROM numbers(5000000)) + + + INSERT INTO test_table_medium SELECT groupArraySample(1000000)(number) AS set, groupArraySample(10)(number) AS subset FROM (SELECT * FROM numbers(1000000)) + INSERT INTO test_table_medium2 SELECT groupArraySample(1000000)(number) AS set, groupArraySample(4000)(number) AS subset FROM (SELECT * FROM numbers(1000000)) + INSERT INTO test_table_mediumf SELECT groupArraySample(1000000)(number) AS set, groupArraySample(10)(number) AS subset FROM (SELECT * FROM numbers(100000000)) + + + INSERT INTO test_table_large SELECT groupArraySample(50000000)(number) AS set, groupArraySample(10)(number) AS subset FROM (SELECT * FROM numbers(50000000)) + INSERT INTO test_table_large2 SELECT groupArraySample(50000000)(number) AS set, groupArraySample(4000)(number) AS subset FROM (SELECT * FROM numbers(50000000)) + INSERT INTO test_table_largef SELECT groupArraySample(50000000)(number) AS set, groupArraySample(10)(number) AS subset FROM (SELECT * FROM numbers(1000000000)) + + select hasAll(set, subset) from test_table_small + select hasAll(set, subset) from test_table_small2 + select hasAll(set, subset) from test_table_smallf + + select hasAll(set, subset) from test_table_medium + select hasAll(set, subset) from test_table_medium2 + select hasAll(set, subset) from test_table_mediumf + + select hasAll(set, subset) from test_table_large + select hasAll(set, subset) from test_table_large2 Settings max_execution_time=300 + select hasAll(set, subset) from test_table_largef + + DROP TABLE IF EXISTS test_table_small + DROP TABLE IF EXISTS test_table_small2 + DROP TABLE IF EXISTS test_table_smallf + + DROP TABLE IF EXISTS test_table_medium + DROP TABLE IF EXISTS test_table_medium2 + DROP TABLE IF EXISTS test_table_mediumf + + DROP TABLE IF EXISTS test_table_large + DROP TABLE IF EXISTS test_table_large2 + 
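The SSE and AVX2 patches that follow all rely on the same block-compare scheme, so a simplified sketch may help before reading the intrinsics-heavy hunks. The helper below is hypothetical and not part of the patch; it assumes Int32 data, no null maps, and sizes that are multiples of four: each block of subset elements is compared against every rotation of each block of set elements, the equality masks are OR-ed into a per-lane bitmask, and the block is satisfied once that bitmask is all ones.

    #include <smmintrin.h> // SSE4.1 provides _mm_test_all_ones; build with -msse4.2 as the CMake changes do
    #include <cstddef>
    #include <cstdint>

    // Hypothetical illustration of the scheme, not the patch's code.
    static bool hasAllInt32Sketch(const int32_t * set, size_t set_size, const int32_t * subset, size_t subset_size)
    {
        for (size_t j = 0; j < subset_size; j += 4)
        {
            // Four subset elements; a lane of `found` becomes all ones once its element is matched.
            const __m128i needle = _mm_loadu_si128(reinterpret_cast<const __m128i *>(subset + j));
            __m128i found = _mm_setzero_si128();

            for (size_t i = 0; i < set_size && !_mm_test_all_ones(found); i += 4)
            {
                const __m128i hay = _mm_loadu_si128(reinterpret_cast<const __m128i *>(set + i));
                // Compare the subset block against all four rotations of the set block,
                // so every subset lane meets every set lane exactly once.
                found = _mm_or_si128(found, _mm_cmpeq_epi32(needle, hay));
                found = _mm_or_si128(found, _mm_cmpeq_epi32(needle, _mm_shuffle_epi32(hay, _MM_SHUFFLE(0, 3, 2, 1))));
                found = _mm_or_si128(found, _mm_cmpeq_epi32(needle, _mm_shuffle_epi32(hay, _MM_SHUFFLE(1, 0, 3, 2))));
                found = _mm_or_si128(found, _mm_cmpeq_epi32(needle, _mm_shuffle_epi32(hay, _MM_SHUFFLE(2, 1, 0, 3))));
            }

            if (!_mm_test_all_ones(found))
                return false; // some element of this subset block never appeared in `set`
        }
        return true;
    }

The real specializations below additionally widen the null maps (for example with _mm_cvtepi8_epi32), rotate that mask alongside the data and apply it through _mm_andnot_si128 / _mm256_andnot_si256 so NULL positions never count as matches, and they finish odd-sized inputs with the scalar hasAllIntegralLoopRemainder.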
DROP TABLE IF EXISTS test_table_largef + From 62487fe2fcf0eb90e53bd9291afec42761f4a797 Mon Sep 17 00:00:00 2001 From: Youenn Lebras Date: Mon, 6 Sep 2021 09:20:03 +0200 Subject: [PATCH 012/239] Pass SSE version to 4.2 and exploiting it's specific loadu --- src/Functions/GatherUtils/CMakeLists.txt | 3 + .../GatherUtils/sliceHasImplAnyAll.h | 140 ++++++++++-------- 2 files changed, 79 insertions(+), 64 deletions(-) diff --git a/src/Functions/GatherUtils/CMakeLists.txt b/src/Functions/GatherUtils/CMakeLists.txt index 731407e774c..a379ccbadde 100644 --- a/src/Functions/GatherUtils/CMakeLists.txt +++ b/src/Functions/GatherUtils/CMakeLists.txt @@ -11,6 +11,9 @@ if (HAS_SUGGEST_DESTRUCTOR_OVERRIDE) target_compile_definitions(clickhouse_functions_gatherutils PUBLIC HAS_SUGGEST_DESTRUCTOR_OVERRIDE) endif() +if (HAVE_SSE42) + set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -msse2 -msse4.2") +endif() if (HAVE_AVX2) target_compile_options(clickhouse_functions_gatherutils PRIVATE -mavx2 -DNAMESPACE=AVX2) endif() diff --git a/src/Functions/GatherUtils/sliceHasImplAnyAll.h b/src/Functions/GatherUtils/sliceHasImplAnyAll.h index 5603a802e7a..111b9d767dd 100644 --- a/src/Functions/GatherUtils/sliceHasImplAnyAll.h +++ b/src/Functions/GatherUtils/sliceHasImplAnyAll.h @@ -6,6 +6,7 @@ #if defined(__SSE4_2__) #include + #include #include #endif #if defined(__AVX2__) @@ -153,7 +154,7 @@ inline ALWAYS_INLINE bool sliceHasImplAnyAll(second.data + j)); - // bitmask is filled with minus ones for ones which are considered as null in the corresponding null map, 0 otherwise; + // bits of the bitmask are set to one if considered as null in the corresponding null map, 0 otherwise; __m256i bitmask = has_second_null_map ? _mm256_set_epi32( (second_null_map[j + 7]) ? full : none, @@ -167,15 +168,16 @@ inline ALWAYS_INLINE bool sliceHasImplAnyAll(first.data + i)); // Create a mask to avoid to compare null elements - // set_m128i takes two arguments: (high segment, low segment) that are two __m128i convert from 8bits to 32bits to fit to our following operations - const __m256i first_nm_mask = _mm256_set_m128i( - _mm_cvtepi8_epi32(_mm_lddqu_si128(reinterpret_cast(first_null_map + i + 4))), - _mm_cvtepi8_epi32(_mm_lddqu_si128(reinterpret_cast(first_null_map + i)))); + // set_m128i takes two arguments: (high segment, low segment) that are two __m128i convert from 8bits to 32bits to match with next operations + const __m256i first_nm_mask = has_first_null_map? + _mm256_set_m128i( + _mm_cvtepi8_epi32(_mm_lddqu_si128(reinterpret_cast(first_null_map + i + 4))), + _mm_cvtepi8_epi32(_mm_lddqu_si128(reinterpret_cast(first_null_map + i)))) + : zeros; bitmask = _mm256_or_si256( _mm256_or_si256( @@ -228,7 +230,7 @@ inline ALWAYS_INLINE bool sliceHasImplAnyAll 7) return false; return hasAllIntegralLoopRemainder(j, first, second, first_null_map, second_null_map); @@ -262,7 +264,7 @@ inline ALWAYS_INLINE bool sliceHasImplAnyAll 3 && first.size > 3) @@ -271,6 +273,7 @@ inline ALWAYS_INLINE bool sliceHasImplAnyAll(second.data + j)); + // bits of the bitmask are set to one if considered as null in the corresponding null map, 0 otherwise; __m256i bitmask = has_second_null_map ? _mm256_set_epi64x( (second_null_map[j + 3])? 
full : none, @@ -283,9 +286,11 @@ inline ALWAYS_INLINE bool sliceHasImplAnyAll(first.data + i)); - const __m256i first_nm_mask = _mm256_set_m128i( - _mm_cvtepi8_epi64(_mm_lddqu_si128(reinterpret_cast(first_null_map + i + 2))), - _mm_cvtepi8_epi64(_mm_lddqu_si128(reinterpret_cast(first_null_map + i)))); + const __m256i first_nm_mask = has_first_null_map? + _mm256_set_m128i( + _mm_cvtepi8_epi64(_mm_lddqu_si128(reinterpret_cast(first_null_map + i + 2))), + _mm_cvtepi8_epi64(_mm_lddqu_si128(reinterpret_cast(first_null_map + i)))) + : zeros; bitmask = _mm256_or_si256( _mm256_or_si256( @@ -321,7 +326,7 @@ inline ALWAYS_INLINE bool sliceHasImplAnyAll 2) + if (!has_mask && second.size > 3) return false; return hasAllIntegralLoopRemainder(j, first, second, first_null_map, second_null_map); @@ -343,7 +348,7 @@ inline ALWAYS_INLINE bool sliceHasImplAnyAll 15 && first.size > 15) @@ -367,9 +372,11 @@ inline ALWAYS_INLINE bool sliceHasImplAnyAll(first.data + i)); - const __m256i first_nm_mask = _mm256_set_m128i( - _mm_cvtepi8_epi16(_mm_lddqu_si128(reinterpret_cast(first_null_map+i+8))), - _mm_cvtepi8_epi16(_mm_lddqu_si128(reinterpret_cast(first_null_map+i)))); + const __m256i first_nm_mask = has_first_null_map? + _mm256_set_m128i( + _mm_cvtepi8_epi16(_mm_lddqu_si128(reinterpret_cast(first_null_map + i + 8))), + _mm_cvtepi8_epi16(_mm_lddqu_si128(reinterpret_cast(first_null_map + i)))) + : zeros; bitmask = _mm256_or_si256( _mm256_or_si256( @@ -457,7 +464,7 @@ inline ALWAYS_INLINE bool sliceHasImplAnyAll 2) + if (!has_mask && second.size > 15) return false; return hasAllIntegralLoopRemainder(j, first, second, first_null_map, second_null_map); @@ -481,10 +488,10 @@ inline ALWAYS_INLINE bool sliceHasImplAnyAll 3 && first.size > 2) + if (second.size > 3 && first.size > 3) { - const int full = -1, none = 0; for (; j < second.size - 3 && has_mask; j += 4) { has_mask = 0; @@ -540,7 +547,7 @@ inline ALWAYS_INLINE bool sliceHasImplAnyAll 3) return false; return hasAllIntegralLoopRemainder(j, first, second, first_null_map, second_null_map); @@ -564,48 +571,51 @@ inline ALWAYS_INLINE bool sliceHasImplAnyAll 1 && first.size > 1) { - has_mask = 0; - const __m128i f_data = _mm_lddqu_si128(reinterpret_cast(second.data + j)); - __m128i bitmask = has_second_null_map ? - _mm_set_epi64x( - (second_null_map[j + 1]) ? full : none, - (second_null_map[j]) ? full : none) - : zeros; - unsigned i = 0; - for (; i < first.size - 1 && !has_mask; has_mask = _mm_test_all_ones(bitmask), i += 2) + for (; j < second.size - 1 && has_mask; j += 2) { - const __m128i s_data = _mm_lddqu_si128(reinterpret_cast(first.data + i)); - const __m128i first_nm_mask = has_first_null_map ? - _mm_cvtepi8_epi64(_mm_lddqu_si128(reinterpret_cast(first_null_map + i))) + has_mask = 0; + const __m128i f_data = _mm_lddqu_si128(reinterpret_cast(second.data + j)); + __m128i bitmask = has_second_null_map ? + _mm_set_epi64x( + (second_null_map[j + 1]) ? full : none, + (second_null_map[j]) ? 
full : none) : zeros; - bitmask = - _mm_or_si128( - _mm_or_si128( - _mm_andnot_si128( - first_nm_mask, - _mm_cmpeq_epi32(f_data, s_data)), - _mm_andnot_si128( - _mm_shuffle_epi32(first_nm_mask, _MM_SHUFFLE(1,0,3,2)), - _mm_cmpeq_epi64(f_data, _mm_shuffle_epi32(s_data, _MM_SHUFFLE(1,0,3,2))))), - bitmask); - } - - if (i < first.size) - { - for (; i < first.size && !has_mask; ++i) + unsigned i = 0; + for (; i < first.size - 1 && !has_mask; has_mask = _mm_test_all_ones(bitmask), i += 2) { - if (has_first_null_map && first_null_map[i]) - continue; - __m128i v_i = _mm_set1_epi64x(first.data[i]); - bitmask = _mm_or_si128(bitmask, _mm_cmpeq_epi64(f_data, v_i)); - has_mask = _mm_test_all_ones(bitmask); + const __m128i s_data = _mm_lddqu_si128(reinterpret_cast(first.data + i)); + const __m128i first_nm_mask = has_first_null_map ? + _mm_cvtepi8_epi64(_mm_lddqu_si128(reinterpret_cast(first_null_map + i))) + : zeros; + bitmask = + _mm_or_si128( + _mm_or_si128( + _mm_andnot_si128( + first_nm_mask, + _mm_cmpeq_epi64(f_data, s_data)), + _mm_andnot_si128( + _mm_shuffle_epi32(first_nm_mask, _MM_SHUFFLE(1,0,3,2)), + _mm_cmpeq_epi64(f_data, _mm_shuffle_epi32(s_data, _MM_SHUFFLE(1,0,3,2))))), + bitmask); + } + + if (i < first.size) + { + for (; i < first.size && !has_mask; ++i) + { + if (has_first_null_map && first_null_map[i]) + continue; + __m128i v_i = _mm_set1_epi64x(first.data[i]); + bitmask = _mm_or_si128(bitmask, _mm_cmpeq_epi64(f_data, v_i)); + has_mask = _mm_test_all_ones(bitmask); + } } } } - if (!has_mask) + if (!has_mask && second.size > 1) return false; return hasAllIntegralLoopRemainder(j, first, second, first_null_map, second_null_map); @@ -634,7 +644,7 @@ inline ALWAYS_INLINE bool sliceHasImplAnyAll(second.data+j)); + const __m128i f_data = _mm_lddqu_si128(reinterpret_cast(second.data + j)); __m128i bitmask = has_second_null_map ? _mm_set_epi16( (second_null_map[j + 7]) ? full : none, (second_null_map[j + 6]) ? full : none, @@ -647,7 +657,7 @@ inline ALWAYS_INLINE bool sliceHasImplAnyAll(first.data + i)); const __m128i first_nm_mask = has_first_null_map ? - _mm_cvtepi8_epi16(_mm_lddqu_si128(reinterpret_cast(first_null_map+i))) + _mm_cvtepi8_epi16(_mm_lddqu_si128(reinterpret_cast(first_null_map + i))) : zeros; bitmask = _mm_or_si128( @@ -701,13 +711,15 @@ inline ALWAYS_INLINE bool sliceHasImplAnyAll 2) + if (!has_mask && second.size > 6) return false; return hasAllIntegralLoopRemainder(j, first, second, first_null_map, second_null_map); } -// SSE4.2 Int8_t specialization +// Int8 version is faster with SSE than with AVX2 +#if defined(__SSE4_2__) +// SSE2 Int8_t specialization template <> inline ALWAYS_INLINE bool sliceHasImplAnyAll, NumericArraySlice, sliceEqualElements >( const NumericArraySlice & first, const NumericArraySlice & second, const UInt8 * first_null_map, const UInt8 * second_null_map) @@ -723,14 +735,14 @@ inline ALWAYS_INLINE bool sliceHasImplAnyAll 15) + if (second.size > 15 && first.size > 15) { for (; j < second.size - 15 && has_mask; j += 16) { has_mask = 0; - const __m128i f_data = _mm_lddqu_si128(reinterpret_cast(second.data+j)); + const __m128i f_data = _mm_lddqu_si128(reinterpret_cast(second.data + j)); __m128i bitmask = has_second_null_map ? _mm_set_epi8( (second_null_map[j + 15]) ? full : none, (second_null_map[j + 14]) ? full : none, @@ -745,9 +757,9 @@ inline ALWAYS_INLINE bool sliceHasImplAnyAll(first.data+i)); + const __m128i s_data = _mm_lddqu_si128(reinterpret_cast(first.data + i)); const __m128i first_nm_mask = has_first_null_map ? 
- _mm_lddqu_si128(reinterpret_cast(first_null_map+i)) + _mm_lddqu_si128(reinterpret_cast(first_null_map + i)) : zeros; bitmask = _mm_or_si128( @@ -832,7 +844,7 @@ inline ALWAYS_INLINE bool sliceHasImplAnyAll 15) return false; return hasAllIntegralLoopRemainder(j, first, second, first_null_map, second_null_map); From a810ce5dcb46a6de40fdc1afa9cb5ee7eab6a5b5 Mon Sep 17 00:00:00 2001 From: Youenn Lebras Date: Thu, 9 Sep 2021 11:21:32 +0200 Subject: [PATCH 013/239] Remove AVX2 to figure out where is the illegal intruction Enable AVX2 - int32 --- .../GatherUtils/sliceHasImplAnyAll.h | 235 ------------------ 1 file changed, 235 deletions(-) diff --git a/src/Functions/GatherUtils/sliceHasImplAnyAll.h b/src/Functions/GatherUtils/sliceHasImplAnyAll.h index 111b9d767dd..7c253cbc407 100644 --- a/src/Functions/GatherUtils/sliceHasImplAnyAll.h +++ b/src/Functions/GatherUtils/sliceHasImplAnyAll.h @@ -12,7 +12,6 @@ #if defined(__AVX2__) #include #endif - namespace DB::GatherUtils { @@ -236,240 +235,6 @@ inline ALWAYS_INLINE bool sliceHasImplAnyAll' to 'const NumericArraySlice'" -// How should we do, copy past each function ?? I haven't found a way to specialize a same function body for two different types. -// AVX2 UInt specialization -// template <> -// inline ALWAYS_INLINE bool sliceHasImplAnyAll, NumericArraySlice, sliceEqualElements >( -// const NumericArraySlice & second, const NumericArraySlice & first, const UInt8 * first_null_map, const UInt8 * second_null_map) -// { -// return sliceHasImplAnyAll, NumericArraySlice, sliceEqualElements > ( -// static_cast &>(second), static_cast &>(first), second_null_map, first_null_map); -// } - -// AVX2 Int64 specialization -template <> -inline ALWAYS_INLINE bool sliceHasImplAnyAll, NumericArraySlice, sliceEqualElements >( - const NumericArraySlice & first, const NumericArraySlice & second, const UInt8 * first_null_map, const UInt8 * second_null_map) -{ - if (second.size == 0) - return true; - - if (!hasNull(first_null_map, first.size) && hasNull(second_null_map, second.size)) - return false; - - const bool has_first_null_map = first_null_map != nullptr; - const bool has_second_null_map = second_null_map != nullptr; - - size_t j = 0; - short has_mask = 1; - const Int64 full = -1, none = 0; - const __m256i ones = _mm256_set1_epi64x(full); - const __m256i zeros = _mm256_setzero_si256(); - if (second.size > 3 && first.size > 3) - { - for (; j < second.size - 3 && has_mask; j += 4) - { - has_mask = 0; - const __m256i f_data = _mm256_lddqu_si256(reinterpret_cast(second.data + j)); - // bits of the bitmask are set to one if considered as null in the corresponding null map, 0 otherwise; - __m256i bitmask = has_second_null_map ? - _mm256_set_epi64x( - (second_null_map[j + 3])? full : none, - (second_null_map[j + 2])? full : none, - (second_null_map[j + 1])? full : none, - (second_null_map[j]) ? full : none) - : zeros; - - unsigned i = 0; - for (; i < first.size - 3 && !has_mask; has_mask = _mm256_testc_si256(bitmask, ones), i += 4) - { - const __m256i s_data = _mm256_lddqu_si256(reinterpret_cast(first.data + i)); - const __m256i first_nm_mask = has_first_null_map? 
- _mm256_set_m128i( - _mm_cvtepi8_epi64(_mm_lddqu_si128(reinterpret_cast(first_null_map + i + 2))), - _mm_cvtepi8_epi64(_mm_lddqu_si128(reinterpret_cast(first_null_map + i)))) - : zeros; - bitmask = - _mm256_or_si256( - _mm256_or_si256( - _mm256_or_si256( - _mm256_andnot_si256( - first_nm_mask, - _mm256_cmpeq_epi64(f_data, s_data)), - _mm256_andnot_si256( - _mm256_permutevar8x32_epi32(first_nm_mask, _mm256_set_epi32(5,4,3,2,1,0,7,6)), - _mm256_cmpeq_epi64(f_data, _mm256_permutevar8x32_epi32(s_data, _mm256_set_epi32(5,4,3,2,1,0,7,6))))), - - _mm256_or_si256( - _mm256_andnot_si256( - _mm256_permutevar8x32_epi32(first_nm_mask, _mm256_set_epi32(3,2,1,0,7,6,5,4)), - _mm256_cmpeq_epi64(f_data, _mm256_permutevar8x32_epi32(s_data, _mm256_set_epi32(3,2,1,0,7,6,5,4)))), - _mm256_andnot_si256( - _mm256_permutevar8x32_epi32(first_nm_mask, _mm256_set_epi32(1,0,7,6,5,4,3,2)), - _mm256_cmpeq_epi64(f_data, _mm256_permutevar8x32_epi32(s_data, _mm256_set_epi32(1,0,7,6,5,4,3,2)))))), - bitmask); - } - - if (i < first.size) - { - for (; i < first.size && !has_mask; ++i) - { - if (has_first_null_map && first_null_map[i]) - continue; - __m256i v_i = _mm256_set1_epi64x(first.data[i]); - bitmask = _mm256_or_si256(bitmask, _mm256_cmpeq_epi64(f_data, v_i)); - has_mask = _mm256_testc_si256(bitmask, ones); - } - } - } - } - - if (!has_mask && second.size > 3) - return false; - - return hasAllIntegralLoopRemainder(j, first, second, first_null_map, second_null_map); -} - -// AVX2 Int16_t specialization -template <> -inline ALWAYS_INLINE bool sliceHasImplAnyAll, NumericArraySlice, sliceEqualElements >( - const NumericArraySlice & first, const NumericArraySlice & second, const UInt8 * first_null_map, const UInt8 * second_null_map) -{ - if (second.size == 0) - return true; - - if (!hasNull(first_null_map, first.size) && hasNull(second_null_map, second.size)) - return false; - - const bool has_first_null_map = first_null_map != nullptr; - const bool has_second_null_map = second_null_map != nullptr; - - size_t j = 0; - short has_mask = 1; - const int16_t full = -1, none = 0; - const __m256i ones = _mm256_set1_epi16(full); - const __m256i zeros = _mm256_setzero_si256(); - if (second.size > 15 && first.size > 15) - { - for (; j < second.size - 15 && has_mask; j += 16) - { - has_mask = 0; - const __m256i f_data = _mm256_lddqu_si256(reinterpret_cast(second.data + j)); - __m256i bitmask = has_second_null_map ? - _mm256_set_epi16( - (second_null_map[j + 15]) ? full : none, (second_null_map[j + 14]) ? full : none, - (second_null_map[j + 13]) ? full : none, (second_null_map[j + 12]) ? full : none, - (second_null_map[j + 11]) ? full : none, (second_null_map[j + 10]) ? full : none, - (second_null_map[j + 9]) ? full : none, (second_null_map[j + 8])? full : none, - (second_null_map[j + 7]) ? full : none, (second_null_map[j + 6])? full : none, - (second_null_map[j + 5]) ? full : none, (second_null_map[j + 4])? full : none, - (second_null_map[j + 3]) ? full : none, (second_null_map[j + 2])? full : none, - (second_null_map[j + 1]) ? full : none, (second_null_map[j]) ? full : none) - : zeros; - unsigned i = 0; - for (; i < first.size - 15 && !has_mask; has_mask = _mm256_testc_si256(bitmask, ones), i += 16) - { - const __m256i s_data = _mm256_lddqu_si256(reinterpret_cast(first.data + i)); - const __m256i first_nm_mask = has_first_null_map? 
- _mm256_set_m128i( - _mm_cvtepi8_epi16(_mm_lddqu_si128(reinterpret_cast(first_null_map + i + 8))), - _mm_cvtepi8_epi16(_mm_lddqu_si128(reinterpret_cast(first_null_map + i)))) - : zeros; - bitmask = - _mm256_or_si256( - _mm256_or_si256( - _mm256_or_si256( - _mm256_or_si256( - _mm256_or_si256( - _mm256_andnot_si256( - first_nm_mask, - _mm256_cmpeq_epi16(f_data, s_data)), - _mm256_andnot_si256( - _mm256_shuffle_epi8(first_nm_mask, _mm256_set_epi8(29,28,27,26,25,24,23,22,21,20,19,18,17,16,15,14,13,12,11,10,9,8,7,6,5,4,3,2,1,0,31,30)), - _mm256_cmpeq_epi16(f_data, _mm256_shuffle_epi8(s_data, _mm256_set_epi8(29,28,27,26,25,24,23,22,21,20,19,18,17,16,15,14,13,12,11,10,9,8,7,6,5,4,3,2,1,0,31,30))))), - _mm256_or_si256( - _mm256_andnot_si256( - _mm256_shuffle_epi8(first_nm_mask, _mm256_set_epi8(27,26,25,24,23,22,21,20,19,18,17,16,15,14,13,12,11,10,9,8,7,6,5,4,3,2,1,0,31,30,29,28)), - _mm256_cmpeq_epi16(f_data, _mm256_shuffle_epi8(s_data, _mm256_set_epi8(27,26,25,24,23,22,21,20,19,18,17,16,15,14,13,12,11,10,9,8,7,6,5,4,3,2,1,0,31,30,29,28)))), - _mm256_andnot_si256( - _mm256_shuffle_epi8(first_nm_mask, _mm256_set_epi8(25,24,23,22,21,20,19,18,17,16,15,14,13,12,11,10,9,8,7,6,5,4,3,2,1,0,31,30,29,28,27,26)), - _mm256_cmpeq_epi16(f_data, _mm256_shuffle_epi8(s_data, _mm256_set_epi8(25,24,23,22,21,20,19,18,17,16,15,14,13,12,11,10,9,8,7,6,5,4,3,2,1,0,31,30,29,28,27,26))))) - ), - _mm256_or_si256( - _mm256_or_si256( - _mm256_andnot_si256( - _mm256_shuffle_epi8(first_nm_mask, _mm256_set_epi8(23,22,21,20,19,18,17,16,15,14,13,12,11,10,9,8,7,6,5,4,3,2,1,0,31,30,29,28,27,26,25,24)), - _mm256_cmpeq_epi16(f_data, _mm256_shuffle_epi8(s_data, _mm256_set_epi8(23,22,21,20,19,18,17,16,15,14,13,12,11,10,9,8,7,6,5,4,3,2,1,0,31,30,29,28,27,26,25,24)))), - _mm256_andnot_si256( - _mm256_shuffle_epi8(first_nm_mask, _mm256_set_epi8(21,20,19,18,17,16,15,14,13,12,11,10,9,8,7,6,5,4,3,2,1,0,31,30,29,28,27,26,25,24,23,22)), - _mm256_cmpeq_epi16(f_data, _mm256_shuffle_epi8(s_data, _mm256_set_epi8(21,20,19,18,17,16,15,14,13,12,11,10,9,8,7,6,5,4,3,2,1,0,31,30,29,28,27,26,25,24,23,22))))), - _mm256_or_si256( - _mm256_andnot_si256( - _mm256_shuffle_epi8(first_nm_mask, _mm256_set_epi8(19,18,17,16,15,14,13,12,11,10,9,8,7,6,5,4,3,2,1,0,31,30,29,28,27,26,25,24,23,22,21,20)), - _mm256_cmpeq_epi16(f_data, _mm256_shuffle_epi8(s_data, _mm256_set_epi8(19,18,17,16,15,14,13,12,11,10,9,8,7,6,5,4,3,2,1,0,31,30,29,28,27,26,25,24,23,22,21,20)))), - _mm256_andnot_si256( - _mm256_shuffle_epi8(first_nm_mask, _mm256_set_epi8(17,16,15,14,13,12,11,10,9,8,7,6,5,4,3,2,1,0,31,30,29,28,27,26,25,24,23,22,21,20,19,18)), - _mm256_cmpeq_epi16(f_data, _mm256_shuffle_epi8(s_data, _mm256_set_epi8(17,16,15,14,13,12,11,10,9,8,7,6,5,4,3,2,1,0,31,30,29,28,27,26,25,24,23,22,21,20,19,18)))))) - ), - _mm256_or_si256( - _mm256_or_si256( - _mm256_or_si256( - _mm256_andnot_si256( - _mm256_permute2x128_si256(first_nm_mask, first_nm_mask,1), - _mm256_cmpeq_epi16(f_data, _mm256_permute2x128_si256(s_data, s_data, 1))), - _mm256_andnot_si256( - _mm256_shuffle_epi8(_mm256_permute2x128_si256(first_nm_mask, first_nm_mask, 1), _mm256_set_epi8(13,12,11,10,9,8,7,6,5,4,3,2,1,0,31,30,29,28,27,26,25,24,23,22,21,20,19,18,17,16,15,14)), - _mm256_cmpeq_epi16(f_data, _mm256_shuffle_epi8(_mm256_permute2x128_si256(s_data, s_data, 1), _mm256_set_epi8(13,12,11,10,9,8,7,6,5,4,3,2,1,0,31,30,29,28,27,26,25,24,23,22,21,20,19,18,17,16,15,14))))), - _mm256_or_si256( - _mm256_andnot_si256( - _mm256_shuffle_epi8(_mm256_permute2x128_si256(first_nm_mask, first_nm_mask, 1), 
_mm256_set_epi8(11,10,9,8,7,6,5,4,3,2,1,0,31,30,29,28,27,26,25,24,23,22,21,20,19,18,17,16,15,14,13,12)), - _mm256_cmpeq_epi16(f_data, _mm256_shuffle_epi8(_mm256_permute2x128_si256(s_data, s_data, 1), _mm256_set_epi8(11,10,9,8,7,6,5,4,3,2,1,0,31,30,29,28,27,26,25,24,23,22,21,20,19,18,17,16,15,14,13,12)))), - _mm256_andnot_si256( - _mm256_shuffle_epi8(_mm256_permute2x128_si256(first_nm_mask, first_nm_mask, 1), _mm256_set_epi8(9,8,7,6,5,4,3,2,1,0,31,30,29,28,27,26,25,24,23,22,21,20,19,18,17,16,15,14,13,12,11,10)), - _mm256_cmpeq_epi16(f_data, _mm256_shuffle_epi8(_mm256_permute2x128_si256(s_data, s_data, 1), _mm256_set_epi8(9,8,7,6,5,4,3,2,1,0,31,30,29,28,27,26,25,24,23,22,21,20,19,18,17,16,15,14,13,12,11,10))))) - ), - _mm256_or_si256( - _mm256_or_si256( - _mm256_andnot_si256( - _mm256_shuffle_epi8(_mm256_permute2x128_si256(first_nm_mask, first_nm_mask, 1), _mm256_set_epi8(7,6,5,4,3,2,1,0,31,30,29,28,27,26,25,24,23,22,21,20,19,18,17,16,15,14,13,12,11,10,9,8)), - _mm256_cmpeq_epi16(f_data, _mm256_shuffle_epi8(_mm256_permute2x128_si256(s_data ,s_data, 1), _mm256_set_epi8(7,6,5,4,3,2,1,0,31,30,29,28,27,26,25,24,23,22,21,20,19,18,17,16,15,14,13,12,11,10,9,8)))), - _mm256_andnot_si256( - _mm256_shuffle_epi8(_mm256_permute2x128_si256(first_nm_mask, first_nm_mask, 1), _mm256_set_epi8(5,4,3,2,1,0,31,30,29,28,27,26,25,24,23,22,21,20,19,18,17,16,15,14,13,12,11,10,9,8,7,6)), - _mm256_cmpeq_epi16(f_data, _mm256_shuffle_epi8(_mm256_permute2x128_si256(s_data, s_data, 1), _mm256_set_epi8(5,4,3,2,1,0,31,30,29,28,27,26,25,24,23,22,21,20,19,18,17,16,15,14,13,12,11,10,9,8,7,6))))), - _mm256_or_si256( - _mm256_andnot_si256( - _mm256_shuffle_epi8(_mm256_permute2x128_si256(first_nm_mask, first_nm_mask, 1), _mm256_set_epi8(3,2,1,0,31,30,29,28,27,26,25,24,23,22,21,20,19,18,17,16,15,14,13,12,11,10,9,8,7,6,5,4)), - _mm256_cmpeq_epi16(f_data, _mm256_shuffle_epi8(_mm256_permute2x128_si256(s_data ,s_data ,1), _mm256_set_epi8(3,2,1,0,31,30,29,28,27,26,25,24,23,22,21,20,19,18,17,16,15,14,13,12,11,10,9,8,7,6,5,4)))), - _mm256_andnot_si256( - _mm256_shuffle_epi8(_mm256_permute2x128_si256(first_nm_mask, first_nm_mask, 1), _mm256_set_epi8(1,0,31,30,29,28,27,26,25,24,23,22,21,20,19,18,17,16,15,14,13,12,11,10,9,8,7,6,5,4,3,2)), - _mm256_cmpeq_epi16(f_data, _mm256_shuffle_epi8(_mm256_permute2x128_si256(s_data, s_data, 1), _mm256_set_epi8(1,0,31,30,29,28,27,26,25,24,23,22,21,20,19,18,17,16,15,14,13,12,11,10,9,8,7,6,5,4,3,2)))))) - ) - ), - bitmask); - } - - if (i < first.size) - { - for (; i < first.size && !has_mask; ++i) - { - if (has_first_null_map && first_null_map[i]) - continue; - __m256i v_i = _mm256_set1_epi16(first.data[i]); - bitmask = _mm256_or_si256(bitmask, _mm256_cmpeq_epi16(f_data, v_i)); - has_mask = _mm256_testc_si256(bitmask, ones); - } - } - } - } - - if (!has_mask && second.size > 15) - return false; - - return hasAllIntegralLoopRemainder(j, first, second, first_null_map, second_null_map); -} - #elif defined(__SSE4_2__) // SSE4.2 Int specialization From 2a2eb3a27bf623e23c08d462d17ade6791bbab23 Mon Sep 17 00:00:00 2001 From: Youenn Lebras Date: Fri, 10 Sep 2021 15:07:36 +0200 Subject: [PATCH 014/239] re-enable full AVX2 - change lddqu to loadu - Update CmakeList.txt --- src/Functions/GatherUtils/CMakeLists.txt | 16 +- .../GatherUtils/sliceHasImplAnyAll.h | 269 ++++++++++++++++-- 2 files changed, 262 insertions(+), 23 deletions(-) diff --git a/src/Functions/GatherUtils/CMakeLists.txt b/src/Functions/GatherUtils/CMakeLists.txt index a379ccbadde..b1c72656f24 100644 --- a/src/Functions/GatherUtils/CMakeLists.txt +++ 
b/src/Functions/GatherUtils/CMakeLists.txt @@ -11,13 +11,15 @@ if (HAS_SUGGEST_DESTRUCTOR_OVERRIDE) target_compile_definitions(clickhouse_functions_gatherutils PUBLIC HAS_SUGGEST_DESTRUCTOR_OVERRIDE) endif() -if (HAVE_SSE42) - set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -msse2 -msse4.2") -endif() -if (HAVE_AVX2) - target_compile_options(clickhouse_functions_gatherutils PRIVATE -mavx2 -DNAMESPACE=AVX2) -endif() - if (STRIP_DEBUG_SYMBOLS_FUNCTIONS) target_compile_options(clickhouse_functions_gatherutils PRIVATE "-g0") endif() + +if (HAVE_SSE42) + set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -msse4.2") + target_compile_options(clickhouse_functions_gatherutils PRIVATE -msse4.2) +endif() +if (HAVE_AVX2) + set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -mavx2") + target_compile_options(clickhouse_functions_gatherutils PRIVATE -mavx2) +endif() diff --git a/src/Functions/GatherUtils/sliceHasImplAnyAll.h b/src/Functions/GatherUtils/sliceHasImplAnyAll.h index 7c253cbc407..a14acd08e93 100644 --- a/src/Functions/GatherUtils/sliceHasImplAnyAll.h +++ b/src/Functions/GatherUtils/sliceHasImplAnyAll.h @@ -128,6 +128,19 @@ inline ALWAYS_INLINE bool hasAllIntegralLoopRemainder( #endif #if defined(__AVX2__) + +// TODO: Discuss about +// raise an error : "error: no viable conversion from 'const NumericArraySlice' to 'const NumericArraySlice'" +// How should we do, copy past each function ?? I haven't found a way to specialize a same function body for two different types. +// AVX2 UInt specialization +// template <> +// inline ALWAYS_INLINE bool sliceHasImplAnyAll, NumericArraySlice, sliceEqualElements >( +// const NumericArraySlice & second, const NumericArraySlice & first, const UInt8 * first_null_map, const UInt8 * second_null_map) +// { +// return sliceHasImplAnyAll, NumericArraySlice, sliceEqualElements > ( +// static_cast &>(second), static_cast &>(first), second_null_map, first_null_map); +// } + // AVX2 Int specialization of sliceHasImplAnyAll template <> inline ALWAYS_INLINE bool sliceHasImplAnyAll, NumericArraySlice, sliceEqualElements >( @@ -152,7 +165,7 @@ inline ALWAYS_INLINE bool sliceHasImplAnyAll(second.data + j)); + const __m256i f_data = _mm256_loadu_si256(reinterpret_cast(second.data + j)); // bits of the bitmask are set to one if considered as null in the corresponding null map, 0 otherwise; __m256i bitmask = has_second_null_map ? _mm256_set_epi32( @@ -169,13 +182,13 @@ inline ALWAYS_INLINE bool sliceHasImplAnyAll(first.data + i)); + const __m256i s_data = _mm256_loadu_si256(reinterpret_cast(first.data + i)); // Create a mask to avoid to compare null elements // set_m128i takes two arguments: (high segment, low segment) that are two __m128i convert from 8bits to 32bits to match with next operations const __m256i first_nm_mask = has_first_null_map? 
_mm256_set_m128i( - _mm_cvtepi8_epi32(_mm_lddqu_si128(reinterpret_cast(first_null_map + i + 4))), - _mm_cvtepi8_epi32(_mm_lddqu_si128(reinterpret_cast(first_null_map + i)))) + _mm_cvtepi8_epi32(_mm_loadu_si128(reinterpret_cast(first_null_map + i + 4))), + _mm_cvtepi8_epi32(_mm_loadu_si128(reinterpret_cast(first_null_map + i)))) : zeros; bitmask = _mm256_or_si256( @@ -235,6 +248,228 @@ inline ALWAYS_INLINE bool sliceHasImplAnyAll +inline ALWAYS_INLINE bool sliceHasImplAnyAll, NumericArraySlice, sliceEqualElements >( + const NumericArraySlice & first, const NumericArraySlice & second, const UInt8 * first_null_map, const UInt8 * second_null_map) +{ + if (second.size == 0) + return true; + + if (!hasNull(first_null_map, first.size) && hasNull(second_null_map, second.size)) + return false; + + const bool has_first_null_map = first_null_map != nullptr; + const bool has_second_null_map = second_null_map != nullptr; + + size_t j = 0; + short has_mask = 1; + const Int64 full = -1, none = 0; + const __m256i ones = _mm256_set1_epi64x(full); + const __m256i zeros = _mm256_setzero_si256(); + if (second.size > 3 && first.size > 3) + { + for (; j < second.size - 3 && has_mask; j += 4) + { + has_mask = 0; + const __m256i f_data = _mm256_loadu_si256(reinterpret_cast(second.data + j)); + // bits of the bitmask are set to one if considered as null in the corresponding null map, 0 otherwise; + __m256i bitmask = has_second_null_map ? + _mm256_set_epi64x( + (second_null_map[j + 3])? full : none, + (second_null_map[j + 2])? full : none, + (second_null_map[j + 1])? full : none, + (second_null_map[j]) ? full : none) + : zeros; + + unsigned i = 0; + for (; i < first.size - 3 && !has_mask; has_mask = _mm256_testc_si256(bitmask, ones), i += 4) + { + const __m256i s_data = _mm256_loadu_si256(reinterpret_cast(first.data + i)); + const __m256i first_nm_mask = has_first_null_map? 
+ _mm256_set_m128i( + _mm_cvtepi8_epi64(_mm_loadu_si128(reinterpret_cast(first_null_map + i + 2))), + _mm_cvtepi8_epi64(_mm_loadu_si128(reinterpret_cast(first_null_map + i)))) + : zeros; + bitmask = + _mm256_or_si256( + _mm256_or_si256( + _mm256_or_si256( + _mm256_andnot_si256( + first_nm_mask, + _mm256_cmpeq_epi64(f_data, s_data)), + _mm256_andnot_si256( + _mm256_permutevar8x32_epi32(first_nm_mask, _mm256_set_epi32(5,4,3,2,1,0,7,6)), + _mm256_cmpeq_epi64(f_data, _mm256_permutevar8x32_epi32(s_data, _mm256_set_epi32(5,4,3,2,1,0,7,6))))), + + _mm256_or_si256( + _mm256_andnot_si256( + _mm256_permutevar8x32_epi32(first_nm_mask, _mm256_set_epi32(3,2,1,0,7,6,5,4)), + _mm256_cmpeq_epi64(f_data, _mm256_permutevar8x32_epi32(s_data, _mm256_set_epi32(3,2,1,0,7,6,5,4)))), + _mm256_andnot_si256( + _mm256_permutevar8x32_epi32(first_nm_mask, _mm256_set_epi32(1,0,7,6,5,4,3,2)), + _mm256_cmpeq_epi64(f_data, _mm256_permutevar8x32_epi32(s_data, _mm256_set_epi32(1,0,7,6,5,4,3,2)))))), + bitmask); + } + + if (i < first.size) + { + for (; i < first.size && !has_mask; ++i) + { + if (has_first_null_map && first_null_map[i]) + continue; + __m256i v_i = _mm256_set1_epi64x(first.data[i]); + bitmask = _mm256_or_si256(bitmask, _mm256_cmpeq_epi64(f_data, v_i)); + has_mask = _mm256_testc_si256(bitmask, ones); + } + } + } + } + + if (!has_mask && second.size > 3) + return false; + + return hasAllIntegralLoopRemainder(j, first, second, first_null_map, second_null_map); +} + +// AVX2 Int16_t specialization +template <> +inline ALWAYS_INLINE bool sliceHasImplAnyAll, NumericArraySlice, sliceEqualElements >( + const NumericArraySlice & first, const NumericArraySlice & second, const UInt8 * first_null_map, const UInt8 * second_null_map) +{ + if (second.size == 0) + return true; + + if (!hasNull(first_null_map, first.size) && hasNull(second_null_map, second.size)) + return false; + + const bool has_first_null_map = first_null_map != nullptr; + const bool has_second_null_map = second_null_map != nullptr; + + size_t j = 0; + short has_mask = 1; + const int16_t full = -1, none = 0; + const __m256i ones = _mm256_set1_epi16(full); + const __m256i zeros = _mm256_setzero_si256(); + if (second.size > 15 && first.size > 15) + { + for (; j < second.size - 15 && has_mask; j += 16) + { + has_mask = 0; + const __m256i f_data = _mm256_loadu_si256(reinterpret_cast(second.data + j)); + __m256i bitmask = has_second_null_map ? + _mm256_set_epi16( + (second_null_map[j + 15]) ? full : none, (second_null_map[j + 14]) ? full : none, + (second_null_map[j + 13]) ? full : none, (second_null_map[j + 12]) ? full : none, + (second_null_map[j + 11]) ? full : none, (second_null_map[j + 10]) ? full : none, + (second_null_map[j + 9]) ? full : none, (second_null_map[j + 8])? full : none, + (second_null_map[j + 7]) ? full : none, (second_null_map[j + 6])? full : none, + (second_null_map[j + 5]) ? full : none, (second_null_map[j + 4])? full : none, + (second_null_map[j + 3]) ? full : none, (second_null_map[j + 2])? full : none, + (second_null_map[j + 1]) ? full : none, (second_null_map[j]) ? full : none) + : zeros; + unsigned i = 0; + for (; i < first.size - 15 && !has_mask; has_mask = _mm256_testc_si256(bitmask, ones), i += 16) + { + const __m256i s_data = _mm256_loadu_si256(reinterpret_cast(first.data + i)); + const __m256i first_nm_mask = has_first_null_map? 
+ _mm256_set_m128i( + _mm_cvtepi8_epi16(_mm_loadu_si128(reinterpret_cast(first_null_map + i + 8))), + _mm_cvtepi8_epi16(_mm_loadu_si128(reinterpret_cast(first_null_map + i)))) + : zeros; + bitmask = + _mm256_or_si256( + _mm256_or_si256( + _mm256_or_si256( + _mm256_or_si256( + _mm256_or_si256( + _mm256_andnot_si256( + first_nm_mask, + _mm256_cmpeq_epi16(f_data, s_data)), + _mm256_andnot_si256( + _mm256_shuffle_epi8(first_nm_mask, _mm256_set_epi8(29,28,27,26,25,24,23,22,21,20,19,18,17,16,15,14,13,12,11,10,9,8,7,6,5,4,3,2,1,0,31,30)), + _mm256_cmpeq_epi16(f_data, _mm256_shuffle_epi8(s_data, _mm256_set_epi8(29,28,27,26,25,24,23,22,21,20,19,18,17,16,15,14,13,12,11,10,9,8,7,6,5,4,3,2,1,0,31,30))))), + _mm256_or_si256( + _mm256_andnot_si256( + _mm256_shuffle_epi8(first_nm_mask, _mm256_set_epi8(27,26,25,24,23,22,21,20,19,18,17,16,15,14,13,12,11,10,9,8,7,6,5,4,3,2,1,0,31,30,29,28)), + _mm256_cmpeq_epi16(f_data, _mm256_shuffle_epi8(s_data, _mm256_set_epi8(27,26,25,24,23,22,21,20,19,18,17,16,15,14,13,12,11,10,9,8,7,6,5,4,3,2,1,0,31,30,29,28)))), + _mm256_andnot_si256( + _mm256_shuffle_epi8(first_nm_mask, _mm256_set_epi8(25,24,23,22,21,20,19,18,17,16,15,14,13,12,11,10,9,8,7,6,5,4,3,2,1,0,31,30,29,28,27,26)), + _mm256_cmpeq_epi16(f_data, _mm256_shuffle_epi8(s_data, _mm256_set_epi8(25,24,23,22,21,20,19,18,17,16,15,14,13,12,11,10,9,8,7,6,5,4,3,2,1,0,31,30,29,28,27,26))))) + ), + _mm256_or_si256( + _mm256_or_si256( + _mm256_andnot_si256( + _mm256_shuffle_epi8(first_nm_mask, _mm256_set_epi8(23,22,21,20,19,18,17,16,15,14,13,12,11,10,9,8,7,6,5,4,3,2,1,0,31,30,29,28,27,26,25,24)), + _mm256_cmpeq_epi16(f_data, _mm256_shuffle_epi8(s_data, _mm256_set_epi8(23,22,21,20,19,18,17,16,15,14,13,12,11,10,9,8,7,6,5,4,3,2,1,0,31,30,29,28,27,26,25,24)))), + _mm256_andnot_si256( + _mm256_shuffle_epi8(first_nm_mask, _mm256_set_epi8(21,20,19,18,17,16,15,14,13,12,11,10,9,8,7,6,5,4,3,2,1,0,31,30,29,28,27,26,25,24,23,22)), + _mm256_cmpeq_epi16(f_data, _mm256_shuffle_epi8(s_data, _mm256_set_epi8(21,20,19,18,17,16,15,14,13,12,11,10,9,8,7,6,5,4,3,2,1,0,31,30,29,28,27,26,25,24,23,22))))), + _mm256_or_si256( + _mm256_andnot_si256( + _mm256_shuffle_epi8(first_nm_mask, _mm256_set_epi8(19,18,17,16,15,14,13,12,11,10,9,8,7,6,5,4,3,2,1,0,31,30,29,28,27,26,25,24,23,22,21,20)), + _mm256_cmpeq_epi16(f_data, _mm256_shuffle_epi8(s_data, _mm256_set_epi8(19,18,17,16,15,14,13,12,11,10,9,8,7,6,5,4,3,2,1,0,31,30,29,28,27,26,25,24,23,22,21,20)))), + _mm256_andnot_si256( + _mm256_shuffle_epi8(first_nm_mask, _mm256_set_epi8(17,16,15,14,13,12,11,10,9,8,7,6,5,4,3,2,1,0,31,30,29,28,27,26,25,24,23,22,21,20,19,18)), + _mm256_cmpeq_epi16(f_data, _mm256_shuffle_epi8(s_data, _mm256_set_epi8(17,16,15,14,13,12,11,10,9,8,7,6,5,4,3,2,1,0,31,30,29,28,27,26,25,24,23,22,21,20,19,18)))))) + ), + _mm256_or_si256( + _mm256_or_si256( + _mm256_or_si256( + _mm256_andnot_si256( + _mm256_permute2x128_si256(first_nm_mask, first_nm_mask,1), + _mm256_cmpeq_epi16(f_data, _mm256_permute2x128_si256(s_data, s_data, 1))), + _mm256_andnot_si256( + _mm256_shuffle_epi8(_mm256_permute2x128_si256(first_nm_mask, first_nm_mask, 1), _mm256_set_epi8(13,12,11,10,9,8,7,6,5,4,3,2,1,0,31,30,29,28,27,26,25,24,23,22,21,20,19,18,17,16,15,14)), + _mm256_cmpeq_epi16(f_data, _mm256_shuffle_epi8(_mm256_permute2x128_si256(s_data, s_data, 1), _mm256_set_epi8(13,12,11,10,9,8,7,6,5,4,3,2,1,0,31,30,29,28,27,26,25,24,23,22,21,20,19,18,17,16,15,14))))), + _mm256_or_si256( + _mm256_andnot_si256( + _mm256_shuffle_epi8(_mm256_permute2x128_si256(first_nm_mask, first_nm_mask, 1), 
_mm256_set_epi8(11,10,9,8,7,6,5,4,3,2,1,0,31,30,29,28,27,26,25,24,23,22,21,20,19,18,17,16,15,14,13,12)), + _mm256_cmpeq_epi16(f_data, _mm256_shuffle_epi8(_mm256_permute2x128_si256(s_data, s_data, 1), _mm256_set_epi8(11,10,9,8,7,6,5,4,3,2,1,0,31,30,29,28,27,26,25,24,23,22,21,20,19,18,17,16,15,14,13,12)))), + _mm256_andnot_si256( + _mm256_shuffle_epi8(_mm256_permute2x128_si256(first_nm_mask, first_nm_mask, 1), _mm256_set_epi8(9,8,7,6,5,4,3,2,1,0,31,30,29,28,27,26,25,24,23,22,21,20,19,18,17,16,15,14,13,12,11,10)), + _mm256_cmpeq_epi16(f_data, _mm256_shuffle_epi8(_mm256_permute2x128_si256(s_data, s_data, 1), _mm256_set_epi8(9,8,7,6,5,4,3,2,1,0,31,30,29,28,27,26,25,24,23,22,21,20,19,18,17,16,15,14,13,12,11,10))))) + ), + _mm256_or_si256( + _mm256_or_si256( + _mm256_andnot_si256( + _mm256_shuffle_epi8(_mm256_permute2x128_si256(first_nm_mask, first_nm_mask, 1), _mm256_set_epi8(7,6,5,4,3,2,1,0,31,30,29,28,27,26,25,24,23,22,21,20,19,18,17,16,15,14,13,12,11,10,9,8)), + _mm256_cmpeq_epi16(f_data, _mm256_shuffle_epi8(_mm256_permute2x128_si256(s_data ,s_data, 1), _mm256_set_epi8(7,6,5,4,3,2,1,0,31,30,29,28,27,26,25,24,23,22,21,20,19,18,17,16,15,14,13,12,11,10,9,8)))), + _mm256_andnot_si256( + _mm256_shuffle_epi8(_mm256_permute2x128_si256(first_nm_mask, first_nm_mask, 1), _mm256_set_epi8(5,4,3,2,1,0,31,30,29,28,27,26,25,24,23,22,21,20,19,18,17,16,15,14,13,12,11,10,9,8,7,6)), + _mm256_cmpeq_epi16(f_data, _mm256_shuffle_epi8(_mm256_permute2x128_si256(s_data, s_data, 1), _mm256_set_epi8(5,4,3,2,1,0,31,30,29,28,27,26,25,24,23,22,21,20,19,18,17,16,15,14,13,12,11,10,9,8,7,6))))), + _mm256_or_si256( + _mm256_andnot_si256( + _mm256_shuffle_epi8(_mm256_permute2x128_si256(first_nm_mask, first_nm_mask, 1), _mm256_set_epi8(3,2,1,0,31,30,29,28,27,26,25,24,23,22,21,20,19,18,17,16,15,14,13,12,11,10,9,8,7,6,5,4)), + _mm256_cmpeq_epi16(f_data, _mm256_shuffle_epi8(_mm256_permute2x128_si256(s_data ,s_data ,1), _mm256_set_epi8(3,2,1,0,31,30,29,28,27,26,25,24,23,22,21,20,19,18,17,16,15,14,13,12,11,10,9,8,7,6,5,4)))), + _mm256_andnot_si256( + _mm256_shuffle_epi8(_mm256_permute2x128_si256(first_nm_mask, first_nm_mask, 1), _mm256_set_epi8(1,0,31,30,29,28,27,26,25,24,23,22,21,20,19,18,17,16,15,14,13,12,11,10,9,8,7,6,5,4,3,2)), + _mm256_cmpeq_epi16(f_data, _mm256_shuffle_epi8(_mm256_permute2x128_si256(s_data, s_data, 1), _mm256_set_epi8(1,0,31,30,29,28,27,26,25,24,23,22,21,20,19,18,17,16,15,14,13,12,11,10,9,8,7,6,5,4,3,2)))))) + ) + ), + bitmask); + } + + if (i < first.size) + { + for (; i < first.size && !has_mask; ++i) + { + if (has_first_null_map && first_null_map[i]) + continue; + __m256i v_i = _mm256_set1_epi16(first.data[i]); + bitmask = _mm256_or_si256(bitmask, _mm256_cmpeq_epi16(f_data, v_i)); + has_mask = _mm256_testc_si256(bitmask, ones); + } + } + } + } + + if (!has_mask && second.size > 15) + return false; + + return hasAllIntegralLoopRemainder(j, first, second, first_null_map, second_null_map); +} + #elif defined(__SSE4_2__) // SSE4.2 Int specialization @@ -260,7 +495,7 @@ inline ALWAYS_INLINE bool sliceHasImplAnyAll(second.data + j)); + const __m128i f_data = _mm_loadu_si128(reinterpret_cast(second.data + j)); __m128i bitmask = has_second_null_map ? _mm_set_epi32( (second_null_map[j + 3]) ? full : none, @@ -272,9 +507,9 @@ inline ALWAYS_INLINE bool sliceHasImplAnyAll(first.data + i)); + const __m128i s_data = _mm_loadu_si128(reinterpret_cast(first.data + i)); const __m128i first_nm_mask = has_first_null_map ? 
- _mm_cvtepi8_epi32(_mm_lddqu_si128(reinterpret_cast(first_null_map + i))) + _mm_cvtepi8_epi32(_mm_loadu_si128(reinterpret_cast(first_null_map + i))) : zeros; bitmask = @@ -341,7 +576,7 @@ inline ALWAYS_INLINE bool sliceHasImplAnyAll(second.data + j)); + const __m128i f_data = _mm_loadu_si128(reinterpret_cast(second.data + j)); __m128i bitmask = has_second_null_map ? _mm_set_epi64x( (second_null_map[j + 1]) ? full : none, @@ -350,9 +585,9 @@ inline ALWAYS_INLINE bool sliceHasImplAnyAll(first.data + i)); + const __m128i s_data = _mm_loadu_si128(reinterpret_cast(first.data + i)); const __m128i first_nm_mask = has_first_null_map ? - _mm_cvtepi8_epi64(_mm_lddqu_si128(reinterpret_cast(first_null_map + i))) + _mm_cvtepi8_epi64(_mm_loadu_si128(reinterpret_cast(first_null_map + i))) : zeros; bitmask = _mm_or_si128( @@ -409,7 +644,7 @@ inline ALWAYS_INLINE bool sliceHasImplAnyAll(second.data + j)); + const __m128i f_data = _mm_loadu_si128(reinterpret_cast(second.data + j)); __m128i bitmask = has_second_null_map ? _mm_set_epi16( (second_null_map[j + 7]) ? full : none, (second_null_map[j + 6]) ? full : none, @@ -420,9 +655,9 @@ inline ALWAYS_INLINE bool sliceHasImplAnyAll(first.data + i)); + const __m128i s_data = _mm_loadu_si128(reinterpret_cast(first.data + i)); const __m128i first_nm_mask = has_first_null_map ? - _mm_cvtepi8_epi16(_mm_lddqu_si128(reinterpret_cast(first_null_map + i))) + _mm_cvtepi8_epi16(_mm_loadu_si128(reinterpret_cast(first_null_map + i))) : zeros; bitmask = _mm_or_si128( @@ -482,6 +717,8 @@ inline ALWAYS_INLINE bool sliceHasImplAnyAll(second.data + j)); + const __m128i f_data = _mm_loadu_si128(reinterpret_cast(second.data + j)); __m128i bitmask = has_second_null_map ? _mm_set_epi8( (second_null_map[j + 15]) ? full : none, (second_null_map[j + 14]) ? full : none, @@ -522,9 +759,9 @@ inline ALWAYS_INLINE bool sliceHasImplAnyAll(first.data + i)); + const __m128i s_data = _mm_loadu_si128(reinterpret_cast(first.data + i)); const __m128i first_nm_mask = has_first_null_map ? - _mm_lddqu_si128(reinterpret_cast(first_null_map + i)) + _mm_loadu_si128(reinterpret_cast(first_null_map + i)) : zeros; bitmask = _mm_or_si128( From 72fb56904d8de814bddeb08f93d8c0882b6cd4d2 Mon Sep 17 00:00:00 2001 From: youenn lebras Date: Tue, 26 Oct 2021 10:43:23 +0200 Subject: [PATCH 015/239] Add cmake option to enable or not AVX2 instructions --- src/Functions/GatherUtils/CMakeLists.txt | 6 ++++-- src/Functions/GatherUtils/sliceHasImplAnyAll.h | 4 ++-- 2 files changed, 6 insertions(+), 4 deletions(-) diff --git a/src/Functions/GatherUtils/CMakeLists.txt b/src/Functions/GatherUtils/CMakeLists.txt index b1c72656f24..f291663550d 100644 --- a/src/Functions/GatherUtils/CMakeLists.txt +++ b/src/Functions/GatherUtils/CMakeLists.txt @@ -1,4 +1,6 @@ include("${ClickHouse_SOURCE_DIR}/cmake/dbms_glob_sources.cmake") +option(ENABLE_AVX2 "Enable AVX2 instructions (when available) when build for modern Intel CPUs" OFF) + add_headers_and_sources(clickhouse_functions_gatherutils .) 
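The effect of the new option is easiest to read together with the later hunks of this patch: it defaults to OFF, and only when the toolchain supports AVX2 (HAVE_AVX2) and the option is switched on at configure time (for example cmake -DENABLE_AVX2=ON) do the flags below add -mavx2 -DENABLE_AVX2 for this target. Schematically, and only as an illustration of the guards that the sliceHasImplAnyAll.h hunk introduces, the header then selects code roughly like this:

    #if defined(__AVX2__) && defined(ENABLE_AVX2)
        // AVX2 specializations: the compiler targets AVX2 and the build opted in via ENABLE_AVX2
    #elif defined(__SSE4_2__)
        // SSE4.2 specializations remain the default vectorized path
    #else
        // no SIMD specializations are compiled; the generic sliceHasImplAnyAll template handles everything
    #endif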
add_library(clickhouse_functions_gatherutils ${clickhouse_functions_gatherutils_sources} ${clickhouse_functions_gatherutils_headers}) target_link_libraries(clickhouse_functions_gatherutils PRIVATE dbms) @@ -19,7 +21,7 @@ if (HAVE_SSE42) set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -msse4.2") target_compile_options(clickhouse_functions_gatherutils PRIVATE -msse4.2) endif() -if (HAVE_AVX2) - set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -mavx2") +if (HAVE_AVX2 AND ENABLE_AVX2) + set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -mavx2 -DENABLE_AVX2") target_compile_options(clickhouse_functions_gatherutils PRIVATE -mavx2) endif() diff --git a/src/Functions/GatherUtils/sliceHasImplAnyAll.h b/src/Functions/GatherUtils/sliceHasImplAnyAll.h index a14acd08e93..9028a94b2aa 100644 --- a/src/Functions/GatherUtils/sliceHasImplAnyAll.h +++ b/src/Functions/GatherUtils/sliceHasImplAnyAll.h @@ -85,7 +85,7 @@ bool sliceHasImplAnyAll(const FirstSliceType & first, const SecondSliceType & se } -#if defined(__AVX2__) || defined(__SSE4_2__) +#if (defined(__AVX2__) && defined(ENABLE_AVX2)) || defined(__SSE4_2__) namespace { @@ -127,7 +127,7 @@ inline ALWAYS_INLINE bool hasAllIntegralLoopRemainder( #endif -#if defined(__AVX2__) +#if defined(__AVX2__) && defined(ENABLE_AVX2) // TODO: Discuss about // raise an error : "error: no viable conversion from 'const NumericArraySlice' to 'const NumericArraySlice'" From 0154eab9cb1000c831bc44c49cfc1d3ccf2ff5c1 Mon Sep 17 00:00:00 2001 From: youenn lebras Date: Wed, 8 Dec 2021 10:27:42 +0100 Subject: [PATCH 016/239] Modify performance tests for HasAll, removing Large tests to see if it helps passing CICD --- tests/performance/hasAll_simd_int16.xml | 16 ++++++++-------- tests/performance/hasAll_simd_int32.xml | 16 ++++++++-------- tests/performance/hasAll_simd_int64.xml | 16 ++++++++-------- tests/performance/hasAll_simd_int8.xml | 16 ++++++++-------- 4 files changed, 32 insertions(+), 32 deletions(-) diff --git a/tests/performance/hasAll_simd_int16.xml b/tests/performance/hasAll_simd_int16.xml index c2ce4eec77f..63d869e7794 100644 --- a/tests/performance/hasAll_simd_int16.xml +++ b/tests/performance/hasAll_simd_int16.xml @@ -7,9 +7,9 @@ CREATE TABLE test_table_medium2 (`set` Array(Int16), `subset` Array (Int16)) ENGINE = MergeTree ORDER BY set CREATE TABLE test_table_mediumf (`set` Array(Int16), `subset` Array (Int16)) ENGINE = MergeTree ORDER BY set - CREATE TABLE test_table_large (`set` Array(Int16), `subset` Array (Int16)) ENGINE = MergeTree ORDER BY set + INSERT INTO test_table_small SELECT groupArraySample(500)(number) AS set, groupArraySample(10)(number) AS subset FROM (SELECT * FROM numbers(500)) @@ -22,9 +22,9 @@ INSERT INTO test_table_mediumf SELECT groupArraySample(1000000)(number) AS set, groupArraySample(10)(number) AS subset FROM (SELECT * FROM numbers(100000000)) - INSERT INTO test_table_large SELECT groupArraySample(50000000)(number) AS set, groupArraySample(10)(number) AS subset FROM (SELECT * FROM numbers(50000000)) + select hasAll(set, subset) from test_table_small select hasAll(set, subset) from test_table_small2 @@ -34,9 +34,9 @@ select hasAll(set, subset) from test_table_medium2 select hasAll(set, subset) from test_table_mediumf - select hasAll(set, subset) from test_table_large + DROP TABLE IF EXISTS test_table_small DROP TABLE IF EXISTS test_table_small2 @@ -46,7 +46,7 @@ DROP TABLE IF EXISTS test_table_medium2 DROP TABLE IF EXISTS test_table_mediumf - DROP TABLE IF EXISTS test_table_large + diff --git a/tests/performance/hasAll_simd_int32.xml 
b/tests/performance/hasAll_simd_int32.xml index 4543dea161b..074901737b0 100644 --- a/tests/performance/hasAll_simd_int32.xml +++ b/tests/performance/hasAll_simd_int32.xml @@ -7,9 +7,9 @@ CREATE TABLE test_table_medium2 (`set` Array(Int32), `subset` Array (Int32)) ENGINE = MergeTree ORDER BY set CREATE TABLE test_table_mediumf (`set` Array(Int32), `subset` Array (Int32)) ENGINE = MergeTree ORDER BY set - CREATE TABLE test_table_large (`set` Array(Int32), `subset` Array (Int32)) ENGINE = MergeTree ORDER BY set + INSERT INTO test_table_small SELECT groupArraySample(500)(number) AS set, groupArraySample(10)(number) AS subset FROM (SELECT * FROM numbers(500)) @@ -22,9 +22,9 @@ INSERT INTO test_table_mediumf SELECT groupArraySample(1000000)(number) AS set, groupArraySample(10)(number) AS subset FROM (SELECT * FROM numbers(100000000)) - INSERT INTO test_table_large SELECT groupArraySample(50000000)(number) AS set, groupArraySample(10)(number) AS subset FROM (SELECT * FROM numbers(50000000)) + select hasAll(set, subset) from test_table_small select hasAll(set, subset) from test_table_small2 @@ -34,9 +34,9 @@ select hasAll(set, subset) from test_table_medium2 select hasAll(set, subset) from test_table_mediumf - select hasAll(set, subset) from test_table_large + DROP TABLE IF EXISTS test_table_small DROP TABLE IF EXISTS test_table_small2 @@ -46,7 +46,7 @@ DROP TABLE IF EXISTS test_table_medium2 DROP TABLE IF EXISTS test_table_mediumf - DROP TABLE IF EXISTS test_table_large + diff --git a/tests/performance/hasAll_simd_int64.xml b/tests/performance/hasAll_simd_int64.xml index 07e52483bb1..9e68d3d219c 100644 --- a/tests/performance/hasAll_simd_int64.xml +++ b/tests/performance/hasAll_simd_int64.xml @@ -7,9 +7,9 @@ CREATE TABLE test_table_medium2 (`set` Array(Int64), `subset` Array (Int64)) ENGINE = MergeTree ORDER BY set CREATE TABLE test_table_mediumf (`set` Array(Int64), `subset` Array (Int64)) ENGINE = MergeTree ORDER BY set - CREATE TABLE test_table_large (`set` Array(Int64), `subset` Array (Int64)) ENGINE = MergeTree ORDER BY set + INSERT INTO test_table_small SELECT groupArraySample(500)(number) AS set, groupArraySample(10)(number) AS subset FROM (SELECT * FROM numbers(500)) @@ -22,9 +22,9 @@ INSERT INTO test_table_mediumf SELECT groupArraySample(1000000)(number) AS set, groupArraySample(10)(number) AS subset FROM (SELECT * FROM numbers(100000000)) - INSERT INTO test_table_large SELECT groupArraySample(50000000)(number) AS set, groupArraySample(10)(number) AS subset FROM (SELECT * FROM numbers(50000000)) + select hasAll(set, subset) from test_table_small select hasAll(set, subset) from test_table_small2 @@ -34,9 +34,9 @@ select hasAll(set, subset) from test_table_medium2 select hasAll(set, subset) from test_table_mediumf - select hasAll(set, subset) from test_table_large + DROP TABLE IF EXISTS test_table_small DROP TABLE IF EXISTS test_table_small2 @@ -46,7 +46,7 @@ DROP TABLE IF EXISTS test_table_medium2 DROP TABLE IF EXISTS test_table_mediumf - DROP TABLE IF EXISTS test_table_large + diff --git a/tests/performance/hasAll_simd_int8.xml b/tests/performance/hasAll_simd_int8.xml index 5ddc84aa5bd..4a0b30524ad 100644 --- a/tests/performance/hasAll_simd_int8.xml +++ b/tests/performance/hasAll_simd_int8.xml @@ -7,9 +7,9 @@ CREATE TABLE test_table_medium2 (`set` Array(Int8), `subset` Array (Int8)) ENGINE = MergeTree ORDER BY set CREATE TABLE test_table_mediumf (`set` Array(Int8), `subset` Array (Int8)) ENGINE = MergeTree ORDER BY set - CREATE TABLE test_table_large (`set` Array(Int8), `subset` Array 
(Int8)) ENGINE = MergeTree ORDER BY set + INSERT INTO test_table_small SELECT groupArraySample(500)(number) AS set, groupArraySample(10)(number) AS subset FROM (SELECT * FROM numbers(500)) @@ -22,9 +22,9 @@ INSERT INTO test_table_mediumf SELECT groupArraySample(1000000)(number) AS set, groupArraySample(10)(number) AS subset FROM (SELECT * FROM numbers(100000000)) - INSERT INTO test_table_large SELECT groupArraySample(50000000)(number) AS set, groupArraySample(10)(number) AS subset FROM (SELECT * FROM numbers(50000000)) + select hasAll(set, subset) from test_table_small select hasAll(set, subset) from test_table_small2 @@ -34,9 +34,9 @@ select hasAll(set, subset) from test_table_medium2 select hasAll(set, subset) from test_table_mediumf - select hasAll(set, subset) from test_table_large + DROP TABLE IF EXISTS test_table_small DROP TABLE IF EXISTS test_table_small2 @@ -46,7 +46,7 @@ DROP TABLE IF EXISTS test_table_medium2 DROP TABLE IF EXISTS test_table_mediumf - DROP TABLE IF EXISTS test_table_large + From c2b761acf282c00290dae1cbeb1ae43d8a7858bd Mon Sep 17 00:00:00 2001 From: youenn lebras Date: Wed, 8 Dec 2021 11:01:24 +0100 Subject: [PATCH 017/239] Add cmake option to enable or not AVX2 instructions This reverts commit bca8eca44fe382b6efe80a381d42e6ede8a91fa3. --- src/Functions/GatherUtils/CMakeLists.txt | 2 +- tests/performance/hasAll_simd_int16.xml | 16 ++++++++-------- tests/performance/hasAll_simd_int32.xml | 16 ++++++++-------- tests/performance/hasAll_simd_int64.xml | 16 ++++++++-------- tests/performance/hasAll_simd_int8.xml | 16 ++++++++-------- 5 files changed, 33 insertions(+), 33 deletions(-) diff --git a/src/Functions/GatherUtils/CMakeLists.txt b/src/Functions/GatherUtils/CMakeLists.txt index f291663550d..10909b99b82 100644 --- a/src/Functions/GatherUtils/CMakeLists.txt +++ b/src/Functions/GatherUtils/CMakeLists.txt @@ -22,6 +22,6 @@ if (HAVE_SSE42) target_compile_options(clickhouse_functions_gatherutils PRIVATE -msse4.2) endif() if (HAVE_AVX2 AND ENABLE_AVX2) - set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -mavx2 -DENABLE_AVX2") + set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -mavx2 -DENABLE_AVX2") target_compile_options(clickhouse_functions_gatherutils PRIVATE -mavx2) endif() diff --git a/tests/performance/hasAll_simd_int16.xml b/tests/performance/hasAll_simd_int16.xml index 63d869e7794..c2ce4eec77f 100644 --- a/tests/performance/hasAll_simd_int16.xml +++ b/tests/performance/hasAll_simd_int16.xml @@ -7,9 +7,9 @@ CREATE TABLE test_table_medium2 (`set` Array(Int16), `subset` Array (Int16)) ENGINE = MergeTree ORDER BY set CREATE TABLE test_table_mediumf (`set` Array(Int16), `subset` Array (Int16)) ENGINE = MergeTree ORDER BY set - + CREATE TABLE test_table_largef (`set` Array(Int16), `subset` Array (Int16)) ENGINE = MergeTree ORDER BY set INSERT INTO test_table_small SELECT groupArraySample(500)(number) AS set, groupArraySample(10)(number) AS subset FROM (SELECT * FROM numbers(500)) @@ -22,9 +22,9 @@ INSERT INTO test_table_mediumf SELECT groupArraySample(1000000)(number) AS set, groupArraySample(10)(number) AS subset FROM (SELECT * FROM numbers(100000000)) - + INSERT INTO test_table_largef SELECT groupArraySample(50000000)(number) AS set, groupArraySample(10)(number) AS subset FROM (SELECT * FROM numbers(1000000000)) select hasAll(set, subset) from test_table_small select hasAll(set, subset) from test_table_small2 @@ -34,9 +34,9 @@ select hasAll(set, subset) from test_table_medium2 select hasAll(set, subset) from test_table_mediumf - + select hasAll(set, subset) from test_table_largef 
DROP TABLE IF EXISTS test_table_small DROP TABLE IF EXISTS test_table_small2 @@ -46,7 +46,7 @@ DROP TABLE IF EXISTS test_table_medium2 DROP TABLE IF EXISTS test_table_mediumf - + DROP TABLE IF EXISTS test_table_largef diff --git a/tests/performance/hasAll_simd_int32.xml b/tests/performance/hasAll_simd_int32.xml index 074901737b0..4543dea161b 100644 --- a/tests/performance/hasAll_simd_int32.xml +++ b/tests/performance/hasAll_simd_int32.xml @@ -7,9 +7,9 @@ CREATE TABLE test_table_medium2 (`set` Array(Int32), `subset` Array (Int32)) ENGINE = MergeTree ORDER BY set CREATE TABLE test_table_mediumf (`set` Array(Int32), `subset` Array (Int32)) ENGINE = MergeTree ORDER BY set - + CREATE TABLE test_table_largef (`set` Array(Int32), `subset` Array (Int32)) ENGINE = MergeTree ORDER BY set INSERT INTO test_table_small SELECT groupArraySample(500)(number) AS set, groupArraySample(10)(number) AS subset FROM (SELECT * FROM numbers(500)) @@ -22,9 +22,9 @@ INSERT INTO test_table_mediumf SELECT groupArraySample(1000000)(number) AS set, groupArraySample(10)(number) AS subset FROM (SELECT * FROM numbers(100000000)) - + INSERT INTO test_table_largef SELECT groupArraySample(50000000)(number) AS set, groupArraySample(10)(number) AS subset FROM (SELECT * FROM numbers(1000000000)) select hasAll(set, subset) from test_table_small select hasAll(set, subset) from test_table_small2 @@ -34,9 +34,9 @@ select hasAll(set, subset) from test_table_medium2 select hasAll(set, subset) from test_table_mediumf - + select hasAll(set, subset) from test_table_largef DROP TABLE IF EXISTS test_table_small DROP TABLE IF EXISTS test_table_small2 @@ -46,7 +46,7 @@ DROP TABLE IF EXISTS test_table_medium2 DROP TABLE IF EXISTS test_table_mediumf - + DROP TABLE IF EXISTS test_table_largef diff --git a/tests/performance/hasAll_simd_int64.xml b/tests/performance/hasAll_simd_int64.xml index 9e68d3d219c..07e52483bb1 100644 --- a/tests/performance/hasAll_simd_int64.xml +++ b/tests/performance/hasAll_simd_int64.xml @@ -7,9 +7,9 @@ CREATE TABLE test_table_medium2 (`set` Array(Int64), `subset` Array (Int64)) ENGINE = MergeTree ORDER BY set CREATE TABLE test_table_mediumf (`set` Array(Int64), `subset` Array (Int64)) ENGINE = MergeTree ORDER BY set - + CREATE TABLE test_table_largef (`set` Array(Int64), `subset` Array (Int64)) ENGINE = MergeTree ORDER BY set INSERT INTO test_table_small SELECT groupArraySample(500)(number) AS set, groupArraySample(10)(number) AS subset FROM (SELECT * FROM numbers(500)) @@ -22,9 +22,9 @@ INSERT INTO test_table_mediumf SELECT groupArraySample(1000000)(number) AS set, groupArraySample(10)(number) AS subset FROM (SELECT * FROM numbers(100000000)) - + INSERT INTO test_table_largef SELECT groupArraySample(50000000)(number) AS set, groupArraySample(10)(number) AS subset FROM (SELECT * FROM numbers(1000000000)) select hasAll(set, subset) from test_table_small select hasAll(set, subset) from test_table_small2 @@ -34,9 +34,9 @@ select hasAll(set, subset) from test_table_medium2 select hasAll(set, subset) from test_table_mediumf - + select hasAll(set, subset) from test_table_largef DROP TABLE IF EXISTS test_table_small DROP TABLE IF EXISTS test_table_small2 @@ -46,7 +46,7 @@ DROP TABLE IF EXISTS test_table_medium2 DROP TABLE IF EXISTS test_table_mediumf - + DROP TABLE IF EXISTS test_table_largef diff --git a/tests/performance/hasAll_simd_int8.xml b/tests/performance/hasAll_simd_int8.xml index 4a0b30524ad..5ddc84aa5bd 100644 --- a/tests/performance/hasAll_simd_int8.xml +++ b/tests/performance/hasAll_simd_int8.xml @@ -7,9 +7,9 @@ 
CREATE TABLE test_table_medium2 (`set` Array(Int8), `subset` Array (Int8)) ENGINE = MergeTree ORDER BY set CREATE TABLE test_table_mediumf (`set` Array(Int8), `subset` Array (Int8)) ENGINE = MergeTree ORDER BY set - + CREATE TABLE test_table_largef (`set` Array(Int8), `subset` Array (Int8)) ENGINE = MergeTree ORDER BY set INSERT INTO test_table_small SELECT groupArraySample(500)(number) AS set, groupArraySample(10)(number) AS subset FROM (SELECT * FROM numbers(500)) @@ -22,9 +22,9 @@ INSERT INTO test_table_mediumf SELECT groupArraySample(1000000)(number) AS set, groupArraySample(10)(number) AS subset FROM (SELECT * FROM numbers(100000000)) - + INSERT INTO test_table_largef SELECT groupArraySample(50000000)(number) AS set, groupArraySample(10)(number) AS subset FROM (SELECT * FROM numbers(1000000000)) select hasAll(set, subset) from test_table_small select hasAll(set, subset) from test_table_small2 @@ -34,9 +34,9 @@ select hasAll(set, subset) from test_table_medium2 select hasAll(set, subset) from test_table_mediumf - + select hasAll(set, subset) from test_table_largef DROP TABLE IF EXISTS test_table_small DROP TABLE IF EXISTS test_table_small2 @@ -46,7 +46,7 @@ DROP TABLE IF EXISTS test_table_medium2 DROP TABLE IF EXISTS test_table_mediumf - + DROP TABLE IF EXISTS test_table_largef From 9ec7e61f5f96a97653cbf364ef68d61ca2b09d07 Mon Sep 17 00:00:00 2001 From: youenn lebras Date: Mon, 31 Jan 2022 12:15:20 +0100 Subject: [PATCH 018/239] Add Unsigned version for Int8, Int16, Int32, Int64 for SS4.2 and AVX2 (aka AVX256) --- .../GatherUtils/sliceHasImplAnyAll.h | 844 ++++++++++++++++-- 1 file changed, 768 insertions(+), 76 deletions(-) diff --git a/src/Functions/GatherUtils/sliceHasImplAnyAll.h b/src/Functions/GatherUtils/sliceHasImplAnyAll.h index 9028a94b2aa..52448f88447 100644 --- a/src/Functions/GatherUtils/sliceHasImplAnyAll.h +++ b/src/Functions/GatherUtils/sliceHasImplAnyAll.h @@ -129,22 +129,178 @@ inline ALWAYS_INLINE bool hasAllIntegralLoopRemainder( #if defined(__AVX2__) && defined(ENABLE_AVX2) -// TODO: Discuss about -// raise an error : "error: no viable conversion from 'const NumericArraySlice' to 'const NumericArraySlice'" -// How should we do, copy past each function ?? I haven't found a way to specialize a same function body for two different types. 
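The TODO removed just above asked how a single function body could serve both the signed and the unsigned element types; the hunks that follow resolve it by providing a separate copy of the kernel for each type. As a point of reference, the contract every one of these specializations implements can be stated as a short scalar routine templated on the element type. The sketch below is only an illustration of that contract (the name scalarHasAllWithNullMaps is invented here and is not part of the patch): it returns true when each element of `second` is either null while `first` also holds a null, or equal to some non-null element of `first`, which is what the vectorized kernels establish block by block before handing the tail to hasAllIntegralLoopRemainder.

#include <cstddef>
#include <cstdint>

/// Scalar reference for the "hasAll" check the SIMD specializations implement.
/// A single templated body covers Int8/16/32/64 and UInt8/16/32/64 alike.
template <typename T>
bool scalarHasAllWithNullMaps(
    const T * first, size_t first_size, const uint8_t * first_null_map,
    const T * second, size_t second_size, const uint8_t * second_null_map)
{
    for (size_t i = 0; i < second_size; ++i)
    {
        const bool second_is_null = second_null_map && second_null_map[i];
        bool found = false;
        for (size_t j = 0; j < first_size && !found; ++j)
        {
            const bool first_is_null = first_null_map && first_null_map[j];
            if (second_is_null)
                found = first_is_null;                       /// null matches null
            else
                found = !first_is_null && first[j] == second[i];
        }
        if (!found)
            return false;
    }
    return true;
}

/// The same body instantiates for a signed and an unsigned element type.
template bool scalarHasAllWithNullMaps<int32_t>(
    const int32_t *, size_t, const uint8_t *, const int32_t *, size_t, const uint8_t *);
template bool scalarHasAllWithNullMaps<uint32_t>(
    const uint32_t *, size_t, const uint8_t *, const uint32_t *, size_t, const uint8_t *);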
-// AVX2 UInt specialization -// template <> -// inline ALWAYS_INLINE bool sliceHasImplAnyAll, NumericArraySlice, sliceEqualElements >( -// const NumericArraySlice & second, const NumericArraySlice & first, const UInt8 * first_null_map, const UInt8 * second_null_map) -// { -// return sliceHasImplAnyAll, NumericArraySlice, sliceEqualElements > ( -// static_cast &>(second), static_cast &>(first), second_null_map, first_null_map); -// } - -// AVX2 Int specialization of sliceHasImplAnyAll +// AVX2 Int64 specialization template <> -inline ALWAYS_INLINE bool sliceHasImplAnyAll, NumericArraySlice, sliceEqualElements >( - const NumericArraySlice & first, const NumericArraySlice & second, const UInt8 * first_null_map, const UInt8 * second_null_map) +inline ALWAYS_INLINE bool sliceHasImplAnyAll, NumericArraySlice, sliceEqualElements >( + const NumericArraySlice & first, const NumericArraySlice & second, const UInt8 * first_null_map, const UInt8 * second_null_map) +{ + if (second.size == 0) + return true; + + if (!hasNull(first_null_map, first.size) && hasNull(second_null_map, second.size)) + return false; + + const bool has_first_null_map = first_null_map != nullptr; + const bool has_second_null_map = second_null_map != nullptr; + + size_t j = 0; + short has_mask = 1; + const Int64 full = -1, none = 0; + const __m256i ones = _mm256_set1_epi64x(full); + const __m256i zeros = _mm256_setzero_si256(); + if (second.size > 3 && first.size > 3) + { + for (; j < second.size - 3 && has_mask; j += 4) + { + has_mask = 0; + const __m256i f_data = _mm256_loadu_si256(reinterpret_cast(second.data + j)); + // bits of the bitmask are set to one if considered as null in the corresponding null map, 0 otherwise; + __m256i bitmask = has_second_null_map ? + _mm256_set_epi64x( + (second_null_map[j + 3])? full : none, + (second_null_map[j + 2])? full : none, + (second_null_map[j + 1])? full : none, + (second_null_map[j]) ? full : none) + : zeros; + + unsigned i = 0; + for (; i < first.size - 3 && !has_mask; has_mask = _mm256_testc_si256(bitmask, ones), i += 4) + { + const __m256i s_data = _mm256_loadu_si256(reinterpret_cast(first.data + i)); + const __m256i first_nm_mask = has_first_null_map? 
+ _mm256_set_m128i( + _mm_cvtepi8_epi64(_mm_loadu_si128(reinterpret_cast(first_null_map + i + 2))), + _mm_cvtepi8_epi64(_mm_loadu_si128(reinterpret_cast(first_null_map + i)))) + : zeros; + bitmask = + _mm256_or_si256( + _mm256_or_si256( + _mm256_or_si256( + _mm256_andnot_si256( + first_nm_mask, + _mm256_cmpeq_epi64(f_data, s_data)), + _mm256_andnot_si256( + _mm256_permutevar8x32_epi32(first_nm_mask, _mm256_set_epi32(5,4,3,2,1,0,7,6)), + _mm256_cmpeq_epi64(f_data, _mm256_permutevar8x32_epi32(s_data, _mm256_set_epi32(5,4,3,2,1,0,7,6))))), + + _mm256_or_si256( + _mm256_andnot_si256( + _mm256_permutevar8x32_epi32(first_nm_mask, _mm256_set_epi32(3,2,1,0,7,6,5,4)), + _mm256_cmpeq_epi64(f_data, _mm256_permutevar8x32_epi32(s_data, _mm256_set_epi32(3,2,1,0,7,6,5,4)))), + _mm256_andnot_si256( + _mm256_permutevar8x32_epi32(first_nm_mask, _mm256_set_epi32(1,0,7,6,5,4,3,2)), + _mm256_cmpeq_epi64(f_data, _mm256_permutevar8x32_epi32(s_data, _mm256_set_epi32(1,0,7,6,5,4,3,2)))))), + bitmask); + } + + if (i < first.size) + { + for (; i < first.size && !has_mask; ++i) + { + if (has_first_null_map && first_null_map[i]) + continue; + __m256i v_i = _mm256_set1_epi64x(first.data[i]); + bitmask = _mm256_or_si256(bitmask, _mm256_cmpeq_epi64(f_data, v_i)); + has_mask = _mm256_testc_si256(bitmask, ones); + } + } + } + } + + if (!has_mask && second.size > 3) + return false; + + return hasAllIntegralLoopRemainder(j, first, second, first_null_map, second_null_map); +} + +// AVX2 Int64 specialization +template <> +inline ALWAYS_INLINE bool sliceHasImplAnyAll, NumericArraySlice, sliceEqualElements >( + const NumericArraySlice & first, const NumericArraySlice & second, const UInt8 * first_null_map, const UInt8 * second_null_map) +{ + if (second.size == 0) + return true; + + if (!hasNull(first_null_map, first.size) && hasNull(second_null_map, second.size)) + return false; + + const bool has_first_null_map = first_null_map != nullptr; + const bool has_second_null_map = second_null_map != nullptr; + + size_t j = 0; + short has_mask = 1; + const Int64 full = -1, none = 0; + const __m256i ones = _mm256_set1_epi64x(full); + const __m256i zeros = _mm256_setzero_si256(); + if (second.size > 3 && first.size > 3) + { + for (; j < second.size - 3 && has_mask; j += 4) + { + has_mask = 0; + const __m256i f_data = _mm256_loadu_si256(reinterpret_cast(second.data + j)); + // bits of the bitmask are set to one if considered as null in the corresponding null map, 0 otherwise; + __m256i bitmask = has_second_null_map ? + _mm256_set_epi64x( + (second_null_map[j + 3])? full : none, + (second_null_map[j + 2])? full : none, + (second_null_map[j + 1])? full : none, + (second_null_map[j]) ? full : none) + : zeros; + + unsigned i = 0; + for (; i < first.size - 3 && !has_mask; has_mask = _mm256_testc_si256(bitmask, ones), i += 4) + { + const __m256i s_data = _mm256_loadu_si256(reinterpret_cast(first.data + i)); + const __m256i first_nm_mask = has_first_null_map? 
+ _mm256_set_m128i( + _mm_cvtepi8_epi64(_mm_loadu_si128(reinterpret_cast(first_null_map + i + 2))), + _mm_cvtepi8_epi64(_mm_loadu_si128(reinterpret_cast(first_null_map + i)))) + : zeros; + bitmask = + _mm256_or_si256( + _mm256_or_si256( + _mm256_or_si256( + _mm256_andnot_si256( + first_nm_mask, + _mm256_cmpeq_epi64(f_data, s_data)), + _mm256_andnot_si256( + _mm256_permutevar8x32_epi32(first_nm_mask, _mm256_set_epi32(5,4,3,2,1,0,7,6)), + _mm256_cmpeq_epi64(f_data, _mm256_permutevar8x32_epi32(s_data, _mm256_set_epi32(5,4,3,2,1,0,7,6))))), + + _mm256_or_si256( + _mm256_andnot_si256( + _mm256_permutevar8x32_epi32(first_nm_mask, _mm256_set_epi32(3,2,1,0,7,6,5,4)), + _mm256_cmpeq_epi64(f_data, _mm256_permutevar8x32_epi32(s_data, _mm256_set_epi32(3,2,1,0,7,6,5,4)))), + _mm256_andnot_si256( + _mm256_permutevar8x32_epi32(first_nm_mask, _mm256_set_epi32(1,0,7,6,5,4,3,2)), + _mm256_cmpeq_epi64(f_data, _mm256_permutevar8x32_epi32(s_data, _mm256_set_epi32(1,0,7,6,5,4,3,2)))))), + bitmask); + } + + if (i < first.size) + { + for (; i < first.size && !has_mask; ++i) + { + if (has_first_null_map && first_null_map[i]) + continue; + __m256i v_i = _mm256_set1_epi64x(first.data[i]); + bitmask = _mm256_or_si256(bitmask, _mm256_cmpeq_epi64(f_data, v_i)); + has_mask = _mm256_testc_si256(bitmask, ones); + } + } + } + } + + if (!has_mask && second.size > 3) + return false; + + return hasAllIntegralLoopRemainder(j, first, second, first_null_map, second_null_map); +} + +// AVX2 Int32 specialization of sliceHasImplAnyAll +template <> +inline ALWAYS_INLINE bool sliceHasImplAnyAll, NumericArraySlice, sliceEqualElements >( + const NumericArraySlice & first, const NumericArraySlice & second, const UInt8 * first_null_map, const UInt8 * second_null_map) { if (second.size == 0) return true; @@ -248,10 +404,117 @@ inline ALWAYS_INLINE bool sliceHasImplAnyAll -inline ALWAYS_INLINE bool sliceHasImplAnyAll, NumericArraySlice, sliceEqualElements >( - const NumericArraySlice & first, const NumericArraySlice & second, const UInt8 * first_null_map, const UInt8 * second_null_map) +inline ALWAYS_INLINE bool sliceHasImplAnyAll, NumericArraySlice, sliceEqualElements >( + const NumericArraySlice & first, const NumericArraySlice & second, const UInt8 * first_null_map, const UInt8 * second_null_map) +{ + if (second.size == 0) + return true; + + if (!hasNull(first_null_map, first.size) && hasNull(second_null_map, second.size)) + return false; + + const bool has_first_null_map = first_null_map != nullptr; + const bool has_second_null_map = second_null_map != nullptr; + + size_t j = 0; + short has_mask = 1; + const int full = -1, none = 0; + const __m256i ones = _mm256_set1_epi32(full); + const __m256i zeros = _mm256_setzero_si256(); + if (second.size > 7 && first.size > 7) + { + for (; j < second.size - 7 && has_mask; j += 8) + { + has_mask = 0; + const __m256i f_data = _mm256_loadu_si256(reinterpret_cast(second.data + j)); + // bits of the bitmask are set to one if considered as null in the corresponding null map, 0 otherwise; + __m256i bitmask = has_second_null_map ? + _mm256_set_epi32( + (second_null_map[j + 7]) ? full : none, + (second_null_map[j + 6]) ? full : none, + (second_null_map[j + 5]) ? full : none, + (second_null_map[j + 4]) ? full : none, + (second_null_map[j + 3]) ? full : none, + (second_null_map[j + 2]) ? full : none, + (second_null_map[j + 1]) ? full : none, + (second_null_map[j]) ? 
full : none) + : zeros; + + size_t i = 0; + for (; i < first.size - 7 && !has_mask; has_mask = _mm256_testc_si256(bitmask, ones), i += 8) + { + const __m256i s_data = _mm256_loadu_si256(reinterpret_cast(first.data + i)); + // Create a mask to avoid to compare null elements + // set_m128i takes two arguments: (high segment, low segment) that are two __m128i convert from 8bits to 32bits to match with next operations + const __m256i first_nm_mask = has_first_null_map? + _mm256_set_m128i( + _mm_cvtepi8_epi32(_mm_loadu_si128(reinterpret_cast(first_null_map + i + 4))), + _mm_cvtepi8_epi32(_mm_loadu_si128(reinterpret_cast(first_null_map + i)))) + : zeros; + bitmask = + _mm256_or_si256( + _mm256_or_si256( + _mm256_or_si256( + _mm256_or_si256( + _mm256_andnot_si256( + first_nm_mask, + _mm256_cmpeq_epi32(f_data, s_data)), + _mm256_andnot_si256( + _mm256_permutevar8x32_epi32(first_nm_mask, _mm256_set_epi32(6,5,4,3,2,1,0,7)), + _mm256_cmpeq_epi32(f_data, _mm256_permutevar8x32_epi32(s_data, _mm256_set_epi32(6,5,4,3,2,1,0,7))))), + _mm256_or_si256( + _mm256_andnot_si256( + _mm256_permutevar8x32_epi32(first_nm_mask, _mm256_set_epi32(5,4,3,2,1,0,7,6)), + _mm256_cmpeq_epi32(f_data, _mm256_permutevar8x32_epi32(s_data, _mm256_set_epi32(5,4,3,2,1,0,7,6)))), + _mm256_andnot_si256( + _mm256_permutevar8x32_epi32(first_nm_mask, _mm256_set_epi32(4,3,2,1,0,7,6,5)), + _mm256_cmpeq_epi32(f_data, _mm256_permutevar8x32_epi32(s_data, _mm256_set_epi32(4,3,2,1,0,7,6,5))))) + ), + _mm256_or_si256( + _mm256_or_si256( + _mm256_andnot_si256( + _mm256_permutevar8x32_epi32(first_nm_mask, _mm256_set_epi32(3,2,1,0,7,6,5,4)), + _mm256_cmpeq_epi32(f_data, _mm256_permutevar8x32_epi32(s_data, _mm256_set_epi32(3,2,1,0,7,6,5,4)))), + _mm256_andnot_si256( + _mm256_permutevar8x32_epi32(first_nm_mask, _mm256_set_epi32(2,1,0,7,6,5,4,3)), + _mm256_cmpeq_epi32(f_data, _mm256_permutevar8x32_epi32(s_data, _mm256_set_epi32(2,1,0,7,6,5,4,3))))), + _mm256_or_si256( + _mm256_andnot_si256( + _mm256_permutevar8x32_epi32(first_nm_mask, _mm256_set_epi32(1,0,7,6,5,4,3,2)), + _mm256_cmpeq_epi32(f_data, _mm256_permutevar8x32_epi32(s_data, _mm256_set_epi32(1,0,7,6,5,4,3,2)))), + _mm256_andnot_si256( + _mm256_permutevar8x32_epi32(first_nm_mask, _mm256_set_epi32(0,7,6,5,4,3,2,1)), + _mm256_cmpeq_epi32(f_data, _mm256_permutevar8x32_epi32(s_data, _mm256_set_epi32(0,7,6,5,4,3,2,1))))))), + bitmask); + } + + if (i < first.size) + { + // Loop(i)-jam + for (; i < first.size && !has_mask; ++i) + { + if (has_first_null_map && first_null_map[i]) + continue; + __m256i v_i = _mm256_set1_epi32(first.data[i]); + bitmask = _mm256_or_si256(bitmask, _mm256_cmpeq_epi32(f_data, v_i)); + has_mask = _mm256_testc_si256(bitmask, ones); + } + } + } + } + + if (!has_mask && second.size > 7) + return false; + + return hasAllIntegralLoopRemainder(j, first, second, first_null_map, second_null_map); +} + +// AVX2 Int16 specialization +template <> +inline ALWAYS_INLINE bool sliceHasImplAnyAll, NumericArraySlice, sliceEqualElements >( + const NumericArraySlice & first, const NumericArraySlice & second, const UInt8 * first_null_map, const UInt8 * second_null_map) { if (second.size == 0) return true; @@ -264,52 +527,106 @@ inline ALWAYS_INLINE bool sliceHasImplAnyAll 3 && first.size > 3) + if (second.size > 15 && first.size > 15) { - for (; j < second.size - 3 && has_mask; j += 4) + for (; j < second.size - 15 && has_mask; j += 16) { has_mask = 0; const __m256i f_data = _mm256_loadu_si256(reinterpret_cast(second.data + j)); - // bits of the bitmask are set to one if considered as null in 
the corresponding null map, 0 otherwise; __m256i bitmask = has_second_null_map ? - _mm256_set_epi64x( - (second_null_map[j + 3])? full : none, - (second_null_map[j + 2])? full : none, - (second_null_map[j + 1])? full : none, - (second_null_map[j]) ? full : none) + _mm256_set_epi16( + (second_null_map[j + 15]) ? full : none, (second_null_map[j + 14]) ? full : none, + (second_null_map[j + 13]) ? full : none, (second_null_map[j + 12]) ? full : none, + (second_null_map[j + 11]) ? full : none, (second_null_map[j + 10]) ? full : none, + (second_null_map[j + 9]) ? full : none, (second_null_map[j + 8])? full : none, + (second_null_map[j + 7]) ? full : none, (second_null_map[j + 6])? full : none, + (second_null_map[j + 5]) ? full : none, (second_null_map[j + 4])? full : none, + (second_null_map[j + 3]) ? full : none, (second_null_map[j + 2])? full : none, + (second_null_map[j + 1]) ? full : none, (second_null_map[j]) ? full : none) : zeros; - unsigned i = 0; - for (; i < first.size - 3 && !has_mask; has_mask = _mm256_testc_si256(bitmask, ones), i += 4) + for (; i < first.size - 15 && !has_mask; has_mask = _mm256_testc_si256(bitmask, ones), i += 16) { const __m256i s_data = _mm256_loadu_si256(reinterpret_cast(first.data + i)); const __m256i first_nm_mask = has_first_null_map? _mm256_set_m128i( - _mm_cvtepi8_epi64(_mm_loadu_si128(reinterpret_cast(first_null_map + i + 2))), - _mm_cvtepi8_epi64(_mm_loadu_si128(reinterpret_cast(first_null_map + i)))) + _mm_cvtepi8_epi16(_mm_loadu_si128(reinterpret_cast(first_null_map + i + 8))), + _mm_cvtepi8_epi16(_mm_loadu_si128(reinterpret_cast(first_null_map + i)))) : zeros; bitmask = _mm256_or_si256( _mm256_or_si256( _mm256_or_si256( - _mm256_andnot_si256( - first_nm_mask, - _mm256_cmpeq_epi64(f_data, s_data)), - _mm256_andnot_si256( - _mm256_permutevar8x32_epi32(first_nm_mask, _mm256_set_epi32(5,4,3,2,1,0,7,6)), - _mm256_cmpeq_epi64(f_data, _mm256_permutevar8x32_epi32(s_data, _mm256_set_epi32(5,4,3,2,1,0,7,6))))), - + _mm256_or_si256( + _mm256_or_si256( + _mm256_andnot_si256( + first_nm_mask, + _mm256_cmpeq_epi16(f_data, s_data)), + _mm256_andnot_si256( + _mm256_shuffle_epi8(first_nm_mask, _mm256_set_epi8(29,28,27,26,25,24,23,22,21,20,19,18,17,16,15,14,13,12,11,10,9,8,7,6,5,4,3,2,1,0,31,30)), + _mm256_cmpeq_epi16(f_data, _mm256_shuffle_epi8(s_data, _mm256_set_epi8(29,28,27,26,25,24,23,22,21,20,19,18,17,16,15,14,13,12,11,10,9,8,7,6,5,4,3,2,1,0,31,30))))), + _mm256_or_si256( + _mm256_andnot_si256( + _mm256_shuffle_epi8(first_nm_mask, _mm256_set_epi8(27,26,25,24,23,22,21,20,19,18,17,16,15,14,13,12,11,10,9,8,7,6,5,4,3,2,1,0,31,30,29,28)), + _mm256_cmpeq_epi16(f_data, _mm256_shuffle_epi8(s_data, _mm256_set_epi8(27,26,25,24,23,22,21,20,19,18,17,16,15,14,13,12,11,10,9,8,7,6,5,4,3,2,1,0,31,30,29,28)))), + _mm256_andnot_si256( + _mm256_shuffle_epi8(first_nm_mask, _mm256_set_epi8(25,24,23,22,21,20,19,18,17,16,15,14,13,12,11,10,9,8,7,6,5,4,3,2,1,0,31,30,29,28,27,26)), + _mm256_cmpeq_epi16(f_data, _mm256_shuffle_epi8(s_data, _mm256_set_epi8(25,24,23,22,21,20,19,18,17,16,15,14,13,12,11,10,9,8,7,6,5,4,3,2,1,0,31,30,29,28,27,26))))) + ), + _mm256_or_si256( + _mm256_or_si256( + _mm256_andnot_si256( + _mm256_shuffle_epi8(first_nm_mask, _mm256_set_epi8(23,22,21,20,19,18,17,16,15,14,13,12,11,10,9,8,7,6,5,4,3,2,1,0,31,30,29,28,27,26,25,24)), + _mm256_cmpeq_epi16(f_data, _mm256_shuffle_epi8(s_data, _mm256_set_epi8(23,22,21,20,19,18,17,16,15,14,13,12,11,10,9,8,7,6,5,4,3,2,1,0,31,30,29,28,27,26,25,24)))), + _mm256_andnot_si256( + _mm256_shuffle_epi8(first_nm_mask, 
_mm256_set_epi8(21,20,19,18,17,16,15,14,13,12,11,10,9,8,7,6,5,4,3,2,1,0,31,30,29,28,27,26,25,24,23,22)), + _mm256_cmpeq_epi16(f_data, _mm256_shuffle_epi8(s_data, _mm256_set_epi8(21,20,19,18,17,16,15,14,13,12,11,10,9,8,7,6,5,4,3,2,1,0,31,30,29,28,27,26,25,24,23,22))))), + _mm256_or_si256( + _mm256_andnot_si256( + _mm256_shuffle_epi8(first_nm_mask, _mm256_set_epi8(19,18,17,16,15,14,13,12,11,10,9,8,7,6,5,4,3,2,1,0,31,30,29,28,27,26,25,24,23,22,21,20)), + _mm256_cmpeq_epi16(f_data, _mm256_shuffle_epi8(s_data, _mm256_set_epi8(19,18,17,16,15,14,13,12,11,10,9,8,7,6,5,4,3,2,1,0,31,30,29,28,27,26,25,24,23,22,21,20)))), + _mm256_andnot_si256( + _mm256_shuffle_epi8(first_nm_mask, _mm256_set_epi8(17,16,15,14,13,12,11,10,9,8,7,6,5,4,3,2,1,0,31,30,29,28,27,26,25,24,23,22,21,20,19,18)), + _mm256_cmpeq_epi16(f_data, _mm256_shuffle_epi8(s_data, _mm256_set_epi8(17,16,15,14,13,12,11,10,9,8,7,6,5,4,3,2,1,0,31,30,29,28,27,26,25,24,23,22,21,20,19,18)))))) + ), _mm256_or_si256( - _mm256_andnot_si256( - _mm256_permutevar8x32_epi32(first_nm_mask, _mm256_set_epi32(3,2,1,0,7,6,5,4)), - _mm256_cmpeq_epi64(f_data, _mm256_permutevar8x32_epi32(s_data, _mm256_set_epi32(3,2,1,0,7,6,5,4)))), - _mm256_andnot_si256( - _mm256_permutevar8x32_epi32(first_nm_mask, _mm256_set_epi32(1,0,7,6,5,4,3,2)), - _mm256_cmpeq_epi64(f_data, _mm256_permutevar8x32_epi32(s_data, _mm256_set_epi32(1,0,7,6,5,4,3,2)))))), - bitmask); + _mm256_or_si256( + _mm256_or_si256( + _mm256_andnot_si256( + _mm256_permute2x128_si256(first_nm_mask, first_nm_mask,1), + _mm256_cmpeq_epi16(f_data, _mm256_permute2x128_si256(s_data, s_data, 1))), + _mm256_andnot_si256( + _mm256_shuffle_epi8(_mm256_permute2x128_si256(first_nm_mask, first_nm_mask, 1), _mm256_set_epi8(13,12,11,10,9,8,7,6,5,4,3,2,1,0,31,30,29,28,27,26,25,24,23,22,21,20,19,18,17,16,15,14)), + _mm256_cmpeq_epi16(f_data, _mm256_shuffle_epi8(_mm256_permute2x128_si256(s_data, s_data, 1), _mm256_set_epi8(13,12,11,10,9,8,7,6,5,4,3,2,1,0,31,30,29,28,27,26,25,24,23,22,21,20,19,18,17,16,15,14))))), + _mm256_or_si256( + _mm256_andnot_si256( + _mm256_shuffle_epi8(_mm256_permute2x128_si256(first_nm_mask, first_nm_mask, 1), _mm256_set_epi8(11,10,9,8,7,6,5,4,3,2,1,0,31,30,29,28,27,26,25,24,23,22,21,20,19,18,17,16,15,14,13,12)), + _mm256_cmpeq_epi16(f_data, _mm256_shuffle_epi8(_mm256_permute2x128_si256(s_data, s_data, 1), _mm256_set_epi8(11,10,9,8,7,6,5,4,3,2,1,0,31,30,29,28,27,26,25,24,23,22,21,20,19,18,17,16,15,14,13,12)))), + _mm256_andnot_si256( + _mm256_shuffle_epi8(_mm256_permute2x128_si256(first_nm_mask, first_nm_mask, 1), _mm256_set_epi8(9,8,7,6,5,4,3,2,1,0,31,30,29,28,27,26,25,24,23,22,21,20,19,18,17,16,15,14,13,12,11,10)), + _mm256_cmpeq_epi16(f_data, _mm256_shuffle_epi8(_mm256_permute2x128_si256(s_data, s_data, 1), _mm256_set_epi8(9,8,7,6,5,4,3,2,1,0,31,30,29,28,27,26,25,24,23,22,21,20,19,18,17,16,15,14,13,12,11,10))))) + ), + _mm256_or_si256( + _mm256_or_si256( + _mm256_andnot_si256( + _mm256_shuffle_epi8(_mm256_permute2x128_si256(first_nm_mask, first_nm_mask, 1), _mm256_set_epi8(7,6,5,4,3,2,1,0,31,30,29,28,27,26,25,24,23,22,21,20,19,18,17,16,15,14,13,12,11,10,9,8)), + _mm256_cmpeq_epi16(f_data, _mm256_shuffle_epi8(_mm256_permute2x128_si256(s_data ,s_data, 1), _mm256_set_epi8(7,6,5,4,3,2,1,0,31,30,29,28,27,26,25,24,23,22,21,20,19,18,17,16,15,14,13,12,11,10,9,8)))), + _mm256_andnot_si256( + _mm256_shuffle_epi8(_mm256_permute2x128_si256(first_nm_mask, first_nm_mask, 1), _mm256_set_epi8(5,4,3,2,1,0,31,30,29,28,27,26,25,24,23,22,21,20,19,18,17,16,15,14,13,12,11,10,9,8,7,6)), + _mm256_cmpeq_epi16(f_data, 
_mm256_shuffle_epi8(_mm256_permute2x128_si256(s_data, s_data, 1), _mm256_set_epi8(5,4,3,2,1,0,31,30,29,28,27,26,25,24,23,22,21,20,19,18,17,16,15,14,13,12,11,10,9,8,7,6))))), + _mm256_or_si256( + _mm256_andnot_si256( + _mm256_shuffle_epi8(_mm256_permute2x128_si256(first_nm_mask, first_nm_mask, 1), _mm256_set_epi8(3,2,1,0,31,30,29,28,27,26,25,24,23,22,21,20,19,18,17,16,15,14,13,12,11,10,9,8,7,6,5,4)), + _mm256_cmpeq_epi16(f_data, _mm256_shuffle_epi8(_mm256_permute2x128_si256(s_data ,s_data ,1), _mm256_set_epi8(3,2,1,0,31,30,29,28,27,26,25,24,23,22,21,20,19,18,17,16,15,14,13,12,11,10,9,8,7,6,5,4)))), + _mm256_andnot_si256( + _mm256_shuffle_epi8(_mm256_permute2x128_si256(first_nm_mask, first_nm_mask, 1), _mm256_set_epi8(1,0,31,30,29,28,27,26,25,24,23,22,21,20,19,18,17,16,15,14,13,12,11,10,9,8,7,6,5,4,3,2)), + _mm256_cmpeq_epi16(f_data, _mm256_shuffle_epi8(_mm256_permute2x128_si256(s_data, s_data, 1), _mm256_set_epi8(1,0,31,30,29,28,27,26,25,24,23,22,21,20,19,18,17,16,15,14,13,12,11,10,9,8,7,6,5,4,3,2)))))) + ) + ), + bitmask); } if (i < first.size) @@ -318,24 +635,24 @@ inline ALWAYS_INLINE bool sliceHasImplAnyAll 3) + if (!has_mask && second.size > 15) return false; return hasAllIntegralLoopRemainder(j, first, second, first_null_map, second_null_map); } -// AVX2 Int16_t specialization +// AVX2 UInt16 specialization template <> -inline ALWAYS_INLINE bool sliceHasImplAnyAll, NumericArraySlice, sliceEqualElements >( - const NumericArraySlice & first, const NumericArraySlice & second, const UInt8 * first_null_map, const UInt8 * second_null_map) +inline ALWAYS_INLINE bool sliceHasImplAnyAll, NumericArraySlice, sliceEqualElements >( + const NumericArraySlice & first, const NumericArraySlice & second, const UInt8 * first_null_map, const UInt8 * second_null_map) { if (second.size == 0) return true; @@ -472,10 +789,146 @@ inline ALWAYS_INLINE bool sliceHasImplAnyAll -inline ALWAYS_INLINE bool sliceHasImplAnyAll, NumericArraySlice, sliceEqualElements >( - const NumericArraySlice & first, const NumericArraySlice & second, const UInt8 * first_null_map, const UInt8 * second_null_map) +inline ALWAYS_INLINE bool sliceHasImplAnyAll, NumericArraySlice, sliceEqualElements >( + const NumericArraySlice & first, const NumericArraySlice & second, const UInt8 * first_null_map, const UInt8 * second_null_map) +{ + if (second.size == 0) + return true; + + if (!hasNull(first_null_map, first.size) && hasNull(second_null_map, second.size)) + return false; + + const bool has_first_null_map = first_null_map != nullptr; + const bool has_second_null_map = second_null_map != nullptr; + + size_t j = 0; + short has_mask = 1; + const Int64 full = -1, none = 0; + const __m128i zeros = _mm_setzero_si128(); + if (second.size > 1 && first.size > 1) + { + for (; j < second.size - 1 && has_mask; j += 2) + { + has_mask = 0; + const __m128i f_data = _mm_loadu_si128(reinterpret_cast(second.data + j)); + __m128i bitmask = has_second_null_map ? + _mm_set_epi64x( + (second_null_map[j + 1]) ? full : none, + (second_null_map[j]) ? full : none) + : zeros; + unsigned i = 0; + for (; i < first.size - 1 && !has_mask; has_mask = _mm_test_all_ones(bitmask), i += 2) + { + const __m128i s_data = _mm_loadu_si128(reinterpret_cast(first.data + i)); + const __m128i first_nm_mask = has_first_null_map ? 
+ _mm_cvtepi8_epi64(_mm_loadu_si128(reinterpret_cast(first_null_map + i))) + : zeros; + bitmask = + _mm_or_si128( + _mm_or_si128( + _mm_andnot_si128( + first_nm_mask, + _mm_cmpeq_epi64(f_data, s_data)), + _mm_andnot_si128( + _mm_shuffle_epi32(first_nm_mask, _MM_SHUFFLE(1,0,3,2)), + _mm_cmpeq_epi64(f_data, _mm_shuffle_epi32(s_data, _MM_SHUFFLE(1,0,3,2))))), + bitmask); + } + + if (i < first.size) + { + for (; i < first.size && !has_mask; ++i) + { + if (has_first_null_map && first_null_map[i]) + continue; + __m128i v_i = _mm_set1_epi64x(first.data[i]); + bitmask = _mm_or_si128(bitmask, _mm_cmpeq_epi64(f_data, v_i)); + has_mask = _mm_test_all_ones(bitmask); + } + } + } + } + + if (!has_mask && second.size > 1) + return false; + + return hasAllIntegralLoopRemainder(j, first, second, first_null_map, second_null_map); +} + +// SSE4.2 UInt64 specialization +template <> +inline ALWAYS_INLINE bool sliceHasImplAnyAll, NumericArraySlice, sliceEqualElements >( + const NumericArraySlice & first, const NumericArraySlice & second, const UInt8 * first_null_map, const UInt8 * second_null_map) +{ + if (second.size == 0) + return true; + + if (!hasNull(first_null_map, first.size) && hasNull(second_null_map, second.size)) + return false; + + const bool has_first_null_map = first_null_map != nullptr; + const bool has_second_null_map = second_null_map != nullptr; + + size_t j = 0; + short has_mask = 1; + const Int64 full = -1, none = 0; + const __m128i zeros = _mm_setzero_si128(); + if (second.size > 1 && first.size > 1) + { + for (; j < second.size - 1 && has_mask; j += 2) + { + has_mask = 0; + const __m128i f_data = _mm_loadu_si128(reinterpret_cast(second.data + j)); + __m128i bitmask = has_second_null_map ? + _mm_set_epi64x( + (second_null_map[j + 1]) ? full : none, + (second_null_map[j]) ? full : none) + : zeros; + unsigned i = 0; + for (; i < first.size - 1 && !has_mask; has_mask = _mm_test_all_ones(bitmask), i += 2) + { + const __m128i s_data = _mm_loadu_si128(reinterpret_cast(first.data + i)); + const __m128i first_nm_mask = has_first_null_map ? 
+ _mm_cvtepi8_epi64(_mm_loadu_si128(reinterpret_cast(first_null_map + i))) + : zeros; + bitmask = + _mm_or_si128( + _mm_or_si128( + _mm_andnot_si128( + first_nm_mask, + _mm_cmpeq_epi64(f_data, s_data)), + _mm_andnot_si128( + _mm_shuffle_epi32(first_nm_mask, _MM_SHUFFLE(1,0,3,2)), + _mm_cmpeq_epi64(f_data, _mm_shuffle_epi32(s_data, _MM_SHUFFLE(1,0,3,2))))), + bitmask); + } + + if (i < first.size) + { + for (; i < first.size && !has_mask; ++i) + { + if (has_first_null_map && first_null_map[i]) + continue; + __m128i v_i = _mm_set1_epi64x(first.data[i]); + bitmask = _mm_or_si128(bitmask, _mm_cmpeq_epi64(f_data, v_i)); + has_mask = _mm_test_all_ones(bitmask); + } + } + } + } + + if (!has_mask && second.size > 1) + return false; + + return hasAllIntegralLoopRemainder(j, first, second, first_null_map, second_null_map); +} + +// SSE4.2 Int32 specialization +template <> +inline ALWAYS_INLINE bool sliceHasImplAnyAll, NumericArraySlice, sliceEqualElements >( + const NumericArraySlice & first, const NumericArraySlice & second, const UInt8 * first_null_map, const UInt8 * second_null_map) { if (second.size == 0) return true; @@ -553,10 +1006,10 @@ inline ALWAYS_INLINE bool sliceHasImplAnyAll -inline ALWAYS_INLINE bool sliceHasImplAnyAll, NumericArraySlice, sliceEqualElements >( - const NumericArraySlice & first, const NumericArraySlice & second, const UInt8 * first_null_map, const UInt8 * second_null_map) +inline ALWAYS_INLINE bool sliceHasImplAnyAll, NumericArraySlice, sliceEqualElements >( + const NumericArraySlice & first, const NumericArraySlice & second, const UInt8 * first_null_map, const UInt8 * second_null_map) { if (second.size == 0) return true; @@ -569,35 +1022,48 @@ inline ALWAYS_INLINE bool sliceHasImplAnyAll 1 && first.size > 1) + if (second.size > 3 && first.size > 3) { - for (; j < second.size - 1 && has_mask; j += 2) + for (; j < second.size - 3 && has_mask; j += 4) { has_mask = 0; const __m128i f_data = _mm_loadu_si128(reinterpret_cast(second.data + j)); __m128i bitmask = has_second_null_map ? - _mm_set_epi64x( + _mm_set_epi32( + (second_null_map[j + 3]) ? full : none, + (second_null_map[j + 2]) ? full : none, (second_null_map[j + 1]) ? full : none, (second_null_map[j]) ? full : none) : zeros; + unsigned i = 0; - for (; i < first.size - 1 && !has_mask; has_mask = _mm_test_all_ones(bitmask), i += 2) + for (; i < first.size - 3 && !has_mask; has_mask = _mm_test_all_ones(bitmask), i += 4) { const __m128i s_data = _mm_loadu_si128(reinterpret_cast(first.data + i)); const __m128i first_nm_mask = has_first_null_map ? 
- _mm_cvtepi8_epi64(_mm_loadu_si128(reinterpret_cast(first_null_map + i))) + _mm_cvtepi8_epi32(_mm_loadu_si128(reinterpret_cast(first_null_map + i))) : zeros; + bitmask = _mm_or_si128( + _mm_or_si128( _mm_or_si128( _mm_andnot_si128( - first_nm_mask, - _mm_cmpeq_epi64(f_data, s_data)), + first_nm_mask, + _mm_cmpeq_epi32(f_data, s_data)), + _mm_andnot_si128( + _mm_shuffle_epi32(first_nm_mask, _MM_SHUFFLE(2,1,0,3)), + _mm_cmpeq_epi32(f_data, _mm_shuffle_epi32(s_data, _MM_SHUFFLE(2,1,0,3))))), + _mm_or_si128( _mm_andnot_si128( _mm_shuffle_epi32(first_nm_mask, _MM_SHUFFLE(1,0,3,2)), - _mm_cmpeq_epi64(f_data, _mm_shuffle_epi32(s_data, _MM_SHUFFLE(1,0,3,2))))), + _mm_cmpeq_epi32(f_data, _mm_shuffle_epi32(s_data, _MM_SHUFFLE(1,0,3,2)))), + _mm_andnot_si128( + _mm_shuffle_epi32(first_nm_mask, _MM_SHUFFLE(0,3,2,1)), + _mm_cmpeq_epi32(f_data, _mm_shuffle_epi32(s_data, _MM_SHUFFLE(0,3,2,1))))) + ), bitmask); } @@ -607,24 +1073,120 @@ inline ALWAYS_INLINE bool sliceHasImplAnyAll 1) + if (!has_mask && second.size > 3) return false; return hasAllIntegralLoopRemainder(j, first, second, first_null_map, second_null_map); } -// SSE4.2 Int16_t specialization +// SSE4.2 Int16 specialization template <> -inline ALWAYS_INLINE bool sliceHasImplAnyAll, NumericArraySlice, sliceEqualElements >( - const NumericArraySlice & first, const NumericArraySlice & second, const UInt8 * first_null_map, const UInt8 * second_null_map) +inline ALWAYS_INLINE bool sliceHasImplAnyAll, NumericArraySlice, sliceEqualElements >( + const NumericArraySlice & first, const NumericArraySlice & second, const UInt8 * first_null_map, const UInt8 * second_null_map) +{ + if (second.size == 0) + return true; + + if (!hasNull(first_null_map, first.size) && hasNull(second_null_map, second.size)) + return false; + + const bool has_first_null_map = first_null_map != nullptr; + const bool has_second_null_map = second_null_map != nullptr; + + size_t j = 0; + short has_mask = 1; + const int16_t full = -1, none = 0; + const __m128i zeros = _mm_setzero_si128(); + if (second.size > 6 && first.size > 6) + { + for (; j < second.size - 7 && has_mask; j += 8) + { + has_mask = 0; + const __m128i f_data = _mm_loadu_si128(reinterpret_cast(second.data + j)); + __m128i bitmask = has_second_null_map ? + _mm_set_epi16( + (second_null_map[j + 7]) ? full : none, (second_null_map[j + 6]) ? full : none, + (second_null_map[j + 5]) ? full : none, (second_null_map[j + 4]) ? full : none, + (second_null_map[j + 3]) ? full : none, (second_null_map[j + 2]) ? full : none, + (second_null_map[j + 1]) ? full : none, (second_null_map[j]) ? full: none) + : zeros; + unsigned i = 0; + for (; i < first.size-7 && !has_mask; has_mask = _mm_test_all_ones(bitmask), i += 8) + { + const __m128i s_data = _mm_loadu_si128(reinterpret_cast(first.data + i)); + const __m128i first_nm_mask = has_first_null_map ? 
+ _mm_cvtepi8_epi16(_mm_loadu_si128(reinterpret_cast(first_null_map + i))) + : zeros; + bitmask = + _mm_or_si128( + _mm_or_si128( + _mm_or_si128( + _mm_or_si128( + _mm_andnot_si128( + first_nm_mask, + _mm_cmpeq_epi16(f_data, s_data)), + _mm_andnot_si128( + _mm_shuffle_epi8(first_nm_mask, _mm_set_epi8(13,12,11,10,9,8,7,6,5,4,3,2,1,0,15,14)), + _mm_cmpeq_epi16(f_data, _mm_shuffle_epi8(s_data, _mm_set_epi8(13,12,11,10,9,8,7,6,5,4,3,2,1,0,15,14))))), + _mm_or_si128( + _mm_andnot_si128( + _mm_shuffle_epi8(first_nm_mask, _mm_set_epi8(11,10,9,8,7,6,5,4,3,2,1,0,15,14,13,12)), + _mm_cmpeq_epi16(f_data, _mm_shuffle_epi8(s_data, _mm_set_epi8(11,10,9,8,7,6,5,4,3,2,1,0,15,14,13,12)))), + _mm_andnot_si128( + _mm_shuffle_epi8(first_nm_mask, _mm_set_epi8(9,8,7,6,5,4,3,2,1,0,15,14,13,12,11,10)), + _mm_cmpeq_epi16(f_data, _mm_shuffle_epi8(s_data, _mm_set_epi8(9,8,7,6,5,4,3,2,1,0,15,14,13,12,11,10))))) + ), + _mm_or_si128( + _mm_or_si128( + _mm_andnot_si128( + _mm_shuffle_epi8(first_nm_mask, _mm_set_epi8(7,6,5,4,3,2,1,0,15,14,13,12,11,10,9,8)), + _mm_cmpeq_epi16(f_data, _mm_shuffle_epi8(s_data, _mm_set_epi8(7,6,5,4,3,2,1,0,15,14,13,12,11,10,9,8)))), + _mm_andnot_si128( + _mm_shuffle_epi8(first_nm_mask, _mm_set_epi8(5,4,3,2,1,0,15,14,13,12,11,10,9,8,7,6)), + _mm_cmpeq_epi16(f_data, _mm_shuffle_epi8(s_data, _mm_set_epi8(5,4,3,2,1,0,15,14,13,12,11,10,9,8,7,6))))), + _mm_or_si128( + _mm_andnot_si128( + _mm_shuffle_epi8(first_nm_mask, _mm_set_epi8(3,2,1,0,15,14,13,12,11,10,9,8,7,6,5,4)), + _mm_cmpeq_epi16(f_data, _mm_shuffle_epi8(s_data, _mm_set_epi8(3,2,1,0,15,14,13,12,11,10,9,8,7,6,5,4)))), + _mm_andnot_si128( + _mm_shuffle_epi8(first_nm_mask, _mm_set_epi8(1,0,15,14,13,12,11,10,9,8,7,6,5,4,3,2)), + _mm_cmpeq_epi16(f_data, _mm_shuffle_epi8(s_data, _mm_set_epi8(1,0,15,14,13,12,11,10,9,8,7,6,5,4,3,2)))))) + ), + bitmask); + } + + if (i < first.size) + { + for (; i < first.size && !has_mask; ++i) + { + if (has_first_null_map && first_null_map[i]) + continue; + __m128i v_i = _mm_set1_epi16(first.data[i]); + bitmask = _mm_or_si128(bitmask, _mm_cmpeq_epi16(f_data, v_i)); + has_mask = _mm_test_all_ones(bitmask); + } + } + } + } + + if (!has_mask && second.size > 6) + return false; + + return hasAllIntegralLoopRemainder(j, first, second, first_null_map, second_null_map); +} + +// SSE4.2 UInt16 specialization +template <> +inline ALWAYS_INLINE bool sliceHasImplAnyAll, NumericArraySlice, sliceEqualElements >( + const NumericArraySlice & first, const NumericArraySlice & second, const UInt8 * first_null_map, const UInt8 * second_null_map) { if (second.size == 0) return true; @@ -721,10 +1283,10 @@ inline ALWAYS_INLINE bool sliceHasImplAnyAll -inline ALWAYS_INLINE bool sliceHasImplAnyAll, NumericArraySlice, sliceEqualElements >( - const NumericArraySlice & first, const NumericArraySlice & second, const UInt8 * first_null_map, const UInt8 * second_null_map) +inline ALWAYS_INLINE bool sliceHasImplAnyAll, NumericArraySlice, sliceEqualElements >( + const NumericArraySlice & first, const NumericArraySlice & second, const UInt8 * first_null_map, const UInt8 * second_null_map) { if (second.size == 0) return true; @@ -852,6 +1414,136 @@ inline ALWAYS_INLINE bool sliceHasImplAnyAll +inline ALWAYS_INLINE bool sliceHasImplAnyAll, NumericArraySlice, sliceEqualElements >( + const NumericArraySlice & first, const NumericArraySlice & second, const UInt8 * first_null_map, const UInt8 * second_null_map) +{ + if (second.size == 0) + return true; + + if (!hasNull(first_null_map, first.size) && hasNull(second_null_map, second.size)) + return 
false; + + const bool has_first_null_map = first_null_map != nullptr; + const bool has_second_null_map = second_null_map != nullptr; + + size_t j = 0; + short has_mask = 1; + const int8_t full = -1, none = 0; + const __m128i zeros = _mm_setzero_si128(); + if (second.size > 15 && first.size > 15) + { + for (; j < second.size - 15 && has_mask; j += 16) + { + has_mask = 0; + const __m128i f_data = _mm_loadu_si128(reinterpret_cast(second.data + j)); + __m128i bitmask = has_second_null_map ? + _mm_set_epi8( + (second_null_map[j + 15]) ? full : none, (second_null_map[j + 14]) ? full : none, + (second_null_map[j + 13]) ? full : none, (second_null_map[j + 12]) ? full : none, + (second_null_map[j + 11]) ? full : none, (second_null_map[j + 10]) ? full : none, + (second_null_map[j + 9]) ? full : none, (second_null_map[j + 8]) ? full : none, + (second_null_map[j + 7]) ? full : none, (second_null_map[j + 6]) ? full : none, + (second_null_map[j + 5]) ? full : none, (second_null_map[j + 4]) ? full : none, + (second_null_map[j + 3]) ? full : none, (second_null_map[j + 2]) ? full : none, + (second_null_map[j + 1]) ? full : none, (second_null_map[j]) ? full : none) + : zeros; + unsigned i = 0; + for (; i < first.size - 15 && !has_mask; has_mask = _mm_test_all_ones(bitmask), i += 16) + { + const __m128i s_data = _mm_loadu_si128(reinterpret_cast(first.data + i)); + const __m128i first_nm_mask = has_first_null_map ? + _mm_loadu_si128(reinterpret_cast(first_null_map + i)) + : zeros; + bitmask = + _mm_or_si128( + _mm_or_si128( + _mm_or_si128( + _mm_or_si128( + _mm_or_si128( + _mm_andnot_si128( + first_nm_mask, + _mm_cmpeq_epi8(f_data, s_data)), + _mm_andnot_si128( + _mm_shuffle_epi8(first_nm_mask, _mm_set_epi8(14,13,12,11,10,9,8,7,6,5,4,3,2,1,0,15)), + _mm_cmpeq_epi8(f_data, _mm_shuffle_epi8(s_data, _mm_set_epi8(14,13,12,11,10,9,8,7,6,5,4,3,2,1,0,15))))), + _mm_or_si128( + _mm_andnot_si128( + _mm_shuffle_epi8(first_nm_mask, _mm_set_epi8(13,12,11,10,9,8,7,6,5,4,3,2,1,0,15,14)), + _mm_cmpeq_epi8(f_data, _mm_shuffle_epi8(s_data, _mm_set_epi8(13,12,11,10,9,8,7,6,5,4,3,2,1,0,15,14)))), + _mm_andnot_si128( + _mm_shuffle_epi8(first_nm_mask, _mm_set_epi8(12,11,10,9,8,7,6,5,4,3,2,1,0,15,14,13)), + _mm_cmpeq_epi8(f_data, _mm_shuffle_epi8(s_data, _mm_set_epi8(12,11,10,9,8,7,6,5,4,3,2,1,0,15,14,13))))) + ), + _mm_or_si128( + _mm_or_si128( + _mm_andnot_si128( + _mm_shuffle_epi8(first_nm_mask, _mm_set_epi8(11,10,9,8,7,6,5,4,3,2,1,0,15,14,13,12)), + _mm_cmpeq_epi8(f_data, _mm_shuffle_epi8(s_data, _mm_set_epi8(11,10,9,8,7,6,5,4,3,2,1,0,15,14,13,12)))), + _mm_andnot_si128( + _mm_shuffle_epi8(first_nm_mask, _mm_set_epi8(10,9,8,7,6,5,4,3,2,1,0,15,14,13,12,11)), + _mm_cmpeq_epi8(f_data, _mm_shuffle_epi8(s_data, _mm_set_epi8(10,9,8,7,6,5,4,3,2,1,0,15,14,13,12,11))))), + _mm_or_si128( + _mm_andnot_si128( + _mm_shuffle_epi8(first_nm_mask, _mm_set_epi8(9,8,7,6,5,4,3,2,1,0,15,14,13,12,11,10)), + _mm_cmpeq_epi8(f_data, _mm_shuffle_epi8(s_data, _mm_set_epi8(9,8,7,6,5,4,3,2,1,0,15,14,13,12,11,10)))), + _mm_andnot_si128( + _mm_shuffle_epi8(first_nm_mask, _mm_set_epi8(8,7,6,5,4,3,2,1,0,15,14,13,12,11,10,9)), + _mm_cmpeq_epi8(f_data, _mm_shuffle_epi8(s_data, _mm_set_epi8(8,7,6,5,4,3,2,1,0,15,14,13,12,11,10,9))))))), + _mm_or_si128( + _mm_or_si128( + _mm_or_si128( + _mm_andnot_si128( + _mm_shuffle_epi8(first_nm_mask, _mm_set_epi8(7,6,5,4,3,2,1,0,15,14,13,12,11,10,9,8)), + _mm_cmpeq_epi8(f_data, _mm_shuffle_epi8(s_data, _mm_set_epi8(7,6,5,4,3,2,1,0,15,14,13,12,11,10,9,8)))), + _mm_andnot_si128( + _mm_shuffle_epi8(first_nm_mask, 
_mm_set_epi8(6,5,4,3,2,1,0,15,14,13,12,11,10,9,8,7)), + _mm_cmpeq_epi8(f_data, _mm_shuffle_epi8(s_data, _mm_set_epi8(6,5,4,3,2,1,0,15,14,13,12,11,10,9,8,7))))), + _mm_or_si128( + _mm_andnot_si128( + _mm_shuffle_epi8(first_nm_mask, _mm_set_epi8(5,4,3,2,1,0,15,14,13,12,11,10,9,8,7,6)), + _mm_cmpeq_epi8(f_data, _mm_shuffle_epi8(s_data, _mm_set_epi8(5,4,3,2,1,0,15,14,13,12,11,10,9,8,7,6)))), + _mm_andnot_si128( + _mm_shuffle_epi8(first_nm_mask, _mm_set_epi8(4,3,2,1,0,15,14,13,12,11,10,9,8,7,6,5)), + _mm_cmpeq_epi8(f_data, _mm_shuffle_epi8(s_data, _mm_set_epi8(4,3,2,1,0,15,14,13,12,11,10,9,8,7,6,5)))))), + _mm_or_si128( + _mm_or_si128( + _mm_andnot_si128( + _mm_shuffle_epi8(first_nm_mask, _mm_set_epi8(3,2,1,0,15,14,13,12,11,10,9,8,7,6,5,4)), + _mm_cmpeq_epi8(f_data, _mm_shuffle_epi8(s_data, _mm_set_epi8(3,2,1,0,15,14,13,12,11,10,9,8,7,6,5,4)))), + _mm_andnot_si128( + _mm_shuffle_epi8(first_nm_mask, _mm_set_epi8(2,1,0,15,14,13,12,11,10,9,8,7,6,5,4,3)), + _mm_cmpeq_epi8(f_data, _mm_shuffle_epi8(s_data, _mm_set_epi8(2,1,0,15,14,13,12,11,10,9,8,7,6,5,4,3))))), + _mm_or_si128( + _mm_andnot_si128( + _mm_shuffle_epi8(first_nm_mask, _mm_set_epi8(1,0,15,14,13,12,11,10,9,8,7,6,5,4,3,2)), + _mm_cmpeq_epi8(f_data, _mm_shuffle_epi8(s_data, _mm_set_epi8(1,0,15,14,13,12,11,10,9,8,7,6,5,4,3,2)))), + _mm_andnot_si128( + _mm_shuffle_epi8(first_nm_mask, _mm_set_epi8(0,15,14,13,12,11,10,9,8,7,6,5,4,3,2,1)), + _mm_cmpeq_epi8(f_data, _mm_shuffle_epi8(s_data, _mm_set_epi8(0,15,14,13,12,11,10,9,8,7,6,5,4,3,2,1)))))))), + bitmask); + } + + if (i < first.size) + { + for (; i < first.size && !has_mask; ++i) + { + if (has_first_null_map && first_null_map[i]) + continue; + __m128i v_i = _mm_set1_epi8(first.data[i]); + bitmask = _mm_or_si128(bitmask, _mm_cmpeq_epi8(f_data, v_i)); + has_mask = _mm_test_all_ones(bitmask); + } + } + } + } + + if (!has_mask && second.size > 15) + return false; + + return hasAllIntegralLoopRemainder(j, first, second, first_null_map, second_null_map); +} #endif } From 57ff05b6e6583a3ca67c92cc1eb0e88fe4918a20 Mon Sep 17 00:00:00 2001 From: Habibullah Oladepo Date: Mon, 7 Feb 2022 21:38:36 +0100 Subject: [PATCH 019/239] Add new function toLastDayOfMonth --- .../functions/date-time-functions.md | 7 ++++++ src/Common/DateLUTImpl.h | 21 ++++++++++++++++ src/Common/tests/gtest_DateLUTImpl.cpp | 6 +++++ src/Functions/DateTimeTransforms.h | 24 +++++++++++++++++++ src/Functions/registerFunctionsDateTime.cpp | 2 ++ src/Functions/toLastDayOfMonth.cpp | 21 ++++++++++++++++ utils/db-generator/query_db_generator.cpp | 4 ++-- 7 files changed, 83 insertions(+), 2 deletions(-) create mode 100644 src/Functions/toLastDayOfMonth.cpp diff --git a/docs/en/sql-reference/functions/date-time-functions.md b/docs/en/sql-reference/functions/date-time-functions.md index 7ded7e72d8c..6180ad91946 100644 --- a/docs/en/sql-reference/functions/date-time-functions.md +++ b/docs/en/sql-reference/functions/date-time-functions.md @@ -393,6 +393,13 @@ This is a generalization of other functions named `toStartOf*`. For example, `toStartOfInterval(t, INTERVAL 1 day)` returns the same as `toStartOfDay(t)`, `toStartOfInterval(t, INTERVAL 15 minute)` returns the same as `toStartOfFifteenMinutes(t)` etc. +## toLastDayOfMonth {#toLastDayOfMonth} + +Rounds up a date or date with time to the last day of the month. +Returns the date. + +Alias: `LAST_DAY`. + ## toTime {#totime} Converts a date with time to a certain fixed date, while preserving the time. 
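The documentation entry added above describes the rounding behaviour; the DateLUT-based implementation in the following hunks realises it by stepping from the current LUT slot to slot - day_of_month + days_in_month. As a plain-calendar illustration of the same value, independent of ClickHouse's LUT (the helper names here are invented for the example), the last day of a month follows directly from the Gregorian leap-year rule:

#include <array>
#include <cstdint>

constexpr bool isLeapYear(int year)
{
    return (year % 4 == 0 && year % 100 != 0) || year % 400 == 0;
}

constexpr uint8_t daysInMonth(int year, int month) /// month in 1..12
{
    constexpr std::array<uint8_t, 12> days{31, 28, 31, 30, 31, 30, 31, 31, 30, 31, 30, 31};
    return (month == 2 && isLeapYear(year)) ? 29 : days[month - 1];
}

struct YearMonthDay { int year; int month; int day; };

/// Rounding a date up to the last day of its month keeps the year and month
/// and replaces the day-of-month, matching the documented behaviour above.
constexpr YearMonthDay toLastDayOfMonthCivil(YearMonthDay d)
{
    return {d.year, d.month, daysInMonth(d.year, d.month)};
}

static_assert(toLastDayOfMonthCivil({2019, 9, 16}).day == 30);
static_assert(toLastDayOfMonthCivil({2020, 2, 10}).day == 29);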
diff --git a/src/Common/DateLUTImpl.h b/src/Common/DateLUTImpl.h index c178dc58854..afee4fa8750 100644 --- a/src/Common/DateLUTImpl.h +++ b/src/Common/DateLUTImpl.h @@ -360,6 +360,27 @@ public: return toDayNum(LUTIndex(i - (lut[i].day_of_month - 1))); } + /// Round up to last day of month. + template + inline Time toLastDayOfMonth(DateOrTime v) const + { + const LUTIndex i = toLUTIndex(v); + if constexpr (std::is_unsigned_v || std::is_same_v) + return lut_saturated[i - lut[i].day_of_month + lut[i].days_in_month].date; + else + return lut[i - lut[i].day_of_month + lut[i].days_in_month].date; + } + + template + inline auto toLastDayNumOfMonth(DateOrTime v) const + { + const LUTIndex i = toLUTIndex(v); + if constexpr (std::is_unsigned_v || std::is_same_v) + return toDayNum(LUTIndexWithSaturation(i - lut[i].day_of_month + lut[i].days_in_month)); + else + return toDayNum(LUTIndex(i - lut[i].day_of_month + lut[i].days_in_month)); + } + /// Round down to start of quarter. template inline auto toFirstDayNumOfQuarter(DateOrTime v) const diff --git a/src/Common/tests/gtest_DateLUTImpl.cpp b/src/Common/tests/gtest_DateLUTImpl.cpp index 1220c50b409..c917f4951e2 100644 --- a/src/Common/tests/gtest_DateLUTImpl.cpp +++ b/src/Common/tests/gtest_DateLUTImpl.cpp @@ -142,6 +142,8 @@ TEST(DateLUTTest, TimeValuesInMiddleOfRange) EXPECT_EQ(lut.addYears(time, 10), 1884270011 /*time_t*/); EXPECT_EQ(lut.timeToString(time), "2019-09-16 19:20:11" /*std::string*/); EXPECT_EQ(lut.dateToString(time), "2019-09-16" /*std::string*/); + EXPECT_EQ(lut.toLastDayOfMonth(time), 1569790800 /*time_t*/); + EXPECT_EQ(lut.toLastDayNumOfMonth(time), DayNum(18169) /*DayNum*/); } @@ -202,6 +204,8 @@ TEST(DateLUTTest, TimeValuesAtLeftBoderOfRange) EXPECT_EQ(lut.addYears(time, 10), 315532800 /*time_t*/); EXPECT_EQ(lut.timeToString(time), "1970-01-01 00:00:00" /*std::string*/); EXPECT_EQ(lut.dateToString(time), "1970-01-01" /*std::string*/); + EXPECT_EQ(lut.toLastDayOfMonth(time), 2592000 /*time_t*/); + EXPECT_EQ(lut.toLastDayNumOfMonth(time), DayNum(30) /*DayNum*/); } TEST(DateLUTTest, TimeValuesAtRightBoderOfRangeOfOldLUT) @@ -264,6 +268,8 @@ TEST(DateLUTTest, TimeValuesAtRightBoderOfRangeOfOldLUT) EXPECT_EQ(lut.timeToString(time), "2106-01-31 01:17:53" /*std::string*/); EXPECT_EQ(lut.dateToString(time), "2106-01-31" /*std::string*/); + EXPECT_EQ(lut.toLastDayOfMonth(time), 4294339200 /*time_t*/); // 2016-01-01 + EXPECT_EQ(lut.toLastDayNumOfMonth(time), DayNum(49703)); } diff --git a/src/Functions/DateTimeTransforms.h b/src/Functions/DateTimeTransforms.h index a7f06689820..9581a2cca76 100644 --- a/src/Functions/DateTimeTransforms.h +++ b/src/Functions/DateTimeTransforms.h @@ -174,6 +174,30 @@ struct ToStartOfMonthImpl using FactorTransform = ZeroTransform; }; +struct ToLastDayOfMonthImpl +{ + static constexpr auto name = "toLastDayOfMonth"; + + static inline UInt16 execute(Int64 t, const DateLUTImpl & time_zone) + { + return time_zone.toLastDayNumOfMonth(time_zone.toDayNum(t)); + } + static inline UInt16 execute(UInt32 t, const DateLUTImpl & time_zone) + { + return time_zone.toLastDayNumOfMonth(time_zone.toDayNum(t)); + } + static inline UInt16 execute(Int32 d, const DateLUTImpl & time_zone) + { + return time_zone.toLastDayNumOfMonth(ExtendedDayNum(d)); + } + static inline UInt16 execute(UInt16 d, const DateLUTImpl & time_zone) + { + return time_zone.toLastDayNumOfMonth(DayNum(d)); + } + + using FactorTransform = ZeroTransform; +}; + struct ToStartOfQuarterImpl { static constexpr auto name = "toStartOfQuarter"; diff --git 
a/src/Functions/registerFunctionsDateTime.cpp b/src/Functions/registerFunctionsDateTime.cpp index 5211a62ff1e..1ee80a5fac9 100644 --- a/src/Functions/registerFunctionsDateTime.cpp +++ b/src/Functions/registerFunctionsDateTime.cpp @@ -20,6 +20,7 @@ void registerFunctionToISOYear(FunctionFactory &); void registerFunctionToCustomWeek(FunctionFactory &); void registerFunctionToModifiedJulianDay(FunctionFactory &); void registerFunctionToStartOfMonth(FunctionFactory &); +void registerFunctionToLastDayOfMonth(FunctionFactory &); void registerFunctionToStartOfQuarter(FunctionFactory &); void registerFunctionToStartOfYear(FunctionFactory &); void registerFunctionToStartOfMinute(FunctionFactory &); @@ -91,6 +92,7 @@ void registerFunctionsDateTime(FunctionFactory & factory) registerFunctionToCustomWeek(factory); registerFunctionToModifiedJulianDay(factory); registerFunctionToStartOfMonth(factory); + registerFunctionToLastDayOfMonth(factory); registerFunctionToStartOfQuarter(factory); registerFunctionToStartOfYear(factory); registerFunctionToStartOfSecond(factory); diff --git a/src/Functions/toLastDayOfMonth.cpp b/src/Functions/toLastDayOfMonth.cpp new file mode 100644 index 00000000000..49561a16e71 --- /dev/null +++ b/src/Functions/toLastDayOfMonth.cpp @@ -0,0 +1,21 @@ +#include +#include +#include + + +namespace DB +{ + +using FunctionToLastDayOfMonth = FunctionDateOrDateTimeToSomething; + +void registerFunctionToLastDayOfMonth(FunctionFactory & factory) +{ + factory.registerFunction(); + + /// MysQL compatibility alias. + factory.registerFunction("LAST_DAY", FunctionFactory::CaseInsensitive); +} + +} + + diff --git a/utils/db-generator/query_db_generator.cpp b/utils/db-generator/query_db_generator.cpp index 7d71e13a6e9..0020bc53753 100644 --- a/utils/db-generator/query_db_generator.cpp +++ b/utils/db-generator/query_db_generator.cpp @@ -229,7 +229,7 @@ std::map func_to_return_type = { {"torelativeweeknum", FuncRet(Type::i, "")}, {"torelativedaynum", FuncRet(Type::i, "")}, {"torelativehournum", FuncRet(Type::i, "")}, {"torelativeminutenum", FuncRet(Type::i, "")}, {"torelativesecondsnum", FuncRet(Type::i, "")}, {"datediff", FuncRet(Type::d | Type::dt, "")}, {"formatdatetime", FuncRet(Type::s, "")}, {"now", FuncRet(Type::dt | Type::d, "now()")}, {"today", FuncRet(Type::d | Type::dt, "today()")}, - {"yesterday", FuncRet(Type::d | Type::dt, "yesterday()")} + {"yesterday", FuncRet(Type::d | Type::dt, "yesterday()")}, {"tolastdayofmonth", FuncRet(Type::dt | Type::d, "")} }; std::set func_args_same_types = { @@ -253,7 +253,7 @@ std::map func_to_param_type = { {"tostartofinterval", Type::d | Type::dt}, {"totime", Type::d | Type::dt}, {"torelativehonthnum", Type::d | Type::dt}, {"torelativeweeknum", Type::d | Type::dt}, {"torelativedaynum", Type::d | Type::dt}, {"torelativehournum", Type::d | Type::dt}, {"torelativeminutenum", Type::d | Type::dt}, {"torelativesecondnum", Type::d | Type::dt}, {"datediff", Type::d | Type::dt}, - {"formatdatetime", Type::dt} + {"formatdatetime", Type::dt}, {"tolastdayofmonth", Type::d | Type::dt} }; From dec083ab4406f44bc5c189f74297052116049b4b Mon Sep 17 00:00:00 2001 From: taiyang-li <654010905@qq.com> Date: Tue, 18 Jan 2022 18:47:25 +0800 Subject: [PATCH 020/239] add minmax index for hivengine; remove libhdfspp dependency for hive engine --- src/Storages/Hive/HiveFile.cpp | 203 ++++++++++++++++++++++++++---- src/Storages/Hive/HiveFile.h | 28 ++--- src/Storages/Hive/StorageHive.cpp | 6 +- 3 files changed, 194 insertions(+), 43 deletions(-) diff --git 
a/src/Storages/Hive/HiveFile.cpp b/src/Storages/Hive/HiveFile.cpp index b0cfa9809e1..a74a5e36575 100644 --- a/src/Storages/Hive/HiveFile.cpp +++ b/src/Storages/Hive/HiveFile.cpp @@ -1,22 +1,37 @@ +<<<<<<< HEAD #include +======= + +#include +>>>>>>> d9558cbca4... add minmax index for hivengine; remove libhdfspp dependency for hive engine #if USE_HIVE #include +<<<<<<< HEAD #include #include +======= +#include +#include +#include +#include +>>>>>>> d9558cbca4... add minmax index for hivengine; remove libhdfspp dependency for hive engine #include +#include #include -#include -#include #include #include #include +#include #include #include #include +#include +#include +#include #include #include @@ -28,10 +43,22 @@ namespace ErrorCodes namespace DB { +namespace ErrorCodes +{ + extern const int BAD_ARGUMENTS; +} + +#define THROW_ARROW_NOT_OK(status) \ + do \ + { \ + if (::arrow::Status _s = (status); !_s.ok()) \ + throw Exception(_s.ToString(), ErrorCodes::BAD_ARGUMENTS); \ + } while (false) + + template Range createRangeFromOrcStatistics(const StatisticsType * stats) { - /// We must check if there are minimum or maximum values in statistics in case of /// null values or NaN/Inf values of double type. if (stats->hasMinimum() && stats->hasMaximum()) { @@ -117,65 +144,193 @@ Range HiveOrcFile::buildRange(const orc::ColumnStatistics * col_stats) void HiveOrcFile::prepareReader() { - // TODO To be implemented - throw Exception("Unimplemented HiveOrcFile::prepareReader", ErrorCodes::NOT_IMPLEMENTED); + in = std::make_unique(namenode_url, path, getContext()->getGlobalContext()->getConfigRef()); + auto format_settings = getFormatSettings(getContext()); + THROW_ARROW_NOT_OK(arrow::adapters::orc::ORCFileReader::Open(asArrowFile(*in, format_settings), arrow::default_memory_pool(), &reader)); } void HiveOrcFile::prepareColumnMapping() { - // TODO To be implemented - throw Exception("Unimplemented HiveOrcFile::prepareColumnMapping", ErrorCodes::NOT_IMPLEMENTED); + const orc::Type & type = reader->GetRawORCReader()->getType(); + size_t size = type.getSubtypeCount(); + for (size_t pos = 0; pos < size; pos++) + { + // hive中字符串不区分大小写。所以这里统一改成小写,方便匹配 + String column{type.getFieldName(pos)}; + boost::to_lower(column); + orc_column_positions[column] = pos; + } } bool HiveOrcFile::hasMinMaxIndex() const { - return false; + return !storage_settings->disable_orc_file_minmax_index; } -std::unique_ptr HiveOrcFile::buildMinMaxIndex(const orc::Statistics * /*statistics*/) +std::unique_ptr HiveOrcFile::buildMinMaxIndex(const orc::Statistics * statistics) { - // TODO To be implemented - throw Exception("Unimplemented HiveOrcFile::buildMinMaxIndex", ErrorCodes::NOT_IMPLEMENTED); + if (!statistics) + return nullptr; + + size_t size = index_names_and_types.size(); + auto idx = std::make_unique(); + idx->hyperrectangle.resize(size); + + size_t i = 0; + for (const auto & name_type : index_names_and_types) + { + String column{name_type.name}; + boost::to_lower(column); + auto it = orc_column_positions.find(column); + if (it == orc_column_positions.end()) + { + idx->hyperrectangle[i] = buildRange(nullptr); + // std::cerr << "statistics:nullptr" << std::endl; + } + else + { + size_t pos = it->second; + // 注意:column statistics从1开始. 
0有特殊用途 + const orc::ColumnStatistics * col_stats = statistics->getColumnStatistics(pos + 1); + idx->hyperrectangle[i] = buildRange(col_stats); + // std::cerr << "statistics:" << col_stats->toString(); + // std::cerr << "name:" << column << ", pos" << pos << ", range:" << idx->hyperrectangle[i].toString() << std::endl; + } + ++i; + } + idx->initialized = true; + return idx; } void HiveOrcFile::loadMinMaxIndex() { - // TODO To be implemented - throw Exception("Unimplemented HiveOrcFile::loadMinMaxIndex", ErrorCodes::NOT_IMPLEMENTED); + if (!reader) + { + prepareReader(); + prepareColumnMapping(); + } + + auto statistics = reader->GetRawORCReader()->getStatistics(); + minmax_idx = buildMinMaxIndex(statistics.get()); } bool HiveOrcFile::hasSubMinMaxIndex() const { - // TODO To be implemented - return false; + return !storage_settings->disable_orc_stripe_minmax_index; } void HiveOrcFile::loadSubMinMaxIndex() { - // TODO To be implemented - throw Exception("Unimplemented HiveOrcFile::loadSubMinMaxIndex", ErrorCodes::NOT_IMPLEMENTED); + if (!reader) + { + prepareReader(); + prepareColumnMapping(); + } + + auto * raw_reader = reader->GetRawORCReader(); + auto stripe_num = raw_reader->getNumberOfStripes(); + auto stripe_stats_num = raw_reader->getNumberOfStripeStatistics(); + if (stripe_num != stripe_stats_num) + throw Exception( + fmt::format("orc file:{} has different strip num {} and strip statistics num {}", path, stripe_num, stripe_stats_num), + ErrorCodes::BAD_ARGUMENTS); + + sub_minmax_idxes.resize(stripe_num); + for (size_t i = 0; i < stripe_num; ++i) + { + auto stripe_stats = raw_reader->getStripeStatistics(i); + sub_minmax_idxes[i] = buildMinMaxIndex(stripe_stats.get()); + } } bool HiveParquetFile::hasSubMinMaxIndex() const { - // TODO To be implemented - return false; + return !storage_settings->disable_parquet_rowgroup_minmax_index; } void HiveParquetFile::prepareReader() { - // TODO To be implemented - throw Exception("Unimplemented HiveParquetFile::prepareReader", ErrorCodes::NOT_IMPLEMENTED); + in = std::make_unique(namenode_url, path, getContext()->getGlobalContext()->getConfigRef()); + auto format_settings = getFormatSettings(getContext()); + THROW_ARROW_NOT_OK(parquet::arrow::OpenFile(asArrowFile(*in, format_settings), arrow::default_memory_pool(), &reader)); } - void HiveParquetFile::loadSubMinMaxIndex() { - // TODO To be implemented - throw Exception("Unimplemented HiveParquetFile::loadSubMinMaxIndex", ErrorCodes::NOT_IMPLEMENTED); + if (!reader) + prepareReader(); + + auto meta = reader->parquet_reader()->metadata(); + size_t num_cols = meta->num_columns(); + size_t num_row_groups = meta->num_row_groups(); + const auto * schema = meta->schema(); + for (size_t pos = 0; pos < num_cols; ++pos) + { + String column{schema->Column(pos)->name()}; + boost::to_lower(column); + parquet_column_positions[column] = pos; + } + + + sub_minmax_idxes.resize(num_row_groups); + for (size_t i = 0; i < num_row_groups; ++i) + { + auto row_group_meta = meta->RowGroup(i); + sub_minmax_idxes[i] = std::make_shared(); + sub_minmax_idxes[i]->hyperrectangle.resize(num_cols); + + size_t j = 0; + auto it = index_names_and_types.begin(); + for (; it != index_names_and_types.end(); ++j, ++it) + { + // 如果parquet file中不存在该字段,使用空Range + String name{it->name}; + boost::to_lower(name); + auto mit = parquet_column_positions.find(name); + if (mit == parquet_column_positions.end()) + continue; + + size_t pos = mit->second; + auto col_chunk = row_group_meta->ColumnChunk(pos); + if (!col_chunk->is_stats_set()) + 
continue; + + auto stats = col_chunk->statistics(); + if (stats->HasNullCount() && stats->null_count() > 0) + continue; + + if (auto bool_stats = std::dynamic_pointer_cast(stats)) + { + sub_minmax_idxes[i]->hyperrectangle[j] = createRangeFromParquetStatistics(bool_stats); + } + else if (auto int32_stats = std::dynamic_pointer_cast(stats)) + { + // Hive中没有unsigned interger, 这里不用考虑相关case + sub_minmax_idxes[i]->hyperrectangle[j] = createRangeFromParquetStatistics(int32_stats); + } + else if (auto int64_stats = std::dynamic_pointer_cast(stats)) + { + sub_minmax_idxes[i]->hyperrectangle[j] = createRangeFromParquetStatistics(int64_stats); + } + else if (auto float_stats = std::dynamic_pointer_cast(stats)) + { + sub_minmax_idxes[i]->hyperrectangle[j] = createRangeFromParquetStatistics(float_stats); + } + else if (auto double_stats = std::dynamic_pointer_cast(stats)) + { + sub_minmax_idxes[i]->hyperrectangle[j] = createRangeFromParquetStatistics(double_stats); + } + else if (auto string_stats = std::dynamic_pointer_cast(stats)) + { + sub_minmax_idxes[i]->hyperrectangle[j] = createRangeFromParquetStatistics(string_stats); + } + // 其他类型无法使用minmax index, 跳过 + } + sub_minmax_idxes[i]->initialized = true; + } } } diff --git a/src/Storages/Hive/HiveFile.h b/src/Storages/Hive/HiveFile.h index 63cca2562eb..7a3fcac312f 100644 --- a/src/Storages/Hive/HiveFile.h +++ b/src/Storages/Hive/HiveFile.h @@ -17,26 +17,18 @@ namespace orc { -class Reader; +class Statistics; +class ColumnStatistics; } -namespace parquet +namespace parquet::arrow { -class ParquetFileReader; -namespace arrow -{ - class FileReader; -} +class FileReader; } -namespace arrow +namespace arrow::adapters::orc { -namespace io -{ - class RandomAccessFile; -} - -class Buffer; +class ORCFileReader; } namespace DB @@ -46,6 +38,7 @@ namespace ErrorCodes extern const int NOT_IMPLEMENTED; } +class ReadBufferFromHDFS; class IHiveFile : public WithContext { public: @@ -230,7 +223,8 @@ protected: virtual void prepareReader(); virtual void prepareColumnMapping(); - std::shared_ptr reader; + std::unique_ptr in; + std::unique_ptr reader; std::map orc_column_positions; }; @@ -259,8 +253,8 @@ public: protected: virtual void prepareReader(); - std::shared_ptr fs; - std::shared_ptr reader; + std::unique_ptr in; + std::unique_ptr reader; std::map parquet_column_positions; }; } diff --git a/src/Storages/Hive/StorageHive.cpp b/src/Storages/Hive/StorageHive.cpp index 3040ad23283..ed9da822fb0 100644 --- a/src/Storages/Hive/StorageHive.cpp +++ b/src/Storages/Hive/StorageHive.cpp @@ -2,12 +2,14 @@ #if USE_HIVE -#include #include #include +#include #include - #include +#include +#include + #include #include #include From f19f0d847f1752ed1263253e898c04a51f17966e Mon Sep 17 00:00:00 2001 From: taiyang-li <654010905@qq.com> Date: Wed, 16 Feb 2022 12:23:06 +0800 Subject: [PATCH 021/239] fix code style --- src/Storages/Hive/HiveFile.cpp | 16 ---------------- 1 file changed, 16 deletions(-) diff --git a/src/Storages/Hive/HiveFile.cpp b/src/Storages/Hive/HiveFile.cpp index a74a5e36575..85f0a5dfee0 100644 --- a/src/Storages/Hive/HiveFile.cpp +++ b/src/Storages/Hive/HiveFile.cpp @@ -1,23 +1,12 @@ -<<<<<<< HEAD #include -======= - -#include ->>>>>>> d9558cbca4... add minmax index for hivengine; remove libhdfspp dependency for hive engine #if USE_HIVE #include -<<<<<<< HEAD -#include - -#include -======= #include #include #include #include ->>>>>>> d9558cbca4... 
add minmax index for hivengine; remove libhdfspp dependency for hive engine #include #include #include @@ -35,11 +24,6 @@ #include #include -namespace ErrorCodes -{ - extern const int NOT_IMPLEMENTED; -} - namespace DB { From afcb295273c14708cf12d6f0102f56e7bced0278 Mon Sep 17 00:00:00 2001 From: taiyang-li <654010905@qq.com> Date: Wed, 16 Feb 2022 14:51:56 +0800 Subject: [PATCH 022/239] fix compile error --- src/Storages/Hive/HiveFile.cpp | 25 +++++++++++-------------- 1 file changed, 11 insertions(+), 14 deletions(-) diff --git a/src/Storages/Hive/HiveFile.cpp b/src/Storages/Hive/HiveFile.cpp index 85f0a5dfee0..bbcb1d3e82e 100644 --- a/src/Storages/Hive/HiveFile.cpp +++ b/src/Storages/Hive/HiveFile.cpp @@ -43,7 +43,7 @@ namespace ErrorCodes template Range createRangeFromOrcStatistics(const StatisticsType * stats) { - /// null values or NaN/Inf values of double type. + /// Null values or NaN/Inf values of double type. if (stats->hasMinimum() && stats->hasMaximum()) { return Range(FieldType(stats->getMinimum()), true, FieldType(stats->getMaximum()), true); @@ -130,7 +130,8 @@ void HiveOrcFile::prepareReader() { in = std::make_unique(namenode_url, path, getContext()->getGlobalContext()->getConfigRef()); auto format_settings = getFormatSettings(getContext()); - THROW_ARROW_NOT_OK(arrow::adapters::orc::ORCFileReader::Open(asArrowFile(*in, format_settings), arrow::default_memory_pool(), &reader)); + std::atomic is_stopped{0}; + THROW_ARROW_NOT_OK(arrow::adapters::orc::ORCFileReader::Open(asArrowFile(*in, format_settings, is_stopped), arrow::default_memory_pool(), &reader)); } void HiveOrcFile::prepareColumnMapping() @@ -139,7 +140,7 @@ void HiveOrcFile::prepareColumnMapping() size_t size = type.getSubtypeCount(); for (size_t pos = 0; pos < size; pos++) { - // hive中字符串不区分大小写。所以这里统一改成小写,方便匹配 + /// Column names in hive is case-insensitive. String column{type.getFieldName(pos)}; boost::to_lower(column); orc_column_positions[column] = pos; @@ -148,7 +149,7 @@ void HiveOrcFile::prepareColumnMapping() bool HiveOrcFile::hasMinMaxIndex() const { - return !storage_settings->disable_orc_file_minmax_index; + return !storage_settings->enable_orc_file_minmax_index; } @@ -170,16 +171,13 @@ std::unique_ptr HiveOrcFile::buildMinMaxIndex(c if (it == orc_column_positions.end()) { idx->hyperrectangle[i] = buildRange(nullptr); - // std::cerr << "statistics:nullptr" << std::endl; } else { size_t pos = it->second; - // 注意:column statistics从1开始. 0有特殊用途 + /// Attention: column statistics start from 1. 0 has special purpose. 
const orc::ColumnStatistics * col_stats = statistics->getColumnStatistics(pos + 1); idx->hyperrectangle[i] = buildRange(col_stats); - // std::cerr << "statistics:" << col_stats->toString(); - // std::cerr << "name:" << column << ", pos" << pos << ", range:" << idx->hyperrectangle[i].toString() << std::endl; } ++i; } @@ -202,7 +200,7 @@ void HiveOrcFile::loadMinMaxIndex() bool HiveOrcFile::hasSubMinMaxIndex() const { - return !storage_settings->disable_orc_stripe_minmax_index; + return !storage_settings->enable_orc_stripe_minmax_index; } @@ -232,14 +230,15 @@ void HiveOrcFile::loadSubMinMaxIndex() bool HiveParquetFile::hasSubMinMaxIndex() const { - return !storage_settings->disable_parquet_rowgroup_minmax_index; + return !storage_settings->enable_parquet_rowgroup_minmax_index; } void HiveParquetFile::prepareReader() { in = std::make_unique(namenode_url, path, getContext()->getGlobalContext()->getConfigRef()); auto format_settings = getFormatSettings(getContext()); - THROW_ARROW_NOT_OK(parquet::arrow::OpenFile(asArrowFile(*in, format_settings), arrow::default_memory_pool(), &reader)); + std::atomic is_stopped{0}; + THROW_ARROW_NOT_OK(parquet::arrow::OpenFile(asArrowFile(*in, format_settings, is_stopped), arrow::default_memory_pool(), &reader)); } void HiveParquetFile::loadSubMinMaxIndex() @@ -270,7 +269,6 @@ void HiveParquetFile::loadSubMinMaxIndex() auto it = index_names_and_types.begin(); for (; it != index_names_and_types.end(); ++j, ++it) { - // 如果parquet file中不存在该字段,使用空Range String name{it->name}; boost::to_lower(name); auto mit = parquet_column_positions.find(name); @@ -292,7 +290,6 @@ void HiveParquetFile::loadSubMinMaxIndex() } else if (auto int32_stats = std::dynamic_pointer_cast(stats)) { - // Hive中没有unsigned interger, 这里不用考虑相关case sub_minmax_idxes[i]->hyperrectangle[j] = createRangeFromParquetStatistics(int32_stats); } else if (auto int64_stats = std::dynamic_pointer_cast(stats)) @@ -311,7 +308,7 @@ void HiveParquetFile::loadSubMinMaxIndex() { sub_minmax_idxes[i]->hyperrectangle[j] = createRangeFromParquetStatistics(string_stats); } - // 其他类型无法使用minmax index, 跳过 + /// Other types are not supported for minmax index, skip } sub_minmax_idxes[i]->initialized = true; } From a4baec6d267f3aa5283f26d4034914cdfdfc2b3b Mon Sep 17 00:00:00 2001 From: taiyang-li <654010905@qq.com> Date: Wed, 16 Feb 2022 15:12:43 +0800 Subject: [PATCH 023/239] fix building --- src/Storages/Hive/HiveFile.cpp | 7 ++++--- src/Storages/Hive/HiveFile.h | 2 -- 2 files changed, 4 insertions(+), 5 deletions(-) diff --git a/src/Storages/Hive/HiveFile.cpp b/src/Storages/Hive/HiveFile.cpp index bbcb1d3e82e..896c464c80f 100644 --- a/src/Storages/Hive/HiveFile.cpp +++ b/src/Storages/Hive/HiveFile.cpp @@ -3,8 +3,6 @@ #if USE_HIVE #include -#include -#include #include #include #include @@ -13,6 +11,7 @@ #include #include #include +#include #include #include @@ -131,7 +130,9 @@ void HiveOrcFile::prepareReader() in = std::make_unique(namenode_url, path, getContext()->getGlobalContext()->getConfigRef()); auto format_settings = getFormatSettings(getContext()); std::atomic is_stopped{0}; - THROW_ARROW_NOT_OK(arrow::adapters::orc::ORCFileReader::Open(asArrowFile(*in, format_settings, is_stopped), arrow::default_memory_pool(), &reader)); + auto result = arrow::adapters::orc::ORCFileReader::Open(asArrowFile(*in, format_settings, is_stopped), arrow::default_memory_pool()); + THROW_ARROW_NOT_OK(result.status()); + reader = std::move(result).ValueOrDie(); } void HiveOrcFile::prepareColumnMapping() diff --git 
a/src/Storages/Hive/HiveFile.h b/src/Storages/Hive/HiveFile.h index 7a3fcac312f..dfecd79f932 100644 --- a/src/Storages/Hive/HiveFile.h +++ b/src/Storages/Hive/HiveFile.h @@ -7,8 +7,6 @@ #include #include -#include -#include #include #include From 40dd7da309f32771a4ae08323500707c61f697e2 Mon Sep 17 00:00:00 2001 From: alexey-milovidov Date: Thu, 10 Mar 2022 22:29:56 +0300 Subject: [PATCH 024/239] Update gtest_DateLUTImpl.cpp --- src/Common/tests/gtest_DateLUTImpl.cpp | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/src/Common/tests/gtest_DateLUTImpl.cpp b/src/Common/tests/gtest_DateLUTImpl.cpp index c917f4951e2..2b1ccf99f59 100644 --- a/src/Common/tests/gtest_DateLUTImpl.cpp +++ b/src/Common/tests/gtest_DateLUTImpl.cpp @@ -225,7 +225,7 @@ TEST(DateLUTTest, TimeValuesAtRightBoderOfRangeOfOldLUT) EXPECT_EQ(lut.toFirstDayOfWeek(time), 4293820800 /*time_t*/); EXPECT_EQ(lut.toFirstDayNumOfWeek(time), DayNum(49697)); - EXPECT_EQ(lut.toFirstDayOfMonth(time), 4291747200 /*time_t*/); // 2016-01-01 + EXPECT_EQ(lut.toFirstDayOfMonth(time), 4291747200 /*time_t*/); // 2106-01-01 EXPECT_EQ(lut.toFirstDayNumOfMonth(time), DayNum(49673)); EXPECT_EQ(lut.toFirstDayNumOfQuarter(time), DayNum(49673) /*DayNum*/); EXPECT_EQ(lut.toFirstDayOfQuarter(time), 4291747200 /*time_t*/); @@ -268,7 +268,7 @@ TEST(DateLUTTest, TimeValuesAtRightBoderOfRangeOfOldLUT) EXPECT_EQ(lut.timeToString(time), "2106-01-31 01:17:53" /*std::string*/); EXPECT_EQ(lut.dateToString(time), "2106-01-31" /*std::string*/); - EXPECT_EQ(lut.toLastDayOfMonth(time), 4294339200 /*time_t*/); // 2016-01-01 + EXPECT_EQ(lut.toLastDayOfMonth(time), 4294339200 /*time_t*/); // 2106-01-01 EXPECT_EQ(lut.toLastDayNumOfMonth(time), DayNum(49703)); } From 008760b4362ec67accdcdb08ddb754dbc0fc9641 Mon Sep 17 00:00:00 2001 From: alexey-milovidov Date: Thu, 10 Mar 2022 22:30:46 +0300 Subject: [PATCH 025/239] Update toLastDayOfMonth.cpp --- src/Functions/toLastDayOfMonth.cpp | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/Functions/toLastDayOfMonth.cpp b/src/Functions/toLastDayOfMonth.cpp index 49561a16e71..7a15ede4e96 100644 --- a/src/Functions/toLastDayOfMonth.cpp +++ b/src/Functions/toLastDayOfMonth.cpp @@ -12,7 +12,7 @@ void registerFunctionToLastDayOfMonth(FunctionFactory & factory) { factory.registerFunction(); - /// MysQL compatibility alias. + /// MySQL compatibility alias. factory.registerFunction("LAST_DAY", FunctionFactory::CaseInsensitive); } From 93550d547e387802b3720e6f725b47b7b461e971 Mon Sep 17 00:00:00 2001 From: taiyang-li <654010905@qq.com> Date: Tue, 15 Mar 2022 18:25:36 +0800 Subject: [PATCH 026/239] add doc and tests --- .../functions/other-functions.md | 36 +++++++++++++++++++ .../registerFunctionsMiscellaneous.cpp | 2 ++ .../02240_enumerate_streams.reference | 5 +++ .../0_stateless/02240_enumerate_streams.sql | 5 +++ 4 files changed, 48 insertions(+) create mode 100644 tests/queries/0_stateless/02240_enumerate_streams.reference create mode 100644 tests/queries/0_stateless/02240_enumerate_streams.sql diff --git a/docs/en/sql-reference/functions/other-functions.md b/docs/en/sql-reference/functions/other-functions.md index bce3f9144b1..2f6c3ec9cf3 100644 --- a/docs/en/sql-reference/functions/other-functions.md +++ b/docs/en/sql-reference/functions/other-functions.md @@ -2499,3 +2499,39 @@ Result: │ 286 │ └──────────────────────────┘ ``` + +## enumerateStreams {#enumerateStreams} + +return the enumerated stream paths of data type. 
+ +**Syntax** + +``` sql +enumerateStreams(type_name) +``` + +**Arguments** +- `type_name` - Name of data type to enumerate its stream paths. [String](../../sql-reference/data-types/string.md#string). + +**Returned value** +- List of enumerated stream paths. + +Type: [Array](../../sql-reference/data-types/array.md)([String](../../sql-reference/data-types/string.md)). + + + +**Example** + +Query: + +``` sql +SELECT enumerateStreams('Array(Array(Int8))') +``` + +Result: + +``` text +┌─enumerateStreams('Array(Array(Int8))')───────────────────────────────────────────────────┐ +│ ['{ArraySizes}','{ArrayElements, ArraySizes}','{ArrayElements, ArrayElements, Regular}'] │ +└──────────────────────────────────────────────────────────────────────────────────────────┘ +``` diff --git a/src/Functions/registerFunctionsMiscellaneous.cpp b/src/Functions/registerFunctionsMiscellaneous.cpp index 76d61ce509a..2ed7e3e0238 100644 --- a/src/Functions/registerFunctionsMiscellaneous.cpp +++ b/src/Functions/registerFunctionsMiscellaneous.cpp @@ -80,6 +80,7 @@ void registerFunctionInitialQueryID(FunctionFactory & factory); void registerFunctionServerUUID(FunctionFactory &); void registerFunctionZooKeeperSessionUptime(FunctionFactory &); void registerFunctionGetOSKernelVersion(FunctionFactory &); +void registerFunctionEnumerateStreams(FunctionFactory &); #if USE_ICU void registerFunctionConvertCharset(FunctionFactory &); @@ -166,6 +167,7 @@ void registerFunctionsMiscellaneous(FunctionFactory & factory) registerFunctionServerUUID(factory); registerFunctionZooKeeperSessionUptime(factory); registerFunctionGetOSKernelVersion(factory); + registerFunctionEnumerateStreams(factory); #if USE_ICU registerFunctionConvertCharset(factory); diff --git a/tests/queries/0_stateless/02240_enumerate_streams.reference b/tests/queries/0_stateless/02240_enumerate_streams.reference new file mode 100644 index 00000000000..f17375788d6 --- /dev/null +++ b/tests/queries/0_stateless/02240_enumerate_streams.reference @@ -0,0 +1,5 @@ +['{ArraySizes}','{ArrayElements, Regular}'] +['{ArraySizes}','{ArrayElements, TupleElement(keys, escape_tuple_delimiter = true), Regular}','{ArrayElements, TupleElement(values, escape_tuple_delimiter = true), Regular}'] +['{TupleElement(1, escape_tuple_delimiter = true), Regular}','{TupleElement(2, escape_tuple_delimiter = true), Regular}','{TupleElement(3, escape_tuple_delimiter = true), Regular}'] +['{DictionaryKeys, Regular}','{DictionaryIndexes}'] +['{NullMap}','{NullableElements, Regular}'] diff --git a/tests/queries/0_stateless/02240_enumerate_streams.sql b/tests/queries/0_stateless/02240_enumerate_streams.sql new file mode 100644 index 00000000000..9514feaf55f --- /dev/null +++ b/tests/queries/0_stateless/02240_enumerate_streams.sql @@ -0,0 +1,5 @@ +select enumerateStreams('Array(Int8)'); +select enumerateStreams('Map(String, Int64)'); +select enumerateStreams('Tuple(String, Int64, Float64)'); +select enumerateStreams('LowCardinality(String)'); +select enumerateStreams('Nullable(String)'); From 0867a0ce178f6c7258418d4643f6738877207c87 Mon Sep 17 00:00:00 2001 From: taiyang-li <654010905@qq.com> Date: Thu, 17 Mar 2022 10:59:13 +0800 Subject: [PATCH 027/239] fix code style --- src/Functions/registerFunctionsMiscellaneous.cpp | 10 +++++----- 1 file changed, 5 insertions(+), 5 deletions(-) diff --git a/src/Functions/registerFunctionsMiscellaneous.cpp b/src/Functions/registerFunctionsMiscellaneous.cpp index 2ed7e3e0238..f70c52b7621 100644 --- a/src/Functions/registerFunctionsMiscellaneous.cpp +++ 
b/src/Functions/registerFunctionsMiscellaneous.cpp @@ -71,12 +71,12 @@ void registerFunctionErrorCodeToName(FunctionFactory &); void registerFunctionTcpPort(FunctionFactory &); void registerFunctionGetServerPort(FunctionFactory &); void registerFunctionByteSize(FunctionFactory &); -void registerFunctionFile(FunctionFactory & factory); -void registerFunctionConnectionId(FunctionFactory & factory); -void registerFunctionPartitionId(FunctionFactory & factory); +void registerFunctionFile(FunctionFactory &); +void registerFunctionConnectionId(FunctionFactory &); +void registerFunctionPartitionId(FunctionFactory &); void registerFunctionIsIPAddressContainedIn(FunctionFactory &); -void registerFunctionQueryID(FunctionFactory & factory); -void registerFunctionInitialQueryID(FunctionFactory & factory); +void registerFunctionQueryID(FunctionFactory &); +void registerFunctionInitialQueryID(FunctionFactory &); void registerFunctionServerUUID(FunctionFactory &); void registerFunctionZooKeeperSessionUptime(FunctionFactory &); void registerFunctionGetOSKernelVersion(FunctionFactory &); From c04ba090bfd9b80bd3184c3c25de958d2af07c9f Mon Sep 17 00:00:00 2001 From: taiyang-li <654010905@qq.com> Date: Thu, 17 Mar 2022 15:35:35 +0800 Subject: [PATCH 028/239] add missing sourcefile and fix code style --- src/Access/LDAPAccessStorage.cpp | 4 +- src/Functions/enumerateStreams.cpp | 80 ++++++++++++++++++++++++++++++ 2 files changed, 83 insertions(+), 1 deletion(-) create mode 100644 src/Functions/enumerateStreams.cpp diff --git a/src/Access/LDAPAccessStorage.cpp b/src/Access/LDAPAccessStorage.cpp index dd1c50343f2..f05b240d287 100644 --- a/src/Access/LDAPAccessStorage.cpp +++ b/src/Access/LDAPAccessStorage.cpp @@ -481,7 +481,9 @@ std::optional LDAPAccessStorage::authenticateImpl( const Credentials & credentials, const Poco::Net::IPAddress & address, const ExternalAuthenticators & external_authenticators, - bool throw_if_user_not_exists,bool allow_no_password __attribute__((unused)), bool allow_plaintext_password __attribute__((unused))) const + bool throw_if_user_not_exists, + bool allow_no_password __attribute__((unused)), + bool allow_plaintext_password __attribute__((unused))) const { std::scoped_lock lock(mutex); auto id = memory_storage.find(credentials.getUserName()); diff --git a/src/Functions/enumerateStreams.cpp b/src/Functions/enumerateStreams.cpp new file mode 100644 index 00000000000..8ba703cc741 --- /dev/null +++ b/src/Functions/enumerateStreams.cpp @@ -0,0 +1,80 @@ +#include +#include +#include +#include +#include +#include +#include +#include +#include + +namespace DB +{ +namespace ErrorCodes +{ + extern const int ILLEGAL_TYPE_OF_ARGUMENT; + extern const int ILLEGAL_COLUMN; +} + +namespace +{ + +/// Enumerate stream paths of data type. 
+class FunctionEnumerateStreams : public IFunction +{ +public: + static constexpr auto name = "enumerateStreams"; + static FunctionPtr create(ContextPtr) + { + return std::make_shared(); + } + + String getName() const override + { + return name; + } + + bool isSuitableForShortCircuitArgumentsExecution(const DataTypesWithConstInfo & /*arguments*/) const override { return false; } + + size_t getNumberOfArguments() const override + { + return 1; + } + + DataTypePtr getReturnTypeImpl(const DataTypes & arguments) const override + { + if (!isString(arguments[0])) + throw Exception("The argument of function " + getName() + " must have String type", ErrorCodes::ILLEGAL_TYPE_OF_ARGUMENT); + return std::make_shared(std::make_shared()); + } + + ColumnPtr executeImpl(const ColumnsWithTypeAndName & arguments, const DataTypePtr &, size_t input_rows_count) const override + { + const IColumn * arg_column = arguments[0].column.get(); + const ColumnString * arg_string = checkAndGetColumnConstData(arg_column); + + if (!arg_string) + throw Exception("The argument of function " + getName() + " must be constant String", ErrorCodes::ILLEGAL_COLUMN); + + DataTypePtr type = DataTypeFactory::instance().get(arg_string->getDataAt(0).toString()); + SerializationPtr serialization = type->getDefaultSerialization(); + auto col_res = ColumnArray::create(ColumnString::create()); + ColumnString & col_res_strings = typeid_cast(col_res->getData()); + ColumnVectorHelper::Offsets & col_res_offsets = typeid_cast(col_res->getOffsets()); + serialization->enumerateStreams([&](const ISerialization::SubstreamPath & substream_path) + { + col_res_strings.insert(substream_path.toString()); + }); + col_res_offsets.push_back(col_res_strings.size()); + return ColumnConst::create(std::move(col_res), input_rows_count); + } +}; + +} + +void registerFunctionEnumerateStreams(FunctionFactory & factory) +{ + factory.registerFunction(); +} + +} From d1369165084dabdbdfe72378df8c84c4066235dd Mon Sep 17 00:00:00 2001 From: kssenii Date: Thu, 17 Mar 2022 10:49:14 +0100 Subject: [PATCH 029/239] Remove testmode option --- src/Client/ClientBase.cpp | 22 +++---------------- src/Client/TestHint.cpp | 5 +---- src/Client/TestHint.h | 8 +++---- tests/clickhouse-test | 2 +- ...825_protobuf_format_no_length_delimiter.sh | 2 +- ..._block_size_rows_for_materialized_views.sh | 4 ++-- .../01280_ssd_complex_key_dictionary.sh | 4 ++-- ...006_client_test_hint_no_such_error_name.sh | 2 +- .../02234_clickhouse_local_test_mode.sh | 3 +-- ..._parallel_processing_on_replicas_part_1.sh | 4 ++-- 10 files changed, 18 insertions(+), 38 deletions(-) diff --git a/src/Client/ClientBase.cpp b/src/Client/ClientBase.cpp index c575cd37a5f..be63b96f654 100644 --- a/src/Client/ClientBase.cpp +++ b/src/Client/ClientBase.cpp @@ -1487,24 +1487,12 @@ MultiQueryProcessingStage ClientBase::analyzeMultiQueryText( bool ClientBase::executeMultiQuery(const String & all_queries_text) { - // It makes sense not to base any control flow on this, so that it is - // the same in tests and in normal usage. The only difference is that in - // normal mode we ignore the test hints. - const bool test_mode = config().has("testmode"); - if (test_mode) - { - /// disable logs if expects errors - TestHint test_hint(test_mode, all_queries_text); - if (test_hint.clientError() || test_hint.serverError()) - processTextAsSingleQuery("SET send_logs_level = 'fatal'"); - } - bool echo_query = echo_queries; /// Test tags are started with "--" so they are interpreted as comments anyway. 
/// But if the echo is enabled we have to remove the test tags from `all_queries_text` /// because we don't want test tags to be echoed. - size_t test_tags_length = test_mode ? getTestTagsLength(all_queries_text) : 0; + size_t test_tags_length = getTestTagsLength(all_queries_text); /// Several queries separated by ';'. /// INSERT data is ended by the end of line, not ';'. @@ -1541,7 +1529,7 @@ bool ClientBase::executeMultiQuery(const String & all_queries_text) // Try to find test hint for syntax error. We don't know where // the query ends because we failed to parse it, so we consume // the entire line. - TestHint hint(test_mode, String(this_query_begin, this_query_end - this_query_begin)); + TestHint hint(String(this_query_begin, this_query_end - this_query_begin)); if (hint.serverError()) { // Syntax errors are considered as client errors @@ -1579,7 +1567,7 @@ bool ClientBase::executeMultiQuery(const String & all_queries_text) // Look for the hint in the text of query + insert data + trailing // comments, e.g. insert into t format CSV 'a' -- { serverError 123 }. // Use the updated query boundaries we just calculated. - TestHint test_hint(test_mode, full_query); + TestHint test_hint(full_query); // Echo all queries if asked; makes for a more readable reference file. echo_query = test_hint.echoQueries().value_or(echo_query); @@ -2182,8 +2170,6 @@ void ClientBase::init(int argc, char ** argv) ("suggestion_limit", po::value()->default_value(10000), "Suggestion limit for how many databases, tables and columns to fetch.") - ("testmode,T", "enable test hints in comments") - ("format,f", po::value(), "default output format") ("vertical,E", "vertical output format, same as --format=Vertical or FORMAT Vertical or \\G at end of command") ("highlight", po::value()->default_value(true), "enable or disable basic syntax highlight in interactive command line") @@ -2289,8 +2275,6 @@ void ClientBase::init(int argc, char ** argv) config().setBool("interactive", true); if (options.count("pager")) config().setString("pager", options["pager"].as()); - if (options.count("testmode")) - config().setBool("testmode", true); if (options.count("log-level")) Poco::Logger::root().setLevel(options["log-level"].as()); diff --git a/src/Client/TestHint.cpp b/src/Client/TestHint.cpp index 2f3be2a5350..f6d1e5d73c3 100644 --- a/src/Client/TestHint.cpp +++ b/src/Client/TestHint.cpp @@ -32,12 +32,9 @@ int parseErrorCode(DB::ReadBufferFromString & in) namespace DB { -TestHint::TestHint(bool enabled_, const String & query_) +TestHint::TestHint(const String & query_) : query(query_) { - if (!enabled_) - return; - // Don't parse error hints in leading comments, because it feels weird. // Leading 'echo' hint is OK. bool is_leading_hint = true; diff --git a/src/Client/TestHint.h b/src/Client/TestHint.h index 377637d0db8..7fa4e86c025 100644 --- a/src/Client/TestHint.h +++ b/src/Client/TestHint.h @@ -7,7 +7,7 @@ namespace DB { -/// Checks expected server and client error codes in --testmode. +/// Checks expected server and client error codes. /// /// The following comment hints are supported: /// @@ -25,12 +25,12 @@ namespace DB /// /// Examples: /// -/// - echo 'select / -- { clientError 62 }' | clickhouse-client --testmode -nm +/// - echo 'select / -- { clientError 62 }' | clickhouse-client -nm /// // Here the client parses the query but it is incorrect, so it expects /// SYNTAX_ERROR (62). 
/// -/// - echo 'select foo -- { serverError 47 }' | clickhouse-client --testmode -nm +/// - echo 'select foo -- { serverError 47 }' | clickhouse-client -nm /// /// But here the query is correct, but there is no such column "foo", so it /// is UNKNOWN_IDENTIFIER server error. @@ -43,7 +43,7 @@ namespace DB class TestHint { public: - TestHint(bool enabled_, const String & query_); + TestHint(const String & query_); int serverError() const { return server_error; } int clientError() const { return client_error; } diff --git a/tests/clickhouse-test b/tests/clickhouse-test index 121a283d0e4..9c2d599e9cd 100755 --- a/tests/clickhouse-test +++ b/tests/clickhouse-test @@ -657,7 +657,7 @@ class TestCase: pattern = '{test} > {stdout} 2> {stderr}' if self.ext == '.sql': - pattern = "{client} --send_logs_level={logs_level} --testmode --multiquery {options} < " + pattern + pattern = "{client} --send_logs_level={logs_level} --multiquery {options} < " + pattern command = pattern.format(**params) diff --git a/tests/queries/0_stateless/00825_protobuf_format_no_length_delimiter.sh b/tests/queries/0_stateless/00825_protobuf_format_no_length_delimiter.sh index a16345c4bb1..a1bbdc318d5 100755 --- a/tests/queries/0_stateless/00825_protobuf_format_no_length_delimiter.sh +++ b/tests/queries/0_stateless/00825_protobuf_format_no_length_delimiter.sh @@ -43,7 +43,7 @@ $CLICKHOUSE_CLIENT --query "SELECT * FROM roundtrip_no_length_delimiter_protobuf rm "$BINARY_FILE_PATH" # The ProtobufSingle format can't be used to write multiple rows because this format doesn't have any row delimiter. -$CLICKHOUSE_CLIENT --multiquery --testmode > /dev/null < /dev/null < /dev/null 2>&1 # fails echo "Should throw 1" -execute_insert --testmode +execute_insert echo "Should throw 2" -execute_insert --testmode --min_insert_block_size_rows=1 --min_insert_block_size_rows_for_materialized_views=$((1<<20)) +execute_insert --min_insert_block_size_rows=1 --min_insert_block_size_rows_for_materialized_views=$((1<<20)) # passes echo "Should pass 1" diff --git a/tests/queries/0_stateless/01280_ssd_complex_key_dictionary.sh b/tests/queries/0_stateless/01280_ssd_complex_key_dictionary.sh index d5cae099f36..0de8b3a1a25 100755 --- a/tests/queries/0_stateless/01280_ssd_complex_key_dictionary.sh +++ b/tests/queries/0_stateless/01280_ssd_complex_key_dictionary.sh @@ -41,7 +41,7 @@ $CLICKHOUSE_CLIENT -n --query=" LIFETIME(MIN 1000 MAX 2000) LAYOUT(COMPLEX_KEY_SSD_CACHE(FILE_SIZE 8192 PATH '$USER_FILES_PATH/0d'));" -$CLICKHOUSE_CLIENT --testmode -nq "SELECT dictHas('01280_db.ssd_dict', 'a', tuple('1')); -- { serverError 43 }" +$CLICKHOUSE_CLIENT -nq "SELECT dictHas('01280_db.ssd_dict', 'a', tuple('1')); -- { serverError 43 }" $CLICKHOUSE_CLIENT -n --query=" SELECT 'TEST_SMALL'; @@ -65,7 +65,7 @@ $CLICKHOUSE_CLIENT -n --query=" SELECT dictGetInt32('01280_db.ssd_dict', 'b', tuple('10', toInt32(-20))); SELECT dictGetString('01280_db.ssd_dict', 'c', tuple('10', toInt32(-20)));" -$CLICKHOUSE_CLIENT --testmode -nq "SELECT dictGetUInt64('01280_db.ssd_dict', 'a', tuple(toInt32(3))); -- { serverError 53 }" +$CLICKHOUSE_CLIENT -nq "SELECT dictGetUInt64('01280_db.ssd_dict', 'a', tuple(toInt32(3))); -- { serverError 53 }" $CLICKHOUSE_CLIENT -n --query="DROP DICTIONARY 01280_db.ssd_dict; DROP TABLE IF EXISTS 01280_db.keys_table; diff --git a/tests/queries/0_stateless/02006_client_test_hint_no_such_error_name.sh b/tests/queries/0_stateless/02006_client_test_hint_no_such_error_name.sh index b846136ae58..972ff3ba73f 100755 --- 
a/tests/queries/0_stateless/02006_client_test_hint_no_such_error_name.sh +++ b/tests/queries/0_stateless/02006_client_test_hint_no_such_error_name.sh @@ -5,4 +5,4 @@ CUR_DIR=$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd) # shellcheck source=../shell_config.sh . "$CUR_DIR"/../shell_config.sh -$CLICKHOUSE_CLIENT --testmode -n -q 'select 1 -- { clientError FOOBAR }' |& grep -o 'No error code with name:.*' +$CLICKHOUSE_CLIENT -n -q 'select 1 -- { clientError FOOBAR }' |& grep -o 'No error code with name:.*' diff --git a/tests/queries/0_stateless/02234_clickhouse_local_test_mode.sh b/tests/queries/0_stateless/02234_clickhouse_local_test_mode.sh index 6abe1e30334..f736751726d 100755 --- a/tests/queries/0_stateless/02234_clickhouse_local_test_mode.sh +++ b/tests/queries/0_stateless/02234_clickhouse_local_test_mode.sh @@ -6,5 +6,4 @@ CURDIR=$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd) $CLICKHOUSE_LOCAL --query="SELECT n" 2>&1 | grep -q "Code: 47. DB::Exception: Missing columns:" && echo 'OK' || echo 'FAIL' ||: -$CLICKHOUSE_LOCAL --testmode --query="SELECT n -- { serverError 47 }" - +$CLICKHOUSE_LOCAL --query="SELECT n -- { serverError 47 }" diff --git a/tests/queries/1_stateful/00168_parallel_processing_on_replicas_part_1.sh b/tests/queries/1_stateful/00168_parallel_processing_on_replicas_part_1.sh index 276fc0274c2..58ce66056af 100755 --- a/tests/queries/1_stateful/00168_parallel_processing_on_replicas_part_1.sh +++ b/tests/queries/1_stateful/00168_parallel_processing_on_replicas_part_1.sh @@ -68,8 +68,8 @@ do TESTNAME_RESULT="/tmp/result_$TESTNAME" NEW_TESTNAME_RESULT="/tmp/result_dist_$TESTNAME" - $CLICKHOUSE_CLIENT $SETTINGS -nm --testmode < $TESTPATH > $TESTNAME_RESULT - $CLICKHOUSE_CLIENT $SETTINGS -nm --testmode < $NEW_TESTNAME > $NEW_TESTNAME_RESULT + $CLICKHOUSE_CLIENT $SETTINGS -nm < $TESTPATH > $TESTNAME_RESULT + $CLICKHOUSE_CLIENT $SETTINGS -nm < $NEW_TESTNAME > $NEW_TESTNAME_RESULT expected=$(cat $TESTNAME_RESULT | md5sum) actual=$(cat $NEW_TESTNAME_RESULT | md5sum) From c2d19350807450250eadfa3b182b12f62bdc1e66 Mon Sep 17 00:00:00 2001 From: Anton Kozlov Date: Fri, 18 Mar 2022 15:56:25 +0000 Subject: [PATCH 030/239] Do not build krb5 if ENABLE_LIBRARIES is not set This module has hard dependency on SSL. If ENABLE_LIBRARIES is off then SSL is disabled. With this change, building this module will not break. 
--- contrib/krb5-cmake/CMakeLists.txt | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/contrib/krb5-cmake/CMakeLists.txt b/contrib/krb5-cmake/CMakeLists.txt index 685e8737ef0..0d6075ee99e 100644 --- a/contrib/krb5-cmake/CMakeLists.txt +++ b/contrib/krb5-cmake/CMakeLists.txt @@ -1,4 +1,4 @@ -set (ENABLE_KRB5_DEFAULT 1) +set (ENABLE_KRB5_DEFAULT ${ENABLE_LIBRARIES}) if (NOT CMAKE_SYSTEM_NAME MATCHES "Linux" AND NOT (CMAKE_SYSTEM_NAME MATCHES "Darwin" AND NOT CMAKE_CROSSCOMPILING)) message (WARNING "krb5 disabled in non-Linux and non-native-Darwin environments") set (ENABLE_KRB5_DEFAULT 0) From 8e856a00fdd10de120b502d6a7f683fd75407853 Mon Sep 17 00:00:00 2001 From: taiyang-li <654010905@qq.com> Date: Mon, 21 Mar 2022 12:49:05 +0800 Subject: [PATCH 031/239] enable column as argument --- src/Functions/enumerateStreams.cpp | 30 +++++++++++++------ .../02240_enumerate_streams.reference | 3 ++ .../0_stateless/02240_enumerate_streams.sql | 3 ++ 3 files changed, 27 insertions(+), 9 deletions(-) diff --git a/src/Functions/enumerateStreams.cpp b/src/Functions/enumerateStreams.cpp index 8ba703cc741..ce554368519 100644 --- a/src/Functions/enumerateStreams.cpp +++ b/src/Functions/enumerateStreams.cpp @@ -41,22 +41,15 @@ public: return 1; } - DataTypePtr getReturnTypeImpl(const DataTypes & arguments) const override + DataTypePtr getReturnTypeImpl(const DataTypes &) const override { - if (!isString(arguments[0])) - throw Exception("The argument of function " + getName() + " must have String type", ErrorCodes::ILLEGAL_TYPE_OF_ARGUMENT); return std::make_shared(std::make_shared()); } ColumnPtr executeImpl(const ColumnsWithTypeAndName & arguments, const DataTypePtr &, size_t input_rows_count) const override { - const IColumn * arg_column = arguments[0].column.get(); - const ColumnString * arg_string = checkAndGetColumnConstData(arg_column); + auto type = getType(arguments[0]); - if (!arg_string) - throw Exception("The argument of function " + getName() + " must be constant String", ErrorCodes::ILLEGAL_COLUMN); - - DataTypePtr type = DataTypeFactory::instance().get(arg_string->getDataAt(0).toString()); SerializationPtr serialization = type->getDefaultSerialization(); auto col_res = ColumnArray::create(ColumnString::create()); ColumnString & col_res_strings = typeid_cast(col_res->getData()); @@ -68,6 +61,25 @@ public: col_res_offsets.push_back(col_res_strings.size()); return ColumnConst::create(std::move(col_res), input_rows_count); } + +private: + static DataTypePtr getType(const ColumnWithTypeAndName & argument) + { + const IColumn * arg_column = argument.column.get(); + const ColumnString * arg_string = checkAndGetColumnConstData(arg_column); + if (!arg_string) + return argument.type; + + try + { + DataTypePtr type = DataTypeFactory::instance().get(arg_string->getDataAt(0).toString()); + return type; + } + catch (const DB::Exception &) + { + return argument.type; + } + } }; } diff --git a/tests/queries/0_stateless/02240_enumerate_streams.reference b/tests/queries/0_stateless/02240_enumerate_streams.reference index f17375788d6..3537720214f 100644 --- a/tests/queries/0_stateless/02240_enumerate_streams.reference +++ b/tests/queries/0_stateless/02240_enumerate_streams.reference @@ -3,3 +3,6 @@ ['{TupleElement(1, escape_tuple_delimiter = true), Regular}','{TupleElement(2, escape_tuple_delimiter = true), Regular}','{TupleElement(3, escape_tuple_delimiter = true), Regular}'] ['{DictionaryKeys, Regular}','{DictionaryIndexes}'] ['{NullMap}','{NullableElements, Regular}'] 
+['{ArraySizes}','{ArrayElements, Regular}'] +['{ArraySizes}','{ArrayElements, TupleElement(keys, escape_tuple_delimiter = true), Regular}','{ArrayElements, TupleElement(values, escape_tuple_delimiter = true), Regular}'] +['{TupleElement(1, escape_tuple_delimiter = true), Regular}','{TupleElement(2, escape_tuple_delimiter = true), Regular}','{TupleElement(3, escape_tuple_delimiter = true), Regular}','{TupleElement(4, escape_tuple_delimiter = true), Regular}'] diff --git a/tests/queries/0_stateless/02240_enumerate_streams.sql b/tests/queries/0_stateless/02240_enumerate_streams.sql index 9514feaf55f..e33a13098a0 100644 --- a/tests/queries/0_stateless/02240_enumerate_streams.sql +++ b/tests/queries/0_stateless/02240_enumerate_streams.sql @@ -3,3 +3,6 @@ select enumerateStreams('Map(String, Int64)'); select enumerateStreams('Tuple(String, Int64, Float64)'); select enumerateStreams('LowCardinality(String)'); select enumerateStreams('Nullable(String)'); +select enumerateStreams([1,2,3]); +select enumerateStreams(map('a', 1, 'b', 2)); +select enumerateStreams(tuple('a', 1, 'b', 2)); From b52a066903664b2a5ed043442a792fc19092ca42 Mon Sep 17 00:00:00 2001 From: taiyang-li <654010905@qq.com> Date: Mon, 21 Mar 2022 14:19:54 +0800 Subject: [PATCH 032/239] fix style --- src/Functions/enumerateStreams.cpp | 7 +------ 1 file changed, 1 insertion(+), 6 deletions(-) diff --git a/src/Functions/enumerateStreams.cpp b/src/Functions/enumerateStreams.cpp index ce554368519..fb38f4f0b91 100644 --- a/src/Functions/enumerateStreams.cpp +++ b/src/Functions/enumerateStreams.cpp @@ -10,11 +10,6 @@ namespace DB { -namespace ErrorCodes -{ - extern const int ILLEGAL_TYPE_OF_ARGUMENT; - extern const int ILLEGAL_COLUMN; -} namespace { @@ -63,7 +58,7 @@ public: } private: - static DataTypePtr getType(const ColumnWithTypeAndName & argument) + static DataTypePtr getType(const ColumnWithTypeAndName & argument) { const IColumn * arg_column = argument.column.get(); const ColumnString * arg_string = checkAndGetColumnConstData(arg_column); From 98a9b81c1d866cfad519ed2c3c87b9121a75a734 Mon Sep 17 00:00:00 2001 From: taiyang-li <654010905@qq.com> Date: Mon, 21 Mar 2022 14:23:25 +0800 Subject: [PATCH 033/239] update doc --- docs/en/sql-reference/functions/other-functions.md | 3 +++ 1 file changed, 3 insertions(+) diff --git a/docs/en/sql-reference/functions/other-functions.md b/docs/en/sql-reference/functions/other-functions.md index 2f6c3ec9cf3..7337aed3bd0 100644 --- a/docs/en/sql-reference/functions/other-functions.md +++ b/docs/en/sql-reference/functions/other-functions.md @@ -2508,10 +2508,13 @@ return the enumerated stream paths of data type. ``` sql enumerateStreams(type_name) + +enumerateStreams(arg) ``` **Arguments** - `type_name` - Name of data type to enumerate its stream paths. [String](../../sql-reference/data-types/string.md#string). +- `arg` - any column which has a data type **Returned value** - List of enumerated stream paths. 
From bf05b949401a493aefc02e061730e941a7d9d3c0 Mon Sep 17 00:00:00 2001 From: taiyang-li <654010905@qq.com> Date: Mon, 21 Mar 2022 15:03:28 +0800 Subject: [PATCH 034/239] fix build --- src/Storages/Hive/HiveFile.cpp | 3 --- src/Storages/Hive/HiveFile.h | 14 +++----------- src/TableFunctions/CMakeLists.txt | 5 ++--- 3 files changed, 5 insertions(+), 17 deletions(-) diff --git a/src/Storages/Hive/HiveFile.cpp b/src/Storages/Hive/HiveFile.cpp index 896c464c80f..e18eef58947 100644 --- a/src/Storages/Hive/HiveFile.cpp +++ b/src/Storages/Hive/HiveFile.cpp @@ -3,12 +3,10 @@ #if USE_HIVE #include -#include #include #include #include #include -#include #include #include #include @@ -19,7 +17,6 @@ #include #include #include -#include #include #include diff --git a/src/Storages/Hive/HiveFile.h b/src/Storages/Hive/HiveFile.h index dfecd79f932..7d38efe6acb 100644 --- a/src/Storages/Hive/HiveFile.h +++ b/src/Storages/Hive/HiveFile.h @@ -7,11 +7,14 @@ #include #include +#include +#include #include #include #include #include +#include namespace orc { @@ -19,16 +22,6 @@ class Statistics; class ColumnStatistics; } -namespace parquet::arrow -{ -class FileReader; -} - -namespace arrow::adapters::orc -{ -class ORCFileReader; -} - namespace DB { namespace ErrorCodes @@ -36,7 +29,6 @@ namespace ErrorCodes extern const int NOT_IMPLEMENTED; } -class ReadBufferFromHDFS; class IHiveFile : public WithContext { public: diff --git a/src/TableFunctions/CMakeLists.txt b/src/TableFunctions/CMakeLists.txt index c9948a4b131..1d6eb9eb02e 100644 --- a/src/TableFunctions/CMakeLists.txt +++ b/src/TableFunctions/CMakeLists.txt @@ -8,10 +8,9 @@ list(REMOVE_ITEM clickhouse_table_functions_sources ITableFunction.cpp TableFunc list(REMOVE_ITEM clickhouse_table_functions_headers ITableFunction.h TableFunctionFactory.h) add_library(clickhouse_table_functions ${clickhouse_table_functions_sources}) +target_link_libraries(clickhouse_table_functions PRIVATE clickhouse_parsers clickhouse_storages_system dbms) if (TARGET ch_contrib::hivemetastore) - target_link_libraries(clickhouse_table_functions PRIVATE clickhouse_parsers clickhouse_storages_system dbms ch_contrib::hivemetastore ch_contrib::hdfs) -else () - target_link_libraries(clickhouse_table_functions PRIVATE clickhouse_parsers clickhouse_storages_system dbms) + target_link_libraries(clickhouse_table_functions PRIVATE ch_contrib::hivemetastore ch_contrib::hdfs ch_contrib::parquet) endif () From fe67d86869a67027e4ea286d15540a0bdb4b39a0 Mon Sep 17 00:00:00 2001 From: taiyang-li <654010905@qq.com> Date: Mon, 21 Mar 2022 20:08:56 +0800 Subject: [PATCH 035/239] update contrib/arrow --- contrib/arrow | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/contrib/arrow b/contrib/arrow index ce6b7af516c..efdcd015cfd 160000 --- a/contrib/arrow +++ b/contrib/arrow @@ -1 +1 @@ -Subproject commit ce6b7af516cff9b106e0f7b1c30628f18e7a6169 +Subproject commit efdcd015cfdee1b6aa349c9ca227ca12c3d697f5 From 68d5b538aa37d7dbdb002672fe459f38345015ab Mon Sep 17 00:00:00 2001 From: taiyang-li <654010905@qq.com> Date: Wed, 23 Mar 2022 11:15:42 +0800 Subject: [PATCH 036/239] fix build error --- src/Storages/Hive/HiveFile.cpp | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/Storages/Hive/HiveFile.cpp b/src/Storages/Hive/HiveFile.cpp index e18eef58947..2862e654ddc 100644 --- a/src/Storages/Hive/HiveFile.cpp +++ b/src/Storages/Hive/HiveFile.cpp @@ -31,7 +31,7 @@ namespace ErrorCodes #define THROW_ARROW_NOT_OK(status) \ do \ { \ - if (::arrow::Status _s = (status); !_s.ok()) \ + if 
(const ::arrow::Status & _s = (status); !_s.ok()) \ throw Exception(_s.ToString(), ErrorCodes::BAD_ARGUMENTS); \ } while (false) From 1d8ab36de06d43680455ec80905d58b76256d234 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?=E6=9D=8E=E6=89=AC?= <654010905@qq.com> Date: Wed, 23 Mar 2022 21:44:14 +0800 Subject: [PATCH 037/239] Update other-functions.md --- docs/en/sql-reference/functions/other-functions.md | 1 - 1 file changed, 1 deletion(-) diff --git a/docs/en/sql-reference/functions/other-functions.md b/docs/en/sql-reference/functions/other-functions.md index 7337aed3bd0..ea8d9ca3ed7 100644 --- a/docs/en/sql-reference/functions/other-functions.md +++ b/docs/en/sql-reference/functions/other-functions.md @@ -2505,7 +2505,6 @@ Result: return the enumerated stream paths of data type. **Syntax** - ``` sql enumerateStreams(type_name) From 9cc528b01f50e11e0095fe763b02d31659f306dd Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?=E6=9D=8E=E6=89=AC?= <654010905@qq.com> Date: Wed, 23 Mar 2022 21:57:58 +0800 Subject: [PATCH 038/239] Update HiveFile.h --- src/Storages/Hive/HiveFile.h | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/src/Storages/Hive/HiveFile.h b/src/Storages/Hive/HiveFile.h index 7d38efe6acb..aef9d72755a 100644 --- a/src/Storages/Hive/HiveFile.h +++ b/src/Storages/Hive/HiveFile.h @@ -156,7 +156,8 @@ protected: NamesAndTypesList index_names_and_types; MinMaxIndexPtr minmax_idx; std::vector sub_minmax_idxes; - std::set skip_splits; // skip splits for this file after applying minmax index (if any) + /// Skip splits for this file after applying minmax index (if any) + std::set skip_splits; std::shared_ptr storage_settings; }; From 097ff9cc98b985201f2af1dcd262b5300a9079ee Mon Sep 17 00:00:00 2001 From: Kerry Clendinning Date: Thu, 24 Mar 2022 08:35:22 -0500 Subject: [PATCH 039/239] Update index.md Fixed spelling "retuned" -> "returned" --- docs/en/sql-reference/functions/index.md | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/docs/en/sql-reference/functions/index.md b/docs/en/sql-reference/functions/index.md index 7cceec889bd..572aa7f632e 100644 --- a/docs/en/sql-reference/functions/index.md +++ b/docs/en/sql-reference/functions/index.md @@ -77,7 +77,7 @@ A function configuration contains the following settings: - `argument` - argument description with the `type`, and optional `name` of an argument. Each argument is described in a separate setting. Specifying name is necessary if argument names are part of serialization for user defined function format like [Native](../../interfaces/formats.md#native) or [JSONEachRow](../../interfaces/formats.md#jsoneachrow). Default argument name value is `c` + argument_number. - `format` - a [format](../../interfaces/formats.md) in which arguments are passed to the command. - `return_type` - the type of a returned value. -- `return_name` - name of retuned value. Specifying return name is necessary if return name is part of serialization for user defined function format like [Native](../../interfaces/formats.md#native) or [JSONEachRow](../../interfaces/formats.md#jsoneachrow). Optional. Default value is `result`. +- `return_name` - name of returned value. Specifying return name is necessary if return name is part of serialization for user defined function format like [Native](../../interfaces/formats.md#native) or [JSONEachRow](../../interfaces/formats.md#jsoneachrow). Optional. Default value is `result`. - `type` - an executable type. If `type` is set to `executable` then single command is started. 
If it is set to `executable_pool` then a pool of commands is created. - `max_command_execution_time` - maximum execution time in seconds for processing block of data. This setting is valid for `executable_pool` commands only. Optional. Default value is `10`. - `command_termination_timeout` - time in seconds during which a command should finish after its pipe is closed. After that time `SIGTERM` is sent to the process executing the command. Optional. Default value is `10`. From c9acc550087d60d2316c083bc57e92573c845b04 Mon Sep 17 00:00:00 2001 From: taiyang-li <654010905@qq.com> Date: Fri, 25 Mar 2022 14:51:59 +0800 Subject: [PATCH 040/239] rename function name --- src/Functions/enumerateStreams.cpp | 10 +++++----- .../0_stateless/02240_enumerate_streams.sql | 16 ++++++++-------- 2 files changed, 13 insertions(+), 13 deletions(-) diff --git a/src/Functions/enumerateStreams.cpp b/src/Functions/enumerateStreams.cpp index fb38f4f0b91..3306c91dd0a 100644 --- a/src/Functions/enumerateStreams.cpp +++ b/src/Functions/enumerateStreams.cpp @@ -15,13 +15,13 @@ namespace { /// Enumerate stream paths of data type. -class FunctionEnumerateStreams : public IFunction +class FunctionGetTypeSerializationStreams : public IFunction { public: - static constexpr auto name = "enumerateStreams"; + static constexpr auto name = "getTypeSerializationStreams"; static FunctionPtr create(ContextPtr) { - return std::make_shared(); + return std::make_shared(); } String getName() const override @@ -79,9 +79,9 @@ private: } -void registerFunctionEnumerateStreams(FunctionFactory & factory) +void registerFunctionGetTypeSerializationStreams(FunctionFactory & factory) { - factory.registerFunction(); + factory.registerFunction(); } } diff --git a/tests/queries/0_stateless/02240_enumerate_streams.sql b/tests/queries/0_stateless/02240_enumerate_streams.sql index e33a13098a0..72a66269e22 100644 --- a/tests/queries/0_stateless/02240_enumerate_streams.sql +++ b/tests/queries/0_stateless/02240_enumerate_streams.sql @@ -1,8 +1,8 @@ -select enumerateStreams('Array(Int8)'); -select enumerateStreams('Map(String, Int64)'); -select enumerateStreams('Tuple(String, Int64, Float64)'); -select enumerateStreams('LowCardinality(String)'); -select enumerateStreams('Nullable(String)'); -select enumerateStreams([1,2,3]); -select enumerateStreams(map('a', 1, 'b', 2)); -select enumerateStreams(tuple('a', 1, 'b', 2)); +select getTypeSerializationStreams('Array(Int8)'); +select getTypeSerializationStreams('Map(String, Int64)'); +select getTypeSerializationStreams('Tuple(String, Int64, Float64)'); +select getTypeSerializationStreams('LowCardinality(String)'); +select getTypeSerializationStreams('Nullable(String)'); +select getTypeSerializationStreams([1,2,3]); +select getTypeSerializationStreams(map('a', 1, 'b', 2)); +select getTypeSerializationStreams(tuple('a', 1, 'b', 2)); From dd0873189d931ad63765f20da842c0de117a04de Mon Sep 17 00:00:00 2001 From: taiyang-li <654010905@qq.com> Date: Fri, 25 Mar 2022 14:54:01 +0800 Subject: [PATCH 041/239] rename all files --- .../{enumerateStreams.cpp => getTypeSerializationStreams.cpp} | 0 ...s.reference => 02240_get_type_serialization_streams.reference} | 0 ...erate_streams.sql => 02240_get_type_serialization_streams.sql} | 0 3 files changed, 0 insertions(+), 0 deletions(-) rename src/Functions/{enumerateStreams.cpp => getTypeSerializationStreams.cpp} (100%) rename tests/queries/0_stateless/{02240_enumerate_streams.reference => 02240_get_type_serialization_streams.reference} (100%) rename 
tests/queries/0_stateless/{02240_enumerate_streams.sql => 02240_get_type_serialization_streams.sql} (100%) diff --git a/src/Functions/enumerateStreams.cpp b/src/Functions/getTypeSerializationStreams.cpp similarity index 100% rename from src/Functions/enumerateStreams.cpp rename to src/Functions/getTypeSerializationStreams.cpp diff --git a/tests/queries/0_stateless/02240_enumerate_streams.reference b/tests/queries/0_stateless/02240_get_type_serialization_streams.reference similarity index 100% rename from tests/queries/0_stateless/02240_enumerate_streams.reference rename to tests/queries/0_stateless/02240_get_type_serialization_streams.reference diff --git a/tests/queries/0_stateless/02240_enumerate_streams.sql b/tests/queries/0_stateless/02240_get_type_serialization_streams.sql similarity index 100% rename from tests/queries/0_stateless/02240_enumerate_streams.sql rename to tests/queries/0_stateless/02240_get_type_serialization_streams.sql From 18ab49e788f3ed8a0c20634e31ba652129c0d24e Mon Sep 17 00:00:00 2001 From: Kruglov Pavel <48961922+Avogar@users.noreply.github.com> Date: Fri, 25 Mar 2022 11:59:50 +0100 Subject: [PATCH 042/239] Check all logs for crashes, logical errors, etc in backward compatibility check --- docker/test/stress/run.sh | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/docker/test/stress/run.sh b/docker/test/stress/run.sh index 3cef5b008db..e56afcbfd7a 100755 --- a/docker/test/stress/run.sh +++ b/docker/test/stress/run.sh @@ -348,13 +348,13 @@ then rm -f /test_output/tmp # OOM - zgrep -Fa " Application: Child process was terminated by signal 9" /var/log/clickhouse-server/clickhouse-server.log > /dev/null \ + zgrep -Fa " Application: Child process was terminated by signal 9" /var/log/clickhouse-server/clickhouse-server.log* > /dev/null \ && echo -e 'Backward compatibility check: OOM killer (or signal 9) in clickhouse-server.log\tFAIL' >> /test_output/test_results.tsv \ || echo -e 'Backward compatibility check: No OOM messages in clickhouse-server.log\tOK' >> /test_output/test_results.tsv # Logical errors echo "Check for Logical errors in server log:" - zgrep -Fa -A20 "Code: 49, e.displayText() = DB::Exception:" /var/log/clickhouse-server/clickhouse-server.log > /test_output/bc_check_logical_errors.txt \ + zgrep -Fa -A20 "Code: 49, e.displayText() = DB::Exception:" /var/log/clickhouse-server/clickhouse-server.log* > /test_output/bc_check_logical_errors.txt \ && echo -e 'Backward compatibility check: Logical error thrown (see clickhouse-server.log or bc_check_logical_errors.txt)\tFAIL' >> /test_output/test_results.tsv \ || echo -e 'Backward compatibility check: No logical errors\tOK' >> /test_output/test_results.tsv @@ -362,13 +362,13 @@ then [ -s /test_output/bc_check_logical_errors.txt ] || rm /test_output/bc_check_logical_errors.txt # Crash - zgrep -Fa "########################################" /var/log/clickhouse-server/clickhouse-server.log > /dev/null \ + zgrep -Fa "########################################" /var/log/clickhouse-server/clickhouse-server.log* > /dev/null \ && echo -e 'Backward compatibility check: Killed by signal (in clickhouse-server.log)\tFAIL' >> /test_output/test_results.tsv \ || echo -e 'Backward compatibility check: Not crashed\tOK' >> /test_output/test_results.tsv # It also checks for crash without stacktrace (printed by watchdog) echo "Check for Fatal message in server log:" - zgrep -Fa " " /var/log/clickhouse-server/clickhouse-server.log > /test_output/bc_check_fatal_messages.txt \ + zgrep -Fa " " 
/var/log/clickhouse-server/clickhouse-server.log* > /test_output/bc_check_fatal_messages.txt \ && echo -e 'Backward compatibility check: Fatal message in clickhouse-server.log (see bc_check_fatal_messages.txt)\tFAIL' >> /test_output/test_results.tsv \ || echo -e 'Backward compatibility check: No fatal messages in clickhouse-server.log\tOK' >> /test_output/test_results.tsv From 902b402d200822b4fa85c82a790d5e9d0027968f Mon Sep 17 00:00:00 2001 From: Nikifor Seriakov Date: Fri, 25 Mar 2022 15:37:15 +0300 Subject: [PATCH 043/239] fix link in documentation --- docs/ru/sql-reference/table-functions/postgresql.md | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/docs/ru/sql-reference/table-functions/postgresql.md b/docs/ru/sql-reference/table-functions/postgresql.md index a8ae7cfb80b..e61ca69d78c 100644 --- a/docs/ru/sql-reference/table-functions/postgresql.md +++ b/docs/ru/sql-reference/table-functions/postgresql.md @@ -126,7 +126,7 @@ CREATE TABLE pg_table_schema_with_dots (a UInt32) **См. также** -- [Движок таблиц PostgreSQL](../../sql-reference/table-functions/postgresql.md) +- [Движок таблиц PostgreSQL](../../engines/table-engines/integrations/postgresql.md) - [Использование PostgreSQL как источника данных для внешнего словаря](../../sql-reference/dictionaries/external-dictionaries/external-dicts-dict-sources.md#dicts-external_dicts_dict_sources-postgresql) [Оригинальная статья](https://clickhouse.com/docs/ru/sql-reference/table-functions/postgresql/) From 7889059f7d8bab561a002a6954f60236691ac924 Mon Sep 17 00:00:00 2001 From: taiyang-li <654010905@qq.com> Date: Fri, 25 Mar 2022 22:39:57 +0800 Subject: [PATCH 044/239] fix building --- .../functions/other-functions.md | 22 +++++++++---------- .../registerFunctionsMiscellaneous.cpp | 4 ++-- 2 files changed, 13 insertions(+), 13 deletions(-) diff --git a/docs/en/sql-reference/functions/other-functions.md b/docs/en/sql-reference/functions/other-functions.md index ea8d9ca3ed7..cedde8a7f35 100644 --- a/docs/en/sql-reference/functions/other-functions.md +++ b/docs/en/sql-reference/functions/other-functions.md @@ -2500,23 +2500,23 @@ Result: └──────────────────────────┘ ``` -## enumerateStreams {#enumerateStreams} +## getTypeSerializationStreams {#getTypeSerializationStreams} -return the enumerated stream paths of data type. +return the serialization streams of data type. **Syntax** ``` sql -enumerateStreams(type_name) +getTypeSerializationStreams(type_name) -enumerateStreams(arg) +getTypeSerializationStreams(column) ``` **Arguments** -- `type_name` - Name of data type to enumerate its stream paths. [String](../../sql-reference/data-types/string.md#string). -- `arg` - any column which has a data type +- `type_name` - Name of data type to get its serialization paths. [String](../../sql-reference/data-types/string.md#string). +- `column` - any column which has a data type **Returned value** -- List of enumerated stream paths. +- List of serialization streams; Type: [Array](../../sql-reference/data-types/array.md)([String](../../sql-reference/data-types/string.md)). 
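Besides a type name passed as a string, the documentation above also lists a plain column as an accepted argument. A minimal usage sketch, mirroring the queries in the renamed regression test `02240_get_type_serialization_streams.sql` (results are omitted here, as they depend on the inferred type of the column):

``` sql
-- Column form of the function, as exercised by the renamed regression test;
-- each call returns the list of serialization streams of the argument's type.
SELECT getTypeSerializationStreams([1, 2, 3]);
SELECT getTypeSerializationStreams(map('a', 1, 'b', 2));
```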
@@ -2527,13 +2527,13 @@ Type: [Array](../../sql-reference/data-types/array.md)([String](../../sql-refere Query: ``` sql -SELECT enumerateStreams('Array(Array(Int8))') +SELECT getTypeSerializationStreams('Array(Array(Int8))') ``` Result: ``` text -┌─enumerateStreams('Array(Array(Int8))')───────────────────────────────────────────────────┐ -│ ['{ArraySizes}','{ArrayElements, ArraySizes}','{ArrayElements, ArrayElements, Regular}'] │ -└──────────────────────────────────────────────────────────────────────────────────────────┘ +┌───────────────────────getTypeSerializationStreams('Array(Array(Int8))')─────────────────────────────┐ +│ ['{ArraySizes}','{ArrayElements, ArraySizes}','{ArrayElements, ArrayElements, Regular}'] │ +└─────────────────────────────────────────────────────────────────────────────────────────────────────┘ ``` diff --git a/src/Functions/registerFunctionsMiscellaneous.cpp b/src/Functions/registerFunctionsMiscellaneous.cpp index f70c52b7621..56a0458d826 100644 --- a/src/Functions/registerFunctionsMiscellaneous.cpp +++ b/src/Functions/registerFunctionsMiscellaneous.cpp @@ -80,7 +80,7 @@ void registerFunctionInitialQueryID(FunctionFactory &); void registerFunctionServerUUID(FunctionFactory &); void registerFunctionZooKeeperSessionUptime(FunctionFactory &); void registerFunctionGetOSKernelVersion(FunctionFactory &); -void registerFunctionEnumerateStreams(FunctionFactory &); +void registerFunctionGetTypeSerializationStreams(FunctionFactory &); #if USE_ICU void registerFunctionConvertCharset(FunctionFactory &); @@ -167,7 +167,7 @@ void registerFunctionsMiscellaneous(FunctionFactory & factory) registerFunctionServerUUID(factory); registerFunctionZooKeeperSessionUptime(factory); registerFunctionGetOSKernelVersion(factory); - registerFunctionEnumerateStreams(factory); + registerFunctionGetTypeSerializationStreams(factory); #if USE_ICU registerFunctionConvertCharset(factory); From eee89491500e75a741ba3407cd2df8d583fe156e Mon Sep 17 00:00:00 2001 From: taiyang-li <654010905@qq.com> Date: Fri, 25 Mar 2022 23:33:46 +0800 Subject: [PATCH 045/239] fix code --- src/Storages/Hive/HiveFile.cpp | 6 +++--- src/Storages/Hive/StorageHive.cpp | 2 -- src/TableFunctions/Hive/TableFunctionHive.cpp | 1 + src/TableFunctions/Hive/TableFunctionHive.h | 1 + 4 files changed, 5 insertions(+), 5 deletions(-) diff --git a/src/Storages/Hive/HiveFile.cpp b/src/Storages/Hive/HiveFile.cpp index 2862e654ddc..d1765da6b28 100644 --- a/src/Storages/Hive/HiveFile.cpp +++ b/src/Storages/Hive/HiveFile.cpp @@ -147,7 +147,7 @@ void HiveOrcFile::prepareColumnMapping() bool HiveOrcFile::hasMinMaxIndex() const { - return !storage_settings->enable_orc_file_minmax_index; + return storage_settings->enable_orc_file_minmax_index; } @@ -198,7 +198,7 @@ void HiveOrcFile::loadMinMaxIndex() bool HiveOrcFile::hasSubMinMaxIndex() const { - return !storage_settings->enable_orc_stripe_minmax_index; + return storage_settings->enable_orc_stripe_minmax_index; } @@ -228,7 +228,7 @@ void HiveOrcFile::loadSubMinMaxIndex() bool HiveParquetFile::hasSubMinMaxIndex() const { - return !storage_settings->enable_parquet_rowgroup_minmax_index; + return storage_settings->enable_parquet_rowgroup_minmax_index; } void HiveParquetFile::prepareReader() diff --git a/src/Storages/Hive/StorageHive.cpp b/src/Storages/Hive/StorageHive.cpp index 7507f60b9b3..4296df3d7b1 100644 --- a/src/Storages/Hive/StorageHive.cpp +++ b/src/Storages/Hive/StorageHive.cpp @@ -7,8 +7,6 @@ #include #include #include -#include -#include #include #include diff --git 
a/src/TableFunctions/Hive/TableFunctionHive.cpp b/src/TableFunctions/Hive/TableFunctionHive.cpp index e7de55181c3..d29d65c2b8b 100644 --- a/src/TableFunctions/Hive/TableFunctionHive.cpp +++ b/src/TableFunctions/Hive/TableFunctionHive.cpp @@ -1,4 +1,5 @@ #include + #if USE_HIVE #include #include diff --git a/src/TableFunctions/Hive/TableFunctionHive.h b/src/TableFunctions/Hive/TableFunctionHive.h index 0973bdda329..2a8e47a90da 100644 --- a/src/TableFunctions/Hive/TableFunctionHive.h +++ b/src/TableFunctions/Hive/TableFunctionHive.h @@ -1,5 +1,6 @@ #pragma once #include + #if USE_HIVE #include #include From 3bfd911ce2af3166efb7533a3a7fc33d577c70fd Mon Sep 17 00:00:00 2001 From: Denny Crane Date: Sat, 26 Mar 2022 16:11:45 -0300 Subject: [PATCH 046/239] test for crash _join_with_nullable_lowcardinality --- ...th_nullable_lowcardinality_crash.reference | 2 ++ ...oin_with_nullable_lowcardinality_crash.sql | 20 +++++++++++++++++++ 2 files changed, 22 insertions(+) create mode 100644 tests/queries/0_stateless/02245_join_with_nullable_lowcardinality_crash.reference create mode 100644 tests/queries/0_stateless/02245_join_with_nullable_lowcardinality_crash.sql diff --git a/tests/queries/0_stateless/02245_join_with_nullable_lowcardinality_crash.reference b/tests/queries/0_stateless/02245_join_with_nullable_lowcardinality_crash.reference new file mode 100644 index 00000000000..12c61d9c54e --- /dev/null +++ b/tests/queries/0_stateless/02245_join_with_nullable_lowcardinality_crash.reference @@ -0,0 +1,2 @@ +usa + diff --git a/tests/queries/0_stateless/02245_join_with_nullable_lowcardinality_crash.sql b/tests/queries/0_stateless/02245_join_with_nullable_lowcardinality_crash.sql new file mode 100644 index 00000000000..abc2ee41402 --- /dev/null +++ b/tests/queries/0_stateless/02245_join_with_nullable_lowcardinality_crash.sql @@ -0,0 +1,20 @@ +drop table if exists with_nullable; +drop table if exists without_nullable; + +CREATE TABLE with_nullable +( timestamp UInt32, + country LowCardinality(Nullable(String)) ) ENGINE = Memory; + +CREATE TABLE without_nullable +( timestamp UInt32, + country LowCardinality(String)) ENGINE = Memory; + +insert into with_nullable values(0,'f'),(0,'usa'); +insert into without_nullable values(0,'usa'),(0,'us2a'); + +select if(t0.country is null ,t2.country,t0.country) "country" +from without_nullable t0 right outer join with_nullable t2 on t0.country=t2.country; + +drop table with_nullable; +drop table without_nullable; + From a216bc26c1906ec12cc49757f4c0e47ebdba2314 Mon Sep 17 00:00:00 2001 From: vdimir Date: Mon, 28 Mar 2022 13:29:34 +0000 Subject: [PATCH 047/239] Correct check asof join key nullability --- src/Interpreters/TableJoin.cpp | 17 +++++++++-------- .../0_stateless/01428_nullable_asof_join.sql | 5 +++++ 2 files changed, 14 insertions(+), 8 deletions(-) diff --git a/src/Interpreters/TableJoin.cpp b/src/Interpreters/TableJoin.cpp index 7b7ccb689c3..ec5358cf6bc 100644 --- a/src/Interpreters/TableJoin.cpp +++ b/src/Interpreters/TableJoin.cpp @@ -512,14 +512,6 @@ TableJoin::createConvertingActions(const ColumnsWithTypeAndName & left_sample_co template void TableJoin::inferJoinKeyCommonType(const LeftNamesAndTypes & left, const RightNamesAndTypes & right, bool allow_right) { - if (strictness() == ASTTableJoin::Strictness::Asof) - { - if (clauses.size() != 1) - throw DB::Exception("ASOF join over multiple keys is not supported", ErrorCodes::NOT_IMPLEMENTED); - if (right.back().type->isNullable()) - throw DB::Exception("ASOF join over right table Nullable column is not 
implemented", ErrorCodes::NOT_IMPLEMENTED); - } - if (!left_type_map.empty() || !right_type_map.empty()) return; @@ -531,6 +523,15 @@ void TableJoin::inferJoinKeyCommonType(const LeftNamesAndTypes & left, const Rig for (const auto & col : right) right_types[renamedRightColumnName(col.name)] = col.type; + if (strictness() == ASTTableJoin::Strictness::Asof) + { + if (clauses.size() != 1) + throw DB::Exception("ASOF join over multiple keys is not supported", ErrorCodes::NOT_IMPLEMENTED); + + auto asof_key_type = right_types.find(clauses.back().key_names_right.back()); + if (asof_key_type != right_types.end() && asof_key_type->second->isNullable()) + throw DB::Exception("ASOF join over right table Nullable column is not implemented", ErrorCodes::NOT_IMPLEMENTED); + } forAllKeys(clauses, [&](const auto & left_key_name, const auto & right_key_name) { diff --git a/tests/queries/0_stateless/01428_nullable_asof_join.sql b/tests/queries/0_stateless/01428_nullable_asof_join.sql index 30e5c51eb1c..e1b00158d68 100644 --- a/tests/queries/0_stateless/01428_nullable_asof_join.sql +++ b/tests/queries/0_stateless/01428_nullable_asof_join.sql @@ -109,3 +109,8 @@ FROM (SELECT toUInt8(number) > 0 as pk, toNullable(toUInt8(number)) as dt FROM n ASOF JOIN (SELECT 1 as pk, toNullable(0) as dt) b ON a.dt >= b.dt AND a.pk = b.pk ORDER BY a.dt; -- { serverError 48 } + +SELECT * +FROM (SELECT NULL AS y, 1 AS x, '2020-01-01 10:10:10' :: DateTime64 AS t) AS t1 +ASOF LEFT JOIN (SELECT NULL AS y, 1 AS x, '2020-01-01 10:10:10' :: DateTime64 AS t) AS t2 +ON t1.t <= t2.t AND t1.x == t2.x FORMAT Null; From bbfe8a2ca7bcd52aee0f138b59db4ad96b0b623f Mon Sep 17 00:00:00 2001 From: Anton Popov Date: Mon, 28 Mar 2022 15:28:17 +0000 Subject: [PATCH 048/239] fix possible loss of subcolumns in type Object --- src/DataTypes/DataTypeTuple.cpp | 2 +- .../0_stateless/01825_type_json_9.reference | 1 + tests/queries/0_stateless/01825_type_json_9.sql | 16 ++++++++++++++++ 3 files changed, 18 insertions(+), 1 deletion(-) create mode 100644 tests/queries/0_stateless/01825_type_json_9.reference create mode 100644 tests/queries/0_stateless/01825_type_json_9.sql diff --git a/src/DataTypes/DataTypeTuple.cpp b/src/DataTypes/DataTypeTuple.cpp index a5e9868cf89..abf53a4baf1 100644 --- a/src/DataTypes/DataTypeTuple.cpp +++ b/src/DataTypes/DataTypeTuple.cpp @@ -206,7 +206,7 @@ bool DataTypeTuple::equals(const IDataType & rhs) const return false; for (size_t i = 0; i < size; ++i) - if (!elems[i]->equals(*rhs_tuple.elems[i])) + if (!elems[i]->equals(*rhs_tuple.elems[i]) || names[i] != rhs_tuple.names[i]) return false; return true; diff --git a/tests/queries/0_stateless/01825_type_json_9.reference b/tests/queries/0_stateless/01825_type_json_9.reference new file mode 100644 index 00000000000..a426b09a100 --- /dev/null +++ b/tests/queries/0_stateless/01825_type_json_9.reference @@ -0,0 +1 @@ +Tuple(foo Int8, k1 Int8, k2 Int8) diff --git a/tests/queries/0_stateless/01825_type_json_9.sql b/tests/queries/0_stateless/01825_type_json_9.sql new file mode 100644 index 00000000000..8fa4b335578 --- /dev/null +++ b/tests/queries/0_stateless/01825_type_json_9.sql @@ -0,0 +1,16 @@ +-- Tags: no-fasttest + +DROP TABLE IF EXISTS t_json; + +SET allow_experimental_object_type = 1; + +CREATE TABLE t_json(id UInt64, obj JSON) ENGINE = MergeTree ORDER BY id; + +INSERT INTO t_json format JSONEachRow {"id": 1, "obj": {"foo": 1, "k1": 2}}; +INSERT INTO t_json format JSONEachRow {"id": 2, "obj": {"foo": 1, "k2": 2}}; + +OPTIMIZE TABLE t_json FINAL; + +SELECT any(toTypeName(obj)) 
from t_json; + +DROP TABLE IF EXISTS t_json; From 6cbdc6af005f87e8b638c7c3f862cf1aea464a22 Mon Sep 17 00:00:00 2001 From: Anton Popov Date: Mon, 28 Mar 2022 18:44:53 +0000 Subject: [PATCH 049/239] remove obsolete parameter --- .../DataTypeLowCardinalityHelpers.cpp | 2 +- src/DataTypes/DataTypeTuple.cpp | 21 +++++++------------ src/DataTypes/DataTypeTuple.h | 6 +----- src/Functions/FunctionsConversion.h | 3 +-- src/Functions/tuple.cpp | 21 ++----------------- .../02008_tuple_to_name_value_pairs.sql | 2 +- 6 files changed, 13 insertions(+), 42 deletions(-) diff --git a/src/DataTypes/DataTypeLowCardinalityHelpers.cpp b/src/DataTypes/DataTypeLowCardinalityHelpers.cpp index 41ba81814d0..21ab25b6da3 100644 --- a/src/DataTypes/DataTypeLowCardinalityHelpers.cpp +++ b/src/DataTypes/DataTypeLowCardinalityHelpers.cpp @@ -36,7 +36,7 @@ DataTypePtr recursiveRemoveLowCardinality(const DataTypePtr & type) element = recursiveRemoveLowCardinality(element); if (tuple_type->haveExplicitNames()) - return std::make_shared(elements, tuple_type->getElementNames(), tuple_type->serializeNames()); + return std::make_shared(elements, tuple_type->getElementNames()); else return std::make_shared(elements); } diff --git a/src/DataTypes/DataTypeTuple.cpp b/src/DataTypes/DataTypeTuple.cpp index abf53a4baf1..908e0184b8d 100644 --- a/src/DataTypes/DataTypeTuple.cpp +++ b/src/DataTypes/DataTypeTuple.cpp @@ -64,8 +64,8 @@ static std::optional checkTupleNames(const Strings & names) return {}; } -DataTypeTuple::DataTypeTuple(const DataTypes & elems_, const Strings & names_, bool serialize_names_) - : elems(elems_), names(names_), have_explicit_names(true), serialize_names(serialize_names_) +DataTypeTuple::DataTypeTuple(const DataTypes & elems_, const Strings & names_) + : elems(elems_), names(names_), have_explicit_names(true) { size_t size = elems.size(); if (names.size() != size) @@ -75,11 +75,6 @@ DataTypeTuple::DataTypeTuple(const DataTypes & elems_, const Strings & names_, b throw std::move(*exception); } -bool DataTypeTuple::canBeCreatedWithNames(const Strings & names) -{ - return checkTupleNames(names) == std::nullopt; -} - std::string DataTypeTuple::doGetName() const { size_t size = elems.size(); @@ -91,7 +86,7 @@ std::string DataTypeTuple::doGetName() const if (i != 0) s << ", "; - if (have_explicit_names && serialize_names) + if (have_explicit_names) s << backQuoteIfNeed(names[i]) << ' '; s << elems[i]->getName(); @@ -265,31 +260,29 @@ size_t DataTypeTuple::getSizeOfValueInMemory() const SerializationPtr DataTypeTuple::doGetDefaultSerialization() const { SerializationTuple::ElementSerializations serializations(elems.size()); - bool use_explicit_names = have_explicit_names && serialize_names; for (size_t i = 0; i < elems.size(); ++i) { - String elem_name = use_explicit_names ? names[i] : toString(i + 1); + String elem_name = have_explicit_names ? names[i] : toString(i + 1); auto serialization = elems[i]->getDefaultSerialization(); serializations[i] = std::make_shared(serialization, elem_name); } - return std::make_shared(std::move(serializations), use_explicit_names); + return std::make_shared(std::move(serializations), have_explicit_names); } SerializationPtr DataTypeTuple::getSerialization(const SerializationInfo & info) const { SerializationTuple::ElementSerializations serializations(elems.size()); const auto & info_tuple = assert_cast(info); - bool use_explicit_names = have_explicit_names && serialize_names; for (size_t i = 0; i < elems.size(); ++i) { - String elem_name = use_explicit_names ? 
names[i] : toString(i + 1); + String elem_name = have_explicit_names ? names[i] : toString(i + 1); auto serialization = elems[i]->getSerialization(*info_tuple.getElementInfo(i)); serializations[i] = std::make_shared(serialization, elem_name); } - return std::make_shared(std::move(serializations), use_explicit_names); + return std::make_shared(std::move(serializations), have_explicit_names); } MutableSerializationInfoPtr DataTypeTuple::createSerializationInfo(const SerializationInfo::Settings & settings) const diff --git a/src/DataTypes/DataTypeTuple.h b/src/DataTypes/DataTypeTuple.h index db122aae5df..009a2284a0a 100644 --- a/src/DataTypes/DataTypeTuple.h +++ b/src/DataTypes/DataTypeTuple.h @@ -22,14 +22,11 @@ private: DataTypes elems; Strings names; bool have_explicit_names; - bool serialize_names = true; public: static constexpr bool is_parametric = true; explicit DataTypeTuple(const DataTypes & elems); - DataTypeTuple(const DataTypes & elems, const Strings & names, bool serialize_names_ = true); - - static bool canBeCreatedWithNames(const Strings & names); + DataTypeTuple(const DataTypes & elems, const Strings & names); TypeIndex getTypeId() const override { return TypeIndex::Tuple; } std::string doGetName() const override; @@ -66,7 +63,6 @@ public: String getNameByPosition(size_t i) const; bool haveExplicitNames() const { return have_explicit_names; } - bool serializeNames() const { return serialize_names; } }; } diff --git a/src/Functions/FunctionsConversion.h b/src/Functions/FunctionsConversion.h index e098378f51a..587efa9f217 100644 --- a/src/Functions/FunctionsConversion.h +++ b/src/Functions/FunctionsConversion.h @@ -2957,8 +2957,7 @@ private: /// For named tuples allow conversions for tuples with /// different sets of elements. If element exists in @to_type /// and doesn't exist in @to_type it will be filled by default values. - if (from_type->haveExplicitNames() && from_type->serializeNames() - && to_type->haveExplicitNames() && to_type->serializeNames()) + if (from_type->haveExplicitNames() && to_type->haveExplicitNames()) { const auto & from_names = from_type->getElementNames(); std::unordered_map from_positions; diff --git a/src/Functions/tuple.cpp b/src/Functions/tuple.cpp index 8e8b18e335d..6d5c53c0770 100644 --- a/src/Functions/tuple.cpp +++ b/src/Functions/tuple.cpp @@ -54,29 +54,12 @@ public: bool useDefaultImplementationForNulls() const override { return false; } bool useDefaultImplementationForConstants() const override { return true; } - DataTypePtr getReturnTypeImpl(const ColumnsWithTypeAndName & arguments) const override + DataTypePtr getReturnTypeImpl(const DataTypes & arguments) const override { if (arguments.empty()) throw Exception("Function " + getName() + " requires at least one argument.", ErrorCodes::NUMBER_OF_ARGUMENTS_DOESNT_MATCH); - DataTypes types; - Strings names; - - for (const auto & argument : arguments) - { - types.emplace_back(argument.type); - names.emplace_back(argument.name); - } - - /// Create named tuple if possible. We don't print tuple element names - /// because they are bad anyway -- aliases are not used, e.g. tuple(1 a) - /// will have element name '1' and not 'a'. If we ever change this, and - /// add the ability to access tuple elements by name, like tuple(1 a).a, - /// we should probably enable printing for better discoverability. 
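With `getReturnTypeImpl` now returning a plain `DataTypeTuple(arguments)`, the `tuple()` function always yields an unnamed tuple type, and argument aliases are not turned into element names. A small illustrative check (not part of the patch; the exact printed type is an assumption based on the usual types inferred for these literals):

``` sql
-- Illustrative only: tuple() is expected to produce an unnamed tuple type,
-- so aliases such as `a` and `b` below do not become element names.
SELECT toTypeName(tuple(1 AS a, 'x' AS b));
-- expected to print something along the lines of: Tuple(UInt8, String)
```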
- if (DataTypeTuple::canBeCreatedWithNames(names)) - return std::make_shared(types, names, false /*print names*/); - - return std::make_shared(types); + return std::make_shared(arguments); } ColumnPtr executeImpl(const ColumnsWithTypeAndName & arguments, const DataTypePtr &, size_t /*input_rows_count*/) const override diff --git a/tests/queries/0_stateless/02008_tuple_to_name_value_pairs.sql b/tests/queries/0_stateless/02008_tuple_to_name_value_pairs.sql index 9204975b579..59987a86590 100644 --- a/tests/queries/0_stateless/02008_tuple_to_name_value_pairs.sql +++ b/tests/queries/0_stateless/02008_tuple_to_name_value_pairs.sql @@ -4,7 +4,7 @@ DROP TABLE IF EXISTS test02008; CREATE TABLE test02008 ( col Tuple( a Tuple(key1 int, key2 int), - b Tuple(key1 int, key3 int) + b Tuple(key1 int, key2 int) ) ) ENGINE=Memory(); INSERT INTO test02008 VALUES (tuple(tuple(1, 2), tuple(3, 4))); From c05bf7beb4ee275cd05166265888f45de40cc1f2 Mon Sep 17 00:00:00 2001 From: rfraposa Date: Mon, 28 Mar 2022 23:09:17 -0600 Subject: [PATCH 050/239] Testing new /en folder --- docs/en/_category_.yml | 8 + docs/en/commercial/cloud.md | 9 - docs/en/commercial/index.md | 13 - docs/en/commercial/support.md | 9 - docs/en/development/_category_.yml | 7 + docs/en/development/index.md | 10 - docs/en/engines/_category_.yml | 7 + docs/en/engines/index.md | 15 - .../mergetree-family/mergetree.md | 6 +- docs/en/example-datasets/_category_.yml | 7 + .../example-datasets/amplab-benchmark.md | 0 .../example-datasets/brown-benchmark.md | 0 .../example-datasets/cell-towers.md | 0 .../example-datasets/criteo.md | 0 .../example-datasets/github-events.md | 0 .../example-datasets/menus.md | 0 .../example-datasets/metrica.md | 0 .../example-datasets/nyc-taxi.md | 0 .../example-datasets/ontime.md | 0 .../example-datasets/opensky.md | 0 .../example-datasets/recipes.md | 0 .../example-datasets/star-schema.md | 0 .../example-datasets/uk-price-paid.md | 0 .../example-datasets/wikistat.md | 0 docs/en/faq/general/columnar-database.md | 25 - docs/en/faq/general/dbms-naming.md | 17 - .../how-do-i-contribute-code-to-clickhouse.md | 15 - docs/en/faq/general/index.md | 25 - docs/en/faq/general/mapreduce.md | 13 - docs/en/faq/general/ne-tormozit.md | 26 - docs/en/faq/general/olap.md | 39 -- .../en/faq/general/who-is-using-clickhouse.md | 19 - .../faq/general/why-clickhouse-is-so-fast.md | 63 -- docs/en/faq/index.md | 47 -- docs/en/faq/integration/file-export.md | 37 - docs/en/faq/integration/index.md | 19 - docs/en/faq/integration/json-import.md | 33 - docs/en/faq/integration/oracle-odbc.md | 15 - docs/en/faq/operations/delete-old-data.md | 42 -- docs/en/faq/operations/index.md | 19 - .../operations/multi-region-replication.md | 13 - docs/en/faq/operations/production.md | 70 -- docs/en/faq/use-cases/index.md | 18 - docs/en/faq/use-cases/key-value.md | 17 - docs/en/faq/use-cases/time-series.md | 15 - .../getting-started/example-datasets/index.md | 28 - docs/en/getting-started/index.md | 15 - docs/en/getting-started/playground.md | 59 -- docs/en/getting-started/tutorial.md | 662 ------------------ docs/en/guides/apply-catboost-model.md | 242 ------- docs/en/guides/index.md | 14 - docs/en/index.md | 95 --- docs/en/{getting-started => }/install.md | 94 ++- docs/en/interfaces/index.md | 7 +- docs/en/introduction/adopters.md | 199 ------ docs/en/introduction/distinctive-features.md | 96 --- docs/en/introduction/history.md | 54 -- docs/en/introduction/index.md | 6 - docs/en/introduction/performance.md | 30 - docs/en/operations/_category_.yml | 7 + 
docs/en/sql-reference/_category_.yml | 7 + .../functions/encoding-functions.md | 4 +- .../sql-reference/statements/select/sample.md | 11 +- docs/en/whats-new/changelog/2017.md | 4 +- docs/en/whats-new/changelog/2018.md | 4 +- docs/en/whats-new/changelog/2019.md | 4 +- docs/en/whats-new/changelog/2020.md | 4 +- docs/en/whats-new/changelog/2021.md | 6 +- docs/en/whats-new/changelog/index.md | 499 ++++++++++++- docs/en/whats-new/index.md | 8 +- docs/en/whats-new/roadmap.md | 1 - docs/en/whats-new/security-changelog.md | 10 +- 72 files changed, 660 insertions(+), 2188 deletions(-) create mode 100644 docs/en/_category_.yml delete mode 100644 docs/en/commercial/cloud.md delete mode 100644 docs/en/commercial/index.md delete mode 100644 docs/en/commercial/support.md create mode 100644 docs/en/development/_category_.yml delete mode 100644 docs/en/development/index.md create mode 100644 docs/en/engines/_category_.yml delete mode 100644 docs/en/engines/index.md create mode 100644 docs/en/example-datasets/_category_.yml rename docs/en/{getting-started => }/example-datasets/amplab-benchmark.md (100%) rename docs/en/{getting-started => }/example-datasets/brown-benchmark.md (100%) rename docs/en/{getting-started => }/example-datasets/cell-towers.md (100%) rename docs/en/{getting-started => }/example-datasets/criteo.md (100%) rename docs/en/{getting-started => }/example-datasets/github-events.md (100%) rename docs/en/{getting-started => }/example-datasets/menus.md (100%) rename docs/en/{getting-started => }/example-datasets/metrica.md (100%) rename docs/en/{getting-started => }/example-datasets/nyc-taxi.md (100%) rename docs/en/{getting-started => }/example-datasets/ontime.md (100%) rename docs/en/{getting-started => }/example-datasets/opensky.md (100%) rename docs/en/{getting-started => }/example-datasets/recipes.md (100%) rename docs/en/{getting-started => }/example-datasets/star-schema.md (100%) rename docs/en/{getting-started => }/example-datasets/uk-price-paid.md (100%) rename docs/en/{getting-started => }/example-datasets/wikistat.md (100%) delete mode 100644 docs/en/faq/general/columnar-database.md delete mode 100644 docs/en/faq/general/dbms-naming.md delete mode 100644 docs/en/faq/general/how-do-i-contribute-code-to-clickhouse.md delete mode 100644 docs/en/faq/general/index.md delete mode 100644 docs/en/faq/general/mapreduce.md delete mode 100644 docs/en/faq/general/ne-tormozit.md delete mode 100644 docs/en/faq/general/olap.md delete mode 100644 docs/en/faq/general/who-is-using-clickhouse.md delete mode 100644 docs/en/faq/general/why-clickhouse-is-so-fast.md delete mode 100644 docs/en/faq/index.md delete mode 100644 docs/en/faq/integration/file-export.md delete mode 100644 docs/en/faq/integration/index.md delete mode 100644 docs/en/faq/integration/json-import.md delete mode 100644 docs/en/faq/integration/oracle-odbc.md delete mode 100644 docs/en/faq/operations/delete-old-data.md delete mode 100644 docs/en/faq/operations/index.md delete mode 100644 docs/en/faq/operations/multi-region-replication.md delete mode 100644 docs/en/faq/operations/production.md delete mode 100644 docs/en/faq/use-cases/index.md delete mode 100644 docs/en/faq/use-cases/key-value.md delete mode 100644 docs/en/faq/use-cases/time-series.md delete mode 100644 docs/en/getting-started/example-datasets/index.md delete mode 100644 docs/en/getting-started/index.md delete mode 100644 docs/en/getting-started/playground.md delete mode 100644 docs/en/getting-started/tutorial.md delete mode 100644 docs/en/guides/apply-catboost-model.md 
delete mode 100644 docs/en/guides/index.md delete mode 100644 docs/en/index.md rename docs/en/{getting-started => }/install.md (69%) delete mode 100644 docs/en/introduction/adopters.md delete mode 100644 docs/en/introduction/distinctive-features.md delete mode 100644 docs/en/introduction/history.md delete mode 100644 docs/en/introduction/index.md delete mode 100644 docs/en/introduction/performance.md create mode 100644 docs/en/operations/_category_.yml create mode 100644 docs/en/sql-reference/_category_.yml diff --git a/docs/en/_category_.yml b/docs/en/_category_.yml new file mode 100644 index 00000000000..8009b548223 --- /dev/null +++ b/docs/en/_category_.yml @@ -0,0 +1,8 @@ +position: 50 +label: 'Reference Guides' +collapsible: true +collapsed: true +link: + type: generated-index + title: Reference Guides + slug: /en \ No newline at end of file diff --git a/docs/en/commercial/cloud.md b/docs/en/commercial/cloud.md deleted file mode 100644 index afa2e23b7a8..00000000000 --- a/docs/en/commercial/cloud.md +++ /dev/null @@ -1,9 +0,0 @@ ---- -toc_priority: 1 -toc_title: Cloud ---- - -# ClickHouse Cloud Service {#clickhouse-cloud-service} - -!!! info "Info" - Detailed public description for ClickHouse cloud services is not ready yet, please [contact us](https://clickhouse.com/company/#contact) to learn more. diff --git a/docs/en/commercial/index.md b/docs/en/commercial/index.md deleted file mode 100644 index 1f1911b8c4d..00000000000 --- a/docs/en/commercial/index.md +++ /dev/null @@ -1,13 +0,0 @@ ---- -toc_folder_title: Commercial -toc_priority: 70 -toc_title: Introduction ---- - -# ClickHouse Commercial Services {#clickhouse-commercial-services} - -Service categories: - -- [Cloud](../commercial/cloud.md) -- [Support](../commercial/support.md) - diff --git a/docs/en/commercial/support.md b/docs/en/commercial/support.md deleted file mode 100644 index 33b69b40b2d..00000000000 --- a/docs/en/commercial/support.md +++ /dev/null @@ -1,9 +0,0 @@ ---- -toc_priority: 3 -toc_title: Support ---- - -# ClickHouse Commercial Support Service {#clickhouse-commercial-support-service} - -!!! info "Info" - Detailed public description for ClickHouse support services is not ready yet, please [contact us](https://clickhouse.com/company/#contact) to learn more. 
diff --git a/docs/en/development/_category_.yml b/docs/en/development/_category_.yml new file mode 100644 index 00000000000..ef272510d47 --- /dev/null +++ b/docs/en/development/_category_.yml @@ -0,0 +1,7 @@ +position: 100 +label: 'Development' +collapsible: true +collapsed: true +link: + type: generated-index + title: Reference \ No newline at end of file diff --git a/docs/en/development/index.md b/docs/en/development/index.md deleted file mode 100644 index f9f0d644973..00000000000 --- a/docs/en/development/index.md +++ /dev/null @@ -1,10 +0,0 @@ ---- -toc_folder_title: Development -toc_hidden: true -toc_priority: 58 -toc_title: hidden ---- - -# ClickHouse Development {#clickhouse-development} - -[Original article](https://clickhouse.com/docs/en/development/) diff --git a/docs/en/engines/_category_.yml b/docs/en/engines/_category_.yml new file mode 100644 index 00000000000..f8554057fdc --- /dev/null +++ b/docs/en/engines/_category_.yml @@ -0,0 +1,7 @@ +position: 30 +label: 'Database & Table Engines' +collapsible: true +collapsed: true +link: + type: generated-index + title: Database & Table Engines \ No newline at end of file diff --git a/docs/en/engines/index.md b/docs/en/engines/index.md deleted file mode 100644 index b3f4a4f7b69..00000000000 --- a/docs/en/engines/index.md +++ /dev/null @@ -1,15 +0,0 @@ ---- -toc_folder_title: Engines -toc_hidden: true -toc_priority: 25 -toc_title: hidden ---- - -# ClickHouse Engines {#clickhouse-engines} - -There are two key engine kinds in ClickHouse: - -- [Table engines](../engines/table-engines/index.md) -- [Database engines](../engines/database-engines/index.md) - -{## [Original article](https://clickhouse.com/docs/en/engines/) ##} diff --git a/docs/en/engines/table-engines/mergetree-family/mergetree.md b/docs/en/engines/table-engines/mergetree-family/mergetree.md index a0acda5d5c6..9d820e4961b 100644 --- a/docs/en/engines/table-engines/mergetree-family/mergetree.md +++ b/docs/en/engines/table-engines/mergetree-family/mergetree.md @@ -304,8 +304,8 @@ CREATE TABLE table_name Indices from the example can be used by ClickHouse to reduce the amount of data to read from disk in the following queries: ``` sql -SELECT count() FROM table WHERE s < 'z' -SELECT count() FROM table WHERE u64 * i32 == 10 AND u64 * length(s) >= 1234 +SELECT count() FROM table WHERE s < 'z' +SELECT count() FROM table WHERE u64 * i32 == 10 AND u64 * length(s) >= 1234 ``` #### Available Types of Indices {#available-types-of-indices} @@ -364,7 +364,7 @@ The `set` index can be used with all functions. 
Function subsets for other index | Function (operator) / Index | primary key | minmax | ngrambf_v1 | tokenbf_v1 | bloom_filter | |------------------------------------------------------------------------------------------------------------|-------------|--------|-------------|-------------|---------------| | [equals (=, ==)](../../../sql-reference/functions/comparison-functions.md#function-equals) | ✔ | ✔ | ✔ | ✔ | ✔ | -| [notEquals(!=, <>)](../../../sql-reference/functions/comparison-functions.md#function-notequals) | ✔ | ✔ | ✔ | ✔ | ✔ | +| [notEquals(!=, <>)](../../../sql-reference/functions/comparison-functions.md#function-notequals) | ✔ | ✔ | ✔ | ✔ | ✔ | | [like](../../../sql-reference/functions/string-search-functions.md#function-like) | ✔ | ✔ | ✔ | ✔ | ✗ | | [notLike](../../../sql-reference/functions/string-search-functions.md#function-notlike) | ✔ | ✔ | ✔ | ✔ | ✗ | | [startsWith](../../../sql-reference/functions/string-functions.md#startswith) | ✔ | ✔ | ✔ | ✔ | ✗ | diff --git a/docs/en/example-datasets/_category_.yml b/docs/en/example-datasets/_category_.yml new file mode 100644 index 00000000000..5824de77e1d --- /dev/null +++ b/docs/en/example-datasets/_category_.yml @@ -0,0 +1,7 @@ +position: 10 +label: 'Example Datasets' +collapsible: true +collapsed: true +link: + type: generated-index + title: Example Datasets \ No newline at end of file diff --git a/docs/en/getting-started/example-datasets/amplab-benchmark.md b/docs/en/example-datasets/amplab-benchmark.md similarity index 100% rename from docs/en/getting-started/example-datasets/amplab-benchmark.md rename to docs/en/example-datasets/amplab-benchmark.md diff --git a/docs/en/getting-started/example-datasets/brown-benchmark.md b/docs/en/example-datasets/brown-benchmark.md similarity index 100% rename from docs/en/getting-started/example-datasets/brown-benchmark.md rename to docs/en/example-datasets/brown-benchmark.md diff --git a/docs/en/getting-started/example-datasets/cell-towers.md b/docs/en/example-datasets/cell-towers.md similarity index 100% rename from docs/en/getting-started/example-datasets/cell-towers.md rename to docs/en/example-datasets/cell-towers.md diff --git a/docs/en/getting-started/example-datasets/criteo.md b/docs/en/example-datasets/criteo.md similarity index 100% rename from docs/en/getting-started/example-datasets/criteo.md rename to docs/en/example-datasets/criteo.md diff --git a/docs/en/getting-started/example-datasets/github-events.md b/docs/en/example-datasets/github-events.md similarity index 100% rename from docs/en/getting-started/example-datasets/github-events.md rename to docs/en/example-datasets/github-events.md diff --git a/docs/en/getting-started/example-datasets/menus.md b/docs/en/example-datasets/menus.md similarity index 100% rename from docs/en/getting-started/example-datasets/menus.md rename to docs/en/example-datasets/menus.md diff --git a/docs/en/getting-started/example-datasets/metrica.md b/docs/en/example-datasets/metrica.md similarity index 100% rename from docs/en/getting-started/example-datasets/metrica.md rename to docs/en/example-datasets/metrica.md diff --git a/docs/en/getting-started/example-datasets/nyc-taxi.md b/docs/en/example-datasets/nyc-taxi.md similarity index 100% rename from docs/en/getting-started/example-datasets/nyc-taxi.md rename to docs/en/example-datasets/nyc-taxi.md diff --git a/docs/en/getting-started/example-datasets/ontime.md b/docs/en/example-datasets/ontime.md similarity index 100% rename from docs/en/getting-started/example-datasets/ontime.md rename to 
docs/en/example-datasets/ontime.md diff --git a/docs/en/getting-started/example-datasets/opensky.md b/docs/en/example-datasets/opensky.md similarity index 100% rename from docs/en/getting-started/example-datasets/opensky.md rename to docs/en/example-datasets/opensky.md diff --git a/docs/en/getting-started/example-datasets/recipes.md b/docs/en/example-datasets/recipes.md similarity index 100% rename from docs/en/getting-started/example-datasets/recipes.md rename to docs/en/example-datasets/recipes.md diff --git a/docs/en/getting-started/example-datasets/star-schema.md b/docs/en/example-datasets/star-schema.md similarity index 100% rename from docs/en/getting-started/example-datasets/star-schema.md rename to docs/en/example-datasets/star-schema.md diff --git a/docs/en/getting-started/example-datasets/uk-price-paid.md b/docs/en/example-datasets/uk-price-paid.md similarity index 100% rename from docs/en/getting-started/example-datasets/uk-price-paid.md rename to docs/en/example-datasets/uk-price-paid.md diff --git a/docs/en/getting-started/example-datasets/wikistat.md b/docs/en/example-datasets/wikistat.md similarity index 100% rename from docs/en/getting-started/example-datasets/wikistat.md rename to docs/en/example-datasets/wikistat.md diff --git a/docs/en/faq/general/columnar-database.md b/docs/en/faq/general/columnar-database.md deleted file mode 100644 index 11bbd2e63f6..00000000000 --- a/docs/en/faq/general/columnar-database.md +++ /dev/null @@ -1,25 +0,0 @@ ---- -title: What is a columnar database? -toc_hidden: true -toc_priority: 101 ---- - -# What Is a Columnar Database? {#what-is-a-columnar-database} - -A columnar database stores data of each column independently. This allows to read data from disks only for those columns that are used in any given query. The cost is that operations that affect whole rows become proportionally more expensive. The synonym for a columnar database is a column-oriented database management system. ClickHouse is a typical example of such a system. - -Key columnar database advantages are: - -- Queries that use only a few columns out of many. -- Aggregating queries against large volumes of data. -- Column-wise data compression. - -Here is the illustration of the difference between traditional row-oriented systems and columnar databases when building reports: - -**Traditional row-oriented** -![Traditional row-oriented](https://clickhouse.com/docs/en/images/row-oriented.gif#) - -**Columnar** -![Columnar](https://clickhouse.com/docs/en/images/column-oriented.gif#) - -A columnar database is a preferred choice for analytical applications because it allows to have many columns in a table just in case, but do not pay the cost for unused columns on read query execution time. Column-oriented databases are designed for big data processing and data warehousing, because they often natively scale using distributed clusters of low-cost hardware to increase throughput. ClickHouse does it with combination of [distributed](../../engines/table-engines/special/distributed.md) and [replicated](../../engines/table-engines/mergetree-family/replication.md) tables. diff --git a/docs/en/faq/general/dbms-naming.md b/docs/en/faq/general/dbms-naming.md deleted file mode 100644 index d4e87ff450a..00000000000 --- a/docs/en/faq/general/dbms-naming.md +++ /dev/null @@ -1,17 +0,0 @@ ---- -title: "What does \u201CClickHouse\u201D mean?" -toc_hidden: true -toc_priority: 10 ---- - -# What Does “ClickHouse” Mean? 
{#what-does-clickhouse-mean} - -It’s a combination of “**Click**stream” and “Data ware**House**”. It comes from the original use case at Yandex.Metrica, where ClickHouse was supposed to keep records of all clicks by people from all over the Internet, and it still does the job. You can read more about this use case on [ClickHouse history](../../introduction/history.md) page. - -This two-part meaning has two consequences: - -- The only correct way to write Click**H**ouse is with capital H. -- If you need to abbreviate it, use **CH**. For some historical reasons, abbreviating as CK is also popular in China, mostly because one of the first talks about ClickHouse in Chinese used this form. - -!!! info "Fun fact" - Many years after ClickHouse got its name, this approach of combining two words that are meaningful on their own has been highlighted as the best way to name a database in a [research by Andy Pavlo](https://www.cs.cmu.edu/~pavlo/blog/2020/03/on-naming-a-database-management-system.html), an Associate Professor of Databases at Carnegie Mellon University. ClickHouse shared his “best database name of all time” award with Postgres. diff --git a/docs/en/faq/general/how-do-i-contribute-code-to-clickhouse.md b/docs/en/faq/general/how-do-i-contribute-code-to-clickhouse.md deleted file mode 100644 index 731dc9dface..00000000000 --- a/docs/en/faq/general/how-do-i-contribute-code-to-clickhouse.md +++ /dev/null @@ -1,15 +0,0 @@ ---- -title: How do I contribute code to ClickHouse? -toc_hidden: true -toc_priority: 120 ---- - -# How do I contribute code to ClickHouse? {#how-do-i-contribute-code-to-clickhouse} - -ClickHouse is an open-source project [developed on GitHub](https://github.com/ClickHouse/ClickHouse). - -As customary, contribution instructions are published in [CONTRIBUTING.md](https://github.com/ClickHouse/ClickHouse/blob/master/CONTRIBUTING.md) file in the root of the source code repository. - -If you want to suggest a substantial change to ClickHouse, consider [opening a GitHub issue](https://github.com/ClickHouse/ClickHouse/issues/new/choose) explaining what you want to do, to discuss it with maintainers and community first. [Examples of such RFC issues](https://github.com/ClickHouse/ClickHouse/issues?q=is%3Aissue+is%3Aopen+rfc). - -If your contributions are security related, please check out [our security policy](https://github.com/ClickHouse/ClickHouse/security/policy/) too. diff --git a/docs/en/faq/general/index.md b/docs/en/faq/general/index.md deleted file mode 100644 index 51fff9a53ae..00000000000 --- a/docs/en/faq/general/index.md +++ /dev/null @@ -1,25 +0,0 @@ ---- -title: General questions about ClickHouse -toc_hidden_folder: true -toc_priority: 1 -toc_title: General ---- - -# General Questions About ClickHouse {#general-questions} - -Questions: - -- [What is ClickHouse?](../../index.md#what-is-clickhouse) -- [Why ClickHouse is so fast?](../../faq/general/why-clickhouse-is-so-fast.md) -- [Who is using ClickHouse?](../../faq/general/who-is-using-clickhouse.md) -- [What does “ClickHouse” mean?](../../faq/general/dbms-naming.md) -- [What does “Не тормозит” mean?](../../faq/general/ne-tormozit.md) -- [What is OLAP?](../../faq/general/olap.md) -- [What is a columnar database?](../../faq/general/columnar-database.md) -- [Why not use something like MapReduce?](../../faq/general/mapreduce.md) -- [How do I contribute code to ClickHouse?](../../faq/general/how-do-i-contribute-code-to-clickhouse.md) - -!!! info "Don’t see what you were looking for?" - Check out [other F.A.Q. 
categories](../../faq/index.md) or browse around main documentation articles found in the left sidebar. - -{## [Original article](https://clickhouse.com/docs/en/faq/general/) ##} diff --git a/docs/en/faq/general/mapreduce.md b/docs/en/faq/general/mapreduce.md deleted file mode 100644 index 30cae65cba2..00000000000 --- a/docs/en/faq/general/mapreduce.md +++ /dev/null @@ -1,13 +0,0 @@ ---- -title: Why not use something like MapReduce? -toc_hidden: true -toc_priority: 110 ---- - -# Why Not Use Something Like MapReduce? {#why-not-use-something-like-mapreduce} - -We can refer to systems like MapReduce as distributed computing systems in which the reduce operation is based on distributed sorting. The most common open-source solution in this class is [Apache Hadoop](http://hadoop.apache.org). Large IT companies often have proprietary in-house solutions. - -These systems aren’t appropriate for online queries due to their high latency. In other words, they can’t be used as the back-end for a web interface. These types of systems aren’t useful for real-time data updates. Distributed sorting isn’t the best way to perform reduce operations if the result of the operation and all the intermediate results (if there are any) are located in the RAM of a single server, which is usually the case for online queries. In such a case, a hash table is an optimal way to perform reduce operations. A common approach to optimizing map-reduce tasks is pre-aggregation (partial reduce) using a hash table in RAM. The user performs this optimization manually. Distributed sorting is one of the main causes of reduced performance when running simple map-reduce tasks. - -Most MapReduce implementations allow you to execute arbitrary code on a cluster. But a declarative query language is better suited to OLAP to run experiments quickly. For example, Hadoop has Hive and Pig. Also consider Cloudera Impala or Shark (outdated) for Spark, as well as Spark SQL, Presto, and Apache Drill. Performance when running such tasks is highly sub-optimal compared to specialized systems, but relatively high latency makes it unrealistic to use these systems as the backend for a web interface. diff --git a/docs/en/faq/general/ne-tormozit.md b/docs/en/faq/general/ne-tormozit.md deleted file mode 100644 index e8dc7388eff..00000000000 --- a/docs/en/faq/general/ne-tormozit.md +++ /dev/null @@ -1,26 +0,0 @@ ---- -title: "What does \u201C\u043D\u0435 \u0442\u043E\u0440\u043C\u043E\u0437\u0438\u0442\ - \u201D mean?" -toc_hidden: true -toc_priority: 11 ---- - -# What Does “Не тормозит” Mean? {#what-does-ne-tormozit-mean} - -This question usually arises when people see official ClickHouse t-shirts. They have large words **“ClickHouse не тормозит”** on the front. - -Before ClickHouse became open-source, it has been developed as an in-house storage system by the largest Russian IT company, Yandex. That’s why it initially got its slogan in Russian, which is “не тормозит” (pronounced as “ne tormozit”). After the open-source release we first produced some of those t-shirts for events in Russia and it was a no-brainer to use the slogan as-is. - -One of the following batches of those t-shirts was supposed to be given away on events outside of Russia and we tried to make the English version of the slogan. 
Unfortunately, the Russian language is kind of elegant in terms of expressing stuff and there was a restriction of limited space on a t-shirt, so we failed to come up with good enough translation (most options appeared to be either long or inaccurate) and decided to keep the slogan in Russian even on t-shirts produced for international events. It appeared to be a great decision because people all over the world get positively surprised and curious when they see it. - -So, what does it mean? Here are some ways to translate *“не тормозит”*: - -- If you translate it literally, it’d be something like *“ClickHouse does not press the brake pedal”*. -- If you’d want to express it as close to how it sounds to a Russian person with IT background, it’d be something like *“If your larger system lags, it’s not because it uses ClickHouse”*. -- Shorter, but not so precise versions could be *“ClickHouse is not slow”*, *“ClickHouse does not lag”* or just *“ClickHouse is fast”*. - -If you haven’t seen one of those t-shirts in person, you can check them out online in many ClickHouse-related videos. For example, this one: - -![iframe](https://www.youtube.com/embed/bSyQahMVZ7w) - -P.S. These t-shirts are not for sale, they are given away for free on most [ClickHouse Meetups](https://clickhouse.com/#meet), usually for best questions or other forms of active participation. diff --git a/docs/en/faq/general/olap.md b/docs/en/faq/general/olap.md deleted file mode 100644 index 1f6df183f8c..00000000000 --- a/docs/en/faq/general/olap.md +++ /dev/null @@ -1,39 +0,0 @@ ---- -title: What is OLAP? -toc_hidden: true -toc_priority: 100 ---- - -# What Is OLAP? {#what-is-olap} - -[OLAP](https://en.wikipedia.org/wiki/Online_analytical_processing) stands for Online Analytical Processing. It is a broad term that can be looked at from two perspectives: technical and business. But at the very high level, you can just read these words backward: - -Processing -: Some source data is processed… - -Analytical -: …to produce some analytical reports and insights… - -Online -: …in real-time. - -## OLAP from the Business Perspective {#olap-from-the-business-perspective} - -In recent years, business people started to realize the value of data. Companies who make their decisions blindly, more often than not fail to keep up with the competition. The data-driven approach of successful companies forces them to collect all data that might be remotely useful for making business decisions and need mechanisms to timely analyze them. Here’s where OLAP database management systems (DBMS) come in. - -In a business sense, OLAP allows companies to continuously plan, analyze, and report operational activities, thus maximizing efficiency, reducing expenses, and ultimately conquering the market share. It could be done either in an in-house system or outsourced to SaaS providers like web/mobile analytics services, CRM services, etc. OLAP is the technology behind many BI applications (Business Intelligence). - -ClickHouse is an OLAP database management system that is pretty often used as a backend for those SaaS solutions for analyzing domain-specific data. However, some businesses are still reluctant to share their data with third-party providers and an in-house data warehouse scenario is also viable. - -## OLAP from the Technical Perspective {#olap-from-the-technical-perspective} - -All database management systems could be classified into two groups: OLAP (Online **Analytical** Processing) and OLTP (Online **Transactional** Processing). 
The former focuses on building reports, each based on large volumes of historical data, but doing it relatively infrequently, while the latter usually handles a continuous stream of transactions, constantly modifying the current state of data. - -In practice, OLAP and OLTP are not strict categories but rather two ends of a spectrum. Most real systems usually focus on one of them but provide some solutions or workarounds if the opposite kind of workload is also desired. This situation often forces businesses to operate multiple storage systems in an integrated fashion, which might not be a big deal by itself, but having more systems makes maintenance more expensive. So the trend of recent years is HTAP (**Hybrid Transactional/Analytical Processing**), where both kinds of workload are handled equally well by a single database management system. - -Even if a DBMS started as pure OLAP or pure OLTP, it is forced to move in that HTAP direction to keep up with the competition. ClickHouse is no exception: it was initially designed as a [fast-as-possible OLAP system](../../faq/general/why-clickhouse-is-so-fast.md) and it still does not have full-fledged transaction support, but some features like consistent reads/writes and mutations for updating/deleting data had to be added. - -The fundamental trade-off between OLAP and OLTP systems remains: - -- To build analytical reports efficiently, it’s crucial to be able to read columns separately, thus most OLAP databases are [columnar](../../faq/general/columnar-database.md), -- While storing columns separately increases the cost of operations on rows, like appends or in-place modifications, proportionally to the number of columns (which can be huge if the system tries to collect all details of an event just in case). Thus, most OLTP systems store data arranged by rows. diff --git a/docs/en/faq/general/who-is-using-clickhouse.md b/docs/en/faq/general/who-is-using-clickhouse.md deleted file mode 100644 index b7ff867d726..00000000000 --- a/docs/en/faq/general/who-is-using-clickhouse.md +++ /dev/null @@ -1,19 +0,0 @@ ---- -title: Who is using ClickHouse? -toc_hidden: true -toc_priority: 9 ---- - -# Who Is Using ClickHouse? {#who-is-using-clickhouse} - -Being an open-source product makes this question not so straightforward to answer. You do not have to tell anyone if you want to start using ClickHouse; you just grab the source code or pre-compiled packages. There’s no contract to sign and the [Apache 2.0 license](https://github.com/ClickHouse/ClickHouse/blob/master/LICENSE) allows for unconstrained software distribution. - -Also, the technology stack is often in a grey zone of what’s covered by an NDA. Some companies consider the technologies they use a competitive advantage even if they are open-source and do not allow employees to share any details publicly. Others see PR risks and allow employees to share implementation details only with their PR department’s approval. - -So how can you tell who is using ClickHouse? - -One way is to **ask around**. If it’s not in writing, people are much more willing to share what technologies are used in their companies, what the use cases are, what kind of hardware is used, data volumes, etc. We’re talking with users regularly on [ClickHouse Meetups](https://www.youtube.com/channel/UChtmrD-dsdpspr42P_PyRAw/playlists) all over the world and have heard stories about 1000+ companies that use ClickHouse. Unfortunately, that’s not reproducible, and we try to treat such stories as if they were told under NDA to avoid any potential troubles. 
But you can come to any of our future meetups and talk with other users on your own. There are multiple ways how meetups are announced, for example, you can subscribe to [our Twitter](http://twitter.com/ClickHouseDB/). - -The second way is to look for companies **publicly saying** that they use ClickHouse. It’s more substantial because there’s usually some hard evidence like a blog post, talk video recording, slide deck, etc. We collect the collection of links to such evidence on our **[Adopters](../../introduction/adopters.md)** page. Feel free to contribute the story of your employer or just some links you’ve stumbled upon (but try not to violate your NDA in the process). - -You can find names of very large companies in the adopters list, like Bloomberg, Cisco, China Telecom, Tencent, or Uber, but with the first approach, we found that there are many more. For example, if you take [the list of largest IT companies by Forbes (2020)](https://www.forbes.com/sites/hanktucker/2020/05/13/worlds-largest-technology-companies-2020-apple-stays-on-top-zoom-and-uber-debut/) over half of them are using ClickHouse in some way. Also, it would be unfair not to mention [Yandex](../../introduction/history.md), the company which initially open-sourced ClickHouse in 2016 and happens to be one of the largest IT companies in Europe. diff --git a/docs/en/faq/general/why-clickhouse-is-so-fast.md b/docs/en/faq/general/why-clickhouse-is-so-fast.md deleted file mode 100644 index 1ccf2595768..00000000000 --- a/docs/en/faq/general/why-clickhouse-is-so-fast.md +++ /dev/null @@ -1,63 +0,0 @@ ---- -title: Why ClickHouse is so fast? -toc_hidden: true -toc_priority: 8 ---- - -# Why ClickHouse Is So Fast? {#why-clickhouse-is-so-fast} - -It was designed to be fast. Query execution performance has always been a top priority during the development process, but other important characteristics like user-friendliness, scalability, and security were also considered so ClickHouse could become a real production system. - -ClickHouse was initially built as a prototype to do just a single task well: to filter and aggregate data as fast as possible. That’s what needs to be done to build a typical analytical report and that’s what a typical [GROUP BY](../../sql-reference/statements/select/group-by.md) query does. ClickHouse team has made several high-level decisions that combined made achieving this task possible: - -Column-oriented storage -: Source data often contain hundreds or even thousands of columns, while a report can use just a few of them. The system needs to avoid reading unnecessary columns, or most expensive disk read operations would be wasted. - -Indexes -: ClickHouse keeps data structures in memory that allows reading not only used columns but only necessary row ranges of those columns. - -Data compression -: Storing different values of the same column together often leads to better compression ratios (compared to row-oriented systems) because in real data column often has the same or not so many different values for neighboring rows. In addition to general-purpose compression, ClickHouse supports [specialized codecs](../../sql-reference/statements/create/table.md#create-query-specialized-codecs) that can make data even more compact. - -Vectorized query execution -: ClickHouse not only stores data in columns but also processes data in columns. It leads to better CPU cache utilization and allows for [SIMD](https://en.wikipedia.org/wiki/SIMD) CPU instructions usage. 
- -Scalability -: ClickHouse can leverage all available CPU cores and disks to execute even a single query. Not only on a single server, but all the CPU cores and disks of a cluster as well. - -But many other database management systems use similar techniques. What really makes ClickHouse stand out is **attention to low-level details**. Most programming languages provide implementations for the most common algorithms and data structures, but they tend to be too generic to be effective. Every task can be considered as a landscape with various characteristics, rather than something to solve by throwing in a random implementation. For example, if you need a hash table, here are some key questions to consider: - -- Which hash function to choose? -- Collision resolution algorithm: [open addressing](https://en.wikipedia.org/wiki/Open_addressing) vs [chaining](https://en.wikipedia.org/wiki/Hash_table#Separate_chaining)? -- Memory layout: one array for keys and values or separate arrays? Will it store small or large values? -- Fill factor: when and how to resize? How to move values around on resize? -- Will values be removed, and which algorithm works better if they are? -- Will we need fast probing with bitmaps, inline placement of string keys, support for non-movable values, prefetch, and batching? - -The hash table is a key data structure for the `GROUP BY` implementation and ClickHouse automatically chooses one of [30+ variations](https://github.com/ClickHouse/ClickHouse/blob/master/src/Interpreters/Aggregator.h) for each specific query. - -The same goes for algorithms. For example, in sorting you might consider: - -- What will be sorted: an array of numbers, tuples, strings, or structures? -- Is all data available completely in RAM? -- Do we need a stable sort? -- Do we need a full sort? Maybe a partial sort or n-th element will suffice? -- How to implement comparisons? -- Are we sorting data that has already been partially sorted? - -Algorithms that rely on the characteristics of the data they are working with can often do better than their generic counterparts. If the characteristics are not really known in advance, the system can try various implementations and choose the one that works best at runtime. For example, see an [article on how LZ4 decompression is implemented in ClickHouse](https://habr.com/en/company/yandex/blog/457612/). - -Last but not least, the ClickHouse team always monitors the Internet for people claiming that they came up with the best implementation, algorithm, or data structure to do something, and tries it out. Those claims mostly turn out to be false, but from time to time you’ll indeed find a gem. - -!!! info "Tips for building your own high-performance software" - - - - Keep in mind low-level details when designing your system. - - Design based on hardware capabilities. - - Choose data structures and abstractions based on the needs of the task. - - Provide specializations for special cases. - - Try the new, “best” algorithms that you read about yesterday. - - Choose an algorithm at runtime based on statistics. - - Benchmark on real datasets. - - Test for performance regressions in CI. - - Measure and observe everything. diff --git a/docs/en/faq/index.md b/docs/en/faq/index.md deleted file mode 100644 index 891e1ea464e..00000000000 --- a/docs/en/faq/index.md +++ /dev/null @@ -1,47 +0,0 @@ ---- -toc_folder_title: F.A.Q. -toc_hidden: true -toc_priority: 76 ---- - -# ClickHouse F.A.Q {#clickhouse-f-a-q} - -This section of the documentation is a place to collect answers to ClickHouse-related questions that arise often. 
- -Categories: - -- **[General](../faq/general/index.md)** - - [What is ClickHouse?](../index.md#what-is-clickhouse) - - [Why ClickHouse is so fast?](../faq/general/why-clickhouse-is-so-fast.md) - - [Who is using ClickHouse?](../faq/general/who-is-using-clickhouse.md) - - [What does “ClickHouse” mean?](../faq/general/dbms-naming.md) - - [What does “Не тормозит” mean?](../faq/general/ne-tormozit.md) - - [What is OLAP?](../faq/general/olap.md) - - [What is a columnar database?](../faq/general/columnar-database.md) - - [Why not use something like MapReduce?](../faq/general/mapreduce.md) -- **[Use Cases](../faq/use-cases/index.md)** - - [Can I use ClickHouse as a time-series database?](../faq/use-cases/time-series.md) - - [Can I use ClickHouse as a key-value storage?](../faq/use-cases/key-value.md) -- **[Operations](../faq/operations/index.md)** - - [Which ClickHouse version to use in production?](../faq/operations/production.md) - - [Is it possible to delete old records from a ClickHouse table?](../faq/operations/delete-old-data.md) - - [Does ClickHouse support multi-region replication?](../faq/operations/multi-region-replication.md) -- **[Integration](../faq/integration/index.md)** - - [How do I export data from ClickHouse to a file?](../faq/integration/file-export.md) - - [What if I have a problem with encodings when connecting to Oracle via ODBC?](../faq/integration/oracle-odbc.md) - -{## TODO -Question candidates: -- How to choose a primary key? -- How to add a column in ClickHouse? -- Too many parts -- How to filter ClickHouse table by an array column contents? -- How to insert all rows from one table to another of identical structure? -- How to kill a process (query) in ClickHouse? -- How to implement pivot (like in pandas)? -- How to remove the default ClickHouse user through users.d? -- Importing MySQL dump to ClickHouse -- Window function workarounds (row_number, lag/lead, running diff/sum/average) -##} - -{## [Original article](https://clickhouse.com/docs/en/faq) ##} diff --git a/docs/en/faq/integration/file-export.md b/docs/en/faq/integration/file-export.md deleted file mode 100644 index f8f458929f9..00000000000 --- a/docs/en/faq/integration/file-export.md +++ /dev/null @@ -1,37 +0,0 @@ ---- -title: How do I export data from ClickHouse to a file? -toc_hidden: true -toc_priority: 10 ---- - -# How Do I Export Data from ClickHouse to a File? {#how-to-export-to-file} - -## Using INTO OUTFILE Clause {#using-into-outfile-clause} - -Add an [INTO OUTFILE](../../sql-reference/statements/select/into-outfile.md#into-outfile-clause) clause to your query. - -For example: - -``` sql -SELECT * FROM table INTO OUTFILE 'file' -``` - -By default, ClickHouse uses the [TabSeparated](../../interfaces/formats.md#tabseparated) format for output data. To select the [data format](../../interfaces/formats.md), use the [FORMAT clause](../../sql-reference/statements/select/format.md#format-clause). - -For example: - -``` sql -SELECT * FROM table INTO OUTFILE 'file' FORMAT CSV -``` - -## Using a File-Engine Table {#using-a-file-engine-table} - -See [File](../../engines/table-engines/special/file.md) table engine. - -## Using Command-Line Redirection {#using-command-line-redirection} - -``` bash -$ clickhouse-client --query "SELECT * from table" --format FormatName > result.txt -``` - -See [clickhouse-client](../../interfaces/cli.md). 
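As a quick illustration of the File-engine approach mentioned above, here is a minimal sketch; the table and column names are hypothetical and the format can be any supported output format:

``` sql
-- Rows inserted into a File-engine table are written to a data.CSV file
-- inside the table's directory under the server's data path.
CREATE TABLE export_example (id UInt32, value String) ENGINE = File(CSV);

INSERT INTO export_example SELECT number, toString(number) FROM numbers(3);
```

The resulting file can then be copied or processed by external tools directly.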
diff --git a/docs/en/faq/integration/index.md b/docs/en/faq/integration/index.md deleted file mode 100644 index 51a2593b751..00000000000 --- a/docs/en/faq/integration/index.md +++ /dev/null @@ -1,19 +0,0 @@ ---- -title: Questions about integrating ClickHouse and other systems -toc_hidden_folder: true -toc_priority: 4 -toc_title: Integration ---- - -# Questions About Integrating ClickHouse and Other Systems {#question-about-integrating-clickhouse-and-other-systems} - -Questions: - -- [How do I export data from ClickHouse to a file?](../../faq/integration/file-export.md) -- [How to import JSON into ClickHouse?](../../faq/integration/json-import.md) -- [What if I have a problem with encodings when connecting to Oracle via ODBC?](../../faq/integration/oracle-odbc.md) - -!!! info "Don’t see what you were looking for?" - Check out [other F.A.Q. categories](../../faq/index.md) or browse around main documentation articles found in the left sidebar. - -{## [Original article](https://clickhouse.com/docs/en/faq/integration/) ##} diff --git a/docs/en/faq/integration/json-import.md b/docs/en/faq/integration/json-import.md deleted file mode 100644 index 3fa026c794a..00000000000 --- a/docs/en/faq/integration/json-import.md +++ /dev/null @@ -1,33 +0,0 @@ ---- -title: How to import JSON into ClickHouse? -toc_hidden: true -toc_priority: 11 ---- - -# How to Import JSON Into ClickHouse? {#how-to-import-json-into-clickhouse} - -ClickHouse supports a wide range of [data formats for input and output](../../interfaces/formats.md). There are multiple JSON variations among them, but the most commonly used for data ingestion is [JSONEachRow](../../interfaces/formats.md#jsoneachrow). It expects one JSON object per row, each object separated by a newline. - -## Examples {#examples} - -Using [HTTP interface](../../interfaces/http.md): - -``` bash -$ echo '{"foo":"bar"}' | curl 'http://localhost:8123/?query=INSERT%20INTO%20test%20FORMAT%20JSONEachRow' --data-binary @- -``` - -Using [CLI interface](../../interfaces/cli.md): - -``` bash -$ echo '{"foo":"bar"}' | clickhouse-client --query="INSERT INTO test FORMAT JSONEachRow" -``` - -Instead of inserting data manually, you might consider to use one of [client libraries](../../interfaces/index.md) instead. - -## Useful Settings {#useful-settings} - -- `input_format_skip_unknown_fields` allows to insert JSON even if there were additional fields not present in table schema (by discarding them). -- `input_format_import_nested_json` allows to insert nested JSON objects into columns of [Nested](../../sql-reference/data-types/nested-data-structures/nested.md) type. - -!!! note "Note" - Settings are specified as `GET` parameters for the HTTP interface or as additional command-line arguments prefixed with `--` for the `CLI` interface. diff --git a/docs/en/faq/integration/oracle-odbc.md b/docs/en/faq/integration/oracle-odbc.md deleted file mode 100644 index 91265a3daa2..00000000000 --- a/docs/en/faq/integration/oracle-odbc.md +++ /dev/null @@ -1,15 +0,0 @@ ---- -title: What if I have a problem with encodings when using Oracle via ODBC? -toc_hidden: true -toc_priority: 20 ---- - -# What If I Have a Problem with Encodings When Using Oracle Via ODBC? {#oracle-odbc-encodings} - -If you use Oracle as a source of ClickHouse external dictionaries via Oracle ODBC driver, you need to set the correct value for the `NLS_LANG` environment variable in `/etc/default/clickhouse`. 
For more information, see the [Oracle NLS_LANG FAQ](https://www.oracle.com/technetwork/products/globalization/nls-lang-099431.html). - -**Example** - -``` sql -NLS_LANG=RUSSIAN_RUSSIA.UTF8 -``` diff --git a/docs/en/faq/operations/delete-old-data.md b/docs/en/faq/operations/delete-old-data.md deleted file mode 100644 index 32fc485e98a..00000000000 --- a/docs/en/faq/operations/delete-old-data.md +++ /dev/null @@ -1,42 +0,0 @@ ---- -title: Is it possible to delete old records from a ClickHouse table? -toc_hidden: true -toc_priority: 20 ---- - -# Is It Possible to Delete Old Records from a ClickHouse Table? {#is-it-possible-to-delete-old-records-from-a-clickhouse-table} - -The short answer is “yes”. ClickHouse has multiple mechanisms that allow freeing up disk space by removing old data. Each mechanism is aimed for different scenarios. - -## TTL {#ttl} - -ClickHouse allows to automatically drop values when some condition happens. This condition is configured as an expression based on any columns, usually just static offset for any timestamp column. - -The key advantage of this approach is that it does not need any external system to trigger, once TTL is configured, data removal happens automatically in background. - -!!! note "Note" - TTL can also be used to move data not only to [/dev/null](https://en.wikipedia.org/wiki/Null_device), but also between different storage systems, like from SSD to HDD. - -More details on [configuring TTL](../../engines/table-engines/mergetree-family/mergetree.md#table_engine-mergetree-ttl). - -## ALTER DELETE {#alter-delete} - -ClickHouse does not have real-time point deletes like in [OLTP](https://en.wikipedia.org/wiki/Online_transaction_processing) databases. The closest thing to them are mutations. They are issued as `ALTER ... DELETE` or `ALTER ... UPDATE` queries to distinguish from normal `DELETE` or `UPDATE` as they are asynchronous batch operations, not immediate modifications. The rest of syntax after `ALTER TABLE` prefix is similar. - -`ALTER DELETE` can be issued to flexibly remove old data. If you need to do it regularly, the main downside will be the need to have an external system to submit the query. There are also some performance considerations since mutation rewrite complete parts even there’s only a single row to be deleted. - -This is the most common approach to make your system based on ClickHouse [GDPR](https://gdpr-info.eu)-compliant. - -More details on [mutations](../../sql-reference/statements/alter/index.md#alter-mutations). - -## DROP PARTITION {#drop-partition} - -`ALTER TABLE ... DROP PARTITION` provides a cost-efficient way to drop a whole partition. It’s not that flexible and needs proper partitioning scheme configured on table creation, but still covers most common cases. Like mutations need to be executed from an external system for regular use. - -More details on [manipulating partitions](../../sql-reference/statements/alter/partition.md#alter_drop-partition). - -## TRUNCATE {#truncate} - -It’s rather radical to drop all data from a table, but in some cases it might be exactly what you need. - -More details on [table truncation](../../sql-reference/statements/truncate.md). 
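As a minimal sketch tying the mechanisms above together (the table, column, and partition values are illustrative only):

``` sql
-- TTL: rows older than one month are dropped automatically in the background.
CREATE TABLE events
(
    event_time DateTime,
    payload String
)
ENGINE = MergeTree
PARTITION BY toYYYYMM(event_time)
ORDER BY event_time
TTL event_time + INTERVAL 1 MONTH;

-- Mutation: asynchronous batch delete that rewrites the affected parts.
ALTER TABLE events DELETE WHERE payload = '';

-- Dropping a whole partition is much cheaper than a mutation.
ALTER TABLE events DROP PARTITION 202108;
```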
diff --git a/docs/en/faq/operations/index.md b/docs/en/faq/operations/index.md deleted file mode 100644 index 81aec18b9cf..00000000000 --- a/docs/en/faq/operations/index.md +++ /dev/null @@ -1,19 +0,0 @@ ---- -title: Question about operating ClickHouse servers and clusters -toc_hidden_folder: true -toc_priority: 3 -toc_title: Operations ---- - -# Question About Operating ClickHouse Servers and Clusters {#question-about-operating-clickhouse-servers-and-clusters} - -Questions: - -- [Which ClickHouse version to use in production?](../../faq/operations/production.md) -- [Is it possible to delete old records from a ClickHouse table?](../../faq/operations/delete-old-data.md) -- [Does ClickHouse support multi-region replication?](../../faq/operations/multi-region-replication.md) - -!!! info "Don’t see what you were looking for?" - Check out [other F.A.Q. categories](../../faq/index.md) or browse around main documentation articles found in the left sidebar. - -{## [Original article](https://clickhouse.com/docs/en/faq/production/) ##} diff --git a/docs/en/faq/operations/multi-region-replication.md b/docs/en/faq/operations/multi-region-replication.md deleted file mode 100644 index 7d78737544a..00000000000 --- a/docs/en/faq/operations/multi-region-replication.md +++ /dev/null @@ -1,13 +0,0 @@ ---- -title: Does ClickHouse support multi-region replication? -toc_hidden: true -toc_priority: 30 ---- - -# Does ClickHouse support multi-region replication? {#does-clickhouse-support-multi-region-replication} - -The short answer is "yes". However, we recommend keeping latency between all regions/datacenters in two-digit range, otherwise write performance will suffer as it goes through distributed consensus protocol. For example, replication between US coasts will likely work fine, but between the US and Europe won't. - -Configuration-wise there's no difference compared to single-region replication, simply use hosts that are located in different locations for replicas. - -For more information, see [full article on data replication](../../engines/table-engines/mergetree-family/replication.md). diff --git a/docs/en/faq/operations/production.md b/docs/en/faq/operations/production.md deleted file mode 100644 index 52ca300ced0..00000000000 --- a/docs/en/faq/operations/production.md +++ /dev/null @@ -1,70 +0,0 @@ ---- -title: Which ClickHouse version to use in production? -toc_hidden: true -toc_priority: 10 ---- - -# Which ClickHouse Version to Use in Production? {#which-clickhouse-version-to-use-in-production} - -First of all, let’s discuss why people ask this question in the first place. There are two key reasons: - -1. ClickHouse is developed with pretty high velocity and usually, there are 10+ stable releases per year. It makes a wide range of releases to choose from, which is not so trivial choice. -2. Some users want to avoid spending time figuring out which version works best for their use case and just follow someone else’s advice. - -The second reason is more fundamental, so we’ll start with it and then get back to navigating through various ClickHouse releases. - -## Which ClickHouse Version Do You Recommend? {#which-clickhouse-version-do-you-recommend} - -It’s tempting to hire consultants or trust some known experts to get rid of responsibility for your production environment. You install some specific ClickHouse version that someone else recommended, now if there’s some issue with it - it’s not your fault, it’s someone else’s. This line of reasoning is a big trap. 
No external person knows better than you what’s going on in your company’s production environment. - -So how do you properly choose which ClickHouse version to upgrade to? Or how do you choose your first ClickHouse version? First of all, you need to invest in setting up a **realistic pre-production environment**. In an ideal world, it could be a completely identical shadow copy, but that’s usually expensive. - -Here are some key points for getting reasonable fidelity in a pre-production environment without very high costs: - -- The pre-production environment needs to run a set of queries as close as possible to what you intend to run in production: - - Don’t make it read-only with some frozen data. - - Don’t make it write-only by just copying data without building typical reports. - - Don’t wipe it clean instead of applying schema migrations. -- Use a sample of real production data and queries. Try to choose a sample that’s still representative and makes `SELECT` queries return reasonable results. Use obfuscation if your data is sensitive and internal policies do not allow it to leave the production environment. -- Make sure that pre-production is covered by your monitoring and alerting software the same way your production environment is. -- If your production spans multiple datacenters or regions, make sure your pre-production does the same. -- If your production uses complex features like replication, distributed tables, or cascading materialized views, make sure they are configured similarly in pre-production. -- There’s a trade-off between using roughly the same number of servers or VMs in pre-production as in production, but of a smaller size, or using far fewer of them, but of the same size. The first option might catch extra network-related issues, while the latter is easier to manage. - -The second area to invest in is **automated testing infrastructure**. Don’t assume that because some kind of query has executed successfully once, it’ll continue to do so forever. It’s ok to have some unit tests where ClickHouse is mocked, but make sure your product has a reasonable set of automated tests that are run against real ClickHouse and check that all important use cases are still working as expected. - -An extra step forward could be contributing those automated tests to [ClickHouse’s open-source test infrastructure](https://github.com/ClickHouse/ClickHouse/tree/master/tests), which is continuously used in its day-to-day development. It will definitely take some additional time and effort to learn [how to run it](../../development/tests.md) and how to adapt your tests to this framework, but it’ll pay off by ensuring that ClickHouse releases are already tested against them when they are announced stable, instead of repeatedly losing time reporting an issue after the fact and then waiting for a bugfix to be implemented, backported, and released. Some companies even have contributing such tests to the infrastructure they use as an internal policy (most notably, this is called [Beyonce’s Rule](https://www.oreilly.com/library/view/software-engineering-at/9781492082781/ch01.html#policies_that_scale_well) at Google). - -When you have your pre-production environment and testing infrastructure in place, choosing the best version is straightforward: - -1. Routinely run your automated tests against new ClickHouse releases. You can do this even for ClickHouse releases that are marked as `testing`, but going forward to the next steps with them is not recommended. -2. 
Deploy the ClickHouse release that passed the tests to pre-production and check that all processes are running as expected. -3. Report any issues you discovered to [ClickHouse GitHub Issues](https://github.com/ClickHouse/ClickHouse/issues). -4. If there were no major issues, it should be safe to start deploying ClickHouse release to your production environment. Investing in gradual release automation that implements an approach similar to [canary releases](https://martinfowler.com/bliki/CanaryRelease.html) or [green-blue deployments](https://martinfowler.com/bliki/BlueGreenDeployment.html) might further reduce the risk of issues in production. - -As you might have noticed, there’s nothing specific to ClickHouse in the approach described above, people do that for any piece of infrastructure they rely on if they take their production environment seriously. - -## How to Choose Between ClickHouse Releases? {#how-to-choose-between-clickhouse-releases} - -If you look into contents of ClickHouse package repository, you’ll see four kinds of packages: - -1. `testing` -2. `prestable` -3. `stable` -4. `lts` (long-term support) - -As was mentioned earlier, `testing` is good mostly to notice issues early, running them in production is not recommended because each of them is not tested as thoroughly as other kinds of packages. - -`prestable` is a release candidate which generally looks promising and is likely to become announced as `stable` soon. You can try them out in pre-production and report issues if you see any. - -For production use, there are two key options: `stable` and `lts`. Here is some guidance on how to choose between them: - -- `stable` is the kind of package we recommend by default. They are released roughly monthly (and thus provide new features with reasonable delay) and three latest stable releases are supported in terms of diagnostics and backporting of bugfixes. -- `lts` are released twice a year and are supported for a year after their initial release. You might prefer them over `stable` in the following cases: - - Your company has some internal policies that do not allow for frequent upgrades or using non-LTS software. - - You are using ClickHouse in some secondary products that either does not require any complex ClickHouse features and do not have enough resources to keep it updated. - -Many teams who initially thought that `lts` is the way to go, often switch to `stable` anyway because of some recent feature that’s important for their product. - -!!! warning "Important" - One more thing to keep in mind when upgrading ClickHouse: we’re always keeping eye on compatibility across releases, but sometimes it’s not reasonable to keep and some minor details might change. So make sure you check the [changelog](../../whats-new/changelog/index.md) before upgrading to see if there are any notes about backward-incompatible changes. diff --git a/docs/en/faq/use-cases/index.md b/docs/en/faq/use-cases/index.md deleted file mode 100644 index aac5493b105..00000000000 --- a/docs/en/faq/use-cases/index.md +++ /dev/null @@ -1,18 +0,0 @@ ---- -title: Questions about ClickHouse use cases -toc_hidden_folder: true -toc_priority: 2 -toc_title: Use Cases ---- - -# Questions About ClickHouse Use Cases {#questions-about-clickhouse-use-cases} - -Questions: - -- [Can I use ClickHouse as a time-series database?](../../faq/use-cases/time-series.md) -- [Can I use ClickHouse as a key-value storage?](../../faq/use-cases/key-value.md) - -!!! info "Don’t see what you were looking for?" - Check out [other F.A.Q. 
categories](../../faq/index.md) or browse around main documentation articles found in the left sidebar. - -{## [Original article](https://clickhouse.com/docs/en/faq/use-cases/) ##} diff --git a/docs/en/faq/use-cases/key-value.md b/docs/en/faq/use-cases/key-value.md deleted file mode 100644 index 2827dd2fa58..00000000000 --- a/docs/en/faq/use-cases/key-value.md +++ /dev/null @@ -1,17 +0,0 @@ ---- -title: Can I use ClickHouse as a key-value storage? -toc_hidden: true -toc_priority: 101 ---- - -# Can I Use ClickHouse As a Key-Value Storage? {#can-i-use-clickhouse-as-a-key-value-storage} - -The short answer is **“no”**. The key-value workload is near the top of the list of cases when **NOT**{.text-danger} to use ClickHouse. It’s an [OLAP](../../faq/general/olap.md) system after all, while there are many excellent key-value storage systems out there. - -However, there might be situations where it still makes sense to use ClickHouse for key-value-like queries. Usually, these are low-budget products where the main workload is analytical in nature and fits ClickHouse well, but there is also a secondary process that needs a key-value pattern with modest request throughput and without strict latency requirements. If you had an unlimited budget, you would have installed a secondary key-value database for this secondary workload, but in reality, there’s the additional cost of maintaining one more storage system (monitoring, backups, etc.) which might be desirable to avoid. - -If you decide to go against the recommendations and run some key-value-like queries against ClickHouse, here are some tips: - -- The key reason why point queries are expensive in ClickHouse is the sparse primary index of the main [MergeTree table engine family](../../engines/table-engines/mergetree-family/mergetree.md). This index can’t point to each specific row of data; instead, it points to every N-th row, and the system has to scan from the neighboring N-th row to the desired one, reading excessive data along the way. In a key-value scenario, it might be useful to reduce the value of N with the `index_granularity` setting. -- ClickHouse keeps each column in a separate set of files, so to assemble one complete row it needs to go through each of those files. Their count increases linearly with the number of columns, so in the key-value scenario, it might be worth avoiding many columns and putting all your payload in a single `String` column encoded in some serialization format like JSON, Protobuf, or whatever makes sense. -- There’s an alternative approach that uses the [Join](../../engines/table-engines/special/join.md) table engine instead of normal `MergeTree` tables and the [joinGet](../../sql-reference/functions/other-functions.md#joinget) function to retrieve the data. It can provide better query performance but might have some usability and reliability issues. Here’s a [usage example](https://github.com/ClickHouse/ClickHouse/blob/master/tests/queries/0_stateless/00800_versatile_storage_join.sql#L49-L51). diff --git a/docs/en/faq/use-cases/time-series.md b/docs/en/faq/use-cases/time-series.md deleted file mode 100644 index bf97ac4b1e2..00000000000 --- a/docs/en/faq/use-cases/time-series.md +++ /dev/null @@ -1,15 +0,0 @@ ---- -title: Can I use ClickHouse as a time-series database? -toc_hidden: true -toc_priority: 101 ---- - -# Can I Use ClickHouse As a Time-Series Database? 
{#can-i-use-clickhouse-as-a-time-series-database} - -ClickHouse is a generic data storage solution for [OLAP](../../faq/general/olap.md) workloads, while there are many specialized time-series database management systems. Nevertheless, ClickHouse’s [focus on query execution speed](../../faq/general/why-clickhouse-is-so-fast.md) allows it to outperform specialized systems in many cases. There are many independent benchmarks on this topic out there, so we’re not going to conduct one here. Instead, let’s focus on ClickHouse features that are important to use if that’s your use case. - -First of all, there are **[specialized codecs](../../sql-reference/statements/create/table.md#create-query-specialized-codecs)** which make typical time-series data much more compact: either common algorithms like `DoubleDelta` and `Gorilla`, or codecs specific to ClickHouse like `T64`. - -Second, time-series queries often hit only recent data, like one day or one week old. It makes sense to use servers that have both fast NVMe/SSD drives and high-capacity HDD drives. The ClickHouse [TTL](../../engines/table-engines/mergetree-family/mergetree.md#table_engine-mergetree-multiple-volumes) feature allows you to keep fresh hot data on fast drives and gradually move it to slower drives as it ages. Rollup or removal of even older data is also possible if your requirements demand it. - -Even though it goes against the ClickHouse philosophy of storing and processing raw data, you can use [materialized views](../../sql-reference/statements/create/view.md) to fit even tighter latency or cost requirements. diff --git a/docs/en/getting-started/example-datasets/index.md b/docs/en/getting-started/example-datasets/index.md deleted file mode 100644 index d4c9bab2441..00000000000 --- a/docs/en/getting-started/example-datasets/index.md +++ /dev/null @@ -1,28 +0,0 @@ ---- -toc_folder_title: Example Datasets -toc_priority: 10 -toc_title: Introduction ---- - -# Example Datasets {#example-datasets} - -This section describes how to obtain example datasets and import them into ClickHouse. For some datasets, example queries are also available. 
- -The list of documented datasets: - -- [GitHub Events](../../getting-started/example-datasets/github-events.md) -- [Anonymized Web Analytics Dataset](../../getting-started/example-datasets/metrica.md) -- [Recipes](../../getting-started/example-datasets/recipes.md) -- [Star Schema Benchmark](../../getting-started/example-datasets/star-schema.md) -- [WikiStat](../../getting-started/example-datasets/wikistat.md) -- [Terabyte of Click Logs from Criteo](../../getting-started/example-datasets/criteo.md) -- [AMPLab Big Data Benchmark](../../getting-started/example-datasets/amplab-benchmark.md) -- [Brown University Benchmark](../../getting-started/example-datasets/brown-benchmark.md) -- [New York Taxi Data](../../getting-started/example-datasets/nyc-taxi.md) -- [OpenSky](../../getting-started/example-datasets/opensky.md) -- [UK Property Price Paid](../../getting-started/example-datasets/uk-price-paid.md) -- [Cell Towers](../../getting-started/example-datasets/cell-towers.md) -- [What's on the Menu?](../../getting-started/example-datasets/menus.md) -- [OnTime](../../getting-started/example-datasets/ontime.md) - -[Original article](https://clickhouse.com/docs/en/getting_started/example_datasets) diff --git a/docs/en/getting-started/index.md b/docs/en/getting-started/index.md deleted file mode 100644 index 372e8d7bd64..00000000000 --- a/docs/en/getting-started/index.md +++ /dev/null @@ -1,15 +0,0 @@ ---- -toc_folder_title: Getting Started -toc_hidden: true -toc_priority: 8 -toc_title: hidden ---- - -# Getting Started {#getting-started} - -If you are new to ClickHouse and want to get a hands-on feeling of its performance, first of all, you need to go through the [installation process](../getting-started/install.md). After that you can: - -- [Go through detailed tutorial](../getting-started/tutorial.md) -- [Experiment with example datasets](../getting-started/example-datasets/ontime.md) - -[Original article](https://clickhouse.com/docs/en/getting_started/) diff --git a/docs/en/getting-started/playground.md b/docs/en/getting-started/playground.md deleted file mode 100644 index 6c44f250242..00000000000 --- a/docs/en/getting-started/playground.md +++ /dev/null @@ -1,59 +0,0 @@ ---- -toc_priority: 14 -toc_title: Playground ---- - -# ClickHouse Playground {#clickhouse-playground} - -!!! warning "Warning" - This service is deprecated and will be replaced in foreseeable future. - -[ClickHouse Playground](https://play.clickhouse.com) allows people to experiment with ClickHouse by running queries instantly, without setting up their server or cluster. -Several example datasets are available in Playground as well as sample queries that show ClickHouse features. There’s also a selection of ClickHouse LTS releases to experiment with. - -You can make queries to Playground using any HTTP client, for example [curl](https://curl.haxx.se) or [wget](https://www.gnu.org/software/wget/), or set up a connection using [JDBC](../interfaces/jdbc.md) or [ODBC](../interfaces/odbc.md) drivers. More information about software products that support ClickHouse is available [here](../interfaces/index.md). 
- -## Credentials {#credentials} - -| Parameter | Value | -|:--------------------|:----------------------------------------| -| HTTPS endpoint | `https://play-api.clickhouse.com:8443` | -| Native TCP endpoint | `play-api.clickhouse.com:9440` | -| User | `playground` | -| Password | `clickhouse` | - -There are additional endpoints with specific ClickHouse releases to experiment with their differences (ports and user/password are the same as above): - -- 20.3 LTS: `play-api-v20-3.clickhouse.com` -- 19.14 LTS: `play-api-v19-14.clickhouse.com` - -!!! note "Note" - All these endpoints require a secure TLS connection. - -## Limitations {#limitations} - -The queries are executed as a read-only user. It implies some limitations: - -- DDL queries are not allowed -- INSERT queries are not allowed - -The following settings are also enforced: - -- [max_result_bytes=10485760](../operations/settings/query-complexity/#max-result-bytes) -- [max_result_rows=2000](../operations/settings/query-complexity/#setting-max_result_rows) -- [result_overflow_mode=break](../operations/settings/query-complexity/#result-overflow-mode) -- [max_execution_time=60000](../operations/settings/query-complexity/#max-execution-time) - -## Examples {#examples} - -HTTPS endpoint example with `curl`: - -``` bash -curl "https://play-api.clickhouse.com:8443/?query=SELECT+'Play+ClickHouse\!';&user=playground&password=clickhouse&database=datasets" -``` - -TCP endpoint example with [CLI](../interfaces/cli.md): - -``` bash -clickhouse client --secure -h play-api.clickhouse.com --port 9440 -u playground --password clickhouse -q "SELECT 'Play ClickHouse\!'" -``` diff --git a/docs/en/getting-started/tutorial.md b/docs/en/getting-started/tutorial.md deleted file mode 100644 index 9f43cc8769d..00000000000 --- a/docs/en/getting-started/tutorial.md +++ /dev/null @@ -1,662 +0,0 @@ ---- -toc_priority: 12 -toc_title: Tutorial ---- - -# ClickHouse Tutorial {#clickhouse-tutorial} - -## What to Expect from This Tutorial? {#what-to-expect-from-this-tutorial} - -By going through this tutorial, you’ll learn how to set up a simple ClickHouse cluster. It’ll be small, but fault-tolerant and scalable. Then we will use one of the example datasets to fill it with data and execute some demo queries. - -## Single Node Setup {#single-node-setup} - -To postpone the complexities of a distributed environment, we’ll start with deploying ClickHouse on a single server or virtual machine. ClickHouse is usually installed from [deb](../getting-started/install.md#install-from-deb-packages) or [rpm](../getting-started/install.md#from-rpm-packages) packages, but there are [alternatives](../getting-started/install.md#from-docker-image) for the operating systems that do not support them. - -For example, you have chosen `deb` packages and executed: - -``` bash -{% include 'install/deb.sh' %} -``` - -What do we have in the packages that got installed: - -- `clickhouse-client` package contains [clickhouse-client](../interfaces/cli.md) application, interactive ClickHouse console client. -- `clickhouse-common` package contains a ClickHouse executable file. -- `clickhouse-server` package contains configuration files to run ClickHouse as a server. - -Server config files are located in `/etc/clickhouse-server/`. Before going further, please notice the `` element in `config.xml`. Path determines the location for data storage, so it should be located on volume with large disk capacity; the default value is `/var/lib/clickhouse/`. 
If you want to adjust the configuration, it’s not handy to directly edit `config.xml` file, considering it might get rewritten on future package updates. The recommended way to override the config elements is to create [files in config.d directory](../operations/configuration-files.md) which serve as “patches” to config.xml. - -As you might have noticed, `clickhouse-server` is not launched automatically after package installation. It won’t be automatically restarted after updates, either. The way you start the server depends on your init system, usually, it is: - -``` bash -sudo service clickhouse-server start -``` - -or - -``` bash -sudo /etc/init.d/clickhouse-server start -``` - -The default location for server logs is `/var/log/clickhouse-server/`. The server is ready to handle client connections once it logs the `Ready for connections` message. - -Once the `clickhouse-server` is up and running, we can use `clickhouse-client` to connect to the server and run some test queries like `SELECT "Hello, world!";`. - -
- -Quick tips for clickhouse-client - -Interactive mode: - -``` bash -clickhouse-client -clickhouse-client --host=... --port=... --user=... --password=... -``` - -Enable multiline queries: - -``` bash -clickhouse-client -m -clickhouse-client --multiline -``` - -Run queries in batch-mode: - -``` bash -clickhouse-client --query='SELECT 1' -echo 'SELECT 1' | clickhouse-client -clickhouse-client <<< 'SELECT 1' -``` - -Insert data from a file in specified format: - -``` bash -clickhouse-client --query='INSERT INTO table VALUES' < data.txt -clickhouse-client --query='INSERT INTO table FORMAT TabSeparated' < data.tsv -``` - -
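Returning to the configuration note above: a hypothetical sketch of a config.d “patch” file that overrides the data path without editing config.xml (the file name and path are illustrative; the root element is `<clickhouse>` in recent releases and `<yandex>` in older ones):

``` xml
<!-- /etc/clickhouse-server/config.d/data-path.xml (hypothetical override file) -->
<clickhouse>
    <path>/mnt/bigdisk/clickhouse/</path>
</clickhouse>
```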
- -## Import Sample Dataset {#import-sample-dataset} - -Now it’s time to fill our ClickHouse server with some sample data. In this tutorial, we’ll use some anonymized web analytics data. There are [multiple ways to import the dataset](../getting-started/example-datasets/metrica.md), and for the sake of the tutorial, we’ll go with the most realistic one. - -### Download and Extract Table Data {#download-and-extract-table-data} - -``` bash -curl https://datasets.clickhouse.com/hits/tsv/hits_v1.tsv.xz | unxz --threads=`nproc` > hits_v1.tsv -curl https://datasets.clickhouse.com/visits/tsv/visits_v1.tsv.xz | unxz --threads=`nproc` > visits_v1.tsv -``` - -The extracted files are about 10GB in size. - -### Create Tables {#create-tables} - -As in most databases management systems, ClickHouse logically groups tables into “databases”. There’s a `default` database, but we’ll create a new one named `tutorial`: - -``` bash -clickhouse-client --query "CREATE DATABASE IF NOT EXISTS tutorial" -``` - -Syntax for creating tables is way more complicated compared to databases (see [reference](../sql-reference/statements/create/table.md). In general `CREATE TABLE` statement has to specify three key things: - -1. Name of table to create. -2. Table schema, i.e. list of columns and their [data types](../sql-reference/data-types/index.md). -3. [Table engine](../engines/table-engines/index.md) and its settings, which determines all the details on how queries to this table will be physically executed. - -There are two tables to create: - -- `hits` is a table with each action done by all users on all websites covered by the service. -- `visits` is a table that contains pre-built sessions instead of individual actions. - -Let’s see and execute the real create table queries for these tables: - -``` sql -CREATE TABLE tutorial.hits_v1 -( - `WatchID` UInt64, - `JavaEnable` UInt8, - `Title` String, - `GoodEvent` Int16, - `EventTime` DateTime, - `EventDate` Date, - `CounterID` UInt32, - `ClientIP` UInt32, - `ClientIP6` FixedString(16), - `RegionID` UInt32, - `UserID` UInt64, - `CounterClass` Int8, - `OS` UInt8, - `UserAgent` UInt8, - `URL` String, - `Referer` String, - `URLDomain` String, - `RefererDomain` String, - `Refresh` UInt8, - `IsRobot` UInt8, - `RefererCategories` Array(UInt16), - `URLCategories` Array(UInt16), - `URLRegions` Array(UInt32), - `RefererRegions` Array(UInt32), - `ResolutionWidth` UInt16, - `ResolutionHeight` UInt16, - `ResolutionDepth` UInt8, - `FlashMajor` UInt8, - `FlashMinor` UInt8, - `FlashMinor2` String, - `NetMajor` UInt8, - `NetMinor` UInt8, - `UserAgentMajor` UInt16, - `UserAgentMinor` FixedString(2), - `CookieEnable` UInt8, - `JavascriptEnable` UInt8, - `IsMobile` UInt8, - `MobilePhone` UInt8, - `MobilePhoneModel` String, - `Params` String, - `IPNetworkID` UInt32, - `TraficSourceID` Int8, - `SearchEngineID` UInt16, - `SearchPhrase` String, - `AdvEngineID` UInt8, - `IsArtifical` UInt8, - `WindowClientWidth` UInt16, - `WindowClientHeight` UInt16, - `ClientTimeZone` Int16, - `ClientEventTime` DateTime, - `SilverlightVersion1` UInt8, - `SilverlightVersion2` UInt8, - `SilverlightVersion3` UInt32, - `SilverlightVersion4` UInt16, - `PageCharset` String, - `CodeVersion` UInt32, - `IsLink` UInt8, - `IsDownload` UInt8, - `IsNotBounce` UInt8, - `FUniqID` UInt64, - `HID` UInt32, - `IsOldCounter` UInt8, - `IsEvent` UInt8, - `IsParameter` UInt8, - `DontCountHits` UInt8, - `WithHash` UInt8, - `HitColor` FixedString(1), - `UTCEventTime` DateTime, - `Age` UInt8, - `Sex` UInt8, - `Income` UInt8, - `Interests` 
UInt16, - `Robotness` UInt8, - `GeneralInterests` Array(UInt16), - `RemoteIP` UInt32, - `RemoteIP6` FixedString(16), - `WindowName` Int32, - `OpenerName` Int32, - `HistoryLength` Int16, - `BrowserLanguage` FixedString(2), - `BrowserCountry` FixedString(2), - `SocialNetwork` String, - `SocialAction` String, - `HTTPError` UInt16, - `SendTiming` Int32, - `DNSTiming` Int32, - `ConnectTiming` Int32, - `ResponseStartTiming` Int32, - `ResponseEndTiming` Int32, - `FetchTiming` Int32, - `RedirectTiming` Int32, - `DOMInteractiveTiming` Int32, - `DOMContentLoadedTiming` Int32, - `DOMCompleteTiming` Int32, - `LoadEventStartTiming` Int32, - `LoadEventEndTiming` Int32, - `NSToDOMContentLoadedTiming` Int32, - `FirstPaintTiming` Int32, - `RedirectCount` Int8, - `SocialSourceNetworkID` UInt8, - `SocialSourcePage` String, - `ParamPrice` Int64, - `ParamOrderID` String, - `ParamCurrency` FixedString(3), - `ParamCurrencyID` UInt16, - `GoalsReached` Array(UInt32), - `OpenstatServiceName` String, - `OpenstatCampaignID` String, - `OpenstatAdID` String, - `OpenstatSourceID` String, - `UTMSource` String, - `UTMMedium` String, - `UTMCampaign` String, - `UTMContent` String, - `UTMTerm` String, - `FromTag` String, - `HasGCLID` UInt8, - `RefererHash` UInt64, - `URLHash` UInt64, - `CLID` UInt32, - `YCLID` UInt64, - `ShareService` String, - `ShareURL` String, - `ShareTitle` String, - `ParsedParams` Nested( - Key1 String, - Key2 String, - Key3 String, - Key4 String, - Key5 String, - ValueDouble Float64), - `IslandID` FixedString(16), - `RequestNum` UInt32, - `RequestTry` UInt8 -) -ENGINE = MergeTree() -PARTITION BY toYYYYMM(EventDate) -ORDER BY (CounterID, EventDate, intHash32(UserID)) -SAMPLE BY intHash32(UserID) -``` - -``` sql -CREATE TABLE tutorial.visits_v1 -( - `CounterID` UInt32, - `StartDate` Date, - `Sign` Int8, - `IsNew` UInt8, - `VisitID` UInt64, - `UserID` UInt64, - `StartTime` DateTime, - `Duration` UInt32, - `UTCStartTime` DateTime, - `PageViews` Int32, - `Hits` Int32, - `IsBounce` UInt8, - `Referer` String, - `StartURL` String, - `RefererDomain` String, - `StartURLDomain` String, - `EndURL` String, - `LinkURL` String, - `IsDownload` UInt8, - `TraficSourceID` Int8, - `SearchEngineID` UInt16, - `SearchPhrase` String, - `AdvEngineID` UInt8, - `PlaceID` Int32, - `RefererCategories` Array(UInt16), - `URLCategories` Array(UInt16), - `URLRegions` Array(UInt32), - `RefererRegions` Array(UInt32), - `IsYandex` UInt8, - `GoalReachesDepth` Int32, - `GoalReachesURL` Int32, - `GoalReachesAny` Int32, - `SocialSourceNetworkID` UInt8, - `SocialSourcePage` String, - `MobilePhoneModel` String, - `ClientEventTime` DateTime, - `RegionID` UInt32, - `ClientIP` UInt32, - `ClientIP6` FixedString(16), - `RemoteIP` UInt32, - `RemoteIP6` FixedString(16), - `IPNetworkID` UInt32, - `SilverlightVersion3` UInt32, - `CodeVersion` UInt32, - `ResolutionWidth` UInt16, - `ResolutionHeight` UInt16, - `UserAgentMajor` UInt16, - `UserAgentMinor` UInt16, - `WindowClientWidth` UInt16, - `WindowClientHeight` UInt16, - `SilverlightVersion2` UInt8, - `SilverlightVersion4` UInt16, - `FlashVersion3` UInt16, - `FlashVersion4` UInt16, - `ClientTimeZone` Int16, - `OS` UInt8, - `UserAgent` UInt8, - `ResolutionDepth` UInt8, - `FlashMajor` UInt8, - `FlashMinor` UInt8, - `NetMajor` UInt8, - `NetMinor` UInt8, - `MobilePhone` UInt8, - `SilverlightVersion1` UInt8, - `Age` UInt8, - `Sex` UInt8, - `Income` UInt8, - `JavaEnable` UInt8, - `CookieEnable` UInt8, - `JavascriptEnable` UInt8, - `IsMobile` UInt8, - `BrowserLanguage` UInt16, - `BrowserCountry` UInt16, - 
`Interests` UInt16, - `Robotness` UInt8, - `GeneralInterests` Array(UInt16), - `Params` Array(String), - `Goals` Nested( - ID UInt32, - Serial UInt32, - EventTime DateTime, - Price Int64, - OrderID String, - CurrencyID UInt32), - `WatchIDs` Array(UInt64), - `ParamSumPrice` Int64, - `ParamCurrency` FixedString(3), - `ParamCurrencyID` UInt16, - `ClickLogID` UInt64, - `ClickEventID` Int32, - `ClickGoodEvent` Int32, - `ClickEventTime` DateTime, - `ClickPriorityID` Int32, - `ClickPhraseID` Int32, - `ClickPageID` Int32, - `ClickPlaceID` Int32, - `ClickTypeID` Int32, - `ClickResourceID` Int32, - `ClickCost` UInt32, - `ClickClientIP` UInt32, - `ClickDomainID` UInt32, - `ClickURL` String, - `ClickAttempt` UInt8, - `ClickOrderID` UInt32, - `ClickBannerID` UInt32, - `ClickMarketCategoryID` UInt32, - `ClickMarketPP` UInt32, - `ClickMarketCategoryName` String, - `ClickMarketPPName` String, - `ClickAWAPSCampaignName` String, - `ClickPageName` String, - `ClickTargetType` UInt16, - `ClickTargetPhraseID` UInt64, - `ClickContextType` UInt8, - `ClickSelectType` Int8, - `ClickOptions` String, - `ClickGroupBannerID` Int32, - `OpenstatServiceName` String, - `OpenstatCampaignID` String, - `OpenstatAdID` String, - `OpenstatSourceID` String, - `UTMSource` String, - `UTMMedium` String, - `UTMCampaign` String, - `UTMContent` String, - `UTMTerm` String, - `FromTag` String, - `HasGCLID` UInt8, - `FirstVisit` DateTime, - `PredLastVisit` Date, - `LastVisit` Date, - `TotalVisits` UInt32, - `TraficSource` Nested( - ID Int8, - SearchEngineID UInt16, - AdvEngineID UInt8, - PlaceID UInt16, - SocialSourceNetworkID UInt8, - Domain String, - SearchPhrase String, - SocialSourcePage String), - `Attendance` FixedString(16), - `CLID` UInt32, - `YCLID` UInt64, - `NormalizedRefererHash` UInt64, - `SearchPhraseHash` UInt64, - `RefererDomainHash` UInt64, - `NormalizedStartURLHash` UInt64, - `StartURLDomainHash` UInt64, - `NormalizedEndURLHash` UInt64, - `TopLevelDomain` UInt64, - `URLScheme` UInt64, - `OpenstatServiceNameHash` UInt64, - `OpenstatCampaignIDHash` UInt64, - `OpenstatAdIDHash` UInt64, - `OpenstatSourceIDHash` UInt64, - `UTMSourceHash` UInt64, - `UTMMediumHash` UInt64, - `UTMCampaignHash` UInt64, - `UTMContentHash` UInt64, - `UTMTermHash` UInt64, - `FromHash` UInt64, - `WebVisorEnabled` UInt8, - `WebVisorActivity` UInt32, - `ParsedParams` Nested( - Key1 String, - Key2 String, - Key3 String, - Key4 String, - Key5 String, - ValueDouble Float64), - `Market` Nested( - Type UInt8, - GoalID UInt32, - OrderID String, - OrderPrice Int64, - PP UInt32, - DirectPlaceID UInt32, - DirectOrderID UInt32, - DirectBannerID UInt32, - GoodID String, - GoodName String, - GoodQuantity Int32, - GoodPrice Int64), - `IslandID` FixedString(16) -) -ENGINE = CollapsingMergeTree(Sign) -PARTITION BY toYYYYMM(StartDate) -ORDER BY (CounterID, StartDate, intHash32(UserID), VisitID) -SAMPLE BY intHash32(UserID) -``` - -You can execute those queries using the interactive mode of `clickhouse-client` (just launch it in a terminal without specifying a query in advance) or try some [alternative interface](../interfaces/index.md) if you want. - -As we can see, `hits_v1` uses the [basic MergeTree engine](../engines/table-engines/mergetree-family/mergetree.md), while the `visits_v1` uses the [Collapsing](../engines/table-engines/mergetree-family/collapsingmergetree.md) variant. - -### Import Data {#import-data} - -Data import to ClickHouse is done via [INSERT INTO](../sql-reference/statements/insert-into.md) query like in many other SQL databases. 
However, data is usually provided in one of the [supported serialization formats](../interfaces/formats.md) instead of `VALUES` clause (which is also supported). - -The files we downloaded earlier are in tab-separated format, so here’s how to import them via console client: - -``` bash -clickhouse-client --query "INSERT INTO tutorial.hits_v1 FORMAT TSV" --max_insert_block_size=100000 < hits_v1.tsv -clickhouse-client --query "INSERT INTO tutorial.visits_v1 FORMAT TSV" --max_insert_block_size=100000 < visits_v1.tsv -``` - -ClickHouse has a lot of [settings to tune](../operations/settings/index.md) and one way to specify them in console client is via arguments, as we can see with `--max_insert_block_size`. The easiest way to figure out what settings are available, what do they mean and what the defaults are is to query the `system.settings` table: - -``` sql -SELECT name, value, changed, description -FROM system.settings -WHERE name LIKE '%max_insert_b%' -FORMAT TSV - -max_insert_block_size 1048576 0 "The maximum block size for insertion, if we control the creation of blocks for insertion." -``` - -Optionally you can [OPTIMIZE](../sql-reference/statements/optimize.md) the tables after import. Tables that are configured with an engine from MergeTree-family always do merges of data parts in the background to optimize data storage (or at least check if it makes sense). These queries force the table engine to do storage optimization right now instead of some time later: - -``` bash -clickhouse-client --query "OPTIMIZE TABLE tutorial.hits_v1 FINAL" -clickhouse-client --query "OPTIMIZE TABLE tutorial.visits_v1 FINAL" -``` - -These queries start an I/O and CPU intensive operation, so if the table consistently receives new data, it’s better to leave it alone and let merges run in the background. - -Now we can check if the table import was successful: - -``` bash -clickhouse-client --query "SELECT COUNT(*) FROM tutorial.hits_v1" -clickhouse-client --query "SELECT COUNT(*) FROM tutorial.visits_v1" -``` - -## Example Queries {#example-queries} - -``` sql -SELECT - StartURL AS URL, - AVG(Duration) AS AvgDuration -FROM tutorial.visits_v1 -WHERE StartDate BETWEEN '2014-03-23' AND '2014-03-30' -GROUP BY URL -ORDER BY AvgDuration DESC -LIMIT 10 -``` - -``` sql -SELECT - sum(Sign) AS visits, - sumIf(Sign, has(Goals.ID, 1105530)) AS goal_visits, - (100. * goal_visits) / visits AS goal_percent -FROM tutorial.visits_v1 -WHERE (CounterID = 912887) AND (toYYYYMM(StartDate) = 201403) AND (domain(StartURL) = 'yandex.ru') -``` - -## Cluster Deployment {#cluster-deployment} - -ClickHouse cluster is a homogenous cluster. Steps to set up: - -1. Install ClickHouse server on all machines of the cluster -2. Set up cluster configs in configuration files -3. Create local tables on each instance -4. Create a [Distributed table](../engines/table-engines/special/distributed.md) - -[Distributed table](../engines/table-engines/special/distributed.md) is actually a kind of “view” to local tables of ClickHouse cluster. SELECT query from a distributed table executes using resources of all cluster’s shards. You may specify configs for multiple clusters and create multiple distributed tables providing views to different clusters. 
- -Example config for a cluster with three shards, one replica each: - -``` xml - - - - - example-perftest01j - 9000 - - - - - example-perftest02j - 9000 - - - - - example-perftest03j - 9000 - - - - -``` - -For further demonstration, let’s create a new local table with the same `CREATE TABLE` query that we used for `hits_v1`, but different table name: - -``` sql -CREATE TABLE tutorial.hits_local (...) ENGINE = MergeTree() ... -``` - -Creating a distributed table providing a view into local tables of the cluster: - -``` sql -CREATE TABLE tutorial.hits_all AS tutorial.hits_local -ENGINE = Distributed(perftest_3shards_1replicas, tutorial, hits_local, rand()); -``` - -A common practice is to create similar Distributed tables on all machines of the cluster. It allows running distributed queries on any machine of the cluster. Also there’s an alternative option to create temporary distributed table for a given SELECT query using [remote](../sql-reference/table-functions/remote.md) table function. - -Let’s run [INSERT SELECT](../sql-reference/statements/insert-into.md) into the Distributed table to spread the table to multiple servers. - -``` sql -INSERT INTO tutorial.hits_all SELECT * FROM tutorial.hits_v1; -``` - -!!! warning "Notice" - This approach is not suitable for the sharding of large tables. There’s a separate tool [clickhouse-copier](../operations/utilities/clickhouse-copier.md) that can re-shard arbitrary large tables. - -As you could expect, computationally heavy queries run N times faster if they utilize 3 servers instead of one. - -In this case, we have used a cluster with 3 shards, and each contains a single replica. - -To provide resilience in a production environment, we recommend that each shard should contain 2-3 replicas spread between multiple availability zones or datacenters (or at least racks). Note that ClickHouse supports an unlimited number of replicas. - -Example config for a cluster of one shard containing three replicas: - -``` xml - - ... - - - - example-perftest01j - 9000 - - - example-perftest02j - 9000 - - - example-perftest03j - 9000 - - - - -``` - -To enable native replication [ZooKeeper](http://zookeeper.apache.org/) is required. ClickHouse takes care of data consistency on all replicas and runs restore procedure after failure automatically. It’s recommended to deploy the ZooKeeper cluster on separate servers (where no other processes including ClickHouse are running). - -!!! note "Note" - ZooKeeper is not a strict requirement: in some simple cases, you can duplicate the data by writing it into all the replicas from your application code. This approach is **not** recommended, in this case, ClickHouse won’t be able to guarantee data consistency on all replicas. Thus it becomes the responsibility of your application. - -ZooKeeper locations are specified in the configuration file: - -``` xml - - - zoo01 - 2181 - - - zoo02 - 2181 - - - zoo03 - 2181 - - -``` - -Also, we need to set macros for identifying each shard and replica which are used on table creation: - -``` xml - - 01 - 01 - -``` - -If there are no replicas at the moment on replicated table creation, a new first replica is instantiated. If there are already live replicas, the new replica clones data from existing ones. You have an option to create all replicated tables first, and then insert data to it. Another option is to create some replicas and add the others after or during data insertion. - -``` sql -CREATE TABLE tutorial.hits_replica (...) 
-ENGINE = ReplicatedMergeTree( - '/clickhouse_perftest/tables/{shard}/hits', - '{replica}' -) -... -``` - -Here we use [ReplicatedMergeTree](../engines/table-engines/mergetree-family/replication.md) table engine. In parameters we specify ZooKeeper path containing shard and replica identifiers. - -``` sql -INSERT INTO tutorial.hits_replica SELECT * FROM tutorial.hits_local; -``` - -Replication operates in multi-master mode. Data can be loaded into any replica, and the system then syncs it with other instances automatically. Replication is asynchronous so at a given moment, not all replicas may contain recently inserted data. At least one replica should be up to allow data ingestion. Others will sync up data and repair consistency once they will become active again. Note that this approach allows for the low possibility of a loss of recently inserted data. - -[Original article](https://clickhouse.com/docs/en/getting_started/tutorial/) diff --git a/docs/en/guides/apply-catboost-model.md b/docs/en/guides/apply-catboost-model.md deleted file mode 100644 index 859703a31df..00000000000 --- a/docs/en/guides/apply-catboost-model.md +++ /dev/null @@ -1,242 +0,0 @@ ---- -toc_priority: 41 -toc_title: Applying CatBoost Models ---- - -# Applying a Catboost Model in ClickHouse {#applying-catboost-model-in-clickhouse} - -[CatBoost](https://catboost.ai) is a free and open-source gradient boosting library developed at Yandex for machine learning. - -With this instruction, you will learn to apply pre-trained models in ClickHouse by running model inference from SQL. - -To apply a CatBoost model in ClickHouse: - -1. [Create a Table](#create-table). -2. [Insert the Data to the Table](#insert-data-to-table). -3. [Integrate CatBoost into ClickHouse](#integrate-catboost-into-clickhouse) (Optional step). -4. [Run the Model Inference from SQL](#run-model-inference). - -For more information about training CatBoost models, see [Training and applying models](https://catboost.ai/docs/features/training.html#training). - -You can reload CatBoost models if the configuration was updated without restarting the server using [RELOAD MODEL](../sql-reference/statements/system.md#query_language-system-reload-model) and [RELOAD MODELS](../sql-reference/statements/system.md#query_language-system-reload-models) system queries. - -## Prerequisites {#prerequisites} - -If you do not have the [Docker](https://docs.docker.com/install/) yet, install it. - -!!! note "Note" - [Docker](https://www.docker.com) is a software platform that allows you to create containers that isolate a CatBoost and ClickHouse installation from the rest of the system. - -Before applying a CatBoost model: - -**1.** Pull the [Docker image](https://hub.docker.com/r/yandex/tutorial-catboost-clickhouse) from the registry: - -``` bash -$ docker pull yandex/tutorial-catboost-clickhouse -``` - -This Docker image contains everything you need to run CatBoost and ClickHouse: code, runtime, libraries, environment variables, and configuration files. - -**2.** Make sure the Docker image has been successfully pulled: - -``` bash -$ docker image ls -REPOSITORY TAG IMAGE ID CREATED SIZE -yandex/tutorial-catboost-clickhouse latest 622e4d17945b 22 hours ago 1.37GB -``` - -**3.** Start a Docker container based on this image: - -``` bash -$ docker run -it -p 8888:8888 yandex/tutorial-catboost-clickhouse -``` - -## 1. 
Create a Table {#create-table} - -To create a ClickHouse table for the training sample: - -**1.** Start ClickHouse console client in the interactive mode: - -``` bash -$ clickhouse client -``` - -!!! note "Note" - The ClickHouse server is already running inside the Docker container. - -**2.** Create the table using the command: - -``` sql -:) CREATE TABLE amazon_train -( - date Date MATERIALIZED today(), - ACTION UInt8, - RESOURCE UInt32, - MGR_ID UInt32, - ROLE_ROLLUP_1 UInt32, - ROLE_ROLLUP_2 UInt32, - ROLE_DEPTNAME UInt32, - ROLE_TITLE UInt32, - ROLE_FAMILY_DESC UInt32, - ROLE_FAMILY UInt32, - ROLE_CODE UInt32 -) -ENGINE = MergeTree ORDER BY date -``` - -**3.** Exit from ClickHouse console client: - -``` sql -:) exit -``` - -## 2. Insert the Data to the Table {#insert-data-to-table} - -To insert the data: - -**1.** Run the following command: - -``` bash -$ clickhouse client --host 127.0.0.1 --query 'INSERT INTO amazon_train FORMAT CSVWithNames' < ~/amazon/train.csv -``` - -**2.** Start ClickHouse console client in the interactive mode: - -``` bash -$ clickhouse client -``` - -**3.** Make sure the data has been uploaded: - -``` sql -:) SELECT count() FROM amazon_train - -SELECT count() -FROM amazon_train - -+-count()-+ -| 65538 | -+-------+ -``` - -## 3. Integrate CatBoost into ClickHouse {#integrate-catboost-into-clickhouse} - -!!! note "Note" - **Optional step.** The Docker image contains everything you need to run CatBoost and ClickHouse. - -To integrate CatBoost into ClickHouse: - -**1.** Build the evaluation library. - -The fastest way to evaluate a CatBoost model is compile `libcatboostmodel.` library. For more information about how to build the library, see [CatBoost documentation](https://catboost.ai/docs/concepts/c-plus-plus-api_dynamic-c-pluplus-wrapper.html). - -**2.** Create a new directory anywhere and with any name, for example, `data` and put the created library in it. The Docker image already contains the library `data/libcatboostmodel.so`. - -**3.** Create a new directory for config model anywhere and with any name, for example, `models`. - -**4.** Create a model configuration file with any name, for example, `models/amazon_model.xml`. - -**5.** Describe the model configuration: - -``` xml - - - - catboost - - amazon - - /home/catboost/tutorial/catboost_model.bin - - 0 - - -``` - -**6.** Add the path to CatBoost and the model configuration to the ClickHouse configuration: - -``` xml - -/home/catboost/data/libcatboostmodel.so -/home/catboost/models/*_model.xml -``` - -!!! note "Note" - You can change path to the CatBoost model configuration later without restarting server. - -## 4. Run the Model Inference from SQL {#run-model-inference} - -For test model run the ClickHouse client `$ clickhouse client`. - -Let’s make sure that the model is working: - -``` sql -:) SELECT - modelEvaluate('amazon', - RESOURCE, - MGR_ID, - ROLE_ROLLUP_1, - ROLE_ROLLUP_2, - ROLE_DEPTNAME, - ROLE_TITLE, - ROLE_FAMILY_DESC, - ROLE_FAMILY, - ROLE_CODE) > 0 AS prediction, - ACTION AS target -FROM amazon_train -LIMIT 10 -``` - -!!! note "Note" - Function [modelEvaluate](../sql-reference/functions/other-functions.md#function-modelevaluate) returns tuple with per-class raw predictions for multiclass models. - -Let’s predict the probability: - -``` sql -:) SELECT - modelEvaluate('amazon', - RESOURCE, - MGR_ID, - ROLE_ROLLUP_1, - ROLE_ROLLUP_2, - ROLE_DEPTNAME, - ROLE_TITLE, - ROLE_FAMILY_DESC, - ROLE_FAMILY, - ROLE_CODE) AS prediction, - 1. 
/ (1 + exp(-prediction)) AS probability, - ACTION AS target -FROM amazon_train -LIMIT 10 -``` - -!!! note "Note" - More info about [exp()](../sql-reference/functions/math-functions.md) function. - -Let’s calculate LogLoss on the sample: - -``` sql -:) SELECT -avg(tg * log(prob) + (1 - tg) * log(1 - prob)) AS logloss -FROM -( - SELECT - modelEvaluate('amazon', - RESOURCE, - MGR_ID, - ROLE_ROLLUP_1, - ROLE_ROLLUP_2, - ROLE_DEPTNAME, - ROLE_TITLE, - ROLE_FAMILY_DESC, - ROLE_FAMILY, - ROLE_CODE) AS prediction, - 1. / (1. + exp(-prediction)) AS prob, - ACTION AS tg - FROM amazon_train -) -``` - -!!! note "Note" - More info about [avg()](../sql-reference/aggregate-functions/reference/avg.md#agg_function-avg) and [log()](../sql-reference/functions/math-functions.md) functions. - -[Original article](https://clickhouse.com/docs/en/guides/apply_catboost_model/) diff --git a/docs/en/guides/index.md b/docs/en/guides/index.md deleted file mode 100644 index eb4ca9af367..00000000000 --- a/docs/en/guides/index.md +++ /dev/null @@ -1,14 +0,0 @@ ---- -toc_folder_title: Guides -toc_priority: 38 -toc_title: Overview ---- - -# ClickHouse Guides {#clickhouse-guides} - -List of detailed step-by-step instructions that help to solve various tasks using ClickHouse: - -- [Tutorial on simple cluster set-up](../getting-started/tutorial.md) -- [Applying a CatBoost model in ClickHouse](../guides/apply-catboost-model.md) - -[Original article](https://clickhouse.com/docs/en/guides/) diff --git a/docs/en/index.md b/docs/en/index.md deleted file mode 100644 index 532be035bbc..00000000000 --- a/docs/en/index.md +++ /dev/null @@ -1,95 +0,0 @@ ---- -toc_priority: 0 -toc_title: Overview ---- - -# What Is ClickHouse? {#what-is-clickhouse} - -ClickHouse® is a column-oriented database management system (DBMS) for online analytical processing of queries (OLAP). - -In a “normal” row-oriented DBMS, data is stored in this order: - -| Row | WatchID | JavaEnable | Title | GoodEvent | EventTime | -|-----|-------------|------------|--------------------|-----------|---------------------| -| #0 | 89354350662 | 1 | Investor Relations | 1 | 2016-05-18 05:19:20 | -| #1 | 90329509958 | 0 | Contact us | 1 | 2016-05-18 08:10:20 | -| #2 | 89953706054 | 1 | Mission | 1 | 2016-05-18 07:38:00 | -| #N | … | … | … | … | … | - -In other words, all the values related to a row are physically stored next to each other. - -Examples of a row-oriented DBMS are MySQL, Postgres, and MS SQL Server. - -In a column-oriented DBMS, data is stored like this: - -| Row: | #0 | #1 | #2 | #N | -|-------------|---------------------|---------------------|---------------------|-----| -| WatchID: | 89354350662 | 90329509958 | 89953706054 | … | -| JavaEnable: | 1 | 0 | 1 | … | -| Title: | Investor Relations | Contact us | Mission | … | -| GoodEvent: | 1 | 1 | 1 | … | -| EventTime: | 2016-05-18 05:19:20 | 2016-05-18 08:10:20 | 2016-05-18 07:38:00 | … | - -These examples only show the order that data is arranged in. The values from different columns are stored separately, and data from the same column is stored together. - -Examples of a column-oriented DBMS: Vertica, Paraccel (Actian Matrix and Amazon Redshift), Sybase IQ, Exasol, Infobright, InfiniDB, MonetDB (VectorWise and Actian Vector), LucidDB, SAP HANA, Google Dremel, Google PowerDrill, Druid, and kdb+. - -Different orders for storing data are better suited to different scenarios. 
The data access scenario refers to what queries are made, how often, and in what proportion; how much data is read for each type of query – rows, columns, and bytes; the relationship between reading and updating data; the working size of the data and how locally it is used; whether transactions are used, and how isolated they are; requirements for data replication and logical integrity; requirements for latency and throughput for each type of query, and so on. - -The higher the load on the system, the more important it is to customize the system set up to match the requirements of the usage scenario, and the more fine grained this customization becomes. There is no system that is equally well-suited to significantly different scenarios. If a system is adaptable to a wide set of scenarios, under a high load, the system will handle all the scenarios equally poorly, or will work well for just one or few of possible scenarios. - -## Key Properties of OLAP Scenario {#key-properties-of-olap-scenario} - -- The vast majority of requests are for read access. -- Data is updated in fairly large batches (\> 1000 rows), not by single rows; or it is not updated at all. -- Data is added to the DB but is not modified. -- For reads, quite a large number of rows are extracted from the DB, but only a small subset of columns. -- Tables are “wide,” meaning they contain a large number of columns. -- Queries are relatively rare (usually hundreds of queries per server or less per second). -- For simple queries, latencies around 50 ms are allowed. -- Column values are fairly small: numbers and short strings (for example, 60 bytes per URL). -- Requires high throughput when processing a single query (up to billions of rows per second per server). -- Transactions are not necessary. -- Low requirements for data consistency. -- There is one large table per query. All tables are small, except for one. -- A query result is significantly smaller than the source data. In other words, data is filtered or aggregated, so the result fits in a single server’s RAM. - -It is easy to see that the OLAP scenario is very different from other popular scenarios (such as OLTP or Key-Value access). So it does not make sense to try to use OLTP or a Key-Value DB for processing analytical queries if you want to get decent performance. For example, if you try to use MongoDB or Redis for analytics, you will get very poor performance compared to OLAP databases. - -## Why Column-Oriented Databases Work Better in the OLAP Scenario {#why-column-oriented-databases-work-better-in-the-olap-scenario} - -Column-oriented databases are better suited to OLAP scenarios: they are at least 100 times faster in processing most queries. The reasons are explained in detail below, but the fact is easier to demonstrate visually: - -**Row-oriented DBMS** - -![Row-oriented](images/row-oriented.gif#) - -**Column-oriented DBMS** - -![Column-oriented](images/column-oriented.gif#) - -See the difference? - -### Input/output {#inputoutput} - -1. For an analytical query, only a small number of table columns need to be read. In a column-oriented database, you can read just the data you need. For example, if you need 5 columns out of 100, you can expect a 20-fold reduction in I/O. -2. Since data is read in packets, it is easier to compress. Data in columns is also easier to compress. This further reduces the I/O volume. -3. Due to the reduced I/O, more data fits in the system cache. 
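To make the first point concrete, here is a minimal sketch of such a query (assuming a web-analytics table like the tutorial's `hits_v1`, with an `AdvEngineID` column identifying the advertising platform); only that single column needs to be read from disk:

``` sql
SELECT
    AdvEngineID,
    count() AS pageviews
FROM tutorial.hits_v1
GROUP BY AdvEngineID
ORDER BY pageviews DESC
```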
- -For example, the query “count the number of records for each advertising platform” requires reading one “advertising platform ID” column, which takes up 1 byte uncompressed. If most of the traffic was not from advertising platforms, you can expect at least 10-fold compression of this column. When using a quick compression algorithm, data decompression is possible at a speed of at least several gigabytes of uncompressed data per second. In other words, this query can be processed at a speed of approximately several billion rows per second on a single server. This speed is actually achieved in practice. - -### CPU {#cpu} - -Since executing a query requires processing a large number of rows, it helps to dispatch all operations for entire vectors instead of for separate rows, or to implement the query engine so that there is almost no dispatching cost. If you do not do this, with any half-decent disk subsystem, the query interpreter inevitably stalls the CPU. It makes sense to both store data in columns and process it, when possible, by columns. - -There are two ways to do this: - -1. A vector engine. All operations are written for vectors, instead of for separate values. This means you do not need to call operations very often, and dispatching costs are negligible. Operation code contains an optimized internal cycle. - -2. Code generation. The code generated for the query has all the indirect calls in it. - -This is not done in “normal” databases, because it does not make sense when running simple queries. However, there are exceptions. For example, MemSQL uses code generation to reduce latency when processing SQL queries. (For comparison, analytical DBMSs require optimization of throughput, not latency.) - -Note that for CPU efficiency, the query language must be declarative (SQL or MDX), or at least a vector (J, K). The query should only contain implicit loops, allowing for optimization. - -{## [Original article](https://clickhouse.com/docs/en/) ##} diff --git a/docs/en/getting-started/install.md b/docs/en/install.md similarity index 69% rename from docs/en/getting-started/install.md rename to docs/en/install.md index cd734d4dc8b..b499b584865 100644 --- a/docs/en/getting-started/install.md +++ b/docs/en/install.md @@ -1,6 +1,8 @@ --- -toc_priority: 11 -toc_title: Installation +sidebar_label: Installation +sidebar_position: 1 +keywords: [clickhouse, install, installation, docs] +description: ClickHouse can run on any Linux, FreeBSD, or Mac OS X with x86_64, AArch64, or PowerPC64LE CPU architecture. --- # Installation {#installation} @@ -24,15 +26,36 @@ To run ClickHouse on processors that do not support SSE 4.2 or have AArch64 or P It is recommended to use official pre-compiled `deb` packages for Debian or Ubuntu. Run these commands to install packages: ``` bash -{% include 'install/deb.sh' %} +sudo apt-get install apt-transport-https ca-certificates dirmngr +sudo apt-key adv --keyserver hkp://keyserver.ubuntu.com:80 --recv E0C56BD4 + +echo "deb https://repo.clickhouse.com/deb/stable/ main/" | sudo tee \ + /etc/apt/sources.list.d/clickhouse.list +sudo apt-get update + +sudo apt-get install -y clickhouse-server clickhouse-client + +sudo service clickhouse-server start +clickhouse-client # or "clickhouse-client --password" if you set up a password. ``` -
Deprecated Method for installing deb-packages + ``` bash -{% include 'install/deb_repo.sh' %} +sudo apt-get install apt-transport-https ca-certificates dirmngr +sudo apt-key adv --keyserver hkp://keyserver.ubuntu.com:80 --recv E0C56BD4 + +echo "deb https://repo.clickhouse.com/deb/stable/ main/" | sudo tee \ + /etc/apt/sources.list.d/clickhouse.list +sudo apt-get update + +sudo apt-get install -y clickhouse-server clickhouse-client + +sudo service clickhouse-server start +clickhouse-client # or "clickhouse-client --password" if you set up a password. ``` +
You can replace `stable` with `lts` or `testing` to use different [release trains](../faq/operations/production.md) based on your needs. @@ -57,15 +80,28 @@ It is recommended to use official pre-compiled `rpm` packages for CentOS, RedHat First, you need to add the official repository: ``` bash -{% include 'install/rpm.sh' %} +sudo yum install -y yum-utils +sudo yum-config-manager --add-repo https://packages.clickhouse.com/rpm/clickhouse.repo +sudo yum install -y clickhouse-server clickhouse-client + +sudo /etc/init.d/clickhouse-server start +clickhouse-client # or "clickhouse-client --password" if you set up a password. ```
Deprecated Method for installing rpm-packages + ``` bash -{% include 'install/rpm_repo.sh' %} +sudo yum install yum-utils +sudo rpm --import https://repo.clickhouse.com/CLICKHOUSE-KEY.GPG +sudo yum-config-manager --add-repo https://repo.clickhouse.com/rpm/clickhouse.repo +sudo yum install clickhouse-server clickhouse-client + +sudo /etc/init.d/clickhouse-server start +clickhouse-client # or "clickhouse-client --password" if you set up a password. ``` +
If you want to use the most recent version, replace `stable` with `testing` (this is recommended for your testing environments). `prestable` is sometimes also available. @@ -86,14 +122,52 @@ The required version can be downloaded with `curl` or `wget` from repository htt After that downloaded archives should be unpacked and installed with installation scripts. Example for the latest stable version: ``` bash -{% include 'install/tgz.sh' %} +LATEST_VERSION=$(curl -s https://packages.clickhouse.com/tgz/stable/ | \ + grep -Eo '[0-9]+\.[0-9]+\.[0-9]+\.[0-9]+' | sort -V -r | head -n 1) +export LATEST_VERSION +curl -O "https://packages.clickhouse.com/tgz/stable/clickhouse-common-static-$LATEST_VERSION.tgz" +curl -O "https://packages.clickhouse.com/tgz/stable/clickhouse-common-static-dbg-$LATEST_VERSION.tgz" +curl -O "https://packages.clickhouse.com/tgz/stable/clickhouse-server-$LATEST_VERSION.tgz" +curl -O "https://packages.clickhouse.com/tgz/stable/clickhouse-client-$LATEST_VERSION.tgz" + +tar -xzvf "clickhouse-common-static-$LATEST_VERSION.tgz" +sudo "clickhouse-common-static-$LATEST_VERSION/install/doinst.sh" + +tar -xzvf "clickhouse-common-static-dbg-$LATEST_VERSION.tgz" +sudo "clickhouse-common-static-dbg-$LATEST_VERSION/install/doinst.sh" + +tar -xzvf "clickhouse-server-$LATEST_VERSION.tgz" +sudo "clickhouse-server-$LATEST_VERSION/install/doinst.sh" +sudo /etc/init.d/clickhouse-server start + +tar -xzvf "clickhouse-client-$LATEST_VERSION.tgz" +sudo "clickhouse-client-$LATEST_VERSION/install/doinst.sh" ```
Deprecated Method for installing tgz archives + ``` bash -{% include 'install/tgz_repo.sh' %} +export LATEST_VERSION=$(curl -s https://repo.clickhouse.com/tgz/stable/ | \ + grep -Eo '[0-9]+\.[0-9]+\.[0-9]+\.[0-9]+' | sort -V -r | head -n 1) +curl -O https://repo.clickhouse.com/tgz/stable/clickhouse-common-static-$LATEST_VERSION.tgz +curl -O https://repo.clickhouse.com/tgz/stable/clickhouse-common-static-dbg-$LATEST_VERSION.tgz +curl -O https://repo.clickhouse.com/tgz/stable/clickhouse-server-$LATEST_VERSION.tgz +curl -O https://repo.clickhouse.com/tgz/stable/clickhouse-client-$LATEST_VERSION.tgz + +tar -xzvf clickhouse-common-static-$LATEST_VERSION.tgz +sudo clickhouse-common-static-$LATEST_VERSION/install/doinst.sh + +tar -xzvf clickhouse-common-static-dbg-$LATEST_VERSION.tgz +sudo clickhouse-common-static-dbg-$LATEST_VERSION/install/doinst.sh + +tar -xzvf clickhouse-server-$LATEST_VERSION.tgz +sudo clickhouse-server-$LATEST_VERSION/install/doinst.sh +sudo /etc/init.d/clickhouse-server start + +tar -xzvf clickhouse-client-$LATEST_VERSION.tgz +sudo clickhouse-client-$LATEST_VERSION/install/doinst.sh ```
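Whichever package format you used, a quick sanity check (assuming the server has been started with the service or init.d commands shown above) is to ask the server for its version from the client:

``` bash
clickhouse-client --query "SELECT version()"
```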
diff --git a/docs/en/interfaces/index.md b/docs/en/interfaces/index.md index e747b93a1a6..16e97ed7c62 100644 --- a/docs/en/interfaces/index.md +++ b/docs/en/interfaces/index.md @@ -1,7 +1,8 @@ --- -toc_folder_title: Interfaces -toc_priority: 14 -toc_title: Introduction +sidebar_label: Interfaces +sidebar_position: 34 +keywords: [clickhouse, network, interfaces, http, tcp, grpc, command-line, client, jdbc, odbc, driver] +description: ClickHouse provides three network interfaces --- # Interfaces {#interfaces} diff --git a/docs/en/introduction/adopters.md b/docs/en/introduction/adopters.md deleted file mode 100644 index 9c7fab7424d..00000000000 --- a/docs/en/introduction/adopters.md +++ /dev/null @@ -1,199 +0,0 @@ ---- -toc_priority: 8 -toc_title: Adopters ---- - -# ClickHouse Adopters {#clickhouse-adopters} - -!!! warning "Disclaimer" - The following list of companies using ClickHouse and their success stories is assembled from public sources, thus might differ from current reality. We’d appreciate it if you share the story of adopting ClickHouse in your company and [add it to the list](https://github.com/ClickHouse/ClickHouse/edit/master/docs/en/introduction/adopters.md), but please make sure you won’t have any NDA issues by doing so. Providing updates with publications from other companies is also useful. - -| Company | Industry | Usecase | Cluster Size | (Un)Compressed Data Size\* | Reference | -|---------|----------|---------|--------------|------------------------------------------------------------------------------|-----------| -| 2gis | Maps | Monitoring | — | — | [Talk in Russian, July 2019](https://youtu.be/58sPkXfq6nw) | -| Adapty | Subscription Analytics | Main product | — | — | [Tweet, November 2021](https://twitter.com/iwitaly/status/1462698148061659139) | -| Admiral | Martech | Engagement Management | — | — | [Webinar Slides, June 2020](https://altinity.com/presentations/2020/06/16/big-data-in-real-time-how-clickhouse-powers-admirals-visitor-relationships-for-publishers) | -| AdScribe | Ads | TV Analytics | — | — | [A quote from CTO](https://altinity.com/24x7-support/) | -| Ahrefs | SEO | Analytics | — | — | [Job listing](https://ahrefs.com/jobs/data-scientist-search) | -| Alibaba Cloud | Cloud | Managed Service | — | — | [Official Website](https://help.aliyun.com/product/144466.html) | -| Alibaba Cloud | Cloud | E-MapReduce | — | — | [Official Website](https://help.aliyun.com/document_detail/212195.html) | -| Aloha Browser | Mobile App | Browser backend | — | — | [Slides in Russian, May 2019](https://presentations.clickhouse.com/meetup22/aloha.pdf) | -| Altinity | Cloud, SaaS | Main product | — | — | [Official Website](https://altinity.com/) | -| Amadeus | Travel | Analytics | — | — | [Press Release, April 2018](https://www.altinity.com/blog/2018/4/5/amadeus-technologies-launches-investment-and-insights-tool-based-on-machine-learning-and-strategy-algorithms) | -| ApiRoad | API marketplace | Analytics | — | — | [Blog post, November 2018, March 2020](https://pixeljets.com/blog/clickhouse-vs-elasticsearch/) | -| Appsflyer | Mobile analytics | Main product | — | — | [Talk in Russian, July 2019](https://www.youtube.com/watch?v=M3wbRlcpBbY) | -| ArenaData | Data Platform | Main product | — | — | [Slides in Russian, December 2019](https://github.com/ClickHouse/clickhouse-presentations/blob/master/meetup38/indexes.pdf) | -| Argedor | ClickHouse support | — | — | — | [Official website](https://www.argedor.com/en/clickhouse/) | -| Avito | Classifieds | Monitoring | — | — | [Meetup, 
April 2020](https://www.youtube.com/watch?v=n1tm4j4W8ZQ) | -| Badoo | Dating | Timeseries | — | 1.6 mln events/sec (2018) | [Slides in Russian, December 2019](https://presentations.clickhouse.com/meetup38/forecast.pdf) | -| Beeline | Telecom | Data Platform | — | — | [Blog post, July 2021](https://habr.com/en/company/beeline/blog/567508/) | -| Benocs | Network Telemetry and Analytics | Main Product | — | — | [Slides in English, October 2017](https://github.com/ClickHouse/clickhouse-presentations/blob/master/meetup9/lpm.pdf) | -| BIGO | Video | Computing Platform | — | — | [Blog Article, August 2020](https://www.programmersought.com/article/44544895251/) | -| BiliBili | Video sharing | — | — | — | [Blog post, June 2021](https://chowdera.com/2021/06/20210622012241476b.html) | -| Bloomberg | Finance, Media | Monitoring | — | — | [Job opening, September 2021](https://careers.bloomberg.com/job/detail/94913), [slides, May 2018](https://www.slideshare.net/Altinity/http-analytics-for-6m-requests-per-second-using-clickhouse-by-alexander-bocharov) | -| Bloxy | Blockchain | Analytics | — | — | [Slides in Russian, August 2018](https://github.com/ClickHouse/clickhouse-presentations/blob/master/meetup17/4_bloxy.pptx) | -| Bytedance | Social platforms | — | — | — | [The ClickHouse Meetup East, October 2020](https://www.youtube.com/watch?v=ckChUkC3Pns) | -| CardsMobile | Finance | Analytics | — | — | [VC.ru](https://vc.ru/s/cardsmobile/143449-rukovoditel-gruppy-analiza-dannyh) | -| CARTO | Business Intelligence | Geo analytics | — | — | [Geospatial processing with ClickHouse](https://carto.com/blog/geospatial-processing-with-clickhouse/) | -| CERN | Research | Experiment | — | — | [Press release, April 2012](https://www.yandex.com/company/press_center/press_releases/2012/2012-04-10/) | -| Checkly | Software Development | Analytics | — | — | [Tweet, October 2021](https://twitter.com/tim_nolet/status/1445810665743081474?s=20) | -| ChelPipe Group | Analytics | — | — | — | [Blog post, June 2021](https://vc.ru/trade/253172-tyazhelomu-proizvodstvu-user-friendly-sayt-internet-magazin-trub-dlya-chtpz) | -| Cisco | Networking | Traffic analysis | — | — | [Lightning talk, October 2019](https://youtu.be/-hI1vDR2oPY?t=5057) | -| Citadel Securities | Finance | — | — | — | [Contribution, March 2019](https://github.com/ClickHouse/ClickHouse/pull/4774) | -| Citymobil | Taxi | Analytics | — | — | [Blog Post in Russian, March 2020](https://habr.com/en/company/citymobil/blog/490660/) | -| Cloudflare | CDN | Traffic analysis | 36 servers | — | [Blog post, May 2017](https://blog.cloudflare.com/how-cloudflare-analyzes-1m-dns-queries-per-second/), [Blog post, March 2018](https://blog.cloudflare.com/http-analytics-for-6m-requests-per-second-using-clickhouse/) | -| Comcast | Media | CDN Traffic Analysis | — | — | [ApacheCon 2019 Talk](https://www.youtube.com/watch?v=e9TZ6gFDjNg) | -| ContentSquare | Web analytics | Main product | — | — | [Blog post in French, November 2018](http://souslecapot.net/2018/11/21/patrick-chatain-vp-engineering-chez-contentsquare-penser-davantage-amelioration-continue-que-revolution-constante/) | -| Corunet | Analytics | Main product | — | — | [Slides in English, April 2019](https://github.com/ClickHouse/clickhouse-presentations/blob/master/meetup21/predictive_models.pdf) | -| CraiditX 氪信 | Finance AI | Analysis | — | — | [Slides in English, November 2019](https://github.com/ClickHouse/clickhouse-presentations/blob/master/meetup33/udf.pptx) | -| Crazypanda | Games | | — | — | Live session on ClickHouse 
meetup | -| Criteo | Retail | Main product | — | — | [Slides in English, October 2018](https://github.com/ClickHouse/clickhouse-presentations/blob/master/meetup18/3_storetail.pptx) | -| Cryptology | Digital Assets Trading Platform | — | — | — | [Job advertisement, March 2021](https://career.habr.com/companies/cryptology/vacancies) | -| Dataliance for China Telecom | Telecom | Analytics | — | — | [Slides in Chinese, January 2018](https://github.com/ClickHouse/clickhouse-presentations/blob/master/meetup12/telecom.pdf) | -| Deutsche Bank | Finance | BI Analytics | — | — | [Slides in English, October 2019](https://bigdatadays.ru/wp-content/uploads/2019/10/D2-H3-3_Yakunin-Goihburg.pdf) | -| Deepl | Machine Learning | — | — | — | [Video, October 2021](https://www.youtube.com/watch?v=WIYJiPwxXdM&t=1182s) | -| Deeplay | Gaming Analytics | — | — | — | [Job advertisement, 2020](https://career.habr.com/vacancies/1000062568) | -| Diva-e | Digital consulting | Main Product | — | — | [Slides in English, September 2019](https://github.com/ClickHouse/clickhouse-presentations/blob/master/meetup29/ClickHouse-MeetUp-Unusual-Applications-sd-2019-09-17.pdf) | -| Ecommpay | Payment Processing | Logs | — | — | [Video, Nov 2019](https://www.youtube.com/watch?v=d3GdZTOWGLk) | -| Ecwid | E-commerce SaaS | Metrics, Logging | — | — | [Slides in Russian, April 2019](https://nastachku.ru/var/files/1/presentation/backend/2_Backend_6.pdf) | -| eBay | E-commerce | Logs, Metrics and Events | — | — | [Official website, Sep 2020](https://tech.ebayinc.com/engineering/ou-online-analytical-processing/) | -| Exness | Trading | Metrics, Logging | — | — | [Talk in Russian, May 2019](https://youtu.be/_rpU-TvSfZ8?t=3215) | -| EventBunker.io | Serverless Data Processing | — | — | — | [Tweet, April 2021](https://twitter.com/Halil_D_/status/1379839133472985091) | -| FastNetMon | DDoS Protection | Main Product | | — | [Official website](https://fastnetmon.com/docs-fnm-advanced/fastnetmon-advanced-traffic-persistency/) | -| Firebolt | Analytics | Main product | - | - | [YouTube Tech Talk](https://www.youtube.com/watch?v=9rW9uEJ15tU) | -| Flipkart | e-Commerce | — | — | — | [Talk in English, July 2020](https://youtu.be/GMiXCMFDMow?t=239) | -| FunCorp | Games | | — | 14 bn records/day as of Jan 2021 | [Article](https://www.altinity.com/blog/migrating-from-redshift-to-clickhouse) | -| Futurra Group | Analytics | — | — | — | [Article in Russian, December 2021](https://dou.ua/forums/topic/35587/) | -| Geniee | Ad network | Main product | — | — | [Blog post in Japanese, July 2017](https://tech.geniee.co.jp/entry/2017/07/20/160100) | -| Genotek | Bioinformatics | Main product | — | — | [Video, August 2020](https://youtu.be/v3KyZbz9lEE) | -| Gigapipe | Managed ClickHouse | Main product | — | — | [Official website](https://gigapipe.com/) | -| Gigasheet | Analytics | Main product | — | — | Direct Reference, February 2022| -| Glaber | Monitoring | Main product | — | — | [Website](https://glaber.io/) | -| GraphCDN | CDN | Traffic Analytics | — | — | [Blog Post in English, August 2021](https://altinity.com/blog/delivering-insight-on-graphql-apis-with-clickhouse-at-graphcdn/) | -| Grouparoo | Data Warehouse Integrations | Main product | — | — | [Official Website, November 2021](https://www.grouparoo.com/integrations) | -| HUYA | Video Streaming | Analytics | — | — | [Slides in Chinese, October 2018](https://github.com/ClickHouse/clickhouse-presentations/blob/master/meetup19/7.%20ClickHouse万亿数据分析实践%20李本旺(sundy-li)%20虎牙.pdf) | -| Hydrolix | Cloud data 
platform | Main product | — | — | [Documentation](https://docs.hydrolix.io/guide/query) | -| Hystax | Cloud Operations | Observability Analytics | - | - | [Blog](https://hystax.com/clickhouse-for-real-time-cost-saving-analytics-how-to-stop-hammering-screws-and-use-an-electric-screwdriver/) | -| ICA | FinTech | Risk Management | — | — | [Blog Post in English, Sep 2020](https://altinity.com/blog/clickhouse-vs-redshift-performance-for-fintech-risk-management?utm_campaign=ClickHouse%20vs%20RedShift&utm_content=143520807&utm_medium=social&utm_source=twitter&hss_channel=tw-3894792263) | -| Idealista | Real Estate | Analytics | — | — | [Blog Post in English, April 2019](https://clickhouse.com/blog/en/clickhouse-meetup-in-madrid-on-april-2-2019) | -| Infobaleen | AI markting tool | Analytics | — | — | [Official site](https://infobaleen.com) | -| Infovista | Networks | Analytics | — | — | [Slides in English, October 2019](https://github.com/ClickHouse/clickhouse-presentations/blob/master/meetup30/infovista.pdf) | -| InnoGames | Games | Metrics, Logging | — | — | [Slides in Russian, September 2019](https://github.com/ClickHouse/clickhouse-presentations/blob/master/meetup28/graphite_and_clickHouse.pdf) | -| Instabug | APM Platform | Main product | — | — | [A quote from Co-Founder](https://altinity.com/) | -| Instana | APM Platform | Main product | — | — | [Twitter post](https://twitter.com/mieldonkers/status/1248884119158882304) | -| Integros | Platform for video services | Analytics | — | — | [Slides in Russian, May 2019](https://github.com/ClickHouse/clickhouse-presentations/blob/master/meetup22/strategies.pdf) | -| Ippon Technologies | Technology Consulting | — | — | — | [Talk in English, July 2020](https://youtu.be/GMiXCMFDMow?t=205) | -| Ivi | Online Cinema | Analytics, Monitoring | — | — | [Article in Russian, Jan 2018](https://habr.com/en/company/ivi/blog/347408/) | -| Jinshuju 金数据 | BI Analytics | Main product | — | — | [Slides in Chinese, October 2019](https://github.com/ClickHouse/clickhouse-presentations/blob/master/meetup24/3.%20金数据数据架构调整方案Public.pdf) | -| Jitsu | Cloud Software | Data Pipeline | — | — | [Documentation](https://jitsu.com/docs/destinations-configuration/clickhouse-destination), [Hacker News post](https://news.ycombinator.com/item?id=29106082) | -| JuiceFS | Storage | Shopping Cart | - | - | [Blog](https://juicefs.com/blog/en/posts/shopee-clickhouse-with-juicefs/) | -| kakaocorp | Internet company | — | — | — | [if(kakao)2020](https://tv.kakao.com/channel/3693125/cliplink/414129353), [if(kakao)2021](https://if.kakao.com/session/24) | -| Kodiak Data | Clouds | Main product | — | — | [Slides in Engish, April 2018](https://github.com/ClickHouse/clickhouse-presentations/blob/master/meetup13/kodiak_data.pdf) | -| Kontur | Software Development | Metrics | — | — | [Talk in Russian, November 2018](https://www.youtube.com/watch?v=U4u4Bd0FtrY) | -| Kuaishou | Video | — | — | — | [ClickHouse Meetup, October 2018](https://clickhouse.com/blog/en/2018/clickhouse-community-meetup-in-beijing-on-october-28-2018/) | -| KGK Global | Vehicle monitoring | — | — | — | [Press release, June 2021](https://zoom.cnews.ru/news/item/530921) | -| LANCOM Systems | Network Solutions | Traffic analysis | - | - | [ClickHouse Operator for Kubernetes](https://www.lancom-systems.com/), [Hacker News post] (https://news.ycombinator.com/item?id=29413660) | -| Lawrence Berkeley National Laboratory | Research | Traffic analysis | 5 servers | 55 TiB | [Slides in English, April 
2019](https://www.smitasin.com/presentations/2019-04-17_DOE-NSM.pdf) | -| Lever | Talent Management | Recruiting | - | - | [Hacker News post](https://news.ycombinator.com/item?id=29558544) | -| LifeStreet | Ad network | Main product | 75 servers (3 replicas) | 5.27 PiB | [Blog post in Russian, February 2017](https://habr.com/en/post/322620/) | -| Lookforsale | E-Commerce | — | — | — | [Job Posting, December 2021](https://telegram.me/javascript_jobs/587318) | -| Mail.ru Cloud Solutions | Cloud services | Main product | — | — | [Article in Russian](https://mcs.mail.ru/help/db-create/clickhouse#) | -| MAXILECT | Ad Tech, Blockchain, ML, AI | — | — | — | [Job advertisement, 2021](https://www.linkedin.com/feed/update/urn:li:activity:6780842017229430784/) | -| Marilyn | Advertising | Statistics | — | — | [Talk in Russian, June 2017](https://www.youtube.com/watch?v=iXlIgx2khwc) | -| Mello | Marketing | Analytics | 1 server | — | [Article, October 2020](https://vc.ru/marketing/166180-razrabotka-tipovogo-otcheta-skvoznoy-analitiki) | -| MessageBird | Telecommunications | Statistics | — | — | [Slides in English, November 2018](https://github.com/ClickHouse/clickhouse-presentations/blob/master/meetup20/messagebird.pdf) | -| Microsoft | Web Analytics | Clarity (Main Product) | — | — | [A question on GitHub](https://github.com/ClickHouse/ClickHouse/issues/21556) | -| MindsDB | Machine Learning | Main Product | — | — | [Official Website](https://www.mindsdb.com/blog/machine-learning-models-as-tables-in-ch) | -| MUX | Online Video | Video Analytics | — | — | [Talk in English, August 2019](https://altinity.com/presentations/2019/8/13/how-clickhouse-became-the-default-analytics-database-for-mux/) | -| MGID | Ad network | Web-analytics | — | — | [Blog post in Russian, April 2020](http://gs-studio.com/news-about-it/32777----clickhouse---c) | -| Muse Group | Music Software | Performance Monitoring | — | — | [Blog post in Russian, January 2021](https://habr.com/en/post/647079/) | -| Netskope | Network Security | — | — | — | [Job advertisement, March 2021](https://www.mendeley.com/careers/job/senior-software-developer-backend-developer-1346348) | -| NIC Labs | Network Monitoring | RaTA-DNS | — | — | [Blog post, March 2021](https://niclabs.cl/ratadns/2021/03/Clickhouse) | -| NLMK | Steel | Monitoring | — | — | [Article in Russian, Jan 2022](https://habr.com/en/company/nlmk/blog/645943/) | -| NOC Project | Network Monitoring | Analytics | Main Product | — | [Official Website](https://getnoc.com/features/big-data/) | -| Noction | Network Technology | Main Product | — | — | [Official Website](https://www.noction.com/news/irp-3-11-remote-triggered-blackholing-capability) -| ntop | Network Monitoning | Monitoring | — | — | [Official website, Jan 2022](https://www.ntop.org/ntop/historical-traffic-analysis-at-scale-using-clickhouse-with-ntopng/) | -| Nuna Inc. 
| Health Data Analytics | — | — | — | [Talk in English, July 2020](https://youtu.be/GMiXCMFDMow?t=170) | -| Ok.ru | Social Network | — | 72 servers | 810 TB compressed, 50bn rows/day, 1.5 TB/day | [SmartData conference, October 2021](https://assets.ctfassets.net/oxjq45e8ilak/4JPHkbJenLgZhBGGyyonFP/57472ec6987003ec4078d0941740703b/____________________ClickHouse_______________________.pdf) | -| Omnicomm | Transportation Monitoring | — | — | — | [Facebook post, October 2021](https://www.facebook.com/OmnicommTeam/posts/2824479777774500) | -| OneAPM | Monitoring and Data Analysis | Main product | — | — | [Slides in Chinese, October 2018](https://github.com/ClickHouse/clickhouse-presentations/blob/master/meetup19/8.%20clickhouse在OneAPM的应用%20杜龙.pdf) | -| Opensee | Financial Analytics | Main product | - | - | [Blog](https://opensee.io/news/from-moscow-to-wall-street-the-remarkable-journey-of-clickhouse/) | -| Open Targets | Genome Research | Genome Search | — | — | [Tweet, October 2021](https://twitter.com/OpenTargets/status/1452570865342758913?s=20), [Blog](https://blog.opentargets.org/graphql/) | -| OZON | E-commerce | — | — | — | [Official website](https://job.ozon.ru/vacancy/razrabotchik-clickhouse-ekspluatatsiya-40991870/) | -| Panelbear | Analytics | Monitoring and Analytics | — | — | [Tech Stack, November 2020](https://panelbear.com/blog/tech-stack/) | -| Percent 百分点 | Analytics | Main Product | — | — | [Slides in Chinese, June 2019](https://github.com/ClickHouse/clickhouse-presentations/blob/master/meetup24/4.%20ClickHouse万亿数据双中心的设计与实践%20.pdf) | -| Percona | Performance analysis | Percona Monitoring and Management | — | — | [Official website, Mar 2020](https://www.percona.com/blog/2020/03/30/advanced-query-analysis-in-percona-monitoring-and-management-with-direct-clickhouse-access/) | -| Plausible | Analytics | Main Product | — | — | [Blog post, June 2020](https://twitter.com/PlausibleHQ/status/1273889629087969280) | -| PostHog | Product Analytics | Main Product | — | — | [Release Notes, October 2020](https://posthog.com/blog/the-posthog-array-1-15-0), [Blog, November 2021](https://posthog.com/blog/how-we-turned-clickhouse-into-our-eventmansion) | -| Postmates | Delivery | — | — | — | [Talk in English, July 2020](https://youtu.be/GMiXCMFDMow?t=188) | -| Pragma Innovation | Telemetry and Big Data Analysis | Main product | — | — | [Slides in English, October 2018](https://github.com/ClickHouse/clickhouse-presentations/blob/master/meetup18/4_pragma_innovation.pdf) | -| PRANA | Industrial predictive analytics | Main product | — | — | [News (russian), Feb 2021](https://habr.com/en/news/t/541392/) | -| QINGCLOUD | Cloud services | Main product | — | — | [Slides in Chinese, October 2018](https://github.com/ClickHouse/clickhouse-presentations/blob/master/meetup19/4.%20Cloud%20%2B%20TSDB%20for%20ClickHouse%20张健%20QingCloud.pdf) | -| Qrator | DDoS protection | Main product | — | — | [Blog Post, March 2019](https://blog.qrator.net/en/clickhouse-ddos-mitigation_37/) | -| R-Vision | Information Security | — | — | — | [Article in Russian, December 2021](https://www.anti-malware.ru/reviews/R-Vision-SENSE-15) | -| Raiffeisenbank | Banking | Analytics | — | — | [Lecture in Russian, December 2020](https://cs.hse.ru/announcements/421965599.html) | -| Rambler | Internet services | Analytics | — | — | [Talk in Russian, April 2018](https://medium.com/@ramblertop/разработка-api-clickhouse-для-рамблер-топ-100-f4c7e56f3141) | -| Replica | Urban Planning | Analytics | — | — | [Job 
advertisement](https://boards.greenhouse.io/replica/jobs/5547732002?gh_jid=5547732002) | -| Retell | Speech synthesis | Analytics | — | — | [Blog Article, August 2020](https://vc.ru/services/153732-kak-sozdat-audiostati-na-vashem-sayte-i-zachem-eto-nuzhno) | -| Rollbar | Software Development | Main Product | — | — | [Official Website](https://www.rollbar.com) | -| Rspamd | Antispam | Analytics | — | — | [Official Website](https://rspamd.com/doc/modules/clickhouse.html) | -| RuSIEM | SIEM | Main Product | — | — | [Official Website](https://rusiem.com/en/products/architecture) | -| S7 Airlines | Airlines | Metrics, Logging | — | — | [Talk in Russian, March 2019](https://www.youtube.com/watch?v=nwG68klRpPg&t=15s) | -| Sber | Banking, Fintech, Retail, Cloud, Media | — | 128 servers | >1 PB | [Job advertisement, March 2021](https://career.habr.com/vacancies/1000073536) | -| scireum GmbH | e-Commerce | Main product | — | — | [Talk in German, February 2020](https://www.youtube.com/watch?v=7QWAn5RbyR4) | -| Segment | Data processing | Main product | 9 * i3en.3xlarge nodes 7.5TB NVME SSDs, 96GB Memory, 12 vCPUs | — | [Slides, 2019](https://slides.com/abraithwaite/segment-clickhouse) | -| sembot.io | Shopping Ads | — | — | — | A comment on LinkedIn, 2020 | -| SEMrush | Marketing | Main product | — | — | [Slides in Russian, August 2018](https://github.com/ClickHouse/clickhouse-presentations/blob/master/meetup17/5_semrush.pdf) | -| Sentry | Software Development | Main product | — | — | [Blog Post in English, May 2019](https://blog.sentry.io/2019/05/16/introducing-snuba-sentrys-new-search-infrastructure) | -| seo.do | Analytics | Main product | — | — | [Slides in English, November 2019](https://github.com/ClickHouse/clickhouse-presentations/blob/master/meetup35/CH%20Presentation-%20Metehan%20Çetinkaya.pdf) | -| SGK | Government Social Security | Analytics | — | — | [Slides in English, November 2019](https://github.com/ClickHouse/clickhouse-presentations/blob/master/meetup35/ClickHouse%20Meetup-Ramazan%20POLAT.pdf) | -| SigNoz | Observability Platform | Main Product | — | — | [Source code](https://github.com/SigNoz/signoz) | -| Sina | News | — | — | — | [Slides in Chinese, October 2018](https://github.com/ClickHouse/clickhouse-presentations/blob/master/meetup19/6.%20ClickHouse最佳实践%20高鹏_新浪.pdf) | -| Sipfront | Software Development | Analytics | — | — | [Tweet, October 2021](https://twitter.com/andreasgranig/status/1446404332337913895?s=20) | -| SMI2 | News | Analytics | — | — | [Blog Post in Russian, November 2017](https://habr.com/ru/company/smi2/blog/314558/) | -| Spark New Zealand | Telecommunications | Security Operations | — | — | [Blog Post, Feb 2020](https://blog.n0p.me/2020/02/2020-02-05-dnsmonster/) | -| Splitbee | Analytics | Main Product | — | — | [Blog Post, Mai 2021](https://splitbee.io/blog/new-pricing) | -| Splunk | Business Analytics | Main product | — | — | [Slides in English, January 2018](https://github.com/ClickHouse/clickhouse-presentations/blob/master/meetup12/splunk.pdf) | -| Spotify | Music | Experimentation | — | — | [Slides, July 2018](https://www.slideshare.net/glebus/using-clickhouse-for-experimentation-104247173) | -| Staffcop | Information Security | Main Product | — | — | [Official website, Documentation](https://www.staffcop.ru/sce43) | -| Suning | E-Commerce | User behaviour analytics | — | — | [Blog article](https://www.sohu.com/a/434152235_411876) | -| Superwall | Monetization Tooling | Main product | — | — | [Word of mouth, Jan 
2022](https://github.com/ClickHouse/ClickHouse/pull/33573) | -| Teralytics | Mobility | Analytics | — | — | [Tech blog](https://www.teralytics.net/knowledge-hub/visualizing-mobility-data-the-scalability-challenge) | -| Tencent | Big Data | Data processing | — | — | [Slides in Chinese, October 2018](https://github.com/ClickHouse/clickhouse-presentations/blob/master/meetup19/5.%20ClickHouse大数据集群应用_李俊飞腾讯网媒事业部.pdf) | -| Tencent | Messaging | Logging | — | — | [Talk in Chinese, November 2019](https://youtu.be/T-iVQRuw-QY?t=5050) | -| Tencent Music Entertainment (TME) | BigData | Data processing | — | — | [Blog in Chinese, June 2020](https://cloud.tencent.com/developer/article/1637840) | -| Tesla | Electric vehicle and clean energy company | — | — | — | [Vacancy description, March 2021](https://news.ycombinator.com/item?id=26306170) | -| Timeflow | Software | Analytics | — | — | [Blog](https://timeflow.systems/why-we-moved-from-druid-to-clickhouse/ ) | -| Tinybird | Real-time Data Products | Data processing | — | — | [Official website](https://www.tinybird.co/) | -| Traffic Stars | AD network | — | 300 servers in Europe/US | 1.8 PiB, 700 000 insert rps (as of 2021) | [Slides in Russian, May 2018](https://github.com/ClickHouse/clickhouse-presentations/blob/master/meetup15/lightning/ninja.pdf) | -| Uber | Taxi | Logging | — | — | [Slides, February 2020](https://presentations.clickhouse.com/meetup40/uber.pdf) | -| UseTech | Software Development | — | — | — | [Job Posting, December 2021](https://vk.com/wall136266658_2418) | -| UTMSTAT | Analytics | Main product | — | — | [Blog post, June 2020](https://vc.ru/tribuna/133956-striming-dannyh-iz-servisa-skvoznoy-analitiki-v-clickhouse) | -| Vercel | Traffic and Performance Analytics | — | — | — | Direct reference, October 2021 | -| VKontakte | Social Network | Statistics, Logging | — | — | [Slides in Russian, August 2018](https://github.com/ClickHouse/clickhouse-presentations/blob/master/meetup17/3_vk.pdf) | -| VMware | Cloud | VeloCloud, SDN | — | — | [Product documentation](https://docs.vmware.com/en/vRealize-Operations-Manager/8.3/com.vmware.vcom.metrics.doc/GUID-A9AD72E1-C948-4CA2-971B-919385AB3CA8.html) | -| Walmart Labs | Internet, Retail | — | — | — | [Talk in English, July 2020](https://youtu.be/GMiXCMFDMow?t=144) | -| Wargaming | Games | | — | — | [Interview](https://habr.com/en/post/496954/) | -| Wildberries | E-commerce | | — | — | [Official website](https://it.wildberries.ru/) | -| Wisebits | IT Solutions | Analytics | — | — | [Slides in Russian, May 2019](https://github.com/ClickHouse/clickhouse-presentations/blob/master/meetup22/strategies.pdf) | -| Workato | Automation Software | — | — | — | [Talk in English, July 2020](https://youtu.be/GMiXCMFDMow?t=334) | -| Xenoss | Marketing, Advertising | — | — | — | [Instagram, March 2021](https://www.instagram.com/p/CNATV7qBgB1/) | -| Xiaoxin Tech | Education | Common purpose | — | — | [Slides in English, November 2019](https://github.com/ClickHouse/clickhouse-presentations/blob/master/meetup33/sync-clickhouse-with-mysql-mongodb.pptx) | -| Ximalaya | Audio sharing | OLAP | — | — | [Slides in English, November 2019](https://github.com/ClickHouse/clickhouse-presentations/blob/master/meetup33/ximalaya.pdf) | -| Yandex Cloud | Public Cloud | Main product | — | — | [Talk in Russian, December 2019](https://www.youtube.com/watch?v=pgnak9e_E0o) | -| Yandex DataLens | Business Intelligence | Main product | — | — | [Slides in Russian, December 2019](https://presentations.clickhouse.com/meetup38/datalens.pdf) 
| -| Yandex Market | e-Commerce | Metrics, Logging | — | — | [Talk in Russian, January 2019](https://youtu.be/_l1qP0DyBcA?t=478) | -| Yandex Metrica | Web analytics | Main product | 630 servers in one cluster, 360 servers in another cluster, 1862 servers in one department | 133 PiB / 8.31 PiB / 120 trillion records | [Slides, February 2020](https://presentations.clickhouse.com/meetup40/introduction/#13) | -| | Analytics | Main product | - | - | [Integration](https://www.yellowfinbi.com/campaign/yellowfin-9-whats-new#el-30219e0e) | -| Yotascale | Cloud | Data pipeline | — | 2 bn records/day | [LinkedIn (Accomplishments)](https://www.linkedin.com/in/adilsaleem/) | -| Your Analytics | Product Analytics | Main Product | — | - | [Tweet, November 2021](https://twitter.com/mikenikles/status/1459737241165565953) | -| Zagrava Trading | — | — | — | — | [Job offer, May 2021](https://twitter.com/datastackjobs/status/1394707267082063874) | -| ЦВТ | Software Development | Metrics, Logging | — | — | [Blog Post, March 2019, in Russian](https://vc.ru/dev/62715-kak-my-stroili-monitoring-na-prometheus-clickhouse-i-elk) | -| МКБ | Bank | Web-system monitoring | — | — | [Slides in Russian, September 2019](https://github.com/ClickHouse/clickhouse-presentations/blob/master/meetup28/mkb.pdf) | -| ЦФТ | Banking, Financial products, Payments | — | — | — | [Meetup in Russian, April 2020](https://team.cft.ru/events/162) | -| Цифровой Рабочий | Industrial IoT, Analytics | — | — | — | [Blog post in Russian, March 2021](https://habr.com/en/company/croc/blog/548018/) | -| ООО «МПЗ Богородский» | Agriculture | — | — | — | [Article in Russian, November 2020](https://cloud.yandex.ru/cases/okraina) | -| ДомКлик | Real Estate | — | — | — | [Article in Russian, October 2021](https://habr.com/ru/company/domclick/blog/585936/) | -| АС "Стрела" | Transportation | — | — | — | [Job posting, Jan 2022](https://vk.com/topic-111905078_35689124?post=3553) | - -[Original article](https://clickhouse.com/docs/en/introduction/adopters/) diff --git a/docs/en/introduction/distinctive-features.md b/docs/en/introduction/distinctive-features.md deleted file mode 100644 index 951a8a9d3e5..00000000000 --- a/docs/en/introduction/distinctive-features.md +++ /dev/null @@ -1,96 +0,0 @@ ---- -toc_priority: 4 -toc_title: Distinctive Features ---- - -# Distinctive Features of ClickHouse {#distinctive-features-of-clickhouse} - -## True Column-Oriented Database Management System {#true-column-oriented-dbms} - -In a real column-oriented DBMS, no extra data is stored with the values. Among other things, this means that constant-length values must be supported, to avoid storing their length “number” next to the values. For example, a billion UInt8-type values should consume around 1 GB uncompressed, or this strongly affects the CPU use. It is essential to store data compactly (without any “garbage”) even when uncompressed since the speed of decompression (CPU usage) depends mainly on the volume of uncompressed data. - -It is worth noting because there are systems that can store values of different columns separately, but that can’t effectively process analytical queries due to their optimization for other scenarios. Examples are HBase, BigTable, Cassandra, and HyperTable. You would get throughput around a hundred thousand rows per second in these systems, but not hundreds of millions of rows per second. - -It’s also worth noting that ClickHouse is a database management system, not a single database. 
ClickHouse allows creating tables and databases in runtime, loading data, and running queries without reconfiguring and restarting the server. - -## Data Compression {#data-compression} - -Some column-oriented DBMSs do not use data compression. However, data compression does play a key role in achieving excellent performance. - -In addition to efficient general-purpose compression codecs with different trade-offs between disk space and CPU consumption, ClickHouse provides [specialized codecs](../sql-reference/statements/create/table.md#create-query-specialized-codecs) for specific kinds of data, which allow ClickHouse to compete with and outperform more niche databases, like time-series ones. - -## Disk Storage of Data {#disk-storage-of-data} - -Keeping data physically sorted by primary key makes it possible to extract data for its specific values or value ranges with low latency, less than a few dozen milliseconds. Some column-oriented DBMSs (such as SAP HANA and Google PowerDrill) can only work in RAM. This approach encourages the allocation of a larger hardware budget than is necessary for real-time analysis. - -ClickHouse is designed to work on regular hard drives, which means the cost per GB of data storage is low, but SSD and additional RAM are also fully used if available. - -## Parallel Processing on Multiple Cores {#parallel-processing-on-multiple-cores} - -Large queries are parallelized naturally, taking all the necessary resources available on the current server. - -## Distributed Processing on Multiple Servers {#distributed-processing-on-multiple-servers} - -Almost none of the columnar DBMSs mentioned above have support for distributed query processing. - -In ClickHouse, data can reside on different shards. Each shard can be a group of replicas used for fault tolerance. All shards are used to run a query in parallel, transparently for the user. - -## SQL Support {#sql-support} - -ClickHouse supports a [declarative query language based on SQL](../sql-reference/index.md) that is identical to the ANSI SQL standard in [many cases](../sql-reference/ansi.md). - -Supported queries include [GROUP BY](../sql-reference/statements/select/group-by.md), [ORDER BY](../sql-reference/statements/select/order-by.md), subqueries in [FROM](../sql-reference/statements/select/from.md), [JOIN](../sql-reference/statements/select/join.md) clause, [IN](../sql-reference/operators/in.md) operator, [window functions](../sql-reference/window-functions/index.md) and scalar subqueries. - -Correlated (dependent) subqueries are not supported at the time of writing but might become available in the future. - -## Vector Computation Engine {#vector-engine} - -Data is not only stored by columns but is processed by vectors (parts of columns), which allows achieving high CPU efficiency. - -## Real-time Data Updates {#real-time-data-updates} - -ClickHouse supports tables with a primary key. To quickly perform queries on the range of the primary key, the data is sorted incrementally using the merge tree. Due to this, data can continually be added to the table. No locks are taken when new data is ingested. - -## Primary Index {#primary-index} - -Having a data physically sorted by primary key makes it possible to extract data for its specific values or value ranges with low latency, less than a few dozen milliseconds. - -## Secondary Indexes {#secondary-indexes} - -Unlike other database management systems, secondary indexes in ClickHouse does not point to specific rows or row ranges. 
Instead, they allow the database to know in advance that all rows in some data parts wouldn’t match the query filtering conditions and do not read them at all, thus they are called [data skipping indexes](../engines/table-engines/mergetree-family/mergetree.md#table_engine-mergetree-data_skipping-indexes). - -## Suitable for Online Queries {#suitable-for-online-queries} - -Most OLAP database management systems do not aim for online queries with sub-second latencies. In alternative systems, report building time of tens of seconds or even minutes is often considered acceptable. Sometimes it takes even more which forces to prepare reports offline (in advance or by responding with “come back later”). - -In ClickHouse low latency means that queries can be processed without delay and without trying to prepare an answer in advance, right at the same moment while the user interface page is loading. In other words, online. - -## Support for Approximated Calculations {#support-for-approximated-calculations} - -ClickHouse provides various ways to trade accuracy for performance: - -1. Aggregate functions for approximated calculation of the number of distinct values, medians, and quantiles. -2. Running a query based on a part (sample) of data and getting an approximated result. In this case, proportionally less data is retrieved from the disk. -3. Running an aggregation for a limited number of random keys, instead of for all keys. Under certain conditions for key distribution in the data, this provides a reasonably accurate result while using fewer resources. - -## Adaptive Join Algorithm {#adaptive-join-algorithm} - -ClickHouse adaptively chooses how to [JOIN](../sql-reference/statements/select/join.md) multiple tables, by preferring hash-join algorithm and falling back to the merge-join algorithm if there’s more than one large table. - -## Data Replication and Data Integrity Support {#data-replication-and-data-integrity-support} - -ClickHouse uses asynchronous multi-master replication. After being written to any available replica, all the remaining replicas retrieve their copy in the background. The system maintains identical data on different replicas. Recovery after most failures is performed automatically, or semi-automatically in complex cases. - -For more information, see the section [Data replication](../engines/table-engines/mergetree-family/replication.md). - -## Role-Based Access Control {#role-based-access-control} - -ClickHouse implements user account management using SQL queries and allows for [role-based access control configuration](../operations/access-rights.md) similar to what can be found in ANSI SQL standard and popular relational database management systems. - -## Features that Can Be Considered Disadvantages {#clickhouse-features-that-can-be-considered-disadvantages} - -1. No full-fledged transactions. -2. Lack of ability to modify or delete already inserted data with a high rate and low latency. There are batch deletes and updates available to clean up or modify data, for example, to comply with [GDPR](https://gdpr-info.eu). -3. The sparse index makes ClickHouse not so efficient for point queries retrieving single rows by their keys. 
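As a sketch of the batch delete/update mechanism mentioned in point 2 above (the table and column names here are hypothetical, not taken from this document):

```sql
-- Mutations rewrite the affected data parts in the background.
-- They are intended for occasional bulk clean-up, not for frequent
-- low-latency modification of individual rows.
ALTER TABLE visits DELETE WHERE user_id = 42;

-- Batch update of already inserted data uses the same mutation mechanism.
ALTER TABLE visits UPDATE email = '' WHERE user_id = 42;
```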
- -[Original article](https://clickhouse.com/docs/en/introduction/distinctive-features/) diff --git a/docs/en/introduction/history.md b/docs/en/introduction/history.md deleted file mode 100644 index d192eff80ea..00000000000 --- a/docs/en/introduction/history.md +++ /dev/null @@ -1,54 +0,0 @@ ---- -toc_priority: 7 -toc_title: History ---- - -# ClickHouse History {#clickhouse-history} - -ClickHouse has been developed initially to power [Yandex.Metrica](https://metrica.yandex.com/), [the second largest web analytics platform in the world](http://w3techs.com/technologies/overview/traffic_analysis/all), and continues to be the core component of this system. With more than 13 trillion records in the database and more than 20 billion events daily, ClickHouse allows generating custom reports on the fly directly from non-aggregated data. This article briefly covers the goals of ClickHouse in the early stages of its development. - -Yandex.Metrica builds customized reports on the fly based on hits and sessions, with arbitrary segments defined by the user. Doing so often requires building complex aggregates, such as the number of unique users. New data for building a report arrives in real-time. - -As of April 2014, Yandex.Metrica was tracking about 12 billion events (page views and clicks) daily. All these events must be stored to build custom reports. A single query may require scanning millions of rows within a few hundred milliseconds, or hundreds of millions of rows in just a few seconds. - -## Usage in Yandex.Metrica and Other Yandex Services {#usage-in-yandex-metrica-and-other-yandex-services} - -ClickHouse serves multiple purposes in Yandex.Metrica. -Its main task is to build reports in online mode using non-aggregated data. It uses a cluster of 374 servers, which store over 20.3 trillion rows in the database. The volume of compressed data is about 2 PB, without accounting for duplicates and replicas. The volume of uncompressed data (in TSV format) would be approximately 17 PB. - -ClickHouse also plays a key role in the following processes: - -- Storing data for Session Replay from Yandex.Metrica. -- Processing intermediate data. -- Building global reports with Analytics. -- Running queries for debugging the Yandex.Metrica engine. -- Analyzing logs from the API and the user interface. - -Nowadays, there are multiple dozen ClickHouse installations in other Yandex services and departments: search verticals, e-commerce, advertisement, business analytics, mobile development, personal services, and others. - -## Aggregated and Non-aggregated Data {#aggregated-and-non-aggregated-data} - -There is a widespread opinion that to calculate statistics effectively, you must aggregate data since this reduces the volume of data. - -But data aggregation comes with a lot of limitations: - -- You must have a pre-defined list of required reports. -- The user can’t make custom reports. -- When aggregating over a large number of distinct keys, the data volume is barely reduced, so aggregation is useless. -- For a large number of reports, there are too many aggregation variations (combinatorial explosion). -- When aggregating keys with high cardinality (such as URLs), the volume of data is not reduced by much (less than twofold). -- For this reason, the volume of data with aggregation might grow instead of shrink. -- Users do not view all the reports we generate for them. A large portion of those calculations is useless. -- The logical integrity of data may be violated for various aggregations. 
- -If we do not aggregate anything and work with non-aggregated data, this might reduce the volume of calculations. - -However, with aggregation, a significant part of the work is taken offline and completed relatively calmly. In contrast, online calculations require calculating as fast as possible, since the user is waiting for the result. - -Yandex.Metrica has a specialized system for aggregating data called Metrage, which was used for the majority of reports. -Starting in 2009, Yandex.Metrica also used a specialized OLAP database for non-aggregated data called OLAPServer, which was previously used for the report builder. -OLAPServer worked well for non-aggregated data, but it had many restrictions that did not allow it to be used for all reports as desired. These included the lack of support for data types (only numbers), and the inability to incrementally update data in real-time (it could only be done by rewriting data daily). OLAPServer is not a DBMS, but a specialized DB. - -The initial goal for ClickHouse was to remove the limitations of OLAPServer and solve the problem of working with non-aggregated data for all reports, but over the years, it has grown into a general-purpose database management system suitable for a wide range of analytical tasks. - -[Original article](https://clickhouse.com/docs/en/introduction/history/) diff --git a/docs/en/introduction/index.md b/docs/en/introduction/index.md deleted file mode 100644 index ba80f9c2640..00000000000 --- a/docs/en/introduction/index.md +++ /dev/null @@ -1,6 +0,0 @@ ---- -toc_folder_title: Introduction -toc_priority: 1 ---- - - diff --git a/docs/en/introduction/performance.md b/docs/en/introduction/performance.md deleted file mode 100644 index 684b4ee4179..00000000000 --- a/docs/en/introduction/performance.md +++ /dev/null @@ -1,30 +0,0 @@ ---- -toc_priority: 6 -toc_title: Performance ---- - -# Performance {#performance} - -ClickHouse shows the best performance (both the highest throughput for long queries and the lowest latency on short queries) for comparable operating scenarios among systems of its class that were available for testing. You can view the test results on a [separate page](https://clickhouse.com/benchmark/dbms/). - -Numerous independent benchmarks came to similar conclusions. They are not difficult to find using an internet search, or you can see [our small collection of related links](https://clickhouse.com/#independent-benchmarks). - -## Throughput for a Single Large Query {#throughput-for-a-single-large-query} - -Throughput can be measured in rows per second or megabytes per second. If the data is placed in the page cache, a query that is not too complex is processed on modern hardware at a speed of approximately 2-10 GB/s of uncompressed data on a single server (for the most straightforward cases, the speed may reach 30 GB/s). If data is not placed in the page cache, the speed depends on the disk subsystem and the data compression rate. For example, if the disk subsystem allows reading data at 400 MB/s, and the data compression rate is 3, the speed is expected to be around 1.2 GB/s. To get the speed in rows per second, divide the speed in bytes per second by the total size of the columns used in the query. For example, if 10 bytes of columns are extracted, the speed is expected to be around 100-200 million rows per second. - -The processing speed increases almost linearly for distributed processing, but only if the number of rows resulting from aggregation or sorting is not too large. 
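To make the throughput arithmetic above explicit, using the figures already assumed in this section (a 400 MB/s disk subsystem, a compression rate of 3, and about 10 bytes of columns read per row):

$$
400\ \text{MB/s} \times 3 = 1.2\ \text{GB/s}, \qquad
\frac{1.2 \times 10^{9}\ \text{bytes/s}}{10\ \text{bytes/row}} \approx 1.2 \times 10^{8}\ \text{rows/s},
$$

i.e. roughly 120 million rows per second, which falls within the 100-200 million rows per second range quoted above.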
- -## Latency When Processing Short Queries {#latency-when-processing-short-queries} - -If a query uses a primary key and does not select too many columns and rows to process (hundreds of thousands), you can expect less than 50 milliseconds of latency (single digits of milliseconds in the best case) if data is placed in the page cache. Otherwise, latency is mostly dominated by the number of seeks. If you use rotating disk drives, for a system that is not overloaded, the latency can be estimated with this formula: `seek time (10 ms) * count of columns queried * count of data parts`. - -## Throughput When Processing a Large Quantity of Short Queries {#throughput-when-processing-a-large-quantity-of-short-queries} - -Under the same conditions, ClickHouse can handle several hundred queries per second on a single server (up to several thousand in the best case). Since this scenario is not typical for analytical DBMSs, we recommend expecting a maximum of 100 queries per second. - -## Performance When Inserting Data {#performance-when-inserting-data} - -We recommend inserting data in packets of at least 1000 rows, or no more than a single request per second. When inserting to a MergeTree table from a tab-separated dump, the insertion speed can be from 50 to 200 MB/s. If the inserted rows are around 1 KB in size, the speed will be from 50,000 to 200,000 rows per second. If the rows are small, the performance can be higher in rows per second (on Banner System data -`>` 500,000 rows per second; on Graphite data -`>` 1,000,000 rows per second). To improve performance, you can make multiple INSERT queries in parallel, which scales linearly. - -{## [Original article](https://clickhouse.com/docs/en/introduction/performance/) ##} diff --git a/docs/en/operations/_category_.yml b/docs/en/operations/_category_.yml new file mode 100644 index 00000000000..011ab58d26d --- /dev/null +++ b/docs/en/operations/_category_.yml @@ -0,0 +1,7 @@ +position: 70 +label: 'Operations' +collapsible: true +collapsed: true +link: + type: generated-index + title: Operations \ No newline at end of file diff --git a/docs/en/sql-reference/_category_.yml b/docs/en/sql-reference/_category_.yml new file mode 100644 index 00000000000..cfddcf46548 --- /dev/null +++ b/docs/en/sql-reference/_category_.yml @@ -0,0 +1,7 @@ +position: 15 +label: 'SQL Reference' +collapsible: true +collapsed: true +link: + type: generated-index + title: SQL Reference \ No newline at end of file diff --git a/docs/en/sql-reference/functions/encoding-functions.md b/docs/en/sql-reference/functions/encoding-functions.md index ec1524f1fa3..e3d5a4b18db 100644 --- a/docs/en/sql-reference/functions/encoding-functions.md +++ b/docs/en/sql-reference/functions/encoding-functions.md @@ -168,7 +168,7 @@ Result: Performs the opposite operation of [hex](#hex). It interprets each pair of hexadecimal digits (in the argument) as a number and converts it to the byte represented by the number. The return value is a binary string (BLOB). -If you want to convert the result to a number, you can use the [reverse](../../sql-reference/functions/string-functions.md#reverse) and [reinterpretAs](../../sql-reference/functions/type-conversion-functions.md#type-conversion-functions) functions. +If you want to convert the result to a number, you can use the [reverse](../../sql-reference/functions/string-functions.md#reverse) and [reinterpretAs<Type>](../../sql-reference/functions/type-conversion-functions.md#type-conversion-functions) functions. !!! 
note "Note" If `unhex` is invoked from within the `clickhouse-client`, binary strings display using UTF-8. @@ -326,7 +326,7 @@ unbin(arg) Alias: `UNBIN`. -For a numeric argument `unbin()` does not return the inverse of `bin()`. If you want to convert the result to a number, you can use the [reverse](../../sql-reference/functions/string-functions.md#reverse) and [reinterpretAs](../../sql-reference/functions/type-conversion-functions.md#reinterpretasuint8163264) functions. +For a numeric argument `unbin()` does not return the inverse of `bin()`. If you want to convert the result to a number, you can use the [reverse](../../sql-reference/functions/string-functions.md#reverse) and [reinterpretAs<Type>](../../sql-reference/functions/type-conversion-functions.md#reinterpretasuint8163264) functions. !!! note "Note" If `unbin` is invoked from within the `clickhouse-client`, binary strings are displayed using UTF-8. diff --git a/docs/en/sql-reference/statements/select/sample.md b/docs/en/sql-reference/statements/select/sample.md index 2405cb0a03c..a587731e563 100644 --- a/docs/en/sql-reference/statements/select/sample.md +++ b/docs/en/sql-reference/statements/select/sample.md @@ -25,11 +25,12 @@ The features of data sampling are listed below: For the `SAMPLE` clause the following syntax is supported: -| SAMPLE Clause Syntax | Description | -|----------------------|-----------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------| -| `SAMPLE k` | Here `k` is the number from 0 to 1.
The query is executed on `k` fraction of data. For example, `SAMPLE 0.1` runs the query on 10% of data. [Read more](#select-sample-k) | -| `SAMPLE n` | Here `n` is a sufficiently large integer.
The query is executed on a sample of at least `n` rows (but not significantly more than this). For example, `SAMPLE 10000000` runs the query on a minimum of 10,000,000 rows. [Read more](#select-sample-n) | -| `SAMPLE k OFFSET m` | Here `k` and `m` are the numbers from 0 to 1.
The query is executed on a sample of `k` fraction of the data. The data used for the sample is offset by `m` fraction. [Read more](#select-sample-offset) | +| SAMPLE Clause Syntax | Description | +|----------------------|------------------------------| +| `SAMPLE k` | Here `k` is the number from 0 to 1. The query is executed on `k` fraction of data. For example, `SAMPLE 0.1` runs the query on 10% of data. [Read more](#select-sample-k) | +| `SAMPLE n` | Here `n` is a sufficiently large integer. The query is executed on a sample of at least `n` rows (but not significantly more than this). For example, `SAMPLE 10000000` runs the query on a minimum of 10,000,000 rows. [Read more](#select-sample-n) | +| `SAMPLE k OFFSET m` | Here `k` and `m` are the numbers from 0 to 1. The query is executed on a sample of `k` fraction of the data. The data used for the sample is offset by `m` fraction. [Read more](#select-sample-offset) | + ## SAMPLE K {#select-sample-k} diff --git a/docs/en/whats-new/changelog/2017.md b/docs/en/whats-new/changelog/2017.md index af82c69386a..6a9f599daa0 100644 --- a/docs/en/whats-new/changelog/2017.md +++ b/docs/en/whats-new/changelog/2017.md @@ -1,6 +1,6 @@ --- -toc_priority: 79 -toc_title: '2017' +sidebar_label: 2017 +sidebar_position: 26 --- ### ClickHouse Release 1.1.54327, 2017-12-21 {#clickhouse-release-1-1-54327-2017-12-21} diff --git a/docs/en/whats-new/changelog/2018.md b/docs/en/whats-new/changelog/2018.md index db09bcd8a03..d4edca54e52 100644 --- a/docs/en/whats-new/changelog/2018.md +++ b/docs/en/whats-new/changelog/2018.md @@ -1,6 +1,6 @@ --- -toc_priority: 78 -toc_title: '2018' +sidebar_label: 2018 +sidebar_position: 25 --- ## ClickHouse Release 18.16 {#clickhouse-release-18-16} diff --git a/docs/en/whats-new/changelog/2019.md b/docs/en/whats-new/changelog/2019.md index aa06f5cb1e3..c41041705d9 100644 --- a/docs/en/whats-new/changelog/2019.md +++ b/docs/en/whats-new/changelog/2019.md @@ -1,6 +1,6 @@ --- -toc_priority: 77 -toc_title: '2019' +sidebar_label: 2019 +sidebar_position: 22 --- ## ClickHouse Release 19.17 {#clickhouse-release-v19-17} diff --git a/docs/en/whats-new/changelog/2020.md b/docs/en/whats-new/changelog/2020.md index e0afe256777..7ec37c51eb1 100644 --- a/docs/en/whats-new/changelog/2020.md +++ b/docs/en/whats-new/changelog/2020.md @@ -1,6 +1,6 @@ --- -toc_priority: 76 -toc_title: '2020' +sidebar_label: 2020 +sidebar_position: 21 --- ### ClickHouse release 20.12 diff --git a/docs/en/whats-new/changelog/2021.md b/docs/en/whats-new/changelog/2021.md index 2e81d981990..e4c430342ce 100644 --- a/docs/en/whats-new/changelog/2021.md +++ b/docs/en/whats-new/changelog/2021.md @@ -1,6 +1,8 @@ --- -toc_priority: 75 -toc_title: '2021' +sidebar_label: 2021 +sidebar_position: 20 +keywords: [clickhouse, changelog] +description: Changelog --- ### ClickHouse release v21.12, 2021-12-15 diff --git a/docs/en/whats-new/changelog/index.md b/docs/en/whats-new/changelog/index.md index 517ea16f3e7..22f6a30452d 100644 --- a/docs/en/whats-new/changelog/index.md +++ b/docs/en/whats-new/changelog/index.md @@ -1,7 +1,498 @@ --- -toc_folder_title: Changelog -toc_priority: 74 -toc_title: '2022' +sidebar_label: Changelog +sidebar_position: 1 +keywords: [clickhouse, changelog] +description: Changelog --- -{% include "content/changelog.md" %} +# ClickHouse Changelog + +### Table of Contents +**[ClickHouse release v22.3-lts, 2022-03-17](#223)**
+**[ClickHouse release v22.2, 2022-02-17](#222)**
+**[ClickHouse release v22.1, 2022-01-18](#221)**
+**[Changelog for 2021](https://github.com/ClickHouse/ClickHouse/blob/master/docs/en/whats-new/changelog/2021.md)**
+ + +## ClickHouse release v22.3-lts, 2022-03-17 + +#### Backward Incompatible Change + +* Make `arrayCompact` function behave as other higher-order functions: perform compaction not of lambda function results but on the original array. If you're using nontrivial lambda functions in arrayCompact you may restore old behaviour by wrapping `arrayCompact` arguments into `arrayMap`. Closes [#34010](https://github.com/ClickHouse/ClickHouse/issues/34010) [#18535](https://github.com/ClickHouse/ClickHouse/issues/18535) [#14778](https://github.com/ClickHouse/ClickHouse/issues/14778). [#34795](https://github.com/ClickHouse/ClickHouse/pull/34795) ([Alexandre Snarskii](https://github.com/snar)). +* Change implementation specific behavior on overflow of function `toDatetime`. It will be saturated to the nearest min/max supported instant of datetime instead of wraparound. This change is highlighted as "backward incompatible" because someone may unintentionally rely on the old behavior. [#32898](https://github.com/ClickHouse/ClickHouse/pull/32898) ([HaiBo Li](https://github.com/marising)). +* Make function `cast(value, 'IPv4')`, `cast(value, 'IPv6')` behave same as `toIPv4`, `toIPv6` functions. Changed behavior of incorrect IP address passed into functions `toIPv4`,` toIPv6`, now if invalid IP address passes into this functions exception will be raised, before this function return default value. Added functions `IPv4StringToNumOrDefault`, `IPv4StringToNumOrNull`, `IPv6StringToNumOrDefault`, `IPv6StringOrNull` `toIPv4OrDefault`, `toIPv4OrNull`, `toIPv6OrDefault`, `toIPv6OrNull`. Functions `IPv4StringToNumOrDefault `, `toIPv4OrDefault `, `toIPv6OrDefault ` should be used if previous logic relied on `IPv4StringToNum`, `toIPv4`, `toIPv6` returning default value for invalid address. Added setting `cast_ipv4_ipv6_default_on_conversion_error`, if this setting enabled, then IP address conversion functions will behave as before. Closes [#22825](https://github.com/ClickHouse/ClickHouse/issues/22825). Closes [#5799](https://github.com/ClickHouse/ClickHouse/issues/5799). Closes [#35156](https://github.com/ClickHouse/ClickHouse/issues/35156). [#35240](https://github.com/ClickHouse/ClickHouse/pull/35240) ([Maksim Kita](https://github.com/kitaisreal)). + +#### New Feature + +* Support for caching data locally for remote filesystems. It can be enabled for `s3` disks. Closes [#28961](https://github.com/ClickHouse/ClickHouse/issues/28961). [#33717](https://github.com/ClickHouse/ClickHouse/pull/33717) ([Kseniia Sumarokova](https://github.com/kssenii)). In the meantime, we enabled the test suite on s3 filesystem and no more known issues exist, so it is started to be production ready. +* Add new table function `hive`. It can be used as follows `hive('', '', '', '', '')` for example `SELECT * FROM hive('thrift://hivetest:9083', 'test', 'demo', 'id Nullable(String), score Nullable(Int32), day Nullable(String)', 'day')`. [#34946](https://github.com/ClickHouse/ClickHouse/pull/34946) ([lgbo](https://github.com/lgbo-ustc)). +* Support authentication of users connected via SSL by their X.509 certificate. [#31484](https://github.com/ClickHouse/ClickHouse/pull/31484) ([eungenue](https://github.com/eungenue)). +* Support schema inference for inserting into table functions `file`/`hdfs`/`s3`/`url`. [#34732](https://github.com/ClickHouse/ClickHouse/pull/34732) ([Kruglov Pavel](https://github.com/Avogar)). +* Now you can read `system.zookeeper` table without restrictions on path or using `like` expression. 
This reads can generate quite heavy load for zookeeper so to enable this ability you have to enable setting `allow_unrestricted_reads_from_keeper`. [#34609](https://github.com/ClickHouse/ClickHouse/pull/34609) ([Sergei Trifonov](https://github.com/serxa)). +* Display CPU and memory metrics in clickhouse-local. Close [#34545](https://github.com/ClickHouse/ClickHouse/issues/34545). [#34605](https://github.com/ClickHouse/ClickHouse/pull/34605) ([李扬](https://github.com/taiyang-li)). +* Implement `startsWith` and `endsWith` function for arrays, closes [#33982](https://github.com/ClickHouse/ClickHouse/issues/33982). [#34368](https://github.com/ClickHouse/ClickHouse/pull/34368) ([usurai](https://github.com/usurai)). +* Add three functions for Map data type: 1. `mapReplace(map1, map2)` - replaces values for keys in map1 with the values of the corresponding keys in map2; adds keys from map2 that don't exist in map1. 2. `mapFilter` 3. `mapMap`. mapFilter and mapMap are higher order functions, accepting two arguments, the first argument is a lambda function with k, v pair as arguments, the second argument is a column of type Map. [#33698](https://github.com/ClickHouse/ClickHouse/pull/33698) ([hexiaoting](https://github.com/hexiaoting)). +* Allow getting default user and password for clickhouse-client from the `CLICKHOUSE_USER` and `CLICKHOUSE_PASSWORD` environment variables. Close [#34538](https://github.com/ClickHouse/ClickHouse/issues/34538). [#34947](https://github.com/ClickHouse/ClickHouse/pull/34947) ([DR](https://github.com/freedomDR)). + +#### Experimental Feature + +* New data type `Object()`, which supports storing of semi-structured data (for now JSON only). Data is written to such types as string. Then all paths are extracted according to format of semi-structured data and written as separate columns in most optimal types, that can store all their values. Those columns can be queried by names that match paths in source data. E.g `data.key1.key2` or with cast operator `data.key1.key2::Int64`. +* Add `database_replicated_allow_only_replicated_engine` setting. When enabled, it only allowed to only create `Replicated` tables or tables with stateless engines in `Replicated` databases. [#35214](https://github.com/ClickHouse/ClickHouse/pull/35214) ([Nikolai Kochetov](https://github.com/KochetovNicolai)). Note that `Replicated` database is still an experimental feature. + +#### Performance Improvement + +* Improve performance of insertion into `MergeTree` tables by optimizing sorting. Up to 2x improvement is observed on realistic benchmarks. [#34750](https://github.com/ClickHouse/ClickHouse/pull/34750) ([Maksim Kita](https://github.com/kitaisreal)). +* Columns pruning when reading Parquet, ORC and Arrow files from URL and S3. Closes [#34163](https://github.com/ClickHouse/ClickHouse/issues/34163). [#34849](https://github.com/ClickHouse/ClickHouse/pull/34849) ([Kseniia Sumarokova](https://github.com/kssenii)). +* Columns pruning when reading Parquet, ORC and Arrow files from Hive. [#34954](https://github.com/ClickHouse/ClickHouse/pull/34954) ([lgbo](https://github.com/lgbo-ustc)). +* A bunch of performance optimizations from a performance superhero. Improve performance of processing queries with large `IN` section. Improve performance of `direct` dictionary if its source is `ClickHouse`. Improve performance of `detectCharset `, `detectLanguageUnknown ` functions. [#34888](https://github.com/ClickHouse/ClickHouse/pull/34888) ([Maksim Kita](https://github.com/kitaisreal)). 
+* Improve performance of `any` aggregate function by using more batching. [#34760](https://github.com/ClickHouse/ClickHouse/pull/34760) ([Raúl Marín](https://github.com/Algunenano)). +* Multiple improvements for performance of `clickhouse-keeper`: less locking [#35010](https://github.com/ClickHouse/ClickHouse/pull/35010) ([zhanglistar](https://github.com/zhanglistar)), lower memory usage by streaming reading and writing of snapshot instead of full copy. [#34584](https://github.com/ClickHouse/ClickHouse/pull/34584) ([zhanglistar](https://github.com/zhanglistar)), optimizing compaction of log store in the RAFT implementation. [#34534](https://github.com/ClickHouse/ClickHouse/pull/34534) ([zhanglistar](https://github.com/zhanglistar)), versioning of the internal data structure [#34486](https://github.com/ClickHouse/ClickHouse/pull/34486) ([zhanglistar](https://github.com/zhanglistar)). + +#### Improvement + +* Allow asynchronous inserts to table functions. Fixes [#34864](https://github.com/ClickHouse/ClickHouse/issues/34864). [#34866](https://github.com/ClickHouse/ClickHouse/pull/34866) ([Anton Popov](https://github.com/CurtizJ)). +* Implicit type casting of the key argument for functions `dictGetHierarchy`, `dictIsIn`, `dictGetChildren`, `dictGetDescendants`. Closes [#34970](https://github.com/ClickHouse/ClickHouse/issues/34970). [#35027](https://github.com/ClickHouse/ClickHouse/pull/35027) ([Maksim Kita](https://github.com/kitaisreal)). +* `EXPLAIN AST` query can output AST in form of a graph in Graphviz format: `EXPLAIN AST graph = 1 SELECT * FROM system.parts`. [#35173](https://github.com/ClickHouse/ClickHouse/pull/35173) ([李扬](https://github.com/taiyang-li)). +* When large files were written with `s3` table function or table engine, the content type on the files was mistakenly set to `application/xml` due to a bug in the AWS SDK. This closes [#33964](https://github.com/ClickHouse/ClickHouse/issues/33964). [#34433](https://github.com/ClickHouse/ClickHouse/pull/34433) ([Alexey Milovidov](https://github.com/alexey-milovidov)). +* Change restrictive row policies a bit to make them an easier alternative to permissive policies in easy cases. If for a particular table only restrictive policies exist (without permissive policies) users will be able to see some rows. Also `SHOW CREATE ROW POLICY` will always show `AS permissive` or `AS restrictive` in row policy's definition. [#34596](https://github.com/ClickHouse/ClickHouse/pull/34596) ([Vitaly Baranov](https://github.com/vitlibar)). +* Improve schema inference with globs in File/S3/HDFS/URL engines. Try to use the next path for schema inference in case of error. [#34465](https://github.com/ClickHouse/ClickHouse/pull/34465) ([Kruglov Pavel](https://github.com/Avogar)). +* Play UI now correctly detects the preferred light/dark theme from the OS. [#35068](https://github.com/ClickHouse/ClickHouse/pull/35068) ([peledni](https://github.com/peledni)). +* Added `date_time_input_format = 'best_effort_us'`. Closes [#34799](https://github.com/ClickHouse/ClickHouse/issues/34799). [#34982](https://github.com/ClickHouse/ClickHouse/pull/34982) ([WenYao](https://github.com/Cai-Yao)). +* A new settings called `allow_plaintext_password` and `allow_no_password` are added in server configuration which turn on/off authentication types that can be potentially insecure in some environments. They are allowed by default. [#34738](https://github.com/ClickHouse/ClickHouse/pull/34738) ([Heena Bansal](https://github.com/HeenaBansal2009)). 
+* Support for `DateTime64` data type in `Arrow` format, closes [#8280](https://github.com/ClickHouse/ClickHouse/issues/8280) and closes [#28574](https://github.com/ClickHouse/ClickHouse/issues/28574). [#34561](https://github.com/ClickHouse/ClickHouse/pull/34561) ([李扬](https://github.com/taiyang-li)). +* Reload `remote_url_allow_hosts` (filtering of outgoing connections) on config update. [#35294](https://github.com/ClickHouse/ClickHouse/pull/35294) ([Nikolai Kochetov](https://github.com/KochetovNicolai)). +* Support `--testmode` parameter for `clickhouse-local`. This parameter enables interpretation of test hints that we use in functional tests. [#35264](https://github.com/ClickHouse/ClickHouse/pull/35264) ([Kseniia Sumarokova](https://github.com/kssenii)). +* Add `distributed_depth` to query log. It is like a more detailed variant of `is_initial_query` [#35207](https://github.com/ClickHouse/ClickHouse/pull/35207) ([李扬](https://github.com/taiyang-li)). +* Respect `remote_url_allow_hosts` for `MySQL` and `PostgreSQL` table functions. [#35191](https://github.com/ClickHouse/ClickHouse/pull/35191) ([Heena Bansal](https://github.com/HeenaBansal2009)). +* Added `disk_name` field to `system.part_log`. [#35178](https://github.com/ClickHouse/ClickHouse/pull/35178) ([Artyom Yurkov](https://github.com/Varinara)). +* Do not retry non-rertiable errors when querying remote URLs. Closes [#35161](https://github.com/ClickHouse/ClickHouse/issues/35161). [#35172](https://github.com/ClickHouse/ClickHouse/pull/35172) ([Kseniia Sumarokova](https://github.com/kssenii)). +* Support distributed INSERT SELECT queries (the setting `parallel_distributed_insert_select`) table function `view()`. [#35132](https://github.com/ClickHouse/ClickHouse/pull/35132) ([Azat Khuzhin](https://github.com/azat)). +* More precise memory tracking during `INSERT` into `Buffer` with `AggregateFunction`. [#35072](https://github.com/ClickHouse/ClickHouse/pull/35072) ([Azat Khuzhin](https://github.com/azat)). +* Avoid division by zero in Query Profiler if Linux kernel has a bug. Closes [#34787](https://github.com/ClickHouse/ClickHouse/issues/34787). [#35032](https://github.com/ClickHouse/ClickHouse/pull/35032) ([Alexey Milovidov](https://github.com/alexey-milovidov)). +* Add more sanity checks for keeper configuration: now mixing of localhost and non-local servers is not allowed, also add checks for same value of internal raft port and keeper client port. [#35004](https://github.com/ClickHouse/ClickHouse/pull/35004) ([alesapin](https://github.com/alesapin)). +* Currently, if the user changes the settings of the system tables there will be tons of logs and ClickHouse will rename the tables every minute. This fixes [#34929](https://github.com/ClickHouse/ClickHouse/issues/34929). [#34949](https://github.com/ClickHouse/ClickHouse/pull/34949) ([Nikita Mikhaylov](https://github.com/nikitamikhaylov)). +* Use connection pool for Hive metastore client. [#34940](https://github.com/ClickHouse/ClickHouse/pull/34940) ([lgbo](https://github.com/lgbo-ustc)). +* Ignore per-column `TTL` in `CREATE TABLE AS` if new table engine does not support it (i.e. if the engine is not of `MergeTree` family). [#34938](https://github.com/ClickHouse/ClickHouse/pull/34938) ([Azat Khuzhin](https://github.com/azat)). +* Allow `LowCardinality` strings for `ngrambf_v1`/`tokenbf_v1` indexes. Closes [#21865](https://github.com/ClickHouse/ClickHouse/issues/21865). [#34911](https://github.com/ClickHouse/ClickHouse/pull/34911) ([Lars Hiller Eidnes](https://github.com/larspars)). 
+* Allow opening empty sqlite db if the file doesn't exist. Closes [#33367](https://github.com/ClickHouse/ClickHouse/issues/33367). [#34907](https://github.com/ClickHouse/ClickHouse/pull/34907) ([Kseniia Sumarokova](https://github.com/kssenii)). +* Implement memory statistics for FreeBSD - this is required for `max_server_memory_usage` to work correctly. [#34902](https://github.com/ClickHouse/ClickHouse/pull/34902) ([Alexandre Snarskii](https://github.com/snar)). +* In previous versions the progress bar in clickhouse-client can jump forward near 50% for no reason. This closes [#34324](https://github.com/ClickHouse/ClickHouse/issues/34324). [#34801](https://github.com/ClickHouse/ClickHouse/pull/34801) ([Alexey Milovidov](https://github.com/alexey-milovidov)). +* Now `ALTER TABLE DROP COLUMN columnX` queries for `MergeTree` table engines will work instantly when `columnX` is an `ALIAS` column. Fixes [#34660](https://github.com/ClickHouse/ClickHouse/issues/34660). [#34786](https://github.com/ClickHouse/ClickHouse/pull/34786) ([alesapin](https://github.com/alesapin)). +* Show hints when user mistyped the name of a data skipping index. Closes [#29698](https://github.com/ClickHouse/ClickHouse/issues/29698). [#34764](https://github.com/ClickHouse/ClickHouse/pull/34764) ([flynn](https://github.com/ucasfl)). +* Support `remote()`/`cluster()` table functions for `parallel_distributed_insert_select`. [#34728](https://github.com/ClickHouse/ClickHouse/pull/34728) ([Azat Khuzhin](https://github.com/azat)). +* Do not reset logging that configured via `--log-file`/`--errorlog-file` command line options in case of empty configuration in the config file. [#34718](https://github.com/ClickHouse/ClickHouse/pull/34718) ([Amos Bird](https://github.com/amosbird)). +* Extract schema only once on table creation and prevent reading from local files/external sources to extract schema on each server startup. [#34684](https://github.com/ClickHouse/ClickHouse/pull/34684) ([Kruglov Pavel](https://github.com/Avogar)). +* Allow specifying argument names for executable UDFs. This is necessary for formats where argument name is part of serialization, like `Native`, `JSONEachRow`. Closes [#34604](https://github.com/ClickHouse/ClickHouse/issues/34604). [#34653](https://github.com/ClickHouse/ClickHouse/pull/34653) ([Maksim Kita](https://github.com/kitaisreal)). +* `MaterializedMySQL` (experimental feature) now supports `materialized_mysql_tables_list` (a comma-separated list of MySQL database tables, which will be replicated by the MaterializedMySQL database engine. Default value: empty list — means all the tables will be replicated), mentioned at [#32977](https://github.com/ClickHouse/ClickHouse/issues/32977). [#34487](https://github.com/ClickHouse/ClickHouse/pull/34487) ([zzsmdfj](https://github.com/zzsmdfj)). +* Improve OpenTelemetry span logs for INSERT operation on distributed table. [#34480](https://github.com/ClickHouse/ClickHouse/pull/34480) ([Frank Chen](https://github.com/FrankChen021)). +* Make the znode `ctime` and `mtime` consistent between servers in ClickHouse Keeper. [#33441](https://github.com/ClickHouse/ClickHouse/pull/33441) ([小路](https://github.com/nicelulu)). + +#### Build/Testing/Packaging Improvement + +* Package repository is migrated to JFrog Artifactory (**Mikhail f. Shiryaev**). +* Randomize some settings in functional tests, so more possible combinations of settings will be tested. This is yet another fuzzing method to ensure better test coverage. 
This closes [#32268](https://github.com/ClickHouse/ClickHouse/issues/32268). [#34092](https://github.com/ClickHouse/ClickHouse/pull/34092) ([Kruglov Pavel](https://github.com/Avogar)). +* Drop PVS-Studio from our CI. [#34680](https://github.com/ClickHouse/ClickHouse/pull/34680) ([Mikhail f. Shiryaev](https://github.com/Felixoid)). +* Add an ability to build stripped binaries with CMake. In previous versions it was performed by dh-tools. [#35196](https://github.com/ClickHouse/ClickHouse/pull/35196) ([alesapin](https://github.com/alesapin)). +* Smaller "fat-free" `clickhouse-keeper` build. [#35031](https://github.com/ClickHouse/ClickHouse/pull/35031) ([alesapin](https://github.com/alesapin)). +* Use @robot-clickhouse as an author and committer for PRs like https://github.com/ClickHouse/ClickHouse/pull/34685. [#34793](https://github.com/ClickHouse/ClickHouse/pull/34793) ([Mikhail f. Shiryaev](https://github.com/Felixoid)). +* Limit DWARF version for debug info by 4 max, because our internal stack symbolizer cannot parse DWARF version 5. This makes sense if you compile ClickHouse with clang-15. [#34777](https://github.com/ClickHouse/ClickHouse/pull/34777) ([Alexey Milovidov](https://github.com/alexey-milovidov)). +* Remove `clickhouse-test` debian package as unneeded complication. CI use tests from repository and standalone testing via deb package is no longer supported. [#34606](https://github.com/ClickHouse/ClickHouse/pull/34606) ([Ilya Yatsishin](https://github.com/qoega)). + +#### Bug Fix (user-visible misbehaviour in official stable or prestable release) + +* A fix for HDFS integration: When the inner buffer size is too small, NEED_MORE_INPUT in `HadoopSnappyDecoder` will run multi times (>=3) for one compressed block. This makes the input data be copied into the wrong place in `HadoopSnappyDecoder::buffer`. [#35116](https://github.com/ClickHouse/ClickHouse/pull/35116) ([lgbo](https://github.com/lgbo-ustc)). +* Ignore obsolete grants in ATTACH GRANT statements. This PR fixes [#34815](https://github.com/ClickHouse/ClickHouse/issues/34815). [#34855](https://github.com/ClickHouse/ClickHouse/pull/34855) ([Vitaly Baranov](https://github.com/vitlibar)). +* Fix segfault in Postgres database when getting create table query if database was created using named collections. Closes [#35312](https://github.com/ClickHouse/ClickHouse/issues/35312). [#35313](https://github.com/ClickHouse/ClickHouse/pull/35313) ([Kseniia Sumarokova](https://github.com/kssenii)). +* Fix partial merge join duplicate rows bug, close [#31009](https://github.com/ClickHouse/ClickHouse/issues/31009). [#35311](https://github.com/ClickHouse/ClickHouse/pull/35311) ([Vladimir C](https://github.com/vdimir)). +* Fix possible `Assertion 'position() != working_buffer.end()' failed` while using bzip2 compression with small `max_read_buffer_size` setting value. The bug was found in https://github.com/ClickHouse/ClickHouse/pull/35047. [#35300](https://github.com/ClickHouse/ClickHouse/pull/35300) ([Kruglov Pavel](https://github.com/Avogar)). While using lz4 compression with a small max_read_buffer_size setting value. [#35296](https://github.com/ClickHouse/ClickHouse/pull/35296) ([Kruglov Pavel](https://github.com/Avogar)). While using lzma compression with small `max_read_buffer_size` setting value. [#35295](https://github.com/ClickHouse/ClickHouse/pull/35295) ([Kruglov Pavel](https://github.com/Avogar)). While using `brotli` compression with a small `max_read_buffer_size` setting value. 
The bug was found in https://github.com/ClickHouse/ClickHouse/pull/35047. [#35281](https://github.com/ClickHouse/ClickHouse/pull/35281) ([Kruglov Pavel](https://github.com/Avogar)). +* Fix possible segfault in `JSONEachRow` schema inference. [#35291](https://github.com/ClickHouse/ClickHouse/pull/35291) ([Kruglov Pavel](https://github.com/Avogar)). +* Fix `CHECK TABLE` query in case when sparse columns are enabled in table. [#35274](https://github.com/ClickHouse/ClickHouse/pull/35274) ([Anton Popov](https://github.com/CurtizJ)). +* Avoid std::terminate in case of exception in reading from remote VFS. [#35257](https://github.com/ClickHouse/ClickHouse/pull/35257) ([Azat Khuzhin](https://github.com/azat)). +* Fix reading port from config, close [#34776](https://github.com/ClickHouse/ClickHouse/issues/34776). [#35193](https://github.com/ClickHouse/ClickHouse/pull/35193) ([Vladimir C](https://github.com/vdimir)). +* Fix error in query with `WITH TOTALS` in case if `HAVING` returned empty result. This fixes [#33711](https://github.com/ClickHouse/ClickHouse/issues/33711). [#35186](https://github.com/ClickHouse/ClickHouse/pull/35186) ([Amos Bird](https://github.com/amosbird)). +* Fix a corner case of `replaceRegexpAll`, close [#35117](https://github.com/ClickHouse/ClickHouse/issues/35117). [#35182](https://github.com/ClickHouse/ClickHouse/pull/35182) ([Vladimir C](https://github.com/vdimir)). +* Schema inference didn't work properly on case of `INSERT INTO FUNCTION s3(...) FROM ...`, it tried to read schema from s3 file instead of from select query. [#35176](https://github.com/ClickHouse/ClickHouse/pull/35176) ([Kruglov Pavel](https://github.com/Avogar)). +* Fix MaterializedPostgreSQL (experimental feature) `table overrides` for partition by, etc. Closes [#35048](https://github.com/ClickHouse/ClickHouse/issues/35048). [#35162](https://github.com/ClickHouse/ClickHouse/pull/35162) ([Kseniia Sumarokova](https://github.com/kssenii)). +* Fix MaterializedPostgreSQL (experimental feature) adding new table to replication (ATTACH TABLE) after manually removing (DETACH TABLE). Closes [#33800](https://github.com/ClickHouse/ClickHouse/issues/33800). Closes [#34922](https://github.com/ClickHouse/ClickHouse/issues/34922). Closes [#34315](https://github.com/ClickHouse/ClickHouse/issues/34315). [#35158](https://github.com/ClickHouse/ClickHouse/pull/35158) ([Kseniia Sumarokova](https://github.com/kssenii)). +* Fix partition pruning error when non-monotonic function is used with IN operator. This fixes [#35136](https://github.com/ClickHouse/ClickHouse/issues/35136). [#35146](https://github.com/ClickHouse/ClickHouse/pull/35146) ([Amos Bird](https://github.com/amosbird)). +* Fixed slightly incorrect translation of YAML configs to XML. [#35135](https://github.com/ClickHouse/ClickHouse/pull/35135) ([Miel Donkers](https://github.com/mdonkers)). +* Fix `optimize_skip_unused_shards_rewrite_in` for signed columns and negative values. [#35134](https://github.com/ClickHouse/ClickHouse/pull/35134) ([Azat Khuzhin](https://github.com/azat)). +* The `update_lag` external dictionary configuration option was unusable showing the error message ``Unexpected key `update_lag` in dictionary source configuration``. [#35089](https://github.com/ClickHouse/ClickHouse/pull/35089) ([Jason Chu](https://github.com/1lann)). +* Avoid possible deadlock on server shutdown. [#35081](https://github.com/ClickHouse/ClickHouse/pull/35081) ([Azat Khuzhin](https://github.com/azat)). 
+* Fix missing alias after function is optimized to a subcolumn when setting `optimize_functions_to_subcolumns` is enabled. Closes [#33798](https://github.com/ClickHouse/ClickHouse/issues/33798). [#35079](https://github.com/ClickHouse/ClickHouse/pull/35079) ([qieqieplus](https://github.com/qieqieplus)). +* Fix reading from `system.asynchronous_inserts` table if there exists asynchronous insert into table function. [#35050](https://github.com/ClickHouse/ClickHouse/pull/35050) ([Anton Popov](https://github.com/CurtizJ)). +* Fix possible exception `Reading for MergeTree family tables must be done with last position boundary` (relevant to operation on remote VFS). Closes [#34979](https://github.com/ClickHouse/ClickHouse/issues/34979). [#35001](https://github.com/ClickHouse/ClickHouse/pull/35001) ([Kseniia Sumarokova](https://github.com/kssenii)). +* Fix unexpected result when use -State type aggregate function in window frame. [#34999](https://github.com/ClickHouse/ClickHouse/pull/34999) ([metahys](https://github.com/metahys)). +* Fix possible segfault in FileLog (experimental feature). Closes [#30749](https://github.com/ClickHouse/ClickHouse/issues/30749). [#34996](https://github.com/ClickHouse/ClickHouse/pull/34996) ([Kseniia Sumarokova](https://github.com/kssenii)). +* Fix possible rare error `Cannot push block to port which already has data`. [#34993](https://github.com/ClickHouse/ClickHouse/pull/34993) ([Nikolai Kochetov](https://github.com/KochetovNicolai)). +* Fix wrong schema inference for unquoted dates in CSV. Closes [#34768](https://github.com/ClickHouse/ClickHouse/issues/34768). [#34961](https://github.com/ClickHouse/ClickHouse/pull/34961) ([Kruglov Pavel](https://github.com/Avogar)). +* Integration with Hive: Fix unexpected result when use `in` in `where` in hive query. [#34945](https://github.com/ClickHouse/ClickHouse/pull/34945) ([lgbo](https://github.com/lgbo-ustc)). +* Avoid busy polling in ClickHouse Keeper while searching for changelog files to delete. [#34931](https://github.com/ClickHouse/ClickHouse/pull/34931) ([Azat Khuzhin](https://github.com/azat)). +* Fix DateTime64 conversion from PostgreSQL. Closes [#33364](https://github.com/ClickHouse/ClickHouse/issues/33364). [#34910](https://github.com/ClickHouse/ClickHouse/pull/34910) ([Kseniia Sumarokova](https://github.com/kssenii)). +* Fix possible "Part directory doesn't exist" during `INSERT` into MergeTree table backed by VFS over s3. [#34876](https://github.com/ClickHouse/ClickHouse/pull/34876) ([Azat Khuzhin](https://github.com/azat)). +* Support DDLs like CREATE USER to be executed on cross replicated cluster. [#34860](https://github.com/ClickHouse/ClickHouse/pull/34860) ([Jianmei Zhang](https://github.com/zhangjmruc)). +* Fix bugs for multiple columns group by in `WindowView` (experimental feature). [#34859](https://github.com/ClickHouse/ClickHouse/pull/34859) ([vxider](https://github.com/Vxider)). +* Fix possible failures in S2 functions when queries contain const columns. [#34745](https://github.com/ClickHouse/ClickHouse/pull/34745) ([Bharat Nallan](https://github.com/bharatnc)). +* Fix bug for H3 funcs containing const columns which cause queries to fail. [#34743](https://github.com/ClickHouse/ClickHouse/pull/34743) ([Bharat Nallan](https://github.com/bharatnc)). +* Fix `No such file or directory` with enabled `fsync_part_directory` and vertical merge. [#34739](https://github.com/ClickHouse/ClickHouse/pull/34739) ([Azat Khuzhin](https://github.com/azat)). 
+* Fix serialization/printing for system queries `RELOAD MODEL`, `RELOAD FUNCTION`, `RESTART DISK` when used `ON CLUSTER`. Closes [#34514](https://github.com/ClickHouse/ClickHouse/issues/34514). [#34696](https://github.com/ClickHouse/ClickHouse/pull/34696) ([Maksim Kita](https://github.com/kitaisreal)). +* Fix `allow_experimental_projection_optimization` with `enable_global_with_statement` (before it may lead to `Stack size too large` error in case of multiple expressions in `WITH` clause, and also it executes scalar subqueries again and again, so not it will be more optimal). [#34650](https://github.com/ClickHouse/ClickHouse/pull/34650) ([Azat Khuzhin](https://github.com/azat)). +* Stop to select part for mutate when the other replica has already updated the transaction log for `ReplatedMergeTree` engine. [#34633](https://github.com/ClickHouse/ClickHouse/pull/34633) ([Jianmei Zhang](https://github.com/zhangjmruc)). +* Fix incorrect result of trivial count query when part movement feature is used [#34089](https://github.com/ClickHouse/ClickHouse/issues/34089). [#34385](https://github.com/ClickHouse/ClickHouse/pull/34385) ([nvartolomei](https://github.com/nvartolomei)). +* Fix inconsistency of `max_query_size` limitation in distributed subqueries. [#34078](https://github.com/ClickHouse/ClickHouse/pull/34078) ([Chao Ma](https://github.com/godliness)). + + +### ClickHouse release v22.2, 2022-02-17 + +#### Upgrade Notes + +* Applying data skipping indexes for queries with FINAL may produce incorrect result. In this release we disabled data skipping indexes by default for queries with FINAL (a new setting `use_skip_indexes_if_final` is introduced and disabled by default). [#34243](https://github.com/ClickHouse/ClickHouse/pull/34243) ([Azat Khuzhin](https://github.com/azat)). + +#### New Feature + +* Projections are production ready. Set `allow_experimental_projection_optimization` by default and deprecate this setting. [#34456](https://github.com/ClickHouse/ClickHouse/pull/34456) ([Nikolai Kochetov](https://github.com/KochetovNicolai)). +* An option to create a new files on insert for `File`/`S3`/`HDFS` engines. Allow to overwrite a file in `HDFS`. Throw an exception in attempt to overwrite a file in `S3` by default. Throw an exception in attempt to append data to file in formats that have a suffix (and thus don't support appends, like `Parquet`, `ORC`). Closes [#31640](https://github.com/ClickHouse/ClickHouse/issues/31640) Closes [#31622](https://github.com/ClickHouse/ClickHouse/issues/31622) Closes [#23862](https://github.com/ClickHouse/ClickHouse/issues/23862) Closes [#15022](https://github.com/ClickHouse/ClickHouse/issues/15022) Closes [#16674](https://github.com/ClickHouse/ClickHouse/issues/16674). [#33302](https://github.com/ClickHouse/ClickHouse/pull/33302) ([Kruglov Pavel](https://github.com/Avogar)). +* Add a setting that allows a user to provide own deduplication semantic in `MergeTree`/`ReplicatedMergeTree` If provided, it's used instead of data digest to generate block ID. So, for example, by providing a unique value for the setting in each INSERT statement, the user can avoid the same inserted data being deduplicated. This closes: [#7461](https://github.com/ClickHouse/ClickHouse/issues/7461). [#32304](https://github.com/ClickHouse/ClickHouse/pull/32304) ([Igor Nikonov](https://github.com/devcrafter)). +* Add support of `DEFAULT` keyword for INSERT statements. Closes [#6331](https://github.com/ClickHouse/ClickHouse/issues/6331). 
[#33141](https://github.com/ClickHouse/ClickHouse/pull/33141) ([Andrii Buriachevskyi](https://github.com/1over)). +* `EPHEMERAL` column specifier is added to `CREATE TABLE` query. Closes [#9436](https://github.com/ClickHouse/ClickHouse/issues/9436). [#34424](https://github.com/ClickHouse/ClickHouse/pull/34424) ([yakov-olkhovskiy](https://github.com/yakov-olkhovskiy)). +* Support `IF EXISTS` clause for `TTL expr TO [DISK|VOLUME] [IF EXISTS] 'xxx'` feature. Parts will be moved to disk or volume only if it exists on replica, so `MOVE TTL` rules will be able to behave differently on replicas according to the existing storage policies. Resolves [#34455](https://github.com/ClickHouse/ClickHouse/issues/34455). [#34504](https://github.com/ClickHouse/ClickHouse/pull/34504) ([Anton Popov](https://github.com/CurtizJ)). +* Allow set default table engine and to create tables without specifying ENGINE. [#34187](https://github.com/ClickHouse/ClickHouse/pull/34187) ([Ilya Yatsishin](https://github.com/qoega)). +* Add table function `format(format_name, data)`. [#34125](https://github.com/ClickHouse/ClickHouse/pull/34125) ([Kruglov Pavel](https://github.com/Avogar)). +* Detect format in `clickhouse-local` by file name even in the case when it is passed to stdin. [#33829](https://github.com/ClickHouse/ClickHouse/pull/33829) ([Kruglov Pavel](https://github.com/Avogar)). +* Add schema inference for `values` table function. Closes [#33811](https://github.com/ClickHouse/ClickHouse/issues/33811). [#34017](https://github.com/ClickHouse/ClickHouse/pull/34017) ([Kruglov Pavel](https://github.com/Avogar)). +* Dynamic reload of server TLS certificates on config reload. Closes [#15764](https://github.com/ClickHouse/ClickHouse/issues/15764). [#15765](https://github.com/ClickHouse/ClickHouse/pull/15765) ([johnskopis](https://github.com/johnskopis)). [#31257](https://github.com/ClickHouse/ClickHouse/pull/31257) ([Filatenkov Artur](https://github.com/FArthur-cmd)). +* Now ReplicatedMergeTree can recover data when some of its disks are broken. [#13544](https://github.com/ClickHouse/ClickHouse/pull/13544) ([Amos Bird](https://github.com/amosbird)). +* Fault-tolerant connections in clickhouse-client: `clickhouse-client ... --host host1 --host host2 --port port2 --host host3 --port port --host host4`. [#34490](https://github.com/ClickHouse/ClickHouse/pull/34490) ([Kruglov Pavel](https://github.com/Avogar)). [#33824](https://github.com/ClickHouse/ClickHouse/pull/33824) ([Filippov Denis](https://github.com/DF5HSE)). +* Add `DEGREES` and `RADIANS` functions for MySQL compatibility. [#33769](https://github.com/ClickHouse/ClickHouse/pull/33769) ([Bharat Nallan](https://github.com/bharatnc)). +* Add `h3ToCenterChild` function. [#33313](https://github.com/ClickHouse/ClickHouse/pull/33313) ([Bharat Nallan](https://github.com/bharatnc)). Add new h3 miscellaneous functions: `edgeLengthKm`,`exactEdgeLengthKm`,`exactEdgeLengthM`,`exactEdgeLengthRads`,`numHexagons`. [#33621](https://github.com/ClickHouse/ClickHouse/pull/33621) ([Bharat Nallan](https://github.com/bharatnc)). +* Add function `bitSlice` to extract bit subsequences from String/FixedString. [#33360](https://github.com/ClickHouse/ClickHouse/pull/33360) ([RogerYK](https://github.com/RogerYK)). +* Implemented `meanZTest` aggregate function. [#33354](https://github.com/ClickHouse/ClickHouse/pull/33354) ([achimbab](https://github.com/achimbab)). +* Add confidence intervals to T-tests aggregate functions. 
[#33260](https://github.com/ClickHouse/ClickHouse/pull/33260) ([achimbab](https://github.com/achimbab)). +* Add function `addressToLineWithInlines`. Close [#26211](https://github.com/ClickHouse/ClickHouse/issues/26211). [#33467](https://github.com/ClickHouse/ClickHouse/pull/33467) ([SuperDJY](https://github.com/cmsxbc)). +* Added `#!` and `# ` as a recognised start of a single line comment. Closes [#34138](https://github.com/ClickHouse/ClickHouse/issues/34138). [#34230](https://github.com/ClickHouse/ClickHouse/pull/34230) ([Aaron Katz](https://github.com/aaronstephenkatz)). + +#### Experimental Feature + +* Functions for text classification: language and charset detection. See [#23271](https://github.com/ClickHouse/ClickHouse/issues/23271). [#33314](https://github.com/ClickHouse/ClickHouse/pull/33314) ([Nikolay Degterinsky](https://github.com/evillique)). +* Add memory overcommit to `MemoryTracker`. Added `guaranteed` settings for memory limits which represent soft memory limits. In case when hard memory limit is reached, `MemoryTracker` tries to cancel the most overcommited query. New setting `memory_usage_overcommit_max_wait_microseconds` specifies how long queries may wait another query to stop. Closes [#28375](https://github.com/ClickHouse/ClickHouse/issues/28375). [#31182](https://github.com/ClickHouse/ClickHouse/pull/31182) ([Dmitry Novik](https://github.com/novikd)). +* Enable stream to table join in WindowView. [#33729](https://github.com/ClickHouse/ClickHouse/pull/33729) ([vxider](https://github.com/Vxider)). +* Support `SET`, `YEAR`, `TIME` and `GEOMETRY` data types in `MaterializedMySQL` (experimental feature). Fixes [#18091](https://github.com/ClickHouse/ClickHouse/issues/18091), [#21536](https://github.com/ClickHouse/ClickHouse/issues/21536), [#26361](https://github.com/ClickHouse/ClickHouse/issues/26361). [#33429](https://github.com/ClickHouse/ClickHouse/pull/33429) ([zzsmdfj](https://github.com/zzsmdfj)). +* Fix various issues when projection is enabled by default. Each issue is described in separate commit. This is for [#33678](https://github.com/ClickHouse/ClickHouse/issues/33678) . This fixes [#34273](https://github.com/ClickHouse/ClickHouse/issues/34273). [#34305](https://github.com/ClickHouse/ClickHouse/pull/34305) ([Amos Bird](https://github.com/amosbird)). + +#### Performance Improvement + +* Support `optimize_read_in_order` if prefix of sorting key is already sorted. E.g. if we have sorting key `ORDER BY (a, b)` in table and query with `WHERE a = const ORDER BY b` clauses, now it will be applied reading in order of sorting key instead of full sort. [#32748](https://github.com/ClickHouse/ClickHouse/pull/32748) ([Anton Popov](https://github.com/CurtizJ)). +* Improve performance of partitioned insert into table functions `URL`, `S3`, `File`, `HDFS`. Closes [#34348](https://github.com/ClickHouse/ClickHouse/issues/34348). [#34510](https://github.com/ClickHouse/ClickHouse/pull/34510) ([Maksim Kita](https://github.com/kitaisreal)). +* Multiple performance improvements of clickhouse-keeper. [#34484](https://github.com/ClickHouse/ClickHouse/pull/34484) [#34587](https://github.com/ClickHouse/ClickHouse/pull/34587) ([zhanglistar](https://github.com/zhanglistar)). +* `FlatDictionary` improve performance of dictionary data load. [#33871](https://github.com/ClickHouse/ClickHouse/pull/33871) ([Maksim Kita](https://github.com/kitaisreal)). +* Improve performance of `mapPopulateSeries` function. Closes [#33944](https://github.com/ClickHouse/ClickHouse/issues/33944). 
[#34318](https://github.com/ClickHouse/ClickHouse/pull/34318) ([Maksim Kita](https://github.com/kitaisreal)). +* `_file` and `_path` virtual columns (in file-like table engines) are made `LowCardinality` - it will make queries for multiple files faster. Closes [#34300](https://github.com/ClickHouse/ClickHouse/issues/34300). [#34317](https://github.com/ClickHouse/ClickHouse/pull/34317) ([flynn](https://github.com/ucasfl)). +* Speed up loading of data parts. It was not parallelized before: the setting `part_loading_threads` did not have an effect. See [#4699](https://github.com/ClickHouse/ClickHouse/issues/4699). [#34310](https://github.com/ClickHouse/ClickHouse/pull/34310) ([alexey-milovidov](https://github.com/alexey-milovidov)). +* Improve performance of `LineAsString` format. This closes [#34303](https://github.com/ClickHouse/ClickHouse/issues/34303). [#34306](https://github.com/ClickHouse/ClickHouse/pull/34306) ([alexey-milovidov](https://github.com/alexey-milovidov)). +* Optimize `quantilesExact{Low,High}` to use `nth_element` instead of `sort`. [#34287](https://github.com/ClickHouse/ClickHouse/pull/34287) ([Danila Kutenin](https://github.com/danlark1)). +* Slightly improve performance of `Regexp` format. [#34202](https://github.com/ClickHouse/ClickHouse/pull/34202) ([alexey-milovidov](https://github.com/alexey-milovidov)). +* Minor improvement for analysis of scalar subqueries. [#34128](https://github.com/ClickHouse/ClickHouse/pull/34128) ([Federico Rodriguez](https://github.com/fedrod)). +* Make ORDER BY tuple almost as fast as ORDER BY columns. We have special optimizations for multiple column ORDER BY: https://github.com/ClickHouse/ClickHouse/pull/10831 . It's beneficial to also apply them to tuple columns. [#34060](https://github.com/ClickHouse/ClickHouse/pull/34060) ([Amos Bird](https://github.com/amosbird)). +* Rework and reintroduce the scalar subqueries cache to Materialized Views execution. [#33958](https://github.com/ClickHouse/ClickHouse/pull/33958) ([Raúl Marín](https://github.com/Algunenano)). +* Slightly improve performance of `ORDER BY` by adding x86-64 AVX-512 support for `memcmpSmall` functions to accelerate memory comparison. It works only if you compile ClickHouse by yourself. [#33706](https://github.com/ClickHouse/ClickHouse/pull/33706) ([hanqf-git](https://github.com/hanqf-git)). +* Improve `range_hashed` dictionary performance if there are a lot of intervals for a key. Fixes [#23821](https://github.com/ClickHouse/ClickHouse/issues/23821). [#33516](https://github.com/ClickHouse/ClickHouse/pull/33516) ([Maksim Kita](https://github.com/kitaisreal)). +* For inserts and merges into S3, write files in parallel whenever possible. [#33291](https://github.com/ClickHouse/ClickHouse/pull/33291) ([Nikolai Kochetov](https://github.com/KochetovNicolai)). +* Improve `clickhouse-keeper` performance and fix several memory leaks in NuRaft library. [#33329](https://github.com/ClickHouse/ClickHouse/pull/33329) ([alesapin](https://github.com/alesapin)). 
Improve load time when there are multiple attributes. Allow to create a dictionary without attributes. Added option `convert_null_range_bound_to_open` (default `true`) to specify the strategy when intervals `start` and `end` have `Nullable` type. Closes [#29791](https://github.com/ClickHouse/ClickHouse/issues/29791). Allow to specify `Float`, `Decimal`, `DateTime64`, `Int128`, `Int256`, `UInt128`, `UInt256` as range types. `RangeHashedDictionary` added support for range values that extend `Int64` type. Closes [#28322](https://github.com/ClickHouse/ClickHouse/issues/28322). Added option `range_lookup_strategy` to specify the range lookup type (`min` or `max`, default `min`). Closes [#21647](https://github.com/ClickHouse/ClickHouse/issues/21647). Fixed allocated bytes calculations. Fixed type name in `system.dictionaries` in case of `ComplexKeyHashedDictionary`. [#33927](https://github.com/ClickHouse/ClickHouse/pull/33927) ([Maksim Kita](https://github.com/kitaisreal)). +* `flat`, `hashed`, `hashed_array` dictionaries now support creating with empty attributes, with support of reading the keys and using `dictHas`. Fixes [#33820](https://github.com/ClickHouse/ClickHouse/issues/33820). [#33918](https://github.com/ClickHouse/ClickHouse/pull/33918) ([Maksim Kita](https://github.com/kitaisreal)). +* Added support for `DateTime64` data type in dictionaries. [#33914](https://github.com/ClickHouse/ClickHouse/pull/33914) ([Maksim Kita](https://github.com/kitaisreal)). +* Allow to write `s3(url, access_key_id, secret_access_key)` (autodetect of data format and table structure, but with explicit credentials). [#34503](https://github.com/ClickHouse/ClickHouse/pull/34503) ([Kruglov Pavel](https://github.com/Avogar)). +* Added sending of the output format back to the client, like it is done in the HTTP protocol, as suggested in [#34362](https://github.com/ClickHouse/ClickHouse/issues/34362). Closes [#34362](https://github.com/ClickHouse/ClickHouse/issues/34362). [#34499](https://github.com/ClickHouse/ClickHouse/pull/34499) ([Vitaly Baranov](https://github.com/vitlibar)). +* Send ProfileEvents statistics in case of INSERT SELECT query (to display query metrics in `clickhouse-client` for this type of query). [#34498](https://github.com/ClickHouse/ClickHouse/pull/34498) ([Dmitry Novik](https://github.com/novikd)). +* Recognize `.jsonl` extension for JSONEachRow format. [#34496](https://github.com/ClickHouse/ClickHouse/pull/34496) ([Kruglov Pavel](https://github.com/Avogar)). +* Improve schema inference in clickhouse-local. Allow to write just `clickhouse-local -q "select * from table" < data.format`. [#34495](https://github.com/ClickHouse/ClickHouse/pull/34495) ([Kruglov Pavel](https://github.com/Avogar)). +* Privileges CREATE/ALTER/DROP ROW POLICY can now be granted on a table or on `database.*` as well as globally (`*.*`). [#34489](https://github.com/ClickHouse/ClickHouse/pull/34489) ([Vitaly Baranov](https://github.com/vitlibar)). +* Allow to export arbitrarily large files to `s3`. Add two new settings: `s3_upload_part_size_multiply_factor` and `s3_upload_part_size_multiply_parts_count_threshold`. Now, each time `s3_upload_part_size_multiply_parts_count_threshold` parts have been uploaded to S3 from a single query, `s3_min_upload_part_size` is multiplied by `s3_upload_part_size_multiply_factor` (see the sketch below). Fixes [#34244](https://github.com/ClickHouse/ClickHouse/issues/34244). [#34422](https://github.com/ClickHouse/ClickHouse/pull/34422) ([alesapin](https://github.com/alesapin)). 
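A minimal sketch of how the new multipart settings interact; the bucket, credentials, table name and values below are hypothetical, not defaults:

```sql
-- Export a large table to S3. The part size starts at s3_min_upload_part_size and is
-- multiplied by s3_upload_part_size_multiply_factor every
-- s3_upload_part_size_multiply_parts_count_threshold uploaded parts.
INSERT INTO FUNCTION s3('https://my-bucket.s3.amazonaws.com/dump/data.csv.gz',
                        'ACCESS_KEY_ID', 'SECRET_ACCESS_KEY', 'CSV')
SELECT *
FROM big_table
SETTINGS
    s3_min_upload_part_size = 33554432,                        -- start with 32 MiB parts
    s3_upload_part_size_multiply_factor = 2,                   -- then 64 MiB, 128 MiB, ...
    s3_upload_part_size_multiply_parts_count_threshold = 500;  -- grow every 500 parts
```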
+* Allow to skip not found (404) URLs for globs when using URL storage / table function. Also closes [#34359](https://github.com/ClickHouse/ClickHouse/issues/34359). [#34392](https://github.com/ClickHouse/ClickHouse/pull/34392) ([Kseniia Sumarokova](https://github.com/kssenii)). +* Default input and output formats for `clickhouse-local` that can be overridden by `--input-format` and `--output-format`. Close [#30631](https://github.com/ClickHouse/ClickHouse/issues/30631). [#34352](https://github.com/ClickHouse/ClickHouse/pull/34352) ([李扬](https://github.com/taiyang-li)). +* Add `max_query_size` and `max_parser_depth` options for `clickhouse-format`. This closes [#30528](https://github.com/ClickHouse/ClickHouse/issues/30528). [#34349](https://github.com/ClickHouse/ClickHouse/pull/34349) ([李扬](https://github.com/taiyang-li)). +* Better handling of pre-inputs before client start. This is for [#34308](https://github.com/ClickHouse/ClickHouse/issues/34308). [#34336](https://github.com/ClickHouse/ClickHouse/pull/34336) ([Amos Bird](https://github.com/amosbird)). +* `REGEXP_MATCHES` and `REGEXP_REPLACE` function aliases for compatibility with PostgreSQL. Close [#30885](https://github.com/ClickHouse/ClickHouse/issues/30885). [#34334](https://github.com/ClickHouse/ClickHouse/pull/34334) ([李扬](https://github.com/taiyang-li)). +* Some servers expect a User-Agent header in their HTTP requests. A `User-Agent` header entry has been added to HTTP requests of the form `User-Agent: ClickHouse/VERSION_STRING`. [#34330](https://github.com/ClickHouse/ClickHouse/pull/34330) ([Saad Ur Rahman](https://github.com/surahman)). +* Cancel merges before acquiring table lock for `TRUNCATE` query to avoid `DEADLOCK_AVOIDED` error in some cases. Fixes [#34302](https://github.com/ClickHouse/ClickHouse/issues/34302). [#34304](https://github.com/ClickHouse/ClickHouse/pull/34304) ([tavplubix](https://github.com/tavplubix)). +* Change severity of the "Cancelled merging parts" message in logs, because it's not an error. This closes [#34148](https://github.com/ClickHouse/ClickHouse/issues/34148). [#34232](https://github.com/ClickHouse/ClickHouse/pull/34232) ([alexey-milovidov](https://github.com/alexey-milovidov)). +* Add ability to compose PostgreSQL-style cast operator `::` with expressions using `[]` and `.` operators (array and tuple indexing). [#34229](https://github.com/ClickHouse/ClickHouse/pull/34229) ([Nikolay Degterinsky](https://github.com/evillique)). +* Recognize `YYYYMMDD-hhmmss` format in `parseDateTimeBestEffort` function. This closes [#34206](https://github.com/ClickHouse/ClickHouse/issues/34206). [#34208](https://github.com/ClickHouse/ClickHouse/pull/34208) ([alexey-milovidov](https://github.com/alexey-milovidov)). +* Allow carriage return in the middle of the line while parsing by `Regexp` format. This closes [#34200](https://github.com/ClickHouse/ClickHouse/issues/34200). [#34205](https://github.com/ClickHouse/ClickHouse/pull/34205) ([alexey-milovidov](https://github.com/alexey-milovidov)). +* Allow to parse dictionary's `PRIMARY KEY` as `PRIMARY KEY (id, value)`; previously supported only `PRIMARY KEY id, value`. Closes [#34135](https://github.com/ClickHouse/ClickHouse/issues/34135). [#34141](https://github.com/ClickHouse/ClickHouse/pull/34141) ([Maksim Kita](https://github.com/kitaisreal)). +* An optional argument for `splitByChar` to limit the number of resulting elements. Close [#34081](https://github.com/ClickHouse/ClickHouse/issues/34081). 
[#34140](https://github.com/ClickHouse/ClickHouse/pull/34140) ([李扬](https://github.com/taiyang-li)). +* Improve the experience of multi-line editing for clickhouse-client. This is a follow-up of [#31123](https://github.com/ClickHouse/ClickHouse/pull/31123). [#34114](https://github.com/ClickHouse/ClickHouse/pull/34114) ([Amos Bird](https://github.com/amosbird)). +* Add `UUID` support in `MsgPack` input/output format. [#34065](https://github.com/ClickHouse/ClickHouse/pull/34065) ([Kruglov Pavel](https://github.com/Avogar)). +* Tracing context (for OpenTelemetry) is now propagated from GRPC client metadata (this change is relevant for GRPC client-server protocol). [#34064](https://github.com/ClickHouse/ClickHouse/pull/34064) ([andremarianiello](https://github.com/andremarianiello)). +* Support all types of `SYSTEM` queries with `ON CLUSTER` clause. [#34005](https://github.com/ClickHouse/ClickHouse/pull/34005) ([小路](https://github.com/nicelulu)). +* Improve memory accounting for queries that are using less than `max_untracked_memory`. [#34001](https://github.com/ClickHouse/ClickHouse/pull/34001) ([Azat Khuzhin](https://github.com/azat)). +* Fixed UTF-8 string case-insensitive search when lowercase and uppercase characters are represented by a different number of bytes. Example is `ẞ` and `ß`. This closes [#7334](https://github.com/ClickHouse/ClickHouse/issues/7334). [#33992](https://github.com/ClickHouse/ClickHouse/pull/33992) ([Harry Lee](https://github.com/HarryLeeIBM)). +* Detect format and schema from stdin in `clickhouse-local`. [#33960](https://github.com/ClickHouse/ClickHouse/pull/33960) ([Kruglov Pavel](https://github.com/Avogar)). +* Correctly handle the case of misconfiguration when multiple disks are using the same path on the filesystem. [#29072](https://github.com/ClickHouse/ClickHouse/issues/29072). [#33905](https://github.com/ClickHouse/ClickHouse/pull/33905) ([zhongyuankai](https://github.com/zhongyuankai)). +* Try every resolved IP address while getting S3 proxy. S3 proxies are rarely used, mostly in Yandex Cloud. [#33862](https://github.com/ClickHouse/ClickHouse/pull/33862) ([Nikolai Kochetov](https://github.com/KochetovNicolai)). +* Support `EXPLAIN AST CREATE FUNCTION` query: `EXPLAIN AST CREATE FUNCTION mycast AS (n) -> cast(n as String)` will return `EXPLAIN AST CREATE FUNCTION mycast AS n -> CAST(n, 'String')`. [#33819](https://github.com/ClickHouse/ClickHouse/pull/33819) ([李扬](https://github.com/taiyang-li)). +* Added support for cast from `Map(Key, Value)` to `Array(Tuple(Key, Value))`. [#33794](https://github.com/ClickHouse/ClickHouse/pull/33794) ([Maksim Kita](https://github.com/kitaisreal)). +* Add some improvements and fixes for `Bool` data type. Fixes [#33244](https://github.com/ClickHouse/ClickHouse/issues/33244). [#33737](https://github.com/ClickHouse/ClickHouse/pull/33737) ([Kruglov Pavel](https://github.com/Avogar)). +* Parse and store OpenTelemetry trace-id in big-endian order. [#33723](https://github.com/ClickHouse/ClickHouse/pull/33723) ([Frank Chen](https://github.com/FrankChen021)). +* Improvement for `fromUnixTimestamp64` family functions. They now accept any integer value that can be converted to `Int64` (see the example below). This closes [#14648](https://github.com/ClickHouse/ClickHouse/issues/14648). [#33505](https://github.com/ClickHouse/ClickHouse/pull/33505) ([Andrey Zvonov](https://github.com/zvonand)). 
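A small illustrative sketch (the exact rendering of the result depends on the server timezone): smaller integer types are now implicitly widened to `Int64` before being interpreted as a timestamp:

```sql
-- Previously only an explicit Int64 argument was accepted; other integer types now work too.
SELECT
    fromUnixTimestamp64Milli(toInt64(1640995200123)) AS was_already_supported,
    fromUnixTimestamp64Milli(toUInt32(1640995200))   AS now_also_accepted;  -- UInt32 converted to Int64
```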
+* Reimplement `_shard_num` from constants (see [#7624](https://github.com/ClickHouse/ClickHouse/issues/7624)) with `shardNum()` function (see [#27020](https://github.com/ClickHouse/ClickHouse/issues/27020)), to avoid possible issues (like those that had been found in [#16947](https://github.com/ClickHouse/ClickHouse/issues/16947)). [#33392](https://github.com/ClickHouse/ClickHouse/pull/33392) ([Azat Khuzhin](https://github.com/azat)). +* Enable binary arithmetic (plus, minus, multiply, division, least, greatest) between Decimal and Float. [#33355](https://github.com/ClickHouse/ClickHouse/pull/33355) ([flynn](https://github.com/ucasfl)). +* Respect cgroups limits in max_threads autodetection. [#33342](https://github.com/ClickHouse/ClickHouse/pull/33342) ([JaySon](https://github.com/JaySon-Huang)). +* Add new clickhouse-keeper setting `min_session_timeout_ms`. Now clickhouse-keeper will determine client session timeout according to `min_session_timeout_ms` and `session_timeout_ms` settings. [#33288](https://github.com/ClickHouse/ClickHouse/pull/33288) ([JackyWoo](https://github.com/JackyWoo)). +* Added `UUID` data type support for functions `hex` and `bin`. [#32170](https://github.com/ClickHouse/ClickHouse/pull/32170) ([Frank Chen](https://github.com/FrankChen021)). +* Fix reading of subcolumns with dots in their names. In particular fixed reading of `Nested` columns, if their element names contain dots (e.g. ```Nested(`keys.name` String, `keys.id` UInt64, values UInt64)```). [#34228](https://github.com/ClickHouse/ClickHouse/pull/34228) ([Anton Popov](https://github.com/CurtizJ)). +* Fixes `parallel_view_processing = 0` not working when inserting into a table using `VALUES`. Fixes `view_duration_ms` in the `query_views_log` not being set correctly for materialized views. [#34067](https://github.com/ClickHouse/ClickHouse/pull/34067) ([Raúl Marín](https://github.com/Algunenano)). +* Fix parsing of tables' structure from ZooKeeper: now metadata from ZooKeeper is compared with local metadata in canonical form. It helps when canonical function names can change between ClickHouse versions. [#33933](https://github.com/ClickHouse/ClickHouse/pull/33933) ([sunny](https://github.com/sunny19930321)). +* Properly escape some characters for interaction with LDAP. [#33401](https://github.com/ClickHouse/ClickHouse/pull/33401) ([IlyaTsoi](https://github.com/IlyaTsoi)). + +#### Build/Testing/Packaging Improvement + +* Remove unbundled build support. [#33690](https://github.com/ClickHouse/ClickHouse/pull/33690) ([Azat Khuzhin](https://github.com/azat)). +* Ensure that tests don't depend on the result of non-stable sorting of equal elements. Added equal items ranges randomization in debug after sort to prevent issues when we rely on equal items sort order. [#34393](https://github.com/ClickHouse/ClickHouse/pull/34393) ([Maksim Kita](https://github.com/kitaisreal)). +* Add verbosity to a style check. [#34289](https://github.com/ClickHouse/ClickHouse/pull/34289) ([Mikhail f. Shiryaev](https://github.com/Felixoid)). +* Remove `clickhouse-test` debian package because it's obsolete. [#33948](https://github.com/ClickHouse/ClickHouse/pull/33948) ([Ilya Yatsishin](https://github.com/qoega)). +* Multiple improvements for the build system to remove the possibility of occasionally using packages from the OS and to enforce hermetic builds. [#33695](https://github.com/ClickHouse/ClickHouse/pull/33695) ([Amos Bird](https://github.com/amosbird)). 
+ +#### Bug Fix (user-visible misbehaviour in official stable or prestable release) + +* Fixed the assertion in case of using `allow_experimental_parallel_reading_from_replicas` with `max_parallel_replicas` equal to 1. This fixes [#34525](https://github.com/ClickHouse/ClickHouse/issues/34525). [#34613](https://github.com/ClickHouse/ClickHouse/pull/34613) ([Nikita Mikhaylov](https://github.com/nikitamikhaylov)). +* Fix rare bug while reading empty arrays, which could lead to a `Data compressed with different methods` error. It can be reproduced if you have mostly (but not only) empty arrays and reading is performed in the backward direction with ORDER BY ... DESC. This error is extremely unlikely to happen. [#34327](https://github.com/ClickHouse/ClickHouse/pull/34327) ([Anton Popov](https://github.com/CurtizJ)). +* Fix wrong result of `round`/`roundBankers` if integer values of small types are rounded. Closes [#33267](https://github.com/ClickHouse/ClickHouse/issues/33267). [#34562](https://github.com/ClickHouse/ClickHouse/pull/34562) ([李扬](https://github.com/taiyang-li)). +* Sometimes query cancellation did not work immediately when we were reading multiple files from s3 or HDFS. Fixes [#34301](https://github.com/ClickHouse/ClickHouse/issues/34301). Relates to [#34397](https://github.com/ClickHouse/ClickHouse/issues/34397). [#34539](https://github.com/ClickHouse/ClickHouse/pull/34539) ([Dmitry Novik](https://github.com/novikd)). +* Fix exception `Chunk should have AggregatedChunkInfo in MergingAggregatedTransform` (in case of `optimize_aggregation_in_order = 1` and `distributed_aggregation_memory_efficient = 0`). Fixes [#34526](https://github.com/ClickHouse/ClickHouse/issues/34526). [#34532](https://github.com/ClickHouse/ClickHouse/pull/34532) ([Anton Popov](https://github.com/CurtizJ)). +* Fix comparison between integers and floats in index analysis. Previously it could lead to skipping some granules for reading by mistake. Fixes [#34493](https://github.com/ClickHouse/ClickHouse/issues/34493). [#34528](https://github.com/ClickHouse/ClickHouse/pull/34528) ([Anton Popov](https://github.com/CurtizJ)). +* Fix compression support in URL engine. [#34524](https://github.com/ClickHouse/ClickHouse/pull/34524) ([Frank Chen](https://github.com/FrankChen021)). +* Fix possible error 'file_size: Operation not supported' in files' schema autodetection. [#34479](https://github.com/ClickHouse/ClickHouse/pull/34479) ([Kruglov Pavel](https://github.com/Avogar)). +* Fix possible race with table deletion. [#34416](https://github.com/ClickHouse/ClickHouse/pull/34416) ([Kseniia Sumarokova](https://github.com/kssenii)). +* Fix possible error `Cannot convert column Function to mask` in short circuit function evaluation. Closes [#34171](https://github.com/ClickHouse/ClickHouse/issues/34171). [#34415](https://github.com/ClickHouse/ClickHouse/pull/34415) ([Kruglov Pavel](https://github.com/Avogar)). +* Fix potential crash when doing schema inference from url source. Closes [#34147](https://github.com/ClickHouse/ClickHouse/issues/34147). [#34405](https://github.com/ClickHouse/ClickHouse/pull/34405) ([Kruglov Pavel](https://github.com/Avogar)). +* For UDFs, access permissions were checked at the database level instead of the global level, as they should have been (see the sketch below). Closes [#34281](https://github.com/ClickHouse/ClickHouse/issues/34281). [#34404](https://github.com/ClickHouse/ClickHouse/pull/34404) ([Maksim Kita](https://github.com/kitaisreal)). 
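A short sketch of the global-level grant that is now being checked; the user and function names here are made up for illustration:

```sql
-- SQL user-defined functions are global objects, so the privilege is granted on *.*
-- rather than per database.
GRANT CREATE FUNCTION, DROP FUNCTION ON *.* TO alice;

CREATE FUNCTION linear AS (x, k, b) -> k * x + b;  -- checked against the global grant
```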
+* Fix wrong engine syntax in result of `SHOW CREATE DATABASE` query for databases with engine `Memory`. This closes [#34335](https://github.com/ClickHouse/ClickHouse/issues/34335). [#34345](https://github.com/ClickHouse/ClickHouse/pull/34345) ([alexey-milovidov](https://github.com/alexey-milovidov)). +* Fixed a couple of extremely rare race conditions that might lead to a broken state of the replication queue and "intersecting parts" error. [#34297](https://github.com/ClickHouse/ClickHouse/pull/34297) ([tavplubix](https://github.com/tavplubix)). +* Fix progress bar width. It was incorrectly rounded to an integer number of characters. [#34275](https://github.com/ClickHouse/ClickHouse/pull/34275) ([alexey-milovidov](https://github.com/alexey-milovidov)). +* Fix current_user/current_address client information fields for inter-server communication (before this patch current_user/current_address would be preserved from the previous query). [#34263](https://github.com/ClickHouse/ClickHouse/pull/34263) ([Azat Khuzhin](https://github.com/azat)). +* Fix memory leak in case of some Exception during query processing with `optimize_aggregation_in_order=1`. [#34234](https://github.com/ClickHouse/ClickHouse/pull/34234) ([Azat Khuzhin](https://github.com/azat)). +* Fix metric `Query`, which shows the number of executing queries. In the last several releases it was always 0. [#34224](https://github.com/ClickHouse/ClickHouse/pull/34224) ([Anton Popov](https://github.com/CurtizJ)). +* Fix schema inference for table function `s3`. [#34186](https://github.com/ClickHouse/ClickHouse/pull/34186) ([Kruglov Pavel](https://github.com/Avogar)). +* Fix rare and benign race condition in `HDFS`, `S3` and `URL` storage engines which can lead to additional connections. [#34172](https://github.com/ClickHouse/ClickHouse/pull/34172) ([alesapin](https://github.com/alesapin)). +* Fix bug which can rarely lead to error "Cannot read all data" while reading LowCardinality columns of MergeTree table engines family which store data on a remote file system like S3 (virtual filesystem over s3 is an experimental feature that is not ready for production). [#34139](https://github.com/ClickHouse/ClickHouse/pull/34139) ([alesapin](https://github.com/alesapin)). +* Fix inserts to distributed tables in case of a change of native protocol. The last change was in the version 22.1, so there may be some failures of inserts to distributed tables after upgrading to that version. [#34132](https://github.com/ClickHouse/ClickHouse/pull/34132) ([Anton Popov](https://github.com/CurtizJ)). +* Fix possible data race in `File` table engine that was introduced in [#33960](https://github.com/ClickHouse/ClickHouse/pull/33960). Closes [#34111](https://github.com/ClickHouse/ClickHouse/issues/34111). [#34113](https://github.com/ClickHouse/ClickHouse/pull/34113) ([Kruglov Pavel](https://github.com/Avogar)). +* Fixed minor race condition that might cause "intersecting parts" error in extremely rare cases after ZooKeeper connection loss. [#34096](https://github.com/ClickHouse/ClickHouse/pull/34096) ([tavplubix](https://github.com/tavplubix)). +* Fix asynchronous inserts with `Native` format. [#34068](https://github.com/ClickHouse/ClickHouse/pull/34068) ([Anton Popov](https://github.com/CurtizJ)). +* Fix a bug which led to the inability of the server to start when both replicated access storage and keeper (embedded in clickhouse-server) are used. 
Introduced two settings for keeper socket timeout instead of settings from default user: `keeper_server.socket_receive_timeout_sec` and `keeper_server.socket_send_timeout_sec`. Fixes [#33973](https://github.com/ClickHouse/ClickHouse/issues/33973). [#33988](https://github.com/ClickHouse/ClickHouse/pull/33988) ([alesapin](https://github.com/alesapin)). +* Fix segfault while parsing an ORC file with a corrupted footer. Closes [#33797](https://github.com/ClickHouse/ClickHouse/issues/33797). [#33984](https://github.com/ClickHouse/ClickHouse/pull/33984) ([Kruglov Pavel](https://github.com/Avogar)). +* Fix parsing IPv6 from query parameter (prepared statements) and fix IPv6 to string conversion. Closes [#33928](https://github.com/ClickHouse/ClickHouse/issues/33928). [#33971](https://github.com/ClickHouse/ClickHouse/pull/33971) ([Kruglov Pavel](https://github.com/Avogar)). +* Fix crash while reading nested tuples. Fixes [#33838](https://github.com/ClickHouse/ClickHouse/issues/33838). [#33956](https://github.com/ClickHouse/ClickHouse/pull/33956) ([Anton Popov](https://github.com/CurtizJ)). +* Fix usage of functions `array` and `tuple` with literal arguments in distributed queries. Previously it could lead to a `Not found columns` exception. [#33938](https://github.com/ClickHouse/ClickHouse/pull/33938) ([Anton Popov](https://github.com/CurtizJ)). +* Aggregate function combinator `-If` did not correctly process `Nullable` filter argument. This closes [#27073](https://github.com/ClickHouse/ClickHouse/issues/27073). [#33920](https://github.com/ClickHouse/ClickHouse/pull/33920) ([alexey-milovidov](https://github.com/alexey-milovidov)). +* Fix potential race condition when doing remote disk read (virtual filesystem over s3 is an experimental feature that is not ready for production). [#33912](https://github.com/ClickHouse/ClickHouse/pull/33912) ([Amos Bird](https://github.com/amosbird)). +* Fix crash if SQL UDF is created with a lambda with non-identifier arguments. Closes [#33866](https://github.com/ClickHouse/ClickHouse/issues/33866). [#33868](https://github.com/ClickHouse/ClickHouse/pull/33868) ([Maksim Kita](https://github.com/kitaisreal)). +* Fix usage of sparse columns (which can be enabled by experimental setting `ratio_of_defaults_for_sparse_serialization`). [#33849](https://github.com/ClickHouse/ClickHouse/pull/33849) ([Anton Popov](https://github.com/CurtizJ)). +* Fixed `replica is not readonly` logical error on `SYSTEM RESTORE REPLICA` query when replica is actually readonly. Fixes [#33806](https://github.com/ClickHouse/ClickHouse/issues/33806). [#33847](https://github.com/ClickHouse/ClickHouse/pull/33847) ([tavplubix](https://github.com/tavplubix)). +* Fix memory leak in `clickhouse-keeper` in case compression is used (the default). [#33840](https://github.com/ClickHouse/ClickHouse/pull/33840) ([Azat Khuzhin](https://github.com/azat)). +* Fix index analysis with no common types available. [#33833](https://github.com/ClickHouse/ClickHouse/pull/33833) ([Amos Bird](https://github.com/amosbird)). +* Fix schema inference for `JSONEachRow` and `JSONCompactEachRow`. [#33830](https://github.com/ClickHouse/ClickHouse/pull/33830) ([Kruglov Pavel](https://github.com/Avogar)). +* Fix usage of external dictionaries with a `redis` source and a large number of keys. [#33804](https://github.com/ClickHouse/ClickHouse/pull/33804) ([Anton Popov](https://github.com/CurtizJ)). +* Fix bug in client that led to 'Connection reset by peer' in the server. Closes [#33309](https://github.com/ClickHouse/ClickHouse/issues/33309). 
[#33790](https://github.com/ClickHouse/ClickHouse/pull/33790) ([Kruglov Pavel](https://github.com/Avogar)). +* Fix parsing query INSERT INTO ... VALUES SETTINGS ... (...), ... [#33776](https://github.com/ClickHouse/ClickHouse/pull/33776) ([Kruglov Pavel](https://github.com/Avogar)). +* Fix a bug in table checking when creating a data part with wide format and projection. [#33774](https://github.com/ClickHouse/ClickHouse/pull/33774) ([李扬](https://github.com/taiyang-li)). +* Fix tiny race between count() and INSERT/merges/... in MergeTree (it is possible to return an incorrect number of rows for SELECT with optimize_trivial_count_query). [#33753](https://github.com/ClickHouse/ClickHouse/pull/33753) ([Azat Khuzhin](https://github.com/azat)). +* Throw an exception when a directory listing request fails in the HDFS storage. [#33724](https://github.com/ClickHouse/ClickHouse/pull/33724) ([LiuNeng](https://github.com/liuneng1994)). +* Fix mutation when table contains projections. This fixes [#33010](https://github.com/ClickHouse/ClickHouse/issues/33010). This fixes [#33275](https://github.com/ClickHouse/ClickHouse/issues/33275). [#33679](https://github.com/ClickHouse/ClickHouse/pull/33679) ([Amos Bird](https://github.com/amosbird)). +* Correctly determine current database if `CREATE TEMPORARY TABLE AS SELECT` is queried inside a named HTTP session. This is a very rare use case. This closes [#8340](https://github.com/ClickHouse/ClickHouse/issues/8340). [#33676](https://github.com/ClickHouse/ClickHouse/pull/33676) ([alexey-milovidov](https://github.com/alexey-milovidov)). +* Allow some queries with sorting, LIMIT BY, ARRAY JOIN and lambda functions. This closes [#7462](https://github.com/ClickHouse/ClickHouse/issues/7462). [#33675](https://github.com/ClickHouse/ClickHouse/pull/33675) ([alexey-milovidov](https://github.com/alexey-milovidov)). +* Fix bug in "zero copy replication" (a feature that is under development and should not be used in production) which led to data duplication in case of TTL move. Fixes [#33643](https://github.com/ClickHouse/ClickHouse/issues/33643). [#33642](https://github.com/ClickHouse/ClickHouse/pull/33642) ([alesapin](https://github.com/alesapin)). +* Fix `Chunk should have AggregatedChunkInfo in GroupingAggregatedTransform` (in case of `optimize_aggregation_in_order = 1`). [#33637](https://github.com/ClickHouse/ClickHouse/pull/33637) ([Azat Khuzhin](https://github.com/azat)). +* Fix error `Bad cast from type ... to DB::DataTypeArray` which may happen when a table has a `Nested` column with dots in its name, and a default value is generated for it (e.g. during insert, when the column is not listed). Continuation of [#28762](https://github.com/ClickHouse/ClickHouse/issues/28762). [#33588](https://github.com/ClickHouse/ClickHouse/pull/33588) ([Alexey Pavlenko](https://github.com/alexeypavlenko)). +* Export into `lz4` files has been fixed. Closes [#31421](https://github.com/ClickHouse/ClickHouse/issues/31421). [#31862](https://github.com/ClickHouse/ClickHouse/pull/31862) ([Kruglov Pavel](https://github.com/Avogar)). +* Fix potential crash if `group_by_overflow_mode` was set to `any` (approximate GROUP BY) and aggregation was performed by a single column of type `LowCardinality`. [#34506](https://github.com/ClickHouse/ClickHouse/pull/34506) ([DR](https://github.com/freedomDR)). +* Fix inserting to temporary tables via gRPC client-server protocol. Fixes [#34347](https://github.com/ClickHouse/ClickHouse/issues/34347), issue `#2`. 
[#34364](https://github.com/ClickHouse/ClickHouse/pull/34364) ([Vitaly Baranov](https://github.com/vitlibar)). +* Fix issue [#19429](https://github.com/ClickHouse/ClickHouse/issues/19429). [#34225](https://github.com/ClickHouse/ClickHouse/pull/34225) ([Vitaly Baranov](https://github.com/vitlibar)). +* Fix issue [#18206](https://github.com/ClickHouse/ClickHouse/issues/18206). [#33977](https://github.com/ClickHouse/ClickHouse/pull/33977) ([Vitaly Baranov](https://github.com/vitlibar)). +* This PR allows using multiple LDAP storages in the same list of user directories. It worked earlier but was broken because LDAP tests are disabled (they are part of the testflows tests). [#33574](https://github.com/ClickHouse/ClickHouse/pull/33574) ([Vitaly Baranov](https://github.com/vitlibar)). + + +### ClickHouse release v22.1, 2022-01-18 + +#### Upgrade Notes + +* The functions `left` and `right` were previously implemented in parser and now full-featured. Distributed queries with `left` or `right` functions without aliases may throw exception if cluster contains different versions of clickhouse-server. If you are upgrading your cluster and encounter this error, you should finish upgrading your cluster to ensure all nodes have the same version. Also you can add aliases (`AS something`) to the columns in your queries to avoid this issue. [#33407](https://github.com/ClickHouse/ClickHouse/pull/33407) ([alexey-milovidov](https://github.com/alexey-milovidov)). +* Resource usage by scalar subqueries is fully accounted since this version. With this change, rows read in scalar subqueries are now reported in the query_log. If the scalar subquery is cached (repeated or called for several rows) the rows read are only counted once. This change allows KILLing queries and reporting progress while they are executing scalar subqueries. [#32271](https://github.com/ClickHouse/ClickHouse/pull/32271) ([Raúl Marín](https://github.com/Algunenano)). + +#### New Feature + +* Implement data schema inference for input formats. Allow to skip structure (or write just `auto`) in table functions `file`, `url`, `s3`, `hdfs` and in parameters of `clickhouse-local` . Allow to skip structure in create query for table engines `File`, `HDFS`, `S3`, `URL`, `Merge`, `Buffer`, `Distributed` and `ReplicatedMergeTree` (if we add new replicas). [#32455](https://github.com/ClickHouse/ClickHouse/pull/32455) ([Kruglov Pavel](https://github.com/Avogar)). +* Detect format by file extension in `file`/`hdfs`/`s3`/`url` table functions and `HDFS`/`S3`/`URL` table engines and also for `SELECT INTO OUTFILE` and `INSERT FROM INFILE` [#33565](https://github.com/ClickHouse/ClickHouse/pull/33565) ([Kruglov Pavel](https://github.com/Avogar)). Close [#30918](https://github.com/ClickHouse/ClickHouse/issues/30918). [#33443](https://github.com/ClickHouse/ClickHouse/pull/33443) ([OnePiece](https://github.com/zhongyuankai)). +* A tool for collecting diagnostics data if you need support. [#33175](https://github.com/ClickHouse/ClickHouse/pull/33175) ([Alexander Burmak](https://github.com/Alex-Burmak)). +* Automatic cluster discovery via Zoo/Keeper. It allows to add replicas to the cluster without changing configuration on every server. [#31442](https://github.com/ClickHouse/ClickHouse/pull/31442) ([vdimir](https://github.com/vdimir)). +* Implement hive table engine to access apache hive from clickhouse. This implements: [#29245](https://github.com/ClickHouse/ClickHouse/issues/29245). 
[#31104](https://github.com/ClickHouse/ClickHouse/pull/31104) ([taiyang-li](https://github.com/taiyang-li)). +* Add aggregate functions `cramersV`, `cramersVBiasCorrected`, `theilsU` and `contingency`. These functions calculate dependency (measure of association) between categorical values. All these functions use a cross-tab (histogram on pairs) for the implementation. You can imagine it like a correlation coefficient but for any discrete values (not necessarily numbers). [#33366](https://github.com/ClickHouse/ClickHouse/pull/33366) ([alexey-milovidov](https://github.com/alexey-milovidov)). Initial implementation by [Vanyok-All-is-OK](https://github.com/Vanyok-All-is-OK) and [antikvist](https://github.com/antikvist). +* Added table function `hdfsCluster` which allows processing files from HDFS in parallel from many nodes in a specified cluster, similarly to `s3Cluster`. [#32400](https://github.com/ClickHouse/ClickHouse/pull/32400) ([Zhichang Yu](https://github.com/yuzhichang)). +* Add support for disks backed by Azure Blob Storage, in a similar way to what has been done for disks backed by AWS S3. [#31505](https://github.com/ClickHouse/ClickHouse/pull/31505) ([Jakub Kuklis](https://github.com/jkuklis)). +* Allow `COMMENT` in `CREATE VIEW` (for all VIEW kinds). [#31062](https://github.com/ClickHouse/ClickHouse/pull/31062) ([Vasily Nemkov](https://github.com/Enmk)). +* Dynamically reinitialize listening ports and protocols when configuration changes. [#30549](https://github.com/ClickHouse/ClickHouse/pull/30549) ([Kevin Michel](https://github.com/kmichel-aiven)). +* Added `left`, `right`, `leftUTF8`, `rightUTF8` functions. Fix error in implementation of `substringUTF8` function with negative offset (offset from the end of the string). [#33407](https://github.com/ClickHouse/ClickHouse/pull/33407) ([alexey-milovidov](https://github.com/alexey-milovidov)). +* Add new functions for `H3` coordinate system: `h3HexAreaKm2`, `h3CellAreaM2`, `h3CellAreaRads2`. [#33479](https://github.com/ClickHouse/ClickHouse/pull/33479) ([Bharat Nallan](https://github.com/bharatnc)). +* Add `MONTHNAME` function. [#33436](https://github.com/ClickHouse/ClickHouse/pull/33436) ([usurai](https://github.com/usurai)). +* Added function `arrayLast`. Closes [#33390](https://github.com/ClickHouse/ClickHouse/issues/33390). [#33415](https://github.com/ClickHouse/ClickHouse/pull/33415). Added function `arrayLastIndex`. [#33465](https://github.com/ClickHouse/ClickHouse/pull/33465) ([Maksim Kita](https://github.com/kitaisreal)). +* Add function `decodeURLFormComponent`, slightly different from `decodeURLComponent`. Close [#10298](https://github.com/ClickHouse/ClickHouse/issues/10298). [#33451](https://github.com/ClickHouse/ClickHouse/pull/33451) ([SuperDJY](https://github.com/cmsxbc)). +* Allow to split `GraphiteMergeTree` rollup rules for plain/tagged metrics (optional rule_type field). [#33494](https://github.com/ClickHouse/ClickHouse/pull/33494) ([Michail Safronov](https://github.com/msaf1980)). + + +#### Performance Improvement + +* Support moving conditions to `PREWHERE` (setting `optimize_move_to_prewhere`) for tables of `Merge` engine if all of its underlying tables support `PREWHERE`. [#33300](https://github.com/ClickHouse/ClickHouse/pull/33300) ([Anton Popov](https://github.com/CurtizJ)). +* More efficient handling of globs for URL storage. Now you can easily query millions of URLs in parallel with retries. Closes [#32866](https://github.com/ClickHouse/ClickHouse/issues/32866). 
[#32907](https://github.com/ClickHouse/ClickHouse/pull/32907) ([Kseniia Sumarokova](https://github.com/kssenii)). +* Avoid exponential backtracking in the parser. This closes [#20158](https://github.com/ClickHouse/ClickHouse/issues/20158). [#33481](https://github.com/ClickHouse/ClickHouse/pull/33481) ([alexey-milovidov](https://github.com/alexey-milovidov)). +* Abuse of `untuple` function was leading to exponential complexity of query analysis (found by fuzzer). This closes [#33297](https://github.com/ClickHouse/ClickHouse/issues/33297). [#33445](https://github.com/ClickHouse/ClickHouse/pull/33445) ([alexey-milovidov](https://github.com/alexey-milovidov)). +* Reduce allocated memory for dictionaries with string attributes. [#33466](https://github.com/ClickHouse/ClickHouse/pull/33466) ([Maksim Kita](https://github.com/kitaisreal)). +* Slight performance improvement of `reinterpret` function. [#32587](https://github.com/ClickHouse/ClickHouse/pull/32587) ([alexey-milovidov](https://github.com/alexey-milovidov)). +* Non-significant change. In extremely rare cases when a data part is lost on every replica, after merging of some data parts, the subsequent queries may skip fewer partitions during partition pruning. This hardly affects anything. [#32220](https://github.com/ClickHouse/ClickHouse/pull/32220) ([Azat Khuzhin](https://github.com/azat)). +* Improve `clickhouse-keeper` writing performance by optimizing the size calculation logic. [#32366](https://github.com/ClickHouse/ClickHouse/pull/32366) ([zhanglistar](https://github.com/zhanglistar)). +* Optimize single part projection materialization. This closes [#31669](https://github.com/ClickHouse/ClickHouse/issues/31669). [#31885](https://github.com/ClickHouse/ClickHouse/pull/31885) ([Amos Bird](https://github.com/amosbird)). +* Improve query performance of system tables. [#33312](https://github.com/ClickHouse/ClickHouse/pull/33312) ([OnePiece](https://github.com/zhongyuankai)). +* Optimize selecting of MergeTree parts that can be moved between volumes. [#33225](https://github.com/ClickHouse/ClickHouse/pull/33225) ([OnePiece](https://github.com/zhongyuankai)). +* Fix `sparse_hashed` dict performance with sequential keys (wrong hash function). [#32536](https://github.com/ClickHouse/ClickHouse/pull/32536) ([Azat Khuzhin](https://github.com/azat)). + + +#### Experimental Feature + +* Parallel reading from multiple replicas within a shard during a distributed query without using a sample key. To enable this, set `allow_experimental_parallel_reading_from_replicas = 1` and `max_parallel_replicas` to any number. This closes [#26748](https://github.com/ClickHouse/ClickHouse/issues/26748). [#29279](https://github.com/ClickHouse/ClickHouse/pull/29279) ([Nikita Mikhaylov](https://github.com/nikitamikhaylov)). +* Implemented sparse serialization. It can reduce usage of disk space and improve performance of some queries for columns that contain a lot of default (zero) values. It can be enabled by setting `ratio_of_defaults_for_sparse_serialization`. Sparse serialization will be chosen dynamically for a column if its ratio of default values to all values is above that threshold. Serialization (default or sparse) is fixed for every column in a part, but may vary between parts. [#22535](https://github.com/ClickHouse/ClickHouse/pull/22535) ([Anton Popov](https://github.com/CurtizJ)). +* Add "TABLE OVERRIDE" feature for customizing MaterializedMySQL table schemas. 
[#32325](https://github.com/ClickHouse/ClickHouse/pull/32325) ([Stig Bakken](https://github.com/stigsb)). +* Add `EXPLAIN TABLE OVERRIDE` query. [#32836](https://github.com/ClickHouse/ClickHouse/pull/32836) ([Stig Bakken](https://github.com/stigsb)). +* Support TABLE OVERRIDE clause for MaterializedPostgreSQL. RFC: [#31480](https://github.com/ClickHouse/ClickHouse/issues/31480). [#32749](https://github.com/ClickHouse/ClickHouse/pull/32749) ([Kseniia Sumarokova](https://github.com/kssenii)). +* Change ZooKeeper path for zero-copy marks for shared data. Note that "zero-copy replication" is a non-production feature (in early stages of development) that you shouldn't use anyway. But if you have used it, keep this change in mind. [#32061](https://github.com/ClickHouse/ClickHouse/pull/32061) ([ianton-ru](https://github.com/ianton-ru)). +* Events clause support for WINDOW VIEW watch query. [#32607](https://github.com/ClickHouse/ClickHouse/pull/32607) ([vxider](https://github.com/Vxider)). +* Fix ACL with explicit digit hash in `clickhouse-keeper`: now the behavior is consistent with ZooKeeper and the generated digest is always accepted. [#33249](https://github.com/ClickHouse/ClickHouse/pull/33249) ([小路](https://github.com/nicelulu)). [#33246](https://github.com/ClickHouse/ClickHouse/pull/33246). +* Fix unexpected projection removal when detaching parts. [#32067](https://github.com/ClickHouse/ClickHouse/pull/32067) ([Amos Bird](https://github.com/amosbird)). + + +#### Improvement + +* Now date/time conversion functions that generate a time before `1970-01-01 00:00:00` will be saturated to zero instead of overflowing. [#29953](https://github.com/ClickHouse/ClickHouse/pull/29953) ([Amos Bird](https://github.com/amosbird)). It also fixes a bug in index analysis if a date truncation function would yield a result before the Unix epoch. +* Always display resource usage (total CPU usage, total RAM usage and max RAM usage per host) in client. [#33271](https://github.com/ClickHouse/ClickHouse/pull/33271) ([alexey-milovidov](https://github.com/alexey-milovidov)). +* Improve `Bool` type serialization and deserialization, check the range of values. [#32984](https://github.com/ClickHouse/ClickHouse/pull/32984) ([Kruglov Pavel](https://github.com/Avogar)). +* If an invalid setting is defined using the `SET` query or using the query parameters in the HTTP request, the error message will contain suggestions that are similar to the invalid setting string (if any exist). [#32946](https://github.com/ClickHouse/ClickHouse/pull/32946) ([Antonio Andelic](https://github.com/antonio2368)). +* Support hints for mistyped setting names for clickhouse-client and clickhouse-local. Closes [#32237](https://github.com/ClickHouse/ClickHouse/issues/32237). [#32841](https://github.com/ClickHouse/ClickHouse/pull/32841) ([凌涛](https://github.com/lingtaolf)). +* Allow to use virtual columns in Materialized Views. Close [#11210](https://github.com/ClickHouse/ClickHouse/issues/11210). [#33482](https://github.com/ClickHouse/ClickHouse/pull/33482) ([OnePiece](https://github.com/zhongyuankai)). +* Add config to disable IPv6 in clickhouse-keeper if needed. This closes [#33381](https://github.com/ClickHouse/ClickHouse/issues/33381). [#33450](https://github.com/ClickHouse/ClickHouse/pull/33450) ([Wu Xueyang](https://github.com/wuxueyang96)). +* Add more info about the current git revision to `system.build_options` (see the example below). [#33431](https://github.com/ClickHouse/ClickHouse/pull/33431) ([taiyang-li](https://github.com/taiyang-li)). 
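For example, the build information can be inspected like this (a sketch; the exact set of returned rows depends on how the binary was built):

```sql
-- system.build_options is a name/value table; git-related entries now carry more detail.
SELECT name, value
FROM system.build_options
WHERE name ILIKE '%git%' OR name ILIKE '%version%';
```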
+* `clickhouse-local`: track memory under `--max_memory_usage_in_client` option. [#33341](https://github.com/ClickHouse/ClickHouse/pull/33341) ([Azat Khuzhin](https://github.com/azat)). +* Allow negative intervals in function `intervalLengthSum`. Their length will be added as well. This closes [#33323](https://github.com/ClickHouse/ClickHouse/issues/33323). [#33335](https://github.com/ClickHouse/ClickHouse/pull/33335) ([alexey-milovidov](https://github.com/alexey-milovidov)). +* `LineAsString` can be used as output format. This closes [#30919](https://github.com/ClickHouse/ClickHouse/issues/30919). [#33331](https://github.com/ClickHouse/ClickHouse/pull/33331) ([Sergei Trifonov](https://github.com/serxa)). +* Support `<secure/>` in cluster configuration, as an alternative form of `<secure>1</secure>`. Close [#33270](https://github.com/ClickHouse/ClickHouse/issues/33270). [#33330](https://github.com/ClickHouse/ClickHouse/pull/33330) ([SuperDJY](https://github.com/cmsxbc)). +* Pressing Ctrl+C twice will terminate `clickhouse-benchmark` immediately without waiting for in-flight queries. This closes [#32586](https://github.com/ClickHouse/ClickHouse/issues/32586). [#33303](https://github.com/ClickHouse/ClickHouse/pull/33303) ([alexey-milovidov](https://github.com/alexey-milovidov)). +* Support Unix timestamp with milliseconds in `parseDateTimeBestEffort` function. [#33276](https://github.com/ClickHouse/ClickHouse/pull/33276) ([Ben](https://github.com/benbiti)). +* Allow to cancel a query while reading data from an external table in the `Arrow` / `Parquet` / `ORC` formats - it failed to be cancelled in case of big files and the setting `input_format_allow_seeks` set to false. Closes [#29678](https://github.com/ClickHouse/ClickHouse/issues/29678). [#33238](https://github.com/ClickHouse/ClickHouse/pull/33238) ([Kseniia Sumarokova](https://github.com/kssenii)). +* If a table engine supports the `SETTINGS` clause, allow to pass the settings as key-value or via config. Add this support for MySQL. [#33231](https://github.com/ClickHouse/ClickHouse/pull/33231) ([Kseniia Sumarokova](https://github.com/kssenii)). +* Correctly prevent Nullable primary keys if necessary. This is for [#32780](https://github.com/ClickHouse/ClickHouse/issues/32780). [#33218](https://github.com/ClickHouse/ClickHouse/pull/33218) ([Amos Bird](https://github.com/amosbird)). +* Add retry for `PostgreSQL` connections in case nothing has been fetched yet. Closes [#33199](https://github.com/ClickHouse/ClickHouse/issues/33199). [#33209](https://github.com/ClickHouse/ClickHouse/pull/33209) ([Kseniia Sumarokova](https://github.com/kssenii)). +* Validate config keys for external dictionaries. [#33095](https://github.com/ClickHouse/ClickHouse/issues/33095#issuecomment-1000577517). [#33130](https://github.com/ClickHouse/ClickHouse/pull/33130) ([Kseniia Sumarokova](https://github.com/kssenii)). +* Send profile info inside `clickhouse-local`. Closes [#33093](https://github.com/ClickHouse/ClickHouse/issues/33093). [#33097](https://github.com/ClickHouse/ClickHouse/pull/33097) ([Kseniia Sumarokova](https://github.com/kssenii)). +* Short circuit evaluation: support for function `throwIf`. Closes [#32969](https://github.com/ClickHouse/ClickHouse/issues/32969). [#32973](https://github.com/ClickHouse/ClickHouse/pull/32973) ([Maksim Kita](https://github.com/kitaisreal)). +* (This only happens in unofficial builds). Fixed segfault when inserting data into compressed Decimal, String, FixedString and Array columns. This closes [#32939](https://github.com/ClickHouse/ClickHouse/issues/32939). 
[#32940](https://github.com/ClickHouse/ClickHouse/pull/32940) ([N. Kolotov](https://github.com/nkolotov)). +* Added support for specifying a subquery as a SQL user-defined function. Example: `CREATE FUNCTION test AS () -> (SELECT 1)`. Closes [#30755](https://github.com/ClickHouse/ClickHouse/issues/30755). [#32758](https://github.com/ClickHouse/ClickHouse/pull/32758) ([Maksim Kita](https://github.com/kitaisreal)). +* Improve gRPC compression support for [#28671](https://github.com/ClickHouse/ClickHouse/issues/28671). [#32747](https://github.com/ClickHouse/ClickHouse/pull/32747) ([Vitaly Baranov](https://github.com/vitlibar)). +* Flush all In-Memory data parts when WAL is not enabled while shutting down the server or detaching a table. [#32742](https://github.com/ClickHouse/ClickHouse/pull/32742) ([nauta](https://github.com/nautaa)). +* Allow to control connection timeouts for MySQL (previously it was supported only for the dictionary source). Closes [#16669](https://github.com/ClickHouse/ClickHouse/issues/16669). Previously the default `connect_timeout` was rather small; now it is configurable. [#32734](https://github.com/ClickHouse/ClickHouse/pull/32734) ([Kseniia Sumarokova](https://github.com/kssenii)). +* Support `authSource` option for storage `MongoDB`. Closes [#32594](https://github.com/ClickHouse/ClickHouse/issues/32594). [#32702](https://github.com/ClickHouse/ClickHouse/pull/32702) ([Kseniia Sumarokova](https://github.com/kssenii)). +* Support `Date32` type in `generateRandom` table function. [#32643](https://github.com/ClickHouse/ClickHouse/pull/32643) ([nauta](https://github.com/nautaa)). +* Add settings `max_concurrent_select_queries` and `max_concurrent_insert_queries` to control concurrent queries by query kind. Close [#3575](https://github.com/ClickHouse/ClickHouse/issues/3575). [#32609](https://github.com/ClickHouse/ClickHouse/pull/32609) ([SuperDJY](https://github.com/cmsxbc)). +* Improve handling of nested structures with missing columns while reading data in `Protobuf` format. Follow-up to https://github.com/ClickHouse/ClickHouse/pull/31988. [#32531](https://github.com/ClickHouse/ClickHouse/pull/32531) ([Vitaly Baranov](https://github.com/vitlibar)). +* Allow empty credentials for `MongoDB` engine. Closes [#26267](https://github.com/ClickHouse/ClickHouse/issues/26267). [#32460](https://github.com/ClickHouse/ClickHouse/pull/32460) ([Kseniia Sumarokova](https://github.com/kssenii)). +* Disable some optimizations for window functions that may lead to exceptions. Closes [#31535](https://github.com/ClickHouse/ClickHouse/issues/31535). Closes [#31620](https://github.com/ClickHouse/ClickHouse/issues/31620). [#32453](https://github.com/ClickHouse/ClickHouse/pull/32453) ([Kseniia Sumarokova](https://github.com/kssenii)). +* Allow to connect to MongoDB 5.0. Closes [#31483](https://github.com/ClickHouse/ClickHouse/issues/31483). [#32416](https://github.com/ClickHouse/ClickHouse/pull/32416) ([Kseniia Sumarokova](https://github.com/kssenii)). +* Enable comparison between `Decimal` and `Float`. Closes [#22626](https://github.com/ClickHouse/ClickHouse/issues/22626). [#31966](https://github.com/ClickHouse/ClickHouse/pull/31966) ([flynn](https://github.com/ucasFL)). +* Added settings `command_read_timeout`, `command_write_timeout` for `StorageExecutable`, `StorageExecutablePool`, `ExecutableDictionary`, `ExecutablePoolDictionary`, `ExecutableUserDefinedFunctions`. Setting `command_read_timeout` controls the timeout for reading data from command stdout in milliseconds. 
Setting `command_write_timeout` controls the timeout for writing data to command stdin in milliseconds. Added setting `command_termination_timeout` for `ExecutableUserDefinedFunction`, `ExecutableDictionary`, `StorageExecutable`. Added setting `execute_direct` for `ExecutableUserDefinedFunction`, by default true. Added setting `execute_direct` for `ExecutableDictionary`, `ExecutablePoolDictionary`, by default false. [#30957](https://github.com/ClickHouse/ClickHouse/pull/30957) ([Maksim Kita](https://github.com/kitaisreal)). +* Bitmap aggregate functions will give a correct result for an out-of-range argument instead of wrapping around. [#33127](https://github.com/ClickHouse/ClickHouse/pull/33127) ([DR](https://github.com/freedomDR)). +* Fix parsing incorrect queries with `FROM INFILE` statement. [#33521](https://github.com/ClickHouse/ClickHouse/pull/33521) ([Kruglov Pavel](https://github.com/Avogar)). +* Don't allow to write into `S3` if the path contains globs. [#33142](https://github.com/ClickHouse/ClickHouse/pull/33142) ([Kruglov Pavel](https://github.com/Avogar)). +* `--echo` option was not used by `clickhouse-client` in batch mode with a single query. [#32843](https://github.com/ClickHouse/ClickHouse/pull/32843) ([N. Kolotov](https://github.com/nkolotov)). +* Use `--database` option for clickhouse-local. [#32797](https://github.com/ClickHouse/ClickHouse/pull/32797) ([Kseniia Sumarokova](https://github.com/kssenii)). +* Fix surprisingly bad code in SQL ordinary function `file`. Now it supports symlinks. [#32640](https://github.com/ClickHouse/ClickHouse/pull/32640) ([alexey-milovidov](https://github.com/alexey-milovidov)). +* Update `modification_time` for data parts in `system.parts` after part movement. [#32964](https://github.com/ClickHouse/ClickHouse/issues/32964). [#32965](https://github.com/ClickHouse/ClickHouse/pull/32965) ([save-my-heart](https://github.com/save-my-heart)). +* Potential issue, cannot be exploited: integer overflow may happen in array resize. [#33024](https://github.com/ClickHouse/ClickHouse/pull/33024) ([varadarajkumar](https://github.com/varadarajkumar)). + + +#### Build/Testing/Packaging Improvement + +* Add packages, functional tests and Docker builds for AArch64 (ARM) version of ClickHouse. [#32911](https://github.com/ClickHouse/ClickHouse/pull/32911) ([Mikhail f. Shiryaev](https://github.com/Felixoid)). [#32415](https://github.com/ClickHouse/ClickHouse/pull/32415) +* Prepare ClickHouse to be built with musl-libc. It is not enabled by default. [#33134](https://github.com/ClickHouse/ClickHouse/pull/33134) ([alexey-milovidov](https://github.com/alexey-milovidov)). +* Make the installation script work on FreeBSD. This closes [#33384](https://github.com/ClickHouse/ClickHouse/issues/33384). [#33418](https://github.com/ClickHouse/ClickHouse/pull/33418) ([alexey-milovidov](https://github.com/alexey-milovidov)). +* Add `actionlint` for GitHub Actions workflows and verify workflow files via `act --list` to check the correct workflow syntax. [#33612](https://github.com/ClickHouse/ClickHouse/pull/33612) ([Mikhail f. Shiryaev](https://github.com/Felixoid)). +* Add more tests for the nullable primary key feature. Add more tests with different types and merge tree kinds, plus randomly generated data. [#33228](https://github.com/ClickHouse/ClickHouse/pull/33228) ([Amos Bird](https://github.com/amosbird)). +* Add a simple tool to visualize flaky tests in a web browser. [#33185](https://github.com/ClickHouse/ClickHouse/pull/33185) ([alexey-milovidov](https://github.com/alexey-milovidov)). 
+* Enable hermetic build for shared builds. This is mainly for developers. [#32968](https://github.com/ClickHouse/ClickHouse/pull/32968) ([Amos Bird](https://github.com/amosbird)). +* Update `libc++` and `libc++abi` to the latest. [#32484](https://github.com/ClickHouse/ClickHouse/pull/32484) ([Raúl Marín](https://github.com/Algunenano)). +* Added integration test for external .NET client ([ClickHouse.Client](https://github.com/DarkWanderer/ClickHouse.Client)). [#23230](https://github.com/ClickHouse/ClickHouse/pull/23230) ([Oleg V. Kozlyuk](https://github.com/DarkWanderer)). +* Inject git information into the clickhouse binary file, so the source code revision can be easily obtained from the binary. [#33124](https://github.com/ClickHouse/ClickHouse/pull/33124) ([taiyang-li](https://github.com/taiyang-li)). +* Remove obsolete code from ConfigProcessor. Yandex-specific code is not used anymore. The code contained one minor defect. This defect was reported by [Mallik Hassan](https://github.com/SadiHassan) in [#33032](https://github.com/ClickHouse/ClickHouse/issues/33032). This closes [#33032](https://github.com/ClickHouse/ClickHouse/issues/33032). [#33026](https://github.com/ClickHouse/ClickHouse/pull/33026) ([alexey-milovidov](https://github.com/alexey-milovidov)). + +#### Bug Fix (user-visible misbehavior in official stable or prestable release) + +* Several fixes for format parsing. This is relevant if `clickhouse-server` is open for write access to an adversary. Specifically crafted input data for `Native` format may lead to reading uninitialized memory or crash. [#33050](https://github.com/ClickHouse/ClickHouse/pull/33050) ([Heena Bansal](https://github.com/HeenaBansal2009)). Fixed Apache Avro Union type index out of boundary issue in Apache Avro binary format. [#33022](https://github.com/ClickHouse/ClickHouse/pull/33022) ([Harry Lee](https://github.com/HarryLeeIBM)). Fix null pointer dereference in `LowCardinality` data when deserializing `LowCardinality` data in the Native format. [#33021](https://github.com/ClickHouse/ClickHouse/pull/33021) ([Harry Lee](https://github.com/HarryLeeIBM)). +* ClickHouse Keeper handler will correctly remove the operation when the response is sent. [#32988](https://github.com/ClickHouse/ClickHouse/pull/32988) ([JackyWoo](https://github.com/JackyWoo)). +* Potential off-by-one miscalculation of quotas: the quota limit was not reached, but the limit was exceeded. This fixes [#31174](https://github.com/ClickHouse/ClickHouse/issues/31174). [#31656](https://github.com/ClickHouse/ClickHouse/pull/31656) ([sunny](https://github.com/sunny19930321)). +* Fixed CASTing from String to IPv4 or IPv6 and back. Fixed the error message in case of a failed conversion. [#29224](https://github.com/ClickHouse/ClickHouse/pull/29224) ([Dmitry Novik](https://github.com/novikd)) [#27914](https://github.com/ClickHouse/ClickHouse/pull/27914) ([Vasily Nemkov](https://github.com/Enmk)). +* Fixed an exception like `Unknown aggregate function nothing` during an execution on a remote server. This fixes [#16689](https://github.com/ClickHouse/ClickHouse/issues/16689). [#26074](https://github.com/ClickHouse/ClickHouse/pull/26074) ([hexiaoting](https://github.com/hexiaoting)). +* Fix wrong database for JOIN without explicit database in distributed queries (Fixes: [#10471](https://github.com/ClickHouse/ClickHouse/issues/10471); see the sketch below). [#33611](https://github.com/ClickHouse/ClickHouse/pull/33611) ([Azat Khuzhin](https://github.com/azat)). 
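A rough sketch of the affected pattern (all table, column and database names are hypothetical): before the fix, the joined table written without a database could be resolved against the remote server's default database instead of the query's current database.

```sql
USE analytics;

-- `dim_users` has no explicit database; on the remote shards it is now resolved as
-- `analytics.dim_users` rather than the remote server's default database.
SELECT count()
FROM distributed_events AS e
INNER JOIN dim_users AS u ON e.user_id = u.id;
```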
+* Fix segfault in Apache `Avro` format that appears after the second insert into a file. [#33566](https://github.com/ClickHouse/ClickHouse/pull/33566) ([Kruglov Pavel](https://github.com/Avogar)). +* Fix segfault in Apache `Arrow` format if the schema contains the `Dictionary` type. Closes [#33507](https://github.com/ClickHouse/ClickHouse/issues/33507). [#33529](https://github.com/ClickHouse/ClickHouse/pull/33529) ([Kruglov Pavel](https://github.com/Avogar)). +* Out of band `offset` and `limit` settings may be applied incorrectly for views. Close [#33289](https://github.com/ClickHouse/ClickHouse/issues/33289) [#33518](https://github.com/ClickHouse/ClickHouse/pull/33518) ([hexiaoting](https://github.com/hexiaoting)). +* Fix an exception `Block structure mismatch` which may happen during insertion into a table with a default nested `LowCardinality` column. Fixes [#33028](https://github.com/ClickHouse/ClickHouse/issues/33028). [#33504](https://github.com/ClickHouse/ClickHouse/pull/33504) ([Nikolai Kochetov](https://github.com/KochetovNicolai)). +* Fix dictionary expressions for `range_hashed` range min and range max attributes when created using DDL. Closes [#30809](https://github.com/ClickHouse/ClickHouse/issues/30809). [#33478](https://github.com/ClickHouse/ClickHouse/pull/33478) ([Maksim Kita](https://github.com/kitaisreal)). +* Fix possible use-after-free for INSERT into Materialized View with concurrent DROP ([Azat Khuzhin](https://github.com/azat)). +* Do not try to read past EOF (to work around a bug in the Linux kernel). This bug can be reproduced on kernels 3.14..5.9 and requires `index_granularity_bytes=0` (i.e. adaptive index granularity turned off). [#33372](https://github.com/ClickHouse/ClickHouse/pull/33372) ([Azat Khuzhin](https://github.com/azat)). +* The commands `SYSTEM SUSPEND` and `SYSTEM ... THREAD FUZZER` missed access control. It is fixed. Author: Kevin Michel. [#33333](https://github.com/ClickHouse/ClickHouse/pull/33333) ([alexey-milovidov](https://github.com/alexey-milovidov)). +* Fix `COMMENT` for dictionaries not appearing in `system.tables` and `system.dictionaries`. Allow modifying the comment for the `Dictionary` engine. Closes [#33251](https://github.com/ClickHouse/ClickHouse/issues/33251). [#33261](https://github.com/ClickHouse/ClickHouse/pull/33261) ([Maksim Kita](https://github.com/kitaisreal)). +* Add asynchronous inserts (with the setting `async_insert` enabled) to the query log. Previously such queries didn't appear in the query log. [#33239](https://github.com/ClickHouse/ClickHouse/pull/33239) ([Anton Popov](https://github.com/CurtizJ)). +* Fix sending `WHERE 1 = 0` expressions for external database queries. Closes [#33152](https://github.com/ClickHouse/ClickHouse/issues/33152). [#33214](https://github.com/ClickHouse/ClickHouse/pull/33214) ([Kseniia Sumarokova](https://github.com/kssenii)). +* Fix DDL validation for MaterializedPostgreSQL. Fix setting `materialized_postgresql_allow_automatic_update`. Closes [#29535](https://github.com/ClickHouse/ClickHouse/issues/29535). [#33200](https://github.com/ClickHouse/ClickHouse/pull/33200) ([Kseniia Sumarokova](https://github.com/kssenii)). Make sure unused replication slots are always removed. Found in [#26952](https://github.com/ClickHouse/ClickHouse/issues/26952). [#33187](https://github.com/ClickHouse/ClickHouse/pull/33187) ([Kseniia Sumarokova](https://github.com/kssenii)). Fix MaterializedPostgreSQL detach/attach (removing / adding to replication) of tables with a non-default schema.
Found in [#29535](https://github.com/ClickHouse/ClickHouse/issues/29535). [#33179](https://github.com/ClickHouse/ClickHouse/pull/33179) ([Kseniia Sumarokova](https://github.com/kssenii)). Fix DROP MaterializedPostgreSQL database. [#33468](https://github.com/ClickHouse/ClickHouse/pull/33468) ([Kseniia Sumarokova](https://github.com/kssenii)). +* The metric `StorageBufferBytes` was sometimes miscalculated. [#33159](https://github.com/ClickHouse/ClickHouse/pull/33159) ([xuyatian](https://github.com/xuyatian)). +* Fix error `Invalid version for SerializationLowCardinality key column` in case of reading from a `LowCardinality` column with `local_filesystem_read_prefetch` or `remote_filesystem_read_prefetch` enabled. [#33046](https://github.com/ClickHouse/ClickHouse/pull/33046) ([Nikolai Kochetov](https://github.com/KochetovNicolai)). +* Fix `s3` table function reading an empty file. Closes [#33008](https://github.com/ClickHouse/ClickHouse/issues/33008). [#33037](https://github.com/ClickHouse/ClickHouse/pull/33037) ([Kseniia Sumarokova](https://github.com/kssenii)). +* Fix Context leak in case of cancel_http_readonly_queries_on_client_close (i.e. leaking of external tables that had been uploaded to the server and other resources). [#32982](https://github.com/ClickHouse/ClickHouse/pull/32982) ([Azat Khuzhin](https://github.com/azat)). +* Fix wrong tuple output in `CSV` format in case of a custom CSV delimiter. [#32981](https://github.com/ClickHouse/ClickHouse/pull/32981) ([Kruglov Pavel](https://github.com/Avogar)). +* Fix HDFS URL check that didn't allow using an HA namenode address. The bug was introduced in https://github.com/ClickHouse/ClickHouse/pull/31042. [#32976](https://github.com/ClickHouse/ClickHouse/pull/32976) ([Kruglov Pavel](https://github.com/Avogar)). +* Fix throwing an exception like `positional argument out of bounds` for non-positional arguments. Closes [#31173](https://github.com/ClickHouse/ClickHouse/issues/31173)#event-5789668239. [#32961](https://github.com/ClickHouse/ClickHouse/pull/32961) ([Kseniia Sumarokova](https://github.com/kssenii)). +* Fix UB in case of an unexpected EOF during filling a set from an HTTP query (i.e. if the client was interrupted in the middle, e.g. `timeout 0.15s curl -Ss -F 's=@t.csv;' 'http://127.0.0.1:8123/?s_structure=key+Int&query=SELECT+dummy+IN+s'` with a large enough `t.csv`). [#32955](https://github.com/ClickHouse/ClickHouse/pull/32955) ([Azat Khuzhin](https://github.com/azat)). +* Fix a regression in the `replaceRegexpAll` function. The function worked incorrectly when the matched substring was empty. This closes [#32777](https://github.com/ClickHouse/ClickHouse/issues/32777). This closes [#30245](https://github.com/ClickHouse/ClickHouse/issues/30245). [#32945](https://github.com/ClickHouse/ClickHouse/pull/32945) ([alexey-milovidov](https://github.com/alexey-milovidov)). +* Fix `ORC` format stripe reading. [#32929](https://github.com/ClickHouse/ClickHouse/pull/32929) ([kreuzerkrieg](https://github.com/kreuzerkrieg)). +* `topKWeightedState` failed for some input types. [#32487](https://github.com/ClickHouse/ClickHouse/issues/32487). [#32914](https://github.com/ClickHouse/ClickHouse/pull/32914) ([vdimir](https://github.com/vdimir)). +* Fix exception `Single chunk is expected from view inner query (LOGICAL_ERROR)` in materialized view. Fixes [#31419](https://github.com/ClickHouse/ClickHouse/issues/31419). [#32862](https://github.com/ClickHouse/ClickHouse/pull/32862) ([Nikolai Kochetov](https://github.com/KochetovNicolai)).
+* Fix optimization with lazy seek for async reads from remote filesystems. Closes [#32803](https://github.com/ClickHouse/ClickHouse/issues/32803). [#32835](https://github.com/ClickHouse/ClickHouse/pull/32835) ([Kseniia Sumarokova](https://github.com/kssenii)). +* `MergeTree` table engine might silently skip some mutations if there are too many running mutations or in case of high memory consumption; this is fixed. Fixes [#17882](https://github.com/ClickHouse/ClickHouse/issues/17882). [#32814](https://github.com/ClickHouse/ClickHouse/pull/32814) ([tavplubix](https://github.com/tavplubix)). +* Avoid reusing the scalar subquery cache when processing MV blocks. This fixes a bug where the scalar query references the source table, but it means that all subscalar queries in the MV definition will be calculated for each block. [#32811](https://github.com/ClickHouse/ClickHouse/pull/32811) ([Raúl Marín](https://github.com/Algunenano)). +* Server might fail to start if a database with the `MySQL` engine cannot connect to the MySQL server; this is fixed. Fixes [#14441](https://github.com/ClickHouse/ClickHouse/issues/14441). [#32802](https://github.com/ClickHouse/ClickHouse/pull/32802) ([tavplubix](https://github.com/tavplubix)). +* Fix crash when using the `fuzzBits` function, close [#32737](https://github.com/ClickHouse/ClickHouse/issues/32737). [#32755](https://github.com/ClickHouse/ClickHouse/pull/32755) ([SuperDJY](https://github.com/cmsxbc)). +* Fix error `Column is not under aggregate function` in case of MV with `GROUP BY (list of columns)` (which is parsed as `GROUP BY tuple(...)`) over `Kafka`/`RabbitMQ`. Fixes [#32668](https://github.com/ClickHouse/ClickHouse/issues/32668) and [#32744](https://github.com/ClickHouse/ClickHouse/issues/32744). [#32751](https://github.com/ClickHouse/ClickHouse/pull/32751) ([Nikolai Kochetov](https://github.com/KochetovNicolai)). +* Fix `ALTER TABLE ... MATERIALIZE TTL` query with `TTL ... DELETE WHERE ...` and `TTL ... GROUP BY ...` modes. [#32695](https://github.com/ClickHouse/ClickHouse/pull/32695) ([Anton Popov](https://github.com/CurtizJ)). +* Fix `optimize_read_in_order` optimization in case when the table engine is `Distributed` or `Merge` and its underlying `MergeTree` tables have a monotonic function in the prefix of the sorting key. [#32670](https://github.com/ClickHouse/ClickHouse/pull/32670) ([Anton Popov](https://github.com/CurtizJ)). +* Fix LOGICAL_ERROR exception when the target of a materialized view is a JOIN or a SET table. [#32669](https://github.com/ClickHouse/ClickHouse/pull/32669) ([Raúl Marín](https://github.com/Algunenano)). +* Inserting into S3 with multipart upload to Google Cloud Storage may trigger an abort. [#32504](https://github.com/ClickHouse/ClickHouse/issues/32504). [#32649](https://github.com/ClickHouse/ClickHouse/pull/32649) ([vdimir](https://github.com/vdimir)). +* Fix possible exception at `RabbitMQ` storage startup by delaying channel creation. [#32584](https://github.com/ClickHouse/ClickHouse/pull/32584) ([Kseniia Sumarokova](https://github.com/kssenii)). +* Fix table lifetime (i.e. possible use-after-free) in case of parallel DROP TABLE and INSERT. [#32572](https://github.com/ClickHouse/ClickHouse/pull/32572) ([Azat Khuzhin](https://github.com/azat)). +* Fix async inserts with formats `CustomSeparated`, `Template`, `Regexp`, `MsgPack` and `JSONAsString`. Previously the async inserts with these formats didn't read any data. [#32530](https://github.com/ClickHouse/ClickHouse/pull/32530) ([Kruglov Pavel](https://github.com/Avogar)).
+* Fix `groupBitmapAnd` function on distributed table. [#32529](https://github.com/ClickHouse/ClickHouse/pull/32529) ([minhthucdao](https://github.com/dmthuc)). +* Fix crash in JOIN found by fuzzer, close [#32458](https://github.com/ClickHouse/ClickHouse/issues/32458). [#32508](https://github.com/ClickHouse/ClickHouse/pull/32508) ([vdimir](https://github.com/vdimir)). +* Proper handling of the case with Apache Arrow column duplication. [#32507](https://github.com/ClickHouse/ClickHouse/pull/32507) ([Dmitriy Mokhnatkin](https://github.com/DMokhnatkin)). +* Fix issue with ambiguous query formatting in distributed queries that led to errors when some table columns were named `ALL` or `DISTINCT`. This closes [#32391](https://github.com/ClickHouse/ClickHouse/issues/32391). [#32490](https://github.com/ClickHouse/ClickHouse/pull/32490) ([alexey-milovidov](https://github.com/alexey-milovidov)). +* Fix failures in queries that are trying to use skipping indices which are not materialized yet. Fixes [#32292](https://github.com/ClickHouse/ClickHouse/issues/32292) and [#30343](https://github.com/ClickHouse/ClickHouse/issues/30343). [#32359](https://github.com/ClickHouse/ClickHouse/pull/32359) ([Anton Popov](https://github.com/CurtizJ)). +* Fix broken SELECT queries when there are more than 2 row policies on the same column, starting from the second query in the same session. [#31606](https://github.com/ClickHouse/ClickHouse/issues/31606). [#32291](https://github.com/ClickHouse/ClickHouse/pull/32291) ([SuperDJY](https://github.com/cmsxbc)). +* Fix fractional unix timestamp conversion to `DateTime64`: the fractional part was reversed for negative unix timestamps (before 1970-01-01); an illustrative query follows this list. [#32240](https://github.com/ClickHouse/ClickHouse/pull/32240) ([Ben](https://github.com/benbiti)). +* Some entries of the replication queue might hang for `temporary_directories_lifetime` (1 day by default) with a `Directory tmp_merge_` or `Part ... (state Deleting) already exists, but it will be deleted soon` or similar error. It's fixed. Fixes [#29616](https://github.com/ClickHouse/ClickHouse/issues/29616). [#32201](https://github.com/ClickHouse/ClickHouse/pull/32201) ([tavplubix](https://github.com/tavplubix)). +* Fix parsing of the `APPLY lambda` column transformer which could lead to a client/server crash. [#32138](https://github.com/ClickHouse/ClickHouse/pull/32138) ([Kruglov Pavel](https://github.com/Avogar)). +* Fix `base64Encode` adding trailing bytes on small strings. [#31797](https://github.com/ClickHouse/ClickHouse/pull/31797) ([Kevin Michel](https://github.com/kmichel-aiven)). +* Fix possible crash (or incorrect result) in case of `LowCardinality` arguments of a window function. Fixes [#31114](https://github.com/ClickHouse/ClickHouse/issues/31114). [#31888](https://github.com/ClickHouse/ClickHouse/pull/31888) ([Nikolai Kochetov](https://github.com/KochetovNicolai)). +* Fix hang up with the command `DROP TABLE system.query_log sync`. [#33293](https://github.com/ClickHouse/ClickHouse/pull/33293) ([zhanghuajie](https://github.com/zhanghuajieHIT)).
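To make the `DateTime64` timestamp entry above concrete, here is a minimal illustrative query. It is only a sketch, not part of the release notes: it assumes a server build that includes the fix and that `toDateTime64` accepts negative fractional values (which is the conversion path the fix touches); the expected values are described in comments rather than asserted.

```sql
-- Negative (pre-1970) fractional unix timestamps. Before the fix the fractional
-- part could come out reversed; with the fix both values should sit just below
-- the epoch and be 0.75 seconds apart.
SELECT
    toDateTime64(-1.25, 2, 'UTC') AS t1, -- expected: 1969-12-31 23:59:58.75
    toDateTime64(-0.50, 2, 'UTC') AS t2; -- expected: 1969-12-31 23:59:59.50
```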
+ + +## [Changelog for 2021](https://github.com/ClickHouse/ClickHouse/blob/master/docs/en/whats-new/changelog/2021.md) \ No newline at end of file diff --git a/docs/en/whats-new/index.md b/docs/en/whats-new/index.md index 8033fdf71d9..ac2b41a6637 100644 --- a/docs/en/whats-new/index.md +++ b/docs/en/whats-new/index.md @@ -1,8 +1,10 @@ --- -toc_folder_title: What's New -toc_priority: 82 +sidebar_label: What's New +sidebar_position: 500 +keywords: [clickhouse, what's, new, roadmap, changelog] +description: What's New in ClickHouse --- -# What’s New in ClickHouse? {#whats-new-in-clickhouse} +# What’s New in ClickHouse {#whats-new-in-clickhouse} There’s a short high-level [roadmap](../whats-new/roadmap.md) and a detailed [changelog](../whats-new/changelog/index.md) for releases that have already been published. diff --git a/docs/en/whats-new/roadmap.md b/docs/en/whats-new/roadmap.md index 54f8f9d68a3..be7298ccd79 100644 --- a/docs/en/whats-new/roadmap.md +++ b/docs/en/whats-new/roadmap.md @@ -7,4 +7,3 @@ toc_title: Roadmap The roadmap for the year 2022 is published for open discussion [here](https://github.com/ClickHouse/ClickHouse/issues/32513). -{## [Original article](https://clickhouse.com/docs/en/roadmap/) ##} diff --git a/docs/en/whats-new/security-changelog.md b/docs/en/whats-new/security-changelog.md index bcfeaa06e24..0a5c926f227 100644 --- a/docs/en/whats-new/security-changelog.md +++ b/docs/en/whats-new/security-changelog.md @@ -1,8 +1,12 @@ --- -toc_priority: 76 -toc_title: Security Changelog +sidebar_label: Security Changelog +sidebar_position: 100 +keywords: [clickhouse, security, changelog] +description: Security Changelog --- +# Security Changelog + ## Fixed in ClickHouse 21.4.3.21, 2021-04-12 {#fixed-in-clickhouse-release-21-4-3-21-2021-04-12} ### CVE-2021-25263 {#cve-2021-25263} @@ -80,5 +84,3 @@ Credits: Andrey Krasichkov and Evgeny Sidorov of Yandex Information Security Tea Incorrect configuration in deb package could lead to the unauthorized use of the database. 
Credits: the UK’s National Cyber Security Centre (NCSC) - -{## [Original article](https://clickhouse.com/docs/en/security_changelog/) ##} From 95565b45d640b5e1b1afb53a1b7692ed7d880fa7 Mon Sep 17 00:00:00 2001 From: kssenii Date: Tue, 29 Mar 2022 13:33:17 +0200 Subject: [PATCH 051/239] Fix --- programs/local/LocalServer.cpp | 12 ++++++------ 1 file changed, 6 insertions(+), 6 deletions(-) diff --git a/programs/local/LocalServer.cpp b/programs/local/LocalServer.cpp index 26d42a11315..0e02453dcd9 100644 --- a/programs/local/LocalServer.cpp +++ b/programs/local/LocalServer.cpp @@ -455,16 +455,13 @@ void LocalServer::processConfig() auto logging = (config().has("logger.console") || config().has("logger.level") || config().has("log-level") + || config().has("send_logs_level") || config().has("logger.log")); - auto file_logging = config().has("server_logs_file"); - if (is_interactive && logging && !file_logging) - throw Exception("For interactive mode logging is allowed only with --server_logs_file option", - ErrorCodes::BAD_ARGUMENTS); + auto level = Poco::Logger::parseLevel(config().getString("log-level", config().getString("send_logs_level", "trace"))); - if (file_logging) + if (config().has("server_logs_file")) { - auto level = Poco::Logger::parseLevel(config().getString("log-level", "trace")); Poco::Logger::root().setLevel(level); Poco::Logger::root().setChannel(Poco::AutoPtr(new Poco::SimpleFileChannel(server_logs_file))); } @@ -472,6 +469,7 @@ void LocalServer::processConfig() { // force enable logging config().setString("logger", "logger"); + Poco::Logger::root().setLevel(level); // sensitive data rules are not used here buildLoggers(config(), logger(), "clickhouse-local"); } @@ -712,6 +710,8 @@ void LocalServer::processOptions(const OptionsDescription &, const CommandLineOp config().setString("logger.log", options["logger.log"].as()); if (options.count("logger.level")) config().setString("logger.level", options["logger.level"].as()); + if (options.count("send_logs_level")) + config().setString("send_logs_level", options["send_logs_level"].as()); } } From 8edf6e74487a59b6e21cb679509dd8fd95ca1d59 Mon Sep 17 00:00:00 2001 From: Kruglov Pavel <48961922+Avogar@users.noreply.github.com> Date: Tue, 29 Mar 2022 14:15:19 +0200 Subject: [PATCH 052/239] Mark test 02242_optimize_to_subcolumns_no_storage as backward incompatible for version 22.3.2.1 --- .../0_stateless/02242_optimize_to_subcolumns_no_storage.sql | 1 + 1 file changed, 1 insertion(+) diff --git a/tests/queries/0_stateless/02242_optimize_to_subcolumns_no_storage.sql b/tests/queries/0_stateless/02242_optimize_to_subcolumns_no_storage.sql index e6e4663c5aa..8f8485eb58f 100644 --- a/tests/queries/0_stateless/02242_optimize_to_subcolumns_no_storage.sql +++ b/tests/queries/0_stateless/02242_optimize_to_subcolumns_no_storage.sql @@ -1,3 +1,4 @@ +-- Tags: no-backward-compatibility-check:22.3.2.1 SET optimize_functions_to_subcolumns = 1; SELECT count(*) FROM numbers(2) AS n1, numbers(3) AS n2, numbers(4) AS n3 WHERE (n1.number = n2.number) AND (n2.number = n3.number); From 33e28bcb18874dbd3514468ea81c577197a99f22 Mon Sep 17 00:00:00 2001 From: rfraposa Date: Tue, 29 Mar 2022 08:26:35 -0600 Subject: [PATCH 053/239] Updated /development folder --- docs/en/development/_category_.yml | 4 +- docs/en/development/adding_test_queries.md | 6 + docs/en/development/architecture.md | 9 +- docs/en/development/browse-code.md | 11 +- docs/en/development/build-cross-arm.md | 9 +- docs/en/development/build-cross-osx.md | 9 +- 
docs/en/development/build-cross-riscv.md | 6 +- docs/en/development/build-osx.md | 19 +- docs/en/development/build.md | 7 +- docs/en/development/continuous-integration.md | 7 +- docs/en/development/contrib.md | 7 +- docs/en/development/developer-instruction.md | 28 +-- docs/en/development/style.md | 7 +- docs/en/development/tests.md | 9 +- .../database-engines/materialized-mysql.md | 2 +- docs/en/engines/database-engines/mysql.md | 2 - .../table-engines/integrations/hive.md | 2 +- docs/en/example-datasets/amplab-benchmark.md | 4 +- docs/en/example-datasets/brown-benchmark.md | 4 +- docs/en/example-datasets/cell-towers.md | 5 +- docs/en/example-datasets/criteo.md | 5 +- docs/en/example-datasets/github-events.md | 3 +- docs/en/example-datasets/menus.md | 5 +- docs/en/example-datasets/metrica.md | 6 +- docs/en/example-datasets/nyc-taxi.md | 6 +- docs/en/example-datasets/ontime.md | 6 +- docs/en/example-datasets/opensky.md | 8 +- docs/en/example-datasets/recipes.md | 3 +- docs/en/example-datasets/star-schema.md | 8 +- docs/en/example-datasets/uk-price-paid.md | 5 +- docs/en/example-datasets/wikistat.md | 7 +- docs/en/install.md | 27 ++- docs/en/interfaces/formats.md | 49 +--- docs/en/operations/caches.md | 2 +- docs/en/operations/clickhouse-keeper.md | 6 +- docs/en/operations/named-collections.md | 229 ------------------ docs/en/operations/quotas.md | 2 +- .../settings.md | 11 - .../operations/settings/memory-overcommit.md | 31 --- docs/en/operations/settings/settings.md | 43 +--- .../external-dicts-dict-layout.md | 26 +- .../dictionaries/internal-dicts.md | 2 + .../functions/array-functions.md | 12 - docs/en/sql-reference/functions/geo/h3.md | 181 -------------- .../functions/ip-address-functions.md | 36 +-- .../sql-reference/functions/json-functions.md | 4 +- .../functions/rounding-functions.md | 2 +- docs/en/sql-reference/functions/statistics.md | 48 ---- docs/en/whats-new/security-changelog.md | 43 ---- 49 files changed, 160 insertions(+), 813 deletions(-) delete mode 100644 docs/en/operations/named-collections.md delete mode 100644 docs/en/operations/settings/memory-overcommit.md delete mode 100644 docs/en/sql-reference/functions/statistics.md diff --git a/docs/en/development/_category_.yml b/docs/en/development/_category_.yml index ef272510d47..e0291aa2aa6 100644 --- a/docs/en/development/_category_.yml +++ b/docs/en/development/_category_.yml @@ -1,7 +1,7 @@ position: 100 -label: 'Development' +label: 'Building ClickHouse' collapsible: true collapsed: true link: type: generated-index - title: Reference \ No newline at end of file + title: Building ClickHouse \ No newline at end of file diff --git a/docs/en/development/adding_test_queries.md b/docs/en/development/adding_test_queries.md index a73b0e1ac5d..9b993a96ed5 100644 --- a/docs/en/development/adding_test_queries.md +++ b/docs/en/development/adding_test_queries.md @@ -1,3 +1,9 @@ +--- +sidebar_label: Adding Test Queries +sidebar_position: 63 +description: Instructions on how to add a test case to ClickHouse continuous integration +--- + # How to add test queries to ClickHouse CI ClickHouse has hundreds (or even thousands) of features. Every commit gets checked by a complex set of tests containing many thousands of test cases. 
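As a hedged sketch of what such a test case usually looks like (the file name and queries below are hypothetical and not part of this patch): a functional test is a pair of files under `tests/queries/0_stateless/`, a `.sql` file with the queries and a `.reference` file with their expected output.

```sql
-- tests/queries/0_stateless/99999_example_array_sort.sql (hypothetical name)
-- The matching 99999_example_array_sort.reference file would contain the
-- expected output, one line per query result: [1,2,3] and then 3.
SELECT arraySort([3, 1, 2]);
SELECT length([3, 1, 2]);
```

The test runner executes the `.sql` file and compares its output against the `.reference` file, so the test fails on any difference.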
diff --git a/docs/en/development/architecture.md b/docs/en/development/architecture.md index b696c441374..d824ace0c65 100644 --- a/docs/en/development/architecture.md +++ b/docs/en/development/architecture.md @@ -1,11 +1,12 @@ --- -toc_priority: 62 -toc_title: Architecture Overview +sidebar_label: Architecture Overview +sidebar_position: 62 --- -# Overview of ClickHouse Architecture {#overview-of-clickhouse-architecture} +# Overview of ClickHouse Architecture -ClickHouse is a true column-oriented DBMS. Data is stored by columns, and during the execution of arrays (vectors or chunks of columns). Whenever possible, operations are dispatched on arrays, rather than on individual values. It is called “vectorized query execution” and it helps lower the cost of actual data processing. +ClickHouse is a true column-oriented DBMS. Data is stored by columns, and during the execution of arrays (vectors or chunks of columns). +Whenever possible, operations are dispatched on arrays, rather than on individual values. It is called “vectorized query execution” and it helps lower the cost of actual data processing. > This idea is nothing new. It dates back to the `APL` (A programming language, 1957) and its descendants: `A +` (APL dialect), `J` (1990), `K` (1993), and `Q` (programming language from Kx Systems, 2003). Array programming is used in scientific data processing. Neither is this idea something new in relational databases: for example, it is used in the `VectorWise` system (also known as Actian Vector Analytic Database by Actian Corporation). diff --git a/docs/en/development/browse-code.md b/docs/en/development/browse-code.md index 0fe8a46873c..da924c359ff 100644 --- a/docs/en/development/browse-code.md +++ b/docs/en/development/browse-code.md @@ -1,12 +1,13 @@ --- -toc_priority: 72 -toc_title: Source Code Browser +sidebar_label: Source Code Browser +sidebar_position: 72 +description: Various ways to browse and edit the source code --- -# Browse ClickHouse Source Code {#browse-clickhouse-source-code} +# Browse ClickHouse Source Code -You can use **Woboq** online code browser available [here](https://clickhouse.com/codebrowser/ClickHouse/src/index.html). It provides code navigation and semantic highlighting, search and indexing. The code snapshot is updated daily. +You can use the **Woboq** online code browser available [here](https://clickhouse.com/codebrowser/ClickHouse/src/index.html). It provides code navigation and semantic highlighting, search and indexing. The code snapshot is updated daily. Also, you can browse sources on [GitHub](https://github.com/ClickHouse/ClickHouse) as usual. -If you’re interested what IDE to use, we recommend CLion, QT Creator, VS Code and KDevelop (with caveats). You can use any favourite IDE. Vim and Emacs also count. +If you’re interested what IDE to use, we recommend CLion, QT Creator, VS Code and KDevelop (with caveats). You can use any favorite IDE. Vim and Emacs also count. 
diff --git a/docs/en/development/build-cross-arm.md b/docs/en/development/build-cross-arm.md index eb99105a857..305c09ae217 100644 --- a/docs/en/development/build-cross-arm.md +++ b/docs/en/development/build-cross-arm.md @@ -1,11 +1,12 @@ --- -toc_priority: 67 -toc_title: Build on Linux for AARCH64 (ARM64) +sidebar_position: 67 +sidebar_label: Build on Linux for AARCH64 (ARM64) --- -# How to Build ClickHouse on Linux for AARCH64 (ARM64) Architecture {#how-to-build-clickhouse-on-linux-for-aarch64-arm64-architecture} +# How to Build ClickHouse on Linux for AARCH64 (ARM64) Architecture -This is for the case when you have Linux machine and want to use it to build `clickhouse` binary that will run on another Linux machine with AARCH64 CPU architecture. This is intended for continuous integration checks that run on Linux servers. +This is for the case when you have Linux machine and want to use it to build `clickhouse` binary that will run on another Linux machine with AARCH64 CPU architecture. +This is intended for continuous integration checks that run on Linux servers. The cross-build for AARCH64 is based on the [Build instructions](../development/build.md), follow them first. diff --git a/docs/en/development/build-cross-osx.md b/docs/en/development/build-cross-osx.md index c7e40013113..1dbd0ec6430 100644 --- a/docs/en/development/build-cross-osx.md +++ b/docs/en/development/build-cross-osx.md @@ -1,11 +1,12 @@ --- -toc_priority: 66 -toc_title: Build on Linux for Mac OS X +sidebar_position: 66 +sidebar_label: Build on Linux for Mac OS X --- -# How to Build ClickHouse on Linux for Mac OS X {#how-to-build-clickhouse-on-linux-for-mac-os-x} +# How to Build ClickHouse on Linux for Mac OS X -This is for the case when you have Linux machine and want to use it to build `clickhouse` binary that will run on OS X. This is intended for continuous integration checks that run on Linux servers. If you want to build ClickHouse directly on Mac OS X, then proceed with [another instruction](../development/build-osx.md). +This is for the case when you have a Linux machine and want to use it to build `clickhouse` binary that will run on OS X. +This is intended for continuous integration checks that run on Linux servers. If you want to build ClickHouse directly on Mac OS X, then proceed with [another instruction](../development/build-osx.md). The cross-build for Mac OS X is based on the [Build instructions](../development/build.md), follow them first. diff --git a/docs/en/development/build-cross-riscv.md b/docs/en/development/build-cross-riscv.md index 5cdce710b41..94c0f47a05d 100644 --- a/docs/en/development/build-cross-riscv.md +++ b/docs/en/development/build-cross-riscv.md @@ -1,9 +1,9 @@ --- -toc_priority: 68 -toc_title: Build on Linux for RISC-V 64 +sidebar_position: 68 +sidebar_label: Build on Linux for RISC-V 64 --- -# How to Build ClickHouse on Linux for RISC-V 64 Architecture {#how-to-build-clickhouse-on-linux-for-risc-v-64-architecture} +# How to Build ClickHouse on Linux for RISC-V 64 Architecture As of writing (11.11.2021) building for risc-v considered to be highly experimental. Not all features can be enabled. 
diff --git a/docs/en/development/build-osx.md b/docs/en/development/build-osx.md index 19d157bcd7d..5d5706f6e6b 100644 --- a/docs/en/development/build-osx.md +++ b/docs/en/development/build-osx.md @@ -1,16 +1,21 @@ --- -toc_priority: 65 -toc_title: Build on Mac OS X +sidebar_position: 65 +sidebar_label: Build on Mac OS X +description: How to build ClickHouse on Mac OS X --- -# How to Build ClickHouse on Mac OS X {#how-to-build-clickhouse-on-mac-os-x} +# How to Build ClickHouse on Mac OS X -!!! info "You don't have to build ClickHouse yourself" - You can install pre-built ClickHouse as described in [Quick Start](https://clickhouse.com/#quick-start). - Follow `macOS (Intel)` or `macOS (Apple silicon)` installation instructions. +:::info You don't have to build ClickHouse yourself! +You can install pre-built ClickHouse as described in [Quick Start](https://clickhouse.com/#quick-start). Follow **macOS (Intel)** or **macOS (Apple silicon)** installation instructions. +::: Build should work on x86_64 (Intel) and arm64 (Apple silicon) based macOS 10.15 (Catalina) and higher with Homebrew's vanilla Clang. -It is always recommended to use vanilla `clang` compiler. It is possible to use XCode's `apple-clang` or `gcc` but it's strongly discouraged. +It is always recommended to use vanilla `clang` compiler. + +:::note +It is possible to use XCode's `apple-clang` or `gcc`, but it's strongly discouraged. +::: ## Install Homebrew {#install-homebrew} diff --git a/docs/en/development/build.md b/docs/en/development/build.md index 5379fc37937..b128412a55e 100644 --- a/docs/en/development/build.md +++ b/docs/en/development/build.md @@ -1,9 +1,10 @@ --- -toc_priority: 64 -toc_title: Build on Linux +sidebar_position: 64 +sidebar_label: Build on Linux +description: How to build ClickHouse on Linux --- -# How to Build ClickHouse on Linux {#how-to-build-clickhouse-for-development} +# How to Build ClickHouse on Linux Supported platforms: diff --git a/docs/en/development/continuous-integration.md b/docs/en/development/continuous-integration.md index f9dfebff3f9..379b78a2c42 100644 --- a/docs/en/development/continuous-integration.md +++ b/docs/en/development/continuous-integration.md @@ -1,6 +1,7 @@ --- -toc_priority: 62 -toc_title: Continuous Integration Checks +sidebar_position: 62 +sidebar_label: Continuous Integration Checks +description: When you submit a pull request, some automated checks are ran for your code by the ClickHouse continuous integration (CI) system --- # Continuous Integration Checks @@ -71,8 +72,6 @@ This check means that the CI system started to process the pull request. When it Performs some simple regex-based checks of code style, using the [`utils/check-style/check-style`](https://github.com/ClickHouse/ClickHouse/blob/master/utils/check-style/check-style) binary (note that it can be run locally). If it fails, fix the style errors following the [code style guide](style.md). -Python code is checked with [black](https://github.com/psf/black/). - ### Report Details - [Status page example](https://clickhouse-test-reports.s3.yandex.net/12550/659c78c7abb56141723af6a81bfae39335aa8cb2/style_check.html) - `output.txt` contains the check resulting errors (invalid tabulation etc), blank page means no errors. [Successful result example](https://clickhouse-test-reports.s3.yandex.net/12550/659c78c7abb56141723af6a81bfae39335aa8cb2/style_check/output.txt). 
diff --git a/docs/en/development/contrib.md b/docs/en/development/contrib.md index 6c12a3d9055..7cbe32fdd8b 100644 --- a/docs/en/development/contrib.md +++ b/docs/en/development/contrib.md @@ -1,9 +1,10 @@ --- -toc_priority: 71 -toc_title: Third-Party Libraries Used +sidebar_position: 71 +sidebar_label: Third-Party Libraries +description: A list of third-party libraries used --- -# Third-Party Libraries Used {#third-party-libraries-used} +# Third-Party Libraries Used The list of third-party libraries: diff --git a/docs/en/development/developer-instruction.md b/docs/en/development/developer-instruction.md index db78637f104..291e57fef66 100644 --- a/docs/en/development/developer-instruction.md +++ b/docs/en/development/developer-instruction.md @@ -1,11 +1,12 @@ --- -toc_priority: 61 -toc_title: For Beginners +sidebar_position: 61 +sidebar_label: Getting Started +description: Prerequisites and an overview of how to build ClickHouse --- -# The Beginner ClickHouse Developer Instruction {#the-beginner-clickhouse-developer-instruction} +# Getting Started Guide for Building ClickHouse -Building of ClickHouse is supported on Linux, FreeBSD and Mac OS X. +The building of ClickHouse is supported on Linux, FreeBSD and Mac OS X. If you use Windows, you need to create a virtual machine with Ubuntu. To start working with a virtual machine please install VirtualBox. You can download Ubuntu from the website: https://www.ubuntu.com/#download. Please create a virtual machine from the downloaded image (you should reserve at least 4GB of RAM for it). To run a command-line terminal in Ubuntu, please locate a program containing the word “terminal” in its name (gnome-terminal, konsole etc.) or just press Ctrl+Alt+T. @@ -229,25 +230,6 @@ As simple code editors, you can use Sublime Text or Visual Studio Code, or Kate Just in case, it is worth mentioning that CLion creates `build` path on its own, it also on its own selects `debug` for build type, for configuration it uses a version of CMake that is defined in CLion and not the one installed by you, and finally, CLion will use `make` to run build tasks instead of `ninja`. This is normal behaviour, just keep that in mind to avoid confusion. -## Debugging - -Many graphical IDEs offer with an integrated debugger but you can also use a standalone debugger. 
- -### GDB - -### LLDB - - # tell LLDB where to find the source code - settings set target.source-map /path/to/build/dir /path/to/source/dir - - # configure LLDB to display code before/after currently executing line - settings set stop-line-count-before 10 - settings set stop-line-count-after 10 - - target create ./clickhouse-client - # - process launch -- --query="SELECT * FROM TAB" - ## Writing Code {#writing-code} The description of ClickHouse architecture can be found here: https://clickhouse.com/docs/en/development/architecture/ diff --git a/docs/en/development/style.md b/docs/en/development/style.md index 03121880555..82cd9273680 100644 --- a/docs/en/development/style.md +++ b/docs/en/development/style.md @@ -1,9 +1,10 @@ --- -toc_priority: 69 -toc_title: C++ Guide +sidebar_position: 69 +sidebar_label: C++ Guide +description: A list of recommendations regarding coding style, naming convention, formatting and more --- -# How to Write C++ Code {#how-to-write-c-code} +# How to Write C++ Code ## General Recommendations {#general-recommendations} diff --git a/docs/en/development/tests.md b/docs/en/development/tests.md index be9fc7907af..29b69f0b697 100644 --- a/docs/en/development/tests.md +++ b/docs/en/development/tests.md @@ -1,11 +1,12 @@ --- -toc_priority: 70 -toc_title: Testing +sidebar_position: 70 +sidebar_label: Testing +description: Most of ClickHouse features can be tested with functional tests and they are mandatory to use for every change in ClickHouse code that can be tested that way. --- -# ClickHouse Testing {#clickhouse-testing} +# ClickHouse Testing -## Functional Tests {#functional-tests} +## Functional Tests Functional tests are the most simple and convenient to use. Most of ClickHouse features can be tested with functional tests and they are mandatory to use for every change in ClickHouse code that can be tested that way. diff --git a/docs/en/engines/database-engines/materialized-mysql.md b/docs/en/engines/database-engines/materialized-mysql.md index d7dcf21cb02..3dc14c87be7 100644 --- a/docs/en/engines/database-engines/materialized-mysql.md +++ b/docs/en/engines/database-engines/materialized-mysql.md @@ -76,7 +76,7 @@ When working with the `MaterializedMySQL` database engine, [ReplacingMergeTree]( | FLOAT | [Float32](../../sql-reference/data-types/float.md) | | DOUBLE | [Float64](../../sql-reference/data-types/float.md) | | DECIMAL, NEWDECIMAL | [Decimal](../../sql-reference/data-types/decimal.md) | -| DATE, NEWDATE | [Date32](../../sql-reference/data-types/date32.md) | +| DATE, NEWDATE | [Date](../../sql-reference/data-types/date.md) | | DATETIME, TIMESTAMP | [DateTime](../../sql-reference/data-types/datetime.md) | | DATETIME2, TIMESTAMP2 | [DateTime64](../../sql-reference/data-types/datetime64.md) | | YEAR | [UInt16](../../sql-reference/data-types/int-uint.md) | diff --git a/docs/en/engines/database-engines/mysql.md b/docs/en/engines/database-engines/mysql.md index df4965b1f8c..c5a1bba44b2 100644 --- a/docs/en/engines/database-engines/mysql.md +++ b/docs/en/engines/database-engines/mysql.md @@ -49,8 +49,6 @@ ENGINE = MySQL('host:port', ['database' | database], 'user', 'password') All other MySQL data types are converted into [String](../../sql-reference/data-types/string.md). 
-Because of the ClickHouse date type has a different range from the MySQL date range,If the MySQL date type is out of the range of ClickHouse date, you can use the setting mysql_datatypes_support_level to modify the mapping from the MySQL date type to the Clickhouse date type: date2Date32 (convert MySQL's date type to ClickHouse Date32) or date2String(convert MySQL's date type to ClickHouse String,this is usually used when your mysql data is less than 1925) or default(convert MySQL's date type to ClickHouse Date). - [Nullable](../../sql-reference/data-types/nullable.md) is supported. ## Global Variables Support {#global-variables-support} diff --git a/docs/en/engines/table-engines/integrations/hive.md b/docs/en/engines/table-engines/integrations/hive.md index 61147467690..b804b9c2279 100644 --- a/docs/en/engines/table-engines/integrations/hive.md +++ b/docs/en/engines/table-engines/integrations/hive.md @@ -137,7 +137,7 @@ CREATE TABLE test.test_orc `f_array_array_float` Array(Array(Float32)), `day` String ) -ENGINE = Hive('thrift://localhost:9083', 'test', 'test_orc') +ENGINE = Hive('thrift://202.168.117.26:9083', 'test', 'test_orc') PARTITION BY day ``` diff --git a/docs/en/example-datasets/amplab-benchmark.md b/docs/en/example-datasets/amplab-benchmark.md index b410a3595ec..a87ac53e2e3 100644 --- a/docs/en/example-datasets/amplab-benchmark.md +++ b/docs/en/example-datasets/amplab-benchmark.md @@ -1,6 +1,6 @@ --- -toc_priority: 19 -toc_title: AMPLab Big Data Benchmark +sidebar_label: AMPLab Big Data Benchmark +description: A benchmark dataset used for comparing the performance of data warehousing solutions. --- # AMPLab Big Data Benchmark {#amplab-big-data-benchmark} diff --git a/docs/en/example-datasets/brown-benchmark.md b/docs/en/example-datasets/brown-benchmark.md index 93049d1f76a..0960756dbe9 100644 --- a/docs/en/example-datasets/brown-benchmark.md +++ b/docs/en/example-datasets/brown-benchmark.md @@ -1,6 +1,6 @@ --- -toc_priority: 20 -toc_title: Brown University Benchmark +sidebar_label: Brown University Benchmark +description: A new analytical benchmark for machine-generated log data --- # Brown University Benchmark diff --git a/docs/en/example-datasets/cell-towers.md b/docs/en/example-datasets/cell-towers.md index 1f681fc32d8..7a35a28faa6 100644 --- a/docs/en/example-datasets/cell-towers.md +++ b/docs/en/example-datasets/cell-towers.md @@ -1,9 +1,8 @@ --- -toc_priority: 21 -toc_title: Cell Towers +sidebar_label: Cell Towers --- -# Cell Towers {#cell-towers} +# Cell Towers This dataset is from [OpenCellid](https://www.opencellid.org/) - The world's largest Open Database of Cell Towers. 
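As a hedged illustration of how this dataset is typically queried once loaded (it assumes the table is named `cell_towers` and has a `radio` column, as in the import steps on this page; adjust the names if your setup differs):

```sql
-- Number of towers per radio type (LTE, GSM, UMTS, ...), largest first.
SELECT
    radio,
    count() AS total
FROM cell_towers
GROUP BY radio
ORDER BY total DESC;
```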
diff --git a/docs/en/example-datasets/criteo.md b/docs/en/example-datasets/criteo.md index 08298172c70..2d1c700d15c 100644 --- a/docs/en/example-datasets/criteo.md +++ b/docs/en/example-datasets/criteo.md @@ -1,9 +1,8 @@ --- -toc_priority: 18 -toc_title: Terabyte Click Logs from Criteo +sidebar_label: Terabyte Click Logs from Criteo --- -# Terabyte of Click Logs from Criteo {#terabyte-of-click-logs-from-criteo} +# Terabyte of Click Logs from Criteo Download the data from http://labs.criteo.com/downloads/download-terabyte-click-logs/ diff --git a/docs/en/example-datasets/github-events.md b/docs/en/example-datasets/github-events.md index e470e88b182..3a0cbc3324d 100644 --- a/docs/en/example-datasets/github-events.md +++ b/docs/en/example-datasets/github-events.md @@ -1,6 +1,5 @@ --- -toc_priority: 11 -toc_title: GitHub Events +sidebar_label: GitHub Events --- # GitHub Events Dataset diff --git a/docs/en/example-datasets/menus.md b/docs/en/example-datasets/menus.md index 665944b3e6f..c572dcdb491 100644 --- a/docs/en/example-datasets/menus.md +++ b/docs/en/example-datasets/menus.md @@ -1,9 +1,8 @@ --- -toc_priority: 21 -toc_title: Menus +sidebar_label: New York Public Library "What's on the Menu?" Dataset --- -# New York Public Library "What's on the Menu?" Dataset {#menus-dataset} +# New York Public Library "What's on the Menu?" Dataset The dataset is created by the New York Public Library. It contains historical data on the menus of hotels, restaurants and cafes with the dishes along with their prices. diff --git a/docs/en/example-datasets/metrica.md b/docs/en/example-datasets/metrica.md index d9d8beb0181..2194ad85091 100644 --- a/docs/en/example-datasets/metrica.md +++ b/docs/en/example-datasets/metrica.md @@ -1,9 +1,9 @@ --- -toc_priority: 15 -toc_title: Web Analytics Data +sidebar_label: Web Analytics Data +description: Dataset consists of two tables containing anonymized web analytics data with hits and visits --- -# Anonymized Web Analytics Data {#anonymized-web-analytics-data} +# Anonymized Web Analytics Data Dataset consists of two tables containing anonymized web analytics data with hits (`hits_v1`) and visits (`visits_v1`). diff --git a/docs/en/example-datasets/nyc-taxi.md b/docs/en/example-datasets/nyc-taxi.md index a7825988695..da7be71d46b 100644 --- a/docs/en/example-datasets/nyc-taxi.md +++ b/docs/en/example-datasets/nyc-taxi.md @@ -1,9 +1,9 @@ --- -toc_priority: 20 -toc_title: New York Taxi Data +sidebar_label: New York Taxi Data +description: Data for billions of taxi and for-hire vehicle (Uber, Lyft, etc.) 
trips originating in New York City since 2009 --- -# New York Taxi Data {#new-york-taxi-data} +# New York Taxi Data This dataset can be obtained in two ways: diff --git a/docs/en/example-datasets/ontime.md b/docs/en/example-datasets/ontime.md index efc807b75fa..51df6186bd5 100644 --- a/docs/en/example-datasets/ontime.md +++ b/docs/en/example-datasets/ontime.md @@ -1,9 +1,9 @@ --- -toc_priority: 21 -toc_title: OnTime +sidebar_label: OnTime Airline Flight Data +description: Dataset containing the on-time performance of airline flights --- -# OnTime {#ontime} +# OnTime This dataset can be obtained in two ways: diff --git a/docs/en/example-datasets/opensky.md b/docs/en/example-datasets/opensky.md index 2d901397cb2..f55ebc79590 100644 --- a/docs/en/example-datasets/opensky.md +++ b/docs/en/example-datasets/opensky.md @@ -1,11 +1,11 @@ --- -toc_priority: 20 -toc_title: OpenSky +sidebar_label: Air Traffic Data +description: The data in this dataset is derived and cleaned from the full OpenSky dataset to illustrate the development of air traffic during the COVID-19 pandemic. --- -# Crowdsourced air traffic data from The OpenSky Network 2020 {#opensky} +# Crowdsourced air traffic data from The OpenSky Network 2020 -"The data in this dataset is derived and cleaned from the full OpenSky dataset to illustrate the development of air traffic during the COVID-19 pandemic. It spans all flights seen by the network's more than 2500 members since 1 January 2019. More data will be periodically included in the dataset until the end of the COVID-19 pandemic". +The data in this dataset is derived and cleaned from the full OpenSky dataset to illustrate the development of air traffic during the COVID-19 pandemic. It spans all flights seen by the network's more than 2500 members since 1 January 2019. More data will be periodically included in the dataset until the end of the COVID-19 pandemic. Source: https://zenodo.org/record/5092942#.YRBCyTpRXYd diff --git a/docs/en/example-datasets/recipes.md b/docs/en/example-datasets/recipes.md index 70a56a0547f..9a27255e6a8 100644 --- a/docs/en/example-datasets/recipes.md +++ b/docs/en/example-datasets/recipes.md @@ -1,6 +1,5 @@ --- -toc_priority: 16 -toc_title: Recipes Dataset +sidebar_label: Recipes Dataset --- # Recipes Dataset diff --git a/docs/en/example-datasets/star-schema.md b/docs/en/example-datasets/star-schema.md index 14fa7cef654..a8949ef74b9 100644 --- a/docs/en/example-datasets/star-schema.md +++ b/docs/en/example-datasets/star-schema.md @@ -1,9 +1,11 @@ --- -toc_priority: 16 -toc_title: Star Schema Benchmark +sidebar_label: Star Schema Benchmark +description: "Dataset based on the TPC-H dbgen source. The coding style and architecture +follows the TPCH dbgen." --- -# Star Schema Benchmark {#star-schema-benchmark} +# Star Schema Benchmark + Compiling dbgen: diff --git a/docs/en/example-datasets/uk-price-paid.md b/docs/en/example-datasets/uk-price-paid.md index 4b0ba25907d..e0f20639aea 100644 --- a/docs/en/example-datasets/uk-price-paid.md +++ b/docs/en/example-datasets/uk-price-paid.md @@ -1,9 +1,8 @@ --- -toc_priority: 20 -toc_title: UK Property Price Paid +sidebar_label: UK Property Price Paid --- -# UK Property Price Paid {#uk-property-price-paid} +# UK Property Price Paid The dataset contains data about prices paid for real-estate property in England and Wales. The data is available since year 1995. The size of the dataset in uncompressed form is about 4 GiB and it will take about 278 MiB in ClickHouse. 
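For illustration, a typical query over this dataset; it is a sketch that assumes the table is created as `uk_price_paid` with `price` and `date` columns, as in the rest of this page:

```sql
-- Average price paid per year since 1995, rounded to whole pounds.
SELECT
    toYear(date) AS year,
    round(avg(price)) AS avg_price
FROM uk_price_paid
GROUP BY year
ORDER BY year;
```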
diff --git a/docs/en/example-datasets/wikistat.md b/docs/en/example-datasets/wikistat.md index 3e3f7b164ce..1185338a1da 100644 --- a/docs/en/example-datasets/wikistat.md +++ b/docs/en/example-datasets/wikistat.md @@ -1,11 +1,10 @@ --- -toc_priority: 17 -toc_title: WikiStat +sidebar_label: WikiStat --- -# WikiStat {#wikistat} +# WikiStat -See: http://dumps.wikimedia.org/other/pagecounts-raw/ +See http://dumps.wikimedia.org/other/pagecounts-raw/ for details. Creating a table: diff --git a/docs/en/install.md b/docs/en/install.md index b499b584865..ecb4eb93042 100644 --- a/docs/en/install.md +++ b/docs/en/install.md @@ -188,18 +188,29 @@ sudo ./clickhouse install ### From Precompiled Binaries for Non-Standard Environments {#from-binaries-non-linux} -For non-Linux operating systems and for AArch64 CPU arhitecture, ClickHouse builds are provided as a cross-compiled binary from the latest commit of the `master` branch (with a few hours delay). +For non-Linux operating systems and for AArch64 CPU arhitecture, ClickHouse builds are provided as a cross-compiled binary from the latest commit of the `master` branch (with a few hours delay). These builds are not recommended for use in production environments because they are less thoroughly tested, and they also only contain a subset of ClickHouse features available. -- [MacOS x86_64](https://builds.clickhouse.com/master/macos/clickhouse) — `curl -O 'https://builds.clickhouse.com/master/macos/clickhouse' && chmod a+x ./clickhouse` -- [MacOS Aarch64 (Apple Silicon)](https://builds.clickhouse.com/master/macos-aarch64/clickhouse) — `curl -O 'https://builds.clickhouse.com/master/macos-aarch64/clickhouse' && chmod a+x ./clickhouse` -- [FreeBSD x86_64](https://builds.clickhouse.com/master/freebsd/clickhouse) — `curl -O 'https://builds.clickhouse.com/master/freebsd/clickhouse' && chmod a+x ./clickhouse` -- [Linux AArch64](https://builds.clickhouse.com/master/aarch64/clickhouse) — `curl -O 'https://builds.clickhouse.com/master/aarch64/clickhouse' && chmod a+x ./clickhouse` -After downloading, you can use the `clickhouse client` to connect to the server, or `clickhouse local` to process local data. +- [MacOS x86_64](https://builds.clickhouse.com/master/macos/clickhouse) + ```bash + curl -O 'https://builds.clickhouse.com/master/macos/clickhouse' && chmod a+x ./clickhouse + ``` +- [MacOS Aarch64 (Apple Silicon)](https://builds.clickhouse.com/master/macos-aarch64/clickhouse) + ```bash + curl -O 'https://builds.clickhouse.com/master/macos-aarch64/clickhouse' && chmod a+x ./clickhouse + ``` +- [FreeBSD x86_64](https://builds.clickhouse.com/master/freebsd/clickhouse) + ```bash + curl -O 'https://builds.clickhouse.com/master/freebsd/clickhouse' && chmod a+x ./clickhouse + ``` +- [Linux AArch64](https://builds.clickhouse.com/master/aarch64/clickhouse) + ```bash + curl -O 'https://builds.clickhouse.com/master/aarch64/clickhouse' && chmod a+x ./clickhouse + ``` -Run `sudo ./clickhouse install` if you want to install clickhouse system-wide (also with needed configuration files, configuring users etc.). After that run `clickhouse start` commands to start the clickhouse-server and `clickhouse-client` to connect to it. +Run `sudo ./clickhouse install` to install ClickHouse system-wide (also with needed configuration files, configuring users etc.). Then run `clickhouse start` commands to start the clickhouse-server and `clickhouse-client` to connect to it. 
-These builds are not recommended for use in production environments because they are less thoroughly tested, but you can do so on your own risk. They also have only a subset of ClickHouse features available. +Use the `clickhouse client` to connect to the server, or `clickhouse local` to process local data. ### From Sources {#from-sources} diff --git a/docs/en/interfaces/formats.md b/docs/en/interfaces/formats.md index a7066fca087..058c9b6fd4a 100644 --- a/docs/en/interfaces/formats.md +++ b/docs/en/interfaces/formats.md @@ -51,7 +51,6 @@ The supported formats are: | [PrettySpace](#prettyspace) | ✗ | ✔ | | [Protobuf](#protobuf) | ✔ | ✔ | | [ProtobufSingle](#protobufsingle) | ✔ | ✔ | -| [ProtobufList](#protobuflist) | ✔ | ✔ | | [Avro](#data-format-avro) | ✔ | ✔ | | [AvroConfluent](#data-format-avro-confluent) | ✔ | ✗ | | [Parquet](#data-format-parquet) | ✔ | ✔ | @@ -65,7 +64,7 @@ The supported formats are: | [Null](#null) | ✗ | ✔ | | [XML](#xml) | ✗ | ✔ | | [CapnProto](#capnproto) | ✔ | ✔ | -| [LineAsString](#lineasstring) | ✔ | ✔ | +| [LineAsString](#lineasstring) | ✔ | ✗ | | [Regexp](#data-format-regexp) | ✔ | ✗ | | [RawBLOB](#rawblob) | ✔ | ✔ | | [MsgPack](#msgpack) | ✔ | ✔ | @@ -402,7 +401,7 @@ Parsing allows the presence of the additional field `tskv` without the equal sig Comma Separated Values format ([RFC](https://tools.ietf.org/html/rfc4180)). -When formatting, strings are enclosed in double-quotes. A double quote inside a string is output as two double quotes in a row. There are no other rules for escaping characters. Date and date-time are enclosed in double-quotes. Numbers are output without quotes. Values are separated by a delimiter character, which is `,` by default. The delimiter character is defined in the setting [format_csv_delimiter](../operations/settings/settings.md#settings-format_csv_delimiter). Rows are separated using the Unix line feed (LF). Arrays are serialized in CSV as follows: first, the array is serialized to a string as in TabSeparated format, and then the resulting string is output to CSV in double-quotes. Tuples in CSV format are serialized as separate columns (that is, their nesting in the tuple is lost). +When formatting, rows are enclosed in double-quotes. A double quote inside a string is output as two double quotes in a row. There are no other rules for escaping characters. Date and date-time are enclosed in double-quotes. Numbers are output without quotes. Values are separated by a delimiter character, which is `,` by default. The delimiter character is defined in the setting [format_csv_delimiter](../operations/settings/settings.md#settings-format_csv_delimiter). Rows are separated using the Unix line feed (LF). Arrays are serialized in CSV as follows: first, the array is serialized to a string as in TabSeparated format, and then the resulting string is output to CSV in double-quotes. Tuples in CSV format are serialized as separate columns (that is, their nesting in the tuple is lost). ``` bash $ clickhouse-client --format_csv_delimiter="|" --query="INSERT INTO test.csv FORMAT CSV" < data.csv @@ -410,7 +409,7 @@ $ clickhouse-client --format_csv_delimiter="|" --query="INSERT INTO test.csv FOR \*By default, the delimiter is `,`. See the [format_csv_delimiter](../operations/settings/settings.md#settings-format_csv_delimiter) setting for more information. -When parsing, all values can be parsed either with or without quotes. Both double and single quotes are supported. Strings can also be arranged without quotes. 
In this case, they are parsed up to the delimiter character or line feed (CR or LF). In violation of the RFC, when parsing strings without quotes, the leading and trailing spaces and tabs are ignored. For the line feed, Unix (LF), Windows (CR LF) and Mac OS Classic (CR LF) types are all supported. +When parsing, all values can be parsed either with or without quotes. Both double and single quotes are supported. Rows can also be arranged without quotes. In this case, they are parsed up to the delimiter character or line feed (CR or LF). In violation of the RFC, when parsing rows without quotes, the leading and trailing spaces and tabs are ignored. For the line feed, Unix (LF), Windows (CR LF) and Mac OS Classic (CR LF) types are all supported. If setting [input_format_csv_empty_as_default](../operations/settings/settings.md#settings-input_format_csv_empty_as_default) is enabled, empty unquoted input values are replaced with default values. For complex default expressions [input_format_defaults_for_omitted_fields](../operations/settings/settings.md#settings-input_format_defaults_for_omitted_fields) must be enabled too. @@ -1231,38 +1230,7 @@ See also [how to read/write length-delimited protobuf messages in popular langua ## ProtobufSingle {#protobufsingle} -Same as [Protobuf](#protobuf) but for storing/parsing a single Protobuf message without length delimiter. -As a result, only a single table row can be written/read. - -## ProtobufList {#protobuflist} - -Similar to Protobuf but rows are represented as a sequence of sub-messages contained in a message with fixed name "Envelope". - -Usage example: - -``` sql -SELECT * FROM test.table FORMAT ProtobufList SETTINGS format_schema = 'schemafile:MessageType' -``` - -``` bash -cat protobuflist_messages.bin | clickhouse-client --query "INSERT INTO test.table FORMAT ProtobufList SETTINGS format_schema='schemafile:MessageType'" -``` - -where the file `schemafile.proto` looks like this: - -``` capnp -syntax = "proto3"; - -message Envelope { - message MessageType { - string name = 1; - string surname = 2; - uint32 birthDate = 3; - repeated string phoneNumbers = 4; - }; - MessageType row = 1; -}; -``` +Same as [Protobuf](#protobuf) but for storing/parsing single Protobuf message without length delimiters. 
## Avro {#data-format-avro} @@ -1396,8 +1364,7 @@ The table below shows supported data types and how they match ClickHouse [data t | `FLOAT`, `HALF_FLOAT` | [Float32](../sql-reference/data-types/float.md) | `FLOAT` | | `DOUBLE` | [Float64](../sql-reference/data-types/float.md) | `DOUBLE` | | `DATE32` | [Date](../sql-reference/data-types/date.md) | `UINT16` | -| `DATE64` | [DateTime](../sql-reference/data-types/datetime.md) | `UINT32` | -| `TIMESTAMP` | [DateTime64](../sql-reference/data-types/datetime64.md) | `TIMESTAMP` | +| `DATE64`, `TIMESTAMP` | [DateTime](../sql-reference/data-types/datetime.md) | `UINT32` | | `STRING`, `BINARY` | [String](../sql-reference/data-types/string.md) | `BINARY` | | — | [FixedString](../sql-reference/data-types/fixedstring.md) | `BINARY` | | `DECIMAL` | [Decimal](../sql-reference/data-types/decimal.md) | `DECIMAL` | @@ -1454,8 +1421,7 @@ The table below shows supported data types and how they match ClickHouse [data t | `FLOAT`, `HALF_FLOAT` | [Float32](../sql-reference/data-types/float.md) | `FLOAT32` | | `DOUBLE` | [Float64](../sql-reference/data-types/float.md) | `FLOAT64` | | `DATE32` | [Date](../sql-reference/data-types/date.md) | `UINT16` | -| `DATE64` | [DateTime](../sql-reference/data-types/datetime.md) | `UINT32` | -| `TIMESTAMP` | [DateTime64](../sql-reference/data-types/datetime64.md) | `TIMESTAMP` | +| `DATE64`, `TIMESTAMP` | [DateTime](../sql-reference/data-types/datetime.md) | `UINT32` | | `STRING`, `BINARY` | [String](../sql-reference/data-types/string.md) | `BINARY` | | `STRING`, `BINARY` | [FixedString](../sql-reference/data-types/fixedstring.md) | `BINARY` | | `DECIMAL` | [Decimal](../sql-reference/data-types/decimal.md) | `DECIMAL` | @@ -1517,8 +1483,7 @@ The table below shows supported data types and how they match ClickHouse [data t | `FLOAT`, `HALF_FLOAT` | [Float32](../sql-reference/data-types/float.md) | `FLOAT` | | `DOUBLE` | [Float64](../sql-reference/data-types/float.md) | `DOUBLE` | | `DATE32` | [Date](../sql-reference/data-types/date.md) | `DATE32` | -| `DATE64` | [DateTime](../sql-reference/data-types/datetime.md) | `UINT32` | -| `TIMESTAMP` | [DateTime64](../sql-reference/data-types/datetime64.md) | `TIMESTAMP` | +| `DATE64`, `TIMESTAMP` | [DateTime](../sql-reference/data-types/datetime.md) | `TIMESTAMP` | | `STRING`, `BINARY` | [String](../sql-reference/data-types/string.md) | `BINARY` | | `DECIMAL` | [Decimal](../sql-reference/data-types/decimal.md) | `DECIMAL` | | `LIST` | [Array](../sql-reference/data-types/array.md) | `LIST` | diff --git a/docs/en/operations/caches.md b/docs/en/operations/caches.md index 9aa6419d89c..279204a8af1 100644 --- a/docs/en/operations/caches.md +++ b/docs/en/operations/caches.md @@ -5,7 +5,7 @@ toc_title: Caches # Cache Types {#cache-types} -When performing queries, ClickHouse uses different caches. +When performing queries, ClichHouse uses different caches. Main cache types: diff --git a/docs/en/operations/clickhouse-keeper.md b/docs/en/operations/clickhouse-keeper.md index a8ca2079070..35ec5d858f5 100644 --- a/docs/en/operations/clickhouse-keeper.md +++ b/docs/en/operations/clickhouse-keeper.md @@ -55,7 +55,7 @@ Internal coordination settings are located in `..` section and contain servers description. @@ -121,7 +121,7 @@ clickhouse keeper --config /etc/your_path_to_config/config.xml ClickHouse Keeper also provides 4lw commands which are almost the same with Zookeeper. Each command is composed of four letters such as `mntr`, `stat` etc. 
There are some more interesting commands: `stat` gives some general information about the server and connected clients, while `srvr` and `cons` give extended details on server and connections respectively. -The 4lw commands has a allow list configuration `four_letter_word_allow_list` which has default value "conf,cons,crst,envi,ruok,srst,srvr,stat,wchs,dirs,mntr,isro". +The 4lw commands has a white list configuration `four_letter_word_white_list` which has default value "conf,cons,crst,envi,ruok,srst,srvr,stat,wchc,wchs,dirs,mntr,isro". You can issue the commands to ClickHouse Keeper via telnet or nc, at the client port. @@ -201,7 +201,7 @@ Server stats reset. ``` server_id=1 tcp_port=2181 -four_letter_word_allow_list=* +four_letter_word_white_list=* log_storage_path=./coordination/logs snapshot_storage_path=./coordination/snapshots max_requests_batch_size=100 diff --git a/docs/en/operations/named-collections.md b/docs/en/operations/named-collections.md deleted file mode 100644 index dce7938f98b..00000000000 --- a/docs/en/operations/named-collections.md +++ /dev/null @@ -1,229 +0,0 @@ ---- -toc_priority: 69 -toc_title: "Named connections" ---- - -# Storing details for connecting to external sources in configuration files {#named-collections} - -Details for connecting to external sources (dictionaries, tables, table functions) can be saved -in configuration files and thus simplify the creation of objects and hide credentials -from users with only SQL access. - -Parameters can be set in XML `CSV` and overridden in SQL `, format = 'TSV'`. -The parameters in SQL can be overridden using format `key` = `value`: `compression_method = 'gzip'`. - -Named connections are stored in the `config.xml` file of the ClickHouse server in the `` section and are applied when ClickHouse starts. - -Example of configuration: -```xml -$ cat /etc/clickhouse-server/config.d/named_collections.xml - - - ... - - -``` - -## Named connections for accessing S3. - -The description of parameters see [s3 Table Function](../sql-reference/table-functions/s3.md). - -Example of configuration: -```xml - - - - AKIAIOSFODNN7EXAMPLE - wJalrXUtnFEMI/K7MDENG/bPxRfiCYEXAMPLEKEY - CSV - - - -``` - -### Example of using named connections with the s3 function - -```sql -INSERT INTO FUNCTION s3(s3_mydata, url = 'https://s3.us-east-1.amazonaws.com/yourbucket/mydata/test_file.tsv.gz', - format = 'TSV', structure = 'number UInt64', compression_method = 'gzip') -SELECT * FROM numbers(10000); - -SELECT count() -FROM s3(s3_mydata, url = 'https://s3.us-east-1.amazonaws.com/yourbucket/mydata/test_file.tsv.gz') - -┌─count()─┐ -│ 10000 │ -└─────────┘ -1 rows in set. Elapsed: 0.279 sec. Processed 10.00 thousand rows, 90.00 KB (35.78 thousand rows/s., 322.02 KB/s.) -``` - -### Example of using named connections with an S3 table - -```sql -CREATE TABLE s3_engine_table (number Int64) -ENGINE=S3(s3_mydata, url='https://s3.us-east-1.amazonaws.com/yourbucket/mydata/test_file.tsv.gz', format = 'TSV') -SETTINGS input_format_with_names_use_header = 0; - -SELECT * FROM s3_engine_table LIMIT 3; -┌─number─┐ -│ 0 │ -│ 1 │ -│ 2 │ -└────────┘ -``` - -## Named connections for accessing MySQL database - -The description of parameters see [mysql](../sql-reference/table-functions/mysql.md). 
- -Example of configuration: -```xml - - - - myuser - mypass - 127.0.0.1 - 3306 - test - 8 - 1 - 1 - - - -``` - -### Example of using named connections with the mysql function - -```sql -SELECT count() FROM mysql(mymysql, table = 'test'); - -┌─count()─┐ -│ 3 │ -└─────────┘ -``` - -### Example of using named connections with an MySQL table - -```sql -CREATE TABLE mytable(A Int64) ENGINE = MySQL(mymysql, table = 'test', connection_pool_size=3, replace_query=0); -SELECT count() FROM mytable; - -┌─count()─┐ -│ 3 │ -└─────────┘ -``` - -### Example of using named connections with database with engine MySQL - -```sql -CREATE DATABASE mydatabase ENGINE = MySQL(mymysql); - -SHOW TABLES FROM mydatabase; - -┌─name───┐ -│ source │ -│ test │ -└────────┘ -``` - -### Example of using named connections with an external dictionary with source MySQL - -```sql -CREATE DICTIONARY dict (A Int64, B String) -PRIMARY KEY A -SOURCE(MYSQL(NAME mymysql TABLE 'source')) -LIFETIME(MIN 1 MAX 2) -LAYOUT(HASHED()); - -SELECT dictGet('dict', 'B', 2); - -┌─dictGet('dict', 'B', 2)─┐ -│ two │ -└─────────────────────────┘ -``` - -## Named connections for accessing PostgreSQL database - -The description of parameters see [postgresql](../sql-reference/table-functions/postgresql.md). - -Example of configuration: -```xml - - - - pguser - jw8s0F4 - 127.0.0.1 - 5432 - test - test_schema - 8 - - - -``` - -### Example of using named connections with the postgresql function - -```sql -SELECT * FROM postgresql(mypg, table = 'test'); - -┌─a─┬─b───┐ -│ 2 │ two │ -│ 1 │ one │ -└───┴─────┘ - - -SELECT * FROM postgresql(mypg, table = 'test', schema = 'public'); - -┌─a─┐ -│ 1 │ -│ 2 │ -│ 3 │ -└───┘ -``` - - -### Example of using named connections with database with engine PostgreSQL - -```sql -CREATE TABLE mypgtable (a Int64) ENGINE = PostgreSQL(mypg, table = 'test', schema = 'public'); - -SELECT * FROM mypgtable; - -┌─a─┐ -│ 1 │ -│ 2 │ -│ 3 │ -└───┘ -``` - -### Example of using named connections with database with engine PostgreSQL - -```sql -CREATE DATABASE mydatabase ENGINE = PostgreSQL(mypg); - -SHOW TABLES FROM mydatabase - -┌─name─┐ -│ test │ -└──────┘ -``` - -### Example of using named connections with an external dictionary with source POSTGRESQL - -```sql -CREATE DICTIONARY dict (a Int64, b String) -PRIMARY KEY a -SOURCE(POSTGRESQL(NAME mypg TABLE test)) -LIFETIME(MIN 1 MAX 2) -LAYOUT(HASHED()); - -SELECT dictGet('dict', 'b', 2); - -┌─dictGet('dict', 'b', 2)─┐ -│ two │ -└─────────────────────────┘ -``` diff --git a/docs/en/operations/quotas.md b/docs/en/operations/quotas.md index 6c6fbbf9cfb..6d22a5f2a33 100644 --- a/docs/en/operations/quotas.md +++ b/docs/en/operations/quotas.md @@ -101,7 +101,7 @@ Quotas can use the “quota key” feature to report on resources for multiple k diff --git a/docs/en/sql-reference/functions/ip-address-functions.md b/docs/en/sql-reference/functions/ip-address-functions.md index cf3f92580aa..469a66d460f 100644 --- a/docs/en/sql-reference/functions/ip-address-functions.md +++ b/docs/en/sql-reference/functions/ip-address-functions.md @@ -13,18 +13,10 @@ Alias: `INET_NTOA`. ## IPv4StringToNum(s) {#ipv4stringtonums} -The reverse function of IPv4NumToString. If the IPv4 address has an invalid format, it throws exception. +The reverse function of IPv4NumToString. If the IPv4 address has an invalid format, it returns 0. Alias: `INET_ATON`. -## IPv4StringToNumOrDefault(s) {#ipv4stringtonums} - -Same as `IPv4StringToNum`, but if the IPv4 address has an invalid format, it returns 0. 
- -## IPv4StringToNumOrNull(s) {#ipv4stringtonums} - -Same as `IPv4StringToNum`, but if the IPv4 address has an invalid format, it returns null. - ## IPv4NumToStringClassC(num) {#ipv4numtostringclasscnum} Similar to IPv4NumToString, but using xxx instead of the last octet. @@ -131,7 +123,7 @@ LIMIT 10 ## IPv6StringToNum {#ipv6stringtonums} -The reverse function of [IPv6NumToString](#ipv6numtostringx). If the IPv6 address has an invalid format, it throws exception. +The reverse function of [IPv6NumToString](#ipv6numtostringx). If the IPv6 address has an invalid format, it returns a string of null bytes. If the input string contains a valid IPv4 address, returns its IPv6 equivalent. HEX can be uppercase or lowercase. @@ -176,14 +168,6 @@ Result: - [cutIPv6](#cutipv6x-bytestocutforipv6-bytestocutforipv4). -## IPv6StringToNumOrDefault(s) {#ipv6stringtonums} - -Same as `IPv6StringToNum`, but if the IPv6 address has an invalid format, it returns 0. - -## IPv6StringToNumOrNull(s) {#ipv6stringtonums} - -Same as `IPv6StringToNum`, but if the IPv6 address has an invalid format, it returns null. - ## IPv4ToIPv6(x) {#ipv4toipv6x} Takes a `UInt32` number. Interprets it as an IPv4 address in [big endian](https://en.wikipedia.org/wiki/Endianness). Returns a `FixedString(16)` value containing the IPv6 address in binary format. Examples: @@ -277,14 +261,6 @@ SELECT └───────────────────────────────────┴──────────────────────────┘ ``` -## toIPv4OrDefault(string) {#toipv4ordefaultstring} - -Same as `toIPv4`, but if the IPv4 address has an invalid format, it returns 0. - -## toIPv4OrNull(string) {#toipv4ornullstring} - -Same as `toIPv4`, but if the IPv4 address has an invalid format, it returns null. - ## toIPv6 {#toipv6string} Converts a string form of IPv6 address to [IPv6](../../sql-reference/data-types/domains/ipv6.md) type. If the IPv6 address has an invalid format, returns an empty value. @@ -341,14 +317,6 @@ Result: └─────────────────────┘ ``` -## IPv6StringToNumOrDefault(s) {#toipv6ordefaultstring} - -Same as `toIPv6`, but if the IPv6 address has an invalid format, it returns 0. - -## IPv6StringToNumOrNull(s) {#toipv6ornullstring} - -Same as `toIPv6`, but if the IPv6 address has an invalid format, it returns null. - ## isIPv4String {#isipv4string} Determines whether the input string is an IPv4 address or not. If `string` is IPv6 address returns `0`. diff --git a/docs/en/sql-reference/functions/json-functions.md b/docs/en/sql-reference/functions/json-functions.md index 8270864de74..d5622ac5fdc 100644 --- a/docs/en/sql-reference/functions/json-functions.md +++ b/docs/en/sql-reference/functions/json-functions.md @@ -5,7 +5,9 @@ toc_title: JSON # Functions for Working with JSON {#functions-for-working-with-json} -ClickHouse has special functions for working with this JSON. The `visitParam` functions make strong assumptions about what the JSON can be, but they try to do as little as possible to get the job done. The following assumptions are made: +ClickHouse has special functions for working with this JSON. All the JSON functions are based on strong assumptions about what the JSON can be, but they try to do as little as possible to get the job done. + +The following assumptions are made: 1. The field name (function argument) must be a constant. 2. The field name is somehow canonically encoded in JSON. 
For example: `visitParamHas('{"abc":"def"}', 'abc') = 1`, but `visitParamHas('{"\\u0061\\u0062\\u0063":"def"}', 'abc') = 0` diff --git a/docs/en/sql-reference/functions/rounding-functions.md b/docs/en/sql-reference/functions/rounding-functions.md index c9044c62ca4..b224e7ab406 100644 --- a/docs/en/sql-reference/functions/rounding-functions.md +++ b/docs/en/sql-reference/functions/rounding-functions.md @@ -189,7 +189,7 @@ Accepts a number. If the number is less than one, it returns 0. Otherwise, it ro ## roundDuration(num) {#rounddurationnum} -Accepts a number. If the number is less than one, it returns 0. Otherwise, it rounds the number down to numbers from the set: 1, 10, 30, 60, 120, 180, 240, 300, 600, 1200, 1800, 3600, 7200, 18000, 36000. This function was specifically implemented for a web analytics use case for reporting on session lengths. +Accepts a number. If the number is less than one, it returns 0. Otherwise, it rounds the number down to numbers from the set: 1, 10, 30, 60, 120, 180, 240, 300, 600, 1200, 1800, 3600, 7200, 18000, 36000. ## roundAge(num) {#roundagenum} diff --git a/docs/en/sql-reference/functions/statistics.md b/docs/en/sql-reference/functions/statistics.md deleted file mode 100644 index 3f337b05cbc..00000000000 --- a/docs/en/sql-reference/functions/statistics.md +++ /dev/null @@ -1,48 +0,0 @@ ---- -toc_priority: 69 -toc_title: Statistics ---- - -# Functions for Working with Statistics {#functions-for-working-with-statistics} - -# proportionsZTest {#proportionsztest} - -Applies proportion z-test to samples from two populations (X and Y). The alternative is 'two-sided'. - -**Syntax** - -``` sql -proportionsZTest(successes_x, successes_y, trials_x, trials_y, significance_level, usevar) -``` - -**Arguments** - -- `successes_x` — The number of successes for X in trials. -- `successes_y` — The number of successes for X in trials. -- `trials_x` — The number of trials for X. -- `trials_y` — The number of trials for Y. -- `significance_level` -- `usevar` - It can be `'pooled'` or `'unpooled'`. - - `'pooled'` - The variance of the two populations are assumed to be equal. - - `'unpooled'` - The assumption of equal variances is dropped. - -**Returned value** - -- A tuple with the (z-statistic, p-value, confidence-interval-lower, confidence-interval-upper). - -Type: [Tuple](../../sql-reference/data-types/tuple.md). - -**Example** - -Query: - -``` sql -SELECT proportionsZTest(10, 11, 100, 101, 0.95, 'unpooled'); -``` - -Result: - -``` text -(-0.20656724435948853,0.8363478437079654,-0.09345975390115283,0.07563797172293502) -``` - diff --git a/docs/en/whats-new/security-changelog.md b/docs/en/whats-new/security-changelog.md index aad3da91aa4..0a5c926f227 100644 --- a/docs/en/whats-new/security-changelog.md +++ b/docs/en/whats-new/security-changelog.md @@ -4,49 +4,6 @@ sidebar_position: 100 keywords: [clickhouse, security, changelog] description: Security Changelog --- -## Fixed in ClickHouse 21.10.2.15, 2021-10-18 {#fixed-in-clickhouse-release-21-10-2-215-2021-10-18} - -### CVE-2021-43304 {#cve-2021-43304} - -Heap buffer overflow in Clickhouse's LZ4 compression codec when parsing a malicious query. There is no verification that the copy operations in the LZ4::decompressImpl loop and especially the arbitrary copy operation wildCopy(op, ip, copy_end), don’t exceed the destination buffer’s limits. - -Credits: JFrog Security Research Team - -### CVE-2021-43305 {#cve-2021-43305} - -Heap buffer overflow in Clickhouse's LZ4 compression codec when parsing a malicious query. 
There is no verification that the copy operations in the LZ4::decompressImpl loop and especially the arbitrary copy operation wildCopy(op, ip, copy_end), don’t exceed the destination buffer’s limits. This issue is very similar to CVE-2021-43304, but the vulnerable copy operation is in a different wildCopy call. - -Credits: JFrog Security Research Team - -### CVE-2021-42387 {#cve-2021-42387} - -Heap out-of-bounds read in Clickhouse's LZ4 compression codec when parsing a malicious query. As part of the LZ4::decompressImpl() loop, a 16-bit unsigned user-supplied value ('offset') is read from the compressed data. The offset is later used in the length of a copy operation, without checking the upper bounds of the source of the copy operation. - -Credits: JFrog Security Research Team - -### CVE-2021-42388 {#cve-2021-42388} - -Heap out-of-bounds read in Clickhouse's LZ4 compression codec when parsing a malicious query. As part of the LZ4::decompressImpl() loop, a 16-bit unsigned user-supplied value ('offset') is read from the compressed data. The offset is later used in the length of a copy operation, without checking the lower bounds of the source of the copy operation. - -Credits: JFrog Security Research Team - -### CVE-2021-42389 {#cve-2021-42389} - -Divide-by-zero in Clickhouse's Delta compression codec when parsing a malicious query. The first byte of the compressed buffer is used in a modulo operation without being checked for 0. - -Credits: JFrog Security Research Team - -### CVE-2021-42390 {#cve-2021-42390} - -Divide-by-zero in Clickhouse's DeltaDouble compression codec when parsing a malicious query. The first byte of the compressed buffer is used in a modulo operation without being checked for 0. - -Credits: JFrog Security Research Team - -### CVE-2021-42391 {#cve-2021-42391} - -Divide-by-zero in Clickhouse's Gorilla compression codec when parsing a malicious query. The first byte of the compressed buffer is used in a modulo operation without being checked for 0. 
- -Credits: JFrog Security Research Team # Security Changelog From ce97ccbfb93e7f7b7feb32ff0d73396dd025f2f0 Mon Sep 17 00:00:00 2001 From: avogar Date: Tue, 29 Mar 2022 14:47:51 +0000 Subject: [PATCH 054/239] Improve schema inference for JSONEachRow and TSKV formats --- src/Formats/JSONEachRowUtils.cpp | 6 +-- src/Formats/JSONEachRowUtils.h | 4 +- src/Processors/Formats/ISchemaReader.cpp | 29 ++++++++---- src/Processors/Formats/ISchemaReader.h | 6 +-- .../Impl/JSONEachRowRowInputFormat.cpp | 5 +- .../Formats/Impl/JSONEachRowRowInputFormat.h | 2 +- .../Formats/Impl/TSKVRowInputFormat.cpp | 9 ++-- .../Formats/Impl/TSKVRowInputFormat.h | 2 +- ...247_names_order_in_json_and_tskv.reference | 34 ++++++++++++++ .../02247_names_order_in_json_and_tskv.sh | 47 +++++++++++++++++++ 10 files changed, 122 insertions(+), 22 deletions(-) create mode 100644 tests/queries/0_stateless/02247_names_order_in_json_and_tskv.reference create mode 100644 tests/queries/0_stateless/02247_names_order_in_json_and_tskv.sh diff --git a/src/Formats/JSONEachRowUtils.cpp b/src/Formats/JSONEachRowUtils.cpp index fb1ddb479f2..d90b925d753 100644 --- a/src/Formats/JSONEachRowUtils.cpp +++ b/src/Formats/JSONEachRowUtils.cpp @@ -270,13 +270,13 @@ struct JSONEachRowFieldsExtractor std::vector column_names; }; -std::unordered_map readRowAndGetNamesAndDataTypesForJSONEachRow(ReadBuffer & in, bool json_strings) +NamesAndTypesList readRowAndGetNamesAndDataTypesForJSONEachRow(ReadBuffer & in, bool json_strings) { JSONEachRowFieldsExtractor extractor; auto data_types = determineColumnDataTypesFromJSONEachRowDataImpl(in, json_strings, extractor); - std::unordered_map result; + NamesAndTypesList result; for (size_t i = 0; i != extractor.column_names.size(); ++i) - result[extractor.column_names[i]] = data_types[i]; + result.emplace_back(extractor.column_names[i], data_types[i]); return result; } diff --git a/src/Formats/JSONEachRowUtils.h b/src/Formats/JSONEachRowUtils.h index 8d304e2ffd8..4cd2a161858 100644 --- a/src/Formats/JSONEachRowUtils.h +++ b/src/Formats/JSONEachRowUtils.h @@ -20,9 +20,9 @@ std::pair fileSegmentationEngineJSONCompactEachRow(ReadBuffer & in DataTypePtr getDataTypeFromJSONField(const String & field); /// Read row in JSONEachRow format and try to determine type for each field. -/// Return map {column_name : type}. +/// Return list of names and types. /// If cannot determine the type of some field, return nullptr for it. -std::unordered_map readRowAndGetNamesAndDataTypesForJSONEachRow(ReadBuffer & in, bool json_strings); +NamesAndTypesList readRowAndGetNamesAndDataTypesForJSONEachRow(ReadBuffer & in, bool json_strings); /// Read row in JSONCompactEachRow format and try to determine type for each field. /// If cannot determine the type of some field, return nullptr for it. 
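
The two hunks above change the per-row schema-inference result from `std::unordered_map` to an ordered `NamesAndTypesList`, and the `IRowWithNamesSchemaReader::readSchema` change below pairs a lookup map (`names_to_types`) with a `names_order` vector, so inferred columns come out in the order they are first seen in the data rather than in an arbitrary hash order. A minimal standalone sketch of that ordering pattern — illustrative only, with `std::string` standing in for ClickHouse's `DataTypePtr` — might look like:

```cpp
#include <iostream>
#include <string>
#include <unordered_map>
#include <utility>
#include <vector>

int main()
{
    /// Column names and inferred types as they are encountered while reading rows,
    /// possibly repeated when the same column appears in several rows.
    std::vector<std::pair<std::string, std::string>> seen = {
        {"b", "Nullable(String)"}, {"a", "Nullable(Float64)"},
        {"c", "Nullable(Float64)"}, {"a", "Nullable(Float64)"}};

    std::unordered_map<std::string, std::string> names_to_types; /// O(1) lookup by name
    std::vector<std::string> names_order;                        /// first-seen order

    for (const auto & [name, type] : seen)
        if (names_to_types.emplace(name, type).second) /// true only on first insertion
            names_order.push_back(name);

    /// Prints columns in first-seen order: b, a, c.
    for (const auto & name : names_order)
        std::cout << name << ' ' << names_to_types[name] << '\n';
}
```

Keeping both containers gives constant-time duplicate detection while still producing a deterministic, data-driven column order, which is the same trade-off the `readSchema` implementation below makes.
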
diff --git a/src/Processors/Formats/ISchemaReader.cpp b/src/Processors/Formats/ISchemaReader.cpp index 096e39a2893..2da27885b59 100644 --- a/src/Processors/Formats/ISchemaReader.cpp +++ b/src/Processors/Formats/ISchemaReader.cpp @@ -96,21 +96,33 @@ IRowWithNamesSchemaReader::IRowWithNamesSchemaReader(ReadBuffer & in_, size_t ma NamesAndTypesList IRowWithNamesSchemaReader::readSchema() { - auto names_and_types = readRowAndGetNamesAndDataTypes(); + bool eof = false; + auto names_and_types = readRowAndGetNamesAndDataTypes(eof); + std::unordered_map names_to_types; + std::vector names_order; + names_to_types.reserve(names_and_types.size()); + names_order.reserve(names_and_types.size()); + for (const auto & [name, type] : names_and_types) + { + names_to_types[name] = type; + names_order.push_back(name); + } + for (size_t row = 1; row < max_rows_to_read; ++row) { - auto new_names_and_types = readRowAndGetNamesAndDataTypes(); - if (new_names_and_types.empty()) + auto new_names_and_types = readRowAndGetNamesAndDataTypes(eof); + if (eof) /// We reached eof. break; for (const auto & [name, new_type] : new_names_and_types) { - auto it = names_and_types.find(name); + auto it = names_to_types.find(name); /// If we didn't see this column before, just add it. - if (it == names_and_types.end()) + if (it == names_to_types.end()) { - names_and_types[name] = new_type; + names_to_types[name] = new_type; + names_order.push_back(name); continue; } @@ -133,12 +145,13 @@ NamesAndTypesList IRowWithNamesSchemaReader::readSchema() } /// Check that we read at list one column. - if (names_and_types.empty()) + if (names_to_types.empty()) throw Exception(ErrorCodes::CANNOT_EXTRACT_TABLE_STRUCTURE, "Cannot read rows from the data"); NamesAndTypesList result; - for (auto & [name, type] : names_and_types) + for (auto & name : names_order) { + auto & type = names_to_types[name]; /// Check that we could determine the type of this column. if (!type) { diff --git a/src/Processors/Formats/ISchemaReader.h b/src/Processors/Formats/ISchemaReader.h index 36cf0656119..811034cb0b0 100644 --- a/src/Processors/Formats/ISchemaReader.h +++ b/src/Processors/Formats/ISchemaReader.h @@ -68,10 +68,10 @@ public: protected: /// Read one row and determine types of columns in it. - /// Return map {column_name : type}. + /// Return list with names and types. /// If it's impossible to determine the type for some column, return nullptr for it. - /// Return empty map is can't read more data. - virtual std::unordered_map readRowAndGetNamesAndDataTypes() = 0; + /// Set eof = true if can't read more data. 
+ virtual NamesAndTypesList readRowAndGetNamesAndDataTypes(bool & eof) = 0; private: size_t max_rows_to_read; diff --git a/src/Processors/Formats/Impl/JSONEachRowRowInputFormat.cpp b/src/Processors/Formats/Impl/JSONEachRowRowInputFormat.cpp index 549fd7a6113..7f349db3c6c 100644 --- a/src/Processors/Formats/Impl/JSONEachRowRowInputFormat.cpp +++ b/src/Processors/Formats/Impl/JSONEachRowRowInputFormat.cpp @@ -312,7 +312,7 @@ JSONEachRowSchemaReader::JSONEachRowSchemaReader(ReadBuffer & in_, bool json_str } -std::unordered_map JSONEachRowSchemaReader::readRowAndGetNamesAndDataTypes() +NamesAndTypesList JSONEachRowSchemaReader::readRowAndGetNamesAndDataTypes(bool & eof) { if (first_row) { @@ -339,7 +339,10 @@ std::unordered_map JSONEachRowSchemaReader::readRowAndGetNa skipWhitespaceIfAny(in); if (in.eof()) + { + eof = true; return {}; + } return readRowAndGetNamesAndDataTypesForJSONEachRow(in, json_strings); } diff --git a/src/Processors/Formats/Impl/JSONEachRowRowInputFormat.h b/src/Processors/Formats/Impl/JSONEachRowRowInputFormat.h index 29aba696411..1da14a532de 100644 --- a/src/Processors/Formats/Impl/JSONEachRowRowInputFormat.h +++ b/src/Processors/Formats/Impl/JSONEachRowRowInputFormat.h @@ -91,7 +91,7 @@ public: JSONEachRowSchemaReader(ReadBuffer & in_, bool json_strings, const FormatSettings & format_settings); private: - std::unordered_map readRowAndGetNamesAndDataTypes() override; + NamesAndTypesList readRowAndGetNamesAndDataTypes(bool & eof) override; bool json_strings; bool first_row = true; diff --git a/src/Processors/Formats/Impl/TSKVRowInputFormat.cpp b/src/Processors/Formats/Impl/TSKVRowInputFormat.cpp index 87ba1b18fa7..ceea174c0e8 100644 --- a/src/Processors/Formats/Impl/TSKVRowInputFormat.cpp +++ b/src/Processors/Formats/Impl/TSKVRowInputFormat.cpp @@ -222,7 +222,7 @@ TSKVSchemaReader::TSKVSchemaReader(ReadBuffer & in_, const FormatSettings & form { } -std::unordered_map TSKVSchemaReader::readRowAndGetNamesAndDataTypes() +NamesAndTypesList TSKVSchemaReader::readRowAndGetNamesAndDataTypes(bool & eof) { if (first_row) { @@ -231,7 +231,10 @@ std::unordered_map TSKVSchemaReader::readRowAndGetNamesAndD } if (in.eof()) + { + eof = true; return {}; + } if (*in.position() == '\n') { @@ -239,7 +242,7 @@ std::unordered_map TSKVSchemaReader::readRowAndGetNamesAndD return {}; } - std::unordered_map names_and_types; + NamesAndTypesList names_and_types; StringRef name_ref; String name_buf; String value; @@ -250,7 +253,7 @@ std::unordered_map TSKVSchemaReader::readRowAndGetNamesAndD if (has_value) { readEscapedString(value, in); - names_and_types[std::move(name)] = determineDataTypeByEscapingRule(value, format_settings, FormatSettings::EscapingRule::Escaped); + names_and_types.emplace_back(std::move(name), determineDataTypeByEscapingRule(value, format_settings, FormatSettings::EscapingRule::Escaped)); } else { diff --git a/src/Processors/Formats/Impl/TSKVRowInputFormat.h b/src/Processors/Formats/Impl/TSKVRowInputFormat.h index 3f708355b85..bf8580bc6b7 100644 --- a/src/Processors/Formats/Impl/TSKVRowInputFormat.h +++ b/src/Processors/Formats/Impl/TSKVRowInputFormat.h @@ -59,7 +59,7 @@ public: TSKVSchemaReader(ReadBuffer & in_, const FormatSettings & format_settings_); private: - std::unordered_map readRowAndGetNamesAndDataTypes() override; + NamesAndTypesList readRowAndGetNamesAndDataTypes(bool & eof) override; const FormatSettings format_settings; bool first_row = true; diff --git a/tests/queries/0_stateless/02247_names_order_in_json_and_tskv.reference 
b/tests/queries/0_stateless/02247_names_order_in_json_and_tskv.reference new file mode 100644 index 00000000000..49a285dc11a --- /dev/null +++ b/tests/queries/0_stateless/02247_names_order_in_json_and_tskv.reference @@ -0,0 +1,34 @@ +a Nullable(String) +b Nullable(String) +c Nullable(String) +1 s1 \N +2 } [2] +\N \N \N +\N \N \N +\N \N [3] +b Nullable(String) +a Nullable(String) +c Nullable(String) +e Nullable(String) +1 \N \N \N +\N 2 3 \N +\N \N \N \N +\N \N \N 3 +3 3 1 \N +a Nullable(Float64) +b Nullable(String) +c Array(Nullable(Float64)) +1 s1 [] +2 \N [2] +\N \N [] +\N \N [] +\N \N [3] +b Nullable(Float64) +a Nullable(Float64) +c Nullable(Float64) +e Nullable(Float64) +1 \N \N \N +\N 2 3 \N +\N \N \N \N +\N \N \N 3 +3 3 1 \N diff --git a/tests/queries/0_stateless/02247_names_order_in_json_and_tskv.sh b/tests/queries/0_stateless/02247_names_order_in_json_and_tskv.sh new file mode 100644 index 00000000000..3a775a4b75a --- /dev/null +++ b/tests/queries/0_stateless/02247_names_order_in_json_and_tskv.sh @@ -0,0 +1,47 @@ +#!/usr/bin/env bash +# Tags: no-parallel, no-fasttest + +CURDIR=$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd) +# shellcheck source=../shell_config.sh +. "$CURDIR"/../shell_config.sh + + +USER_FILES_PATH=$(clickhouse-client --query "select _path,_file from file('nonexist.txt', 'CSV', 'val1 char')" 2>&1 | grep Exception | awk '{gsub("/nonexist.txt","",$9); print $9}') +FILE_NAME=test_02247.data +DATA_FILE=${USER_FILES_PATH:?}/$FILE_NAME + +touch $DATA_FILE + +echo -e 'a=1\tb=s1\tc=\N +c=[2]\ta=2\tb=\N} +a=\N +c=[3]\ta=\N' > $DATA_FILE +$CLICKHOUSE_CLIENT -q "desc file('$FILE_NAME', 'TSKV')" +$CLICKHOUSE_CLIENT -q "select * from file('$FILE_NAME', 'TSKV')" + +echo -e 'b=1 +a=2\tc=3 +e=3 +c=1\tb=3\ta=3' > $DATA_FILE +$CLICKHOUSE_CLIENT -q "desc file('$FILE_NAME', 'TSKV')" +$CLICKHOUSE_CLIENT -q "select * from file('$FILE_NAME', 'TSKV')" + + +echo -e '{"a" : 1, "b" : "s1", "c" : null} +{"c" : [2], "a" : 2, "b" : null} +{} +{"a" : null} +{"c" : [3], "a" : null}' > $DATA_FILE +$CLICKHOUSE_CLIENT -q "desc file('$FILE_NAME', 'JSONEachRow')" +$CLICKHOUSE_CLIENT -q "select * from file('$FILE_NAME', 'JSONEachRow')" + +echo -e '{"b" : 1} +{"a" : 2, "c" : 3} +{} +{"e" : 3} +{"c" : 1, "b" : 3, "a" : 3}' > $DATA_FILE +$CLICKHOUSE_CLIENT -q "desc file('$FILE_NAME', 'JSONEachRow')" +$CLICKHOUSE_CLIENT -q "select * from file('$FILE_NAME', 'JSONEachRow')" + + +rm $DATA_FILE From 5a5aa3ab317b957c85c49ec9d224a24b3768e0c4 Mon Sep 17 00:00:00 2001 From: Kruglov Pavel <48961922+Avogar@users.noreply.github.com> Date: Tue, 29 Mar 2022 17:35:29 +0200 Subject: [PATCH 055/239] Update test --- tests/queries/0_stateless/02247_names_order_in_json_and_tskv.sh | 2 ++ 1 file changed, 2 insertions(+) diff --git a/tests/queries/0_stateless/02247_names_order_in_json_and_tskv.sh b/tests/queries/0_stateless/02247_names_order_in_json_and_tskv.sh index 3a775a4b75a..0be26371585 100644 --- a/tests/queries/0_stateless/02247_names_order_in_json_and_tskv.sh +++ b/tests/queries/0_stateless/02247_names_order_in_json_and_tskv.sh @@ -15,12 +15,14 @@ touch $DATA_FILE echo -e 'a=1\tb=s1\tc=\N c=[2]\ta=2\tb=\N} a=\N + c=[3]\ta=\N' > $DATA_FILE $CLICKHOUSE_CLIENT -q "desc file('$FILE_NAME', 'TSKV')" $CLICKHOUSE_CLIENT -q "select * from file('$FILE_NAME', 'TSKV')" echo -e 'b=1 a=2\tc=3 + e=3 c=1\tb=3\ta=3' > $DATA_FILE $CLICKHOUSE_CLIENT -q "desc file('$FILE_NAME', 'TSKV')" From f1f4e85c4b609c49050e3b7df3d1957751b40e46 Mon Sep 17 00:00:00 2001 From: Maksim Kita Date: Tue, 29 Mar 2022 19:02:59 +0200 Subject: [PATCH 056/239] 
Added test for insert of invalid IPv6 value --- .../0_stateless/02244_ipv6_invalid_insert.reference | 4 ++++ .../queries/0_stateless/02244_ipv6_invalid_insert.sql | 11 +++++++++++ 2 files changed, 15 insertions(+) create mode 100644 tests/queries/0_stateless/02244_ipv6_invalid_insert.reference create mode 100644 tests/queries/0_stateless/02244_ipv6_invalid_insert.sql diff --git a/tests/queries/0_stateless/02244_ipv6_invalid_insert.reference b/tests/queries/0_stateless/02244_ipv6_invalid_insert.reference new file mode 100644 index 00000000000..783d8f124dd --- /dev/null +++ b/tests/queries/0_stateless/02244_ipv6_invalid_insert.reference @@ -0,0 +1,4 @@ +fe80::9801:43ff:fe1f:7690 +1.1.1.1 + +::ffff:1.1.1.1 diff --git a/tests/queries/0_stateless/02244_ipv6_invalid_insert.sql b/tests/queries/0_stateless/02244_ipv6_invalid_insert.sql new file mode 100644 index 00000000000..98fb45a5758 --- /dev/null +++ b/tests/queries/0_stateless/02244_ipv6_invalid_insert.sql @@ -0,0 +1,11 @@ +DROP TABLE IF EXISTS test_table; +CREATE TABLE test_table(ip String, ipv6 IPv6 MATERIALIZED toIPv6(ip)) ENGINE = TinyLog; + +INSERT INTO test_table(ip) VALUES ('fe80::9801:43ff:fe1f:7690'), ('1.1.1.1'), (''), ('::ffff:1.1.1.1' ); --{serverError 441} + +SET cast_ipv4_ipv6_default_on_conversion_error = 1; + +INSERT INTO test_table(ip) VALUES ( 'fe80::9801:43ff:fe1f:7690'), ('1.1.1.1'), (''), ('::ffff:1.1.1.1' ); +SELECT * FROM test_table; + +DROP TABLE test_table; From d9044cc2ac6bdaaae7d3c041861056c0b6508a00 Mon Sep 17 00:00:00 2001 From: Azat Khuzhin Date: Tue, 29 Mar 2022 18:51:12 +0300 Subject: [PATCH 057/239] Do not require writable source directory for generating krb5 error tables Signed-off-by: Azat Khuzhin --- contrib/krb5-cmake/CMakeLists.txt | 122 +++++++++++++++--------------- 1 file changed, 63 insertions(+), 59 deletions(-) diff --git a/contrib/krb5-cmake/CMakeLists.txt b/contrib/krb5-cmake/CMakeLists.txt index 685e8737ef0..0e29f98ca20 100644 --- a/contrib/krb5-cmake/CMakeLists.txt +++ b/contrib/krb5-cmake/CMakeLists.txt @@ -16,6 +16,7 @@ if(NOT AWK_PROGRAM) endif() set(KRB5_SOURCE_DIR "${ClickHouse_SOURCE_DIR}/contrib/krb5/src") +set(KRB5_ET_BIN_DIR "${CMAKE_CURRENT_BINARY_DIR}/include_private") set(ALL_SRCS "${KRB5_SOURCE_DIR}/util/et/et_name.c" @@ -90,7 +91,6 @@ set(ALL_SRCS "${KRB5_SOURCE_DIR}/lib/gssapi/krb5/get_tkt_flags.c" "${KRB5_SOURCE_DIR}/lib/gssapi/krb5/set_allowable_enctypes.c" "${KRB5_SOURCE_DIR}/lib/gssapi/krb5/k5sealiov.c" - "${KRB5_SOURCE_DIR}/lib/gssapi/krb5/gssapi_err_krb5.c" "${KRB5_SOURCE_DIR}/lib/gssapi/krb5/canon_name.c" "${KRB5_SOURCE_DIR}/lib/gssapi/krb5/inq_cred.c" "${KRB5_SOURCE_DIR}/lib/gssapi/krb5/export_sec_context.c" @@ -143,11 +143,12 @@ set(ALL_SRCS "${KRB5_SOURCE_DIR}/lib/gssapi/generic/util_buffer_set.c" "${KRB5_SOURCE_DIR}/lib/gssapi/generic/util_set.c" "${KRB5_SOURCE_DIR}/lib/gssapi/generic/util_token.c" - "${KRB5_SOURCE_DIR}/lib/gssapi/generic/gssapi_err_generic.c" "${KRB5_SOURCE_DIR}/lib/gssapi/generic/disp_major_status.c" "${KRB5_SOURCE_DIR}/lib/gssapi/generic/util_seqstate.c" "${KRB5_SOURCE_DIR}/lib/gssapi/generic/util_errmap.c" "${KRB5_SOURCE_DIR}/lib/gssapi/generic/rel_buffer.c" + "${KRB5_ET_BIN_DIR}/lib/gssapi/krb5/gssapi_err_krb5.c" + "${KRB5_ET_BIN_DIR}/lib/gssapi/generic/gssapi_err_generic.c" "${KRB5_SOURCE_DIR}/lib/gssapi/spnego/spnego_mech.c" "${KRB5_SOURCE_DIR}/lib/gssapi/spnego/negoex_util.c" @@ -256,8 +257,8 @@ set(ALL_SRCS "${KRB5_SOURCE_DIR}/util/profile/prof_parse.c" "${KRB5_SOURCE_DIR}/util/profile/prof_get.c" "${KRB5_SOURCE_DIR}/util/profile/prof_set.c" - 
"${KRB5_SOURCE_DIR}/util/profile/prof_err.c" "${KRB5_SOURCE_DIR}/util/profile/prof_init.c" + "${KRB5_ET_BIN_DIR}/util/profile/prof_err.c" "${KRB5_SOURCE_DIR}/lib/krb5/krb/fwd_tgt.c" "${KRB5_SOURCE_DIR}/lib/krb5/krb/conv_creds.c" "${KRB5_SOURCE_DIR}/lib/krb5/krb/fast.c" @@ -450,13 +451,12 @@ set(ALL_SRCS - "${KRB5_SOURCE_DIR}/lib/krb5/error_tables/k5e1_err.c" - "${KRB5_SOURCE_DIR}/lib/krb5/error_tables/kdb5_err.c" - "${KRB5_SOURCE_DIR}/lib/krb5/error_tables/asn1_err.c" - "${KRB5_SOURCE_DIR}/lib/krb5/error_tables/krb5_err.c" - "${KRB5_SOURCE_DIR}/lib/krb5/error_tables/krb524_err.c" - "${KRB5_SOURCE_DIR}/lib/krb5/error_tables/kv5m_err.c" - + "${KRB5_ET_BIN_DIR}/lib/krb5/error_tables/k5e1_err.c" + "${KRB5_ET_BIN_DIR}/lib/krb5/error_tables/kdb5_err.c" + "${KRB5_ET_BIN_DIR}/lib/krb5/error_tables/asn1_err.c" + "${KRB5_ET_BIN_DIR}/lib/krb5/error_tables/krb5_err.c" + "${KRB5_ET_BIN_DIR}/lib/krb5/error_tables/krb524_err.c" + "${KRB5_ET_BIN_DIR}/lib/krb5/error_tables/kv5m_err.c" "${KRB5_SOURCE_DIR}/lib/krb5/rcache/rc_base.c" @@ -473,7 +473,7 @@ set(ALL_SRCS ) add_custom_command( - OUTPUT "${KRB5_SOURCE_DIR}/util/et/compile_et" + OUTPUT "${CMAKE_CURRENT_BINARY_DIR}/compile_et" COMMAND /bin/sh ./config_script ./compile_et.sh @@ -481,50 +481,17 @@ add_custom_command( ${AWK_PROGRAM} sed > - compile_et + ${CMAKE_CURRENT_BINARY_DIR}/compile_et DEPENDS "${KRB5_SOURCE_DIR}/util/et/compile_et.sh" "${KRB5_SOURCE_DIR}/util/et/config_script" WORKING_DIRECTORY "${KRB5_SOURCE_DIR}/util/et" ) -file(GLOB_RECURSE ET_FILES - "${KRB5_SOURCE_DIR}/*.et" -) - -function(preprocess_et out_var) - set(result) - foreach(in_f ${ARGN}) - string(REPLACE - .et - .c - F_C - ${in_f} - ) - string(REPLACE - .et - .h - F_H - ${in_f} - ) - - get_filename_component(ET_PATH ${in_f} DIRECTORY) - - add_custom_command(OUTPUT ${F_C} ${F_H} - COMMAND perl "${KRB5_SOURCE_DIR}/util/et/compile_et" -d "${KRB5_SOURCE_DIR}/util/et" ${in_f} - DEPENDS ${in_f} "${KRB5_SOURCE_DIR}/util/et/compile_et" - WORKING_DIRECTORY ${ET_PATH} - VERBATIM - ) - list(APPEND result ${F_C}) - endforeach() - set(${out_var} "${result}" PARENT_SCOPE) -endfunction() - add_custom_command( - OUTPUT "${KRB5_SOURCE_DIR}/lib/gssapi/krb5/error_map.h" + OUTPUT "${KRB5_ET_BIN_DIR}/error_map.h" COMMAND perl -I../../../util ../../../util/gen-map.pl - -oerror_map.h + -o${KRB5_ET_BIN_DIR}/error_map.h NAME=gsserrmap KEY=OM_uint32 VALUE=char* @@ -536,22 +503,21 @@ add_custom_command( add_custom_target( ERROR_MAP_H - DEPENDS "${KRB5_SOURCE_DIR}/lib/gssapi/krb5/error_map.h" + DEPENDS "${KRB5_ET_BIN_DIR}/error_map.h" VERBATIM ) add_custom_command( - OUTPUT "${KRB5_SOURCE_DIR}/lib/gssapi/generic/errmap.h" - COMMAND perl -w -I../../../util ../../../util/gen.pl bimap errmap.h NAME=mecherrmap LEFT=OM_uint32 RIGHT=struct\ mecherror LEFTPRINT=print_OM_uint32 RIGHTPRINT=mecherror_print LEFTCMP=cmp_OM_uint32 RIGHTCMP=mecherror_cmp + OUTPUT "${KRB5_ET_BIN_DIR}/errmap.h" + COMMAND perl -w -I../../../util ../../../util/gen.pl bimap ${KRB5_ET_BIN_DIR}/errmap.h NAME=mecherrmap LEFT=OM_uint32 RIGHT=struct\ mecherror LEFTPRINT=print_OM_uint32 RIGHTPRINT=mecherror_print LEFTCMP=cmp_OM_uint32 RIGHTCMP=mecherror_cmp WORKING_DIRECTORY "${KRB5_SOURCE_DIR}/lib/gssapi/generic" ) add_custom_target( ERRMAP_H - DEPENDS "${KRB5_SOURCE_DIR}/lib/gssapi/generic/errmap.h" + DEPENDS "${KRB5_ET_BIN_DIR}/errmap.h" VERBATIM ) - add_custom_target( KRB_5_H DEPENDS "${CMAKE_CURRENT_BINARY_DIR}/include/krb5/krb5.h" @@ -567,7 +533,40 @@ add_dependencies( KRB_5_H ) -preprocess_et(processed_et_files ${ET_FILES}) +# +# Generate 
error tables +# +function(preprocess_et et_path) + string(REPLACE .et .c F_C ${et_path}) + string(REPLACE .et .h F_H ${et_path}) + get_filename_component(et_dir ${et_path} DIRECTORY) + get_filename_component(et_name ${et_path} NAME_WLE) + + add_custom_command(OUTPUT ${F_C} ${F_H} ${KRB5_ET_BIN_DIR}/${et_name}.h + COMMAND perl "${CMAKE_CURRENT_BINARY_DIR}/compile_et" -d "${KRB5_SOURCE_DIR}/util/et" ${et_path} + # for #include w/o path (via -iquote) + COMMAND ${CMAKE_COMMAND} -E create_symlink ${F_H} ${KRB5_ET_BIN_DIR}/${et_name}.h + DEPENDS ${et_path} "${CMAKE_CURRENT_BINARY_DIR}/compile_et" + WORKING_DIRECTORY ${et_dir} + VERBATIM + ) +endfunction() + +function(generate_error_tables) + file(GLOB_RECURSE ET_FILES "${KRB5_SOURCE_DIR}/*.et") + foreach(et_path ${ET_FILES}) + string(REPLACE ${KRB5_SOURCE_DIR} ${KRB5_ET_BIN_DIR} et_bin_path ${et_path}) + string(REPLACE / _ et_target_name ${et_path}) + get_filename_component(et_bin_dir ${et_bin_path} DIRECTORY) + add_custom_command(OUTPUT ${et_bin_path} + COMMAND ${CMAKE_COMMAND} -E make_directory ${et_bin_dir} + COMMAND ${CMAKE_COMMAND} -E copy_if_different ${et_path} ${et_bin_path} + VERBATIM + ) + preprocess_et(${et_bin_path}) + endforeach() +endfunction() +generate_error_tables() if(CMAKE_SYSTEM_NAME MATCHES "Darwin") add_custom_command( @@ -634,12 +633,12 @@ file(MAKE_DIRECTORY SET(KRBHDEP "${KRB5_SOURCE_DIR}/include/krb5/krb5.hin" - "${KRB5_SOURCE_DIR}/lib/krb5/error_tables/krb5_err.h" - "${KRB5_SOURCE_DIR}/lib/krb5/error_tables/k5e1_err.h" - "${KRB5_SOURCE_DIR}/lib/krb5/error_tables/kdb5_err.h" - "${KRB5_SOURCE_DIR}/lib/krb5/error_tables/kv5m_err.h" - "${KRB5_SOURCE_DIR}/lib/krb5/error_tables/krb524_err.h" - "${KRB5_SOURCE_DIR}/lib/krb5/error_tables/asn1_err.h" + "${KRB5_ET_BIN_DIR}/lib/krb5/error_tables/krb5_err.h" + "${KRB5_ET_BIN_DIR}/lib/krb5/error_tables/k5e1_err.h" + "${KRB5_ET_BIN_DIR}/lib/krb5/error_tables/kdb5_err.h" + "${KRB5_ET_BIN_DIR}/lib/krb5/error_tables/kv5m_err.h" + "${KRB5_ET_BIN_DIR}/lib/krb5/error_tables/krb524_err.h" + "${KRB5_ET_BIN_DIR}/lib/krb5/error_tables/asn1_err.h" ) # cmake < 3.18 does not have 'cat' command @@ -656,6 +655,11 @@ target_include_directories(_krb5 SYSTEM BEFORE PUBLIC "${CMAKE_CURRENT_BINARY_DIR}/include" ) +target_compile_options(_krb5 PRIVATE + # For '#include "file.h"' + -iquote "${CMAKE_CURRENT_BINARY_DIR}/include_private" +) + target_include_directories(_krb5 PRIVATE "${CMAKE_CURRENT_BINARY_DIR}/include_private" # For autoconf.h and other generated headers. 
${KRB5_SOURCE_DIR} From 3fc36627b3f2ff4b736bfecd9e20d0dc56792252 Mon Sep 17 00:00:00 2001 From: avogar Date: Tue, 29 Mar 2022 17:37:31 +0000 Subject: [PATCH 058/239] Allow to infer and parse bools as numbers in JSON input formats --- src/Core/Settings.h | 1 + .../Serializations/SerializationNumber.cpp | 4 +- src/Formats/FormatFactory.cpp | 1 + src/Formats/FormatSettings.h | 1 + src/Formats/JSONEachRowUtils.cpp | 3 +- src/Processors/Formats/ISchemaReader.cpp | 34 +++++++++++--- src/Processors/Formats/ISchemaReader.h | 6 ++- .../Impl/JSONCompactEachRowRowInputFormat.cpp | 10 +++- .../Impl/JSONEachRowRowInputFormat.cpp | 4 +- .../RowInputFormatWithNamesAndTypes.cpp | 5 +- .../Formats/RowInputFormatWithNamesAndTypes.h | 3 +- ...02247_read_bools_as_numbers_json.reference | 18 +++++++ .../02247_read_bools_as_numbers_json.sh | 47 +++++++++++++++++++ 13 files changed, 120 insertions(+), 17 deletions(-) create mode 100644 tests/queries/0_stateless/02247_read_bools_as_numbers_json.reference create mode 100755 tests/queries/0_stateless/02247_read_bools_as_numbers_json.sh diff --git a/src/Core/Settings.h b/src/Core/Settings.h index f81b61ea648..1b6fcd5ccb1 100644 --- a/src/Core/Settings.h +++ b/src/Core/Settings.h @@ -633,6 +633,7 @@ class IColumn; M(UInt64, input_format_msgpack_number_of_columns, 0, "The number of columns in inserted MsgPack data. Used for automatic schema inference from data.", 0) \ M(MsgPackUUIDRepresentation, output_format_msgpack_uuid_representation, FormatSettings::MsgPackUUIDRepresentation::EXT, "The way how to output UUID in MsgPack format.", 0) \ M(UInt64, input_format_max_rows_to_read_for_schema_inference, 100, "The maximum rows of data to read for automatic schema inference", 0) \ + M(Bool, input_format_json_read_bools_as_numbers, true, "Allow to parse bools as numbers in JSON input formats", 0) \ \ M(DateTimeInputFormat, date_time_input_format, FormatSettings::DateTimeInputFormat::Basic, "Method to read DateTime from text input formats. Possible values: 'basic', 'best_effort' and 'best_effort_us'.", 0) \ M(DateTimeOutputFormat, date_time_output_format, FormatSettings::DateTimeOutputFormat::Simple, "Method to write DateTime to text output. Possible values: 'simple', 'iso', 'unix_timestamp'.", 0) \ diff --git a/src/DataTypes/Serializations/SerializationNumber.cpp b/src/DataTypes/Serializations/SerializationNumber.cpp index 4b6b79151bc..14c53dd5956 100644 --- a/src/DataTypes/Serializations/SerializationNumber.cpp +++ b/src/DataTypes/Serializations/SerializationNumber.cpp @@ -43,7 +43,7 @@ void SerializationNumber::serializeTextJSON(const IColumn & column, size_t ro } template -void SerializationNumber::deserializeTextJSON(IColumn & column, ReadBuffer & istr, const FormatSettings &) const +void SerializationNumber::deserializeTextJSON(IColumn & column, ReadBuffer & istr, const FormatSettings & settings) const { bool has_quote = false; if (!istr.eof() && *istr.position() == '"') /// We understand the number both in quotes and without. 
@@ -67,7 +67,7 @@ void SerializationNumber::deserializeTextJSON(IColumn & column, ReadBuffer & static constexpr bool is_uint8 = std::is_same_v; static constexpr bool is_int8 = std::is_same_v; - if (is_uint8 || is_int8) + if (settings.json.read_bools_as_numbers || is_uint8 || is_int8) { // extra conditions to parse true/false strings into 1/0 if (istr.eof()) diff --git a/src/Formats/FormatFactory.cpp b/src/Formats/FormatFactory.cpp index 3aa82cb79b4..d4e47f11d0d 100644 --- a/src/Formats/FormatFactory.cpp +++ b/src/Formats/FormatFactory.cpp @@ -88,6 +88,7 @@ FormatSettings getFormatSettings(ContextPtr context, const Settings & settings) format_settings.json.named_tuples_as_objects = settings.output_format_json_named_tuples_as_objects; format_settings.json.quote_64bit_integers = settings.output_format_json_quote_64bit_integers; format_settings.json.quote_denormals = settings.output_format_json_quote_denormals; + format_settings.json.read_bools_as_numbers = settings.input_format_json_read_bools_as_numbers; format_settings.null_as_default = settings.input_format_null_as_default; format_settings.decimal_trailing_zeros = settings.output_format_decimal_trailing_zeros; format_settings.parquet.row_group_size = settings.output_format_parquet_row_group_size; diff --git a/src/Formats/FormatSettings.h b/src/Formats/FormatSettings.h index bd0a84d9ded..435579ad2f9 100644 --- a/src/Formats/FormatSettings.h +++ b/src/Formats/FormatSettings.h @@ -130,6 +130,7 @@ struct FormatSettings bool escape_forward_slashes = true; bool named_tuples_as_objects = false; bool serialize_as_strings = false; + bool read_bools_as_numbers = true; } json; struct diff --git a/src/Formats/JSONEachRowUtils.cpp b/src/Formats/JSONEachRowUtils.cpp index fb1ddb479f2..a56a27ec491 100644 --- a/src/Formats/JSONEachRowUtils.cpp +++ b/src/Formats/JSONEachRowUtils.cpp @@ -10,6 +10,7 @@ #include #include #include +#include #include #include #include @@ -118,7 +119,7 @@ DataTypePtr getDataTypeFromJSONFieldImpl(const Element & field) return nullptr; if (field.isBool()) - return makeNullable(std::make_shared()); + return DataTypeFactory::instance().get("Nullable(Bool)"); if (field.isInt64() || field.isUInt64() || field.isDouble()) return makeNullable(std::make_shared()); diff --git a/src/Processors/Formats/ISchemaReader.cpp b/src/Processors/Formats/ISchemaReader.cpp index 096e39a2893..0fd44755445 100644 --- a/src/Processors/Formats/ISchemaReader.cpp +++ b/src/Processors/Formats/ISchemaReader.cpp @@ -1,6 +1,7 @@ #include #include #include +#include namespace DB { @@ -10,8 +11,8 @@ namespace ErrorCodes extern const int CANNOT_EXTRACT_TABLE_STRUCTURE; } -IRowSchemaReader::IRowSchemaReader(ReadBuffer & in_, size_t max_rows_to_read_, DataTypePtr default_type_) - : ISchemaReader(in_), max_rows_to_read(max_rows_to_read_), default_type(default_type_) +IRowSchemaReader::IRowSchemaReader(ReadBuffer & in_, size_t max_rows_to_read_, DataTypePtr default_type_, bool allow_bools_as_numbers_) + : ISchemaReader(in_), max_rows_to_read(max_rows_to_read_), default_type(default_type_), allow_bools_as_numbers(allow_bools_as_numbers_) { } @@ -39,9 +40,18 @@ NamesAndTypesList IRowSchemaReader::readSchema() data_types[i] = new_data_types[i]; /// If the new type and the previous type for this column are different, /// we will use default type if we have it or throw an exception. 
- else if (data_types[i]->getName() != new_data_types[i]->getName()) + else if (!data_types[i]->equals(*new_data_types[i])) { - if (default_type) + /// Check if we have Bool and Number and if allow_bools_as_numbers + /// is true make the result type Number + auto not_nullable_type = removeNullable(data_types[i]); + auto not_nullable_new_type = removeNullable(new_data_types[i]); + if (allow_bools_as_numbers && (isBool(not_nullable_type) || isBool(not_nullable_new_type)) + && (isNumber(not_nullable_type) || isNumber(not_nullable_new_type))) { + if (isBool(not_nullable_type)) + data_types[i] = new_data_types[i]; + } + else if (default_type) data_types[i] = default_type; else throw Exception( @@ -89,8 +99,8 @@ NamesAndTypesList IRowSchemaReader::readSchema() return result; } -IRowWithNamesSchemaReader::IRowWithNamesSchemaReader(ReadBuffer & in_, size_t max_rows_to_read_, DataTypePtr default_type_) - : ISchemaReader(in_), max_rows_to_read(max_rows_to_read_), default_type(default_type_) +IRowWithNamesSchemaReader::IRowWithNamesSchemaReader(ReadBuffer & in_, size_t max_rows_to_read_, DataTypePtr default_type_, bool allow_bools_as_numbers_) + : ISchemaReader(in_), max_rows_to_read(max_rows_to_read_), default_type(default_type_), allow_bools_as_numbers(allow_bools_as_numbers_) { } @@ -122,7 +132,17 @@ NamesAndTypesList IRowWithNamesSchemaReader::readSchema() /// we will use default type if we have it or throw an exception. else if (new_type && type->getName() != new_type->getName()) { - if (default_type) + /// Check if we have Bool and Number and if allow_bools_as_numbers + /// is true make the result type Number + auto not_nullable_type = removeNullable(type); + auto not_nullable_new_type = removeNullable(new_type); + if (allow_bools_as_numbers && (isBool(not_nullable_type) || isBool(not_nullable_new_type)) + && (isNumber(not_nullable_type) || isNumber(not_nullable_new_type))) + { + if (isBool(not_nullable_type)) + type = new_type; + } + else if (default_type) type = default_type; else throw Exception( diff --git a/src/Processors/Formats/ISchemaReader.h b/src/Processors/Formats/ISchemaReader.h index 36cf0656119..19673c3a651 100644 --- a/src/Processors/Formats/ISchemaReader.h +++ b/src/Processors/Formats/ISchemaReader.h @@ -36,7 +36,7 @@ protected: class IRowSchemaReader : public ISchemaReader { public: - IRowSchemaReader(ReadBuffer & in_, size_t max_rows_to_read_, DataTypePtr default_type_ = nullptr); + IRowSchemaReader(ReadBuffer & in_, size_t max_rows_to_read_, DataTypePtr default_type_ = nullptr, bool allow_bools_as_numbers_ = false); NamesAndTypesList readSchema() override; protected: @@ -51,6 +51,7 @@ protected: private: size_t max_rows_to_read; DataTypePtr default_type; + bool allow_bools_as_numbers; std::vector column_names; }; @@ -62,7 +63,7 @@ private: class IRowWithNamesSchemaReader : public ISchemaReader { public: - IRowWithNamesSchemaReader(ReadBuffer & in_, size_t max_rows_to_read_, DataTypePtr default_type_ = nullptr); + IRowWithNamesSchemaReader(ReadBuffer & in_, size_t max_rows_to_read_, DataTypePtr default_type_ = nullptr, bool allow_bools_as_numbers_ = false); NamesAndTypesList readSchema() override; bool hasStrictOrderOfColumns() const override { return false; } @@ -76,6 +77,7 @@ protected: private: size_t max_rows_to_read; DataTypePtr default_type; + bool allow_bools_as_numbers; }; /// Base class for schema inference for formats that don't need any data to diff --git a/src/Processors/Formats/Impl/JSONCompactEachRowRowInputFormat.cpp 
b/src/Processors/Formats/Impl/JSONCompactEachRowRowInputFormat.cpp index c087749d8d8..0496e3e41a8 100644 --- a/src/Processors/Formats/Impl/JSONCompactEachRowRowInputFormat.cpp +++ b/src/Processors/Formats/Impl/JSONCompactEachRowRowInputFormat.cpp @@ -182,7 +182,15 @@ bool JSONCompactEachRowFormatReader::parseRowEndWithDiagnosticInfo(WriteBuffer & } JSONCompactEachRowRowSchemaReader::JSONCompactEachRowRowSchemaReader(ReadBuffer & in_, bool with_names_, bool with_types_, bool yield_strings_, const FormatSettings & format_settings_) - : FormatWithNamesAndTypesSchemaReader(in_, format_settings_.max_rows_to_read_for_schema_inference, with_names_, with_types_, &reader), reader(in_, yield_strings_, format_settings_) + : FormatWithNamesAndTypesSchemaReader( + in_, + format_settings_.max_rows_to_read_for_schema_inference, + with_names_, + with_types_, + &reader, + nullptr, + format_settings_.json.read_bools_as_numbers) + , reader(in_, yield_strings_, format_settings_) { } diff --git a/src/Processors/Formats/Impl/JSONEachRowRowInputFormat.cpp b/src/Processors/Formats/Impl/JSONEachRowRowInputFormat.cpp index 549fd7a6113..e132e4ebb9c 100644 --- a/src/Processors/Formats/Impl/JSONEachRowRowInputFormat.cpp +++ b/src/Processors/Formats/Impl/JSONEachRowRowInputFormat.cpp @@ -307,7 +307,9 @@ void JSONEachRowRowInputFormat::readSuffix() } JSONEachRowSchemaReader::JSONEachRowSchemaReader(ReadBuffer & in_, bool json_strings_, const FormatSettings & format_settings) - : IRowWithNamesSchemaReader(in_, format_settings.max_rows_to_read_for_schema_inference), json_strings(json_strings_) + : IRowWithNamesSchemaReader( + in_, format_settings.max_rows_to_read_for_schema_inference, nullptr, format_settings.json.read_bools_as_numbers) + , json_strings(json_strings_) { } diff --git a/src/Processors/Formats/RowInputFormatWithNamesAndTypes.cpp b/src/Processors/Formats/RowInputFormatWithNamesAndTypes.cpp index 7720b01dc74..0157c54a5b3 100644 --- a/src/Processors/Formats/RowInputFormatWithNamesAndTypes.cpp +++ b/src/Processors/Formats/RowInputFormatWithNamesAndTypes.cpp @@ -299,8 +299,9 @@ FormatWithNamesAndTypesSchemaReader::FormatWithNamesAndTypesSchemaReader( bool with_names_, bool with_types_, FormatWithNamesAndTypesReader * format_reader_, - DataTypePtr default_type_) - : IRowSchemaReader(in_, max_rows_to_read_, default_type_), with_names(with_names_), with_types(with_types_), format_reader(format_reader_) + DataTypePtr default_type_, + bool allow_bools_as_numbers_) + : IRowSchemaReader(in_, max_rows_to_read_, default_type_, allow_bools_as_numbers_), with_names(with_names_), with_types(with_types_), format_reader(format_reader_) { } diff --git a/src/Processors/Formats/RowInputFormatWithNamesAndTypes.h b/src/Processors/Formats/RowInputFormatWithNamesAndTypes.h index 25ffc8d6de2..8fbd426112c 100644 --- a/src/Processors/Formats/RowInputFormatWithNamesAndTypes.h +++ b/src/Processors/Formats/RowInputFormatWithNamesAndTypes.h @@ -128,7 +128,8 @@ public: bool with_names_, bool with_types_, FormatWithNamesAndTypesReader * format_reader_, - DataTypePtr default_type_ = nullptr); + DataTypePtr default_type_ = nullptr, + bool allow_bools_as_numbers_ = false); NamesAndTypesList readSchema() override; diff --git a/tests/queries/0_stateless/02247_read_bools_as_numbers_json.reference b/tests/queries/0_stateless/02247_read_bools_as_numbers_json.reference new file mode 100644 index 00000000000..a7609bdd86b --- /dev/null +++ b/tests/queries/0_stateless/02247_read_bools_as_numbers_json.reference @@ -0,0 +1,18 @@ +x Nullable(Bool) +true 
+false +x Nullable(Float64) +42.42 +0 +x Nullable(Float64) +1 +0.42 +c1 Nullable(Bool) +true +false +c1 Nullable(Float64) +42.42 +0 +c1 Nullable(Float64) +1 +0.42 diff --git a/tests/queries/0_stateless/02247_read_bools_as_numbers_json.sh b/tests/queries/0_stateless/02247_read_bools_as_numbers_json.sh new file mode 100755 index 00000000000..10f050ea6d1 --- /dev/null +++ b/tests/queries/0_stateless/02247_read_bools_as_numbers_json.sh @@ -0,0 +1,47 @@ +#!/usr/bin/env bash +# Tags: no-parallel, no-fasttest + +CURDIR=$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd) +# shellcheck source=../shell_config.sh +. "$CURDIR"/../shell_config.sh + + +USER_FILES_PATH=$(clickhouse-client --query "select _path,_file from file('nonexist.txt', 'CSV', 'val1 char')" 2>&1 | grep Exception | awk '{gsub("/nonexist.txt","",$9); print $9}') +FILE_NAME=test_02247.data +DATA_FILE=${USER_FILES_PATH:?}/$FILE_NAME + +touch $DATA_FILE + +echo -e '{"x" : true} +{"x" : false}' > $DATA_FILE +$CLICKHOUSE_CLIENT -q "desc file('$FILE_NAME', 'JSONEachRow')" +$CLICKHOUSE_CLIENT -q "select * from file('$FILE_NAME', 'JSONEachRow')" + +echo -e '{"x" : 42.42} +{"x" : false}' > $DATA_FILE +$CLICKHOUSE_CLIENT -q "desc file('$FILE_NAME', 'JSONEachRow')" +$CLICKHOUSE_CLIENT -q "select * from file('$FILE_NAME', 'JSONEachRow')" + +echo -e '{"x" : true} +{"x" : 0.42}' > $DATA_FILE +$CLICKHOUSE_CLIENT -q "desc file('$FILE_NAME', 'JSONEachRow')" +$CLICKHOUSE_CLIENT -q "select * from file('$FILE_NAME', 'JSONEachRow')" + + +echo -e '[true] +[false]' > $DATA_FILE +$CLICKHOUSE_CLIENT -q "desc file('$FILE_NAME', 'JSONCompactEachRow')" +$CLICKHOUSE_CLIENT -q "select * from file('$FILE_NAME', 'JSONCompactEachRow')" + +echo -e '[42.42] +[false]' > $DATA_FILE +$CLICKHOUSE_CLIENT -q "desc file('$FILE_NAME', 'JSONCompactEachRow')" +$CLICKHOUSE_CLIENT -q "select * from file('$FILE_NAME', 'JSONCompactEachRow')" + +echo -e '[true] +[0.42]' > $DATA_FILE +$CLICKHOUSE_CLIENT -q "desc file('$FILE_NAME', 'JSONCompactEachRow')" +$CLICKHOUSE_CLIENT -q "select * from file('$FILE_NAME', 'JSONCompactEachRow')" + + +rm $DATA_FILE From 000f3043e763bc6f6a79522df430a45cea392c9d Mon Sep 17 00:00:00 2001 From: avogar Date: Tue, 29 Mar 2022 17:40:07 +0000 Subject: [PATCH 059/239] Make better --- src/Processors/Formats/ISchemaReader.cpp | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/Processors/Formats/ISchemaReader.cpp b/src/Processors/Formats/ISchemaReader.cpp index 0fd44755445..796cdccbe8f 100644 --- a/src/Processors/Formats/ISchemaReader.cpp +++ b/src/Processors/Formats/ISchemaReader.cpp @@ -130,7 +130,7 @@ NamesAndTypesList IRowWithNamesSchemaReader::readSchema() type = new_type; /// If the new type and the previous type for this column are different, /// we will use default type if we have it or throw an exception. 
- else if (new_type && type->getName() != new_type->getName()) + else if (new_type && type->equals(*new_type)) { /// Check if we have Bool and Number and if allow_bools_as_numbers /// is true make the result type Number From 00ddb72eead2cf03006c32fb5bd9b2078951009e Mon Sep 17 00:00:00 2001 From: rfraposa Date: Tue, 29 Mar 2022 17:43:34 -0600 Subject: [PATCH 060/239] Update /engines docs --- docs/en/engines/database-engines/atomic.md | 17 ++- docs/en/engines/database-engines/index.md | 8 +- docs/en/engines/database-engines/lazy.md | 4 +- .../database-engines/materialized-mysql.md | 35 +++-- .../materialized-postgresql.md | 143 +++++++++--------- docs/en/engines/database-engines/mysql.md | 11 +- .../en/engines/database-engines/postgresql.md | 4 +- .../en/engines/database-engines/replicated.md | 4 +- docs/en/engines/database-engines/sqlite.md | 4 +- .../integrations/ExternalDistributed.md | 7 +- .../integrations/embedded-rocksdb.md | 4 +- .../table-engines/integrations/hdfs.md | 9 +- .../table-engines/integrations/hive.md | 6 +- .../table-engines/integrations/index.md | 4 +- .../table-engines/integrations/jdbc.md | 4 +- .../table-engines/integrations/kafka.md | 9 +- .../integrations/materialized-postgresql.md | 11 +- .../table-engines/integrations/mongodb.md | 4 +- .../table-engines/integrations/mysql.md | 6 +- .../table-engines/integrations/odbc.md | 4 +- .../table-engines/integrations/postgresql.md | 9 +- .../table-engines/integrations/rabbitmq.md | 4 +- .../engines/table-engines/integrations/s3.md | 11 +- .../table-engines/integrations/sqlite.md | 9 +- .../engines/table-engines/log-family/index.md | 5 +- .../engines/table-engines/log-family/log.md | 3 + .../mergetree-family/aggregatingmergetree.md | 9 +- .../mergetree-family/collapsingmergetree.md | 9 +- .../custom-partitioning-key.md | 21 ++- .../mergetree-family/graphitemergetree.md | 27 ++-- .../table-engines/mergetree-family/index.md | 5 +- .../mergetree-family/mergetree.md | 23 +-- .../mergetree-family/replacingmergetree.md | 14 +- .../mergetree-family/replication.md | 9 +- .../mergetree-family/summingmergetree.md | 9 +- .../versionedcollapsingmergetree.md | 9 +- .../engines/table-engines/special/buffer.md | 11 +- .../table-engines/special/dictionary.md | 6 +- .../table-engines/special/distributed.md | 30 ++-- .../table-engines/special/external-data.md | 5 +- docs/en/engines/table-engines/special/file.md | 11 +- .../engines/table-engines/special/generate.md | 6 +- .../en/engines/table-engines/special/index.md | 4 +- docs/en/engines/table-engines/special/join.md | 11 +- .../table-engines/special/materializedview.md | 6 +- .../engines/table-engines/special/memory.md | 6 +- .../en/engines/table-engines/special/merge.md | 8 +- docs/en/engines/table-engines/special/null.md | 12 +- docs/en/engines/table-engines/special/set.md | 6 +- docs/en/engines/table-engines/special/url.md | 6 +- docs/en/engines/table-engines/special/view.md | 6 +- docs/en/install.md | 2 +- docs/en/operations/settings/index.md | 6 +- 53 files changed, 337 insertions(+), 279 deletions(-) diff --git a/docs/en/engines/database-engines/atomic.md b/docs/en/engines/database-engines/atomic.md index 1e555a0a502..878307121aa 100644 --- a/docs/en/engines/database-engines/atomic.md +++ b/docs/en/engines/database-engines/atomic.md @@ -1,9 +1,9 @@ --- -toc_priority: 32 -toc_title: Atomic +sidebar_label: Atomic +sidebar_position: 10 --- -# Atomic {#atomic} +# Atomic It supports non-blocking [DROP TABLE](#drop-detach-table) and [RENAME TABLE](#rename-table) queries and atomic 
[EXCHANGE TABLES](#exchange-tables) queries. `Atomic` database engine is used by default. @@ -18,14 +18,21 @@ CREATE DATABASE test [ENGINE = Atomic]; ### Table UUID {#table-uuid} All tables in database `Atomic` have persistent [UUID](../../sql-reference/data-types/uuid.md) and store data in directory `/clickhouse_path/store/xxx/xxxyyyyy-yyyy-yyyy-yyyy-yyyyyyyyyyyy/`, where `xxxyyyyy-yyyy-yyyy-yyyy-yyyyyyyyyyyy` is UUID of the table. -Usually, the UUID is generated automatically, but the user can also explicitly specify the UUID in the same way when creating the table (this is not recommended). To display the `SHOW CREATE` query with the UUID you can use setting [show_table_uuid_in_table_create_query_if_not_nil](../../operations/settings/settings.md#show_table_uuid_in_table_create_query_if_not_nil). For example: +Usually, the UUID is generated automatically, but the user can also explicitly specify the UUID in the same way when creating the table (this is not recommended). + +For example: ```sql CREATE TABLE name UUID '28f1c61c-2970-457a-bffe-454156ddcfef' (n UInt64) ENGINE = ...; ``` + +:::note +You can use the [show_table_uuid_in_table_create_query_if_not_nil](../../operations/settings/settings.md#show_table_uuid_in_table_create_query_if_not_nil) setting to display the UUID with the `SHOW CREATE` query. +::: + ### RENAME TABLE {#rename-table} -[RENAME](../../sql-reference/statements/rename.md) queries are performed without changing UUID and moving table data. These queries do not wait for the completion of queries using the table and are executed instantly. +[RENAME](../../sql-reference/statements/rename.md) queries are performed without changing the UUID or moving table data. These queries do not wait for the completion of queries using the table and are executed instantly. ### DROP/DETACH TABLE {#drop-detach-table} diff --git a/docs/en/engines/database-engines/index.md b/docs/en/engines/database-engines/index.md index dd8959d2700..0cee580abcd 100644 --- a/docs/en/engines/database-engines/index.md +++ b/docs/en/engines/database-engines/index.md @@ -6,11 +6,11 @@ toc_title: Introduction # Database Engines {#database-engines} -Database engines allow you to work with tables. +Database engines allow you to work with tables. By default, ClickHouse uses the [Atomic](../../engines/database-engines/atomic.md) database engine, which provides configurable [table engines](../../engines/table-engines/index.md) and an [SQL dialect](../../sql-reference/syntax.md). -By default, ClickHouse uses database engine [Atomic](../../engines/database-engines/atomic.md). It provides configurable [table engines](../../engines/table-engines/index.md) and an [SQL dialect](../../sql-reference/syntax.md). +Here is a complete list of available database engines. 
Follow the links for more details: -You can also use the following database engines: +- [Atomic](../../engines/database-engines/atomic.md) - [MySQL](../../engines/database-engines/mysql.md) @@ -18,8 +18,6 @@ You can also use the following database engines: - [Lazy](../../engines/database-engines/lazy.md) -- [Atomic](../../engines/database-engines/atomic.md) - - [PostgreSQL](../../engines/database-engines/postgresql.md) - [Replicated](../../engines/database-engines/replicated.md) diff --git a/docs/en/engines/database-engines/lazy.md b/docs/en/engines/database-engines/lazy.md index ecd4b94f579..b95ade19df4 100644 --- a/docs/en/engines/database-engines/lazy.md +++ b/docs/en/engines/database-engines/lazy.md @@ -1,6 +1,6 @@ --- -toc_priority: 31 -toc_title: Lazy +sidebar_label: Lazy +sidebar_position: 20 --- # Lazy {#lazy} diff --git a/docs/en/engines/database-engines/materialized-mysql.md b/docs/en/engines/database-engines/materialized-mysql.md index 3dc14c87be7..df072682097 100644 --- a/docs/en/engines/database-engines/materialized-mysql.md +++ b/docs/en/engines/database-engines/materialized-mysql.md @@ -1,16 +1,15 @@ --- -toc_priority: 29 -toc_title: MaterializedMySQL +sidebar_label: MaterializedMySQL +sidebar_position: 70 --- -# [experimental] MaterializedMySQL {#materialized-mysql} +# [experimental] MaterializedMySQL -!!! warning "Warning" - This is an experimental feature that should not be used in production. +:::warning +This is an experimental feature that should not be used in production. +::: -Creates ClickHouse database with all the tables existing in MySQL, and all the data in those tables. - -ClickHouse server works as MySQL replica. It reads binlog and performs DDL and DML queries. +Creates a ClickHouse database with all the tables existing in MySQL, and all the data in those tables. The ClickHouse server works as MySQL replica. It reads `binlog` and performs DDL and DML queries. ## Creating a Database {#creating-a-database} @@ -31,8 +30,6 @@ ENGINE = MaterializedMySQL('host:port', ['database' | database], 'user', 'passwo - `max_rows_in_buffer` — Maximum number of rows that data is allowed to cache in memory (for single table and the cache data unable to query). When this number is exceeded, the data will be materialized. Default: `65 505`. - `max_bytes_in_buffer` — Maximum number of bytes that data is allowed to cache in memory (for single table and the cache data unable to query). When this number is exceeded, the data will be materialized. Default: `1 048 576`. -- `max_rows_in_buffers` — Maximum number of rows that data is allowed to cache in memory (for database and the cache data unable to query). When this number is exceeded, the data will be materialized. Default: `65 505`. -- `max_bytes_in_buffers` — Maximum number of bytes that data is allowed to cache in memory (for database and the cache data unable to query). When this number is exceeded, the data will be materialized. Default: `1 048 576`. - `max_flush_data_time` — Maximum number of milliseconds that data is allowed to cache in memory (for database and the cache data unable to query). When this time is exceeded, the data will be materialized. Default: `1000`. - `max_wait_time_when_mysql_unavailable` — Retry interval when MySQL is not available (milliseconds). Negative value disables retry. Default: `1000`. - `allows_query_when_mysql_lost` — Allows to query a materialized table when MySQL is lost. Default: `0` (`false`). 
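To make it easier to see how the buffering and availability settings listed above fit together, here is a minimal sketch of creating a `MaterializedMySQL` database with two of them overridden. The host, source database name, and credentials are placeholders, not values taken from this patch:

``` sql
-- Hypothetical MySQL endpoint and credentials; only the SETTINGS clause matters here.
CREATE DATABASE mysql_mirror
ENGINE = MaterializedMySQL('mysql-host:3306', 'source_db', 'replica_user', 'secret')
SETTINGS
    allows_query_when_mysql_lost = 1,
    max_wait_time_when_mysql_unavailable = 10000;
```

Since `max_wait_time_when_mysql_unavailable` is given in milliseconds, the value above retries the connection roughly every 10 seconds. On versions where the engine is still experimental, a setting such as `allow_experimental_database_materialized_mysql` may also need to be enabled before the statement is accepted.
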
@@ -52,8 +49,9 @@ For the correct work of `MaterializedMySQL`, there are few mandatory `MySQL`-sid - `default_authentication_plugin = mysql_native_password` since `MaterializedMySQL` can only authorize with this method. - `gtid_mode = on` since GTID based logging is a mandatory for providing correct `MaterializedMySQL` replication. -!!! attention "Attention" - While turning on `gtid_mode` you should also specify `enforce_gtid_consistency = on`. +:::note +While turning on `gtid_mode` you should also specify `enforce_gtid_consistency = on`. +::: ## Virtual Columns {#virtual-columns} @@ -220,13 +218,14 @@ extra care needs to be taken. You may specify overrides for tables that do not exist yet. -!!! warning "Warning" - It is easy to break replication with table overrides if not used with care. For example: +:::warning +It is easy to break replication with table overrides if not used with care. For example: - * If an ALIAS column is added with a table override, and a column with the same name is later added to the source - MySQL table, the converted ALTER TABLE query in ClickHouse will fail and replication stops. - * It is currently possible to add overrides that reference nullable columns where not-nullable are required, such as in - `ORDER BY` or `PARTITION BY`. This will cause CREATE TABLE queries that will fail, also causing replication to stop. +* If an ALIAS column is added with a table override, and a column with the same name is later added to the source + MySQL table, the converted ALTER TABLE query in ClickHouse will fail and replication stops. +* It is currently possible to add overrides that reference nullable columns where not-nullable are required, such as in + `ORDER BY` or `PARTITION BY`. This will cause CREATE TABLE queries that will fail, also causing replication to stop. +::: ## Examples of Use {#examples-of-use} diff --git a/docs/en/engines/database-engines/materialized-postgresql.md b/docs/en/engines/database-engines/materialized-postgresql.md index 56793435fac..ff8f7b192e0 100644 --- a/docs/en/engines/database-engines/materialized-postgresql.md +++ b/docs/en/engines/database-engines/materialized-postgresql.md @@ -1,6 +1,6 @@ --- -toc_priority: 30 -toc_title: MaterializedPostgreSQL +sidebar_label: MaterializedPostgreSQL +sidebar_position: 60 --- # [experimental] MaterializedPostgreSQL {#materialize-postgresql} @@ -46,7 +46,9 @@ After `MaterializedPostgreSQL` database is created, it does not automatically de ATTACH TABLE postgres_database.new_table; ``` -Warning: before version 22.1 adding table to replication left unremoved temprorary replication slot (named `{db_name}_ch_replication_slot_tmp`). If attaching tables in clickhouse version before 22.1, make sure to delete it manually (`SELECT pg_drop_replication_slot('{db_name}_ch_replication_slot_tmp')`). Otherwise disk usage will grow. Issue is fixed in 22.1. +:::warning +Before version 22.1, adding a table to replication left an unremoved temporary replication slot (named `{db_name}_ch_replication_slot_tmp`). If attaching tables in ClickHouse version before 22.1, make sure to delete it manually (`SELECT pg_drop_replication_slot('{db_name}_ch_replication_slot_tmp')`). Otherwise disk usage will grow. This issue is fixed in 22.1. +::: ## Dynamically removing tables from replication {#dynamically-removing-table-from-replication} @@ -135,69 +137,70 @@ FROM pg_class WHERE oid = 'postgres_table'::regclass; ``` -!!! 
warning "Warning" - Replication of [**TOAST**](https://www.postgresql.org/docs/9.5/storage-toast.html) values is not supported. The default value for the data type will be used. +:::warning +Replication of [**TOAST**](https://www.postgresql.org/docs/9.5/storage-toast.html) values is not supported. The default value for the data type will be used. +::: ## Settings {#settings} -1. materialized_postgresql_tables_list {#materialized-postgresql-tables-list} +1. `materialized_postgresql_tables_list` {#materialized-postgresql-tables-list} -Sets a comma-separated list of PostgreSQL database tables, which will be replicated via [MaterializedPostgreSQL](../../engines/database-engines/materialized-postgresql.md) database engine. + Sets a comma-separated list of PostgreSQL database tables, which will be replicated via [MaterializedPostgreSQL](../../engines/database-engines/materialized-postgresql.md) database engine. -Default value: empty list — means whole PostgreSQL database will be replicated. + Default value: empty list — means whole PostgreSQL database will be replicated. -2. materialized_postgresql_schema {#materialized-postgresql-schema} +2. `materialized_postgresql_schema` {#materialized-postgresql-schema} -Default value: empty string. (Default schema is used) + Default value: empty string. (Default schema is used) -3. materialized_postgresql_schema_list {#materialized-postgresql-schema-list} +3. `materialized_postgresql_schema_list` {#materialized-postgresql-schema-list} -Default value: empty list. (Default schema is used) + Default value: empty list. (Default schema is used) -4. materialized_postgresql_allow_automatic_update {#materialized-postgresql-allow-automatic-update} +4. `materialized_postgresql_allow_automatic_update` {#materialized-postgresql-allow-automatic-update} -Do not use this setting before 22.1 version. + Do not use this setting before 22.1 version. -Allows reloading table in the background, when schema changes are detected. DDL queries on the PostgreSQL side are not replicated via ClickHouse [MaterializedPostgreSQL](../../engines/database-engines/materialized-postgresql.md) engine, because it is not allowed with PostgreSQL logical replication protocol, but the fact of DDL changes is detected transactionally. In this case, the default behaviour is to stop replicating those tables once DDL is detected. However, if this setting is enabled, then, instead of stopping the replication of those tables, they will be reloaded in the background via database snapshot without data losses and replication will continue for them. + Allows reloading table in the background, when schema changes are detected. DDL queries on the PostgreSQL side are not replicated via ClickHouse [MaterializedPostgreSQL](../../engines/database-engines/materialized-postgresql.md) engine, because it is not allowed with PostgreSQL logical replication protocol, but the fact of DDL changes is detected transactionally. In this case, the default behaviour is to stop replicating those tables once DDL is detected. However, if this setting is enabled, then, instead of stopping the replication of those tables, they will be reloaded in the background via database snapshot without data losses and replication will continue for them. -Possible values: + Possible values: -- 0 — The table is not automatically updated in the background, when schema changes are detected. -- 1 — The table is automatically updated in the background, when schema changes are detected. 
+ - 0 — The table is not automatically updated in the background, when schema changes are detected. + - 1 — The table is automatically updated in the background, when schema changes are detected. -Default value: `0`. + Default value: `0`. -5. materialized_postgresql_max_block_size {#materialized-postgresql-max-block-size} +5. `materialized_postgresql_max_block_size` {#materialized-postgresql-max-block-size} -Sets the number of rows collected in memory before flushing data into PostgreSQL database table. + Sets the number of rows collected in memory before flushing data into PostgreSQL database table. -Possible values: + Possible values: -- Positive integer. + - Positive integer. -Default value: `65536`. + Default value: `65536`. -6. materialized_postgresql_replication_slot {#materialized-postgresql-replication-slot} +6. `materialized_postgresql_replication_slot` {#materialized-postgresql-replication-slot} -A user-created replication slot. Must be used together with `materialized_postgresql_snapshot`. + A user-created replication slot. Must be used together with `materialized_postgresql_snapshot`. -7. materialized_postgresql_snapshot {#materialized-postgresql-snapshot} +7. `materialized_postgresql_snapshot` {#materialized-postgresql-snapshot} -A text string identifying a snapshot, from which [initial dump of PostgreSQL tables](../../engines/database-engines/materialized-postgresql.md) will be performed. Must be used together with `materialized_postgresql_replication_slot`. + A text string identifying a snapshot, from which [initial dump of PostgreSQL tables](../../engines/database-engines/materialized-postgresql.md) will be performed. Must be used together with `materialized_postgresql_replication_slot`. -``` sql -CREATE DATABASE database1 -ENGINE = MaterializedPostgreSQL('postgres1:5432', 'postgres_database', 'postgres_user', 'postgres_password') -SETTINGS materialized_postgresql_tables_list = 'table1,table2,table3'; + ``` sql + CREATE DATABASE database1 + ENGINE = MaterializedPostgreSQL('postgres1:5432', 'postgres_database', 'postgres_user', 'postgres_password') + SETTINGS materialized_postgresql_tables_list = 'table1,table2,table3'; -SELECT * FROM database1.table1; -``` + SELECT * FROM database1.table1; + ``` -The settings can be changed, if necessary, using a DDL query. But it is impossible to change the setting `materialized_postgresql_tables_list`. To update the list of tables in this setting use the `ATTACH TABLE` query. + The settings can be changed, if necessary, using a DDL query. But it is impossible to change the setting `materialized_postgresql_tables_list`. To update the list of tables in this setting use the `ATTACH TABLE` query. -``` sql -ALTER DATABASE postgres_database MODIFY SETTING materialized_postgresql_max_block_size = ; -``` + ``` sql + ALTER DATABASE postgres_database MODIFY SETTING materialized_postgresql_max_block_size = ; + ``` ## Notes {#notes} @@ -213,47 +216,47 @@ Please note that this should be used only if it is actually needed. If there is 1. Configure replication slot in PostgreSQL. 
-```yaml -apiVersion: "acid.zalan.do/v1" -kind: postgresql -metadata: - name: acid-demo-cluster -spec: - numberOfInstances: 2 - postgresql: - parameters: - wal_level: logical - patroni: - slots: - clickhouse_sync: - type: logical - database: demodb - plugin: pgoutput -``` + ```yaml + apiVersion: "acid.zalan.do/v1" + kind: postgresql + metadata: + name: acid-demo-cluster + spec: + numberOfInstances: 2 + postgresql: + parameters: + wal_level: logical + patroni: + slots: + clickhouse_sync: + type: logical + database: demodb + plugin: pgoutput + ``` 2. Wait for replication slot to be ready, then begin a transaction and export the transaction snapshot identifier: -```sql -BEGIN; -SELECT pg_export_snapshot(); -``` + ```sql + BEGIN; + SELECT pg_export_snapshot(); + ``` 3. In ClickHouse create database: -```sql -CREATE DATABASE demodb -ENGINE = MaterializedPostgreSQL('postgres1:5432', 'postgres_database', 'postgres_user', 'postgres_password') -SETTINGS - materialized_postgresql_replication_slot = 'clickhouse_sync', - materialized_postgresql_snapshot = '0000000A-0000023F-3', - materialized_postgresql_tables_list = 'table1,table2,table3'; -``` + ```sql + CREATE DATABASE demodb + ENGINE = MaterializedPostgreSQL('postgres1:5432', 'postgres_database', 'postgres_user', 'postgres_password') + SETTINGS + materialized_postgresql_replication_slot = 'clickhouse_sync', + materialized_postgresql_snapshot = '0000000A-0000023F-3', + materialized_postgresql_tables_list = 'table1,table2,table3'; + ``` 4. End the PostgreSQL transaction once replication to ClickHouse DB is confirmed. Verify that replication continues after failover: -```bash -kubectl exec acid-demo-cluster-0 -c postgres -- su postgres -c 'patronictl failover --candidate acid-demo-cluster-1 --force' -``` + ```bash + kubectl exec acid-demo-cluster-0 -c postgres -- su postgres -c 'patronictl failover --candidate acid-demo-cluster-1 --force' + ``` ### Required permissions diff --git a/docs/en/engines/database-engines/mysql.md b/docs/en/engines/database-engines/mysql.md index c5a1bba44b2..89a0786a9ec 100644 --- a/docs/en/engines/database-engines/mysql.md +++ b/docs/en/engines/database-engines/mysql.md @@ -1,9 +1,9 @@ --- -toc_priority: 30 -toc_title: MySQL +sidebar_position: 50 +sidebar_label: MySQL --- -# MySQL {#mysql} +# MySQL Allows to connect to databases on a remote MySQL server and perform `INSERT` and `SELECT` queries to exchange data between ClickHouse and MySQL. @@ -59,8 +59,9 @@ These variables are supported: - `version` - `max_allowed_packet` -!!! warning "Warning" - By now these variables are stubs and don't correspond to anything. +:::warning +By now these variables are stubs and don't correspond to anything. 
+::: Example: diff --git a/docs/en/engines/database-engines/postgresql.md b/docs/en/engines/database-engines/postgresql.md index 76ef484e773..bc5e93d0923 100644 --- a/docs/en/engines/database-engines/postgresql.md +++ b/docs/en/engines/database-engines/postgresql.md @@ -1,6 +1,6 @@ --- -toc_priority: 35 -toc_title: PostgreSQL +sidebar_position: 40 +sidebar_label: PostgreSQL --- # PostgreSQL {#postgresql} diff --git a/docs/en/engines/database-engines/replicated.md b/docs/en/engines/database-engines/replicated.md index bdc17d32393..07d6fcd9ece 100644 --- a/docs/en/engines/database-engines/replicated.md +++ b/docs/en/engines/database-engines/replicated.md @@ -1,6 +1,6 @@ --- -toc_priority: 36 -toc_title: Replicated +sidebar_position: 30 +sidebar_label: Replicated --- # [experimental] Replicated {#replicated} diff --git a/docs/en/engines/database-engines/sqlite.md b/docs/en/engines/database-engines/sqlite.md index ee9db90859f..2f8b44c9a09 100644 --- a/docs/en/engines/database-engines/sqlite.md +++ b/docs/en/engines/database-engines/sqlite.md @@ -1,6 +1,6 @@ --- -toc_priority: 32 -toc_title: SQLite +sidebar_position: 55 +sidebar_label: SQLite --- # SQLite {#sqlite} diff --git a/docs/en/engines/table-engines/integrations/ExternalDistributed.md b/docs/en/engines/table-engines/integrations/ExternalDistributed.md index 0ecbc5383e1..c9aae1934db 100644 --- a/docs/en/engines/table-engines/integrations/ExternalDistributed.md +++ b/docs/en/engines/table-engines/integrations/ExternalDistributed.md @@ -1,6 +1,6 @@ --- -toc_priority: 12 -toc_title: ExternalDistributed +sidebar_position: 12 +sidebar_label: ExternalDistributed --- # ExternalDistributed {#externaldistributed} @@ -51,3 +51,6 @@ You can specify any number of shards and any number of replicas for each shard. - [MySQL table engine](../../../engines/table-engines/integrations/mysql.md) - [PostgreSQL table engine](../../../engines/table-engines/integrations/postgresql.md) - [Distributed table engine](../../../engines/table-engines/special/distributed.md) + + +[Original article](https://clickhouse.com/docs/en/engines/table-engines/integrations/ExternalDistributed/) diff --git a/docs/en/engines/table-engines/integrations/embedded-rocksdb.md b/docs/en/engines/table-engines/integrations/embedded-rocksdb.md index 385abeb83ad..701d190f022 100644 --- a/docs/en/engines/table-engines/integrations/embedded-rocksdb.md +++ b/docs/en/engines/table-engines/integrations/embedded-rocksdb.md @@ -1,6 +1,6 @@ --- -toc_priority: 9 -toc_title: EmbeddedRocksDB +sidebar_position: 9 +sidebar_label: EmbeddedRocksDB --- # EmbeddedRocksDB Engine {#EmbeddedRocksDB-engine} diff --git a/docs/en/engines/table-engines/integrations/hdfs.md b/docs/en/engines/table-engines/integrations/hdfs.md index 0d6d90f9d31..503bd779abf 100644 --- a/docs/en/engines/table-engines/integrations/hdfs.md +++ b/docs/en/engines/table-engines/integrations/hdfs.md @@ -1,6 +1,6 @@ --- -toc_priority: 6 -toc_title: HDFS +sidebar_position: 6 +sidebar_label: HDFS --- # HDFS {#table_engines-hdfs} @@ -98,8 +98,9 @@ Table consists of all the files in both directories (all files should satisfy fo CREATE TABLE table_with_asterisk (name String, value UInt32) ENGINE = HDFS('hdfs://hdfs1:9000/{some,another}_dir/*', 'TSV') ``` -!!! warning "Warning" - If the listing of files contains number ranges with leading zeros, use the construction with braces for each digit separately or use `?`. 
+:::warning +If the listing of files contains number ranges with leading zeros, use the construction with braces for each digit separately or use `?`. +::: **Example** diff --git a/docs/en/engines/table-engines/integrations/hive.md b/docs/en/engines/table-engines/integrations/hive.md index b804b9c2279..6731f0e7559 100644 --- a/docs/en/engines/table-engines/integrations/hive.md +++ b/docs/en/engines/table-engines/integrations/hive.md @@ -1,6 +1,6 @@ --- -toc_priority: 4 -toc_title: Hive +sidebar_position: 4 +sidebar_label: Hive --- # Hive {#hive} @@ -406,3 +406,5 @@ f_char: hello world f_bool: true day: 2021-09-18 ``` + +[Original article](https://clickhouse.com/docs/en/engines/table-engines/integrations/hive/) diff --git a/docs/en/engines/table-engines/integrations/index.md b/docs/en/engines/table-engines/integrations/index.md index a06b4c78394..9230ad624ba 100644 --- a/docs/en/engines/table-engines/integrations/index.md +++ b/docs/en/engines/table-engines/integrations/index.md @@ -1,6 +1,6 @@ --- -toc_folder_title: Integrations -toc_priority: 1 +sidebar_position: 40 +sidebar_label: Integrations --- # Table Engines for Integrations {#table-engines-for-integrations} diff --git a/docs/en/engines/table-engines/integrations/jdbc.md b/docs/en/engines/table-engines/integrations/jdbc.md index 2f442fd7753..0ce31f36070 100644 --- a/docs/en/engines/table-engines/integrations/jdbc.md +++ b/docs/en/engines/table-engines/integrations/jdbc.md @@ -1,6 +1,6 @@ --- -toc_priority: 3 -toc_title: JDBC +sidebar_position: 3 +sidebar_label: JDBC --- # JDBC {#table-engine-jdbc} diff --git a/docs/en/engines/table-engines/integrations/kafka.md b/docs/en/engines/table-engines/integrations/kafka.md index 1d80f143098..90e0925f531 100644 --- a/docs/en/engines/table-engines/integrations/kafka.md +++ b/docs/en/engines/table-engines/integrations/kafka.md @@ -1,6 +1,6 @@ --- -toc_priority: 8 -toc_title: Kafka +sidebar_position: 8 +sidebar_label: Kafka --- # Kafka {#kafka} @@ -87,8 +87,9 @@ Examples: Deprecated Method for Creating a Table -!!! attention "Attention" - Do not use this method in new projects. If possible, switch old projects to the method described above. +:::warning +Do not use this method in new projects. If possible, switch old projects to the method described above. +::: ``` sql Kafka(kafka_broker_list, kafka_topic_list, kafka_group_name, kafka_format diff --git a/docs/en/engines/table-engines/integrations/materialized-postgresql.md b/docs/en/engines/table-engines/integrations/materialized-postgresql.md index fa349e49af5..61f97961ddb 100644 --- a/docs/en/engines/table-engines/integrations/materialized-postgresql.md +++ b/docs/en/engines/table-engines/integrations/materialized-postgresql.md @@ -1,6 +1,6 @@ --- -toc_priority: 12 -toc_title: MaterializedPostgreSQL +sidebar_position: 12 +sidebar_label: MaterializedPostgreSQL --- # MaterializedPostgreSQL {#materialize-postgresql} @@ -52,5 +52,8 @@ PRIMARY KEY key; SELECT key, value, _version FROM postgresql_db.postgresql_replica; ``` -!!! warning "Warning" - Replication of [**TOAST**](https://www.postgresql.org/docs/9.5/storage-toast.html) values is not supported. The default value for the data type will be used. +:::warning +Replication of [**TOAST**](https://www.postgresql.org/docs/9.5/storage-toast.html) values is not supported. The default value for the data type will be used. 
+::: + +[Original article](https://clickhouse.com/docs/en/engines/table-engines/integrations/materialized-postgresql) diff --git a/docs/en/engines/table-engines/integrations/mongodb.md b/docs/en/engines/table-engines/integrations/mongodb.md index 475416ffb94..d212ab4720f 100644 --- a/docs/en/engines/table-engines/integrations/mongodb.md +++ b/docs/en/engines/table-engines/integrations/mongodb.md @@ -1,6 +1,6 @@ --- -toc_priority: 5 -toc_title: MongoDB +sidebar_position: 5 +sidebar_label: MongoDB --- # MongoDB {#mongodb} diff --git a/docs/en/engines/table-engines/integrations/mysql.md b/docs/en/engines/table-engines/integrations/mysql.md index 7f28f16aa27..e962db58873 100644 --- a/docs/en/engines/table-engines/integrations/mysql.md +++ b/docs/en/engines/table-engines/integrations/mysql.md @@ -1,6 +1,6 @@ --- -toc_priority: 4 -toc_title: MySQL +sidebar_position: 4 +sidebar_label: MySQL --- # MySQL {#mysql} @@ -148,3 +148,5 @@ Default value: `16`. - [The mysql table function](../../../sql-reference/table-functions/mysql.md) - [Using MySQL as a source of external dictionary](../../../sql-reference/dictionaries/external-dictionaries/external-dicts-dict-sources.md#dicts-external_dicts_dict_sources-mysql) + +[Original article](https://clickhouse.com/docs/en/engines/table-engines/integrations/mysql/) diff --git a/docs/en/engines/table-engines/integrations/odbc.md b/docs/en/engines/table-engines/integrations/odbc.md index 0ef21d8565a..ed2b77d7ca3 100644 --- a/docs/en/engines/table-engines/integrations/odbc.md +++ b/docs/en/engines/table-engines/integrations/odbc.md @@ -1,6 +1,6 @@ --- -toc_priority: 2 -toc_title: ODBC +sidebar_position: 2 +sidebar_label: ODBC --- # ODBC {#table-engine-odbc} diff --git a/docs/en/engines/table-engines/integrations/postgresql.md b/docs/en/engines/table-engines/integrations/postgresql.md index 789759ec521..d6826000a1a 100644 --- a/docs/en/engines/table-engines/integrations/postgresql.md +++ b/docs/en/engines/table-engines/integrations/postgresql.md @@ -1,6 +1,6 @@ --- -toc_priority: 11 -toc_title: PostgreSQL +sidebar_position: 11 +sidebar_label: PostgreSQL --- # PostgreSQL {#postgresql} @@ -73,8 +73,9 @@ All joins, aggregations, sorting, `IN [ array ]` conditions and the `LIMIT` samp PostgreSQL `Array` types are converted into ClickHouse arrays. -!!! info "Note" - Be careful - in PostgreSQL an array data, created like a `type_name[]`, may contain multi-dimensional arrays of different dimensions in different table rows in same column. But in ClickHouse it is only allowed to have multidimensional arrays of the same count of dimensions in all table rows in same column. +:::warning +Be careful - in PostgreSQL an array data, created like a `type_name[]`, may contain multi-dimensional arrays of different dimensions in different table rows in same column. But in ClickHouse it is only allowed to have multidimensional arrays of the same count of dimensions in all table rows in same column. +::: Supports multiple replicas that must be listed by `|`. 
For example: diff --git a/docs/en/engines/table-engines/integrations/rabbitmq.md b/docs/en/engines/table-engines/integrations/rabbitmq.md index 78c144ac76f..6653b76594a 100644 --- a/docs/en/engines/table-engines/integrations/rabbitmq.md +++ b/docs/en/engines/table-engines/integrations/rabbitmq.md @@ -1,6 +1,6 @@ --- -toc_priority: 10 -toc_title: RabbitMQ +sidebar_position: 10 +sidebar_label: RabbitMQ --- # RabbitMQ Engine {#rabbitmq-engine} diff --git a/docs/en/engines/table-engines/integrations/s3.md b/docs/en/engines/table-engines/integrations/s3.md index c7301a55bf0..42abc2a0b1e 100644 --- a/docs/en/engines/table-engines/integrations/s3.md +++ b/docs/en/engines/table-engines/integrations/s3.md @@ -1,6 +1,6 @@ --- -toc_priority: 7 -toc_title: S3 +sidebar_position: 7 +sidebar_label: S3 --- # S3 Table Engine {#table-engine-s3} @@ -66,8 +66,9 @@ For more information about virtual columns see [here](../../../engines/table-eng Constructions with `{}` are similar to the [remote](../../../sql-reference/table-functions/remote.md) table function. -!!! warning "Warning" - If the listing of files contains number ranges with leading zeros, use the construction with braces for each digit separately or use `?`. +:::warning +If the listing of files contains number ranges with leading zeros, use the construction with braces for each digit separately or use `?`. +::: **Example with wildcards 1** @@ -158,3 +159,5 @@ The following settings can be specified in configuration file for given endpoint ## See also - [s3 table function](../../../sql-reference/table-functions/s3.md) + +[Original article](https://clickhouse.com/docs/en/engines/table-engines/integrations/s3/) diff --git a/docs/en/engines/table-engines/integrations/sqlite.md b/docs/en/engines/table-engines/integrations/sqlite.md index 391f1696291..45cc1cfc28a 100644 --- a/docs/en/engines/table-engines/integrations/sqlite.md +++ b/docs/en/engines/table-engines/integrations/sqlite.md @@ -1,6 +1,6 @@ --- -toc_priority: 7 -toc_title: SQLite +sidebar_position: 7 +sidebar_label: SQLite --- # SQLite {#sqlite} @@ -56,4 +56,7 @@ SELECT * FROM sqlite_db.table2 ORDER BY col1; **See Also** - [SQLite](../../../engines/database-engines/sqlite.md) engine -- [sqlite](../../../sql-reference/table-functions/sqlite.md) table function \ No newline at end of file +- [sqlite](../../../sql-reference/table-functions/sqlite.md) table function + + +[Original article](https://clickhouse.com/docs/en/engines/table-engines/integrations/sqlite/) diff --git a/docs/en/engines/table-engines/log-family/index.md b/docs/en/engines/table-engines/log-family/index.md index 910df09e67f..89eb08ad7b9 100644 --- a/docs/en/engines/table-engines/log-family/index.md +++ b/docs/en/engines/table-engines/log-family/index.md @@ -1,7 +1,6 @@ --- -toc_folder_title: Log Family -toc_priority: 29 -toc_title: Introduction +sidebar_position: 20 +sidebar_label: Log Family --- # Log Engine Family {#log-engine-family} diff --git a/docs/en/engines/table-engines/log-family/log.md b/docs/en/engines/table-engines/log-family/log.md index 2aeef171128..8858699f045 100644 --- a/docs/en/engines/table-engines/log-family/log.md +++ b/docs/en/engines/table-engines/log-family/log.md @@ -10,3 +10,6 @@ The engine belongs to the family of `Log` engines. See the common properties of `Log` differs from [TinyLog](../../../engines/table-engines/log-family/tinylog.md) in that a small file of "marks" resides with the column files. 
These marks are written on every data block and contain offsets that indicate where to start reading the file in order to skip the specified number of rows. This makes it possible to read table data in multiple threads. For concurrent data access, the read operations can be performed simultaneously, while write operations block reads and each other. The `Log` engine does not support indexes. Similarly, if writing to a table failed, the table is broken, and reading from it returns an error. The `Log` engine is appropriate for temporary data, write-once tables, and for testing or demonstration purposes. + +[Original article](https://clickhouse.com/docs/en/engines/table-engines/log-family/log/) + diff --git a/docs/en/engines/table-engines/mergetree-family/aggregatingmergetree.md b/docs/en/engines/table-engines/mergetree-family/aggregatingmergetree.md index 8c9f8dd8ce3..7be10cec2f5 100644 --- a/docs/en/engines/table-engines/mergetree-family/aggregatingmergetree.md +++ b/docs/en/engines/table-engines/mergetree-family/aggregatingmergetree.md @@ -1,6 +1,6 @@ --- -toc_priority: 35 -toc_title: AggregatingMergeTree +sidebar_position: 60 +sidebar_label: AggregatingMergeTree --- # AggregatingMergeTree {#aggregatingmergetree} @@ -42,8 +42,9 @@ When creating a `AggregatingMergeTree` table the same [clauses](../../../engines Deprecated Method for Creating a Table -!!! attention "Attention" - Do not use this method in new projects and, if possible, switch the old projects to the method described above. +:::warning +Do not use this method in new projects and, if possible, switch the old projects to the method described above. +::: ``` sql CREATE TABLE [IF NOT EXISTS] [db.]table_name [ON CLUSTER cluster] diff --git a/docs/en/engines/table-engines/mergetree-family/collapsingmergetree.md b/docs/en/engines/table-engines/mergetree-family/collapsingmergetree.md index 271b8b20fdb..22863611e79 100644 --- a/docs/en/engines/table-engines/mergetree-family/collapsingmergetree.md +++ b/docs/en/engines/table-engines/mergetree-family/collapsingmergetree.md @@ -1,6 +1,6 @@ --- -toc_priority: 36 -toc_title: CollapsingMergeTree +sidebar_position: 70 +sidebar_label: CollapsingMergeTree --- # CollapsingMergeTree {#table_engine-collapsingmergetree} @@ -42,8 +42,9 @@ When creating a `CollapsingMergeTree` table, the same [query clauses](../../../e Deprecated Method for Creating a Table -!!! attention "Attention" - Do not use this method in new projects and, if possible, switch the old projects to the method described above. +:::warning +Do not use this method in new projects and, if possible, switch old projects to the method described above. +::: ``` sql CREATE TABLE [IF NOT EXISTS] [db.]table_name [ON CLUSTER cluster] diff --git a/docs/en/engines/table-engines/mergetree-family/custom-partitioning-key.md b/docs/en/engines/table-engines/mergetree-family/custom-partitioning-key.md index b58e90a3d92..716528f8d77 100644 --- a/docs/en/engines/table-engines/mergetree-family/custom-partitioning-key.md +++ b/docs/en/engines/table-engines/mergetree-family/custom-partitioning-key.md @@ -1,12 +1,15 @@ --- -toc_priority: 32 -toc_title: Custom Partitioning Key +sidebar_position: 30 +sidebar_label: Custom Partitioning Key --- # Custom Partitioning Key {#custom-partitioning-key} -!!! warning "Warning" - In most cases you don't need partition key, and in most other cases you don't need partition key more granular than by months. Partitioning does not speed up queries (in contrast to the ORDER BY expression). 
You should never use too granular partitioning. Don't partition your data by client identifiers or names (instead make client identifier or name the first column in the ORDER BY expression). +:::warning +In most cases you do not need a partition key, and in most other cases you do not need a partition key more granular than by months. Partitioning does not speed up queries (in contrast to the ORDER BY expression). + +You should never use too granular of partitioning. Don't partition your data by client identifiers or names. Instead, make a client identifier or name the first column in the ORDER BY expression. +::: Partitioning is available for the [MergeTree](../../../engines/table-engines/mergetree-family/mergetree.md) family tables (including [replicated](../../../engines/table-engines/mergetree-family/replication.md) tables). [Materialized views](../../../engines/table-engines/special/materializedview.md#materializedview) based on MergeTree tables support partitioning, as well. @@ -40,8 +43,9 @@ By default, the floating-point partition key is not supported. To use it enable When inserting new data to a table, this data is stored as a separate part (chunk) sorted by the primary key. In 10-15 minutes after inserting, the parts of the same partition are merged into the entire part. -!!! info "Info" - A merge only works for data parts that have the same value for the partitioning expression. This means **you shouldn’t make overly granular partitions** (more than about a thousand partitions). Otherwise, the `SELECT` query performs poorly because of an unreasonably large number of files in the file system and open file descriptors. +:::info +A merge only works for data parts that have the same value for the partitioning expression. This means **you shouldn’t make overly granular partitions** (more than about a thousand partitions). Otherwise, the `SELECT` query performs poorly because of an unreasonably large number of files in the file system and open file descriptors. +::: Use the [system.parts](../../../operations/system-tables/parts.md#system_tables-parts) table to view the table parts and partitions. For example, let’s assume that we have a `visits` table with partitioning by month. Let’s perform the `SELECT` query for the `system.parts` table: @@ -78,8 +82,9 @@ Let’s break down the name of the part: `201901_1_9_2_11`: - `2` is the chunk level (the depth of the merge tree it is formed from). - `11` is the mutation version (if a part mutated) -!!! info "Info" - The parts of old-type tables have the name: `20190117_20190123_2_2_0` (minimum date - maximum date - minimum block number - maximum block number - level). +:::info +The parts of old-type tables have the name: `20190117_20190123_2_2_0` (minimum date - maximum date - minimum block number - maximum block number - level). +::: The `active` column shows the status of the part. `1` is active; `0` is inactive. The inactive parts are, for example, source parts remaining after merging to a larger part. The corrupted data parts are also indicated as inactive. 
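To make the part naming and the `active` flag described above easier to see in practice, the following sketch creates a small month-partitioned table and then queries `system.parts`; the table and column names are illustrative only, loosely mirroring the `visits` example:

``` sql
-- A toy table partitioned by month, similar in spirit to the visits table above.
CREATE TABLE visits_example
(
    VisitDate Date,
    CounterID UInt32,
    Hits UInt32
)
ENGINE = MergeTree
PARTITION BY toYYYYMM(VisitDate)
ORDER BY (CounterID, VisitDate);

INSERT INTO visits_example VALUES ('2019-01-17', 1, 3), ('2019-02-05', 2, 7);

-- Each partition appears as one or more parts; active = 1 marks the current ones.
SELECT partition, name, active
FROM system.parts
WHERE table = 'visits_example';
```

After background merges (or an explicit `OPTIMIZE TABLE`), superseded source parts remain visible for a while with `active = 0` before they are removed.
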
diff --git a/docs/en/engines/table-engines/mergetree-family/graphitemergetree.md b/docs/en/engines/table-engines/mergetree-family/graphitemergetree.md index e1d571c909c..35f3f99d5a9 100644 --- a/docs/en/engines/table-engines/mergetree-family/graphitemergetree.md +++ b/docs/en/engines/table-engines/mergetree-family/graphitemergetree.md @@ -1,6 +1,6 @@ --- -toc_priority: 38 -toc_title: GraphiteMergeTree +sidebar_position: 90 +sidebar_label: GraphiteMergeTree --- # GraphiteMergeTree {#graphitemergetree} @@ -54,8 +54,9 @@ When creating a `GraphiteMergeTree` table, the same [clauses](../../../engines/t Deprecated Method for Creating a Table -!!! attention "Attention" - Do not use this method in new projects and, if possible, switch the old projects to the method described above. +:::warning +Do not use this method in new projects and, if possible, switch old projects to the method described above. +::: ``` sql CREATE TABLE [IF NOT EXISTS] [db.]table_name [ON CLUSTER cluster] @@ -119,12 +120,13 @@ default ... ``` -!!! warning "Attention" - Patterns must be strictly ordered: +:::warning +Patterns must be strictly ordered: - 1. Patterns without `function` or `retention`. - 1. Patterns with both `function` and `retention`. - 1. Pattern `default`. +1. Patterns without `function` or `retention`. +1. Patterns with both `function` and `retention`. +1. Pattern `default`. +::: When processing a row, ClickHouse checks the rules in the `pattern` sections. Each of `pattern` (including `default`) sections can contain `function` parameter for aggregation, `retention` parameters or both. If the metric name matches the `regexp`, the rules from the `pattern` section (or sections) are applied; otherwise, the rules from the `default` section are used. @@ -253,7 +255,6 @@ Valid values: ``` -!!! warning "Warning" - Data rollup is performed during merges. Usually, for old partitions, merges are not started, so for rollup it is necessary to trigger an unscheduled merge using [optimize](../../../sql-reference/statements/optimize.md). Or use additional tools, for example [graphite-ch-optimizer](https://github.com/innogames/graphite-ch-optimizer). - -[Original article](https://clickhouse.com/docs/en/operations/table_engines/graphitemergetree/) +:::warning +Data rollup is performed during merges. Usually, for old partitions, merges are not started, so for rollup it is necessary to trigger an unscheduled merge using [optimize](../../../sql-reference/statements/optimize.md). Or use additional tools, for example [graphite-ch-optimizer](https://github.com/innogames/graphite-ch-optimizer). 
+::: diff --git a/docs/en/engines/table-engines/mergetree-family/index.md b/docs/en/engines/table-engines/mergetree-family/index.md index 32796a252ac..37e7bf5b589 100644 --- a/docs/en/engines/table-engines/mergetree-family/index.md +++ b/docs/en/engines/table-engines/mergetree-family/index.md @@ -1,7 +1,6 @@ --- -toc_folder_title: MergeTree Family -toc_priority: 28 -toc_title: Introduction +sidebar_position: 10 +sidebar_label: MergeTree Family --- # MergeTree Engine Family {#mergetree-engine-family} diff --git a/docs/en/engines/table-engines/mergetree-family/mergetree.md b/docs/en/engines/table-engines/mergetree-family/mergetree.md index 9d820e4961b..095adc32505 100644 --- a/docs/en/engines/table-engines/mergetree-family/mergetree.md +++ b/docs/en/engines/table-engines/mergetree-family/mergetree.md @@ -1,6 +1,6 @@ --- -toc_priority: 30 -toc_title: MergeTree +sidebar_position: 11 +sidebar_label: MergeTree --- # MergeTree {#table_engines-mergetree} @@ -27,8 +27,9 @@ Main features: If necessary, you can set the data sampling method in the table. -!!! info "Info" - The [Merge](../../../engines/table-engines/special/merge.md#merge) engine does not belong to the `*MergeTree` family. +:::info +The [Merge](../../../engines/table-engines/special/merge.md#merge) engine does not belong to the `*MergeTree` family. +::: ## Creating a Table {#table_engine-mergetree-creating-a-table} @@ -127,8 +128,9 @@ The `index_granularity` setting can be omitted because 8192 is the default value Deprecated Method for Creating a Table -!!! attention "Attention" - Do not use this method in new projects. If possible, switch old projects to the method described above. +:::warning +Do not use this method in new projects. If possible, switch old projects to the method described above. +::: ``` sql CREATE TABLE [IF NOT EXISTS] [db.]table_name [ON CLUSTER cluster] @@ -382,8 +384,10 @@ The `set` index can be used with all functions. Function subsets for other index Functions with a constant argument that is less than ngram size can’t be used by `ngrambf_v1` for query optimization. -!!! note "Note" - Bloom filters can have false positive matches, so the `ngrambf_v1`, `tokenbf_v1`, and `bloom_filter` indexes can’t be used for optimizing queries where the result of a function is expected to be false, for example: +:::note +Bloom filters can have false positive matches, so the `ngrambf_v1`, `tokenbf_v1`, and `bloom_filter` indexes can not be used for optimizing queries where the result of a function is expected to be false. + +For example: - Can be optimized: - `s LIKE '%test%'` @@ -391,12 +395,13 @@ Functions with a constant argument that is less than ngram size can’t be used - `s = 1` - `NOT s != 1` - `startsWith(s, 'test')` -- Can’t be optimized: +- Can not be optimized: - `NOT s LIKE '%test%'` - `s NOT LIKE '%test%'` - `NOT s = 1` - `s != 1` - `NOT startsWith(s, 'test')` +::: ## Projections {#projections} Projections are like [materialized views](../../../sql-reference/statements/create/view.md#materialized) but defined in part-level. It provides consistency guarantees along with automatic usage in queries. 
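Because the projection description above stops at the definition, here is a hedged sketch of how a projection might be declared and then exercised; the table name, columns, and projection name are invented for the example:

``` sql
CREATE TABLE events_example
(
    user_id UInt64,
    event_type String,
    ts DateTime,
    -- The projection stores pre-aggregated counts per event_type inside each data part.
    PROJECTION type_counts
    (
        SELECT event_type, count()
        GROUP BY event_type
    )
)
ENGINE = MergeTree
ORDER BY (user_id, ts);

-- An aggregation that matches the projection can be served from the projection data
-- instead of scanning the whole table.
SELECT event_type, count()
FROM events_example
GROUP BY event_type;
```

Depending on the ClickHouse version, automatic use of projections in `SELECT` queries may additionally require enabling the `allow_experimental_projection_optimization` setting.
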
diff --git a/docs/en/engines/table-engines/mergetree-family/replacingmergetree.md b/docs/en/engines/table-engines/mergetree-family/replacingmergetree.md index ca0db24e640..47651527f99 100644 --- a/docs/en/engines/table-engines/mergetree-family/replacingmergetree.md +++ b/docs/en/engines/table-engines/mergetree-family/replacingmergetree.md @@ -1,6 +1,6 @@ --- -toc_priority: 33 -toc_title: ReplacingMergeTree +sidebar_position: 40 +sidebar_label: ReplacingMergeTree --- # ReplacingMergeTree {#replacingmergetree} @@ -29,8 +29,9 @@ CREATE TABLE [IF NOT EXISTS] [db.]table_name [ON CLUSTER cluster] For a description of request parameters, see [statement description](../../../sql-reference/statements/create/table.md). -!!! note "Attention" - Uniqueness of rows is determined by the `ORDER BY` table section, not `PRIMARY KEY`. +:::warning +Uniqueness of rows is determined by the `ORDER BY` table section, not `PRIMARY KEY`. +::: **ReplacingMergeTree Parameters** @@ -49,8 +50,9 @@ When creating a `ReplacingMergeTree` table the same [clauses](../../../engines/t Deprecated Method for Creating a Table -!!! attention "Attention" - Do not use this method in new projects and, if possible, switch the old projects to the method described above. +:::warning +Do not use this method in new projects and, if possible, switch old projects to the method described above. +::: ``` sql CREATE TABLE [IF NOT EXISTS] [db.]table_name [ON CLUSTER cluster] diff --git a/docs/en/engines/table-engines/mergetree-family/replication.md b/docs/en/engines/table-engines/mergetree-family/replication.md index d574bd9449e..67c503854a9 100644 --- a/docs/en/engines/table-engines/mergetree-family/replication.md +++ b/docs/en/engines/table-engines/mergetree-family/replication.md @@ -1,6 +1,6 @@ --- -toc_priority: 31 -toc_title: Data Replication +sidebar_position: 20 +sidebar_label: Data Replication --- # Data Replication {#table_engines-replication} @@ -31,8 +31,9 @@ ClickHouse uses [Apache ZooKeeper](https://zookeeper.apache.org) for storing rep To use replication, set parameters in the [zookeeper](../../../operations/server-configuration-parameters/settings.md#server-settings_zookeeper) server configuration section. -!!! attention "Attention" - Don’t neglect the security setting. ClickHouse supports the `digest` [ACL scheme](https://zookeeper.apache.org/doc/current/zookeeperProgrammers.html#sc_ZooKeeperAccessControl) of the ZooKeeper security subsystem. +:::warning +Don’t neglect the security setting. ClickHouse supports the `digest` [ACL scheme](https://zookeeper.apache.org/doc/current/zookeeperProgrammers.html#sc_ZooKeeperAccessControl) of the ZooKeeper security subsystem. +::: Example of setting the addresses of the ZooKeeper cluster: diff --git a/docs/en/engines/table-engines/mergetree-family/summingmergetree.md b/docs/en/engines/table-engines/mergetree-family/summingmergetree.md index 5726acf000e..5d180782ed3 100644 --- a/docs/en/engines/table-engines/mergetree-family/summingmergetree.md +++ b/docs/en/engines/table-engines/mergetree-family/summingmergetree.md @@ -1,6 +1,6 @@ --- -toc_priority: 34 -toc_title: SummingMergeTree +sidebar_position: 50 +sidebar_label: SummingMergeTree --- # SummingMergeTree {#summingmergetree} @@ -41,8 +41,9 @@ When creating a `SummingMergeTree` table the same [clauses](../../../engines/tab Deprecated Method for Creating a Table -!!! attention "Attention" - Do not use this method in new projects and, if possible, switch the old projects to the method described above. 
+:::warning +Do not use this method in new projects and, if possible, switch the old projects to the method described above. +::: ``` sql CREATE TABLE [IF NOT EXISTS] [db.]table_name [ON CLUSTER cluster] diff --git a/docs/en/engines/table-engines/mergetree-family/versionedcollapsingmergetree.md b/docs/en/engines/table-engines/mergetree-family/versionedcollapsingmergetree.md index 8266bf34876..77cf192dcda 100644 --- a/docs/en/engines/table-engines/mergetree-family/versionedcollapsingmergetree.md +++ b/docs/en/engines/table-engines/mergetree-family/versionedcollapsingmergetree.md @@ -1,6 +1,6 @@ --- -toc_priority: 37 -toc_title: VersionedCollapsingMergeTree +sidebar_position: 80 +sidebar_label: VersionedCollapsingMergeTree --- # VersionedCollapsingMergeTree {#versionedcollapsingmergetree} @@ -53,8 +53,9 @@ When creating a `VersionedCollapsingMergeTree` table, the same [clauses](../../. Deprecated Method for Creating a Table -!!! attention "Attention" - Do not use this method in new projects. If possible, switch the old projects to the method described above. +:::warning +Do not use this method in new projects. If possible, switch old projects to the method described above. +::: ``` sql CREATE TABLE [IF NOT EXISTS] [db.]table_name [ON CLUSTER cluster] diff --git a/docs/en/engines/table-engines/special/buffer.md b/docs/en/engines/table-engines/special/buffer.md index d1f92d347a4..a0aff2ec813 100644 --- a/docs/en/engines/table-engines/special/buffer.md +++ b/docs/en/engines/table-engines/special/buffer.md @@ -1,6 +1,6 @@ --- -toc_priority: 45 -toc_title: Buffer +sidebar_position: 120 +sidebar_label: Buffer --- # Buffer Table Engine {#buffer} @@ -54,8 +54,9 @@ If the set of columns in the Buffer table does not match the set of columns in a If the types do not match for one of the columns in the Buffer table and a subordinate table, an error message is entered in the server log, and the buffer is cleared. The same thing happens if the subordinate table does not exist when the buffer is flushed. -!!! attention "Attention" - Running ALTER on the Buffer table in releases made before 26 Oct 2021 will cause a `Block structure mismatch` error (see [#15117](https://github.com/ClickHouse/ClickHouse/issues/15117) and [#30565](https://github.com/ClickHouse/ClickHouse/pull/30565)), so deleting the Buffer table and then recreating is the only option. It is advisable to check that this error is fixed in your release before trying to run ALTER on the Buffer table. +:::warning +Running ALTER on the Buffer table in releases made before 26 Oct 2021 will cause a `Block structure mismatch` error (see [#15117](https://github.com/ClickHouse/ClickHouse/issues/15117) and [#30565](https://github.com/ClickHouse/ClickHouse/pull/30565)), so deleting the Buffer table and then recreating is the only option. It is advisable to check that this error is fixed in your release before trying to run ALTER on the Buffer table. +::: If the server is restarted abnormally, the data in the buffer is lost. @@ -73,4 +74,4 @@ A Buffer table is used when too many INSERTs are received from a large number of Note that it does not make sense to insert data one row at a time, even for Buffer tables. This will only produce a speed of a few thousand rows per second, while inserting larger blocks of data can produce over a million rows per second (see the section “Performance”). 
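The point above about batching inserts is commonly addressed by putting a Buffer table in front of a MergeTree target. The sketch below assumes a destination table `merge.hits` already exists; the database and table names are illustrative:

``` sql
-- Buffer(database, table, num_layers, min_time, max_time, min_rows, max_rows, min_bytes, max_bytes)
CREATE TABLE merge.hits_buffer AS merge.hits
ENGINE = Buffer(merge, hits, 16, 10, 100, 10000, 1000000, 10000000, 100000000);
```

Roughly speaking, inserts land in one of the 16 in-memory layers and are flushed to `merge.hits` in comparatively large blocks once the time, row, or byte thresholds are crossed, which recovers most of the throughput lost by small client-side inserts.
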
-[Original article](https://clickhouse.com/docs/en/operations/table_engines/buffer/) +[Original article](https://clickhouse.com/docs/en/engines/table-engines/special/buffer/) diff --git a/docs/en/engines/table-engines/special/dictionary.md b/docs/en/engines/table-engines/special/dictionary.md index d76adebe01e..67b97e37d44 100644 --- a/docs/en/engines/table-engines/special/dictionary.md +++ b/docs/en/engines/table-engines/special/dictionary.md @@ -1,6 +1,6 @@ --- -toc_priority: 35 -toc_title: Dictionary +sidebar_position: 20 +sidebar_label: Dictionary --- # Dictionary Table Engine {#dictionary} @@ -97,3 +97,5 @@ select * from products limit 1; **See Also** - [Dictionary function](../../../sql-reference/table-functions/dictionary.md#dictionary-function) + +[Original article](https://clickhouse.com/docs/en/engines/table-engines/special/dictionary/) diff --git a/docs/en/engines/table-engines/special/distributed.md b/docs/en/engines/table-engines/special/distributed.md index 5072465687e..db89175e4d9 100644 --- a/docs/en/engines/table-engines/special/distributed.md +++ b/docs/en/engines/table-engines/special/distributed.md @@ -1,6 +1,6 @@ --- -toc_priority: 33 -toc_title: Distributed +sidebar_position: 10 +sidebar_label: Distributed --- # Distributed Table Engine {#distributed} @@ -64,19 +64,19 @@ CREATE TABLE [IF NOT EXISTS] [db.]table_name [ON CLUSTER cluster] AS [db2.]name2 - `monitor_max_sleep_time_ms` - same as [distributed_directory_monitor_max_sleep_time_ms](../../../operations/settings/settings.md#distributed_directory_monitor_max_sleep_time_ms) -!!! note "Note" +:::note +**Durability settings** (`fsync_...`): - **Durability settings** (`fsync_...`): +- Affect only asynchronous INSERTs (i.e. `insert_distributed_sync=false`) when data first stored on the initiator node disk and later asynchronously send to shards. +- May significantly decrease the inserts' performance +- Affect writing the data stored inside Distributed table folder into the **node which accepted your insert**. If you need to have guarantees of writing data to underlying MergeTree tables - see durability settings (`...fsync...`) in `system.merge_tree_settings` - - Affect only asynchronous INSERTs (i.e. `insert_distributed_sync=false`) when data first stored on the initiator node disk and later asynchronously send to shards. - - May significantly decrease the inserts' performance - - Affect writing the data stored inside Distributed table folder into the **node which accepted your insert**. 
If you need to have guarantees of writing data to underlying MergeTree tables - see durability settings (`...fsync...`) in `system.merge_tree_settings` +For **Insert limit settings** (`..._insert`) see also: - For **Insert limit settings** (`..._insert`) see also: - - - [insert_distributed_sync](../../../operations/settings/settings.md#insert_distributed_sync) setting - - [prefer_localhost_replica](../../../operations/settings/settings.md#settings-prefer-localhost-replica) setting - - `bytes_to_throw_insert` handled before `bytes_to_delay_insert`, so you should not set it to the value less then `bytes_to_delay_insert` +- [insert_distributed_sync](../../../operations/settings/settings.md#insert_distributed_sync) setting +- [prefer_localhost_replica](../../../operations/settings/settings.md#settings-prefer-localhost-replica) setting +- `bytes_to_throw_insert` handled before `bytes_to_delay_insert`, so you should not set it to the value less then `bytes_to_delay_insert` +::: **Example** @@ -215,8 +215,9 @@ To learn more about how distibuted `in` and `global in` queries are processed, r - `_shard_num` — Contains the `shard_num` value from the table `system.clusters`. Type: [UInt32](../../../sql-reference/data-types/int-uint.md). -!!! note "Note" - Since [remote](../../../sql-reference/table-functions/remote.md) and [cluster](../../../sql-reference/table-functions/cluster.md) table functions internally create temporary Distributed table, `_shard_num` is available there too. +:::note +Since [remote](../../../sql-reference/table-functions/remote.md) and [cluster](../../../sql-reference/table-functions/cluster.md) table functions internally create temporary Distributed table, `_shard_num` is available there too. +::: **See Also** @@ -225,3 +226,4 @@ To learn more about how distibuted `in` and `global in` queries are processed, r - [shardNum()](../../../sql-reference/functions/other-functions.md#shard-num) and [shardCount()](../../../sql-reference/functions/other-functions.md#shard-count) functions +[Original article](https://clickhouse.com/docs/en/engines/table-engines/special/distributed/) diff --git a/docs/en/engines/table-engines/special/external-data.md b/docs/en/engines/table-engines/special/external-data.md index 4ec90905fe5..1f4336c74fe 100644 --- a/docs/en/engines/table-engines/special/external-data.md +++ b/docs/en/engines/table-engines/special/external-data.md @@ -1,6 +1,6 @@ --- -toc_priority: 45 -toc_title: External Data +sidebar_position: 130 +sidebar_label: External Data --- # External Data for Query Processing {#external-data-for-query-processing} @@ -63,4 +63,3 @@ $ curl -F 'passwd=@passwd.tsv;' 'http://localhost:8123/?query=SELECT+shell,+coun For distributed query processing, the temporary tables are sent to all the remote servers. 
-[Original article](https://clickhouse.com/docs/en/operations/table_engines/external_data/) diff --git a/docs/en/engines/table-engines/special/file.md b/docs/en/engines/table-engines/special/file.md index 7673f45ca8d..6e4449bf1a9 100644 --- a/docs/en/engines/table-engines/special/file.md +++ b/docs/en/engines/table-engines/special/file.md @@ -1,6 +1,6 @@ --- -toc_priority: 37 -toc_title: File +sidebar_position: 40 +sidebar_label: File --- # File Table Engine {#table_engines-file} @@ -30,8 +30,9 @@ When creating table using `File(Format)` it creates empty subdirectory in that f You may manually create this subfolder and file in server filesystem and then [ATTACH](../../../sql-reference/statements/attach.md) it to table information with matching name, so you can query data from that file. -!!! warning "Warning" - Be careful with this functionality, because ClickHouse does not keep track of external changes to such files. The result of simultaneous writes via ClickHouse and outside of ClickHouse is undefined. +:::warning +Be careful with this functionality, because ClickHouse does not keep track of external changes to such files. The result of simultaneous writes via ClickHouse and outside of ClickHouse is undefined. +::: ## Example {#example} @@ -85,4 +86,4 @@ $ echo -e "1,2\n3,4" | clickhouse-local -q "CREATE TABLE table (a Int64, b Int64 - Indices - Replication -[Original article](https://clickhouse.com/docs/en/operations/table_engines/file/) +[Original article](https://clickhouse.com/docs/en/operations/table_engines/special/file/) diff --git a/docs/en/engines/table-engines/special/generate.md b/docs/en/engines/table-engines/special/generate.md index fabe31897bb..453f3b5db0b 100644 --- a/docs/en/engines/table-engines/special/generate.md +++ b/docs/en/engines/table-engines/special/generate.md @@ -1,6 +1,6 @@ --- -toc_priority: 46 -toc_title: GenerateRandom +sidebar_position: 140 +sidebar_label: GenerateRandom --- # GenerateRandom Table Engine {#table_engines-generate} @@ -56,4 +56,4 @@ SELECT * FROM generate_engine_table LIMIT 3 - Indices - Replication -[Original article](https://clickhouse.com/docs/en/operations/table_engines/generate/) +[Original article](https://clickhouse.com/docs/en/engines/table-engines/special/generate/) diff --git a/docs/en/engines/table-engines/special/index.md b/docs/en/engines/table-engines/special/index.md index 872c01385e0..f87cd86c891 100644 --- a/docs/en/engines/table-engines/special/index.md +++ b/docs/en/engines/table-engines/special/index.md @@ -1,6 +1,6 @@ --- -toc_folder_title: Special -toc_priority: 31 +sidebar_position: 50 +sidebar_label: Special --- # Special Table Engines {#special-table-engines} diff --git a/docs/en/engines/table-engines/special/join.md b/docs/en/engines/table-engines/special/join.md index 4e4a5e9fc03..7d6f6e99b9f 100644 --- a/docs/en/engines/table-engines/special/join.md +++ b/docs/en/engines/table-engines/special/join.md @@ -1,14 +1,15 @@ --- -toc_priority: 40 -toc_title: Join +sidebar_position: 70 +sidebar_label: Join --- # Join Table Engine {#join} Optional prepared data structure for usage in [JOIN](../../../sql-reference/statements/select/join.md#select-join) operations. -!!! note "Note" - This is not an article about the [JOIN clause](../../../sql-reference/statements/select/join.md#select-join) itself. +:::note +This is not an article about the [JOIN clause](../../../sql-reference/statements/select/join.md#select-join) itself. 
+::: ## Creating a Table {#creating-a-table} @@ -125,3 +126,5 @@ ALTER TABLE id_val_join DELETE WHERE id = 3; │ 1 │ 21 │ └────┴─────┘ ``` + +[Original article](https://clickhouse.com/docs/en/operations/table_engines/special/join/) diff --git a/docs/en/engines/table-engines/special/materializedview.md b/docs/en/engines/table-engines/special/materializedview.md index 75161829a7e..6c9a5e84f60 100644 --- a/docs/en/engines/table-engines/special/materializedview.md +++ b/docs/en/engines/table-engines/special/materializedview.md @@ -1,10 +1,10 @@ --- -toc_priority: 43 -toc_title: MaterializedView +sidebar_position: 100 +sidebar_label: MaterializedView --- # MaterializedView Table Engine {#materializedview} Used for implementing materialized views (for more information, see [CREATE VIEW](../../../sql-reference/statements/create/view.md#materialized)). For storing data, it uses a different engine that was specified when creating the view. When reading from a table, it just uses that engine. -[Original article](https://clickhouse.com/docs/en/operations/table_engines/materializedview/) +[Original article](https://clickhouse.com/docs/en/engines/table-engines/special/materializedview/) diff --git a/docs/en/engines/table-engines/special/memory.md b/docs/en/engines/table-engines/special/memory.md index eb557d36c50..1e154a323d1 100644 --- a/docs/en/engines/table-engines/special/memory.md +++ b/docs/en/engines/table-engines/special/memory.md @@ -1,6 +1,6 @@ --- -toc_priority: 44 -toc_title: Memory +sidebar_position: 110 +sidebar_label: Memory --- # Memory Table Engine {#memory} @@ -15,4 +15,4 @@ Normally, using this table engine is not justified. However, it can be used for The Memory engine is used by the system for temporary tables with external query data (see the section “External data for processing a query”), and for implementing `GLOBAL IN` (see the section “IN operators”). -[Original article](https://clickhouse.com/docs/en/operations/table_engines/memory/) +[Original article](https://clickhouse.com/docs/en/engines/table-engines/special/memory/) diff --git a/docs/en/engines/table-engines/special/merge.md b/docs/en/engines/table-engines/special/merge.md index 27f783a3cea..bcad7a0c1f6 100644 --- a/docs/en/engines/table-engines/special/merge.md +++ b/docs/en/engines/table-engines/special/merge.md @@ -1,6 +1,6 @@ --- -toc_priority: 36 -toc_title: Merge +sidebar_position: 30 +sidebar_label: Merge --- # Merge Table Engine {#merge} @@ -12,7 +12,7 @@ Reading is automatically parallelized. Writing to a table is not supported. When ## Creating a Table {#creating-a-table} ``` sql - CREATE TABLE ... Engine=Merge(db_name, tables_regexp) +CREATE TABLE ... Engine=Merge(db_name, tables_regexp) ``` **Engine Parameters** @@ -81,3 +81,5 @@ SELECT * FROM WatchLog; - [Virtual columns](../../../engines/table-engines/special/index.md#table_engines-virtual_columns) - [merge](../../../sql-reference/table-functions/merge.md) table function + +[Original article](https://clickhouse.com/docs/en/operations/table_engines/special/merge/) diff --git a/docs/en/engines/table-engines/special/null.md b/docs/en/engines/table-engines/special/null.md index 39ed9c1c1a6..309b09ba779 100644 --- a/docs/en/engines/table-engines/special/null.md +++ b/docs/en/engines/table-engines/special/null.md @@ -1,13 +1,15 @@ --- -toc_priority: 38 -toc_title: 'Null' +sidebar_position: 50 +sidebar_label: 'Null' --- # Null Table Engine {#null} When writing to a `Null` table, data is ignored. When reading from a `Null` table, the response is empty. -!!! 
info "Hint" - However, you can create a materialized view on a `Null` table. So the data written to the table will end up affecting the view, but original raw data will still be discarded. +:::note +If you are wondering why this is useful, note that you can create a materialized view on a `Null` table. So the data written to the table will end up affecting the view, but original raw data will still be discarded. +::: -[Original article](https://clickhouse.com/docs/en/operations/table_engines/null/) + +[Original article](https://clickhouse.com/docs/en/operations/table_engines/special/null/) diff --git a/docs/en/engines/table-engines/special/set.md b/docs/en/engines/table-engines/special/set.md index c38c2418093..5fd80ba55fe 100644 --- a/docs/en/engines/table-engines/special/set.md +++ b/docs/en/engines/table-engines/special/set.md @@ -1,6 +1,6 @@ --- -toc_priority: 39 -toc_title: Set +sidebar_position: 60 +sidebar_label: Set --- # Set Table Engine {#set} @@ -20,4 +20,4 @@ When creating a table, the following settings are applied: - [persistent](../../../operations/settings/settings.md#persistent) -[Original article](https://clickhouse.com/docs/en/operations/table_engines/set/) +[Original article](https://clickhouse.com/docs/en/operations/table_engines/special/set/) diff --git a/docs/en/engines/table-engines/special/url.md b/docs/en/engines/table-engines/special/url.md index 26d928085ce..64642623f88 100644 --- a/docs/en/engines/table-engines/special/url.md +++ b/docs/en/engines/table-engines/special/url.md @@ -1,6 +1,6 @@ --- -toc_priority: 41 -toc_title: URL +sidebar_position: 80 +sidebar_label: URL --- # URL Table Engine {#table_engines-url} @@ -89,4 +89,4 @@ SELECT * FROM url_engine_table - Indexes. - Replication. -[Original article](https://clickhouse.com/docs/en/operations/table_engines/url/) +[Original article](https://clickhouse.com/docs/en/operations/table_engines/special/url/) diff --git a/docs/en/engines/table-engines/special/view.md b/docs/en/engines/table-engines/special/view.md index 9b847a0e2d5..455c301fb01 100644 --- a/docs/en/engines/table-engines/special/view.md +++ b/docs/en/engines/table-engines/special/view.md @@ -1,10 +1,10 @@ --- -toc_priority: 42 -toc_title: View +sidebar_position: 90 +sidebar_label: View --- # View Table Engine {#table_engines-view} Used for implementing views (for more information, see the `CREATE VIEW query`). It does not store data, but only stores the specified `SELECT` query. When reading from a table, it runs this query (and deletes all unnecessary columns from the query). -[Original article](https://clickhouse.com/docs/en/operations/table_engines/view/) +[Original article](https://clickhouse.com/docs/en/operations/table_engines/special/view/) diff --git a/docs/en/install.md b/docs/en/install.md index ecb4eb93042..35021b5bb8d 100644 --- a/docs/en/install.md +++ b/docs/en/install.md @@ -188,7 +188,7 @@ sudo ./clickhouse install ### From Precompiled Binaries for Non-Standard Environments {#from-binaries-non-linux} -For non-Linux operating systems and for AArch64 CPU arhitecture, ClickHouse builds are provided as a cross-compiled binary from the latest commit of the `master` branch (with a few hours delay). These builds are not recommended for use in production environments because they are less thoroughly tested, and they also only contain a subset of ClickHouse features available. 
+For non-Linux operating systems and for AArch64 CPU arhitecture, ClickHouse builds are provided as a cross-compiled binary from the latest commit of the `master` branch (with a few hours delay). - [MacOS x86_64](https://builds.clickhouse.com/master/macos/clickhouse) diff --git a/docs/en/operations/settings/index.md b/docs/en/operations/settings/index.md index f2a6bfc515a..bca49690025 100644 --- a/docs/en/operations/settings/index.md +++ b/docs/en/operations/settings/index.md @@ -1,7 +1,7 @@ --- -toc_folder_title: Settings -toc_priority: 55 -toc_title: Introduction +sidebar_label: Introduction +sidebar_position: 27 +slug: index --- # Settings {#session-settings-intro} From 421812a877d0ed8200aa3a7a263c84207c65feea Mon Sep 17 00:00:00 2001 From: rfraposa Date: Tue, 29 Mar 2022 17:57:11 -0600 Subject: [PATCH 061/239] Updates /interfaces docs --- docs/en/interfaces/cli.md | 4 ++-- docs/en/interfaces/cpp.md | 4 ++-- docs/en/interfaces/formats.md | 19 +++++++++++-------- docs/en/interfaces/grpc.md | 4 ++-- docs/en/interfaces/http.md | 19 +++++++++++-------- docs/en/interfaces/jdbc.md | 4 ++-- docs/en/interfaces/mysql.md | 4 ++-- docs/en/interfaces/odbc.md | 4 ++-- docs/en/interfaces/tcp.md | 4 ++-- .../third-party/client-libraries.md | 9 +++++---- docs/en/interfaces/third-party/gui.md | 4 ++-- docs/en/interfaces/third-party/index.md | 7 ++++--- .../en/interfaces/third-party/integrations.md | 9 +++++---- docs/en/interfaces/third-party/proxy.md | 4 ++-- 14 files changed, 54 insertions(+), 45 deletions(-) diff --git a/docs/en/interfaces/cli.md b/docs/en/interfaces/cli.md index eaf7a96ce42..2e78bad6445 100644 --- a/docs/en/interfaces/cli.md +++ b/docs/en/interfaces/cli.md @@ -1,6 +1,6 @@ --- -toc_priority: 17 -toc_title: Command-Line Client +sidebar_position: 17 +sidebar_label: Command-Line Client --- # Command-line Client {#command-line-client} diff --git a/docs/en/interfaces/cpp.md b/docs/en/interfaces/cpp.md index dcd1228ea0f..a7b4188799e 100644 --- a/docs/en/interfaces/cpp.md +++ b/docs/en/interfaces/cpp.md @@ -1,6 +1,6 @@ --- -toc_priority: 24 -toc_title: C++ Client Library +sidebar_position: 24 +sidebar_label: C++ Client Library --- # C++ Client Library {#c-client-library} diff --git a/docs/en/interfaces/formats.md b/docs/en/interfaces/formats.md index 058c9b6fd4a..801b7c1a14f 100644 --- a/docs/en/interfaces/formats.md +++ b/docs/en/interfaces/formats.md @@ -1,6 +1,6 @@ --- -toc_priority: 21 -toc_title: Input and Output Formats +sidebar_position: 21 +sidebar_label: Input and Output Formats --- # Formats for Input and Output Data {#formats} @@ -764,8 +764,9 @@ CREATE TABLE IF NOT EXISTS example_table - If `input_format_defaults_for_omitted_fields = 0`, then the default value for `x` and `a` equals `0` (as the default value for the `UInt32` data type). - If `input_format_defaults_for_omitted_fields = 1`, then the default value for `x` equals `0`, but the default value of `a` equals `x * 2`. -!!! note "Warning" - When inserting data with `input_format_defaults_for_omitted_fields = 1`, ClickHouse consumes more computational resources, compared to insertion with `input_format_defaults_for_omitted_fields = 0`. +:::warning +When inserting data with `input_format_defaults_for_omitted_fields = 1`, ClickHouse consumes more computational resources, compared to insertion with `input_format_defaults_for_omitted_fields = 0`. 
+::: ### Selecting Data {#selecting-data} @@ -787,8 +788,9 @@ The query `SELECT * FROM UserActivity FORMAT JSONEachRow` returns: Unlike the [JSON](#json) format, there is no substitution of invalid UTF-8 sequences. Values are escaped in the same way as for `JSON`. -!!! note "Note" - Any set of bytes can be output in the strings. Use the `JSONEachRow` format if you are sure that the data in the table can be formatted as JSON without losing any information. +:::info +Any set of bytes can be output in the strings. Use the `JSONEachRow` format if you are sure that the data in the table can be formatted as JSON without losing any information. +::: ### Usage of Nested Structures {#jsoneachrow-nested} @@ -1340,8 +1342,9 @@ SET format_avro_schema_registry_url = 'http://schema-registry'; SELECT * FROM topic1_stream; ``` -!!! note "Warning" - Setting `format_avro_schema_registry_url` needs to be configured in `users.xml` to maintain it’s value after a restart. Also you can use the `format_avro_schema_registry_url` setting of the `Kafka` table engine. +:::warning +Setting `format_avro_schema_registry_url` needs to be configured in `users.xml` to maintain it’s value after a restart. Also you can use the `format_avro_schema_registry_url` setting of the `Kafka` table engine. +::: ## Parquet {#data-format-parquet} diff --git a/docs/en/interfaces/grpc.md b/docs/en/interfaces/grpc.md index b30715082ec..6ada38c6220 100644 --- a/docs/en/interfaces/grpc.md +++ b/docs/en/interfaces/grpc.md @@ -1,6 +1,6 @@ --- -toc_priority: 19 -toc_title: gRPC Interface +sidebar_position: 19 +sidebar_label: gRPC Interface --- # gRPC Interface {#grpc-interface} diff --git a/docs/en/interfaces/http.md b/docs/en/interfaces/http.md index d72fb4d6f17..a97cf6671b2 100644 --- a/docs/en/interfaces/http.md +++ b/docs/en/interfaces/http.md @@ -1,6 +1,6 @@ --- -toc_priority: 19 -toc_title: HTTP Interface +sidebar_position: 19 +sidebar_label: HTTP Interface --- # HTTP Interface {#http-interface} @@ -178,8 +178,9 @@ You can also choose to use [HTTP compression](https://en.wikipedia.org/wiki/HTTP To send a compressed `POST` request, append the request header `Content-Encoding: compression_method`. In order for ClickHouse to compress the response, enable compression with [enable_http_compression](../operations/settings/settings.md#settings-enable_http_compression) setting and append `Accept-Encoding: compression_method` header to the request. You can configure the data compression level in the [http_zlib_compression_level](../operations/settings/settings.md#settings-http_zlib_compression_level) setting for all compression methods. -!!! note "Note" - Some HTTP clients might decompress data from the server by default (with `gzip` and `deflate`) and you might get decompressed data even if you use the compression settings correctly. +:::info +Some HTTP clients might decompress data from the server by default (with `gzip` and `deflate`) and you might get decompressed data even if you use the compression settings correctly. +::: **Examples** @@ -439,8 +440,9 @@ Next are the configuration methods for different `type`. The following example defines the values of [max_threads](../operations/settings/settings.md#settings-max_threads) and `max_final_threads` settings, then queries the system table to check whether these settings were set successfully. -!!! note "Warning" - To keep the default `handlers` such as` query`, `play`,` ping`, use the `` rule. +:::warning +To keep the default `handlers` such as` query`, `play`,` ping`, add the `` rule. 
+::: Example: @@ -469,8 +471,9 @@ $ curl -H 'XXX:TEST_HEADER_VALUE' -H 'PARAMS_XXX:max_threads' 'http://localhost: max_final_threads 2 ``` -!!! note "caution" - In one `predefined_query_handler` only supports one `query` of an insert type. +:::warning +In one `predefined_query_handler` only supports one `query` of an insert type. +::: ### dynamic_query_handler {#dynamic_query_handler} diff --git a/docs/en/interfaces/jdbc.md b/docs/en/interfaces/jdbc.md index cf97568a8de..0310156a872 100644 --- a/docs/en/interfaces/jdbc.md +++ b/docs/en/interfaces/jdbc.md @@ -1,6 +1,6 @@ --- -toc_priority: 22 -toc_title: JDBC Driver +sidebar_position: 22 +sidebar_label: JDBC Driver --- # JDBC Driver {#jdbc-driver} diff --git a/docs/en/interfaces/mysql.md b/docs/en/interfaces/mysql.md index 9932e6b6cb3..df8ef38d671 100644 --- a/docs/en/interfaces/mysql.md +++ b/docs/en/interfaces/mysql.md @@ -1,6 +1,6 @@ --- -toc_priority: 20 -toc_title: MySQL Interface +sidebar_position: 20 +sidebar_label: MySQL Interface --- # MySQL Interface {#mysql-interface} diff --git a/docs/en/interfaces/odbc.md b/docs/en/interfaces/odbc.md index fa58ed8b43e..5327f6bb48a 100644 --- a/docs/en/interfaces/odbc.md +++ b/docs/en/interfaces/odbc.md @@ -1,6 +1,6 @@ --- -toc_priority: 23 -toc_title: ODBC Driver +sidebar_position: 23 +sidebar_label: ODBC Driver --- # ODBC Driver {#odbc-driver} diff --git a/docs/en/interfaces/tcp.md b/docs/en/interfaces/tcp.md index b23f8110320..5f2f400799f 100644 --- a/docs/en/interfaces/tcp.md +++ b/docs/en/interfaces/tcp.md @@ -1,6 +1,6 @@ --- -toc_priority: 18 -toc_title: Native Interface (TCP) +sidebar_position: 18 +sidebar_label: Native Interface (TCP) --- # Native Interface (TCP) {#native-interface-tcp} diff --git a/docs/en/interfaces/third-party/client-libraries.md b/docs/en/interfaces/third-party/client-libraries.md index 8d1ff12cf0a..885e9f430f2 100644 --- a/docs/en/interfaces/third-party/client-libraries.md +++ b/docs/en/interfaces/third-party/client-libraries.md @@ -1,12 +1,13 @@ --- -toc_priority: 26 -toc_title: Client Libraries +sidebar_position: 26 +sidebar_label: Client Libraries --- # Client Libraries from Third-party Developers {#client-libraries-from-third-party-developers} -!!! warning "Disclaimer" - ClickHouse Inc does **not** maintain the libraries listed below and hasn’t done any extensive testing to ensure their quality. +:::warning +ClickHouse Inc does **not** maintain the libraries listed below and hasn’t done any extensive testing to ensure their quality. 
+::: - Python - [infi.clickhouse_orm](https://github.com/Infinidat/infi.clickhouse_orm) diff --git a/docs/en/interfaces/third-party/gui.md b/docs/en/interfaces/third-party/gui.md index c0e270b7207..92d00f2812c 100644 --- a/docs/en/interfaces/third-party/gui.md +++ b/docs/en/interfaces/third-party/gui.md @@ -1,6 +1,6 @@ --- -toc_priority: 28 -toc_title: Visual Interfaces +sidebar_position: 28 +sidebar_label: Visual Interfaces --- # Visual Interfaces from Third-party Developers {#visual-interfaces-from-third-party-developers} diff --git a/docs/en/interfaces/third-party/index.md b/docs/en/interfaces/third-party/index.md index caf100681b4..c9be2b6ada9 100644 --- a/docs/en/interfaces/third-party/index.md +++ b/docs/en/interfaces/third-party/index.md @@ -1,6 +1,6 @@ --- toc_folder_title: Third-Party -toc_priority: 24 +sidebar_position: 24 --- # Third-Party Interfaces {#third-party-interfaces} @@ -12,5 +12,6 @@ This is a collection of links to third-party tools that provide some sort of int - [GUI](../../interfaces/third-party/gui.md) - [Proxies](../../interfaces/third-party/proxy.md) -!!! note "Note" - Generic tools that support common API like [ODBC](../../interfaces/odbc.md) or [JDBC](../../interfaces/jdbc.md) usually can work with ClickHouse as well, but are not listed here because there are way too many of them. +:::note +Generic tools that support common API like [ODBC](../../interfaces/odbc.md) or [JDBC](../../interfaces/jdbc.md) usually can work with ClickHouse as well, but are not listed here because there are way too many of them. +::: \ No newline at end of file diff --git a/docs/en/interfaces/third-party/integrations.md b/docs/en/interfaces/third-party/integrations.md index 3aac78f0878..ae055d63a9d 100644 --- a/docs/en/interfaces/third-party/integrations.md +++ b/docs/en/interfaces/third-party/integrations.md @@ -1,12 +1,13 @@ --- -toc_priority: 27 -toc_title: Integrations +sidebar_position: 27 +sidebar_label: Integrations --- # Integration Libraries from Third-party Developers {#integration-libraries-from-third-party-developers} -!!! warning "Disclaimer" - ClickHouse, Inc. does **not** maintain the tools and libraries listed below and haven’t done extensive testing to ensure their quality. +:::warning Disclaimer +ClickHouse, Inc. does **not** maintain the tools and libraries listed below and haven’t done extensive testing to ensure their quality. 
+::: ## Infrastructure Products {#infrastructure-products} diff --git a/docs/en/interfaces/third-party/proxy.md b/docs/en/interfaces/third-party/proxy.md index 31a2d5afae9..45077cb6a89 100644 --- a/docs/en/interfaces/third-party/proxy.md +++ b/docs/en/interfaces/third-party/proxy.md @@ -1,6 +1,6 @@ --- -toc_priority: 29 -toc_title: Proxies +sidebar_position: 29 +sidebar_label: Proxies --- # Proxy Servers from Third-party Developers {#proxy-servers-from-third-party-developers} From 01ec63c909115f37aef91507ea0dd738957db929 Mon Sep 17 00:00:00 2001 From: rfraposa Date: Tue, 29 Mar 2022 20:38:50 -0600 Subject: [PATCH 062/239] Updates /operations docs --- docs/en/interfaces/jdbc.md | 3 +- docs/en/interfaces/odbc.md | 4 +- docs/en/operations/_category_.yml | 3 - docs/en/operations/access-rights.md | 14 ++-- docs/en/operations/backup.md | 9 +- docs/en/operations/caches.md | 4 +- docs/en/operations/clickhouse-keeper.md | 14 ++-- docs/en/operations/configuration-files.md | 4 +- .../external-authenticators/index.md | 7 +- .../external-authenticators/kerberos.md | 21 +++-- docs/en/operations/index.md | 7 +- docs/en/operations/monitoring.md | 4 +- docs/en/operations/opentelemetry.md | 9 +- .../optimizing-performance/index.md | 5 +- .../sampling-query-profiler.md | 4 +- docs/en/operations/performance-test.md | 4 +- docs/en/operations/quotas.md | 4 +- docs/en/operations/requirements.md | 4 +- .../server-configuration-parameters/index.md | 7 +- .../settings.md | 72 +++++++++------- .../settings/constraints-on-settings.md | 4 +- docs/en/operations/settings/index.md | 6 +- .../settings/permissions-for-queries.md | 4 +- .../operations/settings/query-complexity.md | 4 +- .../operations/settings/settings-profiles.md | 9 +- docs/en/operations/settings/settings-users.md | 14 ++-- docs/en/operations/settings/settings.md | 82 +++++++++++-------- docs/en/operations/ssl-zookeeper.md | 7 +- docs/en/operations/storing-data.md | 4 +- .../system-tables/asynchronous_metric_log.md | 2 +- .../system-tables/asynchronous_metrics.md | 2 +- docs/en/operations/system-tables/clusters.md | 2 +- docs/en/operations/system-tables/columns.md | 2 +- .../operations/system-tables/contributors.md | 2 +- docs/en/operations/system-tables/crash-log.md | 2 +- .../operations/system-tables/current-roles.md | 2 +- .../system-tables/data_skipping_indices.md | 2 +- .../system-tables/data_type_families.md | 2 +- docs/en/operations/system-tables/databases.md | 2 +- .../system-tables/detached_parts.md | 2 +- .../operations/system-tables/dictionaries.md | 2 +- docs/en/operations/system-tables/disks.md | 2 +- .../system-tables/distributed_ddl_queue.md | 2 +- .../system-tables/distribution_queue.md | 2 +- .../operations/system-tables/enabled-roles.md | 2 +- docs/en/operations/system-tables/errors.md | 2 +- docs/en/operations/system-tables/events.md | 2 +- docs/en/operations/system-tables/functions.md | 2 +- docs/en/operations/system-tables/grants.md | 2 +- .../system-tables/graphite_retentions.md | 2 +- docs/en/operations/system-tables/index.md | 4 +- docs/en/operations/system-tables/licenses.md | 2 +- .../system-tables/merge_tree_settings.md | 2 +- docs/en/operations/system-tables/merges.md | 2 +- .../en/operations/system-tables/metric_log.md | 2 +- docs/en/operations/system-tables/metrics.md | 2 +- docs/en/operations/system-tables/mutations.md | 7 +- docs/en/operations/system-tables/numbers.md | 2 +- .../en/operations/system-tables/numbers_mt.md | 2 +- docs/en/operations/system-tables/one.md | 2 +- .../system-tables/opentelemetry_span_log.md | 2 +- 
docs/en/operations/system-tables/part_log.md | 2 +- docs/en/operations/system-tables/parts.md | 7 +- .../operations/system-tables/parts_columns.md | 2 +- docs/en/operations/system-tables/processes.md | 2 +- docs/en/operations/system-tables/query_log.md | 7 +- .../system-tables/query_thread_log.md | 2 +- .../system-tables/query_views_log.md | 2 +- .../operations/system-tables/quota_limits.md | 2 +- .../operations/system-tables/quota_usage.md | 2 +- docs/en/operations/system-tables/quotas.md | 2 +- .../operations/system-tables/quotas_usage.md | 2 +- docs/en/operations/system-tables/replicas.md | 2 +- .../system-tables/replicated_fetches.md | 2 +- .../system-tables/replication_queue.md | 2 +- .../operations/system-tables/role-grants.md | 2 +- docs/en/operations/system-tables/roles.md | 2 +- .../operations/system-tables/row_policies.md | 2 +- .../operations/system-tables/session_log.md | 2 +- docs/en/operations/system-tables/settings.md | 2 +- .../settings_profile_elements.md | 2 +- .../system-tables/settings_profiles.md | 2 +- .../operations/system-tables/stack_trace.md | 2 +- .../system-tables/storage_policies.md | 2 +- .../operations/system-tables/table_engines.md | 2 +- docs/en/operations/system-tables/tables.md | 2 +- docs/en/operations/system-tables/text_log.md | 2 +- .../en/operations/system-tables/time_zones.md | 2 +- docs/en/operations/system-tables/trace_log.md | 2 +- docs/en/operations/system-tables/users.md | 2 +- docs/en/operations/system-tables/zookeeper.md | 2 +- .../operations/system-tables/zookeeper_log.md | 2 +- docs/en/operations/tips.md | 11 +-- docs/en/operations/troubleshooting.md | 4 +- docs/en/operations/update.md | 9 +- .../utilities/clickhouse-benchmark.md | 6 +- .../utilities/clickhouse-compressor.md | 3 +- .../operations/utilities/clickhouse-copier.md | 11 +-- .../operations/utilities/clickhouse-format.md | 7 +- .../operations/utilities/clickhouse-local.md | 11 +-- .../utilities/clickhouse-obfuscator.md | 2 +- docs/en/operations/utilities/index.md | 7 +- 102 files changed, 294 insertions(+), 257 deletions(-) diff --git a/docs/en/interfaces/jdbc.md b/docs/en/interfaces/jdbc.md index 0310156a872..4bea0600a2a 100644 --- a/docs/en/interfaces/jdbc.md +++ b/docs/en/interfaces/jdbc.md @@ -5,7 +5,8 @@ sidebar_label: JDBC Driver # JDBC Driver {#jdbc-driver} -- **[Official driver](https://github.com/ClickHouse/clickhouse-jdbc)** +Use the [official JDBC driver](https://github.com/ClickHouse/clickhouse-jdbc) (and Java client) to access ClickHouse from your Java applications. + - Third-party drivers: - [ClickHouse-Native-JDBC](https://github.com/housepower/ClickHouse-Native-JDBC) - [clickhouse4j](https://github.com/blynkkk/clickhouse4j) diff --git a/docs/en/interfaces/odbc.md b/docs/en/interfaces/odbc.md index 5327f6bb48a..4c807654c28 100644 --- a/docs/en/interfaces/odbc.md +++ b/docs/en/interfaces/odbc.md @@ -5,6 +5,8 @@ sidebar_label: ODBC Driver # ODBC Driver {#odbc-driver} -- [Official driver](https://github.com/ClickHouse/clickhouse-odbc) +Use the [official ODBC driver](https://github.com/ClickHouse/clickhouse-odbc) for accessing ClickHouse as a data source. 
+ + [Original article](https://clickhouse.com/docs/en/interfaces/odbc/) diff --git a/docs/en/operations/_category_.yml b/docs/en/operations/_category_.yml index 011ab58d26d..9d6dd1247db 100644 --- a/docs/en/operations/_category_.yml +++ b/docs/en/operations/_category_.yml @@ -2,6 +2,3 @@ position: 70 label: 'Operations' collapsible: true collapsed: true -link: - type: generated-index - title: Operations \ No newline at end of file diff --git a/docs/en/operations/access-rights.md b/docs/en/operations/access-rights.md index 52f7fb87ffd..7d75c47df2b 100644 --- a/docs/en/operations/access-rights.md +++ b/docs/en/operations/access-rights.md @@ -1,6 +1,6 @@ --- -toc_priority: 48 -toc_title: Access Control and Account Management +sidebar_position: 48 +sidebar_label: Access Control and Account Management --- # Access Control and Account Management {#access-control} @@ -24,8 +24,9 @@ You can configure access entities using: We recommend using SQL-driven workflow. Both of the configuration methods work simultaneously, so if you use the server configuration files for managing accounts and access rights, you can smoothly switch to SQL-driven workflow. -!!! note "Warning" - You can’t manage the same access entity by both configuration methods simultaneously. +:::warning +You can’t manage the same access entity by both configuration methods simultaneously. +::: To see all users, roles, profiles, etc. and all their grants use [SHOW ACCESS](../sql-reference/statements/show.md#show-access-statement) statement. @@ -101,8 +102,9 @@ Privileges can be granted to a role by the [GRANT](../sql-reference/statements/g Row policy is a filter that defines which of the rows are available to a user or a role. Row policy contains filters for one particular table, as well as a list of roles and/or users which should use this row policy. -!!! note "Warning" - Row policies makes sense only for users with readonly access. If user can modify table or copy partitions between tables, it defeats the restrictions of row policies. +:::warning +Row policies makes sense only for users with readonly access. If user can modify table or copy partitions between tables, it defeats the restrictions of row policies. +::: Management queries: diff --git a/docs/en/operations/backup.md b/docs/en/operations/backup.md index 7f0ed48928a..c39658aa4b0 100644 --- a/docs/en/operations/backup.md +++ b/docs/en/operations/backup.md @@ -1,6 +1,6 @@ --- -toc_priority: 49 -toc_title: Data Backup +sidebar_position: 49 +sidebar_label: Data Backup --- # Data Backup {#data-backup} @@ -11,8 +11,9 @@ In order to effectively mitigate possible human errors, you should carefully pre Each company has different resources available and business requirements, so there’s no universal solution for ClickHouse backups and restores that will fit every situation. What works for one gigabyte of data likely won’t work for tens of petabytes. There are a variety of possible approaches with their own pros and cons, which will be discussed below. It is a good idea to use several approaches instead of just one in order to compensate for their various shortcomings. -!!! note "Note" - Keep in mind that if you backed something up and never tried to restore it, chances are that restore will not work properly when you actually need it (or at least it will take longer than business can tolerate). So whatever backup approach you choose, make sure to automate the restore process as well, and practice it on a spare ClickHouse cluster regularly. 
+:::note +Keep in mind that if you backed something up and never tried to restore it, chances are that restore will not work properly when you actually need it (or at least it will take longer than business can tolerate). So whatever backup approach you choose, make sure to automate the restore process as well, and practice it on a spare ClickHouse cluster regularly. +::: ## Duplicating Source Data Somewhere Else {#duplicating-source-data-somewhere-else} diff --git a/docs/en/operations/caches.md b/docs/en/operations/caches.md index 279204a8af1..f2427810184 100644 --- a/docs/en/operations/caches.md +++ b/docs/en/operations/caches.md @@ -1,6 +1,6 @@ --- -toc_priority: 65 -toc_title: Caches +sidebar_position: 65 +sidebar_label: Caches --- # Cache Types {#cache-types} diff --git a/docs/en/operations/clickhouse-keeper.md b/docs/en/operations/clickhouse-keeper.md index 35ec5d858f5..81547736441 100644 --- a/docs/en/operations/clickhouse-keeper.md +++ b/docs/en/operations/clickhouse-keeper.md @@ -1,14 +1,15 @@ --- -toc_priority: 66 -toc_title: ClickHouse Keeper +sidebar_position: 66 +sidebar_label: ClickHouse Keeper --- # [pre-production] ClickHouse Keeper {#clickHouse-keeper} ClickHouse server uses [ZooKeeper](https://zookeeper.apache.org/) coordination system for data [replication](../engines/table-engines/mergetree-family/replication.md) and [distributed DDL](../sql-reference/distributed-ddl.md) queries execution. ClickHouse Keeper is an alternative coordination system compatible with ZooKeeper. -!!! warning "Warning" - This feature is currently in the pre-production stage. We test it in our CI and on small internal installations. +:::warning +This feature is currently in the pre-production stage. We test it in our CI and on small internal installations. +::: ## Implementation details {#implementation-details} @@ -18,8 +19,9 @@ By default, ClickHouse Keeper provides the same guarantees as ZooKeeper (lineari ClickHouse Keeper supports Access Control List (ACL) the same way as [ZooKeeper](https://zookeeper.apache.org/doc/r3.1.2/zookeeperProgrammers.html#sc_ZooKeeperAccessControl) does. ClickHouse Keeper supports the same set of permissions and has the identical built-in schemes: `world`, `auth`, `digest`, `host` and `ip`. Digest authentication scheme uses pair `username:password`. Password is encoded in Base64. -!!! info "Note" - External integrations are not supported. +:::note +External integrations are not supported. 
+::: ## Configuration {#configuration} diff --git a/docs/en/operations/configuration-files.md b/docs/en/operations/configuration-files.md index cbc139dd958..582e90544e0 100644 --- a/docs/en/operations/configuration-files.md +++ b/docs/en/operations/configuration-files.md @@ -1,6 +1,6 @@ --- -toc_priority: 50 -toc_title: Configuration Files +sidebar_position: 50 +sidebar_label: Configuration Files --- # Configuration Files {#configuration_files} diff --git a/docs/en/operations/external-authenticators/index.md b/docs/en/operations/external-authenticators/index.md index 850b6594b71..af2ba713ec1 100644 --- a/docs/en/operations/external-authenticators/index.md +++ b/docs/en/operations/external-authenticators/index.md @@ -1,10 +1,9 @@ --- -toc_folder_title: External User Authenticators and Directories -toc_priority: 48 -toc_title: Introduction +sidebar_position: 48 +sidebar_label: External User Authenticators and Directories --- -# External User Authenticators and Directories {#external-authenticators} +# External User Authenticators and Directories ClickHouse supports authenticating and managing users using external services. diff --git a/docs/en/operations/external-authenticators/kerberos.md b/docs/en/operations/external-authenticators/kerberos.md index da84c1f6a89..3711bac79c3 100644 --- a/docs/en/operations/external-authenticators/kerberos.md +++ b/docs/en/operations/external-authenticators/kerberos.md @@ -51,12 +51,13 @@ With filtering by realm: ``` -!!! warning "Note" - You can define only one `kerberos` section. The presence of multiple `kerberos` sections will force ClickHouse to disable Kerberos authentication. - -!!! warning "Note" - `principal` and `realm` sections cannot be specified at the same time. The presence of both `principal` and `realm` sections will force ClickHouse to disable Kerberos authentication. +:::warning +You can define only one `kerberos` section. The presence of multiple `kerberos` sections will force ClickHouse to disable Kerberos authentication. +::: +:::warning +`principal` and `realm` sections cannot be specified at the same time. The presence of both `principal` and `realm` sections will force ClickHouse to disable Kerberos authentication. +::: ## Kerberos as an external authenticator for existing users {#kerberos-as-an-external-authenticator-for-existing-users} @@ -94,11 +95,13 @@ Example (goes into `users.xml`): ``` -!!! warning "Warning" - Note that Kerberos authentication cannot be used alongside with any other authentication mechanism. The presence of any other sections like `password` alongside `kerberos` will force ClickHouse to shutdown. +:::warning +Note that Kerberos authentication cannot be used alongside with any other authentication mechanism. The presence of any other sections like `password` alongside `kerberos` will force ClickHouse to shutdown. +::: -!!! info "Reminder" - Note, that now, once user `my_user` uses `kerberos`, Kerberos must be enabled in the main `config.xml` file as described previously. +:::info Reminder +Note, that now, once user `my_user` uses `kerberos`, Kerberos must be enabled in the main `config.xml` file as described previously. 
+::: ### Enabling Kerberos using SQL {#enabling-kerberos-using-sql} diff --git a/docs/en/operations/index.md b/docs/en/operations/index.md index b78633f2d6b..824e851e997 100644 --- a/docs/en/operations/index.md +++ b/docs/en/operations/index.md @@ -1,7 +1,6 @@ --- -toc_folder_title: Operations -toc_priority: 41 -toc_title: Introduction +sidebar_position: 41 +sidebar_label: Operations --- # Operations {#operations} @@ -23,4 +22,4 @@ ClickHouse operations manual consists of the following major sections: - [Settings](../operations/settings/index.md) - [Utilities](../operations/utilities/index.md) -{## [Original article](https://clickhouse.com/docs/en/operations/) ##} +[Original article](https://clickhouse.com/docs/en/operations/) diff --git a/docs/en/operations/monitoring.md b/docs/en/operations/monitoring.md index ffcdae16c4d..437122e106d 100644 --- a/docs/en/operations/monitoring.md +++ b/docs/en/operations/monitoring.md @@ -1,6 +1,6 @@ --- -toc_priority: 45 -toc_title: Monitoring +sidebar_position: 45 +sidebar_label: Monitoring --- # Monitoring {#monitoring} diff --git a/docs/en/operations/opentelemetry.md b/docs/en/operations/opentelemetry.md index ec27ecfd6b2..740537d88bc 100644 --- a/docs/en/operations/opentelemetry.md +++ b/docs/en/operations/opentelemetry.md @@ -1,14 +1,15 @@ --- -toc_priority: 62 -toc_title: OpenTelemetry Support +sidebar_position: 62 +sidebar_label: OpenTelemetry Support --- # [experimental] OpenTelemetry Support [OpenTelemetry](https://opentelemetry.io/) is an open standard for collecting traces and metrics from the distributed application. ClickHouse has some support for OpenTelemetry. -!!! warning "Warning" - This is an experimental feature that will change in backwards-incompatible ways in future releases. +:::warning +This is an experimental feature that will change in backwards-incompatible ways in future releases. 
+::: ## Supplying Trace Context to ClickHouse diff --git a/docs/en/operations/optimizing-performance/index.md b/docs/en/operations/optimizing-performance/index.md index 142d3b2f976..ef9c6a4b664 100644 --- a/docs/en/operations/optimizing-performance/index.md +++ b/docs/en/operations/optimizing-performance/index.md @@ -1,7 +1,6 @@ --- -toc_folder_title: Optimizing Performance -toc_hidden: true -toc_priority: 52 +sidebar_label: Optimizing Performance +sidebar_position: 52 --- # Optimizing Performance {#optimizing-performance} diff --git a/docs/en/operations/optimizing-performance/sampling-query-profiler.md b/docs/en/operations/optimizing-performance/sampling-query-profiler.md index 72cfa59b8b2..35e0157df6b 100644 --- a/docs/en/operations/optimizing-performance/sampling-query-profiler.md +++ b/docs/en/operations/optimizing-performance/sampling-query-profiler.md @@ -1,6 +1,6 @@ --- -toc_priority: 54 -toc_title: Query Profiling +sidebar_position: 54 +sidebar_label: Query Profiling --- # Sampling Query Profiler {#sampling-query-profiler} diff --git a/docs/en/operations/performance-test.md b/docs/en/operations/performance-test.md index e410b1b2dfd..47827f331c7 100644 --- a/docs/en/operations/performance-test.md +++ b/docs/en/operations/performance-test.md @@ -1,6 +1,6 @@ --- -toc_priority: 54 -toc_title: Testing Hardware +sidebar_position: 54 +sidebar_label: Testing Hardware --- # How to Test Your Hardware with ClickHouse {#how-to-test-your-hardware-with-clickhouse} diff --git a/docs/en/operations/quotas.md b/docs/en/operations/quotas.md index 6d22a5f2a33..77b0697d483 100644 --- a/docs/en/operations/quotas.md +++ b/docs/en/operations/quotas.md @@ -1,6 +1,6 @@ --- -toc_priority: 51 -toc_title: Quotas +sidebar_position: 51 +sidebar_label: Quotas --- # Quotas {#quotas} diff --git a/docs/en/operations/requirements.md b/docs/en/operations/requirements.md index a3e61b1152b..698603dfb84 100644 --- a/docs/en/operations/requirements.md +++ b/docs/en/operations/requirements.md @@ -1,6 +1,6 @@ --- -toc_priority: 44 -toc_title: Requirements +sidebar_position: 44 +sidebar_label: Requirements --- # Requirements {#requirements} diff --git a/docs/en/operations/server-configuration-parameters/index.md b/docs/en/operations/server-configuration-parameters/index.md index a95d198bd0d..1e4ddc6368e 100644 --- a/docs/en/operations/server-configuration-parameters/index.md +++ b/docs/en/operations/server-configuration-parameters/index.md @@ -1,10 +1,9 @@ --- -toc_folder_title: Server Configuration Parameters -toc_priority: 54 -toc_title: Introduction +sidebar_position: 54 +sidebar_label: Server Configuration Parameters --- -# Server Configuration Parameters {#server-settings} +# Server Configuration Parameters This section contains descriptions of server settings that cannot be changed at the session or query level. diff --git a/docs/en/operations/server-configuration-parameters/settings.md b/docs/en/operations/server-configuration-parameters/settings.md index 266abadb087..985dc626ea4 100644 --- a/docs/en/operations/server-configuration-parameters/settings.md +++ b/docs/en/operations/server-configuration-parameters/settings.md @@ -1,6 +1,6 @@ --- -toc_priority: 57 -toc_title: Server Settings +sidebar_position: 57 +sidebar_label: Server Settings --- # Server Settings {#server-settings} @@ -23,8 +23,9 @@ Default value: 3600. Data compression settings for [MergeTree](../../engines/table-engines/mergetree-family/mergetree.md)-engine tables. -!!! warning "Warning" - Don’t use it if you have just started using ClickHouse. 
+:::warning +Don’t use it if you have just started using ClickHouse. +::: Configuration template: @@ -87,8 +88,9 @@ Loading from config: ``` -!!! note "NOTE" - Storing keys in the configuration file is not recommended. It isn't secure. You can move the keys into a separate config file on a secure disk and put a symlink to that config file to `config.d/` folder. +:::note +Storing keys in the configuration file is not recommended. It isn't secure. You can move the keys into a separate config file on a secure disk and put a symlink to that config file to `config.d/` folder. +::: Loading from config, when the key is in hex: @@ -173,8 +175,9 @@ Possible values: Default value: `1073741824` (1 GB). -!!! info "Note" - Hard limit is configured via system tools +:::note +Hard limit is configured via system tools +::: **Example** @@ -439,11 +442,13 @@ A username and a password used to connect to other servers during [replication]( By default, if `interserver_http_credentials` section is omitted, authentication is not used during replication. -!!! note "Note" - `interserver_http_credentials` settings do not relate to a ClickHouse client credentials [configuration](../../interfaces/cli.md#configuration_files). +:::note +`interserver_http_credentials` settings do not relate to a ClickHouse client credentials [configuration](../../interfaces/cli.md#configuration_files). +::: -!!! note "Note" - These credentials are common for replication via `HTTP` and `HTTPS`. +:::note +These credentials are common for replication via `HTTP` and `HTTPS`. +::: The section contains the following parameters: @@ -675,8 +680,9 @@ On hosts with low RAM and swap, you possibly need setting `max_server_memory_usa The maximum number of simultaneously processed queries related to MergeTree table. Queries may be limited by other settings: [max_concurrent_insert_queries](#max-concurrent-insert-queries), [max_concurrent_select_queries](#max-concurrent-select-queries), [max_concurrent_queries_for_user](#max-concurrent-queries-for-user), [max_concurrent_queries_for_all_users](#max-concurrent-queries-for-all-users), [min_marks_to_honor_max_concurrent_queries](#min-marks-to-honor-max-concurrent-queries). -!!! info "Note" - These settings can be modified at runtime and will take effect immediately. Queries that are already running will remain unchanged. +:::note +These settings can be modified at runtime and will take effect immediately. Queries that are already running will remain unchanged. +::: Possible values: @@ -695,8 +701,9 @@ Default value: `100`. The maximum number of simultaneously processed `INSERT` queries. -!!! info "Note" - These settings can be modified at runtime and will take effect immediately. Queries that are already running will remain unchanged. +:::note +These settings can be modified at runtime and will take effect immediately. Queries that are already running will remain unchanged. +::: Possible values: @@ -715,8 +722,9 @@ Default value: `0`. The maximum number of simultaneously processed `SELECT` queries. -!!! info "Note" - These settings can be modified at runtime and will take effect immediately. Queries that are already running will remain unchanged. +:::note +These settings can be modified at runtime and will take effect immediately. Queries that are already running will remain unchanged. +::: Possible values: @@ -1025,8 +1033,9 @@ Use the following parameters to configure logging: The path to the directory containing data. -!!! warning "Warning" - The trailing slash is mandatory. 
+:::note +The trailing slash is mandatory. +::: **Example** @@ -1306,8 +1315,9 @@ Example Path to temporary data for processing large queries. -!!! warning "Note" - The trailing slash is mandatory. +:::note +The trailing slash is mandatory. +::: **Example** @@ -1321,11 +1331,12 @@ Policy from [storage_configuration](../../engines/table-engines/mergetree-family If not set, [tmp_path](#tmp-path) is used, otherwise it is ignored. -!!! note "Note" - - `move_factor` is ignored. - - `keep_free_space_bytes` is ignored. - - `max_data_part_size_bytes` is ignored. - - Уou must have exactly one volume in that policy. +:::note +- `move_factor` is ignored. +- `keep_free_space_bytes` is ignored. +- `max_data_part_size_bytes` is ignored. +- Уou must have exactly one volume in that policy. +::: ## uncompressed_cache_size {#server-settings-uncompressed_cache_size} @@ -1442,10 +1453,11 @@ This setting only applies to the `MergeTree` family. It can be specified: If `use_minimalistic_part_header_in_zookeeper = 1`, then [replicated](../../engines/table-engines/mergetree-family/replication.md) tables store the headers of the data parts compactly using a single `znode`. If the table contains many columns, this storage method significantly reduces the volume of the data stored in Zookeeper. -!!! attention "Attention" - After applying `use_minimalistic_part_header_in_zookeeper = 1`, you can’t downgrade the ClickHouse server to a version that does not support this setting. Be careful when upgrading ClickHouse on servers in a cluster. Don’t upgrade all the servers at once. It is safer to test new versions of ClickHouse in a test environment, or on just a few servers of a cluster. +:::note +After applying `use_minimalistic_part_header_in_zookeeper = 1`, you can’t downgrade the ClickHouse server to a version that does not support this setting. Be careful when upgrading ClickHouse on servers in a cluster. Don’t upgrade all the servers at once. It is safer to test new versions of ClickHouse in a test environment, or on just a few servers of a cluster. - Data part headers already stored with this setting can't be restored to their previous (non-compact) representation. +Data part headers already stored with this setting can't be restored to their previous (non-compact) representation. +::: **Default value:** 0. diff --git a/docs/en/operations/settings/constraints-on-settings.md b/docs/en/operations/settings/constraints-on-settings.md index 338949c5a6a..5adde60a460 100644 --- a/docs/en/operations/settings/constraints-on-settings.md +++ b/docs/en/operations/settings/constraints-on-settings.md @@ -1,6 +1,6 @@ --- -toc_priority: 62 -toc_title: Constraints on Settings +sidebar_position: 62 +sidebar_label: Constraints on Settings --- # Constraints on Settings {#constraints-on-settings} diff --git a/docs/en/operations/settings/index.md b/docs/en/operations/settings/index.md index bca49690025..c371bb0c41a 100644 --- a/docs/en/operations/settings/index.md +++ b/docs/en/operations/settings/index.md @@ -1,10 +1,10 @@ --- -sidebar_label: Introduction -sidebar_position: 27 +sidebar_label: Settings +sidebar_position: 52 slug: index --- -# Settings {#session-settings-intro} +# Settings There are multiple ways to make all the settings described in this section of documentation. 
diff --git a/docs/en/operations/settings/permissions-for-queries.md b/docs/en/operations/settings/permissions-for-queries.md index 47551f288bb..ff63f524b7d 100644 --- a/docs/en/operations/settings/permissions-for-queries.md +++ b/docs/en/operations/settings/permissions-for-queries.md @@ -1,6 +1,6 @@ --- -toc_priority: 58 -toc_title: Permissions for Queries +sidebar_position: 58 +sidebar_label: Permissions for Queries --- # Permissions for Queries {#permissions_for_queries} diff --git a/docs/en/operations/settings/query-complexity.md b/docs/en/operations/settings/query-complexity.md index 3287caacdf8..c0c77bc809a 100644 --- a/docs/en/operations/settings/query-complexity.md +++ b/docs/en/operations/settings/query-complexity.md @@ -1,6 +1,6 @@ --- -toc_priority: 59 -toc_title: Restrictions on Query Complexity +sidebar_position: 59 +sidebar_label: Restrictions on Query Complexity --- # Restrictions on Query Complexity {#restrictions-on-query-complexity} diff --git a/docs/en/operations/settings/settings-profiles.md b/docs/en/operations/settings/settings-profiles.md index 1939b21bfc3..b8e1e3c21c4 100644 --- a/docs/en/operations/settings/settings-profiles.md +++ b/docs/en/operations/settings/settings-profiles.md @@ -1,14 +1,15 @@ --- -toc_priority: 61 -toc_title: Settings Profiles +sidebar_position: 61 +sidebar_label: Settings Profiles --- # Settings Profiles {#settings-profiles} A settings profile is a collection of settings grouped under the same name. -!!! note "Information" - ClickHouse also supports [SQL-driven workflow](../../operations/access-rights.md#access-control) for managing settings profiles. We recommend using it. +:::note +ClickHouse also supports [SQL-driven workflow](../../operations/access-rights.md#access-control) for managing settings profiles. We recommend using it. +::: The profile can have any name. You can specify the same profile for different users. The most important thing you can write in the settings profile is `readonly=1`, which ensures read-only access. diff --git a/docs/en/operations/settings/settings-users.md b/docs/en/operations/settings/settings-users.md index 1a1d2e2a0fa..6a020be2afc 100644 --- a/docs/en/operations/settings/settings-users.md +++ b/docs/en/operations/settings/settings-users.md @@ -1,14 +1,15 @@ --- -toc_priority: 63 -toc_title: User Settings +sidebar_position: 63 +sidebar_label: User Settings --- # User Settings {#user-settings} The `users` section of the `user.xml` configuration file contains user settings. -!!! note "Information" - ClickHouse also supports [SQL-driven workflow](../../operations/access-rights.md#access-control) for managing users. We recommend using it. +:::note +ClickHouse also supports [SQL-driven workflow](../../operations/access-rights.md#access-control) for managing users. We recommend using it. +::: Structure of the `users` section: @@ -116,8 +117,9 @@ To open access for user from any network, specify: ::/0 ``` -!!! warning "Warning" - It’s insecure to open access from any network unless you have a firewall properly configured or the server is not directly connected to Internet. +:::warning +It’s insecure to open access from any network unless you have a firewall properly configured or the server is not directly connected to Internet. 
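The SQL-driven workflow recommended in the settings-profiles and settings-users pages above can be sketched as follows; the profile and user names are hypothetical, and `HOST LOCAL` mirrors the "localhost only" advice rather than opening access to any network:

``` sql
-- Hypothetical names; a sketch of managing access via SQL instead of users.xml.
CREATE SETTINGS PROFILE IF NOT EXISTS readonly_profile SETTINGS readonly = 1;

CREATE USER IF NOT EXISTS report_user
    IDENTIFIED WITH sha256_password BY 'change_me'
    HOST LOCAL                          -- do not expose the account to arbitrary networks
    SETTINGS PROFILE 'readonly_profile';
```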
+::: To open access only from localhost, specify: diff --git a/docs/en/operations/settings/settings.md b/docs/en/operations/settings/settings.md index c04ca5822e6..30d7dd98ee7 100644 --- a/docs/en/operations/settings/settings.md +++ b/docs/en/operations/settings/settings.md @@ -401,8 +401,9 @@ Default value: 1. When performing `INSERT` queries, replace omitted input column values with default values of the respective columns. This option only applies to [JSONEachRow](../../interfaces/formats.md#jsoneachrow), [CSV](../../interfaces/formats.md#csv), [TabSeparated](../../interfaces/formats.md#tabseparated) formats and formats with `WithNames`/`WithNamesAndTypes` suffixes. -!!! note "Note" - When this option is enabled, extended table metadata are sent from server to client. It consumes additional computing resources on the server and can reduce performance. +:::note +When this option is enabled, extended table metadata are sent from server to client. It consumes additional computing resources on the server and can reduce performance. +::: Possible values: @@ -690,8 +691,9 @@ When using `partial_merge` algorithm ClickHouse sorts the data and dumps it to t Changes behaviour of join operations with `ANY` strictness. -!!! warning "Attention" - This setting applies only for `JOIN` operations with [Join](../../engines/table-engines/special/join.md) engine tables. +:::warning +This setting applies only for `JOIN` operations with [Join](../../engines/table-engines/special/join.md) engine tables. +::: Possible values: @@ -762,8 +764,9 @@ Default value: 64. Enables legacy ClickHouse server behaviour in `ANY INNER|LEFT JOIN` operations. -!!! note "Warning" - Use this setting only for backward compatibility if your use cases depend on legacy `JOIN` behaviour. +:::warning +Use this setting only for backward compatibility if your use cases depend on legacy `JOIN` behaviour. +::: When the legacy behaviour enabled: @@ -1137,8 +1140,9 @@ Higher values will lead to higher memory usage. The maximum size of blocks of uncompressed data before compressing for writing to a table. By default, 1,048,576 (1 MiB). Specifying smaller block size generally leads to slightly reduced compression ratio, the compression and decompression speed increases slightly due to cache locality, and memory consumption is reduced. -!!! note "Warning" - This is an expert-level setting, and you shouldn't change it if you're just getting started with ClickHouse. +:::warning +This is an expert-level setting, and you shouldn't change it if you're just getting started with ClickHouse. +::: Don’t confuse blocks for compression (a chunk of memory consisting of bytes) with blocks for query processing (a set of rows from a table). @@ -1154,8 +1158,9 @@ We are writing a UInt32-type column (4 bytes per value). When writing 8192 rows, We are writing a URL column with the String type (average size of 60 bytes per value). When writing 8192 rows, the average will be slightly less than 500 KB of data. Since this is more than 65,536, a compressed block will be formed for each mark. In this case, when reading data from the disk in the range of a single mark, extra data won’t be decompressed. -!!! note "Warning" - This is an expert-level setting, and you shouldn't change it if you're just getting started with ClickHouse. +:::warning +This is an expert-level setting, and you shouldn't change it if you're just getting started with ClickHouse. +::: ## max_query_size {#settings-max_query_size} @@ -1243,8 +1248,9 @@ Default value: `0`. 
Could be used for throttling speed when replicating the data to add or replace new nodes. -!!! note "Note" - 60000000 bytes/s approximatly corresponds to 457 Mbps (60000000 / 1024 / 1024 * 8). +:::note +60000000 bytes/s approximatly corresponds to 457 Mbps (60000000 / 1024 / 1024 * 8). +::: ## max_replicated_sends_network_bandwidth_for_server {#max_replicated_sends_network_bandwidth_for_server} @@ -1263,8 +1269,9 @@ Default value: `0`. Could be used for throttling speed when replicating the data to add or replace new nodes. -!!! note "Note" - 60000000 bytes/s approximatly corresponds to 457 Mbps (60000000 / 1024 / 1024 * 8). +:::note +60000000 bytes/s approximatly corresponds to 457 Mbps (60000000 / 1024 / 1024 * 8). +::: ## connect_timeout_with_failover_ms {#connect-timeout-with-failover-ms} @@ -1426,8 +1433,9 @@ Possible values: Default value: 1. -!!! warning "Warning" - Disable this setting if you use [max_parallel_replicas](#settings-max_parallel_replicas). +:::warning +Disable this setting if you use [max_parallel_replicas](#settings-max_parallel_replicas). +::: ## totals_mode {#totals-mode} @@ -1458,8 +1466,9 @@ This setting is useful for replicated tables with a sampling key. A query may be - The sampling key is an expression that is expensive to calculate. - The cluster latency distribution has a long tail, so that querying more servers increases the query overall latency. -!!! warning "Warning" - This setting will produce incorrect results when joins or subqueries are involved, and all tables don't meet certain requirements. See [Distributed Subqueries and max_parallel_replicas](../../sql-reference/operators/in.md#max_parallel_replica-subqueries) for more details. +:::warning +This setting will produce incorrect results when joins or subqueries are involved, and all tables don't meet certain requirements. See [Distributed Subqueries and max_parallel_replicas](../../sql-reference/operators/in.md#max_parallel_replica-subqueries) for more details. +::: ## compile_expressions {#compile-expressions} @@ -2111,8 +2120,9 @@ See also: - [distributed_push_down_limit](#distributed-push-down-limit) - [optimize_skip_unused_shards](#optimize-skip-unused-shards) -!!! note "Note" - Right now it requires `optimize_skip_unused_shards` (the reason behind this is that one day it may be enabled by default, and it will work correctly only if data was inserted via Distributed table, i.e. data is distributed according to sharding_key). +:::note +Right now it requires `optimize_skip_unused_shards` (the reason behind this is that one day it may be enabled by default, and it will work correctly only if data was inserted via Distributed table, i.e. data is distributed according to sharding_key). +::: ## optimize_throw_if_noop {#setting-optimize_throw_if_noop} @@ -2254,18 +2264,21 @@ Possible values: Default value: 0. -!!! note "Note" - This setting also affects broken batches (that may appears because of abnormal server (machine) termination and no `fsync_after_insert`/`fsync_directories` for [Distributed](../../engines/table-engines/special/distributed.md) table engine). +:::note +This setting also affects broken batches (that may appears because of abnormal server (machine) termination and no `fsync_after_insert`/`fsync_directories` for [Distributed](../../engines/table-engines/special/distributed.md) table engine). +::: -!!! warning "Warning" - You should not rely on automatic batch splitting, since this may hurt performance. 
+:::warning +You should not rely on automatic batch splitting, since this may hurt performance. +::: ## os_thread_priority {#setting-os-thread-priority} Sets the priority ([nice](https://en.wikipedia.org/wiki/Nice_(Unix))) for threads that execute queries. The OS scheduler considers this priority when choosing the next thread to run on each available CPU core. -!!! warning "Warning" - To use this setting, you need to set the `CAP_SYS_NICE` capability. The `clickhouse-server` package sets it up during installation. Some virtual environments do not allow you to set the `CAP_SYS_NICE` capability. In this case, `clickhouse-server` shows a message about it at the start. +:::warning +To use this setting, you need to set the `CAP_SYS_NICE` capability. The `clickhouse-server` package sets it up during installation. Some virtual environments do not allow you to set the `CAP_SYS_NICE` capability. In this case, `clickhouse-server` shows a message about it at the start. +::: Possible values: @@ -2539,9 +2552,10 @@ Possible values: Default value: `1`. -!!! note "Note" - - with `use_compact_format_in_distributed_parts_names=0` changes from cluster definition will not be applied for async INSERT. - - with `use_compact_format_in_distributed_parts_names=1` changing the order of the nodes in the cluster definition, will change the `shard_index`/`replica_index` so be aware. +:::note +- with `use_compact_format_in_distributed_parts_names=0` changes from cluster definition will not be applied for async INSERT. +- with `use_compact_format_in_distributed_parts_names=1` changing the order of the nodes in the cluster definition, will change the `shard_index`/`replica_index` so be aware. +::: ## background_buffer_flush_schedule_pool_size {#background_buffer_flush_schedule_pool_size} @@ -3196,11 +3210,13 @@ Possible values: Default value: `0`. -!!! warning "Warning" - Nullable primary key usually indicates bad design. It is forbidden in almost all main stream DBMS. The feature is mainly for [AggregatingMergeTree](../../engines/table-engines/mergetree-family/aggregatingmergetree.md) and is not heavily tested. Use with care. +:::warning +Nullable primary key usually indicates bad design. It is forbidden in almost all main stream DBMS. The feature is mainly for [AggregatingMergeTree](../../engines/table-engines/mergetree-family/aggregatingmergetree.md) and is not heavily tested. Use with care. +::: -!!! warning "Warning" - Do not enable this feature in version `<= 21.8`. It's not properly implemented and may lead to server crash. +:::warning +Do not enable this feature in version `<= 21.8`. It's not properly implemented and may lead to server crash. +::: ## aggregate_functions_null_for_empty {#aggregate_functions_null_for_empty} diff --git a/docs/en/operations/ssl-zookeeper.md b/docs/en/operations/ssl-zookeeper.md index fe899802348..d6043d521e7 100644 --- a/docs/en/operations/ssl-zookeeper.md +++ b/docs/en/operations/ssl-zookeeper.md @@ -1,6 +1,6 @@ --- -toc_priority: 45 -toc_title: Secured communication with Zookeeper +sidebar_position: 45 +sidebar_label: Secured Communication with Zookeeper --- # Optional secured communication between ClickHouse and Zookeeper {#secured-communication-with-zookeeper} @@ -67,8 +67,7 @@ SELECT * FROM system.zookeeper WHERE path = '/'; On unencrypted connection you will see in `tcpdump` output something like this: ```text -..../zookeeper/q -uota. +..../zookeeper/quota. ``` On encrypted connection you should not see this. 
diff --git a/docs/en/operations/storing-data.md b/docs/en/operations/storing-data.md index beffd45bcbd..2162ae066dd 100644 --- a/docs/en/operations/storing-data.md +++ b/docs/en/operations/storing-data.md @@ -1,6 +1,6 @@ --- -toc_priority: 68 -toc_title: External Disks for Storing Data +sidebar_position: 68 +sidebar_label: External Disks for Storing Data --- # External Disks for Storing Data {#external-disks} diff --git a/docs/en/operations/system-tables/asynchronous_metric_log.md b/docs/en/operations/system-tables/asynchronous_metric_log.md index 273f1f00d71..2233406162b 100644 --- a/docs/en/operations/system-tables/asynchronous_metric_log.md +++ b/docs/en/operations/system-tables/asynchronous_metric_log.md @@ -1,4 +1,4 @@ -## system.asynchronous_metric_log {#system-tables-async-log} +# asynchronous_metric_log {#system-tables-async-log} Contains the historical values for `system.asynchronous_metrics`, which are saved once per minute. Enabled by default. diff --git a/docs/en/operations/system-tables/asynchronous_metrics.md b/docs/en/operations/system-tables/asynchronous_metrics.md index 11255a868cc..162048b06ee 100644 --- a/docs/en/operations/system-tables/asynchronous_metrics.md +++ b/docs/en/operations/system-tables/asynchronous_metrics.md @@ -1,4 +1,4 @@ -# system.asynchronous_metrics {#system_tables-asynchronous_metrics} +# asynchronous_metrics {#system_tables-asynchronous_metrics} Contains metrics that are calculated periodically in the background. For example, the amount of RAM in use. diff --git a/docs/en/operations/system-tables/clusters.md b/docs/en/operations/system-tables/clusters.md index 18a4152df70..776c90b9936 100644 --- a/docs/en/operations/system-tables/clusters.md +++ b/docs/en/operations/system-tables/clusters.md @@ -1,4 +1,4 @@ -# system.clusters {#system-clusters} +# clusters {#system-clusters} Contains information about clusters available in the config file and the servers in them. diff --git a/docs/en/operations/system-tables/columns.md b/docs/en/operations/system-tables/columns.md index 55e4a8284a0..dd5674fe5b1 100644 --- a/docs/en/operations/system-tables/columns.md +++ b/docs/en/operations/system-tables/columns.md @@ -1,4 +1,4 @@ -# system.columns {#system-columns} +# columns {#system-columns} Contains information about columns in all the tables. diff --git a/docs/en/operations/system-tables/contributors.md b/docs/en/operations/system-tables/contributors.md index 0b6e977e0e3..3b76684b44b 100644 --- a/docs/en/operations/system-tables/contributors.md +++ b/docs/en/operations/system-tables/contributors.md @@ -1,4 +1,4 @@ -# system.contributors {#system-contributors} +# contributors {#system-contributors} Contains information about contributors. The order is random at query execution time. diff --git a/docs/en/operations/system-tables/crash-log.md b/docs/en/operations/system-tables/crash-log.md index 404010afc05..be85cb78c9f 100644 --- a/docs/en/operations/system-tables/crash-log.md +++ b/docs/en/operations/system-tables/crash-log.md @@ -1,4 +1,4 @@ -# system.crash_log {#system-tables_crash_log} +# crash_log {#system-tables_crash_log} Contains information about stack traces for fatal errors. The table does not exist in the database by default, it is created only when fatal errors occur. 
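Because `system.crash_log` only appears after a fatal error, it can be useful to check for its existence before querying it. The sketch below is illustrative only and assumes the standard `system` database layout:

```sql
-- Returns 1 only if a fatal error has already created system.crash_log
SELECT count() > 0 AS crash_log_exists
FROM system.tables
WHERE database = 'system' AND name = 'crash_log';
```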
diff --git a/docs/en/operations/system-tables/current-roles.md b/docs/en/operations/system-tables/current-roles.md index c029f367998..81d4fad24a8 100644 --- a/docs/en/operations/system-tables/current-roles.md +++ b/docs/en/operations/system-tables/current-roles.md @@ -1,4 +1,4 @@ -# system.current_roles {#system_tables-current_roles} +# current_roles {#system_tables-current_roles} Contains active roles of a current user. `SET ROLE` changes the contents of this table. diff --git a/docs/en/operations/system-tables/data_skipping_indices.md b/docs/en/operations/system-tables/data_skipping_indices.md index add89ae9144..71dfb046dbb 100644 --- a/docs/en/operations/system-tables/data_skipping_indices.md +++ b/docs/en/operations/system-tables/data_skipping_indices.md @@ -1,4 +1,4 @@ -# system.data_skipping_indices {#system-data-skipping-indices} +# data_skipping_indices {#system-data-skipping-indices} Contains information about existing data skipping indices in all the tables. diff --git a/docs/en/operations/system-tables/data_type_families.md b/docs/en/operations/system-tables/data_type_families.md index 0d11b1cfefb..2e5e7b74c66 100644 --- a/docs/en/operations/system-tables/data_type_families.md +++ b/docs/en/operations/system-tables/data_type_families.md @@ -1,4 +1,4 @@ -# system.data_type_families {#system_tables-data_type_families} +# data_type_families {#system_tables-data_type_families} Contains information about supported [data types](../../sql-reference/data-types/index.md). diff --git a/docs/en/operations/system-tables/databases.md b/docs/en/operations/system-tables/databases.md index 45eebf2ae85..7245ecdcdec 100644 --- a/docs/en/operations/system-tables/databases.md +++ b/docs/en/operations/system-tables/databases.md @@ -1,4 +1,4 @@ -# system.databases {#system-databases} +# databases {#system-databases} Contains information about the databases that are available to the current user. diff --git a/docs/en/operations/system-tables/detached_parts.md b/docs/en/operations/system-tables/detached_parts.md index 7345aa4ba6c..2fe354a4471 100644 --- a/docs/en/operations/system-tables/detached_parts.md +++ b/docs/en/operations/system-tables/detached_parts.md @@ -1,4 +1,4 @@ -# system.detached_parts {#system_tables-detached_parts} +# detached_parts {#system_tables-detached_parts} Contains information about detached parts of [MergeTree](../../engines/table-engines/mergetree-family/mergetree.md) tables. The `reason` column specifies why the part was detached. diff --git a/docs/en/operations/system-tables/dictionaries.md b/docs/en/operations/system-tables/dictionaries.md index 5fd326297c8..c41d506ff0a 100644 --- a/docs/en/operations/system-tables/dictionaries.md +++ b/docs/en/operations/system-tables/dictionaries.md @@ -1,4 +1,4 @@ -# system.dictionaries {#system_tables-dictionaries} +# dictionaries {#system_tables-dictionaries} Contains information about [external dictionaries](../../sql-reference/dictionaries/external-dictionaries/external-dicts.md). diff --git a/docs/en/operations/system-tables/disks.md b/docs/en/operations/system-tables/disks.md index f643e3fcfe1..869c0f3cee5 100644 --- a/docs/en/operations/system-tables/disks.md +++ b/docs/en/operations/system-tables/disks.md @@ -1,4 +1,4 @@ -# system.disks {#system_tables-disks} +# disks {#system_tables-disks} Contains information about disks defined in the [server configuration](../../engines/table-engines/mergetree-family/mergetree.md#table_engine-mergetree-multiple-volumes_configure). 
diff --git a/docs/en/operations/system-tables/distributed_ddl_queue.md b/docs/en/operations/system-tables/distributed_ddl_queue.md index 47899ae5628..0597972197d 100644 --- a/docs/en/operations/system-tables/distributed_ddl_queue.md +++ b/docs/en/operations/system-tables/distributed_ddl_queue.md @@ -1,4 +1,4 @@ -# system.distributed_ddl_queue {#system_tables-distributed_ddl_queue} +# distributed_ddl_queue {#system_tables-distributed_ddl_queue} Contains information about [distributed ddl queries (ON CLUSTER clause)](../../sql-reference/distributed-ddl.md) that were executed on a cluster. diff --git a/docs/en/operations/system-tables/distribution_queue.md b/docs/en/operations/system-tables/distribution_queue.md index 2b0ca536119..231a06458c8 100644 --- a/docs/en/operations/system-tables/distribution_queue.md +++ b/docs/en/operations/system-tables/distribution_queue.md @@ -1,4 +1,4 @@ -# system.distribution_queue {#system_tables-distribution_queue} +# distribution_queue {#system_tables-distribution_queue} Contains information about local files that are in the queue to be sent to the shards. These local files contain new parts that are created by inserting new data into the Distributed table in asynchronous mode. diff --git a/docs/en/operations/system-tables/enabled-roles.md b/docs/en/operations/system-tables/enabled-roles.md index 54569ebbca6..832fc6aba42 100644 --- a/docs/en/operations/system-tables/enabled-roles.md +++ b/docs/en/operations/system-tables/enabled-roles.md @@ -1,4 +1,4 @@ -# system.enabled_roles {#system_tables-enabled_roles} +# enabled_roles {#system_tables-enabled_roles} Contains all active roles at the moment, including current role of the current user and granted roles for current role. diff --git a/docs/en/operations/system-tables/errors.md b/docs/en/operations/system-tables/errors.md index 583cce88ca4..8e60cf93bfa 100644 --- a/docs/en/operations/system-tables/errors.md +++ b/docs/en/operations/system-tables/errors.md @@ -1,4 +1,4 @@ -# system.errors {#system_tables-errors} +# errors {#system_tables-errors} Contains error codes with the number of times they have been triggered. diff --git a/docs/en/operations/system-tables/events.md b/docs/en/operations/system-tables/events.md index 719216a54be..445573ec978 100644 --- a/docs/en/operations/system-tables/events.md +++ b/docs/en/operations/system-tables/events.md @@ -1,4 +1,4 @@ -# system.events {#system_tables-events} +# events {#system_tables-events} Contains information about the number of events that have occurred in the system. For example, in the table, you can find how many `SELECT` queries were processed since the ClickHouse server started. diff --git a/docs/en/operations/system-tables/functions.md b/docs/en/operations/system-tables/functions.md index 0f1a6184ae1..097b6ccd22a 100644 --- a/docs/en/operations/system-tables/functions.md +++ b/docs/en/operations/system-tables/functions.md @@ -1,4 +1,4 @@ -# system.functions {#system-functions} +# functions {#system-functions} Contains information about normal and aggregate functions. diff --git a/docs/en/operations/system-tables/grants.md b/docs/en/operations/system-tables/grants.md index bd0d8c3c5b8..c848972c2d8 100644 --- a/docs/en/operations/system-tables/grants.md +++ b/docs/en/operations/system-tables/grants.md @@ -1,4 +1,4 @@ -# system.grants {#system_tables-grants} +# grants {#system_tables-grants} Privileges granted to ClickHouse user accounts. 
diff --git a/docs/en/operations/system-tables/graphite_retentions.md b/docs/en/operations/system-tables/graphite_retentions.md index af35da1f6e5..10e265815f4 100644 --- a/docs/en/operations/system-tables/graphite_retentions.md +++ b/docs/en/operations/system-tables/graphite_retentions.md @@ -1,4 +1,4 @@ -# system.graphite_retentions {#system-graphite-retentions} +# graphite_retentions {#system-graphite-retentions} Contains information about parameters [graphite_rollup](../../operations/server-configuration-parameters/settings.md#server_configuration_parameters-graphite) which are used in tables with [\*GraphiteMergeTree](../../engines/table-engines/mergetree-family/graphitemergetree.md) engines. diff --git a/docs/en/operations/system-tables/index.md b/docs/en/operations/system-tables/index.md index 5e8418d0af3..7b977ab4d51 100644 --- a/docs/en/operations/system-tables/index.md +++ b/docs/en/operations/system-tables/index.md @@ -1,6 +1,6 @@ --- -toc_priority: 52 -toc_title: System Tables +sidebar_position: 52 +sidebar_label: System Tables --- # System Tables {#system-tables} diff --git a/docs/en/operations/system-tables/licenses.md b/docs/en/operations/system-tables/licenses.md index caef97697a6..fad6e16fd8a 100644 --- a/docs/en/operations/system-tables/licenses.md +++ b/docs/en/operations/system-tables/licenses.md @@ -1,4 +1,4 @@ -# system.licenses {#system-tables_system.licenses} +# licenses Сontains licenses of third-party libraries that are located in the [contrib](https://github.com/ClickHouse/ClickHouse/tree/master/contrib) directory of ClickHouse sources. diff --git a/docs/en/operations/system-tables/merge_tree_settings.md b/docs/en/operations/system-tables/merge_tree_settings.md index 1f24bdbe0cf..0324d5c633d 100644 --- a/docs/en/operations/system-tables/merge_tree_settings.md +++ b/docs/en/operations/system-tables/merge_tree_settings.md @@ -1,4 +1,4 @@ -# system.merge_tree_settings {#system-merge_tree_settings} +# merge_tree_settings {#system-merge_tree_settings} Contains information about settings for `MergeTree` tables. diff --git a/docs/en/operations/system-tables/merges.md b/docs/en/operations/system-tables/merges.md index e9ca30d5f2c..f512e00fc89 100644 --- a/docs/en/operations/system-tables/merges.md +++ b/docs/en/operations/system-tables/merges.md @@ -1,4 +1,4 @@ -# system.merges {#system-merges} +# merges {#system-merges} Contains information about merges and part mutations currently in process for tables in the MergeTree family. diff --git a/docs/en/operations/system-tables/metric_log.md b/docs/en/operations/system-tables/metric_log.md index 875e443d0a6..55b0d800ead 100644 --- a/docs/en/operations/system-tables/metric_log.md +++ b/docs/en/operations/system-tables/metric_log.md @@ -1,4 +1,4 @@ -# system.metric_log {#system_tables-metric_log} +# metric_log {#system_tables-metric_log} Contains history of metrics values from tables `system.metrics` and `system.events`, periodically flushed to disk. diff --git a/docs/en/operations/system-tables/metrics.md b/docs/en/operations/system-tables/metrics.md index 21e5923e3a0..d4e06e1aca6 100644 --- a/docs/en/operations/system-tables/metrics.md +++ b/docs/en/operations/system-tables/metrics.md @@ -1,4 +1,4 @@ -# system.metrics {#system_tables-metrics} +# metrics {#system_tables-metrics} Contains metrics which can be calculated instantly, or have a current value. For example, the number of simultaneously processed queries or the current replica delay. This table is always up to date. 
diff --git a/docs/en/operations/system-tables/mutations.md b/docs/en/operations/system-tables/mutations.md index 66ce500f213..507146d93de 100644 --- a/docs/en/operations/system-tables/mutations.md +++ b/docs/en/operations/system-tables/mutations.md @@ -1,4 +1,4 @@ -# system.mutations {#system_tables-mutations} +# mutations {#system_tables-mutations} The table contains information about [mutations](../../sql-reference/statements/alter/index.md#mutations) of [MergeTree](../../engines/table-engines/mergetree-family/mergetree.md) tables and their progress. Each mutation command is represented by a single row. @@ -28,8 +28,9 @@ Columns: - `1` if the mutation is completed, - `0` if the mutation is still in process. -!!! info "Note" - Even if `parts_to_do = 0` it is possible that a mutation of a replicated table is not completed yet because of a long-running `INSERT` query, that will create a new data part needed to be mutated. +:::note +Even if `parts_to_do = 0` it is possible that a mutation of a replicated table is not completed yet because of a long-running `INSERT` query, that will create a new data part needed to be mutated. +::: If there were problems with mutating some data parts, the following columns contain additional information: diff --git a/docs/en/operations/system-tables/numbers.md b/docs/en/operations/system-tables/numbers.md index 774fdf86b76..29828bfe796 100644 --- a/docs/en/operations/system-tables/numbers.md +++ b/docs/en/operations/system-tables/numbers.md @@ -1,4 +1,4 @@ -# system.numbers {#system-numbers} +# numbers {#system-numbers} This table contains a single UInt64 column named `number` that contains almost all the natural numbers starting from zero. diff --git a/docs/en/operations/system-tables/numbers_mt.md b/docs/en/operations/system-tables/numbers_mt.md index 978a6565b71..02155db4711 100644 --- a/docs/en/operations/system-tables/numbers_mt.md +++ b/docs/en/operations/system-tables/numbers_mt.md @@ -1,4 +1,4 @@ -# system.numbers_mt {#system-numbers-mt} +# numbers_mt {#system-numbers-mt} The same as [system.numbers](../../operations/system-tables/numbers.md) but reads are parallelized. The numbers can be returned in any order. diff --git a/docs/en/operations/system-tables/one.md b/docs/en/operations/system-tables/one.md index 293f0412955..9b84c0bfcd6 100644 --- a/docs/en/operations/system-tables/one.md +++ b/docs/en/operations/system-tables/one.md @@ -1,4 +1,4 @@ -# system.one {#system-one} +# one {#system-one} This table contains a single row with a single `dummy` UInt8 column containing the value 0. diff --git a/docs/en/operations/system-tables/opentelemetry_span_log.md b/docs/en/operations/system-tables/opentelemetry_span_log.md index 521c155d0f7..89af72d6620 100644 --- a/docs/en/operations/system-tables/opentelemetry_span_log.md +++ b/docs/en/operations/system-tables/opentelemetry_span_log.md @@ -1,4 +1,4 @@ -# system.opentelemetry_span_log {#system_tables-opentelemetry_span_log} +# opentelemetry_span_log {#system_tables-opentelemetry_span_log} Contains information about [trace spans](https://opentracing.io/docs/overview/spans/) for executed queries. 
diff --git a/docs/en/operations/system-tables/part_log.md b/docs/en/operations/system-tables/part_log.md index 1fb5b12e87e..00eaca23862 100644 --- a/docs/en/operations/system-tables/part_log.md +++ b/docs/en/operations/system-tables/part_log.md @@ -1,4 +1,4 @@ -# system.part_log {#system_tables-part-log} +# part_log {#system_tables-part-log} The `system.part_log` table is created only if the [part_log](../../operations/server-configuration-parameters/settings.md#server_configuration_parameters-part-log) server setting is specified. diff --git a/docs/en/operations/system-tables/parts.md b/docs/en/operations/system-tables/parts.md index cf0f93ecdd6..845c63e5626 100644 --- a/docs/en/operations/system-tables/parts.md +++ b/docs/en/operations/system-tables/parts.md @@ -1,4 +1,4 @@ -# system.parts {#system_tables-parts} +# parts {#system_tables-parts} Contains information about parts of [MergeTree](../../engines/table-engines/mergetree-family/mergetree.md) tables. @@ -96,8 +96,9 @@ Columns: - `move_ttl_info.expression` ([Array](../../sql-reference/data-types/array.md)([String](../../sql-reference/data-types/string.md))) — Array of expressions. Each expression defines a [TTL MOVE rule](../../engines/table-engines/mergetree-family/mergetree.md#table_engine-mergetree-ttl). - !!! note "Warning" - The `move_ttl_info.expression` array is kept mostly for backward compatibility, now the simpliest way to check `TTL MOVE` rule is to use the `move_ttl_info.min` and `move_ttl_info.max` fields. +:::warning +The `move_ttl_info.expression` array is kept mostly for backward compatibility, now the simpliest way to check `TTL MOVE` rule is to use the `move_ttl_info.min` and `move_ttl_info.max` fields. +::: - `move_ttl_info.min` ([Array](../../sql-reference/data-types/array.md)([DateTime](../../sql-reference/data-types/datetime.md))) — Array of date and time values. Each element describes the minimum key value for a [TTL MOVE rule](../../engines/table-engines/mergetree-family/mergetree.md#table_engine-mergetree-ttl). diff --git a/docs/en/operations/system-tables/parts_columns.md b/docs/en/operations/system-tables/parts_columns.md index 0be4324bab5..e87be3fcd43 100644 --- a/docs/en/operations/system-tables/parts_columns.md +++ b/docs/en/operations/system-tables/parts_columns.md @@ -1,4 +1,4 @@ -# system.parts_columns {#system_tables-parts_columns} +# parts_columns {#system_tables-parts_columns} Contains information about parts and columns of [MergeTree](../../engines/table-engines/mergetree-family/mergetree.md) tables. diff --git a/docs/en/operations/system-tables/processes.md b/docs/en/operations/system-tables/processes.md index ee8daf0e5bf..f261ee9b696 100644 --- a/docs/en/operations/system-tables/processes.md +++ b/docs/en/operations/system-tables/processes.md @@ -1,4 +1,4 @@ -# system.processes {#system_tables-processes} +# processes {#system_tables-processes} This system table is used for implementing the `SHOW PROCESSLIST` query. diff --git a/docs/en/operations/system-tables/query_log.md b/docs/en/operations/system-tables/query_log.md index e3aab04f7dd..a8fda41f7c2 100644 --- a/docs/en/operations/system-tables/query_log.md +++ b/docs/en/operations/system-tables/query_log.md @@ -1,9 +1,10 @@ -# system.query_log {#system_tables-query_log} +# query_log {#system_tables-query_log} Contains information about executed queries, for example, start time, duration of processing, error messages. -!!! note "Note" - This table does not contain the ingested data for `INSERT` queries. 
+:::note +This table does not contain the ingested data for `INSERT` queries. +::: You can change settings of queries logging in the [query_log](../../operations/server-configuration-parameters/settings.md#server_configuration_parameters-query-log) section of the server configuration. diff --git a/docs/en/operations/system-tables/query_thread_log.md b/docs/en/operations/system-tables/query_thread_log.md index aa064d675ce..072a311b7db 100644 --- a/docs/en/operations/system-tables/query_thread_log.md +++ b/docs/en/operations/system-tables/query_thread_log.md @@ -1,4 +1,4 @@ -# system.query_thread_log {#system_tables-query_thread_log} +# query_thread_log {#system_tables-query_thread_log} Contains information about threads that execute queries, for example, thread name, thread start time, duration of query processing. diff --git a/docs/en/operations/system-tables/query_views_log.md b/docs/en/operations/system-tables/query_views_log.md index 6a6bbef45e2..5aa69522869 100644 --- a/docs/en/operations/system-tables/query_views_log.md +++ b/docs/en/operations/system-tables/query_views_log.md @@ -1,4 +1,4 @@ -# system.query_views_log {#system_tables-query_views_log} +# query_views_log {#system_tables-query_views_log} Contains information about the dependent views executed when running a query, for example, the view type or the execution time. diff --git a/docs/en/operations/system-tables/quota_limits.md b/docs/en/operations/system-tables/quota_limits.md index 708c4e4e33e..e1873ecfa92 100644 --- a/docs/en/operations/system-tables/quota_limits.md +++ b/docs/en/operations/system-tables/quota_limits.md @@ -1,4 +1,4 @@ -# system.quota_limits {#system_tables-quota_limits} +# quota_limits {#system_tables-quota_limits} Contains information about maximums for all intervals of all quotas. Any number of rows or zero can correspond to one quota. diff --git a/docs/en/operations/system-tables/quota_usage.md b/docs/en/operations/system-tables/quota_usage.md index 6f4d3c9c8ee..ad9f9b8c44f 100644 --- a/docs/en/operations/system-tables/quota_usage.md +++ b/docs/en/operations/system-tables/quota_usage.md @@ -1,4 +1,4 @@ -# system.quota_usage {#system_tables-quota_usage} +# quota_usage {#system_tables-quota_usage} Quota usage by the current user: how much is used and how much is left. diff --git a/docs/en/operations/system-tables/quotas.md b/docs/en/operations/system-tables/quotas.md index bdcc13340f0..0a435919b14 100644 --- a/docs/en/operations/system-tables/quotas.md +++ b/docs/en/operations/system-tables/quotas.md @@ -1,4 +1,4 @@ -# system.quotas {#system_tables-quotas} +# quotas {#system_tables-quotas} Contains information about [quotas](../../operations/system-tables/quotas.md). diff --git a/docs/en/operations/system-tables/quotas_usage.md b/docs/en/operations/system-tables/quotas_usage.md index 7d39af0f601..43811a75187 100644 --- a/docs/en/operations/system-tables/quotas_usage.md +++ b/docs/en/operations/system-tables/quotas_usage.md @@ -1,4 +1,4 @@ -# system.quotas_usage {#system_tables-quotas_usage} +# quotas_usage {#system_tables-quotas_usage} Quota usage by all users. diff --git a/docs/en/operations/system-tables/replicas.md b/docs/en/operations/system-tables/replicas.md index ff1057ca8b1..6ec0f184e15 100644 --- a/docs/en/operations/system-tables/replicas.md +++ b/docs/en/operations/system-tables/replicas.md @@ -1,4 +1,4 @@ -# system.replicas {#system_tables-replicas} +# replicas {#system_tables-replicas} Contains information and status for replicated tables residing on the local server. 
This table can be used for monitoring. The table contains a row for every Replicated\* table. diff --git a/docs/en/operations/system-tables/replicated_fetches.md b/docs/en/operations/system-tables/replicated_fetches.md index 241bb609853..438d1572109 100644 --- a/docs/en/operations/system-tables/replicated_fetches.md +++ b/docs/en/operations/system-tables/replicated_fetches.md @@ -1,4 +1,4 @@ -# system.replicated_fetches {#system_tables-replicated_fetches} +# replicated_fetches {#system_tables-replicated_fetches} Contains information about currently running background fetches. diff --git a/docs/en/operations/system-tables/replication_queue.md b/docs/en/operations/system-tables/replication_queue.md index 0e1d7792996..a8a51162dae 100644 --- a/docs/en/operations/system-tables/replication_queue.md +++ b/docs/en/operations/system-tables/replication_queue.md @@ -1,4 +1,4 @@ -# system.replication_queue {#system_tables-replication_queue} +# replication_queue {#system_tables-replication_queue} Contains information about tasks from replication queues stored in ZooKeeper for tables in the `ReplicatedMergeTree` family. diff --git a/docs/en/operations/system-tables/role-grants.md b/docs/en/operations/system-tables/role-grants.md index 6da221af1a4..cb0c5bf0b0b 100644 --- a/docs/en/operations/system-tables/role-grants.md +++ b/docs/en/operations/system-tables/role-grants.md @@ -1,4 +1,4 @@ -#system.role_grants {#system_tables-role_grants} +# role_grants Contains the role grants for users and roles. To add entries to this table, use `GRANT role TO user`. diff --git a/docs/en/operations/system-tables/roles.md b/docs/en/operations/system-tables/roles.md index 7a71270b6c4..1f8fe349c7f 100644 --- a/docs/en/operations/system-tables/roles.md +++ b/docs/en/operations/system-tables/roles.md @@ -1,4 +1,4 @@ -# system.roles {#system_tables-roles} +# roles {#system_tables-roles} Contains information about configured [roles](../../operations/access-rights.md#role-management). diff --git a/docs/en/operations/system-tables/row_policies.md b/docs/en/operations/system-tables/row_policies.md index 95a26efe952..2bff037751b 100644 --- a/docs/en/operations/system-tables/row_policies.md +++ b/docs/en/operations/system-tables/row_policies.md @@ -1,4 +1,4 @@ -# system.row_policies {#system_tables-row_policies} +# row_policies {#system_tables-row_policies} Contains filters for one particular table, as well as a list of roles and/or users which should use this row policy. diff --git a/docs/en/operations/system-tables/session_log.md b/docs/en/operations/system-tables/session_log.md index cf69fd8518e..9ee7e294bfd 100644 --- a/docs/en/operations/system-tables/session_log.md +++ b/docs/en/operations/system-tables/session_log.md @@ -1,4 +1,4 @@ -# system.session_log {#system_tables-session_log} +# session_log {#system_tables-session_log} Contains information about all successful and failed login and logout events. diff --git a/docs/en/operations/system-tables/settings.md b/docs/en/operations/system-tables/settings.md index 5d5eda2abc1..ce6f3cd4724 100644 --- a/docs/en/operations/system-tables/settings.md +++ b/docs/en/operations/system-tables/settings.md @@ -1,4 +1,4 @@ -# system.settings {#system-tables-system-settings} +# settings {#system-tables-system-settings} Contains information about session settings for current user. 
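For illustration, the settings that were changed from their defaults in the current session can be listed as below (the `name`, `value`, `changed` and `description` columns are assumed from recent releases):

```sql
-- Session settings that differ from their default values
SELECT name, value, description
FROM system.settings
WHERE changed
LIMIT 20;
```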
diff --git a/docs/en/operations/system-tables/settings_profile_elements.md b/docs/en/operations/system-tables/settings_profile_elements.md index d812d8f74e0..5a010d6239a 100644 --- a/docs/en/operations/system-tables/settings_profile_elements.md +++ b/docs/en/operations/system-tables/settings_profile_elements.md @@ -1,4 +1,4 @@ -# system.settings_profile_elements {#system_tables-settings_profile_elements} +# settings_profile_elements {#system_tables-settings_profile_elements} Describes the content of the settings profile: diff --git a/docs/en/operations/system-tables/settings_profiles.md b/docs/en/operations/system-tables/settings_profiles.md index f9b62cf5194..ab2020b375d 100644 --- a/docs/en/operations/system-tables/settings_profiles.md +++ b/docs/en/operations/system-tables/settings_profiles.md @@ -1,4 +1,4 @@ -# system.settings_profiles {#system_tables-settings_profiles} +# settings_profiles {#system_tables-settings_profiles} Contains properties of configured setting profiles. diff --git a/docs/en/operations/system-tables/stack_trace.md b/docs/en/operations/system-tables/stack_trace.md index e2135e4beb6..2aa1c5af125 100644 --- a/docs/en/operations/system-tables/stack_trace.md +++ b/docs/en/operations/system-tables/stack_trace.md @@ -1,4 +1,4 @@ -# system.stack_trace {#system-tables_stack_trace} +# stack_trace {#system-tables_stack_trace} Contains stack traces of all server threads. Allows developers to introspect the server state. diff --git a/docs/en/operations/system-tables/storage_policies.md b/docs/en/operations/system-tables/storage_policies.md index c9d2659c289..adbb2f8434e 100644 --- a/docs/en/operations/system-tables/storage_policies.md +++ b/docs/en/operations/system-tables/storage_policies.md @@ -1,4 +1,4 @@ -# system.storage_policies {#system_tables-storage_policies} +# storage_policies {#system_tables-storage_policies} Contains information about storage policies and volumes defined in the [server configuration](../../engines/table-engines/mergetree-family/mergetree.md#table_engine-mergetree-multiple-volumes_configure). diff --git a/docs/en/operations/system-tables/table_engines.md b/docs/en/operations/system-tables/table_engines.md index 57fb5b0ff37..d3ac8da1d70 100644 --- a/docs/en/operations/system-tables/table_engines.md +++ b/docs/en/operations/system-tables/table_engines.md @@ -1,4 +1,4 @@ -# system.table_engines {#system-table-engines} +# table_engines {#system-table-engines} Contains description of table engines supported by server and their feature support information. diff --git a/docs/en/operations/system-tables/tables.md b/docs/en/operations/system-tables/tables.md index 0ccf69bc048..8286d51aed6 100644 --- a/docs/en/operations/system-tables/tables.md +++ b/docs/en/operations/system-tables/tables.md @@ -1,4 +1,4 @@ -# system.tables {#system-tables} +# tables {#system-tables} Contains metadata of each table that the server knows about. diff --git a/docs/en/operations/system-tables/text_log.md b/docs/en/operations/system-tables/text_log.md index 9ed0aa1ee5b..e4967dc8d0b 100644 --- a/docs/en/operations/system-tables/text_log.md +++ b/docs/en/operations/system-tables/text_log.md @@ -1,4 +1,4 @@ -# system.text_log {#system_tables-text_log} +# text_log {#system_tables-text_log} Contains logging entries. The logging level which goes to this table can be limited to the `text_log.level` server setting. 
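As an illustrative sketch (column names such as `event_time`, `level`, `logger_name` and `message` are assumed and worth double-checking on your version), the most recent captured log entries can be read with:

```sql
-- Latest server log entries stored in the table
SELECT event_time, level, logger_name, message
FROM system.text_log
ORDER BY event_time DESC
LIMIT 10;
```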
diff --git a/docs/en/operations/system-tables/time_zones.md b/docs/en/operations/system-tables/time_zones.md index 68f16a665cc..899e115152f 100644 --- a/docs/en/operations/system-tables/time_zones.md +++ b/docs/en/operations/system-tables/time_zones.md @@ -1,4 +1,4 @@ -# system.time_zones {#system-time_zones} +# time_zones {#system-time_zones} Contains a list of time zones that are supported by the ClickHouse server. This list of timezones might vary depending on the version of ClickHouse. diff --git a/docs/en/operations/system-tables/trace_log.md b/docs/en/operations/system-tables/trace_log.md index ab08ef7415c..ace5662e919 100644 --- a/docs/en/operations/system-tables/trace_log.md +++ b/docs/en/operations/system-tables/trace_log.md @@ -1,4 +1,4 @@ -# system.trace_log {#system_tables-trace_log} +# trace_log {#system_tables-trace_log} Contains stack traces collected by the sampling query profiler. diff --git a/docs/en/operations/system-tables/users.md b/docs/en/operations/system-tables/users.md index e6ab63b9df5..95691f4497c 100644 --- a/docs/en/operations/system-tables/users.md +++ b/docs/en/operations/system-tables/users.md @@ -1,4 +1,4 @@ -# system.users {#system_tables-users} +# users {#system_tables-users} Contains a list of [user accounts](../../operations/access-rights.md#user-account-management) configured at the server. diff --git a/docs/en/operations/system-tables/zookeeper.md b/docs/en/operations/system-tables/zookeeper.md index 4be12309240..e8232483f6f 100644 --- a/docs/en/operations/system-tables/zookeeper.md +++ b/docs/en/operations/system-tables/zookeeper.md @@ -1,4 +1,4 @@ -# system.zookeeper {#system-zookeeper} +# zookeeper {#system-zookeeper} The table does not exist if ZooKeeper is not configured. Allows reading data from the ZooKeeper cluster defined in the config. The query must either have a ‘path =’ condition or a `path IN` condition set with the `WHERE` clause as shown below. This corresponds to the path of the children in ZooKeeper that you want to get data for. diff --git a/docs/en/operations/system-tables/zookeeper_log.md b/docs/en/operations/system-tables/zookeeper_log.md index f7d86c6689b..919c4245d5d 100644 --- a/docs/en/operations/system-tables/zookeeper_log.md +++ b/docs/en/operations/system-tables/zookeeper_log.md @@ -1,4 +1,4 @@ -# system.zookeeper_log {#system-zookeeper_log} +# zookeeper_log {#system-zookeeper_log} This table contains information about the parameters of the request to the ZooKeeper server and the response from it. diff --git a/docs/en/operations/tips.md b/docs/en/operations/tips.md index c676c54a223..c727c636579 100644 --- a/docs/en/operations/tips.md +++ b/docs/en/operations/tips.md @@ -1,6 +1,6 @@ --- -toc_priority: 58 -toc_title: Usage Recommendations +sidebar_position: 58 +sidebar_label: Usage Recommendations --- # Usage Recommendations {#usage-recommendations} @@ -33,8 +33,9 @@ $ echo 0 | sudo tee /proc/sys/vm/overcommit_memory Use `perf top` to watch the time spent in the kernel for memory management. Permanent huge pages also do not need to be allocated. -!!! warning "Attention" - If your system has less than 16 GB of RAM, you may experience various memory exceptions because default settings do not match this amount of memory. The recommended amount of RAM is 32 GB or more. You can use ClickHouse in a system with a small amount of RAM, even with 2 GB of RAM, but it requires additional tuning and can ingest at a low rate. 
+:::warning +If your system has less than 16 GB of RAM, you may experience various memory exceptions because default settings do not match this amount of memory. The recommended amount of RAM is 32 GB or more. You can use ClickHouse in a system with a small amount of RAM, even with 2 GB of RAM, but it requires additional tuning and can ingest at a low rate. +::: ## Storage Subsystem {#storage-subsystem} @@ -275,4 +276,4 @@ end script If you use antivirus software configure it to skip folders with Clickhouse datafiles (`/var/lib/clickhouse`) otherwise performance may be reduced and you may experience unexpected errors during data ingestion and background merges. -{## [Original article](https://clickhouse.com/docs/en/operations/tips/) ##} +[Original article](https://clickhouse.com/docs/en/operations/tips/) diff --git a/docs/en/operations/troubleshooting.md b/docs/en/operations/troubleshooting.md index f2695ce8437..e0efe4f57f5 100644 --- a/docs/en/operations/troubleshooting.md +++ b/docs/en/operations/troubleshooting.md @@ -1,6 +1,6 @@ --- -toc_priority: 46 -toc_title: Troubleshooting +sidebar_position: 46 +sidebar_label: Troubleshooting --- # Troubleshooting {#troubleshooting} diff --git a/docs/en/operations/update.md b/docs/en/operations/update.md index ffb646ffce2..fb5fb7803a9 100644 --- a/docs/en/operations/update.md +++ b/docs/en/operations/update.md @@ -1,6 +1,6 @@ --- -toc_priority: 47 -toc_title: ClickHouse Upgrade +sidebar_position: 47 +sidebar_label: ClickHouse Upgrade --- # ClickHouse Upgrade {#clickhouse-upgrade} @@ -15,8 +15,9 @@ $ sudo service clickhouse-server restart If you installed ClickHouse using something other than the recommended `deb` packages, use the appropriate update method. -!!! note "Note" - You can update multiple servers at once as soon as there is no moment when all replicas of one shard are offline. +:::note +You can update multiple servers at once as soon as there is no moment when all replicas of one shard are offline. +::: The upgrade of older version of ClickHouse to specific version: diff --git a/docs/en/operations/utilities/clickhouse-benchmark.md b/docs/en/operations/utilities/clickhouse-benchmark.md index 6aa5ea556fc..3a52ec92dc3 100644 --- a/docs/en/operations/utilities/clickhouse-benchmark.md +++ b/docs/en/operations/utilities/clickhouse-benchmark.md @@ -1,9 +1,9 @@ --- -toc_priority: 61 -toc_title: clickhouse-benchmark +sidebar_position: 61 +sidebar_label: clickhouse-benchmark --- -# clickhouse-benchmark {#clickhouse-benchmark} +# clickhouse-benchmark Connects to a ClickHouse server and repeatedly sends specified queries. diff --git a/docs/en/operations/utilities/clickhouse-compressor.md b/docs/en/operations/utilities/clickhouse-compressor.md index 44a1f052824..2f8f4794ba8 100644 --- a/docs/en/operations/utilities/clickhouse-compressor.md +++ b/docs/en/operations/utilities/clickhouse-compressor.md @@ -1,4 +1,5 @@ -## ClickHouse compressor + +# clickhouse-compressor Simple program for data compression and decompression. 
diff --git a/docs/en/operations/utilities/clickhouse-copier.md b/docs/en/operations/utilities/clickhouse-copier.md index 6587d45abd9..f152c177992 100644 --- a/docs/en/operations/utilities/clickhouse-copier.md +++ b/docs/en/operations/utilities/clickhouse-copier.md @@ -1,14 +1,15 @@ --- -toc_priority: 59 -toc_title: clickhouse-copier +sidebar_position: 59 +sidebar_label: clickhouse-copier --- -# clickhouse-copier {#clickhouse-copier} +# clickhouse-copier Copies data from the tables in one cluster to tables in another (or the same) cluster. -!!! warning "Warning" - To get a consistent copy, the data in the source tables and partitions should not change during the entire process. +:::warning +To get a consistent copy, the data in the source tables and partitions should not change during the entire process. +::: You can run multiple `clickhouse-copier` instances on different servers to perform the same job. ZooKeeper is used for syncing the processes. diff --git a/docs/en/operations/utilities/clickhouse-format.md b/docs/en/operations/utilities/clickhouse-format.md index 333f127e125..219a170fc23 100644 --- a/docs/en/operations/utilities/clickhouse-format.md +++ b/docs/en/operations/utilities/clickhouse-format.md @@ -1,9 +1,4 @@ ---- -toc_priority: 65 -toc_title: clickhouse-format ---- - -# clickhouse-format {#clickhouse-format} +# clickhouse-format Allows formatting input queries. diff --git a/docs/en/operations/utilities/clickhouse-local.md b/docs/en/operations/utilities/clickhouse-local.md index 9d28dffbc16..3c35ab933e2 100644 --- a/docs/en/operations/utilities/clickhouse-local.md +++ b/docs/en/operations/utilities/clickhouse-local.md @@ -1,9 +1,9 @@ --- -toc_priority: 60 -toc_title: clickhouse-local +sidebar_position: 60 +sidebar_label: clickhouse-local --- -# clickhouse-local {#clickhouse-local} +# clickhouse-local The `clickhouse-local` program enables you to perform fast processing on local files, without having to deploy and configure the ClickHouse server. @@ -13,8 +13,9 @@ Accepts data that represent tables and queries them using [ClickHouse SQL dialec By default `clickhouse-local` does not have access to data on the same host, but it supports loading server configuration using `--config-file` argument. -!!! warning "Warning" - It is not recommended to load production server configuration into `clickhouse-local` because data can be damaged in case of human error. +:::warning +It is not recommended to load production server configuration into `clickhouse-local` because data can be damaged in case of human error. +::: For temporary data, a unique temporary data directory is created by default. diff --git a/docs/en/operations/utilities/clickhouse-obfuscator.md b/docs/en/operations/utilities/clickhouse-obfuscator.md index b01a7624b56..baa0f19dda9 100644 --- a/docs/en/operations/utilities/clickhouse-obfuscator.md +++ b/docs/en/operations/utilities/clickhouse-obfuscator.md @@ -1,4 +1,4 @@ -# ClickHouse obfuscator +# clickhouse-obfuscator A simple tool for table data obfuscation. 
diff --git a/docs/en/operations/utilities/index.md b/docs/en/operations/utilities/index.md index e307f9fde0c..7fdc783f9c4 100644 --- a/docs/en/operations/utilities/index.md +++ b/docs/en/operations/utilities/index.md @@ -1,10 +1,9 @@ --- -toc_folder_title: Utilities -toc_priority: 56 -toc_title: Overview +sidebar_position: 56 +sidebar_label: Utilities --- -# ClickHouse Utility {#clickhouse-utility} +# ClickHouse Utility - [clickhouse-local](../../operations/utilities/clickhouse-local.md) — Allows running SQL queries on data without starting the ClickHouse server, similar to how `awk` does this. - [clickhouse-copier](../../operations/utilities/clickhouse-copier.md) — Copies (and reshards) data from one cluster to another cluster. From 0af6fdb5765e49c8c3fbebae3e6d031c26a772e6 Mon Sep 17 00:00:00 2001 From: taiyang-li <654010905@qq.com> Date: Wed, 30 Mar 2022 11:28:21 +0800 Subject: [PATCH 063/239] fix building --- src/Storages/Hive/HiveFile.cpp | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/src/Storages/Hive/HiveFile.cpp b/src/Storages/Hive/HiveFile.cpp index d1765da6b28..407d9602b61 100644 --- a/src/Storages/Hive/HiveFile.cpp +++ b/src/Storages/Hive/HiveFile.cpp @@ -135,8 +135,8 @@ void HiveOrcFile::prepareReader() void HiveOrcFile::prepareColumnMapping() { const orc::Type & type = reader->GetRawORCReader()->getType(); - size_t size = type.getSubtypeCount(); - for (size_t pos = 0; pos < size; pos++) + size_t count = type.getSubtypeCount(); + for (size_t pos = 0; pos < count; pos++) { /// Column names in hive is case-insensitive. String column{type.getFieldName(pos)}; @@ -156,9 +156,9 @@ std::unique_ptr HiveOrcFile::buildMinMaxIndex(c if (!statistics) return nullptr; - size_t size = index_names_and_types.size(); + size_t range_num = index_names_and_types.size(); auto idx = std::make_unique(); - idx->hyperrectangle.resize(size); + idx->hyperrectangle.resize(range_num); size_t i = 0; for (const auto & name_type : index_names_and_types) From 560471f991a0231162c7892b0471d8eabbd967b6 Mon Sep 17 00:00:00 2001 From: rfraposa Date: Tue, 29 Mar 2022 22:06:21 -0600 Subject: [PATCH 064/239] Update /sql-reference docs --- docs/en/development/architecture.md | 12 ++--- docs/en/development/build-osx.md | 5 ++- .../en/engines/database-engines/replicated.md | 5 ++- docs/en/example-datasets/nyc-taxi.md | 5 ++- docs/en/example-datasets/ontime.md | 5 ++- docs/en/example-datasets/star-schema.md | 5 ++- docs/en/install.md | 7 +-- docs/en/sql-reference/_category_.yml | 5 +-- .../aggregate-functions/combinators.md | 4 +- .../aggregate-functions/index.md | 7 ++- .../parametric-functions.md | 14 +++--- .../aggregate-functions/reference/any.md | 2 +- .../aggregate-functions/reference/anyheavy.md | 2 +- .../aggregate-functions/reference/anylast.md | 2 +- .../aggregate-functions/reference/argmax.md | 2 +- .../aggregate-functions/reference/argmin.md | 2 +- .../aggregate-functions/reference/avg.md | 2 +- .../reference/avgweighted.md | 2 +- .../reference/categoricalinformationvalue.md | 2 +- .../aggregate-functions/reference/corr.md | 7 +-- .../aggregate-functions/reference/count.md | 2 +- .../aggregate-functions/reference/covarpop.md | 7 +-- .../reference/covarsamp.md | 7 +-- .../aggregate-functions/reference/deltasum.md | 7 +-- .../reference/deltasumtimestamp.md | 2 +- .../aggregate-functions/reference/entropy.md | 2 +- .../reference/exponentialmovingaverage.md | 2 +- .../reference/grouparray.md | 2 +- .../reference/grouparrayinsertat.md | 2 +- .../reference/grouparraymovingavg.md | 2 +- 
.../reference/grouparraymovingsum.md | 2 +- .../reference/grouparraysample.md | 2 +- .../reference/groupbitand.md | 2 +- .../reference/groupbitmap.md | 2 +- .../reference/groupbitmapand.md | 2 +- .../reference/groupbitmapor.md | 2 +- .../reference/groupbitmapxor.md | 2 +- .../reference/groupbitor.md | 2 +- .../reference/groupbitxor.md | 2 +- .../reference/groupuniqarray.md | 2 +- .../aggregate-functions/reference/index.md | 2 +- .../reference/intervalLengthSum.md | 9 ++-- .../aggregate-functions/reference/kurtpop.md | 2 +- .../aggregate-functions/reference/kurtsamp.md | 2 +- .../reference/mannwhitneyutest.md | 4 +- .../aggregate-functions/reference/max.md | 2 +- .../aggregate-functions/reference/maxmap.md | 2 +- .../reference/meanztest.md | 4 +- .../aggregate-functions/reference/median.md | 2 +- .../aggregate-functions/reference/min.md | 2 +- .../aggregate-functions/reference/minmap.md | 2 +- .../aggregate-functions/reference/quantile.md | 2 +- .../reference/quantilebfloat16.md | 2 +- .../reference/quantiledeterministic.md | 2 +- .../reference/quantileexact.md | 2 +- .../reference/quantileexactweighted.md | 2 +- .../reference/quantiles.md | 2 +- .../reference/quantiletdigest.md | 2 +- .../reference/quantiletdigestweighted.md | 7 +-- .../reference/quantiletiming.md | 12 ++--- .../reference/quantiletimingweighted.md | 12 ++--- .../aggregate-functions/reference/rankCorr.md | 2 +- .../reference/simplelinearregression.md | 2 +- .../aggregate-functions/reference/skewpop.md | 2 +- .../aggregate-functions/reference/skewsamp.md | 2 +- .../aggregate-functions/reference/sparkbar.md | 4 +- .../reference/stddevpop.md | 7 +-- .../reference/stddevsamp.md | 7 +-- .../reference/stochasticlinearregression.md | 2 +- .../reference/stochasticlogisticregression.md | 2 +- .../reference/studentttest.md | 4 +- .../aggregate-functions/reference/sum.md | 2 +- .../aggregate-functions/reference/sumcount.md | 2 +- .../aggregate-functions/reference/sumkahan.md | 2 +- .../aggregate-functions/reference/summap.md | 2 +- .../reference/sumwithoverflow.md | 2 +- .../aggregate-functions/reference/topk.md | 2 +- .../reference/topkweighted.md | 2 +- .../aggregate-functions/reference/uniq.md | 2 +- .../reference/uniqcombined.md | 7 +-- .../reference/uniqcombined64.md | 2 +- .../reference/uniqexact.md | 2 +- .../reference/uniqhll12.md | 2 +- .../reference/uniqthetasketch.md | 2 +- .../aggregate-functions/reference/varpop.md | 7 +-- .../aggregate-functions/reference/varsamp.md | 7 +-- .../reference/welchttest.md | 4 +- docs/en/sql-reference/ansi.md | 9 ++-- .../data-types/aggregatefunction.md | 4 +- docs/en/sql-reference/data-types/array.md | 4 +- docs/en/sql-reference/data-types/boolean.md | 4 +- docs/en/sql-reference/data-types/date.md | 4 +- docs/en/sql-reference/data-types/date32.md | 4 +- docs/en/sql-reference/data-types/datetime.md | 4 +- .../en/sql-reference/data-types/datetime64.md | 4 +- docs/en/sql-reference/data-types/decimal.md | 4 +- .../sql-reference/data-types/domains/index.md | 5 +-- .../sql-reference/data-types/domains/ipv4.md | 4 +- .../sql-reference/data-types/domains/ipv6.md | 4 +- docs/en/sql-reference/data-types/enum.md | 4 +- .../sql-reference/data-types/fixedstring.md | 4 +- docs/en/sql-reference/data-types/float.md | 4 +- docs/en/sql-reference/data-types/geo.md | 9 ++-- docs/en/sql-reference/data-types/index.md | 7 ++- docs/en/sql-reference/data-types/int-uint.md | 4 +- .../data-types/lowcardinality.md | 4 +- docs/en/sql-reference/data-types/map.md | 4 +- .../data-types/multiword-types.md | 4 +- 
.../nested-data-structures/index.md | 6 +-- .../nested-data-structures/nested.md | 4 +- docs/en/sql-reference/data-types/nullable.md | 9 ++-- .../data-types/simpleaggregatefunction.md | 7 +-- .../special-data-types/expression.md | 4 +- .../data-types/special-data-types/index.md | 6 +-- .../data-types/special-data-types/interval.md | 9 ++-- .../data-types/special-data-types/nothing.md | 4 +- .../data-types/special-data-types/set.md | 4 +- docs/en/sql-reference/data-types/string.md | 4 +- docs/en/sql-reference/data-types/tuple.md | 4 +- docs/en/sql-reference/data-types/uuid.md | 4 +- .../external-dictionaries/_category_.yml | 7 +++ .../external-dicts-dict-hierarchical.md | 6 +-- .../external-dicts-dict-layout.md | 16 ++++--- .../external-dicts-dict-lifetime.md | 6 +-- .../external-dicts-dict-polygon.md | 6 +-- .../external-dicts-dict-sources.md | 36 ++++++++------- .../external-dicts-dict-structure.md | 18 ++++---- .../external-dicts-dict.md | 6 +-- .../external-dictionaries/external-dicts.md | 11 ++--- .../external-dictionaries/index.md | 6 --- docs/en/sql-reference/dictionaries/index.md | 5 +-- .../dictionaries/internal-dicts.md | 4 +- docs/en/sql-reference/distributed-ddl.md | 9 ++-- .../functions/arithmetic-functions.md | 4 +- .../functions/array-functions.md | 19 ++++---- docs/en/sql-reference/functions/array-join.md | 4 +- .../sql-reference/functions/bit-functions.md | 4 +- .../functions/bitmap-functions.md | 4 +- .../functions/comparison-functions.md | 4 +- .../functions/conditional-functions.md | 4 +- .../functions/date-time-functions.md | 14 +++--- .../functions/encoding-functions.md | 14 +++--- .../functions/encryption-functions.md | 4 +- .../functions/ext-dict-functions.md | 9 ++-- docs/en/sql-reference/functions/files.md | 4 +- .../functions/functions-for-nulls.md | 4 +- .../functions/geo/coordinates.md | 4 +- .../en/sql-reference/functions/geo/geohash.md | 12 ++--- docs/en/sql-reference/functions/geo/h3.md | 2 +- docs/en/sql-reference/functions/geo/index.md | 6 +-- docs/en/sql-reference/functions/geo/s2.md | 2 +- .../sql-reference/functions/hash-functions.md | 4 +- .../sql-reference/functions/in-functions.md | 4 +- docs/en/sql-reference/functions/index.md | 7 ++- .../sql-reference/functions/introspection.md | 9 ++-- .../functions/ip-address-functions.md | 4 +- .../sql-reference/functions/json-functions.md | 19 ++++---- .../functions/logical-functions.md | 4 +- .../functions/machine-learning-functions.md | 4 +- .../sql-reference/functions/math-functions.md | 4 +- .../sql-reference/functions/nlp-functions.md | 9 ++-- .../functions/other-functions.md | 30 +++++++------ .../functions/random-functions.md | 9 ++-- .../functions/rounding-functions.md | 4 +- .../functions/splitting-merging-functions.md | 4 +- .../functions/string-functions.md | 9 ++-- .../functions/string-replace-functions.md | 9 ++-- .../functions/string-search-functions.md | 44 +++++++++++-------- .../functions/time-window-functions.md | 4 +- .../functions/tuple-functions.md | 4 +- .../functions/tuple-map-functions.md | 4 +- .../functions/type-conversion-functions.md | 14 +++--- .../sql-reference/functions/url-functions.md | 4 +- .../sql-reference/functions/uuid-functions.md | 4 +- .../functions/ym-dict-functions.md | 4 +- docs/en/sql-reference/index.md | 6 +-- docs/en/sql-reference/operators/exists.md | 5 ++- docs/en/sql-reference/operators/in.md | 5 ++- docs/en/sql-reference/operators/index.md | 15 ++++--- .../sql-reference/statements/alter/column.md | 9 ++-- .../sql-reference/statements/alter/comment.md | 4 +- 
.../statements/alter/constraint.md | 9 ++-- .../sql-reference/statements/alter/delete.md | 9 ++-- .../sql-reference/statements/alter/index.md | 16 ++++--- .../statements/alter/index/index.md | 9 ++-- .../statements/alter/order-by.md | 9 ++-- .../statements/alter/partition.md | 14 +++--- .../statements/alter/projection.md | 9 ++-- .../sql-reference/statements/alter/quota.md | 4 +- .../en/sql-reference/statements/alter/role.md | 4 +- .../statements/alter/row-policy.md | 4 +- .../statements/alter/sample-by.md | 9 ++-- .../sql-reference/statements/alter/setting.md | 10 ++--- .../statements/alter/settings-profile.md | 4 +- docs/en/sql-reference/statements/alter/ttl.md | 4 +- .../sql-reference/statements/alter/update.md | 9 ++-- .../en/sql-reference/statements/alter/user.md | 4 +- .../en/sql-reference/statements/alter/view.md | 4 +- docs/en/sql-reference/statements/attach.md | 4 +- .../sql-reference/statements/check-table.md | 4 +- .../statements/create/database.md | 4 +- .../statements/create/dictionary.md | 4 +- .../statements/create/function.md | 4 +- .../sql-reference/statements/create/index.md | 7 ++- .../sql-reference/statements/create/quota.md | 4 +- .../sql-reference/statements/create/role.md | 4 +- .../statements/create/row-policy.md | 22 +++++----- .../statements/create/settings-profile.md | 4 +- .../sql-reference/statements/create/table.md | 39 +++++++++------- .../sql-reference/statements/create/user.md | 10 ++--- .../sql-reference/statements/create/view.md | 39 ++++++++-------- .../statements/describe-table.md | 4 +- docs/en/sql-reference/statements/detach.md | 4 +- docs/en/sql-reference/statements/drop.md | 4 +- docs/en/sql-reference/statements/exchange.md | 9 ++-- docs/en/sql-reference/statements/exists.md | 4 +- docs/en/sql-reference/statements/explain.md | 15 ++++--- docs/en/sql-reference/statements/grant.md | 4 +- docs/en/sql-reference/statements/index.md | 7 ++- .../sql-reference/statements/insert-into.md | 6 +-- docs/en/sql-reference/statements/kill.md | 4 +- docs/en/sql-reference/statements/misc.md | 2 +- docs/en/sql-reference/statements/optimize.md | 22 ++++++---- docs/en/sql-reference/statements/rename.md | 9 ++-- docs/en/sql-reference/statements/revoke.md | 4 +- .../en/sql-reference/statements/select/all.md | 2 +- .../statements/select/array-join.md | 2 +- .../statements/select/distinct.md | 2 +- .../sql-reference/statements/select/except.md | 2 +- .../sql-reference/statements/select/format.md | 2 +- .../sql-reference/statements/select/from.md | 2 +- .../statements/select/group-by.md | 22 ++++++---- .../sql-reference/statements/select/having.md | 2 +- .../sql-reference/statements/select/index.md | 8 ++-- .../statements/select/intersect.md | 2 +- .../statements/select/into-outfile.md | 2 +- .../sql-reference/statements/select/join.md | 17 ++++--- .../statements/select/limit-by.md | 7 +-- .../sql-reference/statements/select/limit.md | 7 +-- .../sql-reference/statements/select/offset.md | 12 ++--- .../statements/select/order-by.md | 2 +- .../statements/select/prewhere.md | 7 +-- .../sql-reference/statements/select/sample.md | 7 +-- .../sql-reference/statements/select/union.md | 2 +- .../sql-reference/statements/select/where.md | 7 +-- .../sql-reference/statements/select/with.md | 2 +- docs/en/sql-reference/statements/set-role.md | 4 +- docs/en/sql-reference/statements/set.md | 4 +- docs/en/sql-reference/statements/show.md | 9 ++-- docs/en/sql-reference/statements/system.md | 14 +++--- docs/en/sql-reference/statements/truncate.md | 9 ++-- 
docs/en/sql-reference/statements/use.md | 4 +- docs/en/sql-reference/statements/watch.md | 16 +++---- docs/en/sql-reference/syntax.md | 4 +- .../sql-reference/table-functions/cluster.md | 9 ++-- .../table-functions/dictionary.md | 4 +- docs/en/sql-reference/table-functions/file.md | 9 ++-- .../sql-reference/table-functions/generate.md | 4 +- docs/en/sql-reference/table-functions/hdfs.md | 9 ++-- .../table-functions/hdfsCluster.md | 9 ++-- .../en/sql-reference/table-functions/index.md | 12 ++--- .../en/sql-reference/table-functions/input.md | 4 +- docs/en/sql-reference/table-functions/jdbc.md | 4 +- .../en/sql-reference/table-functions/merge.md | 4 +- .../en/sql-reference/table-functions/mysql.md | 9 ++-- docs/en/sql-reference/table-functions/null.md | 4 +- .../sql-reference/table-functions/numbers.md | 4 +- docs/en/sql-reference/table-functions/odbc.md | 4 +- .../table-functions/postgresql.md | 14 +++--- .../sql-reference/table-functions/remote.md | 4 +- docs/en/sql-reference/table-functions/s3.md | 9 ++-- .../table-functions/s3Cluster.md | 9 ++-- .../sql-reference/table-functions/sqlite.md | 4 +- docs/en/sql-reference/table-functions/url.md | 4 +- docs/en/sql-reference/table-functions/view.md | 4 +- .../sql-reference/window-functions/index.md | 4 +- 276 files changed, 903 insertions(+), 787 deletions(-) create mode 100644 docs/en/sql-reference/dictionaries/external-dictionaries/_category_.yml delete mode 100644 docs/en/sql-reference/dictionaries/external-dictionaries/index.md diff --git a/docs/en/development/architecture.md b/docs/en/development/architecture.md index d824ace0c65..b5cb6c321ac 100644 --- a/docs/en/development/architecture.md +++ b/docs/en/development/architecture.md @@ -155,8 +155,9 @@ The server initializes the `Context` class with the necessary environment for qu We maintain full backward and forward compatibility for the server TCP protocol: old clients can talk to new servers, and new clients can talk to old servers. But we do not want to maintain it eternally, and we are removing support for old versions after about one year. -!!! note "Note" - For most external applications, we recommend using the HTTP interface because it is simple and easy to use. The TCP protocol is more tightly linked to internal data structures: it uses an internal format for passing blocks of data, and it uses custom framing for compressed data. We haven’t released a C library for that protocol because it requires linking most of the ClickHouse codebase, which is not practical. +:::note +For most external applications, we recommend using the HTTP interface because it is simple and easy to use. The TCP protocol is more tightly linked to internal data structures: it uses an internal format for passing blocks of data, and it uses custom framing for compressed data. We haven’t released a C library for that protocol because it requires linking most of the ClickHouse codebase, which is not practical. +::: ## Distributed Query Execution {#distributed-query-execution} @@ -194,7 +195,8 @@ Replication is physical: only compressed parts are transferred between nodes, no Besides, each replica stores its state in ZooKeeper as the set of parts and its checksums. When the state on the local filesystem diverges from the reference state in ZooKeeper, the replica restores its consistency by downloading missing and broken parts from other replicas. When there is some unexpected or broken data in the local filesystem, ClickHouse does not remove it, but moves it to a separate directory and forgets it. -!!! 
note "Note" - The ClickHouse cluster consists of independent shards, and each shard consists of replicas. The cluster is **not elastic**, so after adding a new shard, data is not rebalanced between shards automatically. Instead, the cluster load is supposed to be adjusted to be uneven. This implementation gives you more control, and it is ok for relatively small clusters, such as tens of nodes. But for clusters with hundreds of nodes that we are using in production, this approach becomes a significant drawback. We should implement a table engine that spans across the cluster with dynamically replicated regions that could be split and balanced between clusters automatically. +:::note +The ClickHouse cluster consists of independent shards, and each shard consists of replicas. The cluster is **not elastic**, so after adding a new shard, data is not rebalanced between shards automatically. Instead, the cluster load is supposed to be adjusted to be uneven. This implementation gives you more control, and it is ok for relatively small clusters, such as tens of nodes. But for clusters with hundreds of nodes that we are using in production, this approach becomes a significant drawback. We should implement a table engine that spans across the cluster with dynamically replicated regions that could be split and balanced between clusters automatically. +::: -{## [Original article](https://clickhouse.com/docs/en/development/architecture/) ##} +[Original article](https://clickhouse.com/docs/en/development/architecture/) diff --git a/docs/en/development/build-osx.md b/docs/en/development/build-osx.md index 5d5706f6e6b..05ef10ad020 100644 --- a/docs/en/development/build-osx.md +++ b/docs/en/development/build-osx.md @@ -94,8 +94,9 @@ cmake --build . --config RelWithDebInfo If you intend to run `clickhouse-server`, make sure to increase the system’s maxfiles variable. -!!! info "Note" - You’ll need to use sudo. +:::note +You’ll need to use sudo. +::: To do so, create the `/Library/LaunchDaemons/limit.maxfiles.plist` file with the following content: diff --git a/docs/en/engines/database-engines/replicated.md b/docs/en/engines/database-engines/replicated.md index 07d6fcd9ece..63d955dc889 100644 --- a/docs/en/engines/database-engines/replicated.md +++ b/docs/en/engines/database-engines/replicated.md @@ -20,8 +20,9 @@ One ClickHouse server can have multiple replicated databases running and updatin - `shard_name` — Shard name. Database replicas are grouped into shards by `shard_name`. - `replica_name` — Replica name. Replica names must be different for all replicas of the same shard. -!!! note "Warning" - For [ReplicatedMergeTree](../table-engines/mergetree-family/replication.md#table_engines-replication) tables if no arguments provided, then default arguments are used: `/clickhouse/tables/{uuid}/{shard}` and `{replica}`. These can be changed in the server settings [default_replica_path](../../operations/server-configuration-parameters/settings.md#default_replica_path) and [default_replica_name](../../operations/server-configuration-parameters/settings.md#default_replica_name). Macro `{uuid}` is unfolded to table's uuid, `{shard}` and `{replica}` are unfolded to values from server config, not from database engine arguments. But in the future, it will be possible to use `shard_name` and `replica_name` of Replicated database. 
+:::warning +For [ReplicatedMergeTree](../table-engines/mergetree-family/replication.md#table_engines-replication) tables, if no arguments are provided, then default arguments are used: `/clickhouse/tables/{uuid}/{shard}` and `{replica}`. These can be changed in the server settings [default_replica_path](../../operations/server-configuration-parameters/settings.md#default_replica_path) and [default_replica_name](../../operations/server-configuration-parameters/settings.md#default_replica_name). Macro `{uuid}` is unfolded to table's uuid, `{shard}` and `{replica}` are unfolded to values from server config, not from database engine arguments. But in the future, it will be possible to use `shard_name` and `replica_name` of Replicated database. +::: ## Specifics and Recommendations {#specifics-and-recommendations} diff --git a/docs/en/example-datasets/nyc-taxi.md b/docs/en/example-datasets/nyc-taxi.md index da7be71d46b..270aeb4929c 100644 --- a/docs/en/example-datasets/nyc-taxi.md +++ b/docs/en/example-datasets/nyc-taxi.md @@ -290,8 +290,9 @@ $ sudo service clickhouse-server restart $ clickhouse-client --query "select count(*) from datasets.trips_mergetree" ``` -!!! info "Info" - If you will run the queries described below, you have to use the full table name, `datasets.trips_mergetree`. +:::info +If you run the queries described below, you have to use the full table name, `datasets.trips_mergetree`. +::: ## Results on Single Server {#results-on-single-server} diff --git a/docs/en/example-datasets/ontime.md b/docs/en/example-datasets/ontime.md index 51df6186bd5..bb3c3644972 100644 --- a/docs/en/example-datasets/ontime.md +++ b/docs/en/example-datasets/ontime.md @@ -156,8 +156,9 @@ $ sudo service clickhouse-server restart $ clickhouse-client --query "select count(*) from datasets.ontime" ``` -!!! info "Info" - If you will run the queries described below, you have to use the full table name, `datasets.ontime`. +:::note +If you run the queries described below, you have to use the full table name, `datasets.ontime`. +::: ## Queries {#queries} diff --git a/docs/en/example-datasets/star-schema.md b/docs/en/example-datasets/star-schema.md index a8949ef74b9..35ff492c360 100644 --- a/docs/en/example-datasets/star-schema.md +++ b/docs/en/example-datasets/star-schema.md @@ -17,8 +17,9 @@ $ make Generating data: -!!! warning "Attention" - With `-s 100` dbgen generates 600 million rows (67 GB), while while `-s 1000` it generates 6 billion rows (which takes a lot of time) +:::warning +With `-s 100` dbgen generates 600 million rows (67 GB), while with `-s 1000` it generates 6 billion rows (which takes a lot of time) +::: ``` bash $ ./dbgen -s 1000 -T c diff --git a/docs/en/install.md b/docs/en/install.md index 35021b5bb8d..a5405143d77 100644 --- a/docs/en/install.md +++ b/docs/en/install.md @@ -69,9 +69,10 @@ You can also download and install packages manually from [here](https://packages - `clickhouse-client` — Creates a symbolic link for `clickhouse-client` and other client-related tools. and installs client configuration files. - `clickhouse-common-static-dbg` — Installs ClickHouse compiled binary files with debug info. -!!!
attention "Attention" - If you need to install specific version of ClickHouse you have to install all packages with the same version: - `sudo apt-get install clickhouse-server=21.8.5.7 clickhouse-client=21.8.5.7 clickhouse-common-static=21.8.5.7` +:::info +If you need to install specific version of ClickHouse you have to install all packages with the same version: +`sudo apt-get install clickhouse-server=21.8.5.7 clickhouse-client=21.8.5.7 clickhouse-common-static=21.8.5.7` +::: ### From RPM Packages {#from-rpm-packages} diff --git a/docs/en/sql-reference/_category_.yml b/docs/en/sql-reference/_category_.yml index cfddcf46548..049ba20f1f5 100644 --- a/docs/en/sql-reference/_category_.yml +++ b/docs/en/sql-reference/_category_.yml @@ -1,7 +1,4 @@ position: 15 label: 'SQL Reference' collapsible: true -collapsed: true -link: - type: generated-index - title: SQL Reference \ No newline at end of file +collapsed: true \ No newline at end of file diff --git a/docs/en/sql-reference/aggregate-functions/combinators.md b/docs/en/sql-reference/aggregate-functions/combinators.md index daa93adaaae..6a8c178919c 100644 --- a/docs/en/sql-reference/aggregate-functions/combinators.md +++ b/docs/en/sql-reference/aggregate-functions/combinators.md @@ -1,6 +1,6 @@ --- -toc_priority: 37 -toc_title: Combinators +sidebar_position: 37 +sidebar_label: Combinators --- # Aggregate Function Combinators {#aggregate_functions_combinators} diff --git a/docs/en/sql-reference/aggregate-functions/index.md b/docs/en/sql-reference/aggregate-functions/index.md index d2b46f6de53..1e6cc0f88c2 100644 --- a/docs/en/sql-reference/aggregate-functions/index.md +++ b/docs/en/sql-reference/aggregate-functions/index.md @@ -1,10 +1,9 @@ --- -toc_folder_title: Aggregate Functions -toc_priority: 33 -toc_title: Introduction +sidebar_label: Aggregate Functions +sidebar_position: 33 --- -# Aggregate Functions {#aggregate-functions} +# Aggregate Functions Aggregate functions work in the [normal](http://www.sql-tutorial.com/sql-aggregate-functions-sql-tutorial) way as expected by database experts. diff --git a/docs/en/sql-reference/aggregate-functions/parametric-functions.md b/docs/en/sql-reference/aggregate-functions/parametric-functions.md index 3adedd0ae70..7708bcb8129 100644 --- a/docs/en/sql-reference/aggregate-functions/parametric-functions.md +++ b/docs/en/sql-reference/aggregate-functions/parametric-functions.md @@ -1,6 +1,6 @@ --- -toc_priority: 38 -toc_title: Parametric +sidebar_position: 38 +sidebar_label: Parametric --- # Parametric Aggregate Functions {#aggregate_functions_parametric} @@ -89,8 +89,9 @@ Checks whether the sequence contains an event chain that matches the pattern. sequenceMatch(pattern)(timestamp, cond1, cond2, ...) ``` -!!! warning "Warning" - Events that occur at the same second may lay in the sequence in an undefined order affecting the result. +:::warning +Events that occur at the same second may lay in the sequence in an undefined order affecting the result. +::: **Arguments** @@ -174,8 +175,9 @@ SELECT sequenceMatch('(?1)(?2)')(time, number = 1, number = 2, number = 4) FROM Counts the number of event chains that matched the pattern. The function searches event chains that do not overlap. It starts to search for the next chain after the current chain is matched. -!!! warning "Warning" - Events that occur at the same second may lay in the sequence in an undefined order affecting the result. +:::warning +Events that occur at the same second may lay in the sequence in an undefined order affecting the result. 
+::: ``` sql sequenceCount(pattern)(timestamp, cond1, cond2, ...) diff --git a/docs/en/sql-reference/aggregate-functions/reference/any.md b/docs/en/sql-reference/aggregate-functions/reference/any.md index 16306597983..3b5539c5b8d 100644 --- a/docs/en/sql-reference/aggregate-functions/reference/any.md +++ b/docs/en/sql-reference/aggregate-functions/reference/any.md @@ -1,5 +1,5 @@ --- -toc_priority: 6 +sidebar_position: 6 --- # any {#agg_function-any} diff --git a/docs/en/sql-reference/aggregate-functions/reference/anyheavy.md b/docs/en/sql-reference/aggregate-functions/reference/anyheavy.md index 5c9d6875c51..491754453e3 100644 --- a/docs/en/sql-reference/aggregate-functions/reference/anyheavy.md +++ b/docs/en/sql-reference/aggregate-functions/reference/anyheavy.md @@ -1,5 +1,5 @@ --- -toc_priority: 103 +sidebar_position: 103 --- # anyHeavy {#anyheavyx} diff --git a/docs/en/sql-reference/aggregate-functions/reference/anylast.md b/docs/en/sql-reference/aggregate-functions/reference/anylast.md index 43ac72318f2..2a01a587f70 100644 --- a/docs/en/sql-reference/aggregate-functions/reference/anylast.md +++ b/docs/en/sql-reference/aggregate-functions/reference/anylast.md @@ -1,5 +1,5 @@ --- -toc_priority: 104 +sidebar_position: 104 --- ## anyLast {#anylastx} diff --git a/docs/en/sql-reference/aggregate-functions/reference/argmax.md b/docs/en/sql-reference/aggregate-functions/reference/argmax.md index 0630e2f585e..f09bcd0bba2 100644 --- a/docs/en/sql-reference/aggregate-functions/reference/argmax.md +++ b/docs/en/sql-reference/aggregate-functions/reference/argmax.md @@ -1,5 +1,5 @@ --- -toc_priority: 106 +sidebar_position: 106 --- # argMax {#agg-function-argmax} diff --git a/docs/en/sql-reference/aggregate-functions/reference/argmin.md b/docs/en/sql-reference/aggregate-functions/reference/argmin.md index a259a76b7d7..926fda5a512 100644 --- a/docs/en/sql-reference/aggregate-functions/reference/argmin.md +++ b/docs/en/sql-reference/aggregate-functions/reference/argmin.md @@ -1,5 +1,5 @@ --- -toc_priority: 105 +sidebar_position: 105 --- # argMin {#agg-function-argmin} diff --git a/docs/en/sql-reference/aggregate-functions/reference/avg.md b/docs/en/sql-reference/aggregate-functions/reference/avg.md index 9a22faedf7c..b7b5e9fbed4 100644 --- a/docs/en/sql-reference/aggregate-functions/reference/avg.md +++ b/docs/en/sql-reference/aggregate-functions/reference/avg.md @@ -1,5 +1,5 @@ --- -toc_priority: 5 +sidebar_position: 5 --- # avg {#agg_function-avg} diff --git a/docs/en/sql-reference/aggregate-functions/reference/avgweighted.md b/docs/en/sql-reference/aggregate-functions/reference/avgweighted.md index 5f4d846e81b..126c0c2f1d7 100644 --- a/docs/en/sql-reference/aggregate-functions/reference/avgweighted.md +++ b/docs/en/sql-reference/aggregate-functions/reference/avgweighted.md @@ -1,5 +1,5 @@ --- -toc_priority: 107 +sidebar_position: 107 --- # avgWeighted {#avgweighted} diff --git a/docs/en/sql-reference/aggregate-functions/reference/categoricalinformationvalue.md b/docs/en/sql-reference/aggregate-functions/reference/categoricalinformationvalue.md index 2e9001dec19..e836dbe868a 100644 --- a/docs/en/sql-reference/aggregate-functions/reference/categoricalinformationvalue.md +++ b/docs/en/sql-reference/aggregate-functions/reference/categoricalinformationvalue.md @@ -1,5 +1,5 @@ --- -toc_priority: 250 +sidebar_position: 250 --- # categoricalInformationValue {#categoricalinformationvalue} diff --git a/docs/en/sql-reference/aggregate-functions/reference/corr.md 
b/docs/en/sql-reference/aggregate-functions/reference/corr.md index 88f9295a8f2..c6d7fd5baed 100644 --- a/docs/en/sql-reference/aggregate-functions/reference/corr.md +++ b/docs/en/sql-reference/aggregate-functions/reference/corr.md @@ -1,5 +1,5 @@ --- -toc_priority: 107 +sidebar_position: 107 --- # corr {#corrx-y} @@ -8,5 +8,6 @@ Syntax: `corr(x, y)` Calculates the Pearson correlation coefficient: `Σ((x - x̅)(y - y̅)) / sqrt(Σ((x - x̅)^2) * Σ((y - y̅)^2))`. -!!! note "Note" - This function uses a numerically unstable algorithm. If you need [numerical stability](https://en.wikipedia.org/wiki/Numerical_stability) in calculations, use the `corrStable` function. It works slower but provides a lower computational error. +:::note +This function uses a numerically unstable algorithm. If you need [numerical stability](https://en.wikipedia.org/wiki/Numerical_stability) in calculations, use the `corrStable` function. It works slower but provides a lower computational error. +::: \ No newline at end of file diff --git a/docs/en/sql-reference/aggregate-functions/reference/count.md b/docs/en/sql-reference/aggregate-functions/reference/count.md index 073fd267c42..8df4aef9d03 100644 --- a/docs/en/sql-reference/aggregate-functions/reference/count.md +++ b/docs/en/sql-reference/aggregate-functions/reference/count.md @@ -1,5 +1,5 @@ --- -toc_priority: 1 +sidebar_position: 1 --- # count {#agg_function-count} diff --git a/docs/en/sql-reference/aggregate-functions/reference/covarpop.md b/docs/en/sql-reference/aggregate-functions/reference/covarpop.md index 2a7d805763e..363a98c3f16 100644 --- a/docs/en/sql-reference/aggregate-functions/reference/covarpop.md +++ b/docs/en/sql-reference/aggregate-functions/reference/covarpop.md @@ -1,5 +1,5 @@ --- -toc_priority: 36 +sidebar_position: 36 --- # covarPop {#covarpop} @@ -8,5 +8,6 @@ Syntax: `covarPop(x, y)` Calculates the value of `Σ((x - x̅)(y - y̅)) / n`. -!!! note "Note" - This function uses a numerically unstable algorithm. If you need [numerical stability](https://en.wikipedia.org/wiki/Numerical_stability) in calculations, use the `covarPopStable` function. It works slower but provides a lower computational error. +:::note +This function uses a numerically unstable algorithm. If you need [numerical stability](https://en.wikipedia.org/wiki/Numerical_stability) in calculations, use the `covarPopStable` function. It works slower but provides a lower computational error. +::: \ No newline at end of file diff --git a/docs/en/sql-reference/aggregate-functions/reference/covarsamp.md b/docs/en/sql-reference/aggregate-functions/reference/covarsamp.md index 4bdb1b02d40..977b3f3b5b4 100644 --- a/docs/en/sql-reference/aggregate-functions/reference/covarsamp.md +++ b/docs/en/sql-reference/aggregate-functions/reference/covarsamp.md @@ -1,5 +1,5 @@ --- -toc_priority: 37 +sidebar_position: 37 --- # covarSamp {#covarsamp} @@ -8,5 +8,6 @@ Calculates the value of `Σ((x - x̅)(y - y̅)) / (n - 1)`. Returns Float64. When `n <= 1`, returns +∞. -!!! note "Note" - This function uses a numerically unstable algorithm. If you need [numerical stability](https://en.wikipedia.org/wiki/Numerical_stability) in calculations, use the `covarSampStable` function. It works slower but provides a lower computational error. +:::note +This function uses a numerically unstable algorithm. If you need [numerical stability](https://en.wikipedia.org/wiki/Numerical_stability) in calculations, use the `covarSampStable` function. It works slower but provides a lower computational error. 
+::: \ No newline at end of file diff --git a/docs/en/sql-reference/aggregate-functions/reference/deltasum.md b/docs/en/sql-reference/aggregate-functions/reference/deltasum.md index 2945084db77..ac35938e26d 100644 --- a/docs/en/sql-reference/aggregate-functions/reference/deltasum.md +++ b/docs/en/sql-reference/aggregate-functions/reference/deltasum.md @@ -1,13 +1,14 @@ --- -toc_priority: 141 +sidebar_position: 141 --- # deltaSum {#agg_functions-deltasum} Sums the arithmetic difference between consecutive rows. If the difference is negative, it is ignored. -!!! info "Note" - The underlying data must be sorted for this function to work properly. If you would like to use this function in a [materialized view](../../../sql-reference/statements/create/view.md#materialized), you most likely want to use the [deltaSumTimestamp](../../../sql-reference/aggregate-functions/reference/deltasumtimestamp.md#agg_functions-deltasumtimestamp) method instead. +:::note +The underlying data must be sorted for this function to work properly. If you would like to use this function in a [materialized view](../../../sql-reference/statements/create/view.md#materialized), you most likely want to use the [deltaSumTimestamp](../../../sql-reference/aggregate-functions/reference/deltasumtimestamp.md#agg_functions-deltasumtimestamp) method instead. +::: **Syntax** diff --git a/docs/en/sql-reference/aggregate-functions/reference/deltasumtimestamp.md b/docs/en/sql-reference/aggregate-functions/reference/deltasumtimestamp.md index 7238f73bc0d..e1024e58328 100644 --- a/docs/en/sql-reference/aggregate-functions/reference/deltasumtimestamp.md +++ b/docs/en/sql-reference/aggregate-functions/reference/deltasumtimestamp.md @@ -1,5 +1,5 @@ --- -toc_priority: 141 +sidebar_position: 141 --- # deltaSumTimestamp {#agg_functions-deltasumtimestamp} diff --git a/docs/en/sql-reference/aggregate-functions/reference/entropy.md b/docs/en/sql-reference/aggregate-functions/reference/entropy.md index 5ebb678e6b5..9f1576c3ed8 100644 --- a/docs/en/sql-reference/aggregate-functions/reference/entropy.md +++ b/docs/en/sql-reference/aggregate-functions/reference/entropy.md @@ -1,5 +1,5 @@ --- -toc_priority: 302 +sidebar_position: 302 --- # entropy {#entropy} diff --git a/docs/en/sql-reference/aggregate-functions/reference/exponentialmovingaverage.md b/docs/en/sql-reference/aggregate-functions/reference/exponentialmovingaverage.md index cfc9b6cd58e..2337a0c8dab 100644 --- a/docs/en/sql-reference/aggregate-functions/reference/exponentialmovingaverage.md +++ b/docs/en/sql-reference/aggregate-functions/reference/exponentialmovingaverage.md @@ -1,5 +1,5 @@ --- -toc_priority: 108 +sidebar_position: 108 --- ## exponentialMovingAverage {#exponential-moving-average} diff --git a/docs/en/sql-reference/aggregate-functions/reference/grouparray.md b/docs/en/sql-reference/aggregate-functions/reference/grouparray.md index 86b7b83022b..348ac98c75b 100644 --- a/docs/en/sql-reference/aggregate-functions/reference/grouparray.md +++ b/docs/en/sql-reference/aggregate-functions/reference/grouparray.md @@ -1,5 +1,5 @@ --- -toc_priority: 110 +sidebar_position: 110 --- # groupArray {#agg_function-grouparray} diff --git a/docs/en/sql-reference/aggregate-functions/reference/grouparrayinsertat.md b/docs/en/sql-reference/aggregate-functions/reference/grouparrayinsertat.md index d29550b007e..0699326725e 100644 --- a/docs/en/sql-reference/aggregate-functions/reference/grouparrayinsertat.md +++ b/docs/en/sql-reference/aggregate-functions/reference/grouparrayinsertat.md @@ -1,5 +1,5 
@@ --- -toc_priority: 112 +sidebar_position: 112 --- # groupArrayInsertAt {#grouparrayinsertat} diff --git a/docs/en/sql-reference/aggregate-functions/reference/grouparraymovingavg.md b/docs/en/sql-reference/aggregate-functions/reference/grouparraymovingavg.md index c732efecf58..dc3cc74721e 100644 --- a/docs/en/sql-reference/aggregate-functions/reference/grouparraymovingavg.md +++ b/docs/en/sql-reference/aggregate-functions/reference/grouparraymovingavg.md @@ -1,5 +1,5 @@ --- -toc_priority: 114 +sidebar_position: 114 --- # groupArrayMovingAvg {#agg_function-grouparraymovingavg} diff --git a/docs/en/sql-reference/aggregate-functions/reference/grouparraymovingsum.md b/docs/en/sql-reference/aggregate-functions/reference/grouparraymovingsum.md index c3dfeda850e..563280b7dec 100644 --- a/docs/en/sql-reference/aggregate-functions/reference/grouparraymovingsum.md +++ b/docs/en/sql-reference/aggregate-functions/reference/grouparraymovingsum.md @@ -1,5 +1,5 @@ --- -toc_priority: 113 +sidebar_position: 113 --- # groupArrayMovingSum {#agg_function-grouparraymovingsum} diff --git a/docs/en/sql-reference/aggregate-functions/reference/grouparraysample.md b/docs/en/sql-reference/aggregate-functions/reference/grouparraysample.md index bd170ead577..f0406ddc93c 100644 --- a/docs/en/sql-reference/aggregate-functions/reference/grouparraysample.md +++ b/docs/en/sql-reference/aggregate-functions/reference/grouparraysample.md @@ -1,5 +1,5 @@ --- -toc_priority: 114 +sidebar_position: 114 --- # groupArraySample {#grouparraysample} diff --git a/docs/en/sql-reference/aggregate-functions/reference/groupbitand.md b/docs/en/sql-reference/aggregate-functions/reference/groupbitand.md index 1275ad7536c..0ebb9aec495 100644 --- a/docs/en/sql-reference/aggregate-functions/reference/groupbitand.md +++ b/docs/en/sql-reference/aggregate-functions/reference/groupbitand.md @@ -1,5 +1,5 @@ --- -toc_priority: 125 +sidebar_position: 125 --- # groupBitAnd {#groupbitand} diff --git a/docs/en/sql-reference/aggregate-functions/reference/groupbitmap.md b/docs/en/sql-reference/aggregate-functions/reference/groupbitmap.md index 9317ef98783..7f1fee6a9f0 100644 --- a/docs/en/sql-reference/aggregate-functions/reference/groupbitmap.md +++ b/docs/en/sql-reference/aggregate-functions/reference/groupbitmap.md @@ -1,5 +1,5 @@ --- -toc_priority: 128 +sidebar_position: 128 --- # groupBitmap {#groupbitmap} diff --git a/docs/en/sql-reference/aggregate-functions/reference/groupbitmapand.md b/docs/en/sql-reference/aggregate-functions/reference/groupbitmapand.md index f59bb541a42..89c94547f8b 100644 --- a/docs/en/sql-reference/aggregate-functions/reference/groupbitmapand.md +++ b/docs/en/sql-reference/aggregate-functions/reference/groupbitmapand.md @@ -1,5 +1,5 @@ --- -toc_priority: 129 +sidebar_position: 129 --- # groupBitmapAnd {#groupbitmapand} diff --git a/docs/en/sql-reference/aggregate-functions/reference/groupbitmapor.md b/docs/en/sql-reference/aggregate-functions/reference/groupbitmapor.md index d3f40f63f65..172a3bb29ac 100644 --- a/docs/en/sql-reference/aggregate-functions/reference/groupbitmapor.md +++ b/docs/en/sql-reference/aggregate-functions/reference/groupbitmapor.md @@ -1,5 +1,5 @@ --- -toc_priority: 130 +sidebar_position: 130 --- # groupBitmapOr {#groupbitmapor} diff --git a/docs/en/sql-reference/aggregate-functions/reference/groupbitmapxor.md b/docs/en/sql-reference/aggregate-functions/reference/groupbitmapxor.md index cbe01e08145..52c45815cc5 100644 --- a/docs/en/sql-reference/aggregate-functions/reference/groupbitmapxor.md +++ 
b/docs/en/sql-reference/aggregate-functions/reference/groupbitmapxor.md @@ -1,5 +1,5 @@ --- -toc_priority: 131 +sidebar_position: 131 --- # groupBitmapXor {#groupbitmapxor} diff --git a/docs/en/sql-reference/aggregate-functions/reference/groupbitor.md b/docs/en/sql-reference/aggregate-functions/reference/groupbitor.md index 24077de0adc..c1ee1c40894 100644 --- a/docs/en/sql-reference/aggregate-functions/reference/groupbitor.md +++ b/docs/en/sql-reference/aggregate-functions/reference/groupbitor.md @@ -1,5 +1,5 @@ --- -toc_priority: 126 +sidebar_position: 126 --- # groupBitOr {#groupbitor} diff --git a/docs/en/sql-reference/aggregate-functions/reference/groupbitxor.md b/docs/en/sql-reference/aggregate-functions/reference/groupbitxor.md index 4b8323f92db..472bcdf65c1 100644 --- a/docs/en/sql-reference/aggregate-functions/reference/groupbitxor.md +++ b/docs/en/sql-reference/aggregate-functions/reference/groupbitxor.md @@ -1,5 +1,5 @@ --- -toc_priority: 127 +sidebar_position: 127 --- # groupBitXor {#groupbitxor} diff --git a/docs/en/sql-reference/aggregate-functions/reference/groupuniqarray.md b/docs/en/sql-reference/aggregate-functions/reference/groupuniqarray.md index 537212e5b94..9b5058032e5 100644 --- a/docs/en/sql-reference/aggregate-functions/reference/groupuniqarray.md +++ b/docs/en/sql-reference/aggregate-functions/reference/groupuniqarray.md @@ -1,5 +1,5 @@ --- -toc_priority: 111 +sidebar_position: 111 --- # groupUniqArray {#groupuniqarray} diff --git a/docs/en/sql-reference/aggregate-functions/reference/index.md b/docs/en/sql-reference/aggregate-functions/reference/index.md index 59befed8785..cd71bca2556 100644 --- a/docs/en/sql-reference/aggregate-functions/reference/index.md +++ b/docs/en/sql-reference/aggregate-functions/reference/index.md @@ -1,6 +1,6 @@ --- toc_folder_title: Reference -toc_priority: 36 +sidebar_position: 36 toc_hidden: true --- diff --git a/docs/en/sql-reference/aggregate-functions/reference/intervalLengthSum.md b/docs/en/sql-reference/aggregate-functions/reference/intervalLengthSum.md index 05adbb2ffe8..33c5686cbbc 100644 --- a/docs/en/sql-reference/aggregate-functions/reference/intervalLengthSum.md +++ b/docs/en/sql-reference/aggregate-functions/reference/intervalLengthSum.md @@ -1,6 +1,6 @@ --- -toc_priority: 146 -toc_title: intervalLengthSum +sidebar_position: 146 +sidebar_label: intervalLengthSum --- # intervalLengthSum {#agg_function-intervallengthsum} @@ -18,8 +18,9 @@ intervalLengthSum(start, end) - `start` — The starting value of the interval. [Int32](../../../sql-reference/data-types/int-uint.md#uint8-uint16-uint32-uint64-int8-int16-int32-int64), [Int64](../../../sql-reference/data-types/int-uint.md#uint8-uint16-uint32-uint64-int8-int16-int32-int64), [UInt32](../../../sql-reference/data-types/int-uint.md#uint8-uint16-uint32-uint64-int8-int16-int32-int64), [UInt64](../../../sql-reference/data-types/int-uint.md#uint8-uint16-uint32-uint64-int8-int16-int32-int64), [Float32](../../../sql-reference/data-types/float.md#float32-float64), [Float64](../../../sql-reference/data-types/float.md#float32-float64), [DateTime](../../../sql-reference/data-types/datetime.md#data_type-datetime) or [Date](../../../sql-reference/data-types/date.md#data_type-date). - `end` — The ending value of the interval. 
[Int32](../../../sql-reference/data-types/int-uint.md#uint8-uint16-uint32-uint64-int8-int16-int32-int64), [Int64](../../../sql-reference/data-types/int-uint.md#uint8-uint16-uint32-uint64-int8-int16-int32-int64), [UInt32](../../../sql-reference/data-types/int-uint.md#uint8-uint16-uint32-uint64-int8-int16-int32-int64), [UInt64](../../../sql-reference/data-types/int-uint.md#uint8-uint16-uint32-uint64-int8-int16-int32-int64), [Float32](../../../sql-reference/data-types/float.md#float32-float64), [Float64](../../../sql-reference/data-types/float.md#float32-float64), [DateTime](../../../sql-reference/data-types/datetime.md#data_type-datetime) or [Date](../../../sql-reference/data-types/date.md#data_type-date). -!!! info "Note" - Arguments must be of the same data type. Otherwise, an exception will be thrown. +:::note +Arguments must be of the same data type. Otherwise, an exception will be thrown. +::: **Returned value** diff --git a/docs/en/sql-reference/aggregate-functions/reference/kurtpop.md b/docs/en/sql-reference/aggregate-functions/reference/kurtpop.md index c51c4b92e74..5640e69ba7c 100644 --- a/docs/en/sql-reference/aggregate-functions/reference/kurtpop.md +++ b/docs/en/sql-reference/aggregate-functions/reference/kurtpop.md @@ -1,5 +1,5 @@ --- -toc_priority: 153 +sidebar_position: 153 --- # kurtPop {#kurtpop} diff --git a/docs/en/sql-reference/aggregate-functions/reference/kurtsamp.md b/docs/en/sql-reference/aggregate-functions/reference/kurtsamp.md index 0ee40138adc..c0768edaf2d 100644 --- a/docs/en/sql-reference/aggregate-functions/reference/kurtsamp.md +++ b/docs/en/sql-reference/aggregate-functions/reference/kurtsamp.md @@ -1,5 +1,5 @@ --- -toc_priority: 154 +sidebar_position: 154 --- # kurtSamp {#kurtsamp} diff --git a/docs/en/sql-reference/aggregate-functions/reference/mannwhitneyutest.md b/docs/en/sql-reference/aggregate-functions/reference/mannwhitneyutest.md index fe97f7edbf8..32e56b8de10 100644 --- a/docs/en/sql-reference/aggregate-functions/reference/mannwhitneyutest.md +++ b/docs/en/sql-reference/aggregate-functions/reference/mannwhitneyutest.md @@ -1,6 +1,6 @@ --- -toc_priority: 310 -toc_title: mannWhitneyUTest +sidebar_position: 310 +sidebar_label: mannWhitneyUTest --- # mannWhitneyUTest {#mannwhitneyutest} diff --git a/docs/en/sql-reference/aggregate-functions/reference/max.md b/docs/en/sql-reference/aggregate-functions/reference/max.md index 25173a48906..845d0c5ecee 100644 --- a/docs/en/sql-reference/aggregate-functions/reference/max.md +++ b/docs/en/sql-reference/aggregate-functions/reference/max.md @@ -1,5 +1,5 @@ --- -toc_priority: 3 +sidebar_position: 3 --- # max {#agg_function-max} diff --git a/docs/en/sql-reference/aggregate-functions/reference/maxmap.md b/docs/en/sql-reference/aggregate-functions/reference/maxmap.md index c62502cf46e..243a3375552 100644 --- a/docs/en/sql-reference/aggregate-functions/reference/maxmap.md +++ b/docs/en/sql-reference/aggregate-functions/reference/maxmap.md @@ -1,5 +1,5 @@ --- -toc_priority: 143 +sidebar_position: 143 --- # maxMap {#agg_functions-maxmap} diff --git a/docs/en/sql-reference/aggregate-functions/reference/meanztest.md b/docs/en/sql-reference/aggregate-functions/reference/meanztest.md index 7d016f42819..02b89b1b31d 100644 --- a/docs/en/sql-reference/aggregate-functions/reference/meanztest.md +++ b/docs/en/sql-reference/aggregate-functions/reference/meanztest.md @@ -1,6 +1,6 @@ --- -toc_priority: 303 -toc_title: meanZTest +sidebar_position: 303 +sidebar_label: meanZTest --- # meanZTest {#meanztest} diff --git 
a/docs/en/sql-reference/aggregate-functions/reference/median.md b/docs/en/sql-reference/aggregate-functions/reference/median.md index 619e9a5093e..3e84b4b169c 100644 --- a/docs/en/sql-reference/aggregate-functions/reference/median.md +++ b/docs/en/sql-reference/aggregate-functions/reference/median.md @@ -1,5 +1,5 @@ --- -toc_priority: 212 +sidebar_position: 212 --- # median {#median} diff --git a/docs/en/sql-reference/aggregate-functions/reference/min.md b/docs/en/sql-reference/aggregate-functions/reference/min.md index 64b155857f8..0525066e9f3 100644 --- a/docs/en/sql-reference/aggregate-functions/reference/min.md +++ b/docs/en/sql-reference/aggregate-functions/reference/min.md @@ -1,5 +1,5 @@ --- -toc_priority: 2 +sidebar_position: 2 --- ## min {#agg_function-min} diff --git a/docs/en/sql-reference/aggregate-functions/reference/minmap.md b/docs/en/sql-reference/aggregate-functions/reference/minmap.md index 9408d0ddfff..8a4d50dd46c 100644 --- a/docs/en/sql-reference/aggregate-functions/reference/minmap.md +++ b/docs/en/sql-reference/aggregate-functions/reference/minmap.md @@ -1,5 +1,5 @@ --- -toc_priority: 142 +sidebar_position: 142 --- # minMap {#agg_functions-minmap} diff --git a/docs/en/sql-reference/aggregate-functions/reference/quantile.md b/docs/en/sql-reference/aggregate-functions/reference/quantile.md index b6f38e57342..6a0479da77f 100644 --- a/docs/en/sql-reference/aggregate-functions/reference/quantile.md +++ b/docs/en/sql-reference/aggregate-functions/reference/quantile.md @@ -1,5 +1,5 @@ --- -toc_priority: 200 +sidebar_position: 200 --- # quantile {#quantile} diff --git a/docs/en/sql-reference/aggregate-functions/reference/quantilebfloat16.md b/docs/en/sql-reference/aggregate-functions/reference/quantilebfloat16.md index 728c200441d..f0bd51f0add 100644 --- a/docs/en/sql-reference/aggregate-functions/reference/quantilebfloat16.md +++ b/docs/en/sql-reference/aggregate-functions/reference/quantilebfloat16.md @@ -1,5 +1,5 @@ --- -toc_priority: 209 +sidebar_position: 209 --- # quantileBFloat16 {#quantilebfloat16} diff --git a/docs/en/sql-reference/aggregate-functions/reference/quantiledeterministic.md b/docs/en/sql-reference/aggregate-functions/reference/quantiledeterministic.md index a20ac26f599..bb23ce63cea 100644 --- a/docs/en/sql-reference/aggregate-functions/reference/quantiledeterministic.md +++ b/docs/en/sql-reference/aggregate-functions/reference/quantiledeterministic.md @@ -1,5 +1,5 @@ --- -toc_priority: 206 +sidebar_position: 206 --- # quantileDeterministic {#quantiledeterministic} diff --git a/docs/en/sql-reference/aggregate-functions/reference/quantileexact.md b/docs/en/sql-reference/aggregate-functions/reference/quantileexact.md index bfd9d1e5a55..b3a384b0cfd 100644 --- a/docs/en/sql-reference/aggregate-functions/reference/quantileexact.md +++ b/docs/en/sql-reference/aggregate-functions/reference/quantileexact.md @@ -1,5 +1,5 @@ --- -toc_priority: 202 +sidebar_position: 202 --- # quantileExact Functions {#quantileexact-functions} diff --git a/docs/en/sql-reference/aggregate-functions/reference/quantileexactweighted.md b/docs/en/sql-reference/aggregate-functions/reference/quantileexactweighted.md index 210f44e7587..4740d4a26f8 100644 --- a/docs/en/sql-reference/aggregate-functions/reference/quantileexactweighted.md +++ b/docs/en/sql-reference/aggregate-functions/reference/quantileexactweighted.md @@ -1,5 +1,5 @@ --- -toc_priority: 203 +sidebar_position: 203 --- # quantileExactWeighted {#quantileexactweighted} diff --git 
a/docs/en/sql-reference/aggregate-functions/reference/quantiles.md b/docs/en/sql-reference/aggregate-functions/reference/quantiles.md index 9777570be83..6d0cf37f25e 100644 --- a/docs/en/sql-reference/aggregate-functions/reference/quantiles.md +++ b/docs/en/sql-reference/aggregate-functions/reference/quantiles.md @@ -1,5 +1,5 @@ --- -toc_priority: 201 +sidebar_position: 201 --- # quantiles Functions {#quantiles-functions} diff --git a/docs/en/sql-reference/aggregate-functions/reference/quantiletdigest.md b/docs/en/sql-reference/aggregate-functions/reference/quantiletdigest.md index dd0d59978d1..f42c88b2aca 100644 --- a/docs/en/sql-reference/aggregate-functions/reference/quantiletdigest.md +++ b/docs/en/sql-reference/aggregate-functions/reference/quantiletdigest.md @@ -1,5 +1,5 @@ --- -toc_priority: 207 +sidebar_position: 207 --- # quantileTDigest {#quantiletdigest} diff --git a/docs/en/sql-reference/aggregate-functions/reference/quantiletdigestweighted.md b/docs/en/sql-reference/aggregate-functions/reference/quantiletdigestweighted.md index 70f30f3a480..684e438f0c7 100644 --- a/docs/en/sql-reference/aggregate-functions/reference/quantiletdigestweighted.md +++ b/docs/en/sql-reference/aggregate-functions/reference/quantiletdigestweighted.md @@ -1,5 +1,5 @@ --- -toc_priority: 208 +sidebar_position: 208 --- # quantileTDigestWeighted {#quantiletdigestweighted} @@ -12,8 +12,9 @@ The result depends on the order of running the query, and is nondeterministic. When using multiple `quantile*` functions with different levels in a query, the internal states are not combined (that is, the query works less efficiently than it could). In this case, use the [quantiles](../../../sql-reference/aggregate-functions/reference/quantiles.md#quantiles) function. -!!! note "Note" - Using `quantileTDigestWeighted` [is not recommended for tiny data sets](https://github.com/tdunning/t-digest/issues/167#issuecomment-828650275) and can lead to significat error. In this case, consider possibility of using [`quantileTDigest`](../../../sql-reference/aggregate-functions/reference/quantiletdigest.md) instead. +:::note +Using `quantileTDigestWeighted` [is not recommended for tiny data sets](https://github.com/tdunning/t-digest/issues/167#issuecomment-828650275) and can lead to significant error. In this case, consider using [`quantileTDigest`](../../../sql-reference/aggregate-functions/reference/quantiletdigest.md) instead. +::: **Syntax** diff --git a/docs/en/sql-reference/aggregate-functions/reference/quantiletiming.md b/docs/en/sql-reference/aggregate-functions/reference/quantiletiming.md index dd545c1a485..f282f7e2004 100644 --- a/docs/en/sql-reference/aggregate-functions/reference/quantiletiming.md +++ b/docs/en/sql-reference/aggregate-functions/reference/quantiletiming.md @@ -1,5 +1,5 @@ --- -toc_priority: 204 +sidebar_position: 204 --- # quantileTiming {#quantiletiming} @@ -36,8 +36,9 @@ The calculation is accurate if: Otherwise, the result of the calculation is rounded to the nearest multiple of 16 ms. -!!! note "Note" - For calculating page loading time quantiles, this function is more effective and accurate than [quantile](../../../sql-reference/aggregate-functions/reference/quantile.md#quantile). +:::note +For calculating page loading time quantiles, this function is more effective and accurate than [quantile](../../../sql-reference/aggregate-functions/reference/quantile.md#quantile).
+::: **Returned value** @@ -45,8 +46,9 @@ Otherwise, the result of the calculation is rounded to the nearest multiple of 1 Type: `Float32`. -!!! note "Note" - If no values are passed to the function (when using `quantileTimingIf`), [NaN](../../../sql-reference/data-types/float.md#data_type-float-nan-inf) is returned. The purpose of this is to differentiate these cases from cases that result in zero. See [ORDER BY clause](../../../sql-reference/statements/select/order-by.md#select-order-by) for notes on sorting `NaN` values. +:::note +If no values are passed to the function (when using `quantileTimingIf`), [NaN](../../../sql-reference/data-types/float.md#data_type-float-nan-inf) is returned. The purpose of this is to differentiate these cases from cases that result in zero. See [ORDER BY clause](../../../sql-reference/statements/select/order-by.md#select-order-by) for notes on sorting `NaN` values. +::: **Example** diff --git a/docs/en/sql-reference/aggregate-functions/reference/quantiletimingweighted.md b/docs/en/sql-reference/aggregate-functions/reference/quantiletimingweighted.md index 25846cde636..c773f900764 100644 --- a/docs/en/sql-reference/aggregate-functions/reference/quantiletimingweighted.md +++ b/docs/en/sql-reference/aggregate-functions/reference/quantiletimingweighted.md @@ -1,5 +1,5 @@ --- -toc_priority: 205 +sidebar_position: 205 --- # quantileTimingWeighted {#quantiletimingweighted} @@ -38,8 +38,9 @@ The calculation is accurate if: Otherwise, the result of the calculation is rounded to the nearest multiple of 16 ms. -!!! note "Note" - For calculating page loading time quantiles, this function is more effective and accurate than [quantile](../../../sql-reference/aggregate-functions/reference/quantile.md#quantile). +:::note +For calculating page loading time quantiles, this function is more effective and accurate than [quantile](../../../sql-reference/aggregate-functions/reference/quantile.md#quantile). +::: **Returned value** @@ -47,8 +48,9 @@ Otherwise, the result of the calculation is rounded to the nearest multiple of 1 Type: `Float32`. -!!! note "Note" - If no values are passed to the function (when using `quantileTimingIf`), [NaN](../../../sql-reference/data-types/float.md#data_type-float-nan-inf) is returned. The purpose of this is to differentiate these cases from cases that result in zero. See [ORDER BY clause](../../../sql-reference/statements/select/order-by.md#select-order-by) for notes on sorting `NaN` values. +:::note +If no values are passed to the function (when using `quantileTimingIf`), [NaN](../../../sql-reference/data-types/float.md#data_type-float-nan-inf) is returned. The purpose of this is to differentiate these cases from cases that result in zero. See [ORDER BY clause](../../../sql-reference/statements/select/order-by.md#select-order-by) for notes on sorting `NaN` values. 
+::: **Example** diff --git a/docs/en/sql-reference/aggregate-functions/reference/rankCorr.md b/docs/en/sql-reference/aggregate-functions/reference/rankCorr.md index b364317c22b..399fd88cf0e 100644 --- a/docs/en/sql-reference/aggregate-functions/reference/rankCorr.md +++ b/docs/en/sql-reference/aggregate-functions/reference/rankCorr.md @@ -1,5 +1,5 @@ --- -toc_priority: 145 +sidebar_position: 145 --- # rankCorr {#agg_function-rankcorr} diff --git a/docs/en/sql-reference/aggregate-functions/reference/simplelinearregression.md b/docs/en/sql-reference/aggregate-functions/reference/simplelinearregression.md index fee71cdeb49..8684cd4c3bb 100644 --- a/docs/en/sql-reference/aggregate-functions/reference/simplelinearregression.md +++ b/docs/en/sql-reference/aggregate-functions/reference/simplelinearregression.md @@ -1,5 +1,5 @@ --- -toc_priority: 220 +sidebar_position: 220 --- # simpleLinearRegression {#simplelinearregression} diff --git a/docs/en/sql-reference/aggregate-functions/reference/skewpop.md b/docs/en/sql-reference/aggregate-functions/reference/skewpop.md index f84f8897a35..4cb3d58304f 100644 --- a/docs/en/sql-reference/aggregate-functions/reference/skewpop.md +++ b/docs/en/sql-reference/aggregate-functions/reference/skewpop.md @@ -1,5 +1,5 @@ --- -toc_priority: 150 +sidebar_position: 150 --- # skewPop {#skewpop} diff --git a/docs/en/sql-reference/aggregate-functions/reference/skewsamp.md b/docs/en/sql-reference/aggregate-functions/reference/skewsamp.md index 48a049ca69d..92e807d2d7d 100644 --- a/docs/en/sql-reference/aggregate-functions/reference/skewsamp.md +++ b/docs/en/sql-reference/aggregate-functions/reference/skewsamp.md @@ -1,5 +1,5 @@ --- -toc_priority: 151 +sidebar_position: 151 --- # skewSamp {#skewsamp} diff --git a/docs/en/sql-reference/aggregate-functions/reference/sparkbar.md b/docs/en/sql-reference/aggregate-functions/reference/sparkbar.md index 47c696129c7..ebb9cccbd40 100644 --- a/docs/en/sql-reference/aggregate-functions/reference/sparkbar.md +++ b/docs/en/sql-reference/aggregate-functions/reference/sparkbar.md @@ -1,6 +1,6 @@ --- -toc_priority: 311 -toc_title: sparkbar +sidebar_position: 311 +sidebar_label: sparkbar --- # sparkbar {#sparkbar} diff --git a/docs/en/sql-reference/aggregate-functions/reference/stddevpop.md b/docs/en/sql-reference/aggregate-functions/reference/stddevpop.md index 58f8c27cd72..2b22320ae7a 100644 --- a/docs/en/sql-reference/aggregate-functions/reference/stddevpop.md +++ b/docs/en/sql-reference/aggregate-functions/reference/stddevpop.md @@ -1,10 +1,11 @@ --- -toc_priority: 30 +sidebar_position: 30 --- # stddevPop {#stddevpop} The result is equal to the square root of [varPop](../../../sql-reference/aggregate-functions/reference/varpop.md). -!!! note "Note" - This function uses a numerically unstable algorithm. If you need [numerical stability](https://en.wikipedia.org/wiki/Numerical_stability) in calculations, use the `stddevPopStable` function. It works slower but provides a lower computational error. +:::note +This function uses a numerically unstable algorithm. If you need [numerical stability](https://en.wikipedia.org/wiki/Numerical_stability) in calculations, use the `stddevPopStable` function. It works slower but provides a lower computational error. 
+::: \ No newline at end of file diff --git a/docs/en/sql-reference/aggregate-functions/reference/stddevsamp.md b/docs/en/sql-reference/aggregate-functions/reference/stddevsamp.md index 4ec72881ae5..3dcee821606 100644 --- a/docs/en/sql-reference/aggregate-functions/reference/stddevsamp.md +++ b/docs/en/sql-reference/aggregate-functions/reference/stddevsamp.md @@ -1,10 +1,11 @@ --- -toc_priority: 31 +sidebar_position: 31 --- # stddevSamp {#stddevsamp} The result is equal to the square root of [varSamp](../../../sql-reference/aggregate-functions/reference/varsamp.md). -!!! note "Note" - This function uses a numerically unstable algorithm. If you need [numerical stability](https://en.wikipedia.org/wiki/Numerical_stability) in calculations, use the `stddevSampStable` function. It works slower but provides a lower computational error. +:::note +This function uses a numerically unstable algorithm. If you need [numerical stability](https://en.wikipedia.org/wiki/Numerical_stability) in calculations, use the `stddevSampStable` function. It works slower but provides a lower computational error. +::: \ No newline at end of file diff --git a/docs/en/sql-reference/aggregate-functions/reference/stochasticlinearregression.md b/docs/en/sql-reference/aggregate-functions/reference/stochasticlinearregression.md index 7a37ed83e17..e171629e90d 100644 --- a/docs/en/sql-reference/aggregate-functions/reference/stochasticlinearregression.md +++ b/docs/en/sql-reference/aggregate-functions/reference/stochasticlinearregression.md @@ -1,5 +1,5 @@ --- -toc_priority: 221 +sidebar_position: 221 --- # stochasticLinearRegression {#agg_functions-stochasticlinearregression} diff --git a/docs/en/sql-reference/aggregate-functions/reference/stochasticlogisticregression.md b/docs/en/sql-reference/aggregate-functions/reference/stochasticlogisticregression.md index 35d1e3899ac..a7d4c640126 100644 --- a/docs/en/sql-reference/aggregate-functions/reference/stochasticlogisticregression.md +++ b/docs/en/sql-reference/aggregate-functions/reference/stochasticlogisticregression.md @@ -1,5 +1,5 @@ --- -toc_priority: 222 +sidebar_position: 222 --- # stochasticLogisticRegression {#agg_functions-stochasticlogisticregression} diff --git a/docs/en/sql-reference/aggregate-functions/reference/studentttest.md b/docs/en/sql-reference/aggregate-functions/reference/studentttest.md index 7d8d255e15b..86207a35c04 100644 --- a/docs/en/sql-reference/aggregate-functions/reference/studentttest.md +++ b/docs/en/sql-reference/aggregate-functions/reference/studentttest.md @@ -1,6 +1,6 @@ --- -toc_priority: 300 -toc_title: studentTTest +sidebar_position: 300 +sidebar_label: studentTTest --- # studentTTest {#studentttest} diff --git a/docs/en/sql-reference/aggregate-functions/reference/sum.md b/docs/en/sql-reference/aggregate-functions/reference/sum.md index 77d38a2c7b2..b72cb84e74f 100644 --- a/docs/en/sql-reference/aggregate-functions/reference/sum.md +++ b/docs/en/sql-reference/aggregate-functions/reference/sum.md @@ -1,5 +1,5 @@ --- -toc_priority: 4 +sidebar_position: 4 --- # sum {#agg_function-sum} diff --git a/docs/en/sql-reference/aggregate-functions/reference/sumcount.md b/docs/en/sql-reference/aggregate-functions/reference/sumcount.md index 00a7a9fc9f1..dbc0601241e 100644 --- a/docs/en/sql-reference/aggregate-functions/reference/sumcount.md +++ b/docs/en/sql-reference/aggregate-functions/reference/sumcount.md @@ -1,5 +1,5 @@ --- -toc_priority: 144 +sidebar_position: 144 --- # sumCount {#agg_function-sumCount} diff --git 
a/docs/en/sql-reference/aggregate-functions/reference/sumkahan.md b/docs/en/sql-reference/aggregate-functions/reference/sumkahan.md index d4d47fde1fa..8c96464dfd5 100644 --- a/docs/en/sql-reference/aggregate-functions/reference/sumkahan.md +++ b/docs/en/sql-reference/aggregate-functions/reference/sumkahan.md @@ -1,5 +1,5 @@ --- -toc_priority: 145 +sidebar_position: 145 --- # sumKahan {#agg_function-sumKahan} diff --git a/docs/en/sql-reference/aggregate-functions/reference/summap.md b/docs/en/sql-reference/aggregate-functions/reference/summap.md index 4ccbc22de35..78ce6a9e835 100644 --- a/docs/en/sql-reference/aggregate-functions/reference/summap.md +++ b/docs/en/sql-reference/aggregate-functions/reference/summap.md @@ -1,5 +1,5 @@ --- -toc_priority: 141 +sidebar_position: 141 --- # sumMap {#agg_functions-summap} diff --git a/docs/en/sql-reference/aggregate-functions/reference/sumwithoverflow.md b/docs/en/sql-reference/aggregate-functions/reference/sumwithoverflow.md index 1b39e9d0eb1..0582eb5fb7b 100644 --- a/docs/en/sql-reference/aggregate-functions/reference/sumwithoverflow.md +++ b/docs/en/sql-reference/aggregate-functions/reference/sumwithoverflow.md @@ -1,5 +1,5 @@ --- -toc_priority: 140 +sidebar_position: 140 --- # sumWithOverflow {#sumwithoverflowx} diff --git a/docs/en/sql-reference/aggregate-functions/reference/topk.md b/docs/en/sql-reference/aggregate-functions/reference/topk.md index 7e6d0db4946..19e98262899 100644 --- a/docs/en/sql-reference/aggregate-functions/reference/topk.md +++ b/docs/en/sql-reference/aggregate-functions/reference/topk.md @@ -1,5 +1,5 @@ --- -toc_priority: 108 +sidebar_position: 108 --- # topK {#topk} diff --git a/docs/en/sql-reference/aggregate-functions/reference/topkweighted.md b/docs/en/sql-reference/aggregate-functions/reference/topkweighted.md index 694cbd1ad41..2d6e86667ef 100644 --- a/docs/en/sql-reference/aggregate-functions/reference/topkweighted.md +++ b/docs/en/sql-reference/aggregate-functions/reference/topkweighted.md @@ -1,5 +1,5 @@ --- -toc_priority: 109 +sidebar_position: 109 --- # topKWeighted {#topkweighted} diff --git a/docs/en/sql-reference/aggregate-functions/reference/uniq.md b/docs/en/sql-reference/aggregate-functions/reference/uniq.md index 33bfe72548b..6e6791702ef 100644 --- a/docs/en/sql-reference/aggregate-functions/reference/uniq.md +++ b/docs/en/sql-reference/aggregate-functions/reference/uniq.md @@ -1,5 +1,5 @@ --- -toc_priority: 190 +sidebar_position: 190 --- # uniq {#agg_function-uniq} diff --git a/docs/en/sql-reference/aggregate-functions/reference/uniqcombined.md b/docs/en/sql-reference/aggregate-functions/reference/uniqcombined.md index 623c43ae10c..79357cb14ce 100644 --- a/docs/en/sql-reference/aggregate-functions/reference/uniqcombined.md +++ b/docs/en/sql-reference/aggregate-functions/reference/uniqcombined.md @@ -1,5 +1,5 @@ --- -toc_priority: 192 +sidebar_position: 192 --- # uniqCombined {#agg_function-uniqcombined} @@ -34,8 +34,9 @@ Function: - Provides the result deterministically (it does not depend on the query processing order). -!!! 
note "Note" - Since it uses 32-bit hash for non-`String` type, the result will have very high error for cardinalities significantly larger than `UINT_MAX` (error will raise quickly after a few tens of billions of distinct values), hence in this case you should use [uniqCombined64](../../../sql-reference/aggregate-functions/reference/uniqcombined64.md#agg_function-uniqcombined64) +:::note +Since it uses 32-bit hash for non-`String` type, the result will have very high error for cardinalities significantly larger than `UINT_MAX` (error will raise quickly after a few tens of billions of distinct values), hence in this case you should use [uniqCombined64](../../../sql-reference/aggregate-functions/reference/uniqcombined64.md#agg_function-uniqcombined64) +::: Compared to the [uniq](../../../sql-reference/aggregate-functions/reference/uniq.md#agg_function-uniq) function, the `uniqCombined`: diff --git a/docs/en/sql-reference/aggregate-functions/reference/uniqcombined64.md b/docs/en/sql-reference/aggregate-functions/reference/uniqcombined64.md index 6d060d82779..fb0be23c768 100644 --- a/docs/en/sql-reference/aggregate-functions/reference/uniqcombined64.md +++ b/docs/en/sql-reference/aggregate-functions/reference/uniqcombined64.md @@ -1,5 +1,5 @@ --- -toc_priority: 193 +sidebar_position: 193 --- # uniqCombined64 {#agg_function-uniqcombined64} diff --git a/docs/en/sql-reference/aggregate-functions/reference/uniqexact.md b/docs/en/sql-reference/aggregate-functions/reference/uniqexact.md index e446258fbf7..68e6bc562f9 100644 --- a/docs/en/sql-reference/aggregate-functions/reference/uniqexact.md +++ b/docs/en/sql-reference/aggregate-functions/reference/uniqexact.md @@ -1,5 +1,5 @@ --- -toc_priority: 191 +sidebar_position: 191 --- # uniqExact {#agg_function-uniqexact} diff --git a/docs/en/sql-reference/aggregate-functions/reference/uniqhll12.md b/docs/en/sql-reference/aggregate-functions/reference/uniqhll12.md index 1d619ab7d93..1a13b365560 100644 --- a/docs/en/sql-reference/aggregate-functions/reference/uniqhll12.md +++ b/docs/en/sql-reference/aggregate-functions/reference/uniqhll12.md @@ -1,5 +1,5 @@ --- -toc_priority: 194 +sidebar_position: 194 --- # uniqHLL12 {#agg_function-uniqhll12} diff --git a/docs/en/sql-reference/aggregate-functions/reference/uniqthetasketch.md b/docs/en/sql-reference/aggregate-functions/reference/uniqthetasketch.md index b5161462442..9b9c16922b1 100644 --- a/docs/en/sql-reference/aggregate-functions/reference/uniqthetasketch.md +++ b/docs/en/sql-reference/aggregate-functions/reference/uniqthetasketch.md @@ -1,5 +1,5 @@ --- -toc_priority: 195 +sidebar_position: 195 --- # uniqTheta {#agg_function-uniqthetasketch} diff --git a/docs/en/sql-reference/aggregate-functions/reference/varpop.md b/docs/en/sql-reference/aggregate-functions/reference/varpop.md index c08dcfd9bfd..f16cfcdc63f 100644 --- a/docs/en/sql-reference/aggregate-functions/reference/varpop.md +++ b/docs/en/sql-reference/aggregate-functions/reference/varpop.md @@ -1,5 +1,5 @@ --- -toc_priority: 32 +sidebar_position: 32 --- # varPop(x) {#varpopx} @@ -8,5 +8,6 @@ Calculates the amount `Σ((x - x̅)^2) / n`, where `n` is the sample size and `x In other words, dispersion for a set of values. Returns `Float64`. -!!! note "Note" - This function uses a numerically unstable algorithm. If you need [numerical stability](https://en.wikipedia.org/wiki/Numerical_stability) in calculations, use the `varPopStable` function. It works slower but provides a lower computational error. 
+:::note +This function uses a numerically unstable algorithm. If you need [numerical stability](https://en.wikipedia.org/wiki/Numerical_stability) in calculations, use the `varPopStable` function. It works slower but provides a lower computational error. +::: \ No newline at end of file diff --git a/docs/en/sql-reference/aggregate-functions/reference/varsamp.md b/docs/en/sql-reference/aggregate-functions/reference/varsamp.md index 78bc545a5d0..b323f78fbd1 100644 --- a/docs/en/sql-reference/aggregate-functions/reference/varsamp.md +++ b/docs/en/sql-reference/aggregate-functions/reference/varsamp.md @@ -1,5 +1,5 @@ --- -toc_priority: 33 +sidebar_position: 33 --- # varSamp {#varsamp} @@ -10,5 +10,6 @@ It represents an unbiased estimate of the variance of a random variable if passe Returns `Float64`. When `n <= 1`, returns `+∞`. -!!! note "Note" - This function uses a numerically unstable algorithm. If you need [numerical stability](https://en.wikipedia.org/wiki/Numerical_stability) in calculations, use the `varSampStable` function. It works slower but provides a lower computational error. +:::note +This function uses a numerically unstable algorithm. If you need [numerical stability](https://en.wikipedia.org/wiki/Numerical_stability) in calculations, use the `varSampStable` function. It works slower but provides a lower computational error. +::: \ No newline at end of file diff --git a/docs/en/sql-reference/aggregate-functions/reference/welchttest.md b/docs/en/sql-reference/aggregate-functions/reference/welchttest.md index 2e127f87f9f..0aff60e7bbf 100644 --- a/docs/en/sql-reference/aggregate-functions/reference/welchttest.md +++ b/docs/en/sql-reference/aggregate-functions/reference/welchttest.md @@ -1,6 +1,6 @@ --- -toc_priority: 301 -toc_title: welchTTest +sidebar_position: 301 +sidebar_label: welchTTest --- # welchTTest {#welchttest} diff --git a/docs/en/sql-reference/ansi.md b/docs/en/sql-reference/ansi.md index 7a87ac2dcdb..5797c697255 100644 --- a/docs/en/sql-reference/ansi.md +++ b/docs/en/sql-reference/ansi.md @@ -1,12 +1,13 @@ --- -toc_priority: 40 -toc_title: ANSI Compatibility +sidebar_position: 40 +sidebar_label: ANSI Compatibility --- # ANSI SQL Compatibility of ClickHouse SQL Dialect {#ansi-sql-compatibility-of-clickhouse-sql-dialect} -!!! note "Note" - This article relies on Table 38, “Feature taxonomy and definition for mandatory features”, Annex F of [ISO/IEC CD 9075-2:2011](https://www.iso.org/obp/ui/#iso:std:iso-iec:9075:-2:ed-4:v1:en:sec:8). +:::note +This article relies on Table 38, “Feature taxonomy and definition for mandatory features”, Annex F of [ISO/IEC CD 9075-2:2011](https://www.iso.org/obp/ui/#iso:std:iso-iec:9075:-2:ed-4:v1:en:sec:8). 
+::: ## Differences in Behaviour {#differences-in-behaviour} diff --git a/docs/en/sql-reference/data-types/aggregatefunction.md b/docs/en/sql-reference/data-types/aggregatefunction.md index e483a20eed9..6dc89e2864f 100644 --- a/docs/en/sql-reference/data-types/aggregatefunction.md +++ b/docs/en/sql-reference/data-types/aggregatefunction.md @@ -1,6 +1,6 @@ --- -toc_priority: 53 -toc_title: AggregateFunction +sidebar_position: 53 +sidebar_label: AggregateFunction --- # AggregateFunction {#data-type-aggregatefunction} diff --git a/docs/en/sql-reference/data-types/array.md b/docs/en/sql-reference/data-types/array.md index a8dad7ba989..909df86ec2f 100644 --- a/docs/en/sql-reference/data-types/array.md +++ b/docs/en/sql-reference/data-types/array.md @@ -1,6 +1,6 @@ --- -toc_priority: 52 -toc_title: Array(T) +sidebar_position: 52 +sidebar_label: Array(T) --- # Array(t) {#data-type-array} diff --git a/docs/en/sql-reference/data-types/boolean.md b/docs/en/sql-reference/data-types/boolean.md index ca44238277c..a20e30777af 100644 --- a/docs/en/sql-reference/data-types/boolean.md +++ b/docs/en/sql-reference/data-types/boolean.md @@ -1,6 +1,6 @@ --- -toc_priority: 43 -toc_title: Boolean +sidebar_position: 43 +sidebar_label: Boolean --- # Boolean Values {#boolean-values} diff --git a/docs/en/sql-reference/data-types/date.md b/docs/en/sql-reference/data-types/date.md index 828f9ee70f5..bc3fda4a9d0 100644 --- a/docs/en/sql-reference/data-types/date.md +++ b/docs/en/sql-reference/data-types/date.md @@ -1,6 +1,6 @@ --- -toc_priority: 47 -toc_title: Date +sidebar_position: 47 +sidebar_label: Date --- # Date {#data_type-date} diff --git a/docs/en/sql-reference/data-types/date32.md b/docs/en/sql-reference/data-types/date32.md index 592f952e1be..46c9fe00b34 100644 --- a/docs/en/sql-reference/data-types/date32.md +++ b/docs/en/sql-reference/data-types/date32.md @@ -1,6 +1,6 @@ --- -toc_priority: 48 -toc_title: Date32 +sidebar_position: 48 +sidebar_label: Date32 --- # Date32 {#data_type-datetime32} diff --git a/docs/en/sql-reference/data-types/datetime.md b/docs/en/sql-reference/data-types/datetime.md index 6fa4f8326fe..cae83ac9a31 100644 --- a/docs/en/sql-reference/data-types/datetime.md +++ b/docs/en/sql-reference/data-types/datetime.md @@ -1,6 +1,6 @@ --- -toc_priority: 48 -toc_title: DateTime +sidebar_position: 48 +sidebar_label: DateTime --- # Datetime {#data_type-datetime} diff --git a/docs/en/sql-reference/data-types/datetime64.md b/docs/en/sql-reference/data-types/datetime64.md index 02d9efc0249..aefd7e4a18b 100644 --- a/docs/en/sql-reference/data-types/datetime64.md +++ b/docs/en/sql-reference/data-types/datetime64.md @@ -1,6 +1,6 @@ --- -toc_priority: 49 -toc_title: DateTime64 +sidebar_position: 49 +sidebar_label: DateTime64 --- # Datetime64 {#data_type-datetime64} diff --git a/docs/en/sql-reference/data-types/decimal.md b/docs/en/sql-reference/data-types/decimal.md index fae0bb6dbb9..33b4addb54f 100644 --- a/docs/en/sql-reference/data-types/decimal.md +++ b/docs/en/sql-reference/data-types/decimal.md @@ -1,6 +1,6 @@ --- -toc_priority: 42 -toc_title: Decimal +sidebar_position: 42 +sidebar_label: Decimal --- # Decimal(P, S), Decimal32(S), Decimal64(S), Decimal128(S), Decimal256(S) {#decimal} diff --git a/docs/en/sql-reference/data-types/domains/index.md b/docs/en/sql-reference/data-types/domains/index.md index 57db0c4263c..e27bf9a6d37 100644 --- a/docs/en/sql-reference/data-types/domains/index.md +++ b/docs/en/sql-reference/data-types/domains/index.md @@ -1,7 +1,6 @@ --- -toc_folder_title: Domains 
-toc_priority: 56 -toc_title: Overview +sidebar_position: 56 +sidebar_label: Domains --- # Domains {#domains} diff --git a/docs/en/sql-reference/data-types/domains/ipv4.md b/docs/en/sql-reference/data-types/domains/ipv4.md index aafd46edef8..76d285fe34a 100644 --- a/docs/en/sql-reference/data-types/domains/ipv4.md +++ b/docs/en/sql-reference/data-types/domains/ipv4.md @@ -1,6 +1,6 @@ --- -toc_priority: 59 -toc_title: IPv4 +sidebar_position: 59 +sidebar_label: IPv4 --- ## IPv4 {#ipv4} diff --git a/docs/en/sql-reference/data-types/domains/ipv6.md b/docs/en/sql-reference/data-types/domains/ipv6.md index 30b3c8add69..c5745dcb80f 100644 --- a/docs/en/sql-reference/data-types/domains/ipv6.md +++ b/docs/en/sql-reference/data-types/domains/ipv6.md @@ -1,6 +1,6 @@ --- -toc_priority: 60 -toc_title: IPv6 +sidebar_position: 60 +sidebar_label: IPv6 --- ## IPv6 {#ipv6} diff --git a/docs/en/sql-reference/data-types/enum.md b/docs/en/sql-reference/data-types/enum.md index ae22e60a5f3..5dbec255da6 100644 --- a/docs/en/sql-reference/data-types/enum.md +++ b/docs/en/sql-reference/data-types/enum.md @@ -1,6 +1,6 @@ --- -toc_priority: 50 -toc_title: Enum +sidebar_position: 50 +sidebar_label: Enum --- # Enum {#enum} diff --git a/docs/en/sql-reference/data-types/fixedstring.md b/docs/en/sql-reference/data-types/fixedstring.md index 59ed123fb10..230792c19bb 100644 --- a/docs/en/sql-reference/data-types/fixedstring.md +++ b/docs/en/sql-reference/data-types/fixedstring.md @@ -1,6 +1,6 @@ --- -toc_priority: 45 -toc_title: FixedString(N) +sidebar_position: 45 +sidebar_label: FixedString(N) --- # Fixedstring {#fixedstring} diff --git a/docs/en/sql-reference/data-types/float.md b/docs/en/sql-reference/data-types/float.md index fcc071b9f9a..46076e29525 100644 --- a/docs/en/sql-reference/data-types/float.md +++ b/docs/en/sql-reference/data-types/float.md @@ -1,6 +1,6 @@ --- -toc_priority: 41 -toc_title: Float32, Float64 +sidebar_position: 41 +sidebar_label: Float32, Float64 --- # Float32, Float64 {#float32-float64} diff --git a/docs/en/sql-reference/data-types/geo.md b/docs/en/sql-reference/data-types/geo.md index e6d32ef3305..7ce863a5a10 100644 --- a/docs/en/sql-reference/data-types/geo.md +++ b/docs/en/sql-reference/data-types/geo.md @@ -1,14 +1,15 @@ --- -toc_priority: 62 -toc_title: Geo +sidebar_position: 62 +sidebar_label: Geo --- # Geo Data Types {#geo-data-types} ClickHouse supports data types for representing geographical objects — locations, lands, etc. -!!! warning "Warning" - Currently geo data types are an experimental feature. To work with them you must set `allow_experimental_geo_types = 1`. +:::warning +Currently geo data types are an experimental feature. To work with them you must set `allow_experimental_geo_types = 1`. +::: **See Also** - [Representing simple geographical features](https://en.wikipedia.org/wiki/GeoJSON). diff --git a/docs/en/sql-reference/data-types/index.md b/docs/en/sql-reference/data-types/index.md index 831b8d19d94..ca26b89ec87 100644 --- a/docs/en/sql-reference/data-types/index.md +++ b/docs/en/sql-reference/data-types/index.md @@ -1,10 +1,9 @@ --- -toc_folder_title: Data Types -toc_priority: 37 -toc_title: Introduction +sidebar_label: Data Types +sidebar_position: 37 --- -# Data Types {#data_types} +# Data Types ClickHouse can store various kinds of data in table cells. 
diff --git a/docs/en/sql-reference/data-types/int-uint.md b/docs/en/sql-reference/data-types/int-uint.md index 4cc590d9fa5..86d587cfb55 100644 --- a/docs/en/sql-reference/data-types/int-uint.md +++ b/docs/en/sql-reference/data-types/int-uint.md @@ -1,6 +1,6 @@ --- -toc_priority: 40 -toc_title: UInt8, UInt16, UInt32, UInt64, UInt128, UInt256, Int8, Int16, Int32, Int64, Int128, Int256 +sidebar_position: 40 +sidebar_label: UInt8, UInt16, UInt32, UInt64, UInt128, UInt256, Int8, Int16, Int32, Int64, Int128, Int256 --- # UInt8, UInt16, UInt32, UInt64, UInt128, UInt256, Int8, Int16, Int32, Int64, Int128, Int256 diff --git a/docs/en/sql-reference/data-types/lowcardinality.md b/docs/en/sql-reference/data-types/lowcardinality.md index 3a813103335..40105446cbe 100644 --- a/docs/en/sql-reference/data-types/lowcardinality.md +++ b/docs/en/sql-reference/data-types/lowcardinality.md @@ -1,6 +1,6 @@ --- -toc_priority: 51 -toc_title: LowCardinality +sidebar_position: 51 +sidebar_label: LowCardinality --- # LowCardinality Data Type {#lowcardinality-data-type} diff --git a/docs/en/sql-reference/data-types/map.md b/docs/en/sql-reference/data-types/map.md index cdc3c874043..56f4442fe5a 100644 --- a/docs/en/sql-reference/data-types/map.md +++ b/docs/en/sql-reference/data-types/map.md @@ -1,6 +1,6 @@ --- -toc_priority: 65 -toc_title: Map(key, value) +sidebar_position: 65 +sidebar_label: Map(key, value) --- # Map(key, value) {#data_type-map} diff --git a/docs/en/sql-reference/data-types/multiword-types.md b/docs/en/sql-reference/data-types/multiword-types.md index bd91dd10ad6..ae57037b6e2 100644 --- a/docs/en/sql-reference/data-types/multiword-types.md +++ b/docs/en/sql-reference/data-types/multiword-types.md @@ -1,6 +1,6 @@ --- -toc_priority: 61 -toc_title: Multiword Type Names +sidebar_position: 61 +sidebar_label: Multiword Type Names --- # Multiword Types {#multiword-types} diff --git a/docs/en/sql-reference/data-types/nested-data-structures/index.md b/docs/en/sql-reference/data-types/nested-data-structures/index.md index b383fc53464..c0f016ea41d 100644 --- a/docs/en/sql-reference/data-types/nested-data-structures/index.md +++ b/docs/en/sql-reference/data-types/nested-data-structures/index.md @@ -1,8 +1,6 @@ --- -toc_folder_title: Nested Data Structures -toc_hidden: true -toc_priority: 54 -toc_title: hidden +sidebar_label: Nested Data Structures +sidebar_position: 54 --- # Nested Data Structures {#nested-data-structures} diff --git a/docs/en/sql-reference/data-types/nested-data-structures/nested.md b/docs/en/sql-reference/data-types/nested-data-structures/nested.md index e08b7e0de3e..8258d8bd8e5 100644 --- a/docs/en/sql-reference/data-types/nested-data-structures/nested.md +++ b/docs/en/sql-reference/data-types/nested-data-structures/nested.md @@ -1,6 +1,6 @@ --- -toc_priority: 57 -toc_title: Nested(Name1 Type1, Name2 Type2, ...) +sidebar_position: 57 +sidebar_label: Nested(Name1 Type1, Name2 Type2, ...) --- # Nested {#nested} diff --git a/docs/en/sql-reference/data-types/nullable.md b/docs/en/sql-reference/data-types/nullable.md index 2154315d269..f3c3dcd2326 100644 --- a/docs/en/sql-reference/data-types/nullable.md +++ b/docs/en/sql-reference/data-types/nullable.md @@ -1,6 +1,6 @@ --- -toc_priority: 55 -toc_title: Nullable +sidebar_position: 55 +sidebar_label: Nullable --- # Nullable(typename) {#data_type-nullable} @@ -17,8 +17,9 @@ A `Nullable` type field can’t be included in table indexes. 
To store `Nullable` type values in a table column, ClickHouse uses a separate file with `NULL` masks in addition to normal file with values. Entries in masks file allow ClickHouse to distinguish between `NULL` and a default value of corresponding data type for each table row. Because of an additional file, `Nullable` column consumes additional storage space compared to a similar normal one. -!!! info "Note" - Using `Nullable` almost always negatively affects performance, keep this in mind when designing your databases. +:::note +Using `Nullable` almost always negatively affects performance, keep this in mind when designing your databases. +::: ## Finding NULL {#finding-null} diff --git a/docs/en/sql-reference/data-types/simpleaggregatefunction.md b/docs/en/sql-reference/data-types/simpleaggregatefunction.md index 7a4c4375541..1c04a71dedb 100644 --- a/docs/en/sql-reference/data-types/simpleaggregatefunction.md +++ b/docs/en/sql-reference/data-types/simpleaggregatefunction.md @@ -22,10 +22,11 @@ The following aggregate functions are supported: - [`maxMap`](../../sql-reference/aggregate-functions/reference/maxmap.md#agg_functions-maxmap) -!!! note "Note" - Values of the `SimpleAggregateFunction(func, Type)` look and stored the same way as `Type`, so you do not need to apply functions with `-Merge`/`-State` suffixes. +:::note +Values of the `SimpleAggregateFunction(func, Type)` look and stored the same way as `Type`, so you do not need to apply functions with `-Merge`/`-State` suffixes. - `SimpleAggregateFunction` has better performance than `AggregateFunction` with same aggregation function. +`SimpleAggregateFunction` has better performance than `AggregateFunction` with same aggregation function. +::: **Parameters** diff --git a/docs/en/sql-reference/data-types/special-data-types/expression.md b/docs/en/sql-reference/data-types/special-data-types/expression.md index e1ffba478e6..b6a2a2ebb9d 100644 --- a/docs/en/sql-reference/data-types/special-data-types/expression.md +++ b/docs/en/sql-reference/data-types/special-data-types/expression.md @@ -1,6 +1,6 @@ --- -toc_priority: 58 -toc_title: Expression +sidebar_position: 58 +sidebar_label: Expression --- # Expression {#expression} diff --git a/docs/en/sql-reference/data-types/special-data-types/index.md b/docs/en/sql-reference/data-types/special-data-types/index.md index 3398af94c70..5455d34a2a2 100644 --- a/docs/en/sql-reference/data-types/special-data-types/index.md +++ b/docs/en/sql-reference/data-types/special-data-types/index.md @@ -1,8 +1,6 @@ --- -toc_folder_title: Special Data Types -toc_hidden: true -toc_priority: 55 -toc_title: hidden +sidebar_label: Special Data Types +sidebar_position: 55 --- # Special Data Types {#special-data-types} diff --git a/docs/en/sql-reference/data-types/special-data-types/interval.md b/docs/en/sql-reference/data-types/special-data-types/interval.md index 7c0c5b00c0d..3ebeee01bf6 100644 --- a/docs/en/sql-reference/data-types/special-data-types/interval.md +++ b/docs/en/sql-reference/data-types/special-data-types/interval.md @@ -1,14 +1,15 @@ --- -toc_priority: 61 -toc_title: Interval +sidebar_position: 61 +sidebar_label: Interval --- # Interval {#data-type-interval} The family of data types representing time and date intervals. The resulting types of the [INTERVAL](../../../sql-reference/operators/index.md#operator-interval) operator. -!!! warning "Warning" - `Interval` data type values can’t be stored in tables. +:::warning +`Interval` data type values can’t be stored in tables. 
+::: Structure: diff --git a/docs/en/sql-reference/data-types/special-data-types/nothing.md b/docs/en/sql-reference/data-types/special-data-types/nothing.md index e69272a665e..f9f296f7dc4 100644 --- a/docs/en/sql-reference/data-types/special-data-types/nothing.md +++ b/docs/en/sql-reference/data-types/special-data-types/nothing.md @@ -1,6 +1,6 @@ --- -toc_priority: 60 -toc_title: Nothing +sidebar_position: 60 +sidebar_label: Nothing --- # Nothing {#nothing} diff --git a/docs/en/sql-reference/data-types/special-data-types/set.md b/docs/en/sql-reference/data-types/special-data-types/set.md index 6babd047888..6d447b96f3b 100644 --- a/docs/en/sql-reference/data-types/special-data-types/set.md +++ b/docs/en/sql-reference/data-types/special-data-types/set.md @@ -1,6 +1,6 @@ --- -toc_priority: 59 -toc_title: Set +sidebar_position: 59 +sidebar_label: Set --- # Set {#set} diff --git a/docs/en/sql-reference/data-types/string.md b/docs/en/sql-reference/data-types/string.md index 3d0f01e147f..e2903c7329d 100644 --- a/docs/en/sql-reference/data-types/string.md +++ b/docs/en/sql-reference/data-types/string.md @@ -1,6 +1,6 @@ --- -toc_priority: 44 -toc_title: String +sidebar_position: 44 +sidebar_label: String --- # String {#string} diff --git a/docs/en/sql-reference/data-types/tuple.md b/docs/en/sql-reference/data-types/tuple.md index b28bef67af5..eea48ab37b4 100644 --- a/docs/en/sql-reference/data-types/tuple.md +++ b/docs/en/sql-reference/data-types/tuple.md @@ -1,6 +1,6 @@ --- -toc_priority: 54 -toc_title: Tuple(T1, T2, ...) +sidebar_position: 54 +sidebar_label: Tuple(T1, T2, ...) --- # Tuple(t1, T2, …) {#tuplet1-t2} diff --git a/docs/en/sql-reference/data-types/uuid.md b/docs/en/sql-reference/data-types/uuid.md index 528534de0a0..010fc0b5cf5 100644 --- a/docs/en/sql-reference/data-types/uuid.md +++ b/docs/en/sql-reference/data-types/uuid.md @@ -1,6 +1,6 @@ --- -toc_priority: 46 -toc_title: UUID +sidebar_position: 46 +sidebar_label: UUID --- # UUID {#uuid-data-type} diff --git a/docs/en/sql-reference/dictionaries/external-dictionaries/_category_.yml b/docs/en/sql-reference/dictionaries/external-dictionaries/_category_.yml new file mode 100644 index 00000000000..77f42ba74d1 --- /dev/null +++ b/docs/en/sql-reference/dictionaries/external-dictionaries/_category_.yml @@ -0,0 +1,7 @@ +position: 37 +label: 'External Dictionaries' +collapsible: true +collapsed: true +link: + type: generated-index + title: External Dictionaries \ No newline at end of file diff --git a/docs/en/sql-reference/dictionaries/external-dictionaries/external-dicts-dict-hierarchical.md b/docs/en/sql-reference/dictionaries/external-dictionaries/external-dicts-dict-hierarchical.md index 08d3b8d8ad0..c48ad217431 100644 --- a/docs/en/sql-reference/dictionaries/external-dictionaries/external-dicts-dict-hierarchical.md +++ b/docs/en/sql-reference/dictionaries/external-dictionaries/external-dicts-dict-hierarchical.md @@ -1,9 +1,9 @@ --- -toc_priority: 45 -toc_title: Hierarchical dictionaries +sidebar_position: 45 +sidebar_label: Hierarchical dictionaries --- -# Hierarchical Dictionaries {#hierarchical-dictionaries} +# Hierarchical Dictionaries ClickHouse supports hierarchical dictionaries with a [numeric key](../../../sql-reference/dictionaries/external-dictionaries/external-dicts-dict-structure.md#ext_dict-numeric-key). 
diff --git a/docs/en/sql-reference/dictionaries/external-dictionaries/external-dicts-dict-layout.md b/docs/en/sql-reference/dictionaries/external-dictionaries/external-dicts-dict-layout.md index effcc614930..bd88a8b09f7 100644 --- a/docs/en/sql-reference/dictionaries/external-dictionaries/external-dicts-dict-layout.md +++ b/docs/en/sql-reference/dictionaries/external-dictionaries/external-dicts-dict-layout.md @@ -1,9 +1,9 @@ --- -toc_priority: 41 -toc_title: Storing Dictionaries in Memory +sidebar_position: 41 +sidebar_label: Storing Dictionaries in Memory --- -# Storing Dictionaries in Memory {#dicts-external-dicts-dict-layout} +# Storing Dictionaries in Memory There are a variety of ways to store dictionaries in memory. @@ -238,8 +238,9 @@ Example: The table contains discounts for each advertiser in the format: To use a sample for date ranges, define the `range_min` and `range_max` elements in the [structure](../../../sql-reference/dictionaries/external-dictionaries/external-dicts-dict-structure.md). These elements must contain elements `name` and `type` (if `type` is not specified, the default type will be used - Date). `type` can be any numeric type (Date / DateTime / UInt64 / Int32 / others). -!!! warning "Warning" - Values of `range_min` and `range_max` should fit in `Int64` type. +:::warning +Values of `range_min` and `range_max` should fit in `Int64` type. +::: Example: @@ -407,8 +408,9 @@ Set a large enough cache size. You need to experiment to select the number of ce 3. Assess memory consumption using the `system.dictionaries` table. 4. Increase or decrease the number of cells until the required memory consumption is reached. -!!! warning "Warning" - Do not use ClickHouse as a source, because it is slow to process queries with random reads. +:::warning +Do not use ClickHouse as a source, because it is slow to process queries with random reads. +::: ### complex_key_cache {#complex-key-cache} diff --git a/docs/en/sql-reference/dictionaries/external-dictionaries/external-dicts-dict-lifetime.md b/docs/en/sql-reference/dictionaries/external-dictionaries/external-dicts-dict-lifetime.md index afef6ae249d..83814781005 100644 --- a/docs/en/sql-reference/dictionaries/external-dictionaries/external-dicts-dict-lifetime.md +++ b/docs/en/sql-reference/dictionaries/external-dictionaries/external-dicts-dict-lifetime.md @@ -1,9 +1,9 @@ --- -toc_priority: 42 -toc_title: Dictionary Updates +sidebar_position: 42 +sidebar_label: Dictionary Updates --- -# Dictionary Updates {#dictionary-updates} +# Dictionary Updates ClickHouse periodically updates the dictionaries. The update interval for fully downloaded dictionaries and the invalidation interval for cached dictionaries are defined in the `` tag in seconds. diff --git a/docs/en/sql-reference/dictionaries/external-dictionaries/external-dicts-dict-polygon.md b/docs/en/sql-reference/dictionaries/external-dictionaries/external-dicts-dict-polygon.md index b49f384367d..7fcea84b55d 100644 --- a/docs/en/sql-reference/dictionaries/external-dictionaries/external-dicts-dict-polygon.md +++ b/docs/en/sql-reference/dictionaries/external-dictionaries/external-dicts-dict-polygon.md @@ -1,9 +1,9 @@ --- -toc_priority: 46 -toc_title: Polygon Dictionaries With Grids +sidebar_position: 46 +sidebar_label: Polygon Dictionaries With Grids --- -# Polygon dictionaries {#polygon-dictionaries} +# Polygon dictionaries Polygon dictionaries allow you to efficiently search for the polygon containing specified points. For example: defining a city area by geographical coordinates. 
diff --git a/docs/en/sql-reference/dictionaries/external-dictionaries/external-dicts-dict-sources.md b/docs/en/sql-reference/dictionaries/external-dictionaries/external-dicts-dict-sources.md index c3c4bbc6493..e5502a17a3a 100644 --- a/docs/en/sql-reference/dictionaries/external-dictionaries/external-dicts-dict-sources.md +++ b/docs/en/sql-reference/dictionaries/external-dictionaries/external-dicts-dict-sources.md @@ -1,9 +1,9 @@ --- -toc_priority: 43 -toc_title: Sources of External Dictionaries +sidebar_position: 43 +sidebar_label: Sources of External Dictionaries --- -# Sources of External Dictionaries {#dicts-external-dicts-dict-sources} +# Sources of External Dictionaries An external dictionary can be connected from many different sources. @@ -220,8 +220,9 @@ When creating a dictionary using the DDL command (`CREATE DICTIONARY ...`) remot ### Known Vulnerability of the ODBC Dictionary Functionality {#known-vulnerability-of-the-odbc-dictionary-functionality} -!!! attention "Attention" - When connecting to the database through the ODBC driver connection parameter `Servername` can be substituted. In this case values of `USERNAME` and `PASSWORD` from `odbc.ini` are sent to the remote server and can be compromised. +:::note +When connecting to the database through the ODBC driver connection parameter `Servername` can be substituted. In this case values of `USERNAME` and `PASSWORD` from `odbc.ini` are sent to the remote server and can be compromised. +::: **Example of insecure use** @@ -471,8 +472,9 @@ Setting fields: - `invalidate_query` – Query for checking the dictionary status. Optional parameter. Read more in the section [Updating dictionaries](../../../sql-reference/dictionaries/external-dictionaries/external-dicts-dict-lifetime.md). - `query` – The custom query. Optional parameter. -!!! info "Note" - The `table` and `query` fields cannot be used together. And either one of the `table` or `query` fields must be declared. +:::note +The `table` and `query` fields cannot be used together. And either one of the `table` or `query` fields must be declared. +::: ClickHouse receives quoting symbols from ODBC-driver and quote all settings in queries to driver, so it’s necessary to set table name accordingly to table name case in database. @@ -549,8 +551,9 @@ Setting fields: - `query` – The custom query. Optional parameter. -!!! info "Note" - The `table` or `where` fields cannot be used together with the `query` field. And either one of the `table` or `query` fields must be declared. +:::note +The `table` or `where` fields cannot be used together with the `query` field. And either one of the `table` or `query` fields must be declared. +::: MySQL can be connected on a local host via sockets. To do this, set `host` and `socket`. @@ -639,8 +642,9 @@ Setting fields: - `secure` - Use ssl for connection. - `query` – The custom query. Optional parameter. -!!! info "Note" - The `table` or `where` fields cannot be used together with the `query` field. And either one of the `table` or `query` fields must be declared. +:::note +The `table` or `where` fields cannot be used together with the `query` field. And either one of the `table` or `query` fields must be declared. +::: ### Mongodb {#dicts-external_dicts_dict_sources-mongodb} @@ -752,8 +756,9 @@ Setting fields: - `max_threads` – The maximum number of threads to use for loading data from multiple partitions in compose key dictionaries. - `query` – The custom query. Optional parameter. -!!! 
info "Note" - The `column_family` or `where` fields cannot be used together with the `query` field. And either one of the `column_family` or `query` fields must be declared. +:::note +The `column_family` or `where` fields cannot be used together with the `query` field. And either one of the `column_family` or `query` fields must be declared. +::: ### PostgreSQL {#dicts-external_dicts_dict_sources-postgresql} @@ -808,5 +813,6 @@ Setting fields: - `invalidate_query` – Query for checking the dictionary status. Optional parameter. Read more in the section [Updating dictionaries](../../../sql-reference/dictionaries/external-dictionaries/external-dicts-dict-lifetime.md). - `query` – The custom query. Optional parameter. -!!! info "Note" - The `table` or `where` fields cannot be used together with the `query` field. And either one of the `table` or `query` fields must be declared. +:::note +The `table` or `where` fields cannot be used together with the `query` field. And either one of the `table` or `query` fields must be declared. +::: \ No newline at end of file diff --git a/docs/en/sql-reference/dictionaries/external-dictionaries/external-dicts-dict-structure.md b/docs/en/sql-reference/dictionaries/external-dictionaries/external-dicts-dict-structure.md index 0d1c4535b28..2712bbf6911 100644 --- a/docs/en/sql-reference/dictionaries/external-dictionaries/external-dicts-dict-structure.md +++ b/docs/en/sql-reference/dictionaries/external-dictionaries/external-dicts-dict-structure.md @@ -1,11 +1,11 @@ --- -toc_priority: 44 -toc_title: Dictionary Key and Fields +sidebar_position: 44 +sidebar_label: Dictionary Key and Fields --- -# Dictionary Key and Fields {#dictionary-key-and-fields} +# Dictionary Key and Fields -The `` clause describes the dictionary key and fields available for queries. +The `structure` clause describes the dictionary key and fields available for queries. XML description: @@ -56,8 +56,9 @@ ClickHouse supports the following types of keys: An xml structure can contain either `` or ``. DDL-query must contain single `PRIMARY KEY`. -!!! warning "Warning" - You must not describe key as an attribute. +:::warning +You must not describe key as an attribute. +::: ### Numeric Key {#ext_dict-numeric-key} @@ -92,8 +93,9 @@ PRIMARY KEY Id The key can be a `tuple` from any types of fields. The [layout](../../../sql-reference/dictionaries/external-dictionaries/external-dicts-dict-layout.md) in this case must be `complex_key_hashed` or `complex_key_cache`. -!!! tip "Tip" - A composite key can consist of a single element. This makes it possible to use a string as the key, for instance. +:::tip +A composite key can consist of a single element. This makes it possible to use a string as the key, for instance. +::: The key structure is set in the element ``. Key fields are specified in the same format as the dictionary [attributes](../../../sql-reference/dictionaries/external-dictionaries/external-dicts-dict-structure.md). 
Example: diff --git a/docs/en/sql-reference/dictionaries/external-dictionaries/external-dicts-dict.md b/docs/en/sql-reference/dictionaries/external-dictionaries/external-dicts-dict.md index e15d944130e..bb4fcdab51a 100644 --- a/docs/en/sql-reference/dictionaries/external-dictionaries/external-dicts-dict.md +++ b/docs/en/sql-reference/dictionaries/external-dictionaries/external-dicts-dict.md @@ -1,9 +1,9 @@ --- -toc_priority: 40 -toc_title: Configuring an External Dictionary +sidebar_position: 40 +sidebar_label: Configuring an External Dictionary --- -# Configuring an External Dictionary {#dicts-external-dicts-dict} +# Configuring an External Dictionary If dictionary is configured using xml file, than dictionary configuration has the following structure: diff --git a/docs/en/sql-reference/dictionaries/external-dictionaries/external-dicts.md b/docs/en/sql-reference/dictionaries/external-dictionaries/external-dicts.md index 00025c70c60..d816888f019 100644 --- a/docs/en/sql-reference/dictionaries/external-dictionaries/external-dicts.md +++ b/docs/en/sql-reference/dictionaries/external-dictionaries/external-dicts.md @@ -1,9 +1,9 @@ --- -toc_priority: 39 -toc_title: General Description +sidebar_position: 39 +sidebar_label: General Description --- -# External Dictionaries {#dicts-external-dicts} +# External Dictionaries You can add your own dictionaries from various data sources. The data source for a dictionary can be a local text or executable file, an HTTP(s) resource, or another DBMS. For more information, see “[Sources for external dictionaries](../../../sql-reference/dictionaries/external-dictionaries/external-dicts-dict-sources.md)”. @@ -45,8 +45,9 @@ You can [configure](../../../sql-reference/dictionaries/external-dictionaries/ex [DDL queries for dictionaries](../../../sql-reference/statements/create/dictionary.md) does not require any additional records in server configuration. They allow to work with dictionaries as first-class entities, like tables or views. -!!! attention "Attention" - You can convert values for a small dictionary by describing it in a `SELECT` query (see the [transform](../../../sql-reference/functions/other-functions.md) function). This functionality is not related to external dictionaries. +:::note +You can convert values for a small dictionary by describing it in a `SELECT` query (see the [transform](../../../sql-reference/functions/other-functions.md) function). This functionality is not related to external dictionaries. 
+::: ## See Also {#ext-dicts-see-also} diff --git a/docs/en/sql-reference/dictionaries/external-dictionaries/index.md b/docs/en/sql-reference/dictionaries/external-dictionaries/index.md deleted file mode 100644 index 4098ac38060..00000000000 --- a/docs/en/sql-reference/dictionaries/external-dictionaries/index.md +++ /dev/null @@ -1,6 +0,0 @@ ---- -toc_folder_title: External Dictionaries -toc_priority: 37 ---- - - diff --git a/docs/en/sql-reference/dictionaries/index.md b/docs/en/sql-reference/dictionaries/index.md index 22f4182a1c0..8e54b70eab0 100644 --- a/docs/en/sql-reference/dictionaries/index.md +++ b/docs/en/sql-reference/dictionaries/index.md @@ -1,7 +1,6 @@ --- -toc_folder_title: Dictionaries -toc_priority: 35 -toc_title: Introduction +sidebar_label: Dictionaries +sidebar_position: 35 --- # Dictionaries {#dictionaries} diff --git a/docs/en/sql-reference/dictionaries/internal-dicts.md b/docs/en/sql-reference/dictionaries/internal-dicts.md index a8976772aa5..1996c974412 100644 --- a/docs/en/sql-reference/dictionaries/internal-dicts.md +++ b/docs/en/sql-reference/dictionaries/internal-dicts.md @@ -1,6 +1,6 @@ --- -toc_priority: 39 -toc_title: Internal Dictionaries +sidebar_position: 39 +sidebar_label: Internal Dictionaries --- # Internal Dictionaries {#internal_dicts} diff --git a/docs/en/sql-reference/distributed-ddl.md b/docs/en/sql-reference/distributed-ddl.md index c291c85fa7a..e0eae13672a 100644 --- a/docs/en/sql-reference/distributed-ddl.md +++ b/docs/en/sql-reference/distributed-ddl.md @@ -1,6 +1,6 @@ --- -toc_priority: 32 -toc_title: Distributed DDL +sidebar_position: 32 +sidebar_label: Distributed DDL --- # Distributed DDL Queries (ON CLUSTER Clause) {#distributed-ddl-queries-on-cluster-clause} @@ -17,5 +17,6 @@ In order to run these queries correctly, each host must have the same cluster de The local version of the query will eventually be executed on each host in the cluster, even if some hosts are currently not available. -!!! warning "Warning" - The order for executing queries within a single host is guaranteed. +:::warning +The order for executing queries within a single host is guaranteed. +::: \ No newline at end of file diff --git a/docs/en/sql-reference/functions/arithmetic-functions.md b/docs/en/sql-reference/functions/arithmetic-functions.md index 40fadf34eab..63c481c9ae6 100644 --- a/docs/en/sql-reference/functions/arithmetic-functions.md +++ b/docs/en/sql-reference/functions/arithmetic-functions.md @@ -1,6 +1,6 @@ --- -toc_priority: 34 -toc_title: Arithmetic +sidebar_position: 34 +sidebar_label: Arithmetic --- # Arithmetic Functions {#arithmetic-functions} diff --git a/docs/en/sql-reference/functions/array-functions.md b/docs/en/sql-reference/functions/array-functions.md index 8231cda4b77..3f96f75e7b8 100644 --- a/docs/en/sql-reference/functions/array-functions.md +++ b/docs/en/sql-reference/functions/array-functions.md @@ -1,6 +1,6 @@ --- -toc_priority: 35 -toc_title: Arrays +sidebar_position: 35 +sidebar_label: Arrays --- # Array Functions {#functions-for-working-with-arrays} @@ -17,8 +17,9 @@ empty([x]) An array is considered empty if it does not contain any elements. -!!! note "Note" - Can be optimized by enabling the [optimize_functions_to_subcolumns](../../operations/settings/settings.md#optimize-functions-to-subcolumns) setting. With `optimize_functions_to_subcolumns = 1` the function reads only [size0](../../sql-reference/data-types/array.md#array-size) subcolumn instead of reading and processing the whole array column. 
The query `SELECT empty(arr) FROM TABLE;` transforms to `SELECT arr.size0 = 0 FROM TABLE;`. +:::note +Can be optimized by enabling the [optimize_functions_to_subcolumns](../../operations/settings/settings.md#optimize-functions-to-subcolumns) setting. With `optimize_functions_to_subcolumns = 1` the function reads only [size0](../../sql-reference/data-types/array.md#array-size) subcolumn instead of reading and processing the whole array column. The query `SELECT empty(arr) FROM TABLE;` transforms to `SELECT arr.size0 = 0 FROM TABLE;`. +::: The function also works for [strings](string-functions.md#empty) or [UUID](uuid-functions.md#empty). @@ -60,8 +61,9 @@ notEmpty([x]) An array is considered non-empty if it contains at least one element. -!!! note "Note" - Can be optimized by enabling the [optimize_functions_to_subcolumns](../../operations/settings/settings.md#optimize-functions-to-subcolumns) setting. With `optimize_functions_to_subcolumns = 1` the function reads only [size0](../../sql-reference/data-types/array.md#array-size) subcolumn instead of reading and processing the whole array column. The query `SELECT notEmpty(arr) FROM table` transforms to `SELECT arr.size0 != 0 FROM TABLE`. +:::note +Can be optimized by enabling the [optimize_functions_to_subcolumns](../../operations/settings/settings.md#optimize-functions-to-subcolumns) setting. With `optimize_functions_to_subcolumns = 1` the function reads only [size0](../../sql-reference/data-types/array.md#array-size) subcolumn instead of reading and processing the whole array column. The query `SELECT notEmpty(arr) FROM table` transforms to `SELECT arr.size0 != 0 FROM TABLE`. +::: The function also works for [strings](string-functions.md#notempty) or [UUID](uuid-functions.md#notempty). @@ -733,8 +735,9 @@ SELECT arraySort((x, y) -> -y, [0, 1, 2], [1, 2, 3]) as res; └─────────┘ ``` -!!! note "Note" - To improve sorting efficiency, the [Schwartzian transform](https://en.wikipedia.org/wiki/Schwartzian_transform) is used. +:::note +To improve sorting efficiency, the [Schwartzian transform](https://en.wikipedia.org/wiki/Schwartzian_transform) is used. 
+::: ## arrayReverseSort(\[func,\] arr, …) {#array_functions-reverse-sort} diff --git a/docs/en/sql-reference/functions/array-join.md b/docs/en/sql-reference/functions/array-join.md index e87d0bca4bb..24d9c2b08d8 100644 --- a/docs/en/sql-reference/functions/array-join.md +++ b/docs/en/sql-reference/functions/array-join.md @@ -1,6 +1,6 @@ --- -toc_priority: 61 -toc_title: arrayJoin +sidebar_position: 61 +sidebar_label: arrayJoin --- # arrayJoin function {#functions_arrayjoin} diff --git a/docs/en/sql-reference/functions/bit-functions.md b/docs/en/sql-reference/functions/bit-functions.md index 24adb362c98..c23c5ac5431 100644 --- a/docs/en/sql-reference/functions/bit-functions.md +++ b/docs/en/sql-reference/functions/bit-functions.md @@ -1,6 +1,6 @@ --- -toc_priority: 48 -toc_title: Bit +sidebar_position: 48 +sidebar_label: Bit --- # Bit Functions {#bit-functions} diff --git a/docs/en/sql-reference/functions/bitmap-functions.md b/docs/en/sql-reference/functions/bitmap-functions.md index a6104835469..68d1fc88a31 100644 --- a/docs/en/sql-reference/functions/bitmap-functions.md +++ b/docs/en/sql-reference/functions/bitmap-functions.md @@ -1,6 +1,6 @@ --- -toc_priority: 49 -toc_title: Bitmap +sidebar_position: 49 +sidebar_label: Bitmap --- # Bitmap Functions {#bitmap-functions} diff --git a/docs/en/sql-reference/functions/comparison-functions.md b/docs/en/sql-reference/functions/comparison-functions.md index edaf0a01c73..b5e842ddcad 100644 --- a/docs/en/sql-reference/functions/comparison-functions.md +++ b/docs/en/sql-reference/functions/comparison-functions.md @@ -1,6 +1,6 @@ --- -toc_priority: 36 -toc_title: Comparison +sidebar_position: 36 +sidebar_label: Comparison --- # Comparison Functions {#comparison-functions} diff --git a/docs/en/sql-reference/functions/conditional-functions.md b/docs/en/sql-reference/functions/conditional-functions.md index 241112f7f7f..21189bbb072 100644 --- a/docs/en/sql-reference/functions/conditional-functions.md +++ b/docs/en/sql-reference/functions/conditional-functions.md @@ -1,6 +1,6 @@ --- -toc_priority: 43 -toc_title: 'Conditional ' +sidebar_position: 43 +sidebar_label: 'Conditional ' --- # Conditional Functions {#conditional-functions} diff --git a/docs/en/sql-reference/functions/date-time-functions.md b/docs/en/sql-reference/functions/date-time-functions.md index d535a516b3a..5f783cf4149 100644 --- a/docs/en/sql-reference/functions/date-time-functions.md +++ b/docs/en/sql-reference/functions/date-time-functions.md @@ -1,6 +1,6 @@ --- -toc_priority: 39 -toc_title: Dates and Times +sidebar_position: 39 +sidebar_label: Dates and Times --- # Functions for Working with Dates and Times {#functions-for-working-with-dates-and-times} @@ -266,8 +266,9 @@ Result: └────────────────┘ ``` -!!! attention "Attention" - The return type `toStartOf*` functions described below is `Date` or `DateTime`. Though these functions can take `DateTime64` as an argument, passing them a `DateTime64` that is out of the normal range (years 1925 - 2283) will give an incorrect result. +:::note +The return type `toStartOf*` functions described below is `Date` or `DateTime`. Though these functions can take `DateTime64` as an argument, passing them a `DateTime64` that is out of the normal range (years 1925 - 2283) will give an incorrect result. +::: ## toStartOfYear {#tostartofyear} @@ -290,8 +291,9 @@ Returns the date. Rounds down a date or date with time to the first day of the month. Returns the date. -!!! 
attention "Attention" - The behavior of parsing incorrect dates is implementation specific. ClickHouse may return zero date, throw an exception or do “natural” overflow. +:::note +The behavior of parsing incorrect dates is implementation specific. ClickHouse may return zero date, throw an exception or do “natural” overflow. +::: ## toMonday {#tomonday} diff --git a/docs/en/sql-reference/functions/encoding-functions.md b/docs/en/sql-reference/functions/encoding-functions.md index e3d5a4b18db..6e25befcbc7 100644 --- a/docs/en/sql-reference/functions/encoding-functions.md +++ b/docs/en/sql-reference/functions/encoding-functions.md @@ -1,6 +1,6 @@ --- -toc_priority: 52 -toc_title: Encoding +sidebar_position: 52 +sidebar_label: Encoding --- # Encoding Functions {#encoding-functions} @@ -170,8 +170,9 @@ Performs the opposite operation of [hex](#hex). It interprets each pair of hexad If you want to convert the result to a number, you can use the [reverse](../../sql-reference/functions/string-functions.md#reverse) and [reinterpretAs<Type>](../../sql-reference/functions/type-conversion-functions.md#type-conversion-functions) functions. -!!! note "Note" - If `unhex` is invoked from within the `clickhouse-client`, binary strings display using UTF-8. +:::note +If `unhex` is invoked from within the `clickhouse-client`, binary strings display using UTF-8. +::: Alias: `UNHEX`. @@ -328,8 +329,9 @@ Alias: `UNBIN`. For a numeric argument `unbin()` does not return the inverse of `bin()`. If you want to convert the result to a number, you can use the [reverse](../../sql-reference/functions/string-functions.md#reverse) and [reinterpretAs<Type>](../../sql-reference/functions/type-conversion-functions.md#reinterpretasuint8163264) functions. -!!! note "Note" - If `unbin` is invoked from within the `clickhouse-client`, binary strings are displayed using UTF-8. +:::note +If `unbin` is invoked from within the `clickhouse-client`, binary strings are displayed using UTF-8. +::: Supports binary digits `0` and `1`. The number of binary digits does not have to be multiples of eight. If the argument string contains anything other than binary digits, some implementation-defined result is returned (an exception isn’t thrown). diff --git a/docs/en/sql-reference/functions/encryption-functions.md b/docs/en/sql-reference/functions/encryption-functions.md index ea4d0f84488..942a63a48a8 100644 --- a/docs/en/sql-reference/functions/encryption-functions.md +++ b/docs/en/sql-reference/functions/encryption-functions.md @@ -1,6 +1,6 @@ --- -toc_priority: 67 -toc_title: Encryption +sidebar_position: 67 +sidebar_label: Encryption --- # Encryption functions {#encryption-functions} diff --git a/docs/en/sql-reference/functions/ext-dict-functions.md b/docs/en/sql-reference/functions/ext-dict-functions.md index 84e1e5eca3b..1d3f5952c98 100644 --- a/docs/en/sql-reference/functions/ext-dict-functions.md +++ b/docs/en/sql-reference/functions/ext-dict-functions.md @@ -1,10 +1,11 @@ --- -toc_priority: 58 -toc_title: External Dictionaries +sidebar_position: 58 +sidebar_label: External Dictionaries --- -!!! attention "Attention" - For dictionaries, created with [DDL queries](../../sql-reference/statements/create/dictionary.md), the `dict_name` parameter must be fully specified, like `.`. Otherwise, the current database is used. +:::note +For dictionaries created with [DDL queries](../../sql-reference/statements/create/dictionary.md), the `dict_name` parameter must be fully specified, like `.`. Otherwise, the current database is used. 
+::: # Functions for Working with External Dictionaries {#ext_dict_functions} diff --git a/docs/en/sql-reference/functions/files.md b/docs/en/sql-reference/functions/files.md index 9cbf8932465..5bb77016039 100644 --- a/docs/en/sql-reference/functions/files.md +++ b/docs/en/sql-reference/functions/files.md @@ -1,6 +1,6 @@ --- -toc_priority: 43 -toc_title: Files +sidebar_position: 43 +sidebar_label: Files --- # Functions for Working with Files {#functions-for-working-with-files} diff --git a/docs/en/sql-reference/functions/functions-for-nulls.md b/docs/en/sql-reference/functions/functions-for-nulls.md index 42307093dda..0ff93357208 100644 --- a/docs/en/sql-reference/functions/functions-for-nulls.md +++ b/docs/en/sql-reference/functions/functions-for-nulls.md @@ -1,6 +1,6 @@ --- -toc_priority: 63 -toc_title: Nullable +sidebar_position: 63 +sidebar_label: Nullable --- # Functions for Working with Nullable Values {#functions-for-working-with-nullable-aggregates} diff --git a/docs/en/sql-reference/functions/geo/coordinates.md b/docs/en/sql-reference/functions/geo/coordinates.md index b0862dded67..41ba409cbc1 100644 --- a/docs/en/sql-reference/functions/geo/coordinates.md +++ b/docs/en/sql-reference/functions/geo/coordinates.md @@ -1,6 +1,6 @@ --- -toc_title: Geographical Coordinates -toc_priority: 62 +sidebar_label: Geographical Coordinates +sidebar_position: 62 --- diff --git a/docs/en/sql-reference/functions/geo/geohash.md b/docs/en/sql-reference/functions/geo/geohash.md index 1192ed5f56a..e65456d0c40 100644 --- a/docs/en/sql-reference/functions/geo/geohash.md +++ b/docs/en/sql-reference/functions/geo/geohash.md @@ -1,5 +1,5 @@ --- -toc_title: Geohash +sidebar_label: Geohash --- # Functions for Working with Geohash {#geohash} @@ -80,8 +80,9 @@ geohashesInBox(longitude_min, latitude_min, longitude_max, latitude_max, precisi - `latitude_max` — Maximum latitude. Range: `[-90°, 90°]`. Type: [Float](../../../sql-reference/data-types/float.md). - `precision` — Geohash precision. Range: `[1, 12]`. Type: [UInt8](../../../sql-reference/data-types/int-uint.md). -!!! info "Note" - All coordinate parameters must be of the same type: either `Float32` or `Float64`. +:::note +All coordinate parameters must be of the same type: either `Float32` or `Float64`. +::: **Returned values** @@ -90,8 +91,9 @@ geohashesInBox(longitude_min, latitude_min, longitude_max, latitude_max, precisi Type: [Array](../../../sql-reference/data-types/array.md)([String](../../../sql-reference/data-types/string.md)). -!!! info "Note" - Function throws an exception if resulting array is over 10’000’000 items long. +:::note +Function throws an exception if resulting array is over 10’000’000 items long. 
+::: **Example** diff --git a/docs/en/sql-reference/functions/geo/h3.md b/docs/en/sql-reference/functions/geo/h3.md index ecbe00adfd7..50115dd4d75 100644 --- a/docs/en/sql-reference/functions/geo/h3.md +++ b/docs/en/sql-reference/functions/geo/h3.md @@ -1,5 +1,5 @@ --- -toc_title: H3 Indexes +sidebar_label: H3 Indexes --- # Functions for Working with H3 Indexes {#h3index} diff --git a/docs/en/sql-reference/functions/geo/index.md b/docs/en/sql-reference/functions/geo/index.md index 65bf2ab83cb..f76c3a3f731 100644 --- a/docs/en/sql-reference/functions/geo/index.md +++ b/docs/en/sql-reference/functions/geo/index.md @@ -1,8 +1,8 @@ --- -toc_title: hidden -toc_priority: 62 -toc_folder_title: Geo +sidebar_label: Geo +sidebar_position: 62 --- +# Geo Functions [Original article](https://clickhouse.com/docs/en/sql-reference/functions/geo/) diff --git a/docs/en/sql-reference/functions/geo/s2.md b/docs/en/sql-reference/functions/geo/s2.md index f8736bcc61a..c3d95d2f0a9 100644 --- a/docs/en/sql-reference/functions/geo/s2.md +++ b/docs/en/sql-reference/functions/geo/s2.md @@ -1,5 +1,5 @@ --- -toc_title: S2 Geometry +sidebar_label: S2 Geometry --- # Functions for Working with S2 Index {#s2index} diff --git a/docs/en/sql-reference/functions/hash-functions.md b/docs/en/sql-reference/functions/hash-functions.md index c892b814957..e4b1fdd3bbb 100644 --- a/docs/en/sql-reference/functions/hash-functions.md +++ b/docs/en/sql-reference/functions/hash-functions.md @@ -1,6 +1,6 @@ --- -toc_priority: 50 -toc_title: Hash +sidebar_position: 50 +sidebar_label: Hash --- # Hash Functions {#hash-functions} diff --git a/docs/en/sql-reference/functions/in-functions.md b/docs/en/sql-reference/functions/in-functions.md index c8936e74954..ab8ba93daba 100644 --- a/docs/en/sql-reference/functions/in-functions.md +++ b/docs/en/sql-reference/functions/in-functions.md @@ -1,6 +1,6 @@ --- -toc_priority: 60 -toc_title: IN Operator +sidebar_position: 60 +sidebar_label: IN Operator --- # Functions for Implementing the IN Operator {#functions-for-implementing-the-in-operator} diff --git a/docs/en/sql-reference/functions/index.md b/docs/en/sql-reference/functions/index.md index 7cceec889bd..261cf908e07 100644 --- a/docs/en/sql-reference/functions/index.md +++ b/docs/en/sql-reference/functions/index.md @@ -1,10 +1,9 @@ --- -toc_folder_title: Functions -toc_priority: 32 -toc_title: Introduction +sidebar_position: 32 +sidebar_label: Functions --- -# Functions {#functions} +# Functions There are at least\* two types of functions - regular functions (they are just called “functions”) and aggregate functions. These are completely different concepts. Regular functions work as if they are applied to each row separately (for each row, the result of the function does not depend on the other rows). Aggregate functions accumulate a set of values from various rows (i.e. they depend on the entire set of rows). diff --git a/docs/en/sql-reference/functions/introspection.md b/docs/en/sql-reference/functions/introspection.md index 1be68c6bdd4..694d07f18dc 100644 --- a/docs/en/sql-reference/functions/introspection.md +++ b/docs/en/sql-reference/functions/introspection.md @@ -1,14 +1,15 @@ --- -toc_priority: 65 -toc_title: Introspection +sidebar_position: 65 +sidebar_label: Introspection --- # Introspection Functions {#introspection-functions} You can use functions described in this chapter to introspect [ELF](https://en.wikipedia.org/wiki/Executable_and_Linkable_Format) and [DWARF](https://en.wikipedia.org/wiki/DWARF) for query profiling. -!!! 
warning "Warning" - These functions are slow and may impose security considerations. +:::warning +These functions are slow and may impose security considerations. +::: For proper operation of introspection functions: diff --git a/docs/en/sql-reference/functions/ip-address-functions.md b/docs/en/sql-reference/functions/ip-address-functions.md index 469a66d460f..c293c1ff317 100644 --- a/docs/en/sql-reference/functions/ip-address-functions.md +++ b/docs/en/sql-reference/functions/ip-address-functions.md @@ -1,6 +1,6 @@ --- -toc_priority: 55 -toc_title: IP Addresses +sidebar_position: 55 +sidebar_label: IP Addresses --- # Functions for Working with IPv4 and IPv6 Addresses {#functions-for-working-with-ip-addresses} diff --git a/docs/en/sql-reference/functions/json-functions.md b/docs/en/sql-reference/functions/json-functions.md index d5622ac5fdc..be69b7b4f2b 100644 --- a/docs/en/sql-reference/functions/json-functions.md +++ b/docs/en/sql-reference/functions/json-functions.md @@ -1,6 +1,6 @@ --- -toc_priority: 56 -toc_title: JSON +sidebar_position: 56 +sidebar_label: JSON --- # Functions for Working with JSON {#functions-for-working-with-json} @@ -359,8 +359,9 @@ SELECT JSON_EXISTS('{"hello":["world"]}', '$.hello[*]'); SELECT JSON_EXISTS('{"hello":["world"]}', '$.hello[0]'); ``` -!!! note "Note" - before version 21.11 the order of arguments was wrong, i.e. JSON_EXISTS(path, json) +:::note +Before version 21.11 the order of arguments was wrong, i.e. JSON_EXISTS(path, json) +::: ## JSON_QUERY(json, path) {#json-query} @@ -385,8 +386,9 @@ Result: [2] String ``` -!!! note "Note" - before version 21.11 the order of arguments was wrong, i.e. JSON_QUERY(path, json) +:::note +Before version 21.11 the order of arguments was wrong, i.e. JSON_QUERY(path, json) +::: ## JSON_VALUE(json, path) {#json-value} @@ -412,8 +414,9 @@ Result: String ``` -!!! note "Note" - before version 21.11 the order of arguments was wrong, i.e. JSON_VALUE(path, json) +:::note +Before version 21.11 the order of arguments was wrong, i.e. 
JSON_VALUE(path, json) +::: ## toJSONString {#tojsonstring} diff --git a/docs/en/sql-reference/functions/logical-functions.md b/docs/en/sql-reference/functions/logical-functions.md index dcdb01e2059..0055e253951 100644 --- a/docs/en/sql-reference/functions/logical-functions.md +++ b/docs/en/sql-reference/functions/logical-functions.md @@ -1,6 +1,6 @@ --- -toc_priority: 37 -toc_title: Logical +sidebar_position: 37 +sidebar_label: Logical --- # Logical Functions {#logical-functions} diff --git a/docs/en/sql-reference/functions/machine-learning-functions.md b/docs/en/sql-reference/functions/machine-learning-functions.md index b823340058e..5b3e8b87e34 100644 --- a/docs/en/sql-reference/functions/machine-learning-functions.md +++ b/docs/en/sql-reference/functions/machine-learning-functions.md @@ -1,6 +1,6 @@ --- -toc_priority: 64 -toc_title: Machine Learning +sidebar_position: 64 +sidebar_label: Machine Learning --- # Machine Learning Functions {#machine-learning-functions} diff --git a/docs/en/sql-reference/functions/math-functions.md b/docs/en/sql-reference/functions/math-functions.md index a5fc07cf687..645587b4f5c 100644 --- a/docs/en/sql-reference/functions/math-functions.md +++ b/docs/en/sql-reference/functions/math-functions.md @@ -1,6 +1,6 @@ --- -toc_priority: 44 -toc_title: Mathematical +sidebar_position: 44 +sidebar_label: Mathematical --- # Mathematical Functions {#mathematical-functions} diff --git a/docs/en/sql-reference/functions/nlp-functions.md b/docs/en/sql-reference/functions/nlp-functions.md index 8a1a44cf079..5a00252f56c 100644 --- a/docs/en/sql-reference/functions/nlp-functions.md +++ b/docs/en/sql-reference/functions/nlp-functions.md @@ -1,12 +1,13 @@ --- -toc_priority: 67 -toc_title: NLP +sidebar_position: 67 +sidebar_label: NLP --- # [experimental] Natural Language Processing functions {#nlp-functions} -!!! warning "Warning" - This is an experimental feature that is currently in development and is not ready for general use. It will change in unpredictable backwards-incompatible ways in future releases. Set `allow_experimental_nlp_functions = 1` to enable it. +:::warning +This is an experimental feature that is currently in development and is not ready for general use. It will change in unpredictable backwards-incompatible ways in future releases. Set `allow_experimental_nlp_functions = 1` to enable it. +::: ## stem {#stem} diff --git a/docs/en/sql-reference/functions/other-functions.md b/docs/en/sql-reference/functions/other-functions.md index bce3f9144b1..45e9ef43c6a 100644 --- a/docs/en/sql-reference/functions/other-functions.md +++ b/docs/en/sql-reference/functions/other-functions.md @@ -1,6 +1,6 @@ --- -toc_priority: 67 -toc_title: Other +sidebar_position: 67 +sidebar_label: Other --- # Other Functions {#other-functions} @@ -729,8 +729,9 @@ neighbor(column, offset[, default_value]) The result of the function depends on the affected data blocks and the order of data in the block. -!!! warning "Warning" - It can reach the neighbor rows only inside the currently processed data block. +:::warning +It can reach the neighbor rows only inside the currently processed data block. +::: The rows order used during the calculation of `neighbor` can differ from the order of rows returned to the user. To prevent that you can make a subquery with [ORDER BY](../../sql-reference/statements/select/order-by.md) and call the function from outside the subquery. @@ -838,8 +839,9 @@ Result: Calculates the difference between successive row values ​​in the data block. 
Returns 0 for the first row and the difference from the previous row for each subsequent row. -!!! warning "Warning" - It can reach the previous row only inside the currently processed data block. +:::warning +It can reach the previous row only inside the currently processed data block. +::: The result of the function depends on the affected data blocks and the order of data in the block. @@ -921,9 +923,9 @@ Each event has a start time and an end time. The start time is included in the e The function calculates the total number of active (concurrent) events for each event start time. -!!! warning "Warning" - Events must be ordered by the start time in ascending order. If this requirement is violated the function raises an exception. - Every data block is processed separately. If events from different data blocks overlap then they can not be processed correctly. +:::warning +Events must be ordered by the start time in ascending order. If this requirement is violated the function raises an exception. Every data block is processed separately. If events from different data blocks overlap then they can not be processed correctly. +::: **Syntax** @@ -1609,8 +1611,9 @@ Result: Accumulates states of an aggregate function for each row of a data block. -!!! warning "Warning" - The state is reset for each new data block. +:::warning +The state is reset for each new data block. +::: **Syntax** @@ -2068,8 +2071,9 @@ Number of digits. Type: [UInt8](../../sql-reference/data-types/int-uint.md#uint-ranges). - !!! note "Note" - For `Decimal` values takes into account their scales: calculates result over underlying integer type which is `(value * scale)`. For example: `countDigits(42) = 2`, `countDigits(42.000) = 5`, `countDigits(0.04200) = 4`. I.e. you may check decimal overflow for `Decimal64` with `countDecimal(x) > 18`. It's a slow variant of [isDecimalOverflow](#is-decimal-overflow). +:::note +For `Decimal` values takes into account their scales: calculates result over underlying integer type which is `(value * scale)`. For example: `countDigits(42) = 2`, `countDigits(42.000) = 5`, `countDigits(0.04200) = 4`. I.e. you may check decimal overflow for `Decimal64` with `countDecimal(x) > 18`. It's a slow variant of [isDecimalOverflow](#is-decimal-overflow). +::: **Example** diff --git a/docs/en/sql-reference/functions/random-functions.md b/docs/en/sql-reference/functions/random-functions.md index aab9483de45..5e20a93da1f 100644 --- a/docs/en/sql-reference/functions/random-functions.md +++ b/docs/en/sql-reference/functions/random-functions.md @@ -1,14 +1,15 @@ --- -toc_priority: 51 -toc_title: Pseudo-Random Numbers +sidebar_position: 51 +sidebar_label: Pseudo-Random Numbers --- # Functions for Generating Pseudo-Random Numbers {#functions-for-generating-pseudo-random-numbers} All the functions accept zero arguments or one argument. If an argument is passed, it can be any type, and its value is not used for anything. The only purpose of this argument is to prevent common subexpression elimination, so that two different instances of the same function return different columns with different random numbers. -!!! note "Note" - Non-cryptographic generators of pseudo-random numbers are used. +:::note +Non-cryptographic generators of pseudo-random numbers are used. 
+::: ## rand, rand32 {#rand} diff --git a/docs/en/sql-reference/functions/rounding-functions.md b/docs/en/sql-reference/functions/rounding-functions.md index b224e7ab406..a469318e623 100644 --- a/docs/en/sql-reference/functions/rounding-functions.md +++ b/docs/en/sql-reference/functions/rounding-functions.md @@ -1,6 +1,6 @@ --- -toc_priority: 45 -toc_title: Rounding +sidebar_position: 45 +sidebar_label: Rounding --- # Rounding Functions {#rounding-functions} diff --git a/docs/en/sql-reference/functions/splitting-merging-functions.md b/docs/en/sql-reference/functions/splitting-merging-functions.md index 7a4e04bbf6c..7e94c225f6b 100644 --- a/docs/en/sql-reference/functions/splitting-merging-functions.md +++ b/docs/en/sql-reference/functions/splitting-merging-functions.md @@ -1,6 +1,6 @@ --- -toc_priority: 47 -toc_title: Splitting and Merging Strings and Arrays +sidebar_position: 47 +sidebar_label: Splitting and Merging Strings and Arrays --- # Functions for Splitting and Merging Strings and Arrays {#functions-for-splitting-and-merging-strings-and-arrays} diff --git a/docs/en/sql-reference/functions/string-functions.md b/docs/en/sql-reference/functions/string-functions.md index a30cacde519..d63e466a836 100644 --- a/docs/en/sql-reference/functions/string-functions.md +++ b/docs/en/sql-reference/functions/string-functions.md @@ -1,12 +1,13 @@ --- -toc_priority: 40 -toc_title: Strings +sidebar_position: 40 +sidebar_label: Strings --- # Functions for Working with Strings {#functions-for-working-with-strings} -!!! note "Note" - Functions for [searching](../../sql-reference/functions/string-search-functions.md) and [replacing](../../sql-reference/functions/string-replace-functions.md) in strings are described separately. +:::note +Functions for [searching](../../sql-reference/functions/string-search-functions.md) and [replacing](../../sql-reference/functions/string-replace-functions.md) in strings are described separately. +::: ## empty {#empty} diff --git a/docs/en/sql-reference/functions/string-replace-functions.md b/docs/en/sql-reference/functions/string-replace-functions.md index 144b4fbc1da..1df8bfd0c44 100644 --- a/docs/en/sql-reference/functions/string-replace-functions.md +++ b/docs/en/sql-reference/functions/string-replace-functions.md @@ -1,12 +1,13 @@ --- -toc_priority: 42 -toc_title: For Replacing in Strings +sidebar_position: 42 +sidebar_label: For Replacing in Strings --- # Functions for Searching and Replacing in Strings {#functions-for-searching-and-replacing-in-strings} -!!! note "Note" - Functions for [searching](../../sql-reference/functions/string-search-functions.md) and [other manipulations with strings](../../sql-reference/functions/string-functions.md) are described separately. +:::note +Functions for [searching](../../sql-reference/functions/string-search-functions.md) and [other manipulations with strings](../../sql-reference/functions/string-functions.md) are described separately. 
+::: ## replaceOne(haystack, pattern, replacement) {#replaceonehaystack-pattern-replacement} diff --git a/docs/en/sql-reference/functions/string-search-functions.md b/docs/en/sql-reference/functions/string-search-functions.md index a0c0116a058..985d9f1e63a 100644 --- a/docs/en/sql-reference/functions/string-search-functions.md +++ b/docs/en/sql-reference/functions/string-search-functions.md @@ -1,14 +1,15 @@ --- -toc_priority: 41 -toc_title: For Searching in Strings +sidebar_position: 41 +sidebar_label: For Searching in Strings --- # Functions for Searching in Strings {#functions-for-searching-strings} The search is case-sensitive by default in all these functions. There are separate variants for case insensitive search. -!!! note "Note" - Functions for [replacing](../../sql-reference/functions/string-replace-functions.md) and [other manipulations with strings](../../sql-reference/functions/string-functions.md) are described separately. +:::note +Functions for [replacing](../../sql-reference/functions/string-replace-functions.md) and [other manipulations with strings](../../sql-reference/functions/string-functions.md) are described separately. +::: ## position(haystack, needle), locate(haystack, needle) {#position} @@ -30,8 +31,9 @@ position(needle IN haystack) Alias: `locate(haystack, needle[, start_pos])`. -!!! note "Note" - Syntax of `position(needle IN haystack)` provides SQL-compatibility, the function works the same way as to `position(haystack, needle)`. +:::note +Syntax of `position(needle IN haystack)` provides SQL-compatibility, the function works the same way as to `position(haystack, needle)`. +::: **Arguments** @@ -342,8 +344,9 @@ Returns 1, if at least one string needlei matches the string `haystac For a case-insensitive search or/and in UTF-8 format use functions `multiSearchAnyCaseInsensitive, multiSearchAnyUTF8, multiSearchAnyCaseInsensitiveUTF8`. -!!! note "Note" - In all `multiSearch*` functions the number of needles should be less than 28 because of implementation specification. +:::note +In all `multiSearch*` functions the number of needles should be less than 28 because of implementation specification. +::: ## match(haystack, pattern) {#matchhaystack-pattern} @@ -358,8 +361,9 @@ For patterns to search for substrings in a string, it is better to use LIKE or The same as `match`, but returns 0 if none of the regular expressions are matched and 1 if any of the patterns matches. It uses [hyperscan](https://github.com/intel/hyperscan) library. For patterns to search substrings in a string, it is better to use `multiSearchAny` since it works much faster. -!!! note "Note" - The length of any of the `haystack` string must be less than 232 bytes otherwise the exception is thrown. This restriction takes place because of hyperscan API. +:::note +The length of any of the `haystack` string must be less than 232 bytes otherwise the exception is thrown. This restriction takes place because of hyperscan API. +::: ## multiMatchAnyIndex(haystack, \[pattern1, pattern2, …, patternn\]) {#multimatchanyindexhaystack-pattern1-pattern2-patternn} @@ -381,11 +385,13 @@ The same as `multiFuzzyMatchAny`, but returns any index that matches the haystac The same as `multiFuzzyMatchAny`, but returns the array of all indices in any order that match the haystack within a constant edit distance. -!!! note "Note" - `multiFuzzyMatch*` functions do not support UTF-8 regular expressions, and such expressions are treated as bytes because of hyperscan restriction. 
+:::note +`multiFuzzyMatch*` functions do not support UTF-8 regular expressions, and such expressions are treated as bytes because of hyperscan restriction. +::: -!!! note "Note" - To turn off all functions that use hyperscan, use setting `SET allow_hyperscan = 0;`. +:::note +To turn off all functions that use hyperscan, use setting `SET allow_hyperscan = 0;`. +::: ## extract(haystack, pattern) {#extracthaystack-pattern} @@ -399,8 +405,9 @@ Extracts all the fragments of a string using a regular expression. If ‘haystac Matches all groups of the `haystack` string using the `pattern` regular expression. Returns an array of arrays, where the first array includes all fragments matching the first group, the second array - matching the second group, etc. -!!! note "Note" - `extractAllGroupsHorizontal` function is slower than [extractAllGroupsVertical](#extractallgroups-vertical). +:::note +`extractAllGroupsHorizontal` function is slower than [extractAllGroupsVertical](#extractallgroups-vertical). +::: **Syntax** @@ -570,8 +577,9 @@ Same as `ngramDistance` but calculates the non-symmetric difference between `nee For case-insensitive search or/and in UTF-8 format use functions `ngramSearchCaseInsensitive, ngramSearchUTF8, ngramSearchCaseInsensitiveUTF8`. -!!! note "Note" - For UTF-8 case we use 3-gram distance. All these are not perfectly fair n-gram distances. We use 2-byte hashes to hash n-grams and then calculate the (non-)symmetric difference between these hash tables – collisions may occur. With UTF-8 case-insensitive format we do not use fair `tolower` function – we zero the 5-th bit (starting from zero) of each codepoint byte and first bit of zeroth byte if bytes more than one – this works for Latin and mostly for all Cyrillic letters. +:::note +For UTF-8 case we use 3-gram distance. All these are not perfectly fair n-gram distances. We use 2-byte hashes to hash n-grams and then calculate the (non-)symmetric difference between these hash tables – collisions may occur. With UTF-8 case-insensitive format we do not use fair `tolower` function – we zero the 5-th bit (starting from zero) of each codepoint byte and first bit of zeroth byte if bytes more than one – this works for Latin and mostly for all Cyrillic letters. 
+::: ## countSubstrings {#countSubstrings} diff --git a/docs/en/sql-reference/functions/time-window-functions.md b/docs/en/sql-reference/functions/time-window-functions.md index 2ea44a6e585..b45866cf931 100644 --- a/docs/en/sql-reference/functions/time-window-functions.md +++ b/docs/en/sql-reference/functions/time-window-functions.md @@ -1,6 +1,6 @@ --- -toc_priority: 68 -toc_title: Time Window +sidebar_position: 68 +sidebar_label: Time Window --- # Time Window Functions {#time-window-functions} diff --git a/docs/en/sql-reference/functions/tuple-functions.md b/docs/en/sql-reference/functions/tuple-functions.md index 96bceb8958c..cfce02f4d31 100644 --- a/docs/en/sql-reference/functions/tuple-functions.md +++ b/docs/en/sql-reference/functions/tuple-functions.md @@ -1,6 +1,6 @@ --- -toc_priority: 66 -toc_title: Tuples +sidebar_position: 66 +sidebar_label: Tuples --- # Functions for Working with Tuples {#tuple-functions} diff --git a/docs/en/sql-reference/functions/tuple-map-functions.md b/docs/en/sql-reference/functions/tuple-map-functions.md index 8ead8c58c7a..a0d62ff5ecb 100644 --- a/docs/en/sql-reference/functions/tuple-map-functions.md +++ b/docs/en/sql-reference/functions/tuple-map-functions.md @@ -1,6 +1,6 @@ --- -toc_priority: 46 -toc_title: Working with maps +sidebar_position: 46 +sidebar_label: Working with maps --- # Functions for maps {#functions-for-working-with-tuple-maps} diff --git a/docs/en/sql-reference/functions/type-conversion-functions.md b/docs/en/sql-reference/functions/type-conversion-functions.md index 18cc3d98561..de6ca769589 100644 --- a/docs/en/sql-reference/functions/type-conversion-functions.md +++ b/docs/en/sql-reference/functions/type-conversion-functions.md @@ -1,6 +1,6 @@ --- -toc_priority: 38 -toc_title: Type Conversion +sidebar_position: 38 +sidebar_label: Type Conversion --- # Type Conversion Functions {#type-conversion-functions} @@ -689,8 +689,9 @@ x::t - Converted value. -!!! note "Note" - If the input value does not fit the bounds of the target type, the result overflows. For example, `CAST(-1, 'UInt8')` returns `255`. +:::note +If the input value does not fit the bounds of the target type, the result overflows. For example, `CAST(-1, 'UInt8')` returns `255`. +::: **Examples** @@ -1432,8 +1433,9 @@ Result: Converts a `DateTime64` to a `Int64` value with fixed sub-second precision. Input value is scaled up or down appropriately depending on it precision. -!!! info "Note" - The output value is a timestamp in UTC, not in the timezone of `DateTime64`. +:::note +The output value is a timestamp in UTC, not in the timezone of `DateTime64`. 
+::: **Syntax** diff --git a/docs/en/sql-reference/functions/url-functions.md b/docs/en/sql-reference/functions/url-functions.md index 5a305aa5033..c91029c4fce 100644 --- a/docs/en/sql-reference/functions/url-functions.md +++ b/docs/en/sql-reference/functions/url-functions.md @@ -1,6 +1,6 @@ --- -toc_priority: 54 -toc_title: URLs +sidebar_position: 54 +sidebar_label: URLs --- # Functions for Working with URLs {#functions-for-working-with-urls} diff --git a/docs/en/sql-reference/functions/uuid-functions.md b/docs/en/sql-reference/functions/uuid-functions.md index 3616b587bf7..d23b505a93f 100644 --- a/docs/en/sql-reference/functions/uuid-functions.md +++ b/docs/en/sql-reference/functions/uuid-functions.md @@ -1,6 +1,6 @@ --- -toc_priority: 53 -toc_title: UUID +sidebar_position: 53 +sidebar_label: UUID --- # Functions for Working with UUID {#functions-for-working-with-uuid} diff --git a/docs/en/sql-reference/functions/ym-dict-functions.md b/docs/en/sql-reference/functions/ym-dict-functions.md index 1e6c9cbd0b4..85215957443 100644 --- a/docs/en/sql-reference/functions/ym-dict-functions.md +++ b/docs/en/sql-reference/functions/ym-dict-functions.md @@ -1,6 +1,6 @@ --- -toc_priority: 59 -toc_title: Embedded Dictionaries +sidebar_position: 59 +sidebar_label: Embedded Dictionaries --- # Functions for Working with Embedded Dictionaries diff --git a/docs/en/sql-reference/index.md b/docs/en/sql-reference/index.md index e8fe092e622..1123c8533a9 100644 --- a/docs/en/sql-reference/index.md +++ b/docs/en/sql-reference/index.md @@ -1,8 +1,6 @@ --- -toc_folder_title: SQL Reference -toc_hidden: true -toc_priority: 28 -toc_title: hidden +sidebar_position: 28 +sidebar_label: SQL Reference --- # SQL Reference {#sql-reference} diff --git a/docs/en/sql-reference/operators/exists.md b/docs/en/sql-reference/operators/exists.md index ee0c7317637..25413790801 100644 --- a/docs/en/sql-reference/operators/exists.md +++ b/docs/en/sql-reference/operators/exists.md @@ -4,8 +4,9 @@ The `EXISTS` operator checks how many records are in the result of a subquery. I `EXISTS` can be used in a [WHERE](../../sql-reference/statements/select/where.md) clause. -!!! warning "Warning" - References to main query tables and columns are not supported in a subquery. +:::warning +References to main query tables and columns are not supported in a subquery. +::: **Syntax** diff --git a/docs/en/sql-reference/operators/in.md b/docs/en/sql-reference/operators/in.md index d8468370f3e..5dda097e799 100644 --- a/docs/en/sql-reference/operators/in.md +++ b/docs/en/sql-reference/operators/in.md @@ -119,8 +119,9 @@ FROM t_null There are two options for IN-s with subqueries (similar to JOINs): normal `IN` / `JOIN` and `GLOBAL IN` / `GLOBAL JOIN`. They differ in how they are run for distributed query processing. -!!! attention "Attention" - Remember that the algorithms described below may work differently depending on the [settings](../../operations/settings/settings.md) `distributed_product_mode` setting. +:::note +Remember that the algorithms described below may work differently depending on the [settings](../../operations/settings/settings.md) `distributed_product_mode` setting. +::: When using the regular IN, the query is sent to remote servers, and each of them runs the subqueries in the `IN` or `JOIN` clause. 
diff --git a/docs/en/sql-reference/operators/index.md b/docs/en/sql-reference/operators/index.md index a64dcd70c6c..4761f46ec05 100644 --- a/docs/en/sql-reference/operators/index.md +++ b/docs/en/sql-reference/operators/index.md @@ -1,6 +1,6 @@ --- -toc_priority: 38 -toc_title: Operators +sidebar_position: 38 +sidebar_label: Operators --- # Operators {#operators} @@ -210,8 +210,9 @@ Types of intervals: You can also use a string literal when setting the `INTERVAL` value. For example, `INTERVAL 1 HOUR` is identical to the `INTERVAL '1 hour'` or `INTERVAL '1' hour`. -!!! warning "Warning" - Intervals with different types can’t be combined. You can’t use expressions like `INTERVAL 4 DAY 1 HOUR`. Specify intervals in units that are smaller or equal to the smallest unit of the interval, for example, `INTERVAL 25 HOUR`. You can use consecutive operations, like in the example below. +:::warning +Intervals with different types can’t be combined. You can’t use expressions like `INTERVAL 4 DAY 1 HOUR`. Specify intervals in units that are smaller or equal to the smallest unit of the interval, for example, `INTERVAL 25 HOUR`. You can use consecutive operations, like in the example below. +::: Examples: @@ -247,9 +248,9 @@ SELECT now() AS current_date_time, current_date_time + INTERVAL '4' day + INTERV You can work with dates without using `INTERVAL`, just by adding or subtracting seconds, minutes, and hours. For example, an interval of one day can be set by adding `60*60*24`. -!!! note "Note" - The `INTERVAL` syntax or `addDays` function are always preferred. Simple addition or subtraction (syntax like `now() + ...`) doesn't consider time settings. For example, daylight saving time. - +:::note +The `INTERVAL` syntax or `addDays` function are always preferred. Simple addition or subtraction (syntax like `now() + ...`) doesn't consider time settings. For example, daylight saving time. +::: Examples: diff --git a/docs/en/sql-reference/statements/alter/column.md b/docs/en/sql-reference/statements/alter/column.md index 6bb63ea06a6..3d22146a56b 100644 --- a/docs/en/sql-reference/statements/alter/column.md +++ b/docs/en/sql-reference/statements/alter/column.md @@ -1,6 +1,6 @@ --- -toc_priority: 37 -toc_title: COLUMN +sidebar_position: 37 +sidebar_label: COLUMN --- # Column Manipulations {#column-manipulations} @@ -75,8 +75,9 @@ Deletes the column with the name `name`. If the `IF EXISTS` clause is specified, Deletes data from the file system. Since this deletes entire files, the query is completed almost instantly. -!!! warning "Warning" - You can’t delete a column if it is referenced by [materialized view](../../../sql-reference/statements/create/view.md#materialized). Otherwise, it returns an error. +:::warning +You can’t delete a column if it is referenced by [materialized view](../../../sql-reference/statements/create/view.md#materialized). Otherwise, it returns an error. 
+::: Example: diff --git a/docs/en/sql-reference/statements/alter/comment.md b/docs/en/sql-reference/statements/alter/comment.md index 67a17fc8974..af57adcf31c 100644 --- a/docs/en/sql-reference/statements/alter/comment.md +++ b/docs/en/sql-reference/statements/alter/comment.md @@ -1,6 +1,6 @@ --- -toc_priority: 51 -toc_title: COMMENT +sidebar_position: 51 +sidebar_label: COMMENT --- # ALTER TABLE … MODIFY COMMENT {#alter-modify-comment} diff --git a/docs/en/sql-reference/statements/alter/constraint.md b/docs/en/sql-reference/statements/alter/constraint.md index 8f4ce57b905..c9517981ae7 100644 --- a/docs/en/sql-reference/statements/alter/constraint.md +++ b/docs/en/sql-reference/statements/alter/constraint.md @@ -1,6 +1,6 @@ --- -toc_priority: 43 -toc_title: CONSTRAINT +sidebar_position: 43 +sidebar_label: CONSTRAINT --- # Manipulating Constraints {#manipulations-with-constraints} @@ -16,7 +16,8 @@ See more on [constraints](../../../sql-reference/statements/create/table.md#cons Queries will add or remove metadata about constraints from table so they are processed immediately. -!!! warning "Warning" - Constraint check **will not be executed** on existing data if it was added. +:::warning +Constraint check **will not be executed** on existing data if it was added. +::: All changes on replicated tables are broadcasted to ZooKeeper and will be applied on other replicas as well. diff --git a/docs/en/sql-reference/statements/alter/delete.md b/docs/en/sql-reference/statements/alter/delete.md index 6c638c0a3ac..21ae091f9e7 100644 --- a/docs/en/sql-reference/statements/alter/delete.md +++ b/docs/en/sql-reference/statements/alter/delete.md @@ -1,6 +1,6 @@ --- -toc_priority: 39 -toc_title: DELETE +sidebar_position: 39 +sidebar_label: DELETE --- # ALTER TABLE … DELETE Statement {#alter-mutations} @@ -11,8 +11,9 @@ ALTER TABLE [db.]table [ON CLUSTER cluster] DELETE WHERE filter_expr Deletes data matching the specified filtering expression. Implemented as a [mutation](../../../sql-reference/statements/alter/index.md#mutations). -!!! note "Note" - The `ALTER TABLE` prefix makes this syntax different from most other systems supporting SQL. It is intended to signify that unlike similar queries in OLTP databases this is a heavy operation not designed for frequent use. +:::note +The `ALTER TABLE` prefix makes this syntax different from most other systems supporting SQL. It is intended to signify that unlike similar queries in OLTP databases this is a heavy operation not designed for frequent use. +::: The `filter_expr` must be of type `UInt8`. The query deletes rows in the table for which this expression takes a non-zero value. diff --git a/docs/en/sql-reference/statements/alter/index.md b/docs/en/sql-reference/statements/alter/index.md index 0d5909518ed..536da948218 100644 --- a/docs/en/sql-reference/statements/alter/index.md +++ b/docs/en/sql-reference/statements/alter/index.md @@ -1,9 +1,9 @@ --- -toc_priority: 35 -toc_title: ALTER +sidebar_position: 35 +sidebar_label: ALTER --- -## ALTER {#query_language_queries_alter} +# ALTER Most `ALTER TABLE` queries modify table settings or data: @@ -16,8 +16,9 @@ Most `ALTER TABLE` queries modify table settings or data: - [CONSTRAINT](../../../sql-reference/statements/alter/constraint.md) - [TTL](../../../sql-reference/statements/alter/ttl.md) -!!! 
note "Note" - Most `ALTER TABLE` queries are supported only for [\*MergeTree](../../../engines/table-engines/mergetree-family/index.md) tables, as well as [Merge](../../../engines/table-engines/special/merge.md) and [Distributed](../../../engines/table-engines/special/distributed.md). +:::note +Most `ALTER TABLE` queries are supported only for [\*MergeTree](../../../engines/table-engines/mergetree-family/index.md) tables, as well as [Merge](../../../engines/table-engines/special/merge.md) and [Distributed](../../../engines/table-engines/special/distributed.md). +::: These `ALTER` statements manipulate views: @@ -54,7 +55,8 @@ For all `ALTER` queries, you can use the [replication_alter_partitions_sync](../ You can specify how long (in seconds) to wait for inactive replicas to execute all `ALTER` queries with the [replication_wait_for_inactive_replica_timeout](../../../operations/settings/settings.md#replication-wait-for-inactive-replica-timeout) setting. -!!! info "Note" - For all `ALTER` queries, if `replication_alter_partitions_sync = 2` and some replicas are not active for more than the time, specified in the `replication_wait_for_inactive_replica_timeout` setting, then an exception `UNFINISHED` is thrown. +:::note +For all `ALTER` queries, if `replication_alter_partitions_sync = 2` and some replicas are not active for more than the time, specified in the `replication_wait_for_inactive_replica_timeout` setting, then an exception `UNFINISHED` is thrown. +::: For `ALTER TABLE ... UPDATE|DELETE` queries the synchronicity is defined by the [mutations_sync](../../../operations/settings/settings.md#mutations_sync) setting. diff --git a/docs/en/sql-reference/statements/alter/index/index.md b/docs/en/sql-reference/statements/alter/index/index.md index 4e2943d37f3..92f55792a70 100644 --- a/docs/en/sql-reference/statements/alter/index/index.md +++ b/docs/en/sql-reference/statements/alter/index/index.md @@ -1,7 +1,7 @@ --- toc_hidden_folder: true -toc_priority: 42 -toc_title: INDEX +sidebar_position: 42 +sidebar_label: INDEX --- # Manipulating Data Skipping Indices {#manipulations-with-data-skipping-indices} @@ -18,5 +18,6 @@ The first two commands are lightweight in a sense that they only change metadata Also, they are replicated, syncing indices metadata via ZooKeeper. -!!! note "Note" - Index manipulation is supported only for tables with [`*MergeTree`](../../../../engines/table-engines/mergetree-family/mergetree.md) engine (including [replicated](../../../../engines/table-engines/mergetree-family/replication.md) variants). +:::note +Index manipulation is supported only for tables with [`*MergeTree`](../../../../engines/table-engines/mergetree-family/mergetree.md) engine (including [replicated](../../../../engines/table-engines/mergetree-family/replication.md) variants). +::: \ No newline at end of file diff --git a/docs/en/sql-reference/statements/alter/order-by.md b/docs/en/sql-reference/statements/alter/order-by.md index 16f9ace206d..84d29ae8e11 100644 --- a/docs/en/sql-reference/statements/alter/order-by.md +++ b/docs/en/sql-reference/statements/alter/order-by.md @@ -1,6 +1,6 @@ --- -toc_priority: 41 -toc_title: ORDER BY +sidebar_position: 41 +sidebar_label: ORDER BY --- # Manipulating Key Expressions {#manipulations-with-key-expressions} @@ -13,5 +13,6 @@ The command changes the [sorting key](../../../engines/table-engines/mergetree-f The command is lightweight in a sense that it only changes metadata. 
To keep the property that data part rows are ordered by the sorting key expression you cannot add expressions containing existing columns to the sorting key (only columns added by the `ADD COLUMN` command in the same `ALTER` query, without default column value). -!!! note "Note" - It only works for tables in the [`MergeTree`](../../../engines/table-engines/mergetree-family/mergetree.md) family (including [replicated](../../../engines/table-engines/mergetree-family/replication.md) tables). +:::note +It only works for tables in the [`MergeTree`](../../../engines/table-engines/mergetree-family/mergetree.md) family (including [replicated](../../../engines/table-engines/mergetree-family/replication.md) tables). +::: \ No newline at end of file diff --git a/docs/en/sql-reference/statements/alter/partition.md b/docs/en/sql-reference/statements/alter/partition.md index 12737624ecb..453d1bd7bf6 100644 --- a/docs/en/sql-reference/statements/alter/partition.md +++ b/docs/en/sql-reference/statements/alter/partition.md @@ -1,6 +1,6 @@ --- -toc_priority: 38 -toc_title: PARTITION +sidebar_position: 38 +sidebar_label: PARTITION --- # Manipulating Partitions and Parts {#alter_manipulations-with-partitions} @@ -160,8 +160,9 @@ ALTER TABLE table_name FREEZE [PARTITION partition_expr] [WITH NAME 'backup_name This query creates a local backup of a specified partition. If the `PARTITION` clause is omitted, the query creates the backup of all partitions at once. -!!! note "Note" - The entire backup process is performed without stopping the server. +:::note +The entire backup process is performed without stopping the server. +::: Note that for old-styled tables you can specify the prefix of the partition name (for example, `2019`) - then the query creates the backup for all the corresponding partitions. Read about setting the partition expression in a section [How to specify the partition expression](#alter-how-to-specify-part-expr). @@ -171,8 +172,9 @@ At the time of execution, for a data snapshot, the query creates hardlinks to a - `N` is the incremental number of the backup. - if the `WITH NAME` parameter is specified, then the value of the `'backup_name'` parameter is used instead of the incremental number. -!!! note "Note" - If you use [a set of disks for data storage in a table](../../../engines/table-engines/mergetree-family/mergetree.md#table_engine-mergetree-multiple-volumes), the `shadow/N` directory appears on every disk, storing data parts that matched by the `PARTITION` expression. +:::note +If you use [a set of disks for data storage in a table](../../../engines/table-engines/mergetree-family/mergetree.md#table_engine-mergetree-multiple-volumes), the `shadow/N` directory appears on every disk, storing data parts that matched by the `PARTITION` expression. +::: The same structure of directories is created inside the backup as inside `/var/lib/clickhouse/`. The query performs `chmod` for all files, forbidding writing into them. 
diff --git a/docs/en/sql-reference/statements/alter/projection.md b/docs/en/sql-reference/statements/alter/projection.md index c7ebc83c496..5ccf33d2d2f 100644 --- a/docs/en/sql-reference/statements/alter/projection.md +++ b/docs/en/sql-reference/statements/alter/projection.md @@ -1,6 +1,6 @@ --- -toc_priority: 49 -toc_title: PROJECTION +sidebar_position: 49 +sidebar_label: PROJECTION --- # Manipulating Projections {#manipulations-with-projections} @@ -20,5 +20,6 @@ The commands `ADD`, `DROP` and `CLEAR` are lightweight in a sense that they only Also, they are replicated, syncing projections metadata via ZooKeeper. -!!! note "Note" - Projection manipulation is supported only for tables with [`*MergeTree`](../../../engines/table-engines/mergetree-family/mergetree.md) engine (including [replicated](../../../engines/table-engines/mergetree-family/replication.md) variants). +:::note +Projection manipulation is supported only for tables with [`*MergeTree`](../../../engines/table-engines/mergetree-family/mergetree.md) engine (including [replicated](../../../engines/table-engines/mergetree-family/replication.md) variants). +::: \ No newline at end of file diff --git a/docs/en/sql-reference/statements/alter/quota.md b/docs/en/sql-reference/statements/alter/quota.md index 05130a569ab..2398a57502c 100644 --- a/docs/en/sql-reference/statements/alter/quota.md +++ b/docs/en/sql-reference/statements/alter/quota.md @@ -1,6 +1,6 @@ --- -toc_priority: 46 -toc_title: QUOTA +sidebar_position: 46 +sidebar_label: QUOTA --- # ALTER QUOTA {#alter-quota-statement} diff --git a/docs/en/sql-reference/statements/alter/role.md b/docs/en/sql-reference/statements/alter/role.md index ea6d3c61820..d3cb28a1705 100644 --- a/docs/en/sql-reference/statements/alter/role.md +++ b/docs/en/sql-reference/statements/alter/role.md @@ -1,6 +1,6 @@ --- -toc_priority: 46 -toc_title: ROLE +sidebar_position: 46 +sidebar_label: ROLE --- ## ALTER ROLE {#alter-role-statement} diff --git a/docs/en/sql-reference/statements/alter/row-policy.md b/docs/en/sql-reference/statements/alter/row-policy.md index bbf9f317737..47207d29287 100644 --- a/docs/en/sql-reference/statements/alter/row-policy.md +++ b/docs/en/sql-reference/statements/alter/row-policy.md @@ -1,6 +1,6 @@ --- -toc_priority: 47 -toc_title: ROW POLICY +sidebar_position: 47 +sidebar_label: ROW POLICY --- # ALTER ROW POLICY {#alter-row-policy-statement} diff --git a/docs/en/sql-reference/statements/alter/sample-by.md b/docs/en/sql-reference/statements/alter/sample-by.md index 21b20be8b78..08e4fe1066b 100644 --- a/docs/en/sql-reference/statements/alter/sample-by.md +++ b/docs/en/sql-reference/statements/alter/sample-by.md @@ -1,6 +1,6 @@ --- -toc_priority: 41 -toc_title: SAMPLE BY +sidebar_position: 41 +sidebar_label: SAMPLE BY --- # Manipulating Sampling-Key Expressions {#manipulations-with-sampling-key-expressions} @@ -15,5 +15,6 @@ The command changes the [sampling key](../../../engines/table-engines/mergetree- The command is lightweight in the sense that it only changes metadata. The primary key must contain the new sample key. -!!! note "Note" - It only works for tables in the [MergeTree](../../../engines/table-engines/mergetree-family/mergetree.md) family (including [replicated](../../../engines/table-engines/mergetree-family/replication.md) tables). +:::note +It only works for tables in the [MergeTree](../../../engines/table-engines/mergetree-family/mergetree.md) family (including [replicated](../../../engines/table-engines/mergetree-family/replication.md) tables). 
+::: \ No newline at end of file diff --git a/docs/en/sql-reference/statements/alter/setting.md b/docs/en/sql-reference/statements/alter/setting.md index 90747bc1919..bb361e2ee6f 100644 --- a/docs/en/sql-reference/statements/alter/setting.md +++ b/docs/en/sql-reference/statements/alter/setting.md @@ -1,6 +1,6 @@ --- -toc_priority: 38 -toc_title: SETTING +sidebar_position: 38 +sidebar_label: SETTING --- # Table Settings Manipulations {#table_settings_manipulations} @@ -14,9 +14,9 @@ If a setting with the specified name does not exist, then the query raises an ex ALTER TABLE [db].name [ON CLUSTER cluster] MODIFY|RESET SETTING ... ``` -!!! note "Note" - These queries can be applied to [MergeTree](../../../engines/table-engines/mergetree-family/mergetree.md) tables only. - +:::note +These queries can be applied to [MergeTree](../../../engines/table-engines/mergetree-family/mergetree.md) tables only. +::: ## MODIFY SETTING {#alter_modify_setting} diff --git a/docs/en/sql-reference/statements/alter/settings-profile.md b/docs/en/sql-reference/statements/alter/settings-profile.md index 57d12142c48..b1728f21c08 100644 --- a/docs/en/sql-reference/statements/alter/settings-profile.md +++ b/docs/en/sql-reference/statements/alter/settings-profile.md @@ -1,6 +1,6 @@ --- -toc_priority: 48 -toc_title: SETTINGS PROFILE +sidebar_position: 48 +sidebar_label: SETTINGS PROFILE --- ## ALTER SETTINGS PROFILE {#alter-settings-profile-statement} diff --git a/docs/en/sql-reference/statements/alter/ttl.md b/docs/en/sql-reference/statements/alter/ttl.md index 9cd63d3b8fe..f2cf8724197 100644 --- a/docs/en/sql-reference/statements/alter/ttl.md +++ b/docs/en/sql-reference/statements/alter/ttl.md @@ -1,6 +1,6 @@ --- -toc_priority: 44 -toc_title: TTL +sidebar_position: 44 +sidebar_label: TTL --- # Manipulations with Table TTL {#manipulations-with-table-ttl} diff --git a/docs/en/sql-reference/statements/alter/update.md b/docs/en/sql-reference/statements/alter/update.md index 13ea1b2a8db..aeff7cfa1b2 100644 --- a/docs/en/sql-reference/statements/alter/update.md +++ b/docs/en/sql-reference/statements/alter/update.md @@ -1,6 +1,6 @@ --- -toc_priority: 40 -toc_title: UPDATE +sidebar_position: 40 +sidebar_label: UPDATE --- # ALTER TABLE … UPDATE Statements {#alter-table-update-statements} @@ -11,8 +11,9 @@ ALTER TABLE [db.]table UPDATE column1 = expr1 [, ...] WHERE filter_expr Manipulates data matching the specified filtering expression. Implemented as a [mutation](../../../sql-reference/statements/alter/index.md#mutations). -!!! note "Note" - The `ALTER TABLE` prefix makes this syntax different from most other systems supporting SQL. It is intended to signify that unlike similar queries in OLTP databases this is a heavy operation not designed for frequent use. +:::note +The `ALTER TABLE` prefix makes this syntax different from most other systems supporting SQL. It is intended to signify that unlike similar queries in OLTP databases this is a heavy operation not designed for frequent use. +::: The `filter_expr` must be of type `UInt8`. This query updates values of specified columns to the values of corresponding expressions in rows for which the `filter_expr` takes a non-zero value. Values are casted to the column type using the `CAST` operator. Updating columns that are used in the calculation of the primary or the partition key is not supported. 
diff --git a/docs/en/sql-reference/statements/alter/user.md b/docs/en/sql-reference/statements/alter/user.md index 4873982e2a1..f9b90349dab 100644 --- a/docs/en/sql-reference/statements/alter/user.md +++ b/docs/en/sql-reference/statements/alter/user.md @@ -1,6 +1,6 @@ --- -toc_priority: 45 -toc_title: USER +sidebar_position: 45 +sidebar_label: USER --- # ALTER USER {#alter-user-statement} diff --git a/docs/en/sql-reference/statements/alter/view.md b/docs/en/sql-reference/statements/alter/view.md index 0fb1c4be0ff..71e89aaefe8 100644 --- a/docs/en/sql-reference/statements/alter/view.md +++ b/docs/en/sql-reference/statements/alter/view.md @@ -1,6 +1,6 @@ --- -toc_priority: 50 -toc_title: VIEW +sidebar_position: 50 +sidebar_label: VIEW --- # ALTER TABLE … MODIFY QUERY Statement {#alter-modify-query} diff --git a/docs/en/sql-reference/statements/attach.md b/docs/en/sql-reference/statements/attach.md index 2949ac6db38..bc7b2be333f 100644 --- a/docs/en/sql-reference/statements/attach.md +++ b/docs/en/sql-reference/statements/attach.md @@ -1,6 +1,6 @@ --- -toc_priority: 40 -toc_title: ATTACH +sidebar_position: 40 +sidebar_label: ATTACH --- # ATTACH Statement {#attach} diff --git a/docs/en/sql-reference/statements/check-table.md b/docs/en/sql-reference/statements/check-table.md index c9ad40860f7..1164a8b8be6 100644 --- a/docs/en/sql-reference/statements/check-table.md +++ b/docs/en/sql-reference/statements/check-table.md @@ -1,6 +1,6 @@ --- -toc_priority: 41 -toc_title: CHECK +sidebar_position: 41 +sidebar_label: CHECK --- # CHECK TABLE Statement {#check-table} diff --git a/docs/en/sql-reference/statements/create/database.md b/docs/en/sql-reference/statements/create/database.md index 787bbc02346..18ed94bef79 100644 --- a/docs/en/sql-reference/statements/create/database.md +++ b/docs/en/sql-reference/statements/create/database.md @@ -1,6 +1,6 @@ --- -toc_priority: 35 -toc_title: DATABASE +sidebar_position: 35 +sidebar_label: DATABASE --- # CREATE DATABASE {#query-language-create-database} diff --git a/docs/en/sql-reference/statements/create/dictionary.md b/docs/en/sql-reference/statements/create/dictionary.md index 86ab8f977b0..246625cc901 100644 --- a/docs/en/sql-reference/statements/create/dictionary.md +++ b/docs/en/sql-reference/statements/create/dictionary.md @@ -1,6 +1,6 @@ --- -toc_priority: 38 -toc_title: DICTIONARY +sidebar_position: 38 +sidebar_label: DICTIONARY --- # CREATE DICTIONARY {#create-dictionary-query} diff --git a/docs/en/sql-reference/statements/create/function.md b/docs/en/sql-reference/statements/create/function.md index ddfcdfef521..a87d3d70e54 100644 --- a/docs/en/sql-reference/statements/create/function.md +++ b/docs/en/sql-reference/statements/create/function.md @@ -1,6 +1,6 @@ --- -toc_priority: 38 -toc_title: FUNCTION +sidebar_position: 38 +sidebar_label: FUNCTION --- # CREATE FUNCTION {#create-function} diff --git a/docs/en/sql-reference/statements/create/index.md b/docs/en/sql-reference/statements/create/index.md index 3df62869e2b..666a2c66d2f 100644 --- a/docs/en/sql-reference/statements/create/index.md +++ b/docs/en/sql-reference/statements/create/index.md @@ -1,10 +1,9 @@ --- -toc_folder_title: CREATE -toc_priority: 34 -toc_title: Overview +sidebar_position: 34 +sidebar_label: CREATE --- -# CREATE Queries {#create-queries} +# CREATE Queries Create queries make a new entity of one of the following kinds: diff --git a/docs/en/sql-reference/statements/create/quota.md b/docs/en/sql-reference/statements/create/quota.md index 767846ead52..931da165a73 100644 --- 
a/docs/en/sql-reference/statements/create/quota.md +++ b/docs/en/sql-reference/statements/create/quota.md @@ -1,6 +1,6 @@ --- -toc_priority: 42 -toc_title: QUOTA +sidebar_position: 42 +sidebar_label: QUOTA --- # CREATE QUOTA {#create-quota-statement} diff --git a/docs/en/sql-reference/statements/create/role.md b/docs/en/sql-reference/statements/create/role.md index e0e58f7a0f6..5f7db960f27 100644 --- a/docs/en/sql-reference/statements/create/role.md +++ b/docs/en/sql-reference/statements/create/role.md @@ -1,6 +1,6 @@ --- -toc_priority: 40 -toc_title: ROLE +sidebar_position: 40 +sidebar_label: ROLE --- # CREATE ROLE {#create-role-statement} diff --git a/docs/en/sql-reference/statements/create/row-policy.md b/docs/en/sql-reference/statements/create/row-policy.md index 3f88d794619..58b7b1e2cb9 100644 --- a/docs/en/sql-reference/statements/create/row-policy.md +++ b/docs/en/sql-reference/statements/create/row-policy.md @@ -1,14 +1,15 @@ --- -toc_priority: 41 -toc_title: ROW POLICY +sidebar_position: 41 +sidebar_label: ROW POLICY --- # CREATE ROW POLICY {#create-row-policy-statement} Creates a [row policy](../../../operations/access-rights.md#row-policy-management), i.e. a filter used to determine which rows a user can read from a table. -!!! note "Warning" - Row policies makes sense only for users with readonly access. If user can modify table or copy partitions between tables, it defeats the restrictions of row policies. +:::warning +Row policies makes sense only for users with readonly access. If user can modify table or copy partitions between tables, it defeats the restrictions of row policies. +::: Syntax: @@ -30,16 +31,17 @@ In the section `TO` you can provide a list of users and roles this policy should Keyword `ALL` means all the ClickHouse users including current user. Keyword `ALL EXCEPT` allow to exclude some users from the all users list, for example, `CREATE ROW POLICY ... TO ALL EXCEPT accountant, john@localhost` -!!! note "Note" - If there are no row policies defined for a table then any user can `SELECT` all the row from the table. Defining one or more row policies for the table makes the access to the table depending on the row policies no matter if those row policies are defined for the current user or not. For example, the following policy +:::note +If there are no row policies defined for a table then any user can `SELECT` all the row from the table. Defining one or more row policies for the table makes the access to the table depending on the row policies no matter if those row policies are defined for the current user or not. For example, the following policy - `CREATE ROW POLICY pol1 ON mydb.table1 USING b=1 TO mira, peter` +`CREATE ROW POLICY pol1 ON mydb.table1 USING b=1 TO mira, peter` - forbids the users `mira` and `peter` to see the rows with `b != 1`, and any non-mentioned user (e.g., the user `paul`) will see no rows from `mydb.table1` at all. +forbids the users `mira` and `peter` to see the rows with `b != 1`, and any non-mentioned user (e.g., the user `paul`) will see no rows from `mydb.table1` at all. 
- If that's not desirable it can't be fixed by adding one more row policy, like the following: +If that's not desirable it can't be fixed by adding one more row policy, like the following: - `CREATE ROW POLICY pol2 ON mydb.table1 USING 1 TO ALL EXCEPT mira, peter` +`CREATE ROW POLICY pol2 ON mydb.table1 USING 1 TO ALL EXCEPT mira, peter` +::: ## AS Clause {#create-row-policy-as} diff --git a/docs/en/sql-reference/statements/create/settings-profile.md b/docs/en/sql-reference/statements/create/settings-profile.md index 07bb54c9da3..0cc633d9770 100644 --- a/docs/en/sql-reference/statements/create/settings-profile.md +++ b/docs/en/sql-reference/statements/create/settings-profile.md @@ -1,6 +1,6 @@ --- -toc_priority: 43 -toc_title: SETTINGS PROFILE +sidebar_position: 43 +sidebar_label: SETTINGS PROFILE --- # CREATE SETTINGS PROFILE {#create-settings-profile-statement} diff --git a/docs/en/sql-reference/statements/create/table.md b/docs/en/sql-reference/statements/create/table.md index 409ec422ade..82aad344117 100644 --- a/docs/en/sql-reference/statements/create/table.md +++ b/docs/en/sql-reference/statements/create/table.md @@ -1,6 +1,6 @@ --- -toc_priority: 36 -toc_title: TABLE +sidebar_position: 36 +sidebar_label: TABLE --- # CREATE TABLE {#create-table-query} @@ -159,8 +159,9 @@ ENGINE = engine PRIMARY KEY(expr1[, expr2,...]); ``` -!!! warning "Warning" - You can't combine both ways in one query. +:::warning +You can't combine both ways in one query. +::: ## Constraints {#constraints} @@ -214,8 +215,9 @@ ALTER TABLE codec_example MODIFY COLUMN float_value CODEC(Default); Codecs can be combined in a pipeline, for example, `CODEC(Delta, Default)`. -!!! warning "Warning" - You can’t decompress ClickHouse database files with external utilities like `lz4`. Instead, use the special [clickhouse-compressor](https://github.com/ClickHouse/ClickHouse/tree/master/programs/compressor) utility. +:::warning +You can’t decompress ClickHouse database files with external utilities like `lz4`. Instead, use the special [clickhouse-compressor](https://github.com/ClickHouse/ClickHouse/tree/master/programs/compressor) utility. +::: Compression is supported for the following table engines: @@ -271,11 +273,13 @@ Encryption codecs: These codecs use a fixed nonce and encryption is therefore deterministic. This makes it compatible with deduplicating engines such as [ReplicatedMergeTree](../../../engines/table-engines/mergetree-family/replication.md) but has a weakness: when the same data block is encrypted twice, the resulting ciphertext will be exactly the same so an adversary who can read the disk can see this equivalence (although only the equivalence, without getting its content). -!!! attention "Attention" - Most engines including the "*MergeTree" family create index files on disk without applying codecs. This means plaintext will appear on disk if an encrypted column is indexed. +:::warning +Most engines including the "*MergeTree" family create index files on disk without applying codecs. This means plaintext will appear on disk if an encrypted column is indexed. +::: -!!! attention "Attention" - If you perform a SELECT query mentioning a specific value in an encrypted column (such as in its WHERE clause), the value may appear in [system.query_log](../../../operations/system-tables/query_log.md). You may want to disable the logging. 
+:::warning +If you perform a SELECT query mentioning a specific value in an encrypted column (such as in its WHERE clause), the value may appear in [system.query_log](../../../operations/system-tables/query_log.md). You may want to disable the logging. +::: **Example** @@ -287,8 +291,9 @@ CREATE TABLE mytable ENGINE = MergeTree ORDER BY x; ``` -!!!note "Note" - If compression needs to be applied, it must be explicitly specified. Otherwise, only encryption will be applied to data. +:::note +If compression needs to be applied, it must be explicitly specified. Otherwise, only encryption will be applied to data. +::: **Example** @@ -330,8 +335,9 @@ It’s possible to use tables with [ENGINE = Memory](../../../engines/table-engi 'REPLACE' query allows you to update the table atomically. -!!!note "Note" - This query is supported only for [Atomic](../../../engines/database-engines/atomic.md) database engine. +:::note +This query is supported only for [Atomic](../../../engines/database-engines/atomic.md) database engine. +::: If you need to delete some data from a table, you can create a new table and fill it with a `SELECT` statement that does not retrieve unwanted data, then drop the old table and rename the new one: @@ -405,8 +411,9 @@ SELECT * FROM base.t1; You can add a comment to the table when you creating it. -!!!note "Note" - The comment is supported for all table engines except [Kafka](../../../engines/table-engines/integrations/kafka.md), [RabbitMQ](../../../engines/table-engines/integrations/rabbitmq.md) and [EmbeddedRocksDB](../../../engines/table-engines/integrations/embedded-rocksdb.md). +:::note +The comment is supported for all table engines except [Kafka](../../../engines/table-engines/integrations/kafka.md), [RabbitMQ](../../../engines/table-engines/integrations/rabbitmq.md) and [EmbeddedRocksDB](../../../engines/table-engines/integrations/embedded-rocksdb.md). +::: **Syntax** diff --git a/docs/en/sql-reference/statements/create/user.md b/docs/en/sql-reference/statements/create/user.md index 5dfcf891439..0aad0961a8b 100644 --- a/docs/en/sql-reference/statements/create/user.md +++ b/docs/en/sql-reference/statements/create/user.md @@ -1,6 +1,6 @@ --- -toc_priority: 39 -toc_title: USER +sidebar_position: 39 +sidebar_label: USER --- # CREATE USER {#create-user-statement} @@ -52,9 +52,9 @@ Another way of specifying host is to use `@` syntax following the username. Exam - `CREATE USER mira@'localhost'` — Equivalent to the `HOST LOCAL` syntax. - `CREATE USER mira@'192.168.%.%'` — Equivalent to the `HOST LIKE` syntax. -!!! info "Warning" - ClickHouse treats `user_name@'address'` as a username as a whole. Thus, technically you can create multiple users with the same `user_name` and different constructions after `@`. However, we do not recommend to do so. - +:::warning +ClickHouse treats `user_name@'address'` as a username as a whole. Thus, technically you can create multiple users with the same `user_name` and different constructions after `@`. However, we do not recommend to do so. 
+::: ## GRANTEES Clause {#grantees} diff --git a/docs/en/sql-reference/statements/create/view.md b/docs/en/sql-reference/statements/create/view.md index f7d3a6d697a..e31d1b4473f 100644 --- a/docs/en/sql-reference/statements/create/view.md +++ b/docs/en/sql-reference/statements/create/view.md @@ -1,6 +1,6 @@ --- -toc_priority: 37 -toc_title: VIEW +sidebar_position: 37 +sidebar_label: VIEW --- # CREATE VIEW {#create-view} @@ -49,10 +49,11 @@ When creating a materialized view with `TO [db].[table]`, you must not use `POPU A materialized view is implemented as follows: when inserting data to the table specified in `SELECT`, part of the inserted data is converted by this `SELECT` query, and the result is inserted in the view. -!!! important "Important" - Materialized views in ClickHouse use **column names** instead of column order during insertion into destination table. If some column names are not present in the `SELECT` query result, ClickHouse uses a default value, even if the column is not [Nullable](../../data-types/nullable.md). A safe practice would be to add aliases for every column when using Materialized views. +:::note +Materialized views in ClickHouse use **column names** instead of column order during insertion into destination table. If some column names are not present in the `SELECT` query result, ClickHouse uses a default value, even if the column is not [Nullable](../../data-types/nullable.md). A safe practice would be to add aliases for every column when using Materialized views. - Materialized views in ClickHouse are implemented more like insert triggers. If there’s some aggregation in the view query, it’s applied only to the batch of freshly inserted data. Any changes to existing data of source table (like update, delete, drop partition, etc.) does not change the materialized view. +Materialized views in ClickHouse are implemented more like insert triggers. If there’s some aggregation in the view query, it’s applied only to the batch of freshly inserted data. Any changes to existing data of source table (like update, delete, drop partition, etc.) does not change the materialized view. +::: If you specify `POPULATE`, the existing table data is inserted into the view when creating it, as if making a `CREATE TABLE ... AS SELECT ...` . Otherwise, the query contains only the data inserted in the table after creating the view. We **do not recommend** using `POPULATE`, since data inserted in the table during the view creation will not be inserted in it. @@ -68,10 +69,9 @@ To delete a view, use [DROP VIEW](../../../sql-reference/statements/drop.md#drop ## Live View [Experimental] {#live-view} -!!! important "Important" - This is an experimental feature that may change in backwards-incompatible ways in the future releases. - Enable usage of live views and `WATCH` query using [allow_experimental_live_view](../../../operations/settings/settings.md#allow-experimental-live-view) setting. Input the command `set allow_experimental_live_view = 1`. - +:::note +This is an experimental feature that may change in backwards-incompatible ways in the future releases. Enable usage of live views and `WATCH` query using [allow_experimental_live_view](../../../operations/settings/settings.md#allow-experimental-live-view) setting. Input the command `set allow_experimental_live_view = 1`. +::: ```sql CREATE LIVE VIEW [IF NOT EXISTS] [db.]table_name [WITH [TIMEOUT [value_in_sec] [AND]] [REFRESH [value_in_sec]]] AS SELECT ... 
@@ -83,14 +83,15 @@ Live views are triggered by insert into the innermost table specified in the que Live views work similarly to how a query in a distributed table works. But instead of combining partial results from different servers they combine partial result from current data with partial result from the new data. When a live view query includes a subquery then the cached partial result is only stored for the innermost subquery. -!!! info "Limitations" - - [Table function](../../../sql-reference/table-functions/index.md) is not supported as the innermost table. - - Tables that do not have inserts such as a [dictionary](../../../sql-reference/dictionaries/index.md), [system table](../../../operations/system-tables/index.md), a [normal view](#normal), or a [materialized view](#materialized) will not trigger a live view. - - Only queries where one can combine partial result from the old data plus partial result from the new data will work. Live view will not work for queries that require the complete data set to compute the final result or aggregations where the state of the aggregation must be preserved. - - Does not work with replicated or distributed tables where inserts are performed on different nodes. - - Can't be triggered by multiple tables. +:::info +- [Table function](../../../sql-reference/table-functions/index.md) is not supported as the innermost table. +- Tables that do not have inserts such as a [dictionary](../../../sql-reference/dictionaries/index.md), [system table](../../../operations/system-tables/index.md), a [normal view](#normal), or a [materialized view](#materialized) will not trigger a live view. +- Only queries where one can combine partial result from the old data plus partial result from the new data will work. Live view will not work for queries that require the complete data set to compute the final result or aggregations where the state of the aggregation must be preserved. +- Does not work with replicated or distributed tables where inserts are performed on different nodes. +- Can't be triggered by multiple tables. - See [WITH REFRESH](#live-view-with-refresh) to force periodic updates of a live view that in some cases can be used as a workaround. +See [WITH REFRESH](#live-view-with-refresh) to force periodic updates of a live view that in some cases can be used as a workaround. +::: ### Monitoring Live View Changes {#live-view-monitoring} @@ -246,9 +247,9 @@ Most common uses of live view tables include: ## Window View [Experimental] {#window-view} -!!! important "Important" - This is an experimental feature that may change in backwards-incompatible ways in the future releases. - Enable usage of window views and `WATCH` query using [allow_experimental_window_view](../../../operations/settings/settings.md#allow-experimental-window-view) setting. Input the command `set allow_experimental_window_view = 1`. +:::info +This is an experimental feature that may change in backwards-incompatible ways in the future releases. Enable usage of window views and `WATCH` query using [allow_experimental_window_view](../../../operations/settings/settings.md#allow-experimental-window-view) setting. Input the command `set allow_experimental_window_view = 1`. +::: ``` sql CREATE WINDOW VIEW [IF NOT EXISTS] [db.]table_name [TO [db.]table_name] [ENGINE = engine] [WATERMARK = strategy] [ALLOWED_LATENESS = interval_function] AS SELECT ... 
GROUP BY time_window_function diff --git a/docs/en/sql-reference/statements/describe-table.md b/docs/en/sql-reference/statements/describe-table.md index 823a31ed313..7fbe5bd2790 100644 --- a/docs/en/sql-reference/statements/describe-table.md +++ b/docs/en/sql-reference/statements/describe-table.md @@ -1,6 +1,6 @@ --- -toc_priority: 42 -toc_title: DESCRIBE +sidebar_position: 42 +sidebar_label: DESCRIBE --- # DESCRIBE TABLE {#misc-describe-table} diff --git a/docs/en/sql-reference/statements/detach.md b/docs/en/sql-reference/statements/detach.md index b77bcbc00fb..bf20f7b3461 100644 --- a/docs/en/sql-reference/statements/detach.md +++ b/docs/en/sql-reference/statements/detach.md @@ -1,6 +1,6 @@ --- -toc_priority: 43 -toc_title: DETACH +sidebar_position: 43 +sidebar_label: DETACH --- # DETACH Statement {#detach} diff --git a/docs/en/sql-reference/statements/drop.md b/docs/en/sql-reference/statements/drop.md index 552a7b5f1a9..0d3e1f7860d 100644 --- a/docs/en/sql-reference/statements/drop.md +++ b/docs/en/sql-reference/statements/drop.md @@ -1,6 +1,6 @@ --- -toc_priority: 44 -toc_title: DROP +sidebar_position: 44 +sidebar_label: DROP --- # DROP Statements {#drop} diff --git a/docs/en/sql-reference/statements/exchange.md b/docs/en/sql-reference/statements/exchange.md index 91b0c48ddcf..abe3d40950e 100644 --- a/docs/en/sql-reference/statements/exchange.md +++ b/docs/en/sql-reference/statements/exchange.md @@ -1,6 +1,6 @@ --- -toc_priority: 49 -toc_title: EXCHANGE +sidebar_position: 49 +sidebar_label: EXCHANGE --- # EXCHANGE Statement {#exchange} @@ -8,8 +8,9 @@ toc_title: EXCHANGE Exchanges the names of two tables or dictionaries atomically. This task can also be accomplished with a [RENAME](./rename.md) query using a temporary name, but the operation is not atomic in that case. -!!! note "Note" - The `EXCHANGE` query is supported by the [Atomic](../../engines/database-engines/atomic.md) database engine only. +:::note +The `EXCHANGE` query is supported by the [Atomic](../../engines/database-engines/atomic.md) database engine only. +::: **Syntax** diff --git a/docs/en/sql-reference/statements/exists.md b/docs/en/sql-reference/statements/exists.md index b7c4a487791..7c6cc812665 100644 --- a/docs/en/sql-reference/statements/exists.md +++ b/docs/en/sql-reference/statements/exists.md @@ -1,6 +1,6 @@ --- -toc_priority: 45 -toc_title: EXISTS +sidebar_position: 45 +sidebar_label: EXISTS --- # EXISTS Statement {#exists-statement} diff --git a/docs/en/sql-reference/statements/explain.md b/docs/en/sql-reference/statements/explain.md index 9c74c069f02..80f8961a3e9 100644 --- a/docs/en/sql-reference/statements/explain.md +++ b/docs/en/sql-reference/statements/explain.md @@ -1,6 +1,6 @@ --- -toc_priority: 39 -toc_title: EXPLAIN +sidebar_position: 39 +sidebar_label: EXPLAIN --- # EXPLAIN Statement {#explain} @@ -138,8 +138,9 @@ Union ReadFromStorage (SystemNumbers) ``` -!!! note "Note" - Step and query cost estimation is not supported. +:::note +Step and query cost estimation is not supported. +::: When `json = 1`, the query plan is represented in JSON format. Every node is a dictionary that always has the keys `Node Type` and `Plans`. `Node Type` is a string with a step name. `Plans` is an array with child step descriptions. Other optional keys may be added depending on node type and settings. @@ -446,8 +447,8 @@ Result: └─────────────────────────────────────────────────────────┘ ``` -!!! 
note "Note" - The validation is not complete, so a successfull query does not guarantee that the override would - not cause issues. +:::note +The validation is not complete, so a successfull query does not guarantee that the override would not cause issues. +::: [Оriginal article](https://clickhouse.com/docs/en/sql-reference/statements/explain/) diff --git a/docs/en/sql-reference/statements/grant.md b/docs/en/sql-reference/statements/grant.md index 1b2b63ba0e7..1ee330061b5 100644 --- a/docs/en/sql-reference/statements/grant.md +++ b/docs/en/sql-reference/statements/grant.md @@ -1,6 +1,6 @@ --- -toc_priority: 38 -toc_title: GRANT +sidebar_position: 38 +sidebar_label: GRANT --- # GRANT Statement {#grant} diff --git a/docs/en/sql-reference/statements/index.md b/docs/en/sql-reference/statements/index.md index a317e4a47de..ab51cbb330c 100644 --- a/docs/en/sql-reference/statements/index.md +++ b/docs/en/sql-reference/statements/index.md @@ -1,10 +1,9 @@ --- -toc_folder_title: Statements -toc_hidden: true -toc_priority: 31 +sidebar_position: 31 +sidebar_label: Statements --- -# ClickHouse SQL Statements {#clickhouse-sql-statements} +# ClickHouse SQL Statements Statements represent various kinds of action you can perform using SQL queries. Each kind of statement has it’s own syntax and usage details that are described separately: diff --git a/docs/en/sql-reference/statements/insert-into.md b/docs/en/sql-reference/statements/insert-into.md index f8eefad7051..17d6ce1809b 100644 --- a/docs/en/sql-reference/statements/insert-into.md +++ b/docs/en/sql-reference/statements/insert-into.md @@ -1,9 +1,9 @@ --- -toc_priority: 33 -toc_title: INSERT INTO +sidebar_position: 33 +sidebar_label: INSERT INTO --- -## INSERT INTO Statement {#insert} +# INSERT INTO Statement Inserts data into a table. diff --git a/docs/en/sql-reference/statements/kill.md b/docs/en/sql-reference/statements/kill.md index eab6f602c4a..9fe207f24b2 100644 --- a/docs/en/sql-reference/statements/kill.md +++ b/docs/en/sql-reference/statements/kill.md @@ -1,6 +1,6 @@ --- -toc_priority: 46 -toc_title: KILL +sidebar_position: 46 +sidebar_label: KILL --- # KILL Statements {#kill-statements} diff --git a/docs/en/sql-reference/statements/misc.md b/docs/en/sql-reference/statements/misc.md index c553ef37f8d..2751c5296c2 100644 --- a/docs/en/sql-reference/statements/misc.md +++ b/docs/en/sql-reference/statements/misc.md @@ -1,6 +1,6 @@ --- toc_hidden: true -toc_priority: 41 +sidebar_position: 70 --- # Miscellaneous Statements {#miscellaneous-queries} diff --git a/docs/en/sql-reference/statements/optimize.md b/docs/en/sql-reference/statements/optimize.md index 30899cc2940..773284a1b30 100644 --- a/docs/en/sql-reference/statements/optimize.md +++ b/docs/en/sql-reference/statements/optimize.md @@ -1,14 +1,15 @@ --- -toc_priority: 47 -toc_title: OPTIMIZE +sidebar_position: 47 +sidebar_label: OPTIMIZE --- # OPTIMIZE Statement {#misc_operations-optimize} This query tries to initialize an unscheduled merge of data parts for tables. -!!! warning "Warning" - `OPTIMIZE` can’t fix the `Too many parts` error. +:::warning +`OPTIMIZE` can’t fix the `Too many parts` error. +::: **Syntax** @@ -27,16 +28,19 @@ When `OPTIMIZE` is used with the [ReplicatedMergeTree](../../engines/table-engin You can specify how long (in seconds) to wait for inactive replicas to execute `OPTIMIZE` queries by the [replication_wait_for_inactive_replica_timeout](../../operations/settings/settings.md#replication-wait-for-inactive-replica-timeout) setting. -!!! 
info "Note" - If the `replication_alter_partitions_sync` is set to `2` and some replicas are not active for more than the time, specified by the `replication_wait_for_inactive_replica_timeout` setting, then an exception `UNFINISHED` is thrown. +:::note +If the `replication_alter_partitions_sync` is set to `2` and some replicas are not active for more than the time, specified by the `replication_wait_for_inactive_replica_timeout` setting, then an exception `UNFINISHED` is thrown. +::: ## BY expression {#by-expression} If you want to perform deduplication on custom set of columns rather than on all, you can specify list of columns explicitly or use any combination of [`*`](../../sql-reference/statements/select/index.md#asterisk), [`COLUMNS`](../../sql-reference/statements/select/index.md#columns-expression) or [`EXCEPT`](../../sql-reference/statements/select/index.md#except-modifier) expressions. The explictly written or implicitly expanded list of columns must include all columns specified in row ordering expression (both primary and sorting keys) and partitioning expression (partitioning key). -!!! note "Note" - Notice that `*` behaves just like in `SELECT`: [MATERIALIZED](../../sql-reference/statements/create/table.md#materialized) and [ALIAS](../../sql-reference/statements/create/table.md#alias) columns are not used for expansion. - Also, it is an error to specify empty list of columns, or write an expression that results in an empty list of columns, or deduplicate by an `ALIAS` column. +:::note +Notice that `*` behaves just like in `SELECT`: [MATERIALIZED](../../sql-reference/statements/create/table.md#materialized) and [ALIAS](../../sql-reference/statements/create/table.md#alias) columns are not used for expansion. + +Also, it is an error to specify empty list of columns, or write an expression that results in an empty list of columns, or deduplicate by an `ALIAS` column. +::: **Syntax** diff --git a/docs/en/sql-reference/statements/rename.md b/docs/en/sql-reference/statements/rename.md index c2192f1a6e1..b3bea3e3c37 100644 --- a/docs/en/sql-reference/statements/rename.md +++ b/docs/en/sql-reference/statements/rename.md @@ -1,6 +1,6 @@ --- -toc_priority: 48 -toc_title: RENAME +sidebar_position: 48 +sidebar_label: RENAME --- # RENAME Statement {#misc_operations-rename} @@ -8,8 +8,9 @@ toc_title: RENAME Renames databases, tables, or dictionaries. Several entities can be renamed in a single query. Note that the `RENAME` query with several entities is non-atomic operation. To swap entities names atomically, use the [EXCHANGE](./exchange.md) statement. -!!! note "Note" - The `RENAME` query is supported by the [Atomic](../../engines/database-engines/atomic.md) database engine only. +:::note +The `RENAME` query is supported by the [Atomic](../../engines/database-engines/atomic.md) database engine only. 
+::: **Syntax** diff --git a/docs/en/sql-reference/statements/revoke.md b/docs/en/sql-reference/statements/revoke.md index 75005260c4a..4ffa8a21027 100644 --- a/docs/en/sql-reference/statements/revoke.md +++ b/docs/en/sql-reference/statements/revoke.md @@ -1,6 +1,6 @@ --- -toc_priority: 39 -toc_title: REVOKE +sidebar_position: 39 +sidebar_label: REVOKE --- # REVOKE Statement {#revoke} diff --git a/docs/en/sql-reference/statements/select/all.md b/docs/en/sql-reference/statements/select/all.md index ba66f63b447..6b35678fd92 100644 --- a/docs/en/sql-reference/statements/select/all.md +++ b/docs/en/sql-reference/statements/select/all.md @@ -1,5 +1,5 @@ --- -toc_title: ALL +sidebar_label: ALL --- # ALL Clause {#select-all} diff --git a/docs/en/sql-reference/statements/select/array-join.md b/docs/en/sql-reference/statements/select/array-join.md index f138bcc45c7..f7fc08ae9ba 100644 --- a/docs/en/sql-reference/statements/select/array-join.md +++ b/docs/en/sql-reference/statements/select/array-join.md @@ -1,5 +1,5 @@ --- -toc_title: ARRAY JOIN +sidebar_label: ARRAY JOIN --- # ARRAY JOIN Clause {#select-array-join-clause} diff --git a/docs/en/sql-reference/statements/select/distinct.md b/docs/en/sql-reference/statements/select/distinct.md index 390afa46248..898de4730ae 100644 --- a/docs/en/sql-reference/statements/select/distinct.md +++ b/docs/en/sql-reference/statements/select/distinct.md @@ -1,5 +1,5 @@ --- -toc_title: DISTINCT +sidebar_label: DISTINCT --- # DISTINCT Clause {#select-distinct} diff --git a/docs/en/sql-reference/statements/select/except.md b/docs/en/sql-reference/statements/select/except.md index e6d9b365a91..dcaefd67ca9 100644 --- a/docs/en/sql-reference/statements/select/except.md +++ b/docs/en/sql-reference/statements/select/except.md @@ -1,5 +1,5 @@ --- -toc_title: EXCEPT +sidebar_label: EXCEPT --- # EXCEPT Clause {#except-clause} diff --git a/docs/en/sql-reference/statements/select/format.md b/docs/en/sql-reference/statements/select/format.md index c3104bd12fe..a7936509ad5 100644 --- a/docs/en/sql-reference/statements/select/format.md +++ b/docs/en/sql-reference/statements/select/format.md @@ -1,5 +1,5 @@ --- -toc_title: FORMAT +sidebar_label: FORMAT --- # FORMAT Clause {#format-clause} diff --git a/docs/en/sql-reference/statements/select/from.md b/docs/en/sql-reference/statements/select/from.md index df30a0fb0d2..9d5147db13c 100644 --- a/docs/en/sql-reference/statements/select/from.md +++ b/docs/en/sql-reference/statements/select/from.md @@ -1,5 +1,5 @@ --- -toc_title: FROM +sidebar_label: FROM --- # FROM Clause {#select-from} diff --git a/docs/en/sql-reference/statements/select/group-by.md b/docs/en/sql-reference/statements/select/group-by.md index 969a39ce51f..b08647271f1 100644 --- a/docs/en/sql-reference/statements/select/group-by.md +++ b/docs/en/sql-reference/statements/select/group-by.md @@ -1,5 +1,5 @@ --- -toc_title: GROUP BY +sidebar_label: GROUP BY --- # GROUP BY Clause {#select-group-by-clause} @@ -12,8 +12,9 @@ toc_title: GROUP BY When you want to group data in the table by column numbers instead of column names, enable the setting [enable_positional_arguments](../../../operations/settings/settings.md#enable-positional-arguments). -!!! note "Note" - There’s an additional way to run aggregation over a table. If a query contains table columns only inside aggregate functions, the `GROUP BY clause` can be omitted, and aggregation by an empty set of keys is assumed. Such queries always return exactly one row. 
+:::note +There’s an additional way to run aggregation over a table. If a query contains table columns only inside aggregate functions, the `GROUP BY clause` can be omitted, and aggregation by an empty set of keys is assumed. Such queries always return exactly one row. +::: ## NULL Processing {#null-processing} @@ -55,8 +56,9 @@ The subtotals are calculated in the reverse order: at first subtotals are calcul In the subtotals rows the values of already "grouped" key expressions are set to `0` or empty line. -!!! note "Note" - Mind that [HAVING](../../../sql-reference/statements/select/having.md) clause can affect the subtotals results. +:::note +Mind that [HAVING](../../../sql-reference/statements/select/having.md) clause can affect the subtotals results. +::: **Example** @@ -114,8 +116,9 @@ As `GROUP BY` section has three key expressions, the result contains four tables In the subtotals rows the values of all "grouped" key expressions are set to `0` or empty line. -!!! note "Note" - Mind that [HAVING](../../../sql-reference/statements/select/having.md) clause can affect the subtotals results. +:::note +Mind that [HAVING](../../../sql-reference/statements/select/having.md) clause can affect the subtotals results. +::: **Example** @@ -206,8 +209,9 @@ This extra row is only produced in `JSON*`, `TabSeparated*`, and `Pretty*` forma - In `Pretty*` formats, the row is output as a separate table after the main result. - In the other formats it is not available. -!!! note "Note" - totals is output in the results of `SELECT` queries, and is not output in `INSERT INTO ... SELECT`. +:::note +totals is output in the results of `SELECT` queries, and is not output in `INSERT INTO ... SELECT`. +::: `WITH TOTALS` can be run in different ways when [HAVING](../../../sql-reference/statements/select/having.md) is present. The behavior depends on the `totals_mode` setting. diff --git a/docs/en/sql-reference/statements/select/having.md b/docs/en/sql-reference/statements/select/having.md index 93d56097b11..9aee0cf4d63 100644 --- a/docs/en/sql-reference/statements/select/having.md +++ b/docs/en/sql-reference/statements/select/having.md @@ -1,5 +1,5 @@ --- -toc_title: HAVING +sidebar_label: HAVING --- # HAVING Clause {#having-clause} diff --git a/docs/en/sql-reference/statements/select/index.md b/docs/en/sql-reference/statements/select/index.md index 33644133153..50dd8fecf3a 100644 --- a/docs/en/sql-reference/statements/select/index.md +++ b/docs/en/sql-reference/statements/select/index.md @@ -1,11 +1,9 @@ --- -title: SELECT Query -toc_folder_title: SELECT -toc_priority: 32 -toc_title: Overview +sidebar_position: 32 +sidebar_label: SELECT --- -# SELECT Query {#select-queries-syntax} +# SELECT Query `SELECT` queries perform data retrieval. By default, the requested data is returned to the client, while in conjunction with [INSERT INTO](../../../sql-reference/statements/insert-into.md) it can be forwarded to a different table. 
diff --git a/docs/en/sql-reference/statements/select/intersect.md b/docs/en/sql-reference/statements/select/intersect.md index 2243a35e4d8..ef9868daebb 100644 --- a/docs/en/sql-reference/statements/select/intersect.md +++ b/docs/en/sql-reference/statements/select/intersect.md @@ -1,5 +1,5 @@ --- -toc_title: INTERSECT +sidebar_label: INTERSECT --- # INTERSECT Clause {#intersect-clause} diff --git a/docs/en/sql-reference/statements/select/into-outfile.md b/docs/en/sql-reference/statements/select/into-outfile.md index b949b9c83c0..08f53348cd3 100644 --- a/docs/en/sql-reference/statements/select/into-outfile.md +++ b/docs/en/sql-reference/statements/select/into-outfile.md @@ -1,5 +1,5 @@ --- -toc_title: INTO OUTFILE +sidebar_label: INTO OUTFILE --- # INTO OUTFILE Clause {#into-outfile-clause} diff --git a/docs/en/sql-reference/statements/select/join.md b/docs/en/sql-reference/statements/select/join.md index 3d302be561a..0cf58d0b90f 100644 --- a/docs/en/sql-reference/statements/select/join.md +++ b/docs/en/sql-reference/statements/select/join.md @@ -1,5 +1,5 @@ --- -toc_title: JOIN +sidebar_label: JOIN --- # JOIN Clause {#select-join} @@ -36,8 +36,9 @@ Additional join types available in ClickHouse: - `LEFT ANY JOIN`, `RIGHT ANY JOIN` and `INNER ANY JOIN`, partially (for opposite side of `LEFT` and `RIGHT`) or completely (for `INNER` and `FULL`) disables the cartesian product for standard `JOIN` types. - `ASOF JOIN` and `LEFT ASOF JOIN`, joining sequences with a non-exact match. `ASOF JOIN` usage is described below. -!!! note "Note" - When [join_algorithm](../../../operations/settings/settings.md#settings-join_algorithm) is set to `partial_merge`, `RIGHT JOIN` and `FULL JOIN` are supported only with `ALL` strictness (`SEMI`, `ANTI`, `ANY`, and `ASOF` are not supported). +:::note +When [join_algorithm](../../../operations/settings/settings.md#settings-join_algorithm) is set to `partial_merge`, `RIGHT JOIN` and `FULL JOIN` are supported only with `ALL` strictness (`SEMI`, `ANTI`, `ANY`, and `ASOF` are not supported). +::: ## Settings {#join-settings} @@ -63,8 +64,9 @@ Rows are joined if the whole complex condition is met. If the conditions are not The `OR` operator inside the `ON` clause works using the hash join algorithm — for each `OR` argument with join keys for `JOIN`, a separate hash table is created, so memory consumption and query execution time grow linearly with an increase in the number of expressions `OR` of the `ON` clause. -!!! note "Note" - If a condition refers columns from different tables, then only the equality operator (`=`) is supported so far. +:::note +If a condition refers columns from different tables, then only the equality operator (`=`) is supported so far. +::: **Example** @@ -197,8 +199,9 @@ For example, consider the following tables: `ASOF JOIN` can take the timestamp of a user event from `table_1` and find an event in `table_2` where the timestamp is closest to the timestamp of the event from `table_1` corresponding to the closest match condition. Equal timestamp values are the closest if available. Here, the `user_id` column can be used for joining on equality and the `ev_time` column can be used for joining on the closest match. In our example, `event_1_1` can be joined with `event_2_1` and `event_1_2` can be joined with `event_2_3`, but `event_2_2` can’t be joined. -!!! note "Note" - `ASOF` join is **not** supported in the [Join](../../../engines/table-engines/special/join.md) table engine. 
+:::note +`ASOF` join is **not** supported in the [Join](../../../engines/table-engines/special/join.md) table engine. +::: ## Distributed JOIN {#global-join} diff --git a/docs/en/sql-reference/statements/select/limit-by.md b/docs/en/sql-reference/statements/select/limit-by.md index 68b459a46e8..913b7b40338 100644 --- a/docs/en/sql-reference/statements/select/limit-by.md +++ b/docs/en/sql-reference/statements/select/limit-by.md @@ -1,5 +1,5 @@ --- -toc_title: LIMIT BY +sidebar_label: LIMIT BY --- # LIMIT BY Clause {#limit-by-clause} @@ -13,8 +13,9 @@ ClickHouse supports the following syntax variants: During query processing, ClickHouse selects data ordered by sorting key. The sorting key is set explicitly using an [ORDER BY](order-by.md#select-order-by) clause or implicitly as a property of the table engine (row order is only guaranteed when using [ORDER BY](order-by.md#select-order-by), otherwise the row blocks will not be ordered due to multi-threading). Then ClickHouse applies `LIMIT n BY expressions` and returns the first `n` rows for each distinct combination of `expressions`. If `OFFSET` is specified, then for each data block that belongs to a distinct combination of `expressions`, ClickHouse skips `offset_value` number of rows from the beginning of the block and returns a maximum of `n` rows as a result. If `offset_value` is bigger than the number of rows in the data block, ClickHouse returns zero rows from the block. -!!! note "Note" - `LIMIT BY` is not related to [LIMIT](../../../sql-reference/statements/select/limit.md). They can both be used in the same query. +:::note +`LIMIT BY` is not related to [LIMIT](../../../sql-reference/statements/select/limit.md). They can both be used in the same query. +::: If you want to use column numbers instead of column names in the `LIMIT BY` clause, enable the setting [enable_positional_arguments](../../../operations/settings/settings.md#enable-positional-arguments). diff --git a/docs/en/sql-reference/statements/select/limit.md b/docs/en/sql-reference/statements/select/limit.md index 6ed38b2dd64..6b1c90041fe 100644 --- a/docs/en/sql-reference/statements/select/limit.md +++ b/docs/en/sql-reference/statements/select/limit.md @@ -1,5 +1,5 @@ --- -toc_title: LIMIT +sidebar_label: LIMIT --- # LIMIT Clause {#limit-clause} @@ -12,8 +12,9 @@ toc_title: LIMIT If there is no [ORDER BY](../../../sql-reference/statements/select/order-by.md) clause that explicitly sorts results, the choice of rows for the result may be arbitrary and non-deterministic. -!!! note "Note" - The number of rows in the result set can also depend on the [limit](../../../operations/settings/settings.md#limit) setting. +:::note +The number of rows in the result set can also depend on the [limit](../../../operations/settings/settings.md#limit) setting. +::: ## LIMIT … WITH TIES Modifier {#limit-with-ties} diff --git a/docs/en/sql-reference/statements/select/offset.md b/docs/en/sql-reference/statements/select/offset.md index 20ebd972a24..e120845dbc6 100644 --- a/docs/en/sql-reference/statements/select/offset.md +++ b/docs/en/sql-reference/statements/select/offset.md @@ -1,5 +1,5 @@ --- -toc_title: OFFSET +sidebar_label: OFFSET --- # OFFSET FETCH Clause {#offset-fetch} @@ -30,11 +30,13 @@ SELECT * FROM test_fetch ORDER BY a LIMIT 3 OFFSET 1; The `WITH TIES` option is used to return any additional rows that tie for the last place in the result set according to the `ORDER BY` clause. 
For example, if `fetch_row_count` is set to 5 but two additional rows match the values of the `ORDER BY` columns in the fifth row, the result set will contain seven rows. -!!! note "Note" - According to the standard, the `OFFSET` clause must come before the `FETCH` clause if both are present. +:::note +According to the standard, the `OFFSET` clause must come before the `FETCH` clause if both are present. +::: -!!! note "Note" - The real offset can also depend on the [offset](../../../operations/settings/settings.md#offset) setting. +:::note +The real offset can also depend on the [offset](../../../operations/settings/settings.md#offset) setting. +::: ## Examples {#examples} diff --git a/docs/en/sql-reference/statements/select/order-by.md b/docs/en/sql-reference/statements/select/order-by.md index b24f0213e4e..46e483dddf4 100644 --- a/docs/en/sql-reference/statements/select/order-by.md +++ b/docs/en/sql-reference/statements/select/order-by.md @@ -1,5 +1,5 @@ --- -toc_title: ORDER BY +sidebar_label: ORDER BY --- # ORDER BY Clause {#select-order-by} diff --git a/docs/en/sql-reference/statements/select/prewhere.md b/docs/en/sql-reference/statements/select/prewhere.md index 646bb83e692..c3aa2e14384 100644 --- a/docs/en/sql-reference/statements/select/prewhere.md +++ b/docs/en/sql-reference/statements/select/prewhere.md @@ -1,5 +1,5 @@ --- -toc_title: PREWHERE +sidebar_label: PREWHERE --- # PREWHERE Clause {#prewhere-clause} @@ -18,8 +18,9 @@ If the [optimize_move_to_prewhere](../../../operations/settings/settings.md#opti If query has [FINAL](from.md#select-from-final) modifier, the `PREWHERE` optimization is not always correct. It is enabled only if both settings [optimize_move_to_prewhere](../../../operations/settings/settings.md#optimize_move_to_prewhere) and [optimize_move_to_prewhere_if_final](../../../operations/settings/settings.md#optimize_move_to_prewhere_if_final) are turned on. -!!! note "Attention" - The `PREWHERE` section is executed before `FINAL`, so the results of `FROM ... FINAL` queries may be skewed when using `PREWHERE` with fields not in the `ORDER BY` section of a table. +:::note +The `PREWHERE` section is executed before `FINAL`, so the results of `FROM ... FINAL` queries may be skewed when using `PREWHERE` with fields not in the `ORDER BY` section of a table. +::: ## Limitations {#limitations} diff --git a/docs/en/sql-reference/statements/select/sample.md b/docs/en/sql-reference/statements/select/sample.md index a587731e563..3673a49a9e9 100644 --- a/docs/en/sql-reference/statements/select/sample.md +++ b/docs/en/sql-reference/statements/select/sample.md @@ -1,5 +1,5 @@ --- -toc_title: SAMPLE +sidebar_label: SAMPLE --- # SAMPLE Clause {#select-sample-clause} @@ -14,8 +14,9 @@ Approximated query processing can be useful in the following cases: - When your raw data is not accurate, so approximation does not noticeably degrade the quality. - Business requirements target approximate results (for cost-effectiveness, or to market exact results to premium users). -!!! note "Note" - You can only use sampling with the tables in the [MergeTree](../../../engines/table-engines/mergetree-family/mergetree.md) family, and only if the sampling expression was specified during table creation (see [MergeTree engine](../../../engines/table-engines/mergetree-family/mergetree.md#table_engine-mergetree-creating-a-table)). 
+:::note +You can only use sampling with the tables in the [MergeTree](../../../engines/table-engines/mergetree-family/mergetree.md) family, and only if the sampling expression was specified during table creation (see [MergeTree engine](../../../engines/table-engines/mergetree-family/mergetree.md#table_engine-mergetree-creating-a-table)). +::: The features of data sampling are listed below: diff --git a/docs/en/sql-reference/statements/select/union.md b/docs/en/sql-reference/statements/select/union.md index 6dfe554edf0..8a1c7a770c9 100644 --- a/docs/en/sql-reference/statements/select/union.md +++ b/docs/en/sql-reference/statements/select/union.md @@ -1,5 +1,5 @@ --- -toc_title: UNION +sidebar_label: UNION --- # UNION Clause {#union-clause} diff --git a/docs/en/sql-reference/statements/select/where.md b/docs/en/sql-reference/statements/select/where.md index 348b869e2db..c68f9d39d09 100644 --- a/docs/en/sql-reference/statements/select/where.md +++ b/docs/en/sql-reference/statements/select/where.md @@ -1,5 +1,5 @@ --- -toc_title: WHERE +sidebar_label: WHERE --- # WHERE Clause {#select-where} @@ -10,8 +10,9 @@ If there is a `WHERE` clause, it must contain an expression with the `UInt8` typ `WHERE` expression is evaluated on the ability to use indexes and partition pruning, if the underlying table engine supports that. -!!! note "Note" - There is a filtering optimization called [PREWHERE](../../../sql-reference/statements/select/prewhere.md). +:::note +There is a filtering optimization called [PREWHERE](../../../sql-reference/statements/select/prewhere.md). +::: If you need to test a value for [NULL](../../../sql-reference/syntax.md#null-literal), use [IS NULL](../../operators/index.md#operator-is-null) and [IS NOT NULL](../../operators/index.md#is-not-null) operators or [isNull](../../../sql-reference/functions/functions-for-nulls.md#isnull) and [isNotNull](../../../sql-reference/functions/functions-for-nulls.md#isnotnull) functions. Otherwise an expression with `NULL` never passes. diff --git a/docs/en/sql-reference/statements/select/with.md b/docs/en/sql-reference/statements/select/with.md index d6c8da261cb..39fcb752980 100644 --- a/docs/en/sql-reference/statements/select/with.md +++ b/docs/en/sql-reference/statements/select/with.md @@ -1,5 +1,5 @@ --- -toc_title: WITH +sidebar_label: WITH --- # WITH Clause {#with-clause} diff --git a/docs/en/sql-reference/statements/set-role.md b/docs/en/sql-reference/statements/set-role.md index cf14a9c6d75..cac7ca28b92 100644 --- a/docs/en/sql-reference/statements/set-role.md +++ b/docs/en/sql-reference/statements/set-role.md @@ -1,6 +1,6 @@ --- -toc_priority: 51 -toc_title: SET ROLE +sidebar_position: 51 +sidebar_label: SET ROLE --- # SET ROLE Statement {#set-role-statement} diff --git a/docs/en/sql-reference/statements/set.md b/docs/en/sql-reference/statements/set.md index e5de5c41284..d2a1d30c797 100644 --- a/docs/en/sql-reference/statements/set.md +++ b/docs/en/sql-reference/statements/set.md @@ -1,6 +1,6 @@ --- -toc_priority: 50 -toc_title: SET +sidebar_position: 50 +sidebar_label: SET --- # SET Statement {#query-set} diff --git a/docs/en/sql-reference/statements/show.md b/docs/en/sql-reference/statements/show.md index 96cbee0b04d..75c5c121946 100644 --- a/docs/en/sql-reference/statements/show.md +++ b/docs/en/sql-reference/statements/show.md @@ -1,6 +1,6 @@ --- -toc_priority: 37 -toc_title: SHOW +sidebar_position: 37 +sidebar_label: SHOW --- # SHOW Statements {#show-queries} @@ -361,8 +361,9 @@ SHOW ACCESS Returns a list of clusters. 
All available clusters are listed in the [system.clusters](../../operations/system-tables/clusters.md) table. -!!! info "Note" - `SHOW CLUSTER name` query displays the contents of system.clusters table for this cluster. +:::note +`SHOW CLUSTER name` query displays the contents of system.clusters table for this cluster. +::: ### Syntax {#show-cluster-syntax} diff --git a/docs/en/sql-reference/statements/system.md b/docs/en/sql-reference/statements/system.md index b71853f29dd..14eed981381 100644 --- a/docs/en/sql-reference/statements/system.md +++ b/docs/en/sql-reference/statements/system.md @@ -1,6 +1,6 @@ --- -toc_priority: 36 -toc_title: SYSTEM +sidebar_position: 36 +sidebar_label: SYSTEM --- # SYSTEM Statements {#query-language-system} @@ -191,8 +191,9 @@ Provides possibility to stop background merges for tables in the MergeTree famil SYSTEM STOP MERGES [ON VOLUME | [db.]merge_tree_family_table_name] ``` -!!! note "Note" - `DETACH / ATTACH` table will start background merges for the table even in case when merges have been stopped for all MergeTree tables before. +:::note +`DETACH / ATTACH` table will start background merges for the table even in case when merges have been stopped for all MergeTree tables before. +::: ### START MERGES {#query_language-system-start-merges} @@ -326,8 +327,9 @@ One may execute query after: Replica attaches locally found parts and sends info about them to Zookeeper. Parts present on a replica before metadata loss are not re-fetched from other ones if not being outdated (so replica restoration does not mean re-downloading all data over the network). -!!! warning "Warning" - Parts in all states are moved to `detached/` folder. Parts active before data loss (committed) are attached. +:::warning +Parts in all states are moved to `detached/` folder. Parts active before data loss (committed) are attached. +::: **Syntax** diff --git a/docs/en/sql-reference/statements/truncate.md b/docs/en/sql-reference/statements/truncate.md index b5354196fa4..393ba82b3cd 100644 --- a/docs/en/sql-reference/statements/truncate.md +++ b/docs/en/sql-reference/statements/truncate.md @@ -1,6 +1,6 @@ --- -toc_priority: 52 -toc_title: TRUNCATE +sidebar_position: 52 +sidebar_label: TRUNCATE --- # TRUNCATE Statement {#truncate-statement} @@ -17,5 +17,6 @@ You can use the [replication_alter_partitions_sync](../../operations/settings/se You can specify how long (in seconds) to wait for inactive replicas to execute `TRUNCATE` queries with the [replication_wait_for_inactive_replica_timeout](../../operations/settings/settings.md#replication-wait-for-inactive-replica-timeout) setting. -!!! info "Note" - If the `replication_alter_partitions_sync` is set to `2` and some replicas are not active for more than the time, specified by the `replication_wait_for_inactive_replica_timeout` setting, then an exception `UNFINISHED` is thrown. +:::note +If the `replication_alter_partitions_sync` is set to `2` and some replicas are not active for more than the time, specified by the `replication_wait_for_inactive_replica_timeout` setting, then an exception `UNFINISHED` is thrown. 
+::: \ No newline at end of file diff --git a/docs/en/sql-reference/statements/use.md b/docs/en/sql-reference/statements/use.md index 841c23d333d..869bf44fdeb 100644 --- a/docs/en/sql-reference/statements/use.md +++ b/docs/en/sql-reference/statements/use.md @@ -1,6 +1,6 @@ --- -toc_priority: 53 -toc_title: USE +sidebar_position: 53 +sidebar_label: USE --- # USE Statement {#use} diff --git a/docs/en/sql-reference/statements/watch.md b/docs/en/sql-reference/statements/watch.md index be793d30f3d..688cf21e23c 100644 --- a/docs/en/sql-reference/statements/watch.md +++ b/docs/en/sql-reference/statements/watch.md @@ -1,14 +1,13 @@ --- -toc_priority: 53 -toc_title: WATCH +sidebar_position: 53 +sidebar_label: WATCH --- # WATCH Statement (Experimental) {#watch} -!!! important "Important" - This is an experimental feature that may change in backwards-incompatible ways in the future releases. - Enable live views and `WATCH` query using `set allow_experimental_live_view = 1`. - +:::warning +This is an experimental feature that may change in backwards-incompatible ways in the future releases. Enable live views and `WATCH` query using `set allow_experimental_live_view = 1`. +::: ``` sql WATCH [db.]live_view @@ -105,5 +104,6 @@ WATCH lv EVENTS LIMIT 1; The `FORMAT` clause works the same way as for the [SELECT](../../sql-reference/statements/select/format.md#format-clause). -!!! info "Note" - The [JSONEachRowWithProgress](../../interfaces/formats.md#jsoneachrowwithprogress) format should be used when watching [LIVE VIEW](./create/view.md#live-view) tables over the HTTP interface. The progress messages will be added to the output to keep the long-lived HTTP connection alive until the query result changes. The interval between progress messages is controlled using the [live_view_heartbeat_interval](./create/view.md#live-view-settings) setting. +:::note +The [JSONEachRowWithProgress](../../interfaces/formats.md#jsoneachrowwithprogress) format should be used when watching [LIVE VIEW](./create/view.md#live-view) tables over the HTTP interface. The progress messages will be added to the output to keep the long-lived HTTP connection alive until the query result changes. The interval between progress messages is controlled using the [live_view_heartbeat_interval](./create/view.md#live-view-settings) setting. +::: \ No newline at end of file diff --git a/docs/en/sql-reference/syntax.md b/docs/en/sql-reference/syntax.md index 19efef3dc6a..10664549329 100644 --- a/docs/en/sql-reference/syntax.md +++ b/docs/en/sql-reference/syntax.md @@ -1,6 +1,6 @@ --- -toc_priority: 31 -toc_title: Syntax +sidebar_position: 31 +sidebar_label: Syntax --- # Syntax {#syntax} diff --git a/docs/en/sql-reference/table-functions/cluster.md b/docs/en/sql-reference/table-functions/cluster.md index a02c2a10fb7..5954ed1b439 100644 --- a/docs/en/sql-reference/table-functions/cluster.md +++ b/docs/en/sql-reference/table-functions/cluster.md @@ -1,6 +1,6 @@ --- -toc_priority: 50 -toc_title: cluster +sidebar_position: 50 +sidebar_label: cluster --- # cluster, clusterAllReplicas {#cluster-clusterallreplicas} @@ -9,8 +9,9 @@ Allows to access all shards in an existing cluster which configured in `remote_s `clusterAllReplicas` function — same as `cluster`, but all replicas are queried. Each replica in a cluster is used as a separate shard/connection. -!!! note "Note" - All available clusters are listed in the [system.clusters](../../operations/system-tables/clusters.md) table. 
+:::note +All available clusters are listed in the [system.clusters](../../operations/system-tables/clusters.md) table. +::: **Syntax** diff --git a/docs/en/sql-reference/table-functions/dictionary.md b/docs/en/sql-reference/table-functions/dictionary.md index ad30cb30adf..f04a4b6eb24 100644 --- a/docs/en/sql-reference/table-functions/dictionary.md +++ b/docs/en/sql-reference/table-functions/dictionary.md @@ -1,6 +1,6 @@ --- -toc_priority: 54 -toc_title: dictionary function +sidebar_position: 54 +sidebar_label: dictionary function --- # dictionary {#dictionary-function} diff --git a/docs/en/sql-reference/table-functions/file.md b/docs/en/sql-reference/table-functions/file.md index f7c2a9e6d5b..4b72b0d84f5 100644 --- a/docs/en/sql-reference/table-functions/file.md +++ b/docs/en/sql-reference/table-functions/file.md @@ -1,6 +1,6 @@ --- -toc_priority: 37 -toc_title: file +sidebar_position: 37 +sidebar_label: file --- # file {#file} @@ -106,8 +106,9 @@ Query the number of rows in all files of these two directories: SELECT count(*) FROM file('{some,another}_dir/*', 'TSV', 'name String, value UInt32'); ``` -!!! warning "Warning" - If your listing of files contains number ranges with leading zeros, use the construction with braces for each digit separately or use `?`. +:::warning +If your listing of files contains number ranges with leading zeros, use the construction with braces for each digit separately or use `?`. +::: **Example** diff --git a/docs/en/sql-reference/table-functions/generate.md b/docs/en/sql-reference/table-functions/generate.md index ae22e1a1b88..bb9ad3f7551 100644 --- a/docs/en/sql-reference/table-functions/generate.md +++ b/docs/en/sql-reference/table-functions/generate.md @@ -1,6 +1,6 @@ --- -toc_priority: 47 -toc_title: generateRandom +sidebar_position: 47 +sidebar_label: generateRandom --- # generateRandom {#generaterandom} diff --git a/docs/en/sql-reference/table-functions/hdfs.md b/docs/en/sql-reference/table-functions/hdfs.md index a7c3baca299..7f7dc53d27e 100644 --- a/docs/en/sql-reference/table-functions/hdfs.md +++ b/docs/en/sql-reference/table-functions/hdfs.md @@ -1,6 +1,6 @@ --- -toc_priority: 45 -toc_title: hdfs +sidebar_position: 45 +sidebar_label: hdfs --- # hdfs {#hdfs} @@ -78,8 +78,9 @@ SELECT count(*) FROM hdfs('hdfs://hdfs1:9000/{some,another}_dir/*', 'TSV', 'name String, value UInt32') ``` -!!! warning "Warning" - If your listing of files contains number ranges with leading zeros, use the construction with braces for each digit separately or use `?`. +:::warning +If your listing of files contains number ranges with leading zeros, use the construction with braces for each digit separately or use `?`. +::: **Example** diff --git a/docs/en/sql-reference/table-functions/hdfsCluster.md b/docs/en/sql-reference/table-functions/hdfsCluster.md index 6183fe83c38..b46b8e64a1a 100644 --- a/docs/en/sql-reference/table-functions/hdfsCluster.md +++ b/docs/en/sql-reference/table-functions/hdfsCluster.md @@ -1,6 +1,6 @@ --- -toc_priority: 55 -toc_title: hdfsCluster +sidebar_position: 55 +sidebar_label: hdfsCluster --- # hdfsCluster Table Function {#hdfsCluster-table-function} @@ -49,8 +49,9 @@ SELECT count(*) FROM hdfsCluster('cluster_simple', 'hdfs://hdfs1:9000/{some,another}_dir/*', 'TSV', 'name String, value UInt32') ``` -!!! warning "Warning" - If your listing of files contains number ranges with leading zeros, use the construction with braces for each digit separately or use `?`. 
+:::warning +If your listing of files contains number ranges with leading zeros, use the construction with braces for each digit separately or use `?`. +::: **See Also** diff --git a/docs/en/sql-reference/table-functions/index.md b/docs/en/sql-reference/table-functions/index.md index 24d67e31fa8..a51312324f0 100644 --- a/docs/en/sql-reference/table-functions/index.md +++ b/docs/en/sql-reference/table-functions/index.md @@ -1,10 +1,9 @@ --- -toc_folder_title: Table Functions -toc_priority: 34 -toc_title: Introduction +sidebar_label: Table Functions +sidebar_position: 34 --- -# Table Functions {#table-functions} +# Table Functions Table functions are methods for constructing tables. @@ -20,8 +19,9 @@ You can use table functions in: - [INSERT INTO TABLE FUNCTION](../../sql-reference/statements/insert-into.md#inserting-into-table-function) query. -!!! warning "Warning" - You can’t use table functions if the [allow_ddl](../../operations/settings/permissions-for-queries.md#settings_allow_ddl) setting is disabled. +:::warning +You can’t use table functions if the [allow_ddl](../../operations/settings/permissions-for-queries.md#settings_allow_ddl) setting is disabled. +::: | Function | Description | |------------------------------------------------------------------|----------------------------------------------------------------------------------------------------------------------------------------| diff --git a/docs/en/sql-reference/table-functions/input.md b/docs/en/sql-reference/table-functions/input.md index 17707b798d6..916abb890ff 100644 --- a/docs/en/sql-reference/table-functions/input.md +++ b/docs/en/sql-reference/table-functions/input.md @@ -1,6 +1,6 @@ --- -toc_priority: 46 -toc_title: input +sidebar_position: 46 +sidebar_label: input --- # input {#input} diff --git a/docs/en/sql-reference/table-functions/jdbc.md b/docs/en/sql-reference/table-functions/jdbc.md index 9fe1333fc94..57128f7d146 100644 --- a/docs/en/sql-reference/table-functions/jdbc.md +++ b/docs/en/sql-reference/table-functions/jdbc.md @@ -1,6 +1,6 @@ --- -toc_priority: 43 -toc_title: jdbc +sidebar_position: 43 +sidebar_label: jdbc --- # jdbc {#table-function-jdbc} diff --git a/docs/en/sql-reference/table-functions/merge.md b/docs/en/sql-reference/table-functions/merge.md index c89f0f4cc5a..301f0a69caf 100644 --- a/docs/en/sql-reference/table-functions/merge.md +++ b/docs/en/sql-reference/table-functions/merge.md @@ -1,6 +1,6 @@ --- -toc_priority: 38 -toc_title: merge +sidebar_position: 38 +sidebar_label: merge --- # merge {#merge} diff --git a/docs/en/sql-reference/table-functions/mysql.md b/docs/en/sql-reference/table-functions/mysql.md index b45ab86f60f..c6983d8fba1 100644 --- a/docs/en/sql-reference/table-functions/mysql.md +++ b/docs/en/sql-reference/table-functions/mysql.md @@ -1,6 +1,6 @@ --- -toc_priority: 42 -toc_title: mysql +sidebar_position: 42 +sidebar_label: mysql --- # mysql {#mysql} @@ -55,8 +55,9 @@ SELECT name FROM mysql(`mysql1:3306|mysql2:3306|mysql3:3306`, 'mysql_database', A table object with the same columns as the original MySQL table. -!!! info "Note" - In the `INSERT` query to distinguish table function `mysql(...)` from table name with column names list, you must use keywords `FUNCTION` or `TABLE FUNCTION`. See examples below. +:::note +In the `INSERT` query to distinguish table function `mysql(...)` from table name with column names list, you must use keywords `FUNCTION` or `TABLE FUNCTION`. See examples below. 
+::: **Examples** diff --git a/docs/en/sql-reference/table-functions/null.md b/docs/en/sql-reference/table-functions/null.md index 4a8d221d620..48df12bfece 100644 --- a/docs/en/sql-reference/table-functions/null.md +++ b/docs/en/sql-reference/table-functions/null.md @@ -1,6 +1,6 @@ --- -toc_priority: 53 -toc_title: null function +sidebar_position: 53 +sidebar_label: null function --- # null {#null-function} diff --git a/docs/en/sql-reference/table-functions/numbers.md b/docs/en/sql-reference/table-functions/numbers.md index f9735056b05..c15c47cf725 100644 --- a/docs/en/sql-reference/table-functions/numbers.md +++ b/docs/en/sql-reference/table-functions/numbers.md @@ -1,6 +1,6 @@ --- -toc_priority: 39 -toc_title: numbers +sidebar_position: 39 +sidebar_label: numbers --- # numbers {#numbers} diff --git a/docs/en/sql-reference/table-functions/odbc.md b/docs/en/sql-reference/table-functions/odbc.md index a8481fbfd68..d2614337cdd 100644 --- a/docs/en/sql-reference/table-functions/odbc.md +++ b/docs/en/sql-reference/table-functions/odbc.md @@ -1,6 +1,6 @@ --- -toc_priority: 44 -toc_title: odbc +sidebar_position: 44 +sidebar_label: odbc --- # odbc {#table-functions-odbc} diff --git a/docs/en/sql-reference/table-functions/postgresql.md b/docs/en/sql-reference/table-functions/postgresql.md index b2bdc2495e5..6a30b1f3f0c 100644 --- a/docs/en/sql-reference/table-functions/postgresql.md +++ b/docs/en/sql-reference/table-functions/postgresql.md @@ -1,6 +1,6 @@ --- -toc_priority: 42 -toc_title: postgresql +sidebar_position: 42 +sidebar_label: postgresql --- # postgresql {#postgresql} @@ -26,8 +26,9 @@ postgresql('host:port', 'database', 'table', 'user', 'password'[, `schema`]) A table object with the same columns as the original PostgreSQL table. -!!! info "Note" - In the `INSERT` query to distinguish table function `postgresql(...)` from table name with column names list you must use keywords `FUNCTION` or `TABLE FUNCTION`. See examples below. +:::note +In the `INSERT` query to distinguish table function `postgresql(...)` from table name with column names list you must use keywords `FUNCTION` or `TABLE FUNCTION`. See examples below. +::: ## Implementation Details {#implementation-details} @@ -41,8 +42,9 @@ All joins, aggregations, sorting, `IN [ array ]` conditions and the `LIMIT` samp PostgreSQL Array types converts into ClickHouse arrays. -!!! info "Note" - Be careful, in PostgreSQL an array data type column like Integer[] may contain arrays of different dimensions in different rows, but in ClickHouse it is only allowed to have multidimensional arrays of the same dimension in all rows. +:::note +Be careful, in PostgreSQL an array data type column like Integer[] may contain arrays of different dimensions in different rows, but in ClickHouse it is only allowed to have multidimensional arrays of the same dimension in all rows. +::: Supports multiple replicas that must be listed by `|`. 
For example: diff --git a/docs/en/sql-reference/table-functions/remote.md b/docs/en/sql-reference/table-functions/remote.md index 9effbb03553..0eae00564ba 100644 --- a/docs/en/sql-reference/table-functions/remote.md +++ b/docs/en/sql-reference/table-functions/remote.md @@ -1,6 +1,6 @@ --- -toc_priority: 40 -toc_title: remote +sidebar_position: 40 +sidebar_label: remote --- # remote, remoteSecure {#remote-remotesecure} diff --git a/docs/en/sql-reference/table-functions/s3.md b/docs/en/sql-reference/table-functions/s3.md index 7dffd252dc9..61dda209ee6 100644 --- a/docs/en/sql-reference/table-functions/s3.md +++ b/docs/en/sql-reference/table-functions/s3.md @@ -1,6 +1,6 @@ --- -toc_priority: 45 -toc_title: s3 +sidebar_position: 45 +sidebar_label: s3 --- # s3 Table Function {#s3-table-function} @@ -95,8 +95,9 @@ FROM s3('https://clickhouse-public-datasets.s3.amazonaws.com/my-test-bucket-768/ └─────────┘ ``` -!!! warning "Warning" - If your listing of files contains number ranges with leading zeros, use the construction with braces for each digit separately or use `?`. +:::warning +If your listing of files contains number ranges with leading zeros, use the construction with braces for each digit separately or use `?`. +::: Count the total amount of rows in files named `file-000.csv`, `file-001.csv`, … , `file-999.csv`: diff --git a/docs/en/sql-reference/table-functions/s3Cluster.md b/docs/en/sql-reference/table-functions/s3Cluster.md index 65565aa92cb..dbd3538c692 100644 --- a/docs/en/sql-reference/table-functions/s3Cluster.md +++ b/docs/en/sql-reference/table-functions/s3Cluster.md @@ -1,6 +1,6 @@ --- -toc_priority: 55 -toc_title: s3Cluster +sidebar_position: 55 +sidebar_label: s3Cluster --- # s3Cluster Table Function {#s3Cluster-table-function} @@ -39,8 +39,9 @@ Count the total amount of rows in all files in the cluster `cluster_simple`: SELECT count(*) FROM s3Cluster('cluster_simple', 'http://minio1:9001/root/data/{clickhouse,database}/*', 'minio', 'minio123', 'CSV', 'name String, value UInt32, polygon Array(Array(Tuple(Float64, Float64)))'); ``` -!!! warning "Warning" - If your listing of files contains number ranges with leading zeros, use the construction with braces for each digit separately or use `?`. +:::warning +If your listing of files contains number ranges with leading zeros, use the construction with braces for each digit separately or use `?`. 
+::: **See Also** diff --git a/docs/en/sql-reference/table-functions/sqlite.md b/docs/en/sql-reference/table-functions/sqlite.md index be7bd92d7e7..6058843ae61 100644 --- a/docs/en/sql-reference/table-functions/sqlite.md +++ b/docs/en/sql-reference/table-functions/sqlite.md @@ -1,6 +1,6 @@ --- -toc_priority: 55 -toc_title: sqlite +sidebar_position: 55 +sidebar_label: sqlite --- ## sqlite {#sqlite} diff --git a/docs/en/sql-reference/table-functions/url.md b/docs/en/sql-reference/table-functions/url.md index bfad7a67e0d..3f2f9c6a710 100644 --- a/docs/en/sql-reference/table-functions/url.md +++ b/docs/en/sql-reference/table-functions/url.md @@ -1,6 +1,6 @@ --- -toc_priority: 41 -toc_title: url +sidebar_position: 41 +sidebar_label: url --- # url {#url} diff --git a/docs/en/sql-reference/table-functions/view.md b/docs/en/sql-reference/table-functions/view.md index f78120c370e..727cc04e5a2 100644 --- a/docs/en/sql-reference/table-functions/view.md +++ b/docs/en/sql-reference/table-functions/view.md @@ -1,6 +1,6 @@ --- -toc_priority: 51 -toc_title: view +sidebar_position: 51 +sidebar_label: view --- ## view {#view} diff --git a/docs/en/sql-reference/window-functions/index.md b/docs/en/sql-reference/window-functions/index.md index 0a55eafc7ab..e9a15995a16 100644 --- a/docs/en/sql-reference/window-functions/index.md +++ b/docs/en/sql-reference/window-functions/index.md @@ -1,6 +1,6 @@ --- -toc_priority: 62 -toc_title: Window Functions +sidebar_position: 62 +sidebar_label: Window Functions --- # Window Functions From 371cdc956aa6d3710c5894ff0f128d9d7554fff1 Mon Sep 17 00:00:00 2001 From: Maksim Kita Date: Wed, 30 Mar 2022 12:54:19 +0200 Subject: [PATCH 065/239] Added input format settings for parsing invalid IPv4, IPv6 addresses as default values --- src/Core/Settings.h | 2 + .../Serializations/SerializationIP.cpp | 14 +++- src/Formats/FormatFactory.cpp | 2 + src/Formats/FormatSettings.h | 3 + .../02244_ip_address_invalid_insert.reference | 10 +++ .../02244_ip_address_invalid_insert.sql | 81 +++++++++++++++++++ .../02244_ipv6_invalid_insert.reference | 4 - .../0_stateless/02244_ipv6_invalid_insert.sql | 11 --- 8 files changed, 108 insertions(+), 19 deletions(-) create mode 100644 tests/queries/0_stateless/02244_ip_address_invalid_insert.reference create mode 100644 tests/queries/0_stateless/02244_ip_address_invalid_insert.sql delete mode 100644 tests/queries/0_stateless/02244_ipv6_invalid_insert.reference delete mode 100644 tests/queries/0_stateless/02244_ipv6_invalid_insert.sql diff --git a/src/Core/Settings.h b/src/Core/Settings.h index f81b61ea648..14d7f34b057 100644 --- a/src/Core/Settings.h +++ b/src/Core/Settings.h @@ -637,6 +637,8 @@ class IColumn; M(DateTimeInputFormat, date_time_input_format, FormatSettings::DateTimeInputFormat::Basic, "Method to read DateTime from text input formats. Possible values: 'basic', 'best_effort' and 'best_effort_us'.", 0) \ M(DateTimeOutputFormat, date_time_output_format, FormatSettings::DateTimeOutputFormat::Simple, "Method to write DateTime to text output. 
Possible values: 'simple', 'iso', 'unix_timestamp'.", 0) \ \ + M(Bool, input_format_ipv4_default_on_conversion_error, false, "Deserialization of IPv4 will use default values instead of throwing exception on conversion error.", 0) \ + M(Bool, input_format_ipv6_default_on_conversion_error, false, "Deserialization of IPV6 will use default values instead of throwing exception on conversion error.", 0) \ M(String, bool_true_representation, "true", "Text to represent bool value in TSV/CSV formats.", 0) \ M(String, bool_false_representation, "false", "Text to represent bool value in TSV/CSV formats.", 0) \ \ diff --git a/src/DataTypes/Serializations/SerializationIP.cpp b/src/DataTypes/Serializations/SerializationIP.cpp index 916edced0a3..ed0e9d54415 100644 --- a/src/DataTypes/Serializations/SerializationIP.cpp +++ b/src/DataTypes/Serializations/SerializationIP.cpp @@ -6,6 +6,8 @@ #include #include #include +#include + namespace DB { @@ -47,9 +49,11 @@ void SerializationIPv4::deserializeText(IColumn & column, ReadBuffer & istr, con char buffer[IPV4_MAX_TEXT_LENGTH + 1] = {'\0'}; istr.read(buffer, sizeof(buffer) - 1); UInt32 ipv4_value = 0; - if (!parseIPv4(buffer, reinterpret_cast(&ipv4_value))) + + bool parse_result = parseIPv4(buffer, reinterpret_cast(&ipv4_value)); + if (!parse_result && !settings.input_format_ipv4_default_on_conversion_error) { - throw Exception("Invalid IPv4 value.", ErrorCodes::CANNOT_PARSE_DOMAIN_VALUE_FROM_STRING); + throw Exception("Invalid IPv4 value", ErrorCodes::CANNOT_PARSE_DOMAIN_VALUE_FROM_STRING); } col->insert(ipv4_value); @@ -89,9 +93,11 @@ void SerializationIPv6::deserializeText(IColumn & column, ReadBuffer & istr, con istr.read(buffer, sizeof(buffer) - 1); std::string ipv6_value(IPV6_BINARY_LENGTH, '\0'); - if (!parseIPv6(buffer, reinterpret_cast(ipv6_value.data()))) + + bool parse_result = parseIPv6(buffer, reinterpret_cast(ipv6_value.data())); + if (!parse_result && !settings.input_format_ipv6_default_on_conversion_error) { - throw Exception("Invalid IPv6 value.", ErrorCodes::CANNOT_PARSE_DOMAIN_VALUE_FROM_STRING); + throw Exception("Invalid IPv6 value", ErrorCodes::CANNOT_PARSE_DOMAIN_VALUE_FROM_STRING); } col->insertString(ipv6_value); diff --git a/src/Formats/FormatFactory.cpp b/src/Formats/FormatFactory.cpp index 3aa82cb79b4..f9c834cb3de 100644 --- a/src/Formats/FormatFactory.cpp +++ b/src/Formats/FormatFactory.cpp @@ -77,6 +77,8 @@ FormatSettings getFormatSettings(ContextPtr context, const Settings & settings) format_settings.custom.row_between_delimiter = settings.format_custom_row_between_delimiter; format_settings.date_time_input_format = settings.date_time_input_format; format_settings.date_time_output_format = settings.date_time_output_format; + format_settings.input_format_ipv4_default_on_conversion_error = settings.input_format_ipv4_default_on_conversion_error; + format_settings.input_format_ipv6_default_on_conversion_error = settings.input_format_ipv6_default_on_conversion_error; format_settings.bool_true_representation = settings.bool_true_representation; format_settings.bool_false_representation = settings.bool_false_representation; format_settings.enable_streaming = settings.output_format_enable_streaming; diff --git a/src/Formats/FormatSettings.h b/src/Formats/FormatSettings.h index bd0a84d9ded..ea9fcc2658a 100644 --- a/src/Formats/FormatSettings.h +++ b/src/Formats/FormatSettings.h @@ -65,6 +65,9 @@ struct FormatSettings DateTimeOutputFormat date_time_output_format = DateTimeOutputFormat::Simple; + bool 
input_format_ipv4_default_on_conversion_error = false; + bool input_format_ipv6_default_on_conversion_error = false; + UInt64 input_allow_errors_num = 0; Float32 input_allow_errors_ratio = 0; diff --git a/tests/queries/0_stateless/02244_ip_address_invalid_insert.reference b/tests/queries/0_stateless/02244_ip_address_invalid_insert.reference new file mode 100644 index 00000000000..60e6a5da083 --- /dev/null +++ b/tests/queries/0_stateless/02244_ip_address_invalid_insert.reference @@ -0,0 +1,10 @@ +1.1.1.1 1.1.1.1 + 0.0.0.0 +1.1.1.1 1.1.1.1 + 0.0.0.0 +fe80::9801:43ff:fe1f:7690 fe80::9801:43ff:fe1f:7690 +1.1.1.1 :: + :: +fe80::9801:43ff:fe1f:7690 fe80::9801:43ff:fe1f:7690 +1.1.1.1 ::ffff:1.1.1.1 + :: diff --git a/tests/queries/0_stateless/02244_ip_address_invalid_insert.sql b/tests/queries/0_stateless/02244_ip_address_invalid_insert.sql new file mode 100644 index 00000000000..4057b9b2d98 --- /dev/null +++ b/tests/queries/0_stateless/02244_ip_address_invalid_insert.sql @@ -0,0 +1,81 @@ +DROP TABLE IF EXISTS test_table_ipv4; +CREATE TABLE test_table_ipv4 +( + ip String, + ipv4 IPv4 +) ENGINE = TinyLog; + +INSERT INTO test_table_ipv4 VALUES ('1.1.1.1', '1.1.1.1'), ('', ''); --{clientError 441} + +SET input_format_ipv4_default_on_conversion_error = 1; + +INSERT INTO test_table_ipv4 VALUES ('1.1.1.1', '1.1.1.1'), ('', ''); +SELECT ip, ipv4 FROM test_table_ipv4; + +SET input_format_ipv4_default_on_conversion_error = 0; + +DROP TABLE test_table_ipv4; + +DROP TABLE IF EXISTS test_table_ipv4_materialized; +CREATE TABLE test_table_ipv4_materialized +( + ip String, + ipv6 IPv4 MATERIALIZED toIPv4(ip) +) ENGINE = TinyLog; + +INSERT INTO test_table_ipv4_materialized(ip) VALUES ('1.1.1.1'), (''); --{serverError 441} + +SET input_format_ipv4_default_on_conversion_error = 1; + +INSERT INTO test_table_ipv4_materialized(ip) VALUES ('1.1.1.1'), (''); --{serverError 441} + +SET cast_ipv4_ipv6_default_on_conversion_error = 1; + +INSERT INTO test_table_ipv4_materialized(ip) VALUES ('1.1.1.1'), (''); +SELECT ip, ipv6 FROM test_table_ipv4_materialized; + +SET input_format_ipv4_default_on_conversion_error = 0; +SET cast_ipv4_ipv6_default_on_conversion_error = 0; + +DROP TABLE test_table_ipv4_materialized; + +DROP TABLE IF EXISTS test_table_ipv6; +CREATE TABLE test_table_ipv6 +( + ip String, + ipv6 IPv6 +) ENGINE = TinyLog; + +INSERT INTO test_table_ipv6 VALUES ('fe80::9801:43ff:fe1f:7690', 'fe80::9801:43ff:fe1f:7690'), ('1.1.1.1', '1.1.1.1'), ('', ''); --{clientError 441} + +SET input_format_ipv6_default_on_conversion_error = 1; + +INSERT INTO test_table_ipv6 VALUES ('fe80::9801:43ff:fe1f:7690', 'fe80::9801:43ff:fe1f:7690'), ('1.1.1.1', '1.1.1.1'), ('', ''); +SELECT ip, ipv6 FROM test_table_ipv6; + +SET input_format_ipv6_default_on_conversion_error = 0; + +DROP TABLE test_table_ipv6; + +DROP TABLE IF EXISTS test_table_ipv6_materialized; +CREATE TABLE test_table_ipv6_materialized +( + ip String, + ipv6 IPv6 MATERIALIZED toIPv6(ip) +) ENGINE = TinyLog; + +INSERT INTO test_table_ipv6_materialized(ip) VALUES ('fe80::9801:43ff:fe1f:7690'), ('1.1.1.1'), (''); --{serverError 441} + +SET input_format_ipv6_default_on_conversion_error = 1; + +INSERT INTO test_table_ipv6_materialized(ip) VALUES ('fe80::9801:43ff:fe1f:7690'), ('1.1.1.1'), (''); --{serverError 441} + +SET cast_ipv4_ipv6_default_on_conversion_error = 1; + +INSERT INTO test_table_ipv6_materialized(ip) VALUES ('fe80::9801:43ff:fe1f:7690'), ('1.1.1.1'), (''); +SELECT ip, ipv6 FROM test_table_ipv6_materialized; + +SET input_format_ipv6_default_on_conversion_error = 0; 
+SET cast_ipv4_ipv6_default_on_conversion_error = 0; + +DROP TABLE test_table_ipv6_materialized; diff --git a/tests/queries/0_stateless/02244_ipv6_invalid_insert.reference b/tests/queries/0_stateless/02244_ipv6_invalid_insert.reference deleted file mode 100644 index 783d8f124dd..00000000000 --- a/tests/queries/0_stateless/02244_ipv6_invalid_insert.reference +++ /dev/null @@ -1,4 +0,0 @@ -fe80::9801:43ff:fe1f:7690 -1.1.1.1 - -::ffff:1.1.1.1 diff --git a/tests/queries/0_stateless/02244_ipv6_invalid_insert.sql b/tests/queries/0_stateless/02244_ipv6_invalid_insert.sql deleted file mode 100644 index 98fb45a5758..00000000000 --- a/tests/queries/0_stateless/02244_ipv6_invalid_insert.sql +++ /dev/null @@ -1,11 +0,0 @@ -DROP TABLE IF EXISTS test_table; -CREATE TABLE test_table(ip String, ipv6 IPv6 MATERIALIZED toIPv6(ip)) ENGINE = TinyLog; - -INSERT INTO test_table(ip) VALUES ('fe80::9801:43ff:fe1f:7690'), ('1.1.1.1'), (''), ('::ffff:1.1.1.1' ); --{serverError 441} - -SET cast_ipv4_ipv6_default_on_conversion_error = 1; - -INSERT INTO test_table(ip) VALUES ( 'fe80::9801:43ff:fe1f:7690'), ('1.1.1.1'), (''), ('::ffff:1.1.1.1' ); -SELECT * FROM test_table; - -DROP TABLE test_table; From 0722beca0c615cd698a4efc71c4ebd1f642585bb Mon Sep 17 00:00:00 2001 From: Denny Crane Date: Wed, 30 Mar 2022 08:45:30 -0300 Subject: [PATCH 066/239] Update Client.cpp --- programs/client/Client.cpp | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/programs/client/Client.cpp b/programs/client/Client.cpp index c2094b3b00d..3d5cc291f46 100644 --- a/programs/client/Client.cpp +++ b/programs/client/Client.cpp @@ -810,7 +810,7 @@ void Client::addOptions(OptionsDescription & options_description) ("quota_key", po::value(), "A string to differentiate quotas when the user have keyed quotas configured on server") ("max_client_network_bandwidth", po::value(), "the maximum speed of data exchange over the network for the client in bytes per second.") - ("compression", po::value(), "enable or disable compression") + ("compression", po::value(), "enable or disable compression (enabled by default for remote communication and disabled for localhost communication).") ("query-fuzzer-runs", po::value()->default_value(0), "After executing every SELECT query, do random mutations in it and run again specified number of times. This is used for testing to discover unexpected corner cases.") ("interleave-queries-file", po::value>()->multitoken(), From 5cb2301e398552ba9f3ccabde0f1563481e5c3df Mon Sep 17 00:00:00 2001 From: chen9t Date: Wed, 30 Mar 2022 20:40:20 +0800 Subject: [PATCH 067/239] Clean dirty meta cache when part is broken during part loading phase --- src/Storages/MergeTree/IMergeTreeDataPart.cpp | 42 +++++++++++-------- 1 file changed, 25 insertions(+), 17 deletions(-) diff --git a/src/Storages/MergeTree/IMergeTreeDataPart.cpp b/src/Storages/MergeTree/IMergeTreeDataPart.cpp index 69ed238d78e..76d42e4b785 100644 --- a/src/Storages/MergeTree/IMergeTreeDataPart.cpp +++ b/src/Storages/MergeTree/IMergeTreeDataPart.cpp @@ -635,24 +635,32 @@ void IMergeTreeDataPart::loadColumnsChecksumsIndexes(bool require_columns_checks /// Motivation: memory for index is shared between queries - not belong to the query itself. 
MemoryTrackerBlockerInThread temporarily_disable_memory_tracker(VariableContext::Global); - loadUUID(); - loadColumns(require_columns_checksums); - loadChecksums(require_columns_checksums); - loadIndexGranularity(); - calculateColumnsAndSecondaryIndicesSizesOnDisk(); - loadIndex(); /// Must be called after loadIndexGranularity as it uses the value of `index_granularity` - loadRowsCount(); /// Must be called after loadIndexGranularity() as it uses the value of `index_granularity`. - loadPartitionAndMinMaxIndex(); - if (!parent_part) - { - loadTTLInfos(); - loadProjections(require_columns_checksums, check_consistency); + try { + loadUUID(); + loadColumns(require_columns_checksums); + loadChecksums(require_columns_checksums); + loadIndexGranularity(); + calculateColumnsAndSecondaryIndicesSizesOnDisk(); + loadIndex(); /// Must be called after loadIndexGranularity as it uses the value of `index_granularity` + loadRowsCount(); /// Must be called after loadIndexGranularity() as it uses the value of `index_granularity`. + loadPartitionAndMinMaxIndex(); + if (!parent_part) + { + loadTTLInfos(); + loadProjections(require_columns_checksums, check_consistency); + } + + if (check_consistency) + checkConsistency(require_columns_checksums); + + loadDefaultCompressionCodec(); + } catch (...) { + // The data part being loaded may turn out to be broken after some of its metadata has already been written. + // In that case, remove all metadata written so far. + metadata_manager->deleteAll(/*include_projection*/true); + metadata_manager->assertAllDeleted(/*include_projection*/true); + throw; + } - - if (check_consistency) - checkConsistency(require_columns_checksums); - - loadDefaultCompressionCodec(); } void IMergeTreeDataPart::appendFilesOfColumnsChecksumsIndexes(Strings & files, bool include_projection) const From 8c05a3dffc662b2d638c0bbcbb4993b3e175149e Mon Sep 17 00:00:00 2001 From: Kruglov Pavel <48961922+Avogar@users.noreply.github.com> Date: Wed, 30 Mar 2022 15:00:32 +0200 Subject: [PATCH 068/239] Update 01825_type_json_parallel_insert.sql --- tests/queries/0_stateless/01825_type_json_parallel_insert.sql | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/tests/queries/0_stateless/01825_type_json_parallel_insert.sql b/tests/queries/0_stateless/01825_type_json_parallel_insert.sql index f54004a6630..93d1eecfbd7 100644 --- a/tests/queries/0_stateless/01825_type_json_parallel_insert.sql +++ b/tests/queries/0_stateless/01825_type_json_parallel_insert.sql @@ -1,4 +1,4 @@ --- Tags: long +-- Tags: long, no-backward-compatibility-check:22.3.2.1 DROP TABLE IF EXISTS t_json_parallel; SET allow_experimental_object_type = 1, max_insert_threads = 20, max_threads = 20; From 738966b6b6bf1683787906c692308e8f870bc040 Mon Sep 17 00:00:00 2001 From: shuchaome Date: Thu, 31 Mar 2022 00:03:45 +0800 Subject: [PATCH 069/239] fix filebuffer pos in RemoteReadBuffer --- src/Storages/Cache/ExternalDataSourceCache.cpp | 2 ++ 1 file changed, 2 insertions(+) diff --git a/src/Storages/Cache/ExternalDataSourceCache.cpp b/src/Storages/Cache/ExternalDataSourceCache.cpp index 18607c16ffa..17966d49c74 100644 --- a/src/Storages/Cache/ExternalDataSourceCache.cpp +++ b/src/Storages/Cache/ExternalDataSourceCache.cpp @@ -94,6 +94,8 @@ bool RemoteReadBuffer::nextImpl() return status; } + // file_buffer's position must advance as the RemoteReadBuffer is consumed; otherwise start_offset will be incorrect.
+ local_file_holder->file_buffer->position() = local_file_holder->file_buffer->buffer().begin() + BufferBase::offset(); auto start_offset = local_file_holder->file_buffer->getPosition(); auto end_offset = start_offset + local_file_holder->file_buffer->internalBuffer().size(); local_file_holder->file_cache_controller->value().waitMoreData(start_offset, end_offset); From 91eec8962fb78e03be002966b61bcfbdf6228a9d Mon Sep 17 00:00:00 2001 From: Maksim Kita Date: Wed, 30 Mar 2022 18:39:28 +0200 Subject: [PATCH 070/239] Rename test --- src/Functions/tests/{gtest_hasAll.cpp => gtest_has_all.cpp} | 0 1 file changed, 0 insertions(+), 0 deletions(-) rename src/Functions/tests/{gtest_hasAll.cpp => gtest_has_all.cpp} (100%) diff --git a/src/Functions/tests/gtest_hasAll.cpp b/src/Functions/tests/gtest_has_all.cpp similarity index 100% rename from src/Functions/tests/gtest_hasAll.cpp rename to src/Functions/tests/gtest_has_all.cpp From 8d0a9689e4cad21dd03e459a74b9a0a564b0db60 Mon Sep 17 00:00:00 2001 From: Maksim Kita Date: Wed, 30 Mar 2022 18:40:18 +0200 Subject: [PATCH 071/239] Update gatherutils CMakeLists to use X86_INTRINSICS_FLAGS from cpu_features --- src/Functions/GatherUtils/CMakeLists.txt | 10 +--------- 1 file changed, 1 insertion(+), 9 deletions(-) diff --git a/src/Functions/GatherUtils/CMakeLists.txt b/src/Functions/GatherUtils/CMakeLists.txt index 10909b99b82..460b02326a1 100644 --- a/src/Functions/GatherUtils/CMakeLists.txt +++ b/src/Functions/GatherUtils/CMakeLists.txt @@ -1,5 +1,4 @@ include("${ClickHouse_SOURCE_DIR}/cmake/dbms_glob_sources.cmake") -option(ENABLE_AVX2 "Enable AVX2 instructions (when available) when build for modern Intel CPUs" OFF) add_headers_and_sources(clickhouse_functions_gatherutils .) add_library(clickhouse_functions_gatherutils ${clickhouse_functions_gatherutils_sources} ${clickhouse_functions_gatherutils_headers}) @@ -17,11 +16,4 @@ if (STRIP_DEBUG_SYMBOLS_FUNCTIONS) target_compile_options(clickhouse_functions_gatherutils PRIVATE "-g0") endif() -if (HAVE_SSE42) - set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -msse4.2") - target_compile_options(clickhouse_functions_gatherutils PRIVATE -msse4.2) -endif() -if (HAVE_AVX2 AND ENABLE_AVX2) - set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -mavx2 -DENABLE_AVX2") - target_compile_options(clickhouse_functions_gatherutils PRIVATE -mavx2) -endif() +set_target_properties(clickhouse_functions_gatherutils PROPERTIES COMPILE_FLAGS "${X86_INTRINSICS_FLAGS}") From e43fdcd7ebb54ce15eb4612dba18f93beae03802 Mon Sep 17 00:00:00 2001 From: Maksim Kita Date: Wed, 30 Mar 2022 18:41:34 +0200 Subject: [PATCH 072/239] Function hasAll added dynamic dispatch for SSE4.2, AVX2 --- .../GatherUtils/sliceHasImplAnyAll.h | 2302 ++++++----------- 1 file changed, 848 insertions(+), 1454 deletions(-) diff --git a/src/Functions/GatherUtils/sliceHasImplAnyAll.h b/src/Functions/GatherUtils/sliceHasImplAnyAll.h index 52448f88447..97ac0c6be72 100644 --- a/src/Functions/GatherUtils/sliceHasImplAnyAll.h +++ b/src/Functions/GatherUtils/sliceHasImplAnyAll.h @@ -9,37 +9,836 @@ #include #include #endif + #if defined(__AVX2__) #include #endif -namespace DB::GatherUtils -{ -namespace +#include + +namespace DB::GatherUtils { inline ALWAYS_INLINE bool hasNull(const UInt8 * null_map, size_t null_map_size) { - if (null_map != nullptr) - { - for (size_t i = 0; i < null_map_size; ++i) - { - if (null_map[i]) - return true; - } + if (null_map == nullptr) { + return false; } + + for (size_t i = 0; i < null_map_size; ++i) + { + if (null_map[i]) + return true; + } + return false; } 
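// A minimal standalone sketch of the core "all lanes matched" check that the AVX2/SSE4.2
// specializations below are built from, assuming AVX2 (compile with -mavx2). It elides the
// rotation steps and the null-map handling of the real code: per-lane equality masks are OR-ed
// into a bitmask, and _mm256_testc_si256(bitmask, ones) reports 1 only when every bit of the
// bitmask is set, i.e. when every lane of the `second` block found a match.

#include <immintrin.h>
#include <cstdint>
#include <cstdio>

/// Returns true when each lane of one 4 x Int64 block of `second` equals the corresponding lane of `first`.
/// The full implementation additionally ORs in comparisons against rotated copies of `first_data`
/// and masks out null elements via the null maps.
static bool blockFullyMatched(const int64_t * first, const int64_t * second)
{
    const __m256i ones = _mm256_set1_epi64x(-1);
    const __m256i first_data = _mm256_loadu_si256(reinterpret_cast<const __m256i *>(first));
    const __m256i second_data = _mm256_loadu_si256(reinterpret_cast<const __m256i *>(second));
    /// Lane i of the result is all-ones iff second[i] == first[i].
    __m256i bitmask = _mm256_cmpeq_epi64(second_data, first_data);
    /// testc(bitmask, ones) == 1 iff (~bitmask & ones) == 0, i.e. all four lanes matched.
    return _mm256_testc_si256(bitmask, ones);
}

int main()
{
    const int64_t a[4] = {1, 2, 3, 4};
    const int64_t b[4] = {1, 2, 3, 4};
    std::printf("%d\n", blockFullyMatched(a, b) ? 1 : 0); /// prints 1
}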
+template +inline ALWAYS_INLINE bool hasAllIntegralLoopRemainder( + size_t j, const NumericArraySlice & first, const NumericArraySlice & second, const UInt8 * first_null_map, const UInt8 * second_null_map) +{ + const bool has_first_null_map = first_null_map != nullptr; + const bool has_second_null_map = second_null_map != nullptr; + + for (; j < second.size; ++j) + { + // skip null elements since both have at least one - assuming it was checked earlier that at least one element in 'first' is null + if (has_second_null_map && second_null_map[j]) + continue; + + bool found = false; + + for (size_t i = 0; i < first.size; ++i) + { + if (has_first_null_map && first_null_map[i]) + continue; + + if (first.data[i] == second.data[j]) + { + found = true; + break; + } + } + + if (!found) + return false; + } + return true; } -/// Methods to check if first array has elements from second array, overloaded for various combinations of types. + +#if defined(__AVX2__) + +DECLARE_AVX2_SPECIFIC_CODE ( + +// AVX2 Int64, UInt64 specialization +template +requires (std::is_same_v || std::is_same_v) +inline ALWAYS_INLINE bool sliceHasImplAnyAllImplInt64( + const NumericArraySlice & first, + const NumericArraySlice & second, + const UInt8 * first_null_map, + const UInt8 * second_null_map) +{ + if (second.size == 0) + return true; + + if (!hasNull(first_null_map, first.size) && hasNull(second_null_map, second.size)) + return false; + + const bool has_first_null_map = first_null_map != nullptr; + const bool has_second_null_map = second_null_map != nullptr; + + size_t j = 0; + int has_mask = 1; + static constexpr Int64 full = -1, none = 0; + const __m256i ones = _mm256_set1_epi64x(full); + const __m256i zeros = _mm256_setzero_si256(); + + if (second.size > 3 && first.size > 3) + { + for (; j < second.size - 3 && has_mask; j += 4) + { + has_mask = 0; + const __m256i second_data = _mm256_loadu_si256(reinterpret_cast(second.data + j)); + // bits of the bitmask are set to one if considered as null in the corresponding null map, 0 otherwise; + __m256i bitmask = has_second_null_map ? + _mm256_set_epi64x( + (second_null_map[j + 3])? full : none, + (second_null_map[j + 2])? full : none, + (second_null_map[j + 1])? full : none, + (second_null_map[j]) ? full : none) + : zeros; + + size_t i = 0; + for (; i < first.size - 3 && !has_mask; has_mask = _mm256_testc_si256(bitmask, ones), i += 4) + { + const __m256i first_data = _mm256_loadu_si256(reinterpret_cast(first.data + i)); + const __m256i first_nm_mask = has_first_null_map? 
+ _mm256_set_m128i( + _mm_cvtepi8_epi64(_mm_loadu_si128(reinterpret_cast(first_null_map + i + 2))), + _mm_cvtepi8_epi64(_mm_loadu_si128(reinterpret_cast(first_null_map + i)))) + : zeros; + bitmask = + _mm256_or_si256( + _mm256_or_si256( + _mm256_or_si256( + _mm256_andnot_si256( + first_nm_mask, + _mm256_cmpeq_epi64(second_data, first_data)), + _mm256_andnot_si256( + _mm256_permutevar8x32_epi32(first_nm_mask, _mm256_set_epi32(5,4,3,2,1,0,7,6)), + _mm256_cmpeq_epi64(second_data, _mm256_permutevar8x32_epi32(first_data, _mm256_set_epi32(5,4,3,2,1,0,7,6))))), + + _mm256_or_si256( + _mm256_andnot_si256( + _mm256_permutevar8x32_epi32(first_nm_mask, _mm256_set_epi32(3,2,1,0,7,6,5,4)), + _mm256_cmpeq_epi64(second_data, _mm256_permutevar8x32_epi32(first_data, _mm256_set_epi32(3,2,1,0,7,6,5,4)))), + _mm256_andnot_si256( + _mm256_permutevar8x32_epi32(first_nm_mask, _mm256_set_epi32(1,0,7,6,5,4,3,2)), + _mm256_cmpeq_epi64(second_data, _mm256_permutevar8x32_epi32(first_data, _mm256_set_epi32(1,0,7,6,5,4,3,2)))))), + bitmask); + } + + if (i < first.size) + { + for (; i < first.size && !has_mask; ++i) + { + if (has_first_null_map && first_null_map[i]) + continue; + + __m256i v_i = _mm256_set1_epi64x(first.data[i]); + bitmask = _mm256_or_si256(bitmask, _mm256_cmpeq_epi64(second_data, v_i)); + has_mask = _mm256_testc_si256(bitmask, ones); + } + } + } + } + + if (!has_mask && second.size > 3) + return false; + + return hasAllIntegralLoopRemainder(j, first, second, first_null_map, second_null_map); +} + +// AVX2 Int32, UInt32 specialization +template +requires (std::is_same_v || std::is_same_v) +inline ALWAYS_INLINE bool sliceHasImplAnyAllImplInt32( + const NumericArraySlice & first, + const NumericArraySlice & second, + const UInt8 * first_null_map, + const UInt8 * second_null_map) +{ + if (second.size == 0) + return true; + + if (!hasNull(first_null_map, first.size) && hasNull(second_null_map, second.size)) + return false; + + const bool has_first_null_map = first_null_map != nullptr; + const bool has_second_null_map = second_null_map != nullptr; + + size_t j = 0; + int has_mask = 1; + static constexpr int full = -1, none = 0; + + const __m256i ones = _mm256_set1_epi32(full); + const __m256i zeros = _mm256_setzero_si256(); + + if (second.size > 7 && first.size > 7) + { + for (; j < second.size - 7 && has_mask; j += 8) + { + has_mask = 0; + const __m256i second_data = _mm256_loadu_si256(reinterpret_cast(second.data + j)); + // bits of the bitmask are set to one if considered as null in the corresponding null map, 0 otherwise; + __m256i bitmask = has_second_null_map ? + _mm256_set_epi32( + (second_null_map[j + 7]) ? full : none, + (second_null_map[j + 6]) ? full : none, + (second_null_map[j + 5]) ? full : none, + (second_null_map[j + 4]) ? full : none, + (second_null_map[j + 3]) ? full : none, + (second_null_map[j + 2]) ? full : none, + (second_null_map[j + 1]) ? full : none, + (second_null_map[j]) ? full : none) + : zeros; + + size_t i = 0; + for (; i < first.size - 7 && !has_mask; has_mask = _mm256_testc_si256(bitmask, ones), i += 8) + { + const __m256i first_data = _mm256_loadu_si256(reinterpret_cast(first.data + i)); + // Create a mask to avoid to compare null elements + // set_m128i takes two arguments: (high segment, low segment) that are two __m128i convert from 8bits to 32bits to match with next operations + const __m256i first_nm_mask = has_first_null_map? 
+ _mm256_set_m128i( + _mm_cvtepi8_epi32(_mm_loadu_si128(reinterpret_cast(first_null_map + i + 4))), + _mm_cvtepi8_epi32(_mm_loadu_si128(reinterpret_cast(first_null_map + i)))) + : zeros; + bitmask = + _mm256_or_si256( + _mm256_or_si256( + _mm256_or_si256( + _mm256_or_si256( + _mm256_andnot_si256( + first_nm_mask, + _mm256_cmpeq_epi32(second_data, first_data)), + _mm256_andnot_si256( + _mm256_permutevar8x32_epi32(first_nm_mask, _mm256_set_epi32(6,5,4,3,2,1,0,7)), + _mm256_cmpeq_epi32(second_data, _mm256_permutevar8x32_epi32(first_data, _mm256_set_epi32(6,5,4,3,2,1,0,7))))), + _mm256_or_si256( + _mm256_andnot_si256( + _mm256_permutevar8x32_epi32(first_nm_mask, _mm256_set_epi32(5,4,3,2,1,0,7,6)), + _mm256_cmpeq_epi32(second_data, _mm256_permutevar8x32_epi32(first_data, _mm256_set_epi32(5,4,3,2,1,0,7,6)))), + _mm256_andnot_si256( + _mm256_permutevar8x32_epi32(first_nm_mask, _mm256_set_epi32(4,3,2,1,0,7,6,5)), + _mm256_cmpeq_epi32(second_data, _mm256_permutevar8x32_epi32(first_data, _mm256_set_epi32(4,3,2,1,0,7,6,5))))) + ), + _mm256_or_si256( + _mm256_or_si256( + _mm256_andnot_si256( + _mm256_permutevar8x32_epi32(first_nm_mask, _mm256_set_epi32(3,2,1,0,7,6,5,4)), + _mm256_cmpeq_epi32(second_data, _mm256_permutevar8x32_epi32(first_data, _mm256_set_epi32(3,2,1,0,7,6,5,4)))), + _mm256_andnot_si256( + _mm256_permutevar8x32_epi32(first_nm_mask, _mm256_set_epi32(2,1,0,7,6,5,4,3)), + _mm256_cmpeq_epi32(second_data, _mm256_permutevar8x32_epi32(first_data, _mm256_set_epi32(2,1,0,7,6,5,4,3))))), + _mm256_or_si256( + _mm256_andnot_si256( + _mm256_permutevar8x32_epi32(first_nm_mask, _mm256_set_epi32(1,0,7,6,5,4,3,2)), + _mm256_cmpeq_epi32(second_data, _mm256_permutevar8x32_epi32(first_data, _mm256_set_epi32(1,0,7,6,5,4,3,2)))), + _mm256_andnot_si256( + _mm256_permutevar8x32_epi32(first_nm_mask, _mm256_set_epi32(0,7,6,5,4,3,2,1)), + _mm256_cmpeq_epi32(second_data, _mm256_permutevar8x32_epi32(first_data, _mm256_set_epi32(0,7,6,5,4,3,2,1))))))), + bitmask); + } + + if (i < first.size) + { + for (; i < first.size && !has_mask; ++i) + { + if (has_first_null_map && first_null_map[i]) + continue; + + __m256i v_i = _mm256_set1_epi32(first.data[i]); + bitmask = _mm256_or_si256(bitmask, _mm256_cmpeq_epi32(second_data, v_i)); + has_mask = _mm256_testc_si256(bitmask, ones); + } + } + } + } + + if (!has_mask && second.size > 7) + return false; + + return hasAllIntegralLoopRemainder(j, first, second, first_null_map, second_null_map); +} + +// AVX2 Int16, UInt16 specialization +template +requires (std::is_same_v || std::is_same_v) +inline ALWAYS_INLINE bool sliceHasImplAnyAllImplInt16( + const NumericArraySlice & first, + const NumericArraySlice & second, + const UInt8 * first_null_map, + const UInt8 * second_null_map) +{ + if (second.size == 0) + return true; + + if (!hasNull(first_null_map, first.size) && hasNull(second_null_map, second.size)) + return false; + + const bool has_first_null_map = first_null_map != nullptr; + const bool has_second_null_map = second_null_map != nullptr; + + size_t j = 0; + int has_mask = 1; + static constexpr int16_t full = -1, none = 0; + const __m256i ones = _mm256_set1_epi16(full); + const __m256i zeros = _mm256_setzero_si256(); + if (second.size > 15 && first.size > 15) + { + for (; j < second.size - 15 && has_mask; j += 16) + { + has_mask = 0; + const __m256i second_data = _mm256_loadu_si256(reinterpret_cast(second.data + j)); + __m256i bitmask = has_second_null_map ? + _mm256_set_epi16( + (second_null_map[j + 15]) ? full : none, (second_null_map[j + 14]) ? 
full : none, + (second_null_map[j + 13]) ? full : none, (second_null_map[j + 12]) ? full : none, + (second_null_map[j + 11]) ? full : none, (second_null_map[j + 10]) ? full : none, + (second_null_map[j + 9]) ? full : none, (second_null_map[j + 8])? full : none, + (second_null_map[j + 7]) ? full : none, (second_null_map[j + 6])? full : none, + (second_null_map[j + 5]) ? full : none, (second_null_map[j + 4])? full : none, + (second_null_map[j + 3]) ? full : none, (second_null_map[j + 2])? full : none, + (second_null_map[j + 1]) ? full : none, (second_null_map[j]) ? full : none) + : zeros; + + size_t i = 0; + for (; i < first.size - 15 && !has_mask; has_mask = _mm256_testc_si256(bitmask, ones), i += 16) + { + const __m256i first_data = _mm256_loadu_si256(reinterpret_cast(first.data + i)); + const __m256i first_nm_mask = has_first_null_map? + _mm256_set_m128i( + _mm_cvtepi8_epi16(_mm_loadu_si128(reinterpret_cast(first_null_map + i + 8))), + _mm_cvtepi8_epi16(_mm_loadu_si128(reinterpret_cast(first_null_map + i)))) + : zeros; + + bitmask = + _mm256_or_si256( + _mm256_or_si256( + _mm256_or_si256( + _mm256_or_si256( + _mm256_or_si256( + _mm256_andnot_si256( + first_nm_mask, + _mm256_cmpeq_epi16(second_data, first_data)), + _mm256_andnot_si256( + _mm256_shuffle_epi8(first_nm_mask, _mm256_set_epi8(29,28,27,26,25,24,23,22,21,20,19,18,17,16,15,14,13,12,11,10,9,8,7,6,5,4,3,2,1,0,31,30)), + _mm256_cmpeq_epi16(second_data, _mm256_shuffle_epi8(first_data, _mm256_set_epi8(29,28,27,26,25,24,23,22,21,20,19,18,17,16,15,14,13,12,11,10,9,8,7,6,5,4,3,2,1,0,31,30))))), + _mm256_or_si256( + _mm256_andnot_si256( + _mm256_shuffle_epi8(first_nm_mask, _mm256_set_epi8(27,26,25,24,23,22,21,20,19,18,17,16,15,14,13,12,11,10,9,8,7,6,5,4,3,2,1,0,31,30,29,28)), + _mm256_cmpeq_epi16(second_data, _mm256_shuffle_epi8(first_data, _mm256_set_epi8(27,26,25,24,23,22,21,20,19,18,17,16,15,14,13,12,11,10,9,8,7,6,5,4,3,2,1,0,31,30,29,28)))), + _mm256_andnot_si256( + _mm256_shuffle_epi8(first_nm_mask, _mm256_set_epi8(25,24,23,22,21,20,19,18,17,16,15,14,13,12,11,10,9,8,7,6,5,4,3,2,1,0,31,30,29,28,27,26)), + _mm256_cmpeq_epi16(second_data, _mm256_shuffle_epi8(first_data, _mm256_set_epi8(25,24,23,22,21,20,19,18,17,16,15,14,13,12,11,10,9,8,7,6,5,4,3,2,1,0,31,30,29,28,27,26))))) + ), + _mm256_or_si256( + _mm256_or_si256( + _mm256_andnot_si256( + _mm256_shuffle_epi8(first_nm_mask, _mm256_set_epi8(23,22,21,20,19,18,17,16,15,14,13,12,11,10,9,8,7,6,5,4,3,2,1,0,31,30,29,28,27,26,25,24)), + _mm256_cmpeq_epi16(second_data, _mm256_shuffle_epi8(first_data, _mm256_set_epi8(23,22,21,20,19,18,17,16,15,14,13,12,11,10,9,8,7,6,5,4,3,2,1,0,31,30,29,28,27,26,25,24)))), + _mm256_andnot_si256( + _mm256_shuffle_epi8(first_nm_mask, _mm256_set_epi8(21,20,19,18,17,16,15,14,13,12,11,10,9,8,7,6,5,4,3,2,1,0,31,30,29,28,27,26,25,24,23,22)), + _mm256_cmpeq_epi16(second_data, _mm256_shuffle_epi8(first_data, _mm256_set_epi8(21,20,19,18,17,16,15,14,13,12,11,10,9,8,7,6,5,4,3,2,1,0,31,30,29,28,27,26,25,24,23,22))))), + _mm256_or_si256( + _mm256_andnot_si256( + _mm256_shuffle_epi8(first_nm_mask, _mm256_set_epi8(19,18,17,16,15,14,13,12,11,10,9,8,7,6,5,4,3,2,1,0,31,30,29,28,27,26,25,24,23,22,21,20)), + _mm256_cmpeq_epi16(second_data, _mm256_shuffle_epi8(first_data, _mm256_set_epi8(19,18,17,16,15,14,13,12,11,10,9,8,7,6,5,4,3,2,1,0,31,30,29,28,27,26,25,24,23,22,21,20)))), + _mm256_andnot_si256( + _mm256_shuffle_epi8(first_nm_mask, _mm256_set_epi8(17,16,15,14,13,12,11,10,9,8,7,6,5,4,3,2,1,0,31,30,29,28,27,26,25,24,23,22,21,20,19,18)), + _mm256_cmpeq_epi16(second_data, 
_mm256_shuffle_epi8(first_data, _mm256_set_epi8(17,16,15,14,13,12,11,10,9,8,7,6,5,4,3,2,1,0,31,30,29,28,27,26,25,24,23,22,21,20,19,18)))))) + ), + _mm256_or_si256( + _mm256_or_si256( + _mm256_or_si256( + _mm256_andnot_si256( + _mm256_permute2x128_si256(first_nm_mask, first_nm_mask,1), + _mm256_cmpeq_epi16(second_data, _mm256_permute2x128_si256(first_data, first_data, 1))), + _mm256_andnot_si256( + _mm256_shuffle_epi8(_mm256_permute2x128_si256(first_nm_mask, first_nm_mask, 1), _mm256_set_epi8(13,12,11,10,9,8,7,6,5,4,3,2,1,0,31,30,29,28,27,26,25,24,23,22,21,20,19,18,17,16,15,14)), + _mm256_cmpeq_epi16(second_data, _mm256_shuffle_epi8(_mm256_permute2x128_si256(first_data, first_data, 1), _mm256_set_epi8(13,12,11,10,9,8,7,6,5,4,3,2,1,0,31,30,29,28,27,26,25,24,23,22,21,20,19,18,17,16,15,14))))), + _mm256_or_si256( + _mm256_andnot_si256( + _mm256_shuffle_epi8(_mm256_permute2x128_si256(first_nm_mask, first_nm_mask, 1), _mm256_set_epi8(11,10,9,8,7,6,5,4,3,2,1,0,31,30,29,28,27,26,25,24,23,22,21,20,19,18,17,16,15,14,13,12)), + _mm256_cmpeq_epi16(second_data, _mm256_shuffle_epi8(_mm256_permute2x128_si256(first_data, first_data, 1), _mm256_set_epi8(11,10,9,8,7,6,5,4,3,2,1,0,31,30,29,28,27,26,25,24,23,22,21,20,19,18,17,16,15,14,13,12)))), + _mm256_andnot_si256( + _mm256_shuffle_epi8(_mm256_permute2x128_si256(first_nm_mask, first_nm_mask, 1), _mm256_set_epi8(9,8,7,6,5,4,3,2,1,0,31,30,29,28,27,26,25,24,23,22,21,20,19,18,17,16,15,14,13,12,11,10)), + _mm256_cmpeq_epi16(second_data, _mm256_shuffle_epi8(_mm256_permute2x128_si256(first_data, first_data, 1), _mm256_set_epi8(9,8,7,6,5,4,3,2,1,0,31,30,29,28,27,26,25,24,23,22,21,20,19,18,17,16,15,14,13,12,11,10))))) + ), + _mm256_or_si256( + _mm256_or_si256( + _mm256_andnot_si256( + _mm256_shuffle_epi8(_mm256_permute2x128_si256(first_nm_mask, first_nm_mask, 1), _mm256_set_epi8(7,6,5,4,3,2,1,0,31,30,29,28,27,26,25,24,23,22,21,20,19,18,17,16,15,14,13,12,11,10,9,8)), + _mm256_cmpeq_epi16(second_data, _mm256_shuffle_epi8(_mm256_permute2x128_si256(first_data ,first_data, 1), _mm256_set_epi8(7,6,5,4,3,2,1,0,31,30,29,28,27,26,25,24,23,22,21,20,19,18,17,16,15,14,13,12,11,10,9,8)))), + _mm256_andnot_si256( + _mm256_shuffle_epi8(_mm256_permute2x128_si256(first_nm_mask, first_nm_mask, 1), _mm256_set_epi8(5,4,3,2,1,0,31,30,29,28,27,26,25,24,23,22,21,20,19,18,17,16,15,14,13,12,11,10,9,8,7,6)), + _mm256_cmpeq_epi16(second_data, _mm256_shuffle_epi8(_mm256_permute2x128_si256(first_data, first_data, 1), _mm256_set_epi8(5,4,3,2,1,0,31,30,29,28,27,26,25,24,23,22,21,20,19,18,17,16,15,14,13,12,11,10,9,8,7,6))))), + _mm256_or_si256( + _mm256_andnot_si256( + _mm256_shuffle_epi8(_mm256_permute2x128_si256(first_nm_mask, first_nm_mask, 1), _mm256_set_epi8(3,2,1,0,31,30,29,28,27,26,25,24,23,22,21,20,19,18,17,16,15,14,13,12,11,10,9,8,7,6,5,4)), + _mm256_cmpeq_epi16(second_data, _mm256_shuffle_epi8(_mm256_permute2x128_si256(first_data ,first_data ,1), _mm256_set_epi8(3,2,1,0,31,30,29,28,27,26,25,24,23,22,21,20,19,18,17,16,15,14,13,12,11,10,9,8,7,6,5,4)))), + _mm256_andnot_si256( + _mm256_shuffle_epi8(_mm256_permute2x128_si256(first_nm_mask, first_nm_mask, 1), _mm256_set_epi8(1,0,31,30,29,28,27,26,25,24,23,22,21,20,19,18,17,16,15,14,13,12,11,10,9,8,7,6,5,4,3,2)), + _mm256_cmpeq_epi16(second_data, _mm256_shuffle_epi8(_mm256_permute2x128_si256(first_data, first_data, 1), _mm256_set_epi8(1,0,31,30,29,28,27,26,25,24,23,22,21,20,19,18,17,16,15,14,13,12,11,10,9,8,7,6,5,4,3,2)))))) + ) + ), + bitmask); + } + + if (i < first.size) + { + for (; i < first.size && !has_mask; ++i) + { + if 
(has_first_null_map && first_null_map[i]) + continue; + + __m256i v_i = _mm256_set1_epi16(first.data[i]); + bitmask = _mm256_or_si256(bitmask, _mm256_cmpeq_epi16(second_data, v_i)); + has_mask = _mm256_testc_si256(bitmask, ones); + } + } + } + } + + if (!has_mask && second.size > 15) + return false; + + return hasAllIntegralLoopRemainder(j, first, second, first_null_map, second_null_map); +} + +) + +#endif + +#if defined(__SSE4_2__) + +DECLARE_SSE42_SPECIFIC_CODE ( + +// SSE4.2 Int64, UInt64 specialization +template +requires (std::is_same_v || std::is_same_v) +inline ALWAYS_INLINE bool sliceHasImplAnyAllImplInt64( + const NumericArraySlice & first, + const NumericArraySlice & second, + const UInt8 * first_null_map, + const UInt8 * second_null_map) +{ + if (second.size == 0) + return true; + + if (!hasNull(first_null_map, first.size) && hasNull(second_null_map, second.size)) + return false; + + const bool has_first_null_map = first_null_map != nullptr; + const bool has_second_null_map = second_null_map != nullptr; + + size_t j = 0; + int has_mask = 1; + static constexpr Int64 full = -1, none = 0; + const __m128i zeros = _mm_setzero_si128(); + if (second.size > 1 && first.size > 1) + { + for (; j < second.size - 1 && has_mask; j += 2) + { + has_mask = 0; + const __m128i second_data = _mm_loadu_si128(reinterpret_cast(second.data + j)); + __m128i bitmask = has_second_null_map ? + _mm_set_epi64x( + (second_null_map[j + 1]) ? full : none, + (second_null_map[j]) ? full : none) + : zeros; + + size_t i = 0; + + for (; i < first.size - 1 && !has_mask; has_mask = _mm_test_all_ones(bitmask), i += 2) + { + const __m128i first_data = _mm_loadu_si128(reinterpret_cast(first.data + i)); + const __m128i first_nm_mask = has_first_null_map ? + _mm_cvtepi8_epi64(_mm_loadu_si128(reinterpret_cast(first_null_map + i))) + : zeros; + + bitmask = + _mm_or_si128( + _mm_or_si128( + _mm_andnot_si128( + first_nm_mask, + _mm_cmpeq_epi64(second_data, first_data)), + _mm_andnot_si128( + _mm_shuffle_epi32(first_nm_mask, _MM_SHUFFLE(1,0,3,2)), + _mm_cmpeq_epi64(second_data, _mm_shuffle_epi32(first_data, _MM_SHUFFLE(1,0,3,2))))), + bitmask); + } + + if (i < first.size) + { + for (; i < first.size && !has_mask; ++i) + { + if (has_first_null_map && first_null_map[i]) + continue; + + __m128i v_i = _mm_set1_epi64x(first.data[i]); + bitmask = _mm_or_si128(bitmask, _mm_cmpeq_epi64(second_data, v_i)); + has_mask = _mm_test_all_ones(bitmask); + } + } + } + } + + if (!has_mask && second.size > 1) + return false; + + return hasAllIntegralLoopRemainder(j, first, second, first_null_map, second_null_map); +} + +// SSE4.2 Int32, UInt32 specialization +template +requires (std::is_same_v || std::is_same_v) +inline ALWAYS_INLINE bool sliceHasImplAnyAllImplInt32( + const NumericArraySlice & first, + const NumericArraySlice & second, + const UInt8 * first_null_map, + const UInt8 * second_null_map) +{ + if (second.size == 0) + return true; + + if (!hasNull(first_null_map, first.size) && hasNull(second_null_map, second.size)) + return false; + + const bool has_first_null_map = first_null_map != nullptr; + const bool has_second_null_map = second_null_map != nullptr; + + size_t j = 0; + int has_mask = 1; + static constexpr int full = -1, none = 0; + const __m128i zeros = _mm_setzero_si128(); + if (second.size > 3 && first.size > 3) + { + for (; j < second.size - 3 && has_mask; j += 4) + { + has_mask = 0; + const __m128i second_data = _mm_loadu_si128(reinterpret_cast(second.data + j)); + __m128i bitmask = has_second_null_map ? 
+ _mm_set_epi32( + (second_null_map[j + 3]) ? full : none, + (second_null_map[j + 2]) ? full : none, + (second_null_map[j + 1]) ? full : none, + (second_null_map[j]) ? full : none) + : zeros; + + size_t i = 0; + for (; i < first.size - 3 && !has_mask; has_mask = _mm_test_all_ones(bitmask), i += 4) + { + const __m128i first_data = _mm_loadu_si128(reinterpret_cast(first.data + i)); + const __m128i first_nm_mask = has_first_null_map ? + _mm_cvtepi8_epi32(_mm_loadu_si128(reinterpret_cast(first_null_map + i))) + : zeros; + + bitmask = + _mm_or_si128( + _mm_or_si128( + _mm_or_si128( + _mm_andnot_si128( + first_nm_mask, + _mm_cmpeq_epi32(second_data, first_data)), + _mm_andnot_si128( + _mm_shuffle_epi32(first_nm_mask, _MM_SHUFFLE(2,1,0,3)), + _mm_cmpeq_epi32(second_data, _mm_shuffle_epi32(first_data, _MM_SHUFFLE(2,1,0,3))))), + _mm_or_si128( + _mm_andnot_si128( + _mm_shuffle_epi32(first_nm_mask, _MM_SHUFFLE(1,0,3,2)), + _mm_cmpeq_epi32(second_data, _mm_shuffle_epi32(first_data, _MM_SHUFFLE(1,0,3,2)))), + _mm_andnot_si128( + _mm_shuffle_epi32(first_nm_mask, _MM_SHUFFLE(0,3,2,1)), + _mm_cmpeq_epi32(second_data, _mm_shuffle_epi32(first_data, _MM_SHUFFLE(0,3,2,1))))) + ), + bitmask); + } + + if (i < first.size) + { + for (; i < first.size && !has_mask; ++i) + { + if (has_first_null_map && first_null_map[i]) + continue; + __m128i r_i = _mm_set1_epi32(first.data[i]); + bitmask = _mm_or_si128(bitmask, _mm_cmpeq_epi32(second_data, r_i)); + has_mask = _mm_test_all_ones(bitmask); + } + } + } + } + + if (!has_mask && second.size > 3) + return false; + + return hasAllIntegralLoopRemainder(j, first, second, first_null_map, second_null_map); +} + +// SSE4.2 Int16, UInt16 specialization +template +requires (std::is_same_v || std::is_same_v) +inline ALWAYS_INLINE bool sliceHasImplAnyAllImplInt16( + const NumericArraySlice & first, + const NumericArraySlice & second, + const UInt8 * first_null_map, + const UInt8 * second_null_map) +{ + if (second.size == 0) + return true; + + if (!hasNull(first_null_map, first.size) && hasNull(second_null_map, second.size)) + return false; + + const bool has_first_null_map = first_null_map != nullptr; + const bool has_second_null_map = second_null_map != nullptr; + + size_t j = 0; + int has_mask = 1; + static constexpr int16_t full = -1, none = 0; + const __m128i zeros = _mm_setzero_si128(); + if (second.size > 6 && first.size > 6) + { + for (; j < second.size - 7 && has_mask; j += 8) + { + has_mask = 0; + const __m128i second_data = _mm_loadu_si128(reinterpret_cast(second.data + j)); + __m128i bitmask = has_second_null_map ? + _mm_set_epi16( + (second_null_map[j + 7]) ? full : none, (second_null_map[j + 6]) ? full : none, + (second_null_map[j + 5]) ? full : none, (second_null_map[j + 4]) ? full : none, + (second_null_map[j + 3]) ? full : none, (second_null_map[j + 2]) ? full : none, + (second_null_map[j + 1]) ? full : none, (second_null_map[j]) ? full: none) + : zeros; + + size_t i = 0; + for (; i < first.size-7 && !has_mask; has_mask = _mm_test_all_ones(bitmask), i += 8) + { + const __m128i first_data = _mm_loadu_si128(reinterpret_cast(first.data + i)); + const __m128i first_nm_mask = has_first_null_map ? 
+ _mm_cvtepi8_epi16(_mm_loadu_si128(reinterpret_cast(first_null_map + i))) + : zeros; + bitmask = + _mm_or_si128( + _mm_or_si128( + _mm_or_si128( + _mm_or_si128( + _mm_andnot_si128( + first_nm_mask, + _mm_cmpeq_epi16(second_data, first_data)), + _mm_andnot_si128( + _mm_shuffle_epi8(first_nm_mask, _mm_set_epi8(13,12,11,10,9,8,7,6,5,4,3,2,1,0,15,14)), + _mm_cmpeq_epi16(second_data, _mm_shuffle_epi8(first_data, _mm_set_epi8(13,12,11,10,9,8,7,6,5,4,3,2,1,0,15,14))))), + _mm_or_si128( + _mm_andnot_si128( + _mm_shuffle_epi8(first_nm_mask, _mm_set_epi8(11,10,9,8,7,6,5,4,3,2,1,0,15,14,13,12)), + _mm_cmpeq_epi16(second_data, _mm_shuffle_epi8(first_data, _mm_set_epi8(11,10,9,8,7,6,5,4,3,2,1,0,15,14,13,12)))), + _mm_andnot_si128( + _mm_shuffle_epi8(first_nm_mask, _mm_set_epi8(9,8,7,6,5,4,3,2,1,0,15,14,13,12,11,10)), + _mm_cmpeq_epi16(second_data, _mm_shuffle_epi8(first_data, _mm_set_epi8(9,8,7,6,5,4,3,2,1,0,15,14,13,12,11,10))))) + ), + _mm_or_si128( + _mm_or_si128( + _mm_andnot_si128( + _mm_shuffle_epi8(first_nm_mask, _mm_set_epi8(7,6,5,4,3,2,1,0,15,14,13,12,11,10,9,8)), + _mm_cmpeq_epi16(second_data, _mm_shuffle_epi8(first_data, _mm_set_epi8(7,6,5,4,3,2,1,0,15,14,13,12,11,10,9,8)))), + _mm_andnot_si128( + _mm_shuffle_epi8(first_nm_mask, _mm_set_epi8(5,4,3,2,1,0,15,14,13,12,11,10,9,8,7,6)), + _mm_cmpeq_epi16(second_data, _mm_shuffle_epi8(first_data, _mm_set_epi8(5,4,3,2,1,0,15,14,13,12,11,10,9,8,7,6))))), + _mm_or_si128( + _mm_andnot_si128( + _mm_shuffle_epi8(first_nm_mask, _mm_set_epi8(3,2,1,0,15,14,13,12,11,10,9,8,7,6,5,4)), + _mm_cmpeq_epi16(second_data, _mm_shuffle_epi8(first_data, _mm_set_epi8(3,2,1,0,15,14,13,12,11,10,9,8,7,6,5,4)))), + _mm_andnot_si128( + _mm_shuffle_epi8(first_nm_mask, _mm_set_epi8(1,0,15,14,13,12,11,10,9,8,7,6,5,4,3,2)), + _mm_cmpeq_epi16(second_data, _mm_shuffle_epi8(first_data, _mm_set_epi8(1,0,15,14,13,12,11,10,9,8,7,6,5,4,3,2)))))) + ), + bitmask); + } + + if (i < first.size) + { + for (; i < first.size && !has_mask; ++i) + { + if (has_first_null_map && first_null_map[i]) + continue; + __m128i v_i = _mm_set1_epi16(first.data[i]); + bitmask = _mm_or_si128(bitmask, _mm_cmpeq_epi16(second_data, v_i)); + has_mask = _mm_test_all_ones(bitmask); + } + } + } + } + + if (!has_mask && second.size > 6) + return false; + + return hasAllIntegralLoopRemainder(j, first, second, first_null_map, second_null_map); +} + +// Int8/UInt8 version is faster with SSE than with AVX2 +// SSE2 Int8, UInt8 specialization +template +requires (std::is_same_v || std::is_same_v) +inline ALWAYS_INLINE bool sliceHasImplAnyAllImplInt8( + const NumericArraySlice & first, + const NumericArraySlice & second, + const UInt8 * first_null_map, + const UInt8 * second_null_map) +{ + if (second.size == 0) + return true; + + if (!hasNull(first_null_map, first.size) && hasNull(second_null_map, second.size)) + return false; + + const bool has_first_null_map = first_null_map != nullptr; + const bool has_second_null_map = second_null_map != nullptr; + + size_t j = 0; + int has_mask = 1; + static constexpr int8_t full = -1, none = 0; + const __m128i zeros = _mm_setzero_si128(); + + if (second.size > 15 && first.size > 15) + { + for (; j < second.size - 15 && has_mask; j += 16) + { + has_mask = 0; + const __m128i second_data = _mm_loadu_si128(reinterpret_cast(second.data + j)); + __m128i bitmask = has_second_null_map ? + _mm_set_epi8( + (second_null_map[j + 15]) ? full : none, (second_null_map[j + 14]) ? full : none, + (second_null_map[j + 13]) ? full : none, (second_null_map[j + 12]) ? 
full : none, + (second_null_map[j + 11]) ? full : none, (second_null_map[j + 10]) ? full : none, + (second_null_map[j + 9]) ? full : none, (second_null_map[j + 8]) ? full : none, + (second_null_map[j + 7]) ? full : none, (second_null_map[j + 6]) ? full : none, + (second_null_map[j + 5]) ? full : none, (second_null_map[j + 4]) ? full : none, + (second_null_map[j + 3]) ? full : none, (second_null_map[j + 2]) ? full : none, + (second_null_map[j + 1]) ? full : none, (second_null_map[j]) ? full : none) + : zeros; + + size_t i = 0; + for (; i < first.size - 15 && !has_mask; has_mask = _mm_test_all_ones(bitmask), i += 16) + { + const __m128i first_data = _mm_loadu_si128(reinterpret_cast(first.data + i)); + const __m128i first_nm_mask = has_first_null_map ? + _mm_loadu_si128(reinterpret_cast(first_null_map + i)) + : zeros; + bitmask = + _mm_or_si128( + _mm_or_si128( + _mm_or_si128( + _mm_or_si128( + _mm_or_si128( + _mm_andnot_si128( + first_nm_mask, + _mm_cmpeq_epi8(second_data, first_data)), + _mm_andnot_si128( + _mm_shuffle_epi8(first_nm_mask, _mm_set_epi8(14,13,12,11,10,9,8,7,6,5,4,3,2,1,0,15)), + _mm_cmpeq_epi8(second_data, _mm_shuffle_epi8(first_data, _mm_set_epi8(14,13,12,11,10,9,8,7,6,5,4,3,2,1,0,15))))), + _mm_or_si128( + _mm_andnot_si128( + _mm_shuffle_epi8(first_nm_mask, _mm_set_epi8(13,12,11,10,9,8,7,6,5,4,3,2,1,0,15,14)), + _mm_cmpeq_epi8(second_data, _mm_shuffle_epi8(first_data, _mm_set_epi8(13,12,11,10,9,8,7,6,5,4,3,2,1,0,15,14)))), + _mm_andnot_si128( + _mm_shuffle_epi8(first_nm_mask, _mm_set_epi8(12,11,10,9,8,7,6,5,4,3,2,1,0,15,14,13)), + _mm_cmpeq_epi8(second_data, _mm_shuffle_epi8(first_data, _mm_set_epi8(12,11,10,9,8,7,6,5,4,3,2,1,0,15,14,13))))) + ), + _mm_or_si128( + _mm_or_si128( + _mm_andnot_si128( + _mm_shuffle_epi8(first_nm_mask, _mm_set_epi8(11,10,9,8,7,6,5,4,3,2,1,0,15,14,13,12)), + _mm_cmpeq_epi8(second_data, _mm_shuffle_epi8(first_data, _mm_set_epi8(11,10,9,8,7,6,5,4,3,2,1,0,15,14,13,12)))), + _mm_andnot_si128( + _mm_shuffle_epi8(first_nm_mask, _mm_set_epi8(10,9,8,7,6,5,4,3,2,1,0,15,14,13,12,11)), + _mm_cmpeq_epi8(second_data, _mm_shuffle_epi8(first_data, _mm_set_epi8(10,9,8,7,6,5,4,3,2,1,0,15,14,13,12,11))))), + _mm_or_si128( + _mm_andnot_si128( + _mm_shuffle_epi8(first_nm_mask, _mm_set_epi8(9,8,7,6,5,4,3,2,1,0,15,14,13,12,11,10)), + _mm_cmpeq_epi8(second_data, _mm_shuffle_epi8(first_data, _mm_set_epi8(9,8,7,6,5,4,3,2,1,0,15,14,13,12,11,10)))), + _mm_andnot_si128( + _mm_shuffle_epi8(first_nm_mask, _mm_set_epi8(8,7,6,5,4,3,2,1,0,15,14,13,12,11,10,9)), + _mm_cmpeq_epi8(second_data, _mm_shuffle_epi8(first_data, _mm_set_epi8(8,7,6,5,4,3,2,1,0,15,14,13,12,11,10,9))))))), + _mm_or_si128( + _mm_or_si128( + _mm_or_si128( + _mm_andnot_si128( + _mm_shuffle_epi8(first_nm_mask, _mm_set_epi8(7,6,5,4,3,2,1,0,15,14,13,12,11,10,9,8)), + _mm_cmpeq_epi8(second_data, _mm_shuffle_epi8(first_data, _mm_set_epi8(7,6,5,4,3,2,1,0,15,14,13,12,11,10,9,8)))), + _mm_andnot_si128( + _mm_shuffle_epi8(first_nm_mask, _mm_set_epi8(6,5,4,3,2,1,0,15,14,13,12,11,10,9,8,7)), + _mm_cmpeq_epi8(second_data, _mm_shuffle_epi8(first_data, _mm_set_epi8(6,5,4,3,2,1,0,15,14,13,12,11,10,9,8,7))))), + _mm_or_si128( + _mm_andnot_si128( + _mm_shuffle_epi8(first_nm_mask, _mm_set_epi8(5,4,3,2,1,0,15,14,13,12,11,10,9,8,7,6)), + _mm_cmpeq_epi8(second_data, _mm_shuffle_epi8(first_data, _mm_set_epi8(5,4,3,2,1,0,15,14,13,12,11,10,9,8,7,6)))), + _mm_andnot_si128( + _mm_shuffle_epi8(first_nm_mask, _mm_set_epi8(4,3,2,1,0,15,14,13,12,11,10,9,8,7,6,5)), + _mm_cmpeq_epi8(second_data, _mm_shuffle_epi8(first_data, 
_mm_set_epi8(4,3,2,1,0,15,14,13,12,11,10,9,8,7,6,5)))))), + _mm_or_si128( + _mm_or_si128( + _mm_andnot_si128( + _mm_shuffle_epi8(first_nm_mask, _mm_set_epi8(3,2,1,0,15,14,13,12,11,10,9,8,7,6,5,4)), + _mm_cmpeq_epi8(second_data, _mm_shuffle_epi8(first_data, _mm_set_epi8(3,2,1,0,15,14,13,12,11,10,9,8,7,6,5,4)))), + _mm_andnot_si128( + _mm_shuffle_epi8(first_nm_mask, _mm_set_epi8(2,1,0,15,14,13,12,11,10,9,8,7,6,5,4,3)), + _mm_cmpeq_epi8(second_data, _mm_shuffle_epi8(first_data, _mm_set_epi8(2,1,0,15,14,13,12,11,10,9,8,7,6,5,4,3))))), + _mm_or_si128( + _mm_andnot_si128( + _mm_shuffle_epi8(first_nm_mask, _mm_set_epi8(1,0,15,14,13,12,11,10,9,8,7,6,5,4,3,2)), + _mm_cmpeq_epi8(second_data, _mm_shuffle_epi8(first_data, _mm_set_epi8(1,0,15,14,13,12,11,10,9,8,7,6,5,4,3,2)))), + _mm_andnot_si128( + _mm_shuffle_epi8(first_nm_mask, _mm_set_epi8(0,15,14,13,12,11,10,9,8,7,6,5,4,3,2,1)), + _mm_cmpeq_epi8(second_data, _mm_shuffle_epi8(first_data, _mm_set_epi8(0,15,14,13,12,11,10,9,8,7,6,5,4,3,2,1)))))))), + bitmask); + } + + if (i < first.size) + { + for (; i < first.size && !has_mask; ++i) + { + if (has_first_null_map && first_null_map[i]) + continue; + + __m128i v_i = _mm_set1_epi8(first.data[i]); + bitmask = _mm_or_si128(bitmask, _mm_cmpeq_epi8(second_data, v_i)); + has_mask = _mm_test_all_ones(bitmask); + } + } + } + } + + if (!has_mask && second.size > 15) + return false; + + return hasAllIntegralLoopRemainder(j, first, second, first_null_map, second_null_map); +} + +) + +#endif + template < ArraySearchType search_type, typename FirstSliceType, typename SecondSliceType, - bool (*isEqual)(const FirstSliceType &, const SecondSliceType &, size_t, size_t)> -bool sliceHasImplAnyAll(const FirstSliceType & first, const SecondSliceType & second, const UInt8 * first_null_map, const UInt8 * second_null_map) + bool (*isEqual)(const FirstSliceType &, const SecondSliceType &, size_t, size_t)> +bool sliceHasImplAnyAllGenericImpl(const FirstSliceType & first, const SecondSliceType & second, const UInt8 * first_null_map, const UInt8 * second_null_map) { const bool has_first_null_map = first_null_map != nullptr; const bool has_second_null_map = second_null_map != nullptr; @@ -81,1469 +880,64 @@ bool sliceHasImplAnyAll(const FirstSliceType & first, const SecondSliceType & se if (!has && search_type == ArraySearchType::All) return false; } + return search_type == ArraySearchType::All; } - -#if (defined(__AVX2__) && defined(ENABLE_AVX2)) || defined(__SSE4_2__) - -namespace +/// Methods to check if first array has elements from second array, overloaded for various combinations of types. 
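/// The dispatching wrapper below works as follows: for hasAll over 8-, 16-, 32- and 64-bit
/// integer slices it calls the SSE4.2 or AVX2 specialization above when isArchSupported()
/// reports that the corresponding instruction set is available at run time; everything else
/// (hasAny, other element types, older CPUs) falls through to sliceHasImplAnyAllGenericImpl.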
+template < + ArraySearchType search_type, + typename FirstSliceType, + typename SecondSliceType, + bool (*isEqual)(const FirstSliceType &, const SecondSliceType &, size_t, size_t)> +inline ALWAYS_INLINE bool sliceHasImplAnyAll(const FirstSliceType & first, const SecondSliceType & second, const UInt8 * first_null_map, const UInt8 * second_null_map) { - -template -inline ALWAYS_INLINE bool hasAllIntegralLoopRemainder( - size_t j, const NumericArraySlice & first, const NumericArraySlice & second, const UInt8 * first_null_map, const UInt8 * second_null_map) -{ - const bool has_first_null_map = first_null_map != nullptr; - const bool has_second_null_map = second_null_map != nullptr; - - for (; j < second.size; ++j) +#if USE_MULTITARGET_CODE + if constexpr (search_type == ArraySearchType::All && std::is_same_v) { - // skip null elements since both have at least one - assuming it was checked earlier that at least one element in 'first' is null - if (has_second_null_map && second_null_map[j]) - continue; - bool found = false; - - for (size_t i = 0; i < first.size; ++i) +#if defined(__AVX2__) + if (isArchSupported(TargetArch::AVX2)) { - if (has_first_null_map && first_null_map[i]) - continue; - - if (first.data[i] == second.data[j]) + if constexpr (std::is_same_v> || std::is_same_v>) { - found = true; - break; + return GatherUtils::TargetSpecific::AVX2::sliceHasImplAnyAllImplInt16(first, second, first_null_map, second_null_map); + } + else if constexpr (std::is_same_v> || std::is_same_v>) + { + return GatherUtils::TargetSpecific::AVX2::sliceHasImplAnyAllImplInt32(first, second, first_null_map, second_null_map); + } + else if constexpr (std::is_same_v> || std::is_same_v>) + { + return GatherUtils::TargetSpecific::AVX2::sliceHasImplAnyAllImplInt64(first, second, first_null_map, second_null_map); } } - - if (!found) - return false; - } - return true; -} - -} - #endif -#if defined(__AVX2__) && defined(ENABLE_AVX2) - -// AVX2 Int64 specialization -template <> -inline ALWAYS_INLINE bool sliceHasImplAnyAll, NumericArraySlice, sliceEqualElements >( - const NumericArraySlice & first, const NumericArraySlice & second, const UInt8 * first_null_map, const UInt8 * second_null_map) -{ - if (second.size == 0) - return true; - - if (!hasNull(first_null_map, first.size) && hasNull(second_null_map, second.size)) - return false; - - const bool has_first_null_map = first_null_map != nullptr; - const bool has_second_null_map = second_null_map != nullptr; - - size_t j = 0; - short has_mask = 1; - const Int64 full = -1, none = 0; - const __m256i ones = _mm256_set1_epi64x(full); - const __m256i zeros = _mm256_setzero_si256(); - if (second.size > 3 && first.size > 3) - { - for (; j < second.size - 3 && has_mask; j += 4) + if (isArchSupported(TargetArch::SSE42)) { - has_mask = 0; - const __m256i f_data = _mm256_loadu_si256(reinterpret_cast(second.data + j)); - // bits of the bitmask are set to one if considered as null in the corresponding null map, 0 otherwise; - __m256i bitmask = has_second_null_map ? - _mm256_set_epi64x( - (second_null_map[j + 3])? full : none, - (second_null_map[j + 2])? full : none, - (second_null_map[j + 1])? full : none, - (second_null_map[j]) ? full : none) - : zeros; - - unsigned i = 0; - for (; i < first.size - 3 && !has_mask; has_mask = _mm256_testc_si256(bitmask, ones), i += 4) + if constexpr (std::is_same_v> || std::is_same_v>) { - const __m256i s_data = _mm256_loadu_si256(reinterpret_cast(first.data + i)); - const __m256i first_nm_mask = has_first_null_map? 
- _mm256_set_m128i( - _mm_cvtepi8_epi64(_mm_loadu_si128(reinterpret_cast(first_null_map + i + 2))), - _mm_cvtepi8_epi64(_mm_loadu_si128(reinterpret_cast(first_null_map + i)))) - : zeros; - bitmask = - _mm256_or_si256( - _mm256_or_si256( - _mm256_or_si256( - _mm256_andnot_si256( - first_nm_mask, - _mm256_cmpeq_epi64(f_data, s_data)), - _mm256_andnot_si256( - _mm256_permutevar8x32_epi32(first_nm_mask, _mm256_set_epi32(5,4,3,2,1,0,7,6)), - _mm256_cmpeq_epi64(f_data, _mm256_permutevar8x32_epi32(s_data, _mm256_set_epi32(5,4,3,2,1,0,7,6))))), - - _mm256_or_si256( - _mm256_andnot_si256( - _mm256_permutevar8x32_epi32(first_nm_mask, _mm256_set_epi32(3,2,1,0,7,6,5,4)), - _mm256_cmpeq_epi64(f_data, _mm256_permutevar8x32_epi32(s_data, _mm256_set_epi32(3,2,1,0,7,6,5,4)))), - _mm256_andnot_si256( - _mm256_permutevar8x32_epi32(first_nm_mask, _mm256_set_epi32(1,0,7,6,5,4,3,2)), - _mm256_cmpeq_epi64(f_data, _mm256_permutevar8x32_epi32(s_data, _mm256_set_epi32(1,0,7,6,5,4,3,2)))))), - bitmask); + return TargetSpecific::SSE42::sliceHasImplAnyAllImplInt8(first, second, first_null_map, second_null_map); } - - if (i < first.size) + else if constexpr (std::is_same_v> || std::is_same_v>) { - for (; i < first.size && !has_mask; ++i) - { - if (has_first_null_map && first_null_map[i]) - continue; - __m256i v_i = _mm256_set1_epi64x(first.data[i]); - bitmask = _mm256_or_si256(bitmask, _mm256_cmpeq_epi64(f_data, v_i)); - has_mask = _mm256_testc_si256(bitmask, ones); - } + return GatherUtils::TargetSpecific::SSE42::sliceHasImplAnyAllImplInt16(first, second, first_null_map, second_null_map); + } + else if constexpr (std::is_same_v> || std::is_same_v>) + { + return GatherUtils::TargetSpecific::SSE42::sliceHasImplAnyAllImplInt32(first, second, first_null_map, second_null_map); + } + else if constexpr (std::is_same_v> || std::is_same_v>) + { + return GatherUtils::TargetSpecific::SSE42::sliceHasImplAnyAllImplInt64(first, second, first_null_map, second_null_map); } } } - - if (!has_mask && second.size > 3) - return false; - - return hasAllIntegralLoopRemainder(j, first, second, first_null_map, second_null_map); -} - -// AVX2 Int64 specialization -template <> -inline ALWAYS_INLINE bool sliceHasImplAnyAll, NumericArraySlice, sliceEqualElements >( - const NumericArraySlice & first, const NumericArraySlice & second, const UInt8 * first_null_map, const UInt8 * second_null_map) -{ - if (second.size == 0) - return true; - - if (!hasNull(first_null_map, first.size) && hasNull(second_null_map, second.size)) - return false; - - const bool has_first_null_map = first_null_map != nullptr; - const bool has_second_null_map = second_null_map != nullptr; - - size_t j = 0; - short has_mask = 1; - const Int64 full = -1, none = 0; - const __m256i ones = _mm256_set1_epi64x(full); - const __m256i zeros = _mm256_setzero_si256(); - if (second.size > 3 && first.size > 3) - { - for (; j < second.size - 3 && has_mask; j += 4) - { - has_mask = 0; - const __m256i f_data = _mm256_loadu_si256(reinterpret_cast(second.data + j)); - // bits of the bitmask are set to one if considered as null in the corresponding null map, 0 otherwise; - __m256i bitmask = has_second_null_map ? - _mm256_set_epi64x( - (second_null_map[j + 3])? full : none, - (second_null_map[j + 2])? full : none, - (second_null_map[j + 1])? full : none, - (second_null_map[j]) ? 
full : none) - : zeros; - - unsigned i = 0; - for (; i < first.size - 3 && !has_mask; has_mask = _mm256_testc_si256(bitmask, ones), i += 4) - { - const __m256i s_data = _mm256_loadu_si256(reinterpret_cast(first.data + i)); - const __m256i first_nm_mask = has_first_null_map? - _mm256_set_m128i( - _mm_cvtepi8_epi64(_mm_loadu_si128(reinterpret_cast(first_null_map + i + 2))), - _mm_cvtepi8_epi64(_mm_loadu_si128(reinterpret_cast(first_null_map + i)))) - : zeros; - bitmask = - _mm256_or_si256( - _mm256_or_si256( - _mm256_or_si256( - _mm256_andnot_si256( - first_nm_mask, - _mm256_cmpeq_epi64(f_data, s_data)), - _mm256_andnot_si256( - _mm256_permutevar8x32_epi32(first_nm_mask, _mm256_set_epi32(5,4,3,2,1,0,7,6)), - _mm256_cmpeq_epi64(f_data, _mm256_permutevar8x32_epi32(s_data, _mm256_set_epi32(5,4,3,2,1,0,7,6))))), - - _mm256_or_si256( - _mm256_andnot_si256( - _mm256_permutevar8x32_epi32(first_nm_mask, _mm256_set_epi32(3,2,1,0,7,6,5,4)), - _mm256_cmpeq_epi64(f_data, _mm256_permutevar8x32_epi32(s_data, _mm256_set_epi32(3,2,1,0,7,6,5,4)))), - _mm256_andnot_si256( - _mm256_permutevar8x32_epi32(first_nm_mask, _mm256_set_epi32(1,0,7,6,5,4,3,2)), - _mm256_cmpeq_epi64(f_data, _mm256_permutevar8x32_epi32(s_data, _mm256_set_epi32(1,0,7,6,5,4,3,2)))))), - bitmask); - } - - if (i < first.size) - { - for (; i < first.size && !has_mask; ++i) - { - if (has_first_null_map && first_null_map[i]) - continue; - __m256i v_i = _mm256_set1_epi64x(first.data[i]); - bitmask = _mm256_or_si256(bitmask, _mm256_cmpeq_epi64(f_data, v_i)); - has_mask = _mm256_testc_si256(bitmask, ones); - } - } - } - } - - if (!has_mask && second.size > 3) - return false; - - return hasAllIntegralLoopRemainder(j, first, second, first_null_map, second_null_map); -} - -// AVX2 Int32 specialization of sliceHasImplAnyAll -template <> -inline ALWAYS_INLINE bool sliceHasImplAnyAll, NumericArraySlice, sliceEqualElements >( - const NumericArraySlice & first, const NumericArraySlice & second, const UInt8 * first_null_map, const UInt8 * second_null_map) -{ - if (second.size == 0) - return true; - - if (!hasNull(first_null_map, first.size) && hasNull(second_null_map, second.size)) - return false; - - const bool has_first_null_map = first_null_map != nullptr; - const bool has_second_null_map = second_null_map != nullptr; - - size_t j = 0; - short has_mask = 1; - const int full = -1, none = 0; - const __m256i ones = _mm256_set1_epi32(full); - const __m256i zeros = _mm256_setzero_si256(); - if (second.size > 7 && first.size > 7) - { - for (; j < second.size - 7 && has_mask; j += 8) - { - has_mask = 0; - const __m256i f_data = _mm256_loadu_si256(reinterpret_cast(second.data + j)); - // bits of the bitmask are set to one if considered as null in the corresponding null map, 0 otherwise; - __m256i bitmask = has_second_null_map ? - _mm256_set_epi32( - (second_null_map[j + 7]) ? full : none, - (second_null_map[j + 6]) ? full : none, - (second_null_map[j + 5]) ? full : none, - (second_null_map[j + 4]) ? full : none, - (second_null_map[j + 3]) ? full : none, - (second_null_map[j + 2]) ? full : none, - (second_null_map[j + 1]) ? full : none, - (second_null_map[j]) ? 
full : none) - : zeros; - - size_t i = 0; - for (; i < first.size - 7 && !has_mask; has_mask = _mm256_testc_si256(bitmask, ones), i += 8) - { - const __m256i s_data = _mm256_loadu_si256(reinterpret_cast(first.data + i)); - // Create a mask to avoid to compare null elements - // set_m128i takes two arguments: (high segment, low segment) that are two __m128i convert from 8bits to 32bits to match with next operations - const __m256i first_nm_mask = has_first_null_map? - _mm256_set_m128i( - _mm_cvtepi8_epi32(_mm_loadu_si128(reinterpret_cast(first_null_map + i + 4))), - _mm_cvtepi8_epi32(_mm_loadu_si128(reinterpret_cast(first_null_map + i)))) - : zeros; - bitmask = - _mm256_or_si256( - _mm256_or_si256( - _mm256_or_si256( - _mm256_or_si256( - _mm256_andnot_si256( - first_nm_mask, - _mm256_cmpeq_epi32(f_data, s_data)), - _mm256_andnot_si256( - _mm256_permutevar8x32_epi32(first_nm_mask, _mm256_set_epi32(6,5,4,3,2,1,0,7)), - _mm256_cmpeq_epi32(f_data, _mm256_permutevar8x32_epi32(s_data, _mm256_set_epi32(6,5,4,3,2,1,0,7))))), - _mm256_or_si256( - _mm256_andnot_si256( - _mm256_permutevar8x32_epi32(first_nm_mask, _mm256_set_epi32(5,4,3,2,1,0,7,6)), - _mm256_cmpeq_epi32(f_data, _mm256_permutevar8x32_epi32(s_data, _mm256_set_epi32(5,4,3,2,1,0,7,6)))), - _mm256_andnot_si256( - _mm256_permutevar8x32_epi32(first_nm_mask, _mm256_set_epi32(4,3,2,1,0,7,6,5)), - _mm256_cmpeq_epi32(f_data, _mm256_permutevar8x32_epi32(s_data, _mm256_set_epi32(4,3,2,1,0,7,6,5))))) - ), - _mm256_or_si256( - _mm256_or_si256( - _mm256_andnot_si256( - _mm256_permutevar8x32_epi32(first_nm_mask, _mm256_set_epi32(3,2,1,0,7,6,5,4)), - _mm256_cmpeq_epi32(f_data, _mm256_permutevar8x32_epi32(s_data, _mm256_set_epi32(3,2,1,0,7,6,5,4)))), - _mm256_andnot_si256( - _mm256_permutevar8x32_epi32(first_nm_mask, _mm256_set_epi32(2,1,0,7,6,5,4,3)), - _mm256_cmpeq_epi32(f_data, _mm256_permutevar8x32_epi32(s_data, _mm256_set_epi32(2,1,0,7,6,5,4,3))))), - _mm256_or_si256( - _mm256_andnot_si256( - _mm256_permutevar8x32_epi32(first_nm_mask, _mm256_set_epi32(1,0,7,6,5,4,3,2)), - _mm256_cmpeq_epi32(f_data, _mm256_permutevar8x32_epi32(s_data, _mm256_set_epi32(1,0,7,6,5,4,3,2)))), - _mm256_andnot_si256( - _mm256_permutevar8x32_epi32(first_nm_mask, _mm256_set_epi32(0,7,6,5,4,3,2,1)), - _mm256_cmpeq_epi32(f_data, _mm256_permutevar8x32_epi32(s_data, _mm256_set_epi32(0,7,6,5,4,3,2,1))))))), - bitmask); - } - - if (i < first.size) - { - // Loop(i)-jam - for (; i < first.size && !has_mask; ++i) - { - if (has_first_null_map && first_null_map[i]) - continue; - __m256i v_i = _mm256_set1_epi32(first.data[i]); - bitmask = _mm256_or_si256(bitmask, _mm256_cmpeq_epi32(f_data, v_i)); - has_mask = _mm256_testc_si256(bitmask, ones); - } - } - } - } - - if (!has_mask && second.size > 7) - return false; - - return hasAllIntegralLoopRemainder(j, first, second, first_null_map, second_null_map); -} - -// AVX2 UInt32 specialization of sliceHasImplAnyAll -template <> -inline ALWAYS_INLINE bool sliceHasImplAnyAll, NumericArraySlice, sliceEqualElements >( - const NumericArraySlice & first, const NumericArraySlice & second, const UInt8 * first_null_map, const UInt8 * second_null_map) -{ - if (second.size == 0) - return true; - - if (!hasNull(first_null_map, first.size) && hasNull(second_null_map, second.size)) - return false; - - const bool has_first_null_map = first_null_map != nullptr; - const bool has_second_null_map = second_null_map != nullptr; - - size_t j = 0; - short has_mask = 1; - const int full = -1, none = 0; - const __m256i ones = _mm256_set1_epi32(full); - const __m256i 
zeros = _mm256_setzero_si256(); - if (second.size > 7 && first.size > 7) - { - for (; j < second.size - 7 && has_mask; j += 8) - { - has_mask = 0; - const __m256i f_data = _mm256_loadu_si256(reinterpret_cast(second.data + j)); - // bits of the bitmask are set to one if considered as null in the corresponding null map, 0 otherwise; - __m256i bitmask = has_second_null_map ? - _mm256_set_epi32( - (second_null_map[j + 7]) ? full : none, - (second_null_map[j + 6]) ? full : none, - (second_null_map[j + 5]) ? full : none, - (second_null_map[j + 4]) ? full : none, - (second_null_map[j + 3]) ? full : none, - (second_null_map[j + 2]) ? full : none, - (second_null_map[j + 1]) ? full : none, - (second_null_map[j]) ? full : none) - : zeros; - - size_t i = 0; - for (; i < first.size - 7 && !has_mask; has_mask = _mm256_testc_si256(bitmask, ones), i += 8) - { - const __m256i s_data = _mm256_loadu_si256(reinterpret_cast(first.data + i)); - // Create a mask to avoid to compare null elements - // set_m128i takes two arguments: (high segment, low segment) that are two __m128i convert from 8bits to 32bits to match with next operations - const __m256i first_nm_mask = has_first_null_map? - _mm256_set_m128i( - _mm_cvtepi8_epi32(_mm_loadu_si128(reinterpret_cast(first_null_map + i + 4))), - _mm_cvtepi8_epi32(_mm_loadu_si128(reinterpret_cast(first_null_map + i)))) - : zeros; - bitmask = - _mm256_or_si256( - _mm256_or_si256( - _mm256_or_si256( - _mm256_or_si256( - _mm256_andnot_si256( - first_nm_mask, - _mm256_cmpeq_epi32(f_data, s_data)), - _mm256_andnot_si256( - _mm256_permutevar8x32_epi32(first_nm_mask, _mm256_set_epi32(6,5,4,3,2,1,0,7)), - _mm256_cmpeq_epi32(f_data, _mm256_permutevar8x32_epi32(s_data, _mm256_set_epi32(6,5,4,3,2,1,0,7))))), - _mm256_or_si256( - _mm256_andnot_si256( - _mm256_permutevar8x32_epi32(first_nm_mask, _mm256_set_epi32(5,4,3,2,1,0,7,6)), - _mm256_cmpeq_epi32(f_data, _mm256_permutevar8x32_epi32(s_data, _mm256_set_epi32(5,4,3,2,1,0,7,6)))), - _mm256_andnot_si256( - _mm256_permutevar8x32_epi32(first_nm_mask, _mm256_set_epi32(4,3,2,1,0,7,6,5)), - _mm256_cmpeq_epi32(f_data, _mm256_permutevar8x32_epi32(s_data, _mm256_set_epi32(4,3,2,1,0,7,6,5))))) - ), - _mm256_or_si256( - _mm256_or_si256( - _mm256_andnot_si256( - _mm256_permutevar8x32_epi32(first_nm_mask, _mm256_set_epi32(3,2,1,0,7,6,5,4)), - _mm256_cmpeq_epi32(f_data, _mm256_permutevar8x32_epi32(s_data, _mm256_set_epi32(3,2,1,0,7,6,5,4)))), - _mm256_andnot_si256( - _mm256_permutevar8x32_epi32(first_nm_mask, _mm256_set_epi32(2,1,0,7,6,5,4,3)), - _mm256_cmpeq_epi32(f_data, _mm256_permutevar8x32_epi32(s_data, _mm256_set_epi32(2,1,0,7,6,5,4,3))))), - _mm256_or_si256( - _mm256_andnot_si256( - _mm256_permutevar8x32_epi32(first_nm_mask, _mm256_set_epi32(1,0,7,6,5,4,3,2)), - _mm256_cmpeq_epi32(f_data, _mm256_permutevar8x32_epi32(s_data, _mm256_set_epi32(1,0,7,6,5,4,3,2)))), - _mm256_andnot_si256( - _mm256_permutevar8x32_epi32(first_nm_mask, _mm256_set_epi32(0,7,6,5,4,3,2,1)), - _mm256_cmpeq_epi32(f_data, _mm256_permutevar8x32_epi32(s_data, _mm256_set_epi32(0,7,6,5,4,3,2,1))))))), - bitmask); - } - - if (i < first.size) - { - // Loop(i)-jam - for (; i < first.size && !has_mask; ++i) - { - if (has_first_null_map && first_null_map[i]) - continue; - __m256i v_i = _mm256_set1_epi32(first.data[i]); - bitmask = _mm256_or_si256(bitmask, _mm256_cmpeq_epi32(f_data, v_i)); - has_mask = _mm256_testc_si256(bitmask, ones); - } - } - } - } - - if (!has_mask && second.size > 7) - return false; - - return hasAllIntegralLoopRemainder(j, first, second, first_null_map, 
second_null_map); -} - -// AVX2 Int16 specialization -template <> -inline ALWAYS_INLINE bool sliceHasImplAnyAll, NumericArraySlice, sliceEqualElements >( - const NumericArraySlice & first, const NumericArraySlice & second, const UInt8 * first_null_map, const UInt8 * second_null_map) -{ - if (second.size == 0) - return true; - - if (!hasNull(first_null_map, first.size) && hasNull(second_null_map, second.size)) - return false; - - const bool has_first_null_map = first_null_map != nullptr; - const bool has_second_null_map = second_null_map != nullptr; - - size_t j = 0; - short has_mask = 1; - const int16_t full = -1, none = 0; - const __m256i ones = _mm256_set1_epi16(full); - const __m256i zeros = _mm256_setzero_si256(); - if (second.size > 15 && first.size > 15) - { - for (; j < second.size - 15 && has_mask; j += 16) - { - has_mask = 0; - const __m256i f_data = _mm256_loadu_si256(reinterpret_cast(second.data + j)); - __m256i bitmask = has_second_null_map ? - _mm256_set_epi16( - (second_null_map[j + 15]) ? full : none, (second_null_map[j + 14]) ? full : none, - (second_null_map[j + 13]) ? full : none, (second_null_map[j + 12]) ? full : none, - (second_null_map[j + 11]) ? full : none, (second_null_map[j + 10]) ? full : none, - (second_null_map[j + 9]) ? full : none, (second_null_map[j + 8])? full : none, - (second_null_map[j + 7]) ? full : none, (second_null_map[j + 6])? full : none, - (second_null_map[j + 5]) ? full : none, (second_null_map[j + 4])? full : none, - (second_null_map[j + 3]) ? full : none, (second_null_map[j + 2])? full : none, - (second_null_map[j + 1]) ? full : none, (second_null_map[j]) ? full : none) - : zeros; - unsigned i = 0; - for (; i < first.size - 15 && !has_mask; has_mask = _mm256_testc_si256(bitmask, ones), i += 16) - { - const __m256i s_data = _mm256_loadu_si256(reinterpret_cast(first.data + i)); - const __m256i first_nm_mask = has_first_null_map? 
- _mm256_set_m128i( - _mm_cvtepi8_epi16(_mm_loadu_si128(reinterpret_cast(first_null_map + i + 8))), - _mm_cvtepi8_epi16(_mm_loadu_si128(reinterpret_cast(first_null_map + i)))) - : zeros; - bitmask = - _mm256_or_si256( - _mm256_or_si256( - _mm256_or_si256( - _mm256_or_si256( - _mm256_or_si256( - _mm256_andnot_si256( - first_nm_mask, - _mm256_cmpeq_epi16(f_data, s_data)), - _mm256_andnot_si256( - _mm256_shuffle_epi8(first_nm_mask, _mm256_set_epi8(29,28,27,26,25,24,23,22,21,20,19,18,17,16,15,14,13,12,11,10,9,8,7,6,5,4,3,2,1,0,31,30)), - _mm256_cmpeq_epi16(f_data, _mm256_shuffle_epi8(s_data, _mm256_set_epi8(29,28,27,26,25,24,23,22,21,20,19,18,17,16,15,14,13,12,11,10,9,8,7,6,5,4,3,2,1,0,31,30))))), - _mm256_or_si256( - _mm256_andnot_si256( - _mm256_shuffle_epi8(first_nm_mask, _mm256_set_epi8(27,26,25,24,23,22,21,20,19,18,17,16,15,14,13,12,11,10,9,8,7,6,5,4,3,2,1,0,31,30,29,28)), - _mm256_cmpeq_epi16(f_data, _mm256_shuffle_epi8(s_data, _mm256_set_epi8(27,26,25,24,23,22,21,20,19,18,17,16,15,14,13,12,11,10,9,8,7,6,5,4,3,2,1,0,31,30,29,28)))), - _mm256_andnot_si256( - _mm256_shuffle_epi8(first_nm_mask, _mm256_set_epi8(25,24,23,22,21,20,19,18,17,16,15,14,13,12,11,10,9,8,7,6,5,4,3,2,1,0,31,30,29,28,27,26)), - _mm256_cmpeq_epi16(f_data, _mm256_shuffle_epi8(s_data, _mm256_set_epi8(25,24,23,22,21,20,19,18,17,16,15,14,13,12,11,10,9,8,7,6,5,4,3,2,1,0,31,30,29,28,27,26))))) - ), - _mm256_or_si256( - _mm256_or_si256( - _mm256_andnot_si256( - _mm256_shuffle_epi8(first_nm_mask, _mm256_set_epi8(23,22,21,20,19,18,17,16,15,14,13,12,11,10,9,8,7,6,5,4,3,2,1,0,31,30,29,28,27,26,25,24)), - _mm256_cmpeq_epi16(f_data, _mm256_shuffle_epi8(s_data, _mm256_set_epi8(23,22,21,20,19,18,17,16,15,14,13,12,11,10,9,8,7,6,5,4,3,2,1,0,31,30,29,28,27,26,25,24)))), - _mm256_andnot_si256( - _mm256_shuffle_epi8(first_nm_mask, _mm256_set_epi8(21,20,19,18,17,16,15,14,13,12,11,10,9,8,7,6,5,4,3,2,1,0,31,30,29,28,27,26,25,24,23,22)), - _mm256_cmpeq_epi16(f_data, _mm256_shuffle_epi8(s_data, _mm256_set_epi8(21,20,19,18,17,16,15,14,13,12,11,10,9,8,7,6,5,4,3,2,1,0,31,30,29,28,27,26,25,24,23,22))))), - _mm256_or_si256( - _mm256_andnot_si256( - _mm256_shuffle_epi8(first_nm_mask, _mm256_set_epi8(19,18,17,16,15,14,13,12,11,10,9,8,7,6,5,4,3,2,1,0,31,30,29,28,27,26,25,24,23,22,21,20)), - _mm256_cmpeq_epi16(f_data, _mm256_shuffle_epi8(s_data, _mm256_set_epi8(19,18,17,16,15,14,13,12,11,10,9,8,7,6,5,4,3,2,1,0,31,30,29,28,27,26,25,24,23,22,21,20)))), - _mm256_andnot_si256( - _mm256_shuffle_epi8(first_nm_mask, _mm256_set_epi8(17,16,15,14,13,12,11,10,9,8,7,6,5,4,3,2,1,0,31,30,29,28,27,26,25,24,23,22,21,20,19,18)), - _mm256_cmpeq_epi16(f_data, _mm256_shuffle_epi8(s_data, _mm256_set_epi8(17,16,15,14,13,12,11,10,9,8,7,6,5,4,3,2,1,0,31,30,29,28,27,26,25,24,23,22,21,20,19,18)))))) - ), - _mm256_or_si256( - _mm256_or_si256( - _mm256_or_si256( - _mm256_andnot_si256( - _mm256_permute2x128_si256(first_nm_mask, first_nm_mask,1), - _mm256_cmpeq_epi16(f_data, _mm256_permute2x128_si256(s_data, s_data, 1))), - _mm256_andnot_si256( - _mm256_shuffle_epi8(_mm256_permute2x128_si256(first_nm_mask, first_nm_mask, 1), _mm256_set_epi8(13,12,11,10,9,8,7,6,5,4,3,2,1,0,31,30,29,28,27,26,25,24,23,22,21,20,19,18,17,16,15,14)), - _mm256_cmpeq_epi16(f_data, _mm256_shuffle_epi8(_mm256_permute2x128_si256(s_data, s_data, 1), _mm256_set_epi8(13,12,11,10,9,8,7,6,5,4,3,2,1,0,31,30,29,28,27,26,25,24,23,22,21,20,19,18,17,16,15,14))))), - _mm256_or_si256( - _mm256_andnot_si256( - _mm256_shuffle_epi8(_mm256_permute2x128_si256(first_nm_mask, first_nm_mask, 1), 
_mm256_set_epi8(11,10,9,8,7,6,5,4,3,2,1,0,31,30,29,28,27,26,25,24,23,22,21,20,19,18,17,16,15,14,13,12)), - _mm256_cmpeq_epi16(f_data, _mm256_shuffle_epi8(_mm256_permute2x128_si256(s_data, s_data, 1), _mm256_set_epi8(11,10,9,8,7,6,5,4,3,2,1,0,31,30,29,28,27,26,25,24,23,22,21,20,19,18,17,16,15,14,13,12)))), - _mm256_andnot_si256( - _mm256_shuffle_epi8(_mm256_permute2x128_si256(first_nm_mask, first_nm_mask, 1), _mm256_set_epi8(9,8,7,6,5,4,3,2,1,0,31,30,29,28,27,26,25,24,23,22,21,20,19,18,17,16,15,14,13,12,11,10)), - _mm256_cmpeq_epi16(f_data, _mm256_shuffle_epi8(_mm256_permute2x128_si256(s_data, s_data, 1), _mm256_set_epi8(9,8,7,6,5,4,3,2,1,0,31,30,29,28,27,26,25,24,23,22,21,20,19,18,17,16,15,14,13,12,11,10))))) - ), - _mm256_or_si256( - _mm256_or_si256( - _mm256_andnot_si256( - _mm256_shuffle_epi8(_mm256_permute2x128_si256(first_nm_mask, first_nm_mask, 1), _mm256_set_epi8(7,6,5,4,3,2,1,0,31,30,29,28,27,26,25,24,23,22,21,20,19,18,17,16,15,14,13,12,11,10,9,8)), - _mm256_cmpeq_epi16(f_data, _mm256_shuffle_epi8(_mm256_permute2x128_si256(s_data ,s_data, 1), _mm256_set_epi8(7,6,5,4,3,2,1,0,31,30,29,28,27,26,25,24,23,22,21,20,19,18,17,16,15,14,13,12,11,10,9,8)))), - _mm256_andnot_si256( - _mm256_shuffle_epi8(_mm256_permute2x128_si256(first_nm_mask, first_nm_mask, 1), _mm256_set_epi8(5,4,3,2,1,0,31,30,29,28,27,26,25,24,23,22,21,20,19,18,17,16,15,14,13,12,11,10,9,8,7,6)), - _mm256_cmpeq_epi16(f_data, _mm256_shuffle_epi8(_mm256_permute2x128_si256(s_data, s_data, 1), _mm256_set_epi8(5,4,3,2,1,0,31,30,29,28,27,26,25,24,23,22,21,20,19,18,17,16,15,14,13,12,11,10,9,8,7,6))))), - _mm256_or_si256( - _mm256_andnot_si256( - _mm256_shuffle_epi8(_mm256_permute2x128_si256(first_nm_mask, first_nm_mask, 1), _mm256_set_epi8(3,2,1,0,31,30,29,28,27,26,25,24,23,22,21,20,19,18,17,16,15,14,13,12,11,10,9,8,7,6,5,4)), - _mm256_cmpeq_epi16(f_data, _mm256_shuffle_epi8(_mm256_permute2x128_si256(s_data ,s_data ,1), _mm256_set_epi8(3,2,1,0,31,30,29,28,27,26,25,24,23,22,21,20,19,18,17,16,15,14,13,12,11,10,9,8,7,6,5,4)))), - _mm256_andnot_si256( - _mm256_shuffle_epi8(_mm256_permute2x128_si256(first_nm_mask, first_nm_mask, 1), _mm256_set_epi8(1,0,31,30,29,28,27,26,25,24,23,22,21,20,19,18,17,16,15,14,13,12,11,10,9,8,7,6,5,4,3,2)), - _mm256_cmpeq_epi16(f_data, _mm256_shuffle_epi8(_mm256_permute2x128_si256(s_data, s_data, 1), _mm256_set_epi8(1,0,31,30,29,28,27,26,25,24,23,22,21,20,19,18,17,16,15,14,13,12,11,10,9,8,7,6,5,4,3,2)))))) - ) - ), - bitmask); - } - - if (i < first.size) - { - for (; i < first.size && !has_mask; ++i) - { - if (has_first_null_map && first_null_map[i]) - continue; - __m256i v_i = _mm256_set1_epi16(first.data[i]); - bitmask = _mm256_or_si256(bitmask, _mm256_cmpeq_epi16(f_data, v_i)); - has_mask = _mm256_testc_si256(bitmask, ones); - } - } - } - } - - if (!has_mask && second.size > 15) - return false; - - return hasAllIntegralLoopRemainder(j, first, second, first_null_map, second_null_map); -} - -// AVX2 UInt16 specialization -template <> -inline ALWAYS_INLINE bool sliceHasImplAnyAll, NumericArraySlice, sliceEqualElements >( - const NumericArraySlice & first, const NumericArraySlice & second, const UInt8 * first_null_map, const UInt8 * second_null_map) -{ - if (second.size == 0) - return true; - - if (!hasNull(first_null_map, first.size) && hasNull(second_null_map, second.size)) - return false; - - const bool has_first_null_map = first_null_map != nullptr; - const bool has_second_null_map = second_null_map != nullptr; - - size_t j = 0; - short has_mask = 1; - const int16_t full = -1, none = 0; - const __m256i 
ones = _mm256_set1_epi16(full); - const __m256i zeros = _mm256_setzero_si256(); - if (second.size > 15 && first.size > 15) - { - for (; j < second.size - 15 && has_mask; j += 16) - { - has_mask = 0; - const __m256i f_data = _mm256_loadu_si256(reinterpret_cast(second.data + j)); - __m256i bitmask = has_second_null_map ? - _mm256_set_epi16( - (second_null_map[j + 15]) ? full : none, (second_null_map[j + 14]) ? full : none, - (second_null_map[j + 13]) ? full : none, (second_null_map[j + 12]) ? full : none, - (second_null_map[j + 11]) ? full : none, (second_null_map[j + 10]) ? full : none, - (second_null_map[j + 9]) ? full : none, (second_null_map[j + 8])? full : none, - (second_null_map[j + 7]) ? full : none, (second_null_map[j + 6])? full : none, - (second_null_map[j + 5]) ? full : none, (second_null_map[j + 4])? full : none, - (second_null_map[j + 3]) ? full : none, (second_null_map[j + 2])? full : none, - (second_null_map[j + 1]) ? full : none, (second_null_map[j]) ? full : none) - : zeros; - unsigned i = 0; - for (; i < first.size - 15 && !has_mask; has_mask = _mm256_testc_si256(bitmask, ones), i += 16) - { - const __m256i s_data = _mm256_loadu_si256(reinterpret_cast(first.data + i)); - const __m256i first_nm_mask = has_first_null_map? - _mm256_set_m128i( - _mm_cvtepi8_epi16(_mm_loadu_si128(reinterpret_cast(first_null_map + i + 8))), - _mm_cvtepi8_epi16(_mm_loadu_si128(reinterpret_cast(first_null_map + i)))) - : zeros; - bitmask = - _mm256_or_si256( - _mm256_or_si256( - _mm256_or_si256( - _mm256_or_si256( - _mm256_or_si256( - _mm256_andnot_si256( - first_nm_mask, - _mm256_cmpeq_epi16(f_data, s_data)), - _mm256_andnot_si256( - _mm256_shuffle_epi8(first_nm_mask, _mm256_set_epi8(29,28,27,26,25,24,23,22,21,20,19,18,17,16,15,14,13,12,11,10,9,8,7,6,5,4,3,2,1,0,31,30)), - _mm256_cmpeq_epi16(f_data, _mm256_shuffle_epi8(s_data, _mm256_set_epi8(29,28,27,26,25,24,23,22,21,20,19,18,17,16,15,14,13,12,11,10,9,8,7,6,5,4,3,2,1,0,31,30))))), - _mm256_or_si256( - _mm256_andnot_si256( - _mm256_shuffle_epi8(first_nm_mask, _mm256_set_epi8(27,26,25,24,23,22,21,20,19,18,17,16,15,14,13,12,11,10,9,8,7,6,5,4,3,2,1,0,31,30,29,28)), - _mm256_cmpeq_epi16(f_data, _mm256_shuffle_epi8(s_data, _mm256_set_epi8(27,26,25,24,23,22,21,20,19,18,17,16,15,14,13,12,11,10,9,8,7,6,5,4,3,2,1,0,31,30,29,28)))), - _mm256_andnot_si256( - _mm256_shuffle_epi8(first_nm_mask, _mm256_set_epi8(25,24,23,22,21,20,19,18,17,16,15,14,13,12,11,10,9,8,7,6,5,4,3,2,1,0,31,30,29,28,27,26)), - _mm256_cmpeq_epi16(f_data, _mm256_shuffle_epi8(s_data, _mm256_set_epi8(25,24,23,22,21,20,19,18,17,16,15,14,13,12,11,10,9,8,7,6,5,4,3,2,1,0,31,30,29,28,27,26))))) - ), - _mm256_or_si256( - _mm256_or_si256( - _mm256_andnot_si256( - _mm256_shuffle_epi8(first_nm_mask, _mm256_set_epi8(23,22,21,20,19,18,17,16,15,14,13,12,11,10,9,8,7,6,5,4,3,2,1,0,31,30,29,28,27,26,25,24)), - _mm256_cmpeq_epi16(f_data, _mm256_shuffle_epi8(s_data, _mm256_set_epi8(23,22,21,20,19,18,17,16,15,14,13,12,11,10,9,8,7,6,5,4,3,2,1,0,31,30,29,28,27,26,25,24)))), - _mm256_andnot_si256( - _mm256_shuffle_epi8(first_nm_mask, _mm256_set_epi8(21,20,19,18,17,16,15,14,13,12,11,10,9,8,7,6,5,4,3,2,1,0,31,30,29,28,27,26,25,24,23,22)), - _mm256_cmpeq_epi16(f_data, _mm256_shuffle_epi8(s_data, _mm256_set_epi8(21,20,19,18,17,16,15,14,13,12,11,10,9,8,7,6,5,4,3,2,1,0,31,30,29,28,27,26,25,24,23,22))))), - _mm256_or_si256( - _mm256_andnot_si256( - _mm256_shuffle_epi8(first_nm_mask, _mm256_set_epi8(19,18,17,16,15,14,13,12,11,10,9,8,7,6,5,4,3,2,1,0,31,30,29,28,27,26,25,24,23,22,21,20)), - _mm256_cmpeq_epi16(f_data, 
_mm256_shuffle_epi8(s_data, _mm256_set_epi8(19,18,17,16,15,14,13,12,11,10,9,8,7,6,5,4,3,2,1,0,31,30,29,28,27,26,25,24,23,22,21,20)))), - _mm256_andnot_si256( - _mm256_shuffle_epi8(first_nm_mask, _mm256_set_epi8(17,16,15,14,13,12,11,10,9,8,7,6,5,4,3,2,1,0,31,30,29,28,27,26,25,24,23,22,21,20,19,18)), - _mm256_cmpeq_epi16(f_data, _mm256_shuffle_epi8(s_data, _mm256_set_epi8(17,16,15,14,13,12,11,10,9,8,7,6,5,4,3,2,1,0,31,30,29,28,27,26,25,24,23,22,21,20,19,18)))))) - ), - _mm256_or_si256( - _mm256_or_si256( - _mm256_or_si256( - _mm256_andnot_si256( - _mm256_permute2x128_si256(first_nm_mask, first_nm_mask,1), - _mm256_cmpeq_epi16(f_data, _mm256_permute2x128_si256(s_data, s_data, 1))), - _mm256_andnot_si256( - _mm256_shuffle_epi8(_mm256_permute2x128_si256(first_nm_mask, first_nm_mask, 1), _mm256_set_epi8(13,12,11,10,9,8,7,6,5,4,3,2,1,0,31,30,29,28,27,26,25,24,23,22,21,20,19,18,17,16,15,14)), - _mm256_cmpeq_epi16(f_data, _mm256_shuffle_epi8(_mm256_permute2x128_si256(s_data, s_data, 1), _mm256_set_epi8(13,12,11,10,9,8,7,6,5,4,3,2,1,0,31,30,29,28,27,26,25,24,23,22,21,20,19,18,17,16,15,14))))), - _mm256_or_si256( - _mm256_andnot_si256( - _mm256_shuffle_epi8(_mm256_permute2x128_si256(first_nm_mask, first_nm_mask, 1), _mm256_set_epi8(11,10,9,8,7,6,5,4,3,2,1,0,31,30,29,28,27,26,25,24,23,22,21,20,19,18,17,16,15,14,13,12)), - _mm256_cmpeq_epi16(f_data, _mm256_shuffle_epi8(_mm256_permute2x128_si256(s_data, s_data, 1), _mm256_set_epi8(11,10,9,8,7,6,5,4,3,2,1,0,31,30,29,28,27,26,25,24,23,22,21,20,19,18,17,16,15,14,13,12)))), - _mm256_andnot_si256( - _mm256_shuffle_epi8(_mm256_permute2x128_si256(first_nm_mask, first_nm_mask, 1), _mm256_set_epi8(9,8,7,6,5,4,3,2,1,0,31,30,29,28,27,26,25,24,23,22,21,20,19,18,17,16,15,14,13,12,11,10)), - _mm256_cmpeq_epi16(f_data, _mm256_shuffle_epi8(_mm256_permute2x128_si256(s_data, s_data, 1), _mm256_set_epi8(9,8,7,6,5,4,3,2,1,0,31,30,29,28,27,26,25,24,23,22,21,20,19,18,17,16,15,14,13,12,11,10))))) - ), - _mm256_or_si256( - _mm256_or_si256( - _mm256_andnot_si256( - _mm256_shuffle_epi8(_mm256_permute2x128_si256(first_nm_mask, first_nm_mask, 1), _mm256_set_epi8(7,6,5,4,3,2,1,0,31,30,29,28,27,26,25,24,23,22,21,20,19,18,17,16,15,14,13,12,11,10,9,8)), - _mm256_cmpeq_epi16(f_data, _mm256_shuffle_epi8(_mm256_permute2x128_si256(s_data ,s_data, 1), _mm256_set_epi8(7,6,5,4,3,2,1,0,31,30,29,28,27,26,25,24,23,22,21,20,19,18,17,16,15,14,13,12,11,10,9,8)))), - _mm256_andnot_si256( - _mm256_shuffle_epi8(_mm256_permute2x128_si256(first_nm_mask, first_nm_mask, 1), _mm256_set_epi8(5,4,3,2,1,0,31,30,29,28,27,26,25,24,23,22,21,20,19,18,17,16,15,14,13,12,11,10,9,8,7,6)), - _mm256_cmpeq_epi16(f_data, _mm256_shuffle_epi8(_mm256_permute2x128_si256(s_data, s_data, 1), _mm256_set_epi8(5,4,3,2,1,0,31,30,29,28,27,26,25,24,23,22,21,20,19,18,17,16,15,14,13,12,11,10,9,8,7,6))))), - _mm256_or_si256( - _mm256_andnot_si256( - _mm256_shuffle_epi8(_mm256_permute2x128_si256(first_nm_mask, first_nm_mask, 1), _mm256_set_epi8(3,2,1,0,31,30,29,28,27,26,25,24,23,22,21,20,19,18,17,16,15,14,13,12,11,10,9,8,7,6,5,4)), - _mm256_cmpeq_epi16(f_data, _mm256_shuffle_epi8(_mm256_permute2x128_si256(s_data ,s_data ,1), _mm256_set_epi8(3,2,1,0,31,30,29,28,27,26,25,24,23,22,21,20,19,18,17,16,15,14,13,12,11,10,9,8,7,6,5,4)))), - _mm256_andnot_si256( - _mm256_shuffle_epi8(_mm256_permute2x128_si256(first_nm_mask, first_nm_mask, 1), _mm256_set_epi8(1,0,31,30,29,28,27,26,25,24,23,22,21,20,19,18,17,16,15,14,13,12,11,10,9,8,7,6,5,4,3,2)), - _mm256_cmpeq_epi16(f_data, _mm256_shuffle_epi8(_mm256_permute2x128_si256(s_data, s_data, 1), 
_mm256_set_epi8(1,0,31,30,29,28,27,26,25,24,23,22,21,20,19,18,17,16,15,14,13,12,11,10,9,8,7,6,5,4,3,2)))))) - ) - ), - bitmask); - } - - if (i < first.size) - { - for (; i < first.size && !has_mask; ++i) - { - if (has_first_null_map && first_null_map[i]) - continue; - __m256i v_i = _mm256_set1_epi16(first.data[i]); - bitmask = _mm256_or_si256(bitmask, _mm256_cmpeq_epi16(f_data, v_i)); - has_mask = _mm256_testc_si256(bitmask, ones); - } - } - } - } - - if (!has_mask && second.size > 15) - return false; - - return hasAllIntegralLoopRemainder(j, first, second, first_null_map, second_null_map); -} - -#elif defined(__SSE4_2__) - -// SSE4.2 Int64 specialization -template <> -inline ALWAYS_INLINE bool sliceHasImplAnyAll, NumericArraySlice, sliceEqualElements >( - const NumericArraySlice & first, const NumericArraySlice & second, const UInt8 * first_null_map, const UInt8 * second_null_map) -{ - if (second.size == 0) - return true; - - if (!hasNull(first_null_map, first.size) && hasNull(second_null_map, second.size)) - return false; - - const bool has_first_null_map = first_null_map != nullptr; - const bool has_second_null_map = second_null_map != nullptr; - - size_t j = 0; - short has_mask = 1; - const Int64 full = -1, none = 0; - const __m128i zeros = _mm_setzero_si128(); - if (second.size > 1 && first.size > 1) - { - for (; j < second.size - 1 && has_mask; j += 2) - { - has_mask = 0; - const __m128i f_data = _mm_loadu_si128(reinterpret_cast(second.data + j)); - __m128i bitmask = has_second_null_map ? - _mm_set_epi64x( - (second_null_map[j + 1]) ? full : none, - (second_null_map[j]) ? full : none) - : zeros; - unsigned i = 0; - for (; i < first.size - 1 && !has_mask; has_mask = _mm_test_all_ones(bitmask), i += 2) - { - const __m128i s_data = _mm_loadu_si128(reinterpret_cast(first.data + i)); - const __m128i first_nm_mask = has_first_null_map ? 
- _mm_cvtepi8_epi64(_mm_loadu_si128(reinterpret_cast(first_null_map + i))) - : zeros; - bitmask = - _mm_or_si128( - _mm_or_si128( - _mm_andnot_si128( - first_nm_mask, - _mm_cmpeq_epi64(f_data, s_data)), - _mm_andnot_si128( - _mm_shuffle_epi32(first_nm_mask, _MM_SHUFFLE(1,0,3,2)), - _mm_cmpeq_epi64(f_data, _mm_shuffle_epi32(s_data, _MM_SHUFFLE(1,0,3,2))))), - bitmask); - } - - if (i < first.size) - { - for (; i < first.size && !has_mask; ++i) - { - if (has_first_null_map && first_null_map[i]) - continue; - __m128i v_i = _mm_set1_epi64x(first.data[i]); - bitmask = _mm_or_si128(bitmask, _mm_cmpeq_epi64(f_data, v_i)); - has_mask = _mm_test_all_ones(bitmask); - } - } - } - } - - if (!has_mask && second.size > 1) - return false; - - return hasAllIntegralLoopRemainder(j, first, second, first_null_map, second_null_map); -} - -// SSE4.2 UInt64 specialization -template <> -inline ALWAYS_INLINE bool sliceHasImplAnyAll, NumericArraySlice, sliceEqualElements >( - const NumericArraySlice & first, const NumericArraySlice & second, const UInt8 * first_null_map, const UInt8 * second_null_map) -{ - if (second.size == 0) - return true; - - if (!hasNull(first_null_map, first.size) && hasNull(second_null_map, second.size)) - return false; - - const bool has_first_null_map = first_null_map != nullptr; - const bool has_second_null_map = second_null_map != nullptr; - - size_t j = 0; - short has_mask = 1; - const Int64 full = -1, none = 0; - const __m128i zeros = _mm_setzero_si128(); - if (second.size > 1 && first.size > 1) - { - for (; j < second.size - 1 && has_mask; j += 2) - { - has_mask = 0; - const __m128i f_data = _mm_loadu_si128(reinterpret_cast(second.data + j)); - __m128i bitmask = has_second_null_map ? - _mm_set_epi64x( - (second_null_map[j + 1]) ? full : none, - (second_null_map[j]) ? full : none) - : zeros; - unsigned i = 0; - for (; i < first.size - 1 && !has_mask; has_mask = _mm_test_all_ones(bitmask), i += 2) - { - const __m128i s_data = _mm_loadu_si128(reinterpret_cast(first.data + i)); - const __m128i first_nm_mask = has_first_null_map ? 
- _mm_cvtepi8_epi64(_mm_loadu_si128(reinterpret_cast(first_null_map + i))) - : zeros; - bitmask = - _mm_or_si128( - _mm_or_si128( - _mm_andnot_si128( - first_nm_mask, - _mm_cmpeq_epi64(f_data, s_data)), - _mm_andnot_si128( - _mm_shuffle_epi32(first_nm_mask, _MM_SHUFFLE(1,0,3,2)), - _mm_cmpeq_epi64(f_data, _mm_shuffle_epi32(s_data, _MM_SHUFFLE(1,0,3,2))))), - bitmask); - } - - if (i < first.size) - { - for (; i < first.size && !has_mask; ++i) - { - if (has_first_null_map && first_null_map[i]) - continue; - __m128i v_i = _mm_set1_epi64x(first.data[i]); - bitmask = _mm_or_si128(bitmask, _mm_cmpeq_epi64(f_data, v_i)); - has_mask = _mm_test_all_ones(bitmask); - } - } - } - } - - if (!has_mask && second.size > 1) - return false; - - return hasAllIntegralLoopRemainder(j, first, second, first_null_map, second_null_map); -} - -// SSE4.2 Int32 specialization -template <> -inline ALWAYS_INLINE bool sliceHasImplAnyAll, NumericArraySlice, sliceEqualElements >( - const NumericArraySlice & first, const NumericArraySlice & second, const UInt8 * first_null_map, const UInt8 * second_null_map) -{ - if (second.size == 0) - return true; - - if (!hasNull(first_null_map, first.size) && hasNull(second_null_map, second.size)) - return false; - - const bool has_first_null_map = first_null_map != nullptr; - const bool has_second_null_map = second_null_map != nullptr; - - size_t j = 0; - short has_mask = 1; - const int full = -1, none = 0; - const __m128i zeros = _mm_setzero_si128(); - if (second.size > 3 && first.size > 3) - { - for (; j < second.size - 3 && has_mask; j += 4) - { - has_mask = 0; - const __m128i f_data = _mm_loadu_si128(reinterpret_cast(second.data + j)); - __m128i bitmask = has_second_null_map ? - _mm_set_epi32( - (second_null_map[j + 3]) ? full : none, - (second_null_map[j + 2]) ? full : none, - (second_null_map[j + 1]) ? full : none, - (second_null_map[j]) ? full : none) - : zeros; - - unsigned i = 0; - for (; i < first.size - 3 && !has_mask; has_mask = _mm_test_all_ones(bitmask), i += 4) - { - const __m128i s_data = _mm_loadu_si128(reinterpret_cast(first.data + i)); - const __m128i first_nm_mask = has_first_null_map ? 
- _mm_cvtepi8_epi32(_mm_loadu_si128(reinterpret_cast(first_null_map + i))) - : zeros; - - bitmask = - _mm_or_si128( - _mm_or_si128( - _mm_or_si128( - _mm_andnot_si128( - first_nm_mask, - _mm_cmpeq_epi32(f_data, s_data)), - _mm_andnot_si128( - _mm_shuffle_epi32(first_nm_mask, _MM_SHUFFLE(2,1,0,3)), - _mm_cmpeq_epi32(f_data, _mm_shuffle_epi32(s_data, _MM_SHUFFLE(2,1,0,3))))), - _mm_or_si128( - _mm_andnot_si128( - _mm_shuffle_epi32(first_nm_mask, _MM_SHUFFLE(1,0,3,2)), - _mm_cmpeq_epi32(f_data, _mm_shuffle_epi32(s_data, _MM_SHUFFLE(1,0,3,2)))), - _mm_andnot_si128( - _mm_shuffle_epi32(first_nm_mask, _MM_SHUFFLE(0,3,2,1)), - _mm_cmpeq_epi32(f_data, _mm_shuffle_epi32(s_data, _MM_SHUFFLE(0,3,2,1))))) - ), - bitmask); - } - - if (i < first.size) - { - for (; i < first.size && !has_mask; ++i) - { - if (has_first_null_map && first_null_map[i]) - continue; - __m128i r_i = _mm_set1_epi32(first.data[i]); - bitmask = _mm_or_si128(bitmask, _mm_cmpeq_epi32(f_data, r_i)); - has_mask = _mm_test_all_ones(bitmask); - } - } - } - } - - if (!has_mask && second.size > 3) - return false; - - return hasAllIntegralLoopRemainder(j, first, second, first_null_map, second_null_map); -} - -// SSE4.2 UInt32 specialization -template <> -inline ALWAYS_INLINE bool sliceHasImplAnyAll, NumericArraySlice, sliceEqualElements >( - const NumericArraySlice & first, const NumericArraySlice & second, const UInt8 * first_null_map, const UInt8 * second_null_map) -{ - if (second.size == 0) - return true; - - if (!hasNull(first_null_map, first.size) && hasNull(second_null_map, second.size)) - return false; - - const bool has_first_null_map = first_null_map != nullptr; - const bool has_second_null_map = second_null_map != nullptr; - - size_t j = 0; - short has_mask = 1; - const int full = -1, none = 0; - const __m128i zeros = _mm_setzero_si128(); - if (second.size > 3 && first.size > 3) - { - for (; j < second.size - 3 && has_mask; j += 4) - { - has_mask = 0; - const __m128i f_data = _mm_loadu_si128(reinterpret_cast(second.data + j)); - __m128i bitmask = has_second_null_map ? - _mm_set_epi32( - (second_null_map[j + 3]) ? full : none, - (second_null_map[j + 2]) ? full : none, - (second_null_map[j + 1]) ? full : none, - (second_null_map[j]) ? full : none) - : zeros; - - unsigned i = 0; - for (; i < first.size - 3 && !has_mask; has_mask = _mm_test_all_ones(bitmask), i += 4) - { - const __m128i s_data = _mm_loadu_si128(reinterpret_cast(first.data + i)); - const __m128i first_nm_mask = has_first_null_map ? 
- _mm_cvtepi8_epi32(_mm_loadu_si128(reinterpret_cast(first_null_map + i))) - : zeros; - - bitmask = - _mm_or_si128( - _mm_or_si128( - _mm_or_si128( - _mm_andnot_si128( - first_nm_mask, - _mm_cmpeq_epi32(f_data, s_data)), - _mm_andnot_si128( - _mm_shuffle_epi32(first_nm_mask, _MM_SHUFFLE(2,1,0,3)), - _mm_cmpeq_epi32(f_data, _mm_shuffle_epi32(s_data, _MM_SHUFFLE(2,1,0,3))))), - _mm_or_si128( - _mm_andnot_si128( - _mm_shuffle_epi32(first_nm_mask, _MM_SHUFFLE(1,0,3,2)), - _mm_cmpeq_epi32(f_data, _mm_shuffle_epi32(s_data, _MM_SHUFFLE(1,0,3,2)))), - _mm_andnot_si128( - _mm_shuffle_epi32(first_nm_mask, _MM_SHUFFLE(0,3,2,1)), - _mm_cmpeq_epi32(f_data, _mm_shuffle_epi32(s_data, _MM_SHUFFLE(0,3,2,1))))) - ), - bitmask); - } - - if (i < first.size) - { - for (; i < first.size && !has_mask; ++i) - { - if (has_first_null_map && first_null_map[i]) - continue; - __m128i r_i = _mm_set1_epi32(first.data[i]); - bitmask = _mm_or_si128(bitmask, _mm_cmpeq_epi32(f_data, r_i)); - has_mask = _mm_test_all_ones(bitmask); - } - } - } - } - - if (!has_mask && second.size > 3) - return false; - - return hasAllIntegralLoopRemainder(j, first, second, first_null_map, second_null_map); -} - -// SSE4.2 Int16 specialization -template <> -inline ALWAYS_INLINE bool sliceHasImplAnyAll, NumericArraySlice, sliceEqualElements >( - const NumericArraySlice & first, const NumericArraySlice & second, const UInt8 * first_null_map, const UInt8 * second_null_map) -{ - if (second.size == 0) - return true; - - if (!hasNull(first_null_map, first.size) && hasNull(second_null_map, second.size)) - return false; - - const bool has_first_null_map = first_null_map != nullptr; - const bool has_second_null_map = second_null_map != nullptr; - - size_t j = 0; - short has_mask = 1; - const int16_t full = -1, none = 0; - const __m128i zeros = _mm_setzero_si128(); - if (second.size > 6 && first.size > 6) - { - for (; j < second.size - 7 && has_mask; j += 8) - { - has_mask = 0; - const __m128i f_data = _mm_loadu_si128(reinterpret_cast(second.data + j)); - __m128i bitmask = has_second_null_map ? - _mm_set_epi16( - (second_null_map[j + 7]) ? full : none, (second_null_map[j + 6]) ? full : none, - (second_null_map[j + 5]) ? full : none, (second_null_map[j + 4]) ? full : none, - (second_null_map[j + 3]) ? full : none, (second_null_map[j + 2]) ? full : none, - (second_null_map[j + 1]) ? full : none, (second_null_map[j]) ? full: none) - : zeros; - unsigned i = 0; - for (; i < first.size-7 && !has_mask; has_mask = _mm_test_all_ones(bitmask), i += 8) - { - const __m128i s_data = _mm_loadu_si128(reinterpret_cast(first.data + i)); - const __m128i first_nm_mask = has_first_null_map ? 
- _mm_cvtepi8_epi16(_mm_loadu_si128(reinterpret_cast(first_null_map + i))) - : zeros; - bitmask = - _mm_or_si128( - _mm_or_si128( - _mm_or_si128( - _mm_or_si128( - _mm_andnot_si128( - first_nm_mask, - _mm_cmpeq_epi16(f_data, s_data)), - _mm_andnot_si128( - _mm_shuffle_epi8(first_nm_mask, _mm_set_epi8(13,12,11,10,9,8,7,6,5,4,3,2,1,0,15,14)), - _mm_cmpeq_epi16(f_data, _mm_shuffle_epi8(s_data, _mm_set_epi8(13,12,11,10,9,8,7,6,5,4,3,2,1,0,15,14))))), - _mm_or_si128( - _mm_andnot_si128( - _mm_shuffle_epi8(first_nm_mask, _mm_set_epi8(11,10,9,8,7,6,5,4,3,2,1,0,15,14,13,12)), - _mm_cmpeq_epi16(f_data, _mm_shuffle_epi8(s_data, _mm_set_epi8(11,10,9,8,7,6,5,4,3,2,1,0,15,14,13,12)))), - _mm_andnot_si128( - _mm_shuffle_epi8(first_nm_mask, _mm_set_epi8(9,8,7,6,5,4,3,2,1,0,15,14,13,12,11,10)), - _mm_cmpeq_epi16(f_data, _mm_shuffle_epi8(s_data, _mm_set_epi8(9,8,7,6,5,4,3,2,1,0,15,14,13,12,11,10))))) - ), - _mm_or_si128( - _mm_or_si128( - _mm_andnot_si128( - _mm_shuffle_epi8(first_nm_mask, _mm_set_epi8(7,6,5,4,3,2,1,0,15,14,13,12,11,10,9,8)), - _mm_cmpeq_epi16(f_data, _mm_shuffle_epi8(s_data, _mm_set_epi8(7,6,5,4,3,2,1,0,15,14,13,12,11,10,9,8)))), - _mm_andnot_si128( - _mm_shuffle_epi8(first_nm_mask, _mm_set_epi8(5,4,3,2,1,0,15,14,13,12,11,10,9,8,7,6)), - _mm_cmpeq_epi16(f_data, _mm_shuffle_epi8(s_data, _mm_set_epi8(5,4,3,2,1,0,15,14,13,12,11,10,9,8,7,6))))), - _mm_or_si128( - _mm_andnot_si128( - _mm_shuffle_epi8(first_nm_mask, _mm_set_epi8(3,2,1,0,15,14,13,12,11,10,9,8,7,6,5,4)), - _mm_cmpeq_epi16(f_data, _mm_shuffle_epi8(s_data, _mm_set_epi8(3,2,1,0,15,14,13,12,11,10,9,8,7,6,5,4)))), - _mm_andnot_si128( - _mm_shuffle_epi8(first_nm_mask, _mm_set_epi8(1,0,15,14,13,12,11,10,9,8,7,6,5,4,3,2)), - _mm_cmpeq_epi16(f_data, _mm_shuffle_epi8(s_data, _mm_set_epi8(1,0,15,14,13,12,11,10,9,8,7,6,5,4,3,2)))))) - ), - bitmask); - } - - if (i < first.size) - { - for (; i < first.size && !has_mask; ++i) - { - if (has_first_null_map && first_null_map[i]) - continue; - __m128i v_i = _mm_set1_epi16(first.data[i]); - bitmask = _mm_or_si128(bitmask, _mm_cmpeq_epi16(f_data, v_i)); - has_mask = _mm_test_all_ones(bitmask); - } - } - } - } - - if (!has_mask && second.size > 6) - return false; - - return hasAllIntegralLoopRemainder(j, first, second, first_null_map, second_null_map); -} - -// SSE4.2 UInt16 specialization -template <> -inline ALWAYS_INLINE bool sliceHasImplAnyAll, NumericArraySlice, sliceEqualElements >( - const NumericArraySlice & first, const NumericArraySlice & second, const UInt8 * first_null_map, const UInt8 * second_null_map) -{ - if (second.size == 0) - return true; - - if (!hasNull(first_null_map, first.size) && hasNull(second_null_map, second.size)) - return false; - - const bool has_first_null_map = first_null_map != nullptr; - const bool has_second_null_map = second_null_map != nullptr; - - size_t j = 0; - short has_mask = 1; - const int16_t full = -1, none = 0; - const __m128i zeros = _mm_setzero_si128(); - if (second.size > 6 && first.size > 6) - { - for (; j < second.size - 7 && has_mask; j += 8) - { - has_mask = 0; - const __m128i f_data = _mm_loadu_si128(reinterpret_cast(second.data + j)); - __m128i bitmask = has_second_null_map ? - _mm_set_epi16( - (second_null_map[j + 7]) ? full : none, (second_null_map[j + 6]) ? full : none, - (second_null_map[j + 5]) ? full : none, (second_null_map[j + 4]) ? full : none, - (second_null_map[j + 3]) ? full : none, (second_null_map[j + 2]) ? full : none, - (second_null_map[j + 1]) ? full : none, (second_null_map[j]) ? 
full: none) - : zeros; - unsigned i = 0; - for (; i < first.size-7 && !has_mask; has_mask = _mm_test_all_ones(bitmask), i += 8) - { - const __m128i s_data = _mm_loadu_si128(reinterpret_cast(first.data + i)); - const __m128i first_nm_mask = has_first_null_map ? - _mm_cvtepi8_epi16(_mm_loadu_si128(reinterpret_cast(first_null_map + i))) - : zeros; - bitmask = - _mm_or_si128( - _mm_or_si128( - _mm_or_si128( - _mm_or_si128( - _mm_andnot_si128( - first_nm_mask, - _mm_cmpeq_epi16(f_data, s_data)), - _mm_andnot_si128( - _mm_shuffle_epi8(first_nm_mask, _mm_set_epi8(13,12,11,10,9,8,7,6,5,4,3,2,1,0,15,14)), - _mm_cmpeq_epi16(f_data, _mm_shuffle_epi8(s_data, _mm_set_epi8(13,12,11,10,9,8,7,6,5,4,3,2,1,0,15,14))))), - _mm_or_si128( - _mm_andnot_si128( - _mm_shuffle_epi8(first_nm_mask, _mm_set_epi8(11,10,9,8,7,6,5,4,3,2,1,0,15,14,13,12)), - _mm_cmpeq_epi16(f_data, _mm_shuffle_epi8(s_data, _mm_set_epi8(11,10,9,8,7,6,5,4,3,2,1,0,15,14,13,12)))), - _mm_andnot_si128( - _mm_shuffle_epi8(first_nm_mask, _mm_set_epi8(9,8,7,6,5,4,3,2,1,0,15,14,13,12,11,10)), - _mm_cmpeq_epi16(f_data, _mm_shuffle_epi8(s_data, _mm_set_epi8(9,8,7,6,5,4,3,2,1,0,15,14,13,12,11,10))))) - ), - _mm_or_si128( - _mm_or_si128( - _mm_andnot_si128( - _mm_shuffle_epi8(first_nm_mask, _mm_set_epi8(7,6,5,4,3,2,1,0,15,14,13,12,11,10,9,8)), - _mm_cmpeq_epi16(f_data, _mm_shuffle_epi8(s_data, _mm_set_epi8(7,6,5,4,3,2,1,0,15,14,13,12,11,10,9,8)))), - _mm_andnot_si128( - _mm_shuffle_epi8(first_nm_mask, _mm_set_epi8(5,4,3,2,1,0,15,14,13,12,11,10,9,8,7,6)), - _mm_cmpeq_epi16(f_data, _mm_shuffle_epi8(s_data, _mm_set_epi8(5,4,3,2,1,0,15,14,13,12,11,10,9,8,7,6))))), - _mm_or_si128( - _mm_andnot_si128( - _mm_shuffle_epi8(first_nm_mask, _mm_set_epi8(3,2,1,0,15,14,13,12,11,10,9,8,7,6,5,4)), - _mm_cmpeq_epi16(f_data, _mm_shuffle_epi8(s_data, _mm_set_epi8(3,2,1,0,15,14,13,12,11,10,9,8,7,6,5,4)))), - _mm_andnot_si128( - _mm_shuffle_epi8(first_nm_mask, _mm_set_epi8(1,0,15,14,13,12,11,10,9,8,7,6,5,4,3,2)), - _mm_cmpeq_epi16(f_data, _mm_shuffle_epi8(s_data, _mm_set_epi8(1,0,15,14,13,12,11,10,9,8,7,6,5,4,3,2)))))) - ), - bitmask); - } - - if (i < first.size) - { - for (; i < first.size && !has_mask; ++i) - { - if (has_first_null_map && first_null_map[i]) - continue; - __m128i v_i = _mm_set1_epi16(first.data[i]); - bitmask = _mm_or_si128(bitmask, _mm_cmpeq_epi16(f_data, v_i)); - has_mask = _mm_test_all_ones(bitmask); - } - } - } - } - - if (!has_mask && second.size > 6) - return false; - - return hasAllIntegralLoopRemainder(j, first, second, first_null_map, second_null_map); -} - #endif -// Int8 version is faster with SSE than with AVX2 -#if defined(__SSE4_2__) -// SSE2 Int8 specialization -template <> -inline ALWAYS_INLINE bool sliceHasImplAnyAll, NumericArraySlice, sliceEqualElements >( - const NumericArraySlice & first, const NumericArraySlice & second, const UInt8 * first_null_map, const UInt8 * second_null_map) -{ - if (second.size == 0) - return true; - - if (!hasNull(first_null_map, first.size) && hasNull(second_null_map, second.size)) - return false; - - const bool has_first_null_map = first_null_map != nullptr; - const bool has_second_null_map = second_null_map != nullptr; - - size_t j = 0; - short has_mask = 1; - const int8_t full = -1, none = 0; - const __m128i zeros = _mm_setzero_si128(); - if (second.size > 15 && first.size > 15) - { - for (; j < second.size - 15 && has_mask; j += 16) - { - has_mask = 0; - const __m128i f_data = _mm_loadu_si128(reinterpret_cast(second.data + j)); - __m128i bitmask = has_second_null_map ? 
- _mm_set_epi8( - (second_null_map[j + 15]) ? full : none, (second_null_map[j + 14]) ? full : none, - (second_null_map[j + 13]) ? full : none, (second_null_map[j + 12]) ? full : none, - (second_null_map[j + 11]) ? full : none, (second_null_map[j + 10]) ? full : none, - (second_null_map[j + 9]) ? full : none, (second_null_map[j + 8]) ? full : none, - (second_null_map[j + 7]) ? full : none, (second_null_map[j + 6]) ? full : none, - (second_null_map[j + 5]) ? full : none, (second_null_map[j + 4]) ? full : none, - (second_null_map[j + 3]) ? full : none, (second_null_map[j + 2]) ? full : none, - (second_null_map[j + 1]) ? full : none, (second_null_map[j]) ? full : none) - : zeros; - unsigned i = 0; - for (; i < first.size - 15 && !has_mask; has_mask = _mm_test_all_ones(bitmask), i += 16) - { - const __m128i s_data = _mm_loadu_si128(reinterpret_cast(first.data + i)); - const __m128i first_nm_mask = has_first_null_map ? - _mm_loadu_si128(reinterpret_cast(first_null_map + i)) - : zeros; - bitmask = - _mm_or_si128( - _mm_or_si128( - _mm_or_si128( - _mm_or_si128( - _mm_or_si128( - _mm_andnot_si128( - first_nm_mask, - _mm_cmpeq_epi8(f_data, s_data)), - _mm_andnot_si128( - _mm_shuffle_epi8(first_nm_mask, _mm_set_epi8(14,13,12,11,10,9,8,7,6,5,4,3,2,1,0,15)), - _mm_cmpeq_epi8(f_data, _mm_shuffle_epi8(s_data, _mm_set_epi8(14,13,12,11,10,9,8,7,6,5,4,3,2,1,0,15))))), - _mm_or_si128( - _mm_andnot_si128( - _mm_shuffle_epi8(first_nm_mask, _mm_set_epi8(13,12,11,10,9,8,7,6,5,4,3,2,1,0,15,14)), - _mm_cmpeq_epi8(f_data, _mm_shuffle_epi8(s_data, _mm_set_epi8(13,12,11,10,9,8,7,6,5,4,3,2,1,0,15,14)))), - _mm_andnot_si128( - _mm_shuffle_epi8(first_nm_mask, _mm_set_epi8(12,11,10,9,8,7,6,5,4,3,2,1,0,15,14,13)), - _mm_cmpeq_epi8(f_data, _mm_shuffle_epi8(s_data, _mm_set_epi8(12,11,10,9,8,7,6,5,4,3,2,1,0,15,14,13))))) - ), - _mm_or_si128( - _mm_or_si128( - _mm_andnot_si128( - _mm_shuffle_epi8(first_nm_mask, _mm_set_epi8(11,10,9,8,7,6,5,4,3,2,1,0,15,14,13,12)), - _mm_cmpeq_epi8(f_data, _mm_shuffle_epi8(s_data, _mm_set_epi8(11,10,9,8,7,6,5,4,3,2,1,0,15,14,13,12)))), - _mm_andnot_si128( - _mm_shuffle_epi8(first_nm_mask, _mm_set_epi8(10,9,8,7,6,5,4,3,2,1,0,15,14,13,12,11)), - _mm_cmpeq_epi8(f_data, _mm_shuffle_epi8(s_data, _mm_set_epi8(10,9,8,7,6,5,4,3,2,1,0,15,14,13,12,11))))), - _mm_or_si128( - _mm_andnot_si128( - _mm_shuffle_epi8(first_nm_mask, _mm_set_epi8(9,8,7,6,5,4,3,2,1,0,15,14,13,12,11,10)), - _mm_cmpeq_epi8(f_data, _mm_shuffle_epi8(s_data, _mm_set_epi8(9,8,7,6,5,4,3,2,1,0,15,14,13,12,11,10)))), - _mm_andnot_si128( - _mm_shuffle_epi8(first_nm_mask, _mm_set_epi8(8,7,6,5,4,3,2,1,0,15,14,13,12,11,10,9)), - _mm_cmpeq_epi8(f_data, _mm_shuffle_epi8(s_data, _mm_set_epi8(8,7,6,5,4,3,2,1,0,15,14,13,12,11,10,9))))))), - _mm_or_si128( - _mm_or_si128( - _mm_or_si128( - _mm_andnot_si128( - _mm_shuffle_epi8(first_nm_mask, _mm_set_epi8(7,6,5,4,3,2,1,0,15,14,13,12,11,10,9,8)), - _mm_cmpeq_epi8(f_data, _mm_shuffle_epi8(s_data, _mm_set_epi8(7,6,5,4,3,2,1,0,15,14,13,12,11,10,9,8)))), - _mm_andnot_si128( - _mm_shuffle_epi8(first_nm_mask, _mm_set_epi8(6,5,4,3,2,1,0,15,14,13,12,11,10,9,8,7)), - _mm_cmpeq_epi8(f_data, _mm_shuffle_epi8(s_data, _mm_set_epi8(6,5,4,3,2,1,0,15,14,13,12,11,10,9,8,7))))), - _mm_or_si128( - _mm_andnot_si128( - _mm_shuffle_epi8(first_nm_mask, _mm_set_epi8(5,4,3,2,1,0,15,14,13,12,11,10,9,8,7,6)), - _mm_cmpeq_epi8(f_data, _mm_shuffle_epi8(s_data, _mm_set_epi8(5,4,3,2,1,0,15,14,13,12,11,10,9,8,7,6)))), - _mm_andnot_si128( - _mm_shuffle_epi8(first_nm_mask, _mm_set_epi8(4,3,2,1,0,15,14,13,12,11,10,9,8,7,6,5)), - 
_mm_cmpeq_epi8(f_data, _mm_shuffle_epi8(s_data, _mm_set_epi8(4,3,2,1,0,15,14,13,12,11,10,9,8,7,6,5)))))), - _mm_or_si128( - _mm_or_si128( - _mm_andnot_si128( - _mm_shuffle_epi8(first_nm_mask, _mm_set_epi8(3,2,1,0,15,14,13,12,11,10,9,8,7,6,5,4)), - _mm_cmpeq_epi8(f_data, _mm_shuffle_epi8(s_data, _mm_set_epi8(3,2,1,0,15,14,13,12,11,10,9,8,7,6,5,4)))), - _mm_andnot_si128( - _mm_shuffle_epi8(first_nm_mask, _mm_set_epi8(2,1,0,15,14,13,12,11,10,9,8,7,6,5,4,3)), - _mm_cmpeq_epi8(f_data, _mm_shuffle_epi8(s_data, _mm_set_epi8(2,1,0,15,14,13,12,11,10,9,8,7,6,5,4,3))))), - _mm_or_si128( - _mm_andnot_si128( - _mm_shuffle_epi8(first_nm_mask, _mm_set_epi8(1,0,15,14,13,12,11,10,9,8,7,6,5,4,3,2)), - _mm_cmpeq_epi8(f_data, _mm_shuffle_epi8(s_data, _mm_set_epi8(1,0,15,14,13,12,11,10,9,8,7,6,5,4,3,2)))), - _mm_andnot_si128( - _mm_shuffle_epi8(first_nm_mask, _mm_set_epi8(0,15,14,13,12,11,10,9,8,7,6,5,4,3,2,1)), - _mm_cmpeq_epi8(f_data, _mm_shuffle_epi8(s_data, _mm_set_epi8(0,15,14,13,12,11,10,9,8,7,6,5,4,3,2,1)))))))), - bitmask); - } - - if (i < first.size) - { - for (; i < first.size && !has_mask; ++i) - { - if (has_first_null_map && first_null_map[i]) - continue; - __m128i v_i = _mm_set1_epi8(first.data[i]); - bitmask = _mm_or_si128(bitmask, _mm_cmpeq_epi8(f_data, v_i)); - has_mask = _mm_test_all_ones(bitmask); - } - } - } - } - - if (!has_mask && second.size > 15) - return false; - - return hasAllIntegralLoopRemainder(j, first, second, first_null_map, second_null_map); + return sliceHasImplAnyAllGenericImpl(first, second, first_null_map, second_null_map); } -// SSE2 UInt8 specialization -template <> -inline ALWAYS_INLINE bool sliceHasImplAnyAll, NumericArraySlice, sliceEqualElements >( - const NumericArraySlice & first, const NumericArraySlice & second, const UInt8 * first_null_map, const UInt8 * second_null_map) -{ - if (second.size == 0) - return true; - - if (!hasNull(first_null_map, first.size) && hasNull(second_null_map, second.size)) - return false; - - const bool has_first_null_map = first_null_map != nullptr; - const bool has_second_null_map = second_null_map != nullptr; - - size_t j = 0; - short has_mask = 1; - const int8_t full = -1, none = 0; - const __m128i zeros = _mm_setzero_si128(); - if (second.size > 15 && first.size > 15) - { - for (; j < second.size - 15 && has_mask; j += 16) - { - has_mask = 0; - const __m128i f_data = _mm_loadu_si128(reinterpret_cast(second.data + j)); - __m128i bitmask = has_second_null_map ? - _mm_set_epi8( - (second_null_map[j + 15]) ? full : none, (second_null_map[j + 14]) ? full : none, - (second_null_map[j + 13]) ? full : none, (second_null_map[j + 12]) ? full : none, - (second_null_map[j + 11]) ? full : none, (second_null_map[j + 10]) ? full : none, - (second_null_map[j + 9]) ? full : none, (second_null_map[j + 8]) ? full : none, - (second_null_map[j + 7]) ? full : none, (second_null_map[j + 6]) ? full : none, - (second_null_map[j + 5]) ? full : none, (second_null_map[j + 4]) ? full : none, - (second_null_map[j + 3]) ? full : none, (second_null_map[j + 2]) ? full : none, - (second_null_map[j + 1]) ? full : none, (second_null_map[j]) ? full : none) - : zeros; - unsigned i = 0; - for (; i < first.size - 15 && !has_mask; has_mask = _mm_test_all_ones(bitmask), i += 16) - { - const __m128i s_data = _mm_loadu_si128(reinterpret_cast(first.data + i)); - const __m128i first_nm_mask = has_first_null_map ? 
- _mm_loadu_si128(reinterpret_cast(first_null_map + i)) - : zeros; - bitmask = - _mm_or_si128( - _mm_or_si128( - _mm_or_si128( - _mm_or_si128( - _mm_or_si128( - _mm_andnot_si128( - first_nm_mask, - _mm_cmpeq_epi8(f_data, s_data)), - _mm_andnot_si128( - _mm_shuffle_epi8(first_nm_mask, _mm_set_epi8(14,13,12,11,10,9,8,7,6,5,4,3,2,1,0,15)), - _mm_cmpeq_epi8(f_data, _mm_shuffle_epi8(s_data, _mm_set_epi8(14,13,12,11,10,9,8,7,6,5,4,3,2,1,0,15))))), - _mm_or_si128( - _mm_andnot_si128( - _mm_shuffle_epi8(first_nm_mask, _mm_set_epi8(13,12,11,10,9,8,7,6,5,4,3,2,1,0,15,14)), - _mm_cmpeq_epi8(f_data, _mm_shuffle_epi8(s_data, _mm_set_epi8(13,12,11,10,9,8,7,6,5,4,3,2,1,0,15,14)))), - _mm_andnot_si128( - _mm_shuffle_epi8(first_nm_mask, _mm_set_epi8(12,11,10,9,8,7,6,5,4,3,2,1,0,15,14,13)), - _mm_cmpeq_epi8(f_data, _mm_shuffle_epi8(s_data, _mm_set_epi8(12,11,10,9,8,7,6,5,4,3,2,1,0,15,14,13))))) - ), - _mm_or_si128( - _mm_or_si128( - _mm_andnot_si128( - _mm_shuffle_epi8(first_nm_mask, _mm_set_epi8(11,10,9,8,7,6,5,4,3,2,1,0,15,14,13,12)), - _mm_cmpeq_epi8(f_data, _mm_shuffle_epi8(s_data, _mm_set_epi8(11,10,9,8,7,6,5,4,3,2,1,0,15,14,13,12)))), - _mm_andnot_si128( - _mm_shuffle_epi8(first_nm_mask, _mm_set_epi8(10,9,8,7,6,5,4,3,2,1,0,15,14,13,12,11)), - _mm_cmpeq_epi8(f_data, _mm_shuffle_epi8(s_data, _mm_set_epi8(10,9,8,7,6,5,4,3,2,1,0,15,14,13,12,11))))), - _mm_or_si128( - _mm_andnot_si128( - _mm_shuffle_epi8(first_nm_mask, _mm_set_epi8(9,8,7,6,5,4,3,2,1,0,15,14,13,12,11,10)), - _mm_cmpeq_epi8(f_data, _mm_shuffle_epi8(s_data, _mm_set_epi8(9,8,7,6,5,4,3,2,1,0,15,14,13,12,11,10)))), - _mm_andnot_si128( - _mm_shuffle_epi8(first_nm_mask, _mm_set_epi8(8,7,6,5,4,3,2,1,0,15,14,13,12,11,10,9)), - _mm_cmpeq_epi8(f_data, _mm_shuffle_epi8(s_data, _mm_set_epi8(8,7,6,5,4,3,2,1,0,15,14,13,12,11,10,9))))))), - _mm_or_si128( - _mm_or_si128( - _mm_or_si128( - _mm_andnot_si128( - _mm_shuffle_epi8(first_nm_mask, _mm_set_epi8(7,6,5,4,3,2,1,0,15,14,13,12,11,10,9,8)), - _mm_cmpeq_epi8(f_data, _mm_shuffle_epi8(s_data, _mm_set_epi8(7,6,5,4,3,2,1,0,15,14,13,12,11,10,9,8)))), - _mm_andnot_si128( - _mm_shuffle_epi8(first_nm_mask, _mm_set_epi8(6,5,4,3,2,1,0,15,14,13,12,11,10,9,8,7)), - _mm_cmpeq_epi8(f_data, _mm_shuffle_epi8(s_data, _mm_set_epi8(6,5,4,3,2,1,0,15,14,13,12,11,10,9,8,7))))), - _mm_or_si128( - _mm_andnot_si128( - _mm_shuffle_epi8(first_nm_mask, _mm_set_epi8(5,4,3,2,1,0,15,14,13,12,11,10,9,8,7,6)), - _mm_cmpeq_epi8(f_data, _mm_shuffle_epi8(s_data, _mm_set_epi8(5,4,3,2,1,0,15,14,13,12,11,10,9,8,7,6)))), - _mm_andnot_si128( - _mm_shuffle_epi8(first_nm_mask, _mm_set_epi8(4,3,2,1,0,15,14,13,12,11,10,9,8,7,6,5)), - _mm_cmpeq_epi8(f_data, _mm_shuffle_epi8(s_data, _mm_set_epi8(4,3,2,1,0,15,14,13,12,11,10,9,8,7,6,5)))))), - _mm_or_si128( - _mm_or_si128( - _mm_andnot_si128( - _mm_shuffle_epi8(first_nm_mask, _mm_set_epi8(3,2,1,0,15,14,13,12,11,10,9,8,7,6,5,4)), - _mm_cmpeq_epi8(f_data, _mm_shuffle_epi8(s_data, _mm_set_epi8(3,2,1,0,15,14,13,12,11,10,9,8,7,6,5,4)))), - _mm_andnot_si128( - _mm_shuffle_epi8(first_nm_mask, _mm_set_epi8(2,1,0,15,14,13,12,11,10,9,8,7,6,5,4,3)), - _mm_cmpeq_epi8(f_data, _mm_shuffle_epi8(s_data, _mm_set_epi8(2,1,0,15,14,13,12,11,10,9,8,7,6,5,4,3))))), - _mm_or_si128( - _mm_andnot_si128( - _mm_shuffle_epi8(first_nm_mask, _mm_set_epi8(1,0,15,14,13,12,11,10,9,8,7,6,5,4,3,2)), - _mm_cmpeq_epi8(f_data, _mm_shuffle_epi8(s_data, _mm_set_epi8(1,0,15,14,13,12,11,10,9,8,7,6,5,4,3,2)))), - _mm_andnot_si128( - _mm_shuffle_epi8(first_nm_mask, _mm_set_epi8(0,15,14,13,12,11,10,9,8,7,6,5,4,3,2,1)), - _mm_cmpeq_epi8(f_data, 
_mm_shuffle_epi8(s_data, _mm_set_epi8(0,15,14,13,12,11,10,9,8,7,6,5,4,3,2,1)))))))), - bitmask); - } - - if (i < first.size) - { - for (; i < first.size && !has_mask; ++i) - { - if (has_first_null_map && first_null_map[i]) - continue; - __m128i v_i = _mm_set1_epi8(first.data[i]); - bitmask = _mm_or_si128(bitmask, _mm_cmpeq_epi8(f_data, v_i)); - has_mask = _mm_test_all_ones(bitmask); - } - } - } - } - - if (!has_mask && second.size > 15) - return false; - - return hasAllIntegralLoopRemainder(j, first, second, first_null_map, second_null_map); -} -#endif } From b40a69f59acbb53464df12952a3aca1e8d8ad9f9 Mon Sep 17 00:00:00 2001 From: chen9t Date: Thu, 31 Mar 2022 10:10:05 +0800 Subject: [PATCH 073/239] Code style --- src/Storages/MergeTree/IMergeTreeDataPart.cpp | 49 ++++++++++--------- 1 file changed, 26 insertions(+), 23 deletions(-) diff --git a/src/Storages/MergeTree/IMergeTreeDataPart.cpp b/src/Storages/MergeTree/IMergeTreeDataPart.cpp index 76d42e4b785..0ed96f5dda4 100644 --- a/src/Storages/MergeTree/IMergeTreeDataPart.cpp +++ b/src/Storages/MergeTree/IMergeTreeDataPart.cpp @@ -635,31 +635,34 @@ void IMergeTreeDataPart::loadColumnsChecksumsIndexes(bool require_columns_checks /// Motivation: memory for index is shared between queries - not belong to the query itself. MemoryTrackerBlockerInThread temporarily_disable_memory_tracker(VariableContext::Global); - try { - loadUUID(); - loadColumns(require_columns_checksums); - loadChecksums(require_columns_checksums); - loadIndexGranularity(); - calculateColumnsAndSecondaryIndicesSizesOnDisk(); - loadIndex(); /// Must be called after loadIndexGranularity as it uses the value of `index_granularity` - loadRowsCount(); /// Must be called after loadIndexGranularity() as it uses the value of `index_granularity`. - loadPartitionAndMinMaxIndex(); - if (!parent_part) - { - loadTTLInfos(); - loadProjections(require_columns_checksums, check_consistency); - } + try + { + loadUUID(); + loadColumns(require_columns_checksums); + loadChecksums(require_columns_checksums); + loadIndexGranularity(); + calculateColumnsAndSecondaryIndicesSizesOnDisk(); + loadIndex(); /// Must be called after loadIndexGranularity as it uses the value of `index_granularity` + loadRowsCount(); /// Must be called after loadIndexGranularity() as it uses the value of `index_granularity`. + loadPartitionAndMinMaxIndex(); + if (!parent_part) + { + loadTTLInfos(); + loadProjections(require_columns_checksums, check_consistency); + } - if (check_consistency) - checkConsistency(require_columns_checksums); + if (check_consistency) + checkConsistency(require_columns_checksums); - loadDefaultCompressionCodec(); - } catch (...) { - // There could be conditions that data part to be loaded is broken, but some of meta infos are already written - // into meta data before exception, need to clean them all. - metadata_manager->deleteAll(/*include_projection*/true); - metadata_manager->assertAllDeleted(/*include_projection*/true); - throw; + loadDefaultCompressionCodec(); + } + catch (...) + { + // There could be conditions that data part to be loaded is broken, but some of meta infos are already written + // into meta data before exception, need to clean them all. 
+ metadata_manager->deleteAll(/*include_projection*/ true); + metadata_manager->assertAllDeleted(/*include_projection*/ true); + throw; } } From 74275da7eedfba63aaf954e546aab9f2d1bd27c5 Mon Sep 17 00:00:00 2001 From: avogar Date: Thu, 31 Mar 2022 10:52:34 +0000 Subject: [PATCH 074/239] Make better --- src/Processors/Formats/ISchemaReader.cpp | 129 ++++++++++------------- 1 file changed, 58 insertions(+), 71 deletions(-) diff --git a/src/Processors/Formats/ISchemaReader.cpp b/src/Processors/Formats/ISchemaReader.cpp index 796cdccbe8f..615ebfb03be 100644 --- a/src/Processors/Formats/ISchemaReader.cpp +++ b/src/Processors/Formats/ISchemaReader.cpp @@ -11,6 +11,60 @@ namespace ErrorCodes extern const int CANNOT_EXTRACT_TABLE_STRUCTURE; } +static void chooseResultType( + DataTypePtr & type, + const DataTypePtr & new_type, + bool allow_bools_as_numbers, + const DataTypePtr & default_type, + const String & column_name, + size_t row) +{ + if (!type) + type = new_type; + + /// If the new type and the previous type for this column are different, + /// we will use default type if we have it or throw an exception. + if (new_type && type->equals(*new_type)) + { + /// Check if we have Bool and Number and if allow_bools_as_numbers + /// is true make the result type Number + auto not_nullable_type = removeNullable(type); + auto not_nullable_new_type = removeNullable(new_type); + if (allow_bools_as_numbers && (isBool(not_nullable_type) || isBool(not_nullable_new_type)) + && (isNumber(not_nullable_type) || isNumber(not_nullable_new_type))) + { + if (isBool(not_nullable_type)) + type = new_type; + } + else if (default_type) + type = default_type; + else + throw Exception( + ErrorCodes::CANNOT_EXTRACT_TABLE_STRUCTURE, + "Automatically defined type {} for column {} in row {} differs from type defined by previous rows: {}", + type->getName(), + column_name, + row, + new_type->getName()); + } +} + +static void checkTypeAndAppend(NamesAndTypesList & result, DataTypePtr & type, const String & name, const DataTypePtr & default_type, size_t max_rows_to_read) +{ + if (!type) + { + if (!default_type) + throw Exception( + ErrorCodes::CANNOT_EXTRACT_TABLE_STRUCTURE, + "Cannot determine table structure by first {} rows of data, because some columns contain only Nulls. To increase the maximum " + "number of rows to read for structure determination, use setting input_format_max_rows_to_read_for_schema_inference", + max_rows_to_read); + + type = default_type; + } + result.emplace_back(name, type); +} + IRowSchemaReader::IRowSchemaReader(ReadBuffer & in_, size_t max_rows_to_read_, DataTypePtr default_type_, bool allow_bools_as_numbers_) : ISchemaReader(in_), max_rows_to_read(max_rows_to_read_), default_type(default_type_), allow_bools_as_numbers(allow_bools_as_numbers_) { @@ -35,29 +89,7 @@ NamesAndTypesList IRowSchemaReader::readSchema() if (!new_data_types[i]) continue; - /// If we couldn't determine the type of column yet, just set the new type. - if (!data_types[i]) - data_types[i] = new_data_types[i]; - /// If the new type and the previous type for this column are different, - /// we will use default type if we have it or throw an exception. 
- else if (!data_types[i]->equals(*new_data_types[i])) - { - /// Check if we have Bool and Number and if allow_bools_as_numbers - /// is true make the result type Number - auto not_nullable_type = removeNullable(data_types[i]); - auto not_nullable_new_type = removeNullable(new_data_types[i]); - if (allow_bools_as_numbers && (isBool(not_nullable_type) || isBool(not_nullable_new_type)) - && (isNumber(not_nullable_type) || isNumber(not_nullable_new_type))) { - if (isBool(not_nullable_type)) - data_types[i] = new_data_types[i]; - } - else if (default_type) - data_types[i] = default_type; - else - throw Exception( - ErrorCodes::CANNOT_EXTRACT_TABLE_STRUCTURE, - "Automatically defined type {} for column {} in row {} differs from type defined by previous rows: {}", new_data_types[i]->getName(), i + 1, row, data_types[i]->getName()); - } + chooseResultType(data_types[i], new_data_types[i], allow_bools_as_numbers, default_type, std::to_string(i + 1), row); } } @@ -82,18 +114,7 @@ NamesAndTypesList IRowSchemaReader::readSchema() for (size_t i = 0; i != data_types.size(); ++i) { /// Check that we could determine the type of this column. - if (!data_types[i]) - { - if (!default_type) - throw Exception( - ErrorCodes::CANNOT_EXTRACT_TABLE_STRUCTURE, - "Cannot determine table structure by first {} rows of data, because some columns contain only Nulls. To increase the maximum " - "number of rows to read for structure determination, use setting input_format_max_rows_to_read_for_schema_inference", - max_rows_to_read); - - data_types[i] = default_type; - } - result.emplace_back(column_names[i], data_types[i]); + checkTypeAndAppend(result, data_types[i], column_names[i], default_type, max_rows_to_read); } return result; @@ -125,30 +146,7 @@ NamesAndTypesList IRowWithNamesSchemaReader::readSchema() } auto & type = it->second; - /// If we couldn't determine the type of column yet, just set the new type. - if (!type) - type = new_type; - /// If the new type and the previous type for this column are different, - /// we will use default type if we have it or throw an exception. - else if (new_type && type->equals(*new_type)) - { - /// Check if we have Bool and Number and if allow_bools_as_numbers - /// is true make the result type Number - auto not_nullable_type = removeNullable(type); - auto not_nullable_new_type = removeNullable(new_type); - if (allow_bools_as_numbers && (isBool(not_nullable_type) || isBool(not_nullable_new_type)) - && (isNumber(not_nullable_type) || isNumber(not_nullable_new_type))) - { - if (isBool(not_nullable_type)) - type = new_type; - } - else if (default_type) - type = default_type; - else - throw Exception( - ErrorCodes::CANNOT_EXTRACT_TABLE_STRUCTURE, - "Automatically defined type {} for column {} in row {} differs from type defined by previous rows: {}", type->getName(), name, row, new_type->getName()); - } + chooseResultType(type, new_type, allow_bools_as_numbers, default_type, name, row); } } @@ -160,18 +158,7 @@ NamesAndTypesList IRowWithNamesSchemaReader::readSchema() for (auto & [name, type] : names_and_types) { /// Check that we could determine the type of this column. - if (!type) - { - if (!default_type) - throw Exception( - ErrorCodes::CANNOT_EXTRACT_TABLE_STRUCTURE, - "Cannot determine table structure by first {} rows of data, because some columns contain only Nulls. 
To increase the maximum " - "number of rows to read for structure determination, use setting input_format_max_rows_to_read_for_schema_inference", - max_rows_to_read); - - type = default_type; - } - result.emplace_back(name, type); + checkTypeAndAppend(result, type, name, default_type, max_rows_to_read); } return result; From d272356324a647e53fd44e01527c63402b6b80fe Mon Sep 17 00:00:00 2001 From: avogar Date: Thu, 31 Mar 2022 10:55:09 +0000 Subject: [PATCH 075/239] Minor code improvement --- src/Processors/Formats/ISchemaReader.cpp | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) diff --git a/src/Processors/Formats/ISchemaReader.cpp b/src/Processors/Formats/ISchemaReader.cpp index 615ebfb03be..567a5b05096 100644 --- a/src/Processors/Formats/ISchemaReader.cpp +++ b/src/Processors/Formats/ISchemaReader.cpp @@ -30,8 +30,9 @@ static void chooseResultType( /// is true make the result type Number auto not_nullable_type = removeNullable(type); auto not_nullable_new_type = removeNullable(new_type); - if (allow_bools_as_numbers && (isBool(not_nullable_type) || isBool(not_nullable_new_type)) - && (isNumber(not_nullable_type) || isNumber(not_nullable_new_type))) + bool bool_type_presents = isBool(not_nullable_type) || isBool(not_nullable_new_type); + bool number_type_presents = isNumber(not_nullable_type) || isNumber(not_nullable_new_type); + if (allow_bools_as_numbers && bool_type_presents && number_type_presents) { if (isBool(not_nullable_type)) type = new_type; From 63946ccb32454d5ecebdb35c92c8402c473af845 Mon Sep 17 00:00:00 2001 From: avogar Date: Thu, 31 Mar 2022 11:14:30 +0000 Subject: [PATCH 076/239] Fix tests --- .../02149_schema_inference.reference | 30 +++--- .../02188_table_function_format.reference | 92 +++++++++---------- ...e_table_without_columns_metadata.reference | 2 +- .../02240_tskv_schema_inference_bug.reference | 8 +- ...247_names_order_in_json_and_tskv.reference | 2 - .../02247_names_order_in_json_and_tskv.sh | 0 6 files changed, 66 insertions(+), 68 deletions(-) mode change 100644 => 100755 tests/queries/0_stateless/02247_names_order_in_json_and_tskv.sh diff --git a/tests/queries/0_stateless/02149_schema_inference.reference b/tests/queries/0_stateless/02149_schema_inference.reference index f46e3bee101..52139aa12dd 100644 --- a/tests/queries/0_stateless/02149_schema_inference.reference +++ b/tests/queries/0_stateless/02149_schema_inference.reference @@ -53,34 +53,34 @@ c Map(String, Nullable(Float64)) d Nullable(UInt8) 42.42 [(1,'String'),(2,'abcd')] {'key':42,'key2':24} 1 JSONEachRow -d Nullable(UInt8) +a Nullable(Float64) b Array(Tuple(Nullable(Float64), Nullable(String))) c Map(String, Nullable(Float64)) -a Nullable(Float64) -1 [(1,'String'),(2,'abcd')] {'key':42,'key2':24} 42.42 d Nullable(UInt8) +42.42 [(1,'String'),(2,'abcd')] {'key':42,'key2':24} 1 +a Nullable(Float64) b Array(Tuple(Nullable(Float64), Nullable(String))) c Map(String, Nullable(Float64)) -a Nullable(Float64) +d Nullable(UInt8) \N [(1,'String'),(2,NULL)] {'key':NULL,'key2':24} \N -1 [(2,'String 2'),(3,'hello')] {'key3':4242,'key4':2424} 32 +32 [(2,'String 2'),(3,'hello')] {'key3':4242,'key4':2424} 1 +a Nullable(Float64) b Nullable(String) c Array(Nullable(Float64)) -a Nullable(Float64) -s1 [] 1 -\N [2] 2 -\N [] \N -\N [] \N -\N [3] \N +1 s1 [] +2 \N [2] +\N \N [] +\N \N [] +\N \N [3] TSKV +a Nullable(String) b Nullable(String) c Nullable(String) -a Nullable(String) -s1 \N 1 -} [2] 2 +1 s1 \N +2 } [2] \N \N \N \N \N \N -\N [3] \N +\N \N [3] Values c1 Nullable(Float64) c2 Nullable(String) diff --git 
a/tests/queries/0_stateless/02188_table_function_format.reference b/tests/queries/0_stateless/02188_table_function_format.reference index ab568fb9fe5..403a4044544 100644 --- a/tests/queries/0_stateless/02188_table_function_format.reference +++ b/tests/queries/0_stateless/02188_table_function_format.reference @@ -1,52 +1,52 @@ -111 Hello -123 World -111 Hello -123 World -111 Hello -123 World -111 Hello -123 World -111 Hello -123 World -111 Hello -123 World -111 Hello -123 World -111 Hello -123 World -111 Hello -123 World -111 Hello -123 World -111 Hello -123 World -111 Hello -123 World -111 Hello -123 World -111 Hello -123 World -111 Hello -123 World -111 Hello -123 World -111 Hello -123 World -111 Hello -123 World -111 Hello -123 World -111 Hello -123 World +Hello 111 +World 123 +Hello 111 +World 123 +Hello 111 +World 123 +Hello 111 +World 123 +Hello 111 +World 123 +Hello 111 +World 123 +Hello 111 +World 123 +Hello 111 +World 123 +Hello 111 +World 123 +Hello 111 +World 123 +Hello 111 +World 123 +Hello 111 +World 123 +Hello 111 +World 123 +Hello 111 +World 123 +Hello 111 +World 123 +Hello 111 +World 123 +Hello 111 +World 123 +Hello 111 +World 123 +Hello 111 +World 123 +Hello 111 +World 123 1 2 [1,2,3] [['abc'],[],['d','e']] c1 Nullable(Float64) c2 Nullable(Float64) c3 Array(Nullable(Float64)) c4 Array(Array(Nullable(String))) -111 Hello -123 World -111 Hello -131 Hello -123 World -b Nullable(Float64) +Hello 111 +World 123 +Hello 111 +Hello 131 +World 123 a Nullable(String) +b Nullable(Float64) diff --git a/tests/queries/0_stateless/02222_create_table_without_columns_metadata.reference b/tests/queries/0_stateless/02222_create_table_without_columns_metadata.reference index 9e9e0082cb3..6c1dc4ebeb9 100644 --- a/tests/queries/0_stateless/02222_create_table_without_columns_metadata.reference +++ b/tests/queries/0_stateless/02222_create_table_without_columns_metadata.reference @@ -1,3 +1,3 @@ -CREATE TABLE default.test\n(\n `y` Nullable(String),\n `x` Nullable(Float64)\n)\nENGINE = File(\'JSONEachRow\', \'data.jsonl\') +CREATE TABLE test.test\n(\n `x` Nullable(Float64),\n `y` Nullable(String)\n)\nENGINE = File(\'JSONEachRow\', \'data.jsonl\') OK OK diff --git a/tests/queries/0_stateless/02240_tskv_schema_inference_bug.reference b/tests/queries/0_stateless/02240_tskv_schema_inference_bug.reference index a8abc33648e..69ed3536951 100644 --- a/tests/queries/0_stateless/02240_tskv_schema_inference_bug.reference +++ b/tests/queries/0_stateless/02240_tskv_schema_inference_bug.reference @@ -1,8 +1,8 @@ +a Nullable(String) b Nullable(String) c Nullable(String) -a Nullable(String) -s1 \N 1 -} [2] 2 +1 s1 \N +2 } [2] \N \N \N \N \N \N -\N [3] \N +\N \N [3] diff --git a/tests/queries/0_stateless/02247_names_order_in_json_and_tskv.reference b/tests/queries/0_stateless/02247_names_order_in_json_and_tskv.reference index 49a285dc11a..e49ad3f8d93 100644 --- a/tests/queries/0_stateless/02247_names_order_in_json_and_tskv.reference +++ b/tests/queries/0_stateless/02247_names_order_in_json_and_tskv.reference @@ -4,7 +4,6 @@ c Nullable(String) 1 s1 \N 2 } [2] \N \N \N -\N \N \N \N \N [3] b Nullable(String) a Nullable(String) @@ -12,7 +11,6 @@ c Nullable(String) e Nullable(String) 1 \N \N \N \N 2 3 \N -\N \N \N \N \N \N \N 3 3 3 1 \N a Nullable(Float64) diff --git a/tests/queries/0_stateless/02247_names_order_in_json_and_tskv.sh b/tests/queries/0_stateless/02247_names_order_in_json_and_tskv.sh old mode 100644 new mode 100755 From d166bb51153f630b9581902f8120ee0247b9d792 Mon Sep 17 00:00:00 2001 From: Kruglov Pavel 
<48961922+Avogar@users.noreply.github.com> Date: Thu, 31 Mar 2022 13:47:15 +0200 Subject: [PATCH 077/239] Update 02245_format_string_stack_overflow.sql --- tests/queries/0_stateless/02245_format_string_stack_overflow.sql | 1 + 1 file changed, 1 insertion(+) diff --git a/tests/queries/0_stateless/02245_format_string_stack_overflow.sql b/tests/queries/0_stateless/02245_format_string_stack_overflow.sql index 1ee3606d3a6..9376b12aa1e 100644 --- a/tests/queries/0_stateless/02245_format_string_stack_overflow.sql +++ b/tests/queries/0_stateless/02245_format_string_stack_overflow.sql @@ -1 +1,2 @@ +-- Tags: no-backward-compatibility-check:22.3.2.2 select format('{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}
{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}', toString(number)) str from numbers(1); From 42acb1dc29bd8e6272e38a8bc33ca9577ff011d6 Mon Sep 17 00:00:00 2001 From: Anton Popov Date: Thu, 31 Mar 2022 13:26:32 +0000 Subject: [PATCH 078/239] fix inserts to columns of type Object in partitioned tables --- src/DataTypes/ObjectUtils.cpp | 16 ++++++------- src/DataTypes/ObjectUtils.h | 2 +- .../MergeTree/MergeTreeDataWriter.cpp | 24 ++++++++++++------- 
src/Storages/MergeTree/MergeTreeDataWriter.h | 9 +++---- src/Storages/MergeTree/MergeTreeSink.cpp | 2 ++ .../MergeTree/ReplicatedMergeTreeSink.cpp | 4 +++- src/Storages/StorageMemory.cpp | 3 +-- .../01825_type_json_partitions.reference | 2 ++ .../01825_type_json_partitions.sql | 13 ++++++++++ 9 files changed, 49 insertions(+), 26 deletions(-) create mode 100644 tests/queries/0_stateless/01825_type_json_partitions.reference create mode 100644 tests/queries/0_stateless/01825_type_json_partitions.sql diff --git a/src/DataTypes/ObjectUtils.cpp b/src/DataTypes/ObjectUtils.cpp index 9004a5296e0..cbabc71a965 100644 --- a/src/DataTypes/ObjectUtils.cpp +++ b/src/DataTypes/ObjectUtils.cpp @@ -128,22 +128,21 @@ static auto extractVector(const std::vector & vec) return res; } -void convertObjectsToTuples(NamesAndTypesList & columns_list, Block & block, const NamesAndTypesList & extended_storage_columns) +void convertObjectsToTuples(Block & block, const NamesAndTypesList & extended_storage_columns) { std::unordered_map storage_columns_map; for (const auto & [name, type] : extended_storage_columns) storage_columns_map[name] = type; - for (auto & name_type : columns_list) + for (auto & column : block) { - if (!isObject(name_type.type)) + if (!isObject(column.type)) continue; - auto & column = block.getByName(name_type.name); if (!isObject(column.type)) throw Exception(ErrorCodes::TYPE_MISMATCH, "Type for column '{}' mismatch in columns list and in block. In list: {}, in block: {}", - name_type.name, name_type.type->getName(), column.type->getName()); + column.name, column.type->getName(), column.type->getName()); const auto & column_object = assert_cast(*column.column); const auto & subcolumns = column_object.getSubcolumns(); @@ -151,7 +150,7 @@ void convertObjectsToTuples(NamesAndTypesList & columns_list, Block & block, con if (!column_object.isFinalized()) throw Exception(ErrorCodes::LOGICAL_ERROR, "Cannot convert to tuple column '{}' from type {}. Column should be finalized first", - name_type.name, name_type.type->getName()); + column.name, column.type->getName()); PathsInData tuple_paths; DataTypes tuple_types; @@ -164,12 +163,11 @@ void convertObjectsToTuples(NamesAndTypesList & columns_list, Block & block, con tuple_columns.emplace_back(entry->data.getFinalizedColumnPtr()); } - auto it = storage_columns_map.find(name_type.name); + auto it = storage_columns_map.find(column.name); if (it == storage_columns_map.end()) - throw Exception(ErrorCodes::LOGICAL_ERROR, "Column '{}' not found in storage", name_type.name); + throw Exception(ErrorCodes::LOGICAL_ERROR, "Column '{}' not found in storage", column.name); std::tie(column.column, column.type) = unflattenTuple(tuple_paths, tuple_types, tuple_columns); - name_type.type = column.type; /// Check that constructed Tuple type and type in storage are compatible. getLeastCommonTypeForObject({column.type, it->second}, true); diff --git a/src/DataTypes/ObjectUtils.h b/src/DataTypes/ObjectUtils.h index 199a048c8cd..1dbeac2b244 100644 --- a/src/DataTypes/ObjectUtils.h +++ b/src/DataTypes/ObjectUtils.h @@ -38,7 +38,7 @@ DataTypePtr getDataTypeByColumn(const IColumn & column); /// Converts Object types and columns to Tuples in @columns_list and @block /// and checks that types are consistent with types in @extended_storage_columns. 
-void convertObjectsToTuples(NamesAndTypesList & columns_list, Block & block, const NamesAndTypesList & extended_storage_columns); +void convertObjectsToTuples(Block & block, const NamesAndTypesList & extended_storage_columns); /// Checks that each path is not the prefix of any other path. void checkObjectHasNoAmbiguosPaths(const PathsInData & paths); diff --git a/src/Storages/MergeTree/MergeTreeDataWriter.cpp b/src/Storages/MergeTree/MergeTreeDataWriter.cpp index 4805a273c70..fc05e293684 100644 --- a/src/Storages/MergeTree/MergeTreeDataWriter.cpp +++ b/src/Storages/MergeTree/MergeTreeDataWriter.cpp @@ -145,7 +145,7 @@ void MergeTreeDataWriter::TemporaryPart::finalize() } BlocksWithPartition MergeTreeDataWriter::splitBlockIntoParts( - const Block & block, size_t max_parts, const StorageMetadataPtr & metadata_snapshot, ContextPtr context) + const Block & block, size_t max_parts, const StorageMetadataPtr & metadata_snapshot, ContextPtr context) { BlocksWithPartition result; if (!block || !block.rows()) @@ -282,16 +282,12 @@ MergeTreeDataWriter::TemporaryPart MergeTreeDataWriter::writeTempPart( { TemporaryPart temp_part; Block & block = block_with_partition.block; + auto columns = metadata_snapshot->getColumns().getAllPhysical().filter(block.getNames()); - auto storage_snapshot = data.getStorageSnapshot(metadata_snapshot); - if (!storage_snapshot->object_columns.empty()) - { - auto extended_storage_columns = storage_snapshot->getColumns( - GetColumnsOptions(GetColumnsOptions::AllPhysical).withExtendedObjects()); - - convertObjectsToTuples(columns, block, extended_storage_columns); - } + for (auto & column : columns) + if (isObject(column.type)) + column.type = block.getByName(column.name).type; static const String TMP_PREFIX = "tmp_insert_"; @@ -466,6 +462,16 @@ MergeTreeDataWriter::TemporaryPart MergeTreeDataWriter::writeTempPart( return temp_part; } +void MergeTreeDataWriter::deduceTypesOfObjectColumns(const StorageSnapshotPtr & storage_snapshot, Block & block) +{ + if (!storage_snapshot->object_columns.empty()) + { + auto options = GetColumnsOptions(GetColumnsOptions::AllPhysical).withExtendedObjects(); + auto storage_columns = storage_snapshot->getColumns(options); + convertObjectsToTuples(block, storage_columns); + } +} + MergeTreeDataWriter::TemporaryPart MergeTreeDataWriter::writeProjectionPartImpl( const String & part_name, MergeTreeDataPartType part_type, diff --git a/src/Storages/MergeTree/MergeTreeDataWriter.h b/src/Storages/MergeTree/MergeTreeDataWriter.h index ae46a94ccd7..33742d7e52a 100644 --- a/src/Storages/MergeTree/MergeTreeDataWriter.h +++ b/src/Storages/MergeTree/MergeTreeDataWriter.h @@ -42,14 +42,12 @@ public: */ static BlocksWithPartition splitBlockIntoParts(const Block & block, size_t max_parts, const StorageMetadataPtr & metadata_snapshot, ContextPtr context); - /** All rows must correspond to same partition. - * Returns part with unique name starting with 'tmp_', yet not added to MergeTreeData. - */ - MergeTreeData::MutableDataPartPtr writeTempPart(BlockWithPartition & block, const StorageMetadataPtr & metadata_snapshot, bool optimize_on_insert); + void deduceTypesOfObjectColumns(const StorageSnapshotPtr & storage_snapshot, Block & block); /// This structure contains not completely written temporary part. /// Some writes may happen asynchronously, e.g. for blob storages. /// You should call finalize() to wait until all data is written. 
+ struct TemporaryPart { MergeTreeData::MutableDataPartPtr part; @@ -65,6 +63,9 @@ public: void finalize(); }; + /** All rows must correspond to same partition. + * Returns part with unique name starting with 'tmp_', yet not added to MergeTreeData. + */ TemporaryPart writeTempPart(BlockWithPartition & block, const StorageMetadataPtr & metadata_snapshot, ContextPtr context); /// For insertion. diff --git a/src/Storages/MergeTree/MergeTreeSink.cpp b/src/Storages/MergeTree/MergeTreeSink.cpp index 97bbfc17e9d..7a4ecae24b3 100644 --- a/src/Storages/MergeTree/MergeTreeSink.cpp +++ b/src/Storages/MergeTree/MergeTreeSink.cpp @@ -50,7 +50,9 @@ struct MergeTreeSink::DelayedChunk void MergeTreeSink::consume(Chunk chunk) { auto block = getHeader().cloneWithColumns(chunk.detachColumns()); + auto storage_snapshot = storage.getStorageSnapshot(metadata_snapshot); + storage.writer.deduceTypesOfObjectColumns(storage_snapshot, block); auto part_blocks = storage.writer.splitBlockIntoParts(block, max_parts_per_block, metadata_snapshot, context); using DelayedPartitions = std::vector; diff --git a/src/Storages/MergeTree/ReplicatedMergeTreeSink.cpp b/src/Storages/MergeTree/ReplicatedMergeTreeSink.cpp index 550c586f7de..63fa2071056 100644 --- a/src/Storages/MergeTree/ReplicatedMergeTreeSink.cpp +++ b/src/Storages/MergeTree/ReplicatedMergeTreeSink.cpp @@ -150,7 +150,8 @@ void ReplicatedMergeTreeSink::consume(Chunk chunk) if (quorum) checkQuorumPrecondition(zookeeper); - const Settings & settings = context->getSettingsRef(); + auto storage_snapshot = storage.getStorageSnapshot(metadata_snapshot); + storage.writer.deduceTypesOfObjectColumns(storage_snapshot, block); auto part_blocks = storage.writer.splitBlockIntoParts(block, max_parts_per_block, metadata_snapshot, context); using DelayedPartitions = std::vector; @@ -158,6 +159,7 @@ void ReplicatedMergeTreeSink::consume(Chunk chunk) size_t streams = 0; bool support_parallel_write = false; + const Settings & settings = context->getSettingsRef(); for (auto & current_block : part_blocks) { diff --git a/src/Storages/StorageMemory.cpp b/src/Storages/StorageMemory.cpp index 30be297194a..a371ac1ccf8 100644 --- a/src/Storages/StorageMemory.cpp +++ b/src/Storages/StorageMemory.cpp @@ -137,11 +137,10 @@ public: storage_snapshot->metadata->check(block, true); if (!storage_snapshot->object_columns.empty()) { - auto columns = storage_snapshot->metadata->getColumns().getAllPhysical().filter(block.getNames()); auto extended_storage_columns = storage_snapshot->getColumns( GetColumnsOptions(GetColumnsOptions::AllPhysical).withExtendedObjects()); - convertObjectsToTuples(columns, block, extended_storage_columns); + convertObjectsToTuples(block, extended_storage_columns); } if (storage.compress) diff --git a/tests/queries/0_stateless/01825_type_json_partitions.reference b/tests/queries/0_stateless/01825_type_json_partitions.reference new file mode 100644 index 00000000000..5a7ba251572 --- /dev/null +++ b/tests/queries/0_stateless/01825_type_json_partitions.reference @@ -0,0 +1,2 @@ +{"id":1,"obj":{"k1":"v1","k2":""}} +{"id":2,"obj":{"k1":"","k2":"v2"}} diff --git a/tests/queries/0_stateless/01825_type_json_partitions.sql b/tests/queries/0_stateless/01825_type_json_partitions.sql new file mode 100644 index 00000000000..2cb9bca7702 --- /dev/null +++ b/tests/queries/0_stateless/01825_type_json_partitions.sql @@ -0,0 +1,13 @@ +DROP TABLE IF EXISTS t_json_partitions; + +SET allow_experimental_object_type = 1; +SET output_format_json_named_tuples_as_objects = 1; + +CREATE TABLE 
t_json_partitions (id UInt32, obj JSON) +ENGINE MergeTree ORDER BY id PARTITION BY id; + +INSERT INTO t_json_partitions FORMAT JSONEachRow {"id": 1, "obj": {"k1": "v1"}} {"id": 2, "obj": {"k2": "v2"}}; + +SELECT * FROM t_json_partitions ORDER BY id FORMAT JSONEachRow; + +DROP TABLE t_json_partitions; From 252d66e80d99fcc1e6c1b0e6e9df191d99469e9e Mon Sep 17 00:00:00 2001 From: Kruglov Pavel <48961922+Avogar@users.noreply.github.com> Date: Thu, 31 Mar 2022 16:08:37 +0200 Subject: [PATCH 079/239] Update src/Processors/Formats/ISchemaReader.cpp Co-authored-by: Antonio Andelic --- src/Processors/Formats/ISchemaReader.cpp | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/Processors/Formats/ISchemaReader.cpp b/src/Processors/Formats/ISchemaReader.cpp index 567a5b05096..1a8a9a0ab12 100644 --- a/src/Processors/Formats/ISchemaReader.cpp +++ b/src/Processors/Formats/ISchemaReader.cpp @@ -24,7 +24,7 @@ static void chooseResultType( /// If the new type and the previous type for this column are different, /// we will use default type if we have it or throw an exception. - if (new_type && type->equals(*new_type)) + if (new_type && !type->equals(*new_type)) { /// Check if we have Bool and Number and if allow_bools_as_numbers /// is true make the result type Number From e74d5f9d4c5e72f7c65cd9030ad566d12aa20c38 Mon Sep 17 00:00:00 2001 From: Anton Popov Date: Thu, 31 Mar 2022 18:30:19 +0200 Subject: [PATCH 080/239] Update 01825_type_json_partitions.sql --- tests/queries/0_stateless/01825_type_json_partitions.sql | 2 ++ 1 file changed, 2 insertions(+) diff --git a/tests/queries/0_stateless/01825_type_json_partitions.sql b/tests/queries/0_stateless/01825_type_json_partitions.sql index 2cb9bca7702..27804e7edae 100644 --- a/tests/queries/0_stateless/01825_type_json_partitions.sql +++ b/tests/queries/0_stateless/01825_type_json_partitions.sql @@ -1,3 +1,5 @@ +-- Tags: no-fasttest + DROP TABLE IF EXISTS t_json_partitions; SET allow_experimental_object_type = 1; From 9a4cc78dfb083a20c7f8ffa9cea5baaeca958ba5 Mon Sep 17 00:00:00 2001 From: jewisliu Date: Thu, 31 Mar 2022 17:50:07 +0800 Subject: [PATCH 081/239] support ALTER TABLE t DETACH PARTITION ALL syntax --- src/Parsers/ASTPartition.h | 1 + src/Parsers/ParserPartition.cpp | 10 ++++ src/Storages/MergeTree/MergeTreeData.cpp | 9 +++- src/Storages/StorageMergeTree.cpp | 6 ++- src/Storages/StorageReplicatedMergeTree.cpp | 37 +++++++++++--- .../0_stateless/00753_alter_attach.reference | 12 +++++ .../0_stateless/00753_alter_attach.sql | 49 +++++++++++++++++++ .../0_stateless/01015_attach_part.reference | 1 + .../queries/0_stateless/01015_attach_part.sql | 4 ++ 9 files changed, 120 insertions(+), 9 deletions(-) diff --git a/src/Parsers/ASTPartition.h b/src/Parsers/ASTPartition.h index 87092f532c4..1bd16d55795 100644 --- a/src/Parsers/ASTPartition.h +++ b/src/Parsers/ASTPartition.h @@ -15,6 +15,7 @@ public: size_t fields_count = 0; String id; + bool all = false; String getID(char) const override; ASTPtr clone() const override; diff --git a/src/Parsers/ParserPartition.cpp b/src/Parsers/ParserPartition.cpp index c10999361de..5af442826df 100644 --- a/src/Parsers/ParserPartition.cpp +++ b/src/Parsers/ParserPartition.cpp @@ -5,6 +5,7 @@ #include #include #include +#include #include namespace DB @@ -13,6 +14,7 @@ namespace DB bool ParserPartition::parseImpl(Pos & pos, ASTPtr & node, Expected & expected) { ParserKeyword s_id("ID"); + ParserKeyword s_all("ALL"); ParserStringLiteral parser_string_literal; ParserExpression parser_expr; @@ -28,6 +30,14 @@ bool 
ParserPartition::parseImpl(Pos & pos, ASTPtr & node, Expected & expected) partition->id = partition_id->as().value.get(); } + else if (s_all.ignore(pos, expected)) + { + ASTPtr value = makeASTFunction("tuple"); + partition->value = value; + partition->children.push_back(value); + partition->fields_count = 0; + partition->all = true; + } else { ASTPtr value; diff --git a/src/Storages/MergeTree/MergeTreeData.cpp b/src/Storages/MergeTree/MergeTreeData.cpp index f66586b121a..b89ca4021ae 100644 --- a/src/Storages/MergeTree/MergeTreeData.cpp +++ b/src/Storages/MergeTree/MergeTreeData.cpp @@ -3373,7 +3373,12 @@ void MergeTreeData::checkAlterPartitionIsPossible( void MergeTreeData::checkPartitionCanBeDropped(const ASTPtr & partition) { const String partition_id = getPartitionIDFromQuery(partition, getContext()); - auto parts_to_remove = getDataPartsVectorInPartition(MergeTreeDataPartState::Active, partition_id); + DataPartsVector parts_to_remove; + const auto * partition_ast = partition->as(); + if (partition_ast && partition_ast->all) + parts_to_remove = getDataPartsVector(); + else + parts_to_remove = getDataPartsVectorInPartition(MergeTreeDataPartState::Active, partition_id); UInt64 partition_size = 0; @@ -3824,6 +3829,8 @@ String MergeTreeData::getPartitionIDFromQuery(const ASTPtr & ast, ContextPtr loc auto metadata_snapshot = getInMemoryMetadataPtr(); const Block & key_sample_block = metadata_snapshot->getPartitionKey().sample_block; + if (partition_ast.all) + return "ALL"; size_t fields_count = key_sample_block.columns(); if (partition_ast.fields_count != fields_count) throw Exception(ErrorCodes::INVALID_PARTITION_VALUE, diff --git a/src/Storages/StorageMergeTree.cpp b/src/Storages/StorageMergeTree.cpp index 7f4c3deca37..521e4147968 100644 --- a/src/Storages/StorageMergeTree.cpp +++ b/src/Storages/StorageMergeTree.cpp @@ -1348,7 +1348,11 @@ void StorageMergeTree::dropPartition(const ASTPtr & partition, bool detach, Cont /// This protects against "revival" of data for a removed partition after completion of merge. auto merge_blocker = stopMergesAndWait(); String partition_id = getPartitionIDFromQuery(partition, local_context); - parts_to_remove = getDataPartsVectorInPartition(MergeTreeDataPartState::Active, partition_id); + const auto * partition_ast = partition->as(); + if (partition_ast && partition_ast->all) + parts_to_remove = getDataPartsVector(); + else + parts_to_remove = getDataPartsVectorInPartition(MergeTreeDataPartState::Active, partition_id); /// TODO should we throw an exception if parts_to_remove is empty? 
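The parser and storage changes in this patch make `PARTITION ALL` select every partition of the table. A minimal usage sketch of the behaviour they enable follows; the table and column names are illustrative and not taken from the patch:

```sql
-- DETACH PARTITION ALL detaches every partition in one statement;
-- the parser marks the AST with all = true and the partition id resolves to 'ALL'.
CREATE TABLE t_detach_all (d Date, x UInt64)
ENGINE = MergeTree PARTITION BY toYYYYMM(d) ORDER BY x;

INSERT INTO t_detach_all VALUES ('2022-01-01', 1), ('2022-02-01', 2);

ALTER TABLE t_detach_all DETACH PARTITION ALL;
SELECT count() FROM t_detach_all;  -- expected to return 0 once all partitions are detached
```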
removePartsFromWorkingSet(parts_to_remove, true); diff --git a/src/Storages/StorageReplicatedMergeTree.cpp b/src/Storages/StorageReplicatedMergeTree.cpp index d9f72cf7feb..1947a0ad427 100644 --- a/src/Storages/StorageReplicatedMergeTree.cpp +++ b/src/Storages/StorageReplicatedMergeTree.cpp @@ -43,6 +43,7 @@ #include #include #include +#include #include #include #include @@ -4941,15 +4942,37 @@ void StorageReplicatedMergeTree::dropPartition(const ASTPtr & partition, bool de throw Exception("DROP PARTITION cannot be done on this replica because it is not a leader", ErrorCodes::NOT_A_LEADER); zkutil::ZooKeeperPtr zookeeper = getZooKeeperAndAssertNotReadonly(); - LogEntry entry; - String partition_id = getPartitionIDFromQuery(partition, query_context); - bool did_drop = dropAllPartsInPartition(*zookeeper, partition_id, entry, query_context, detach); - - if (did_drop) + const auto * partition_ast = partition->as(); + if (partition_ast && partition_ast->all) { - waitForLogEntryToBeProcessedIfNecessary(entry, query_context); - cleanLastPartNode(partition_id); + Strings partitions = zookeeper->getChildren(fs::path(zookeeper_path) / "block_numbers"); + + std::vector>> entries_with_partitionid_to_drop; + entries_with_partitionid_to_drop.reserve(partitions.size()); + for (String & partition_id : partitions) + { + auto entry = std::make_unique(); + if (dropAllPartsInPartition(*zookeeper, partition_id, *entry, query_context, detach)) + entries_with_partitionid_to_drop.emplace_back(partition_id, std::move(entry)); + } + + for (const auto & entry : entries_with_partitionid_to_drop) + { + waitForLogEntryToBeProcessedIfNecessary(*entry.second, query_context); + cleanLastPartNode(entry.first); + } + } + else + { + LogEntry entry; + String partition_id = getPartitionIDFromQuery(partition, query_context); + bool did_drop = dropAllPartsInPartition(*zookeeper, partition_id, entry, query_context, detach); + if (did_drop) + { + waitForLogEntryToBeProcessedIfNecessary(entry, query_context); + cleanLastPartNode(partition_id); + } } } diff --git a/tests/queries/0_stateless/00753_alter_attach.reference b/tests/queries/0_stateless/00753_alter_attach.reference index 007b99d4748..b0d2a3d031c 100644 --- a/tests/queries/0_stateless/00753_alter_attach.reference +++ b/tests/queries/0_stateless/00753_alter_attach.reference @@ -10,3 +10,15 @@ 5 2 6 3 7 3 +4 2 +5 2 +1 1 +2 1 +3 1 +1 1 +2 1 +3 1 +1 1 +2 2 +1 1 +1 1 diff --git a/tests/queries/0_stateless/00753_alter_attach.sql b/tests/queries/0_stateless/00753_alter_attach.sql index ca43fb3aeae..2910bcc222b 100644 --- a/tests/queries/0_stateless/00753_alter_attach.sql +++ b/tests/queries/0_stateless/00753_alter_attach.sql @@ -19,4 +19,53 @@ INSERT INTO alter_attach VALUES (6, 3), (7, 3); ALTER TABLE alter_attach ATTACH PARTITION 2; SELECT * FROM alter_attach ORDER BY x; +ALTER TABLE alter_attach DETACH PARTITION ALL; +SELECT * FROM alter_attach ORDER BY x; + +ALTER TABLE alter_attach ATTACH PARTITION 2; +SELECT * FROM alter_attach ORDER BY x; + +DROP TABLE IF EXISTS detach_all_no_partition; +CREATE TABLE detach_all_no_partition (x UInt64, p UInt8) ENGINE = MergeTree ORDER BY tuple(); +INSERT INTO detach_all_no_partition VALUES (1, 1), (2, 1), (3, 1); +SELECT * FROM detach_all_no_partition ORDER BY x; + +ALTER TABLE detach_all_no_partition DETACH PARTITION ALL; +SELECT * FROM detach_all_no_partition ORDER BY x; + +ALTER TABLE detach_all_no_partition ATTACH PARTITION tuple(); +SELECT * FROM detach_all_no_partition ORDER BY x; + DROP TABLE alter_attach; +DROP TABLE 
detach_all_no_partition; + +DROP TABLE IF EXISTS replicated_table_detach_all1; +DROP TABLE IF EXISTS replicated_table_detach_all2; + +CREATE TABLE replicated_table_detach_all1 ( + id UInt64, + Data String +) ENGINE = ReplicatedMergeTree('/clickhouse/tables/test_00753_{database}/replicated_table_detach_all', '1') ORDER BY id PARTITION BY id; + +CREATE TABLE replicated_table_detach_all2 ( + id UInt64, + Data String +) ENGINE = ReplicatedMergeTree('/clickhouse/tables/test_00753_{database}/replicated_table_detach_all', '2') ORDER BY id PARTITION BY id; + + +INSERT INTO replicated_table_detach_all1 VALUES (1, '1'), (2, '2'); +select * from replicated_table_detach_all1 order by id; + +ALTER TABLE replicated_table_detach_all1 DETACH PARTITION ALL; +select * from replicated_table_detach_all1 order by id; +SYSTEM SYNC REPLICA replicated_table_detach_all2; +select * from replicated_table_detach_all2 order by id; + +ALTER TABLE replicated_table_detach_all1 ATTACH PARTITION tuple(1); +select * from replicated_table_detach_all1 order by id; +SYSTEM SYNC REPLICA replicated_table_detach_all2; +select * from replicated_table_detach_all2 order by id; + +DROP TABLE replicated_table_detach_all1; +DROP TABLE replicated_table_detach_all2; + diff --git a/tests/queries/0_stateless/01015_attach_part.reference b/tests/queries/0_stateless/01015_attach_part.reference index b6cd514cd25..81c49e654ac 100644 --- a/tests/queries/0_stateless/01015_attach_part.reference +++ b/tests/queries/0_stateless/01015_attach_part.reference @@ -1,3 +1,4 @@ 1000 0 1000 +0 diff --git a/tests/queries/0_stateless/01015_attach_part.sql b/tests/queries/0_stateless/01015_attach_part.sql index 6b786bfbab9..a2f949d3499 100644 --- a/tests/queries/0_stateless/01015_attach_part.sql +++ b/tests/queries/0_stateless/01015_attach_part.sql @@ -21,4 +21,8 @@ ALTER TABLE table_01 ATTACH PART '20191001_1_1_0'; SELECT COUNT() FROM table_01; +ALTER TABLE table_01 DETACH PARTITION ALL; + +SELECT COUNT() FROM table_01; + DROP TABLE IF EXISTS table_01; From ececee3817edefb84794e00a2cea93f7ef07d29b Mon Sep 17 00:00:00 2001 From: zzsmdfj Date: Fri, 1 Apr 2022 20:13:34 +0800 Subject: [PATCH 082/239] to #34966_fix_dateTime_deserialize --- src/IO/ReadHelpers.h | 18 +++++++++++++++--- .../02249_parse_date_time_basic.reference | 3 +++ .../02249_parse_date_time_basic.sql | 6 ++++++ 3 files changed, 24 insertions(+), 3 deletions(-) create mode 100644 tests/queries/0_stateless/02249_parse_date_time_basic.reference create mode 100644 tests/queries/0_stateless/02249_parse_date_time_basic.sql diff --git a/src/IO/ReadHelpers.h b/src/IO/ReadHelpers.h index 9396e1d32f7..8296b8db4d7 100644 --- a/src/IO/ReadHelpers.h +++ b/src/IO/ReadHelpers.h @@ -851,6 +851,8 @@ inline ReturnType readDateTimeTextImpl(time_t & datetime, ReadBuffer & buf, cons /// YYYY-MM-DD hh:mm:ss static constexpr auto DateTimeStringInputSize = 19; + ///YYYY-MM-DD + static constexpr auto DateStringInputSize = 10; bool optimistic_path_for_date_time_input = s + DateTimeStringInputSize <= buf.buffer().end(); if (optimistic_path_for_date_time_input) @@ -861,16 +863,26 @@ inline ReturnType readDateTimeTextImpl(time_t & datetime, ReadBuffer & buf, cons UInt8 month = (s[5] - '0') * 10 + (s[6] - '0'); UInt8 day = (s[8] - '0') * 10 + (s[9] - '0'); - UInt8 hour = (s[11] - '0') * 10 + (s[12] - '0'); - UInt8 minute = (s[14] - '0') * 10 + (s[15] - '0'); - UInt8 second = (s[17] - '0') * 10 + (s[18] - '0'); + UInt8 hour = 0; + UInt8 minute = 0; + UInt8 second = 0; + ///simply determine whether it is YYYY-MM-DD hh:mm:ss or 
YYYY-MM-DD by the content of the tenth character in an optimistic scenario + if (s[10] == ' ') + { + hour = (s[11] - '0') * 10 + (s[12] - '0'); + minute = (s[14] - '0') * 10 + (s[15] - '0'); + second = (s[17] - '0') * 10 + (s[18] - '0'); + } if (unlikely(year == 0)) datetime = 0; else datetime = date_lut.makeDateTime(year, month, day, hour, minute, second); + if (s[10] == ' ') buf.position() += DateTimeStringInputSize; + else + buf.position() += DateStringInputSize; return ReturnType(true); } else diff --git a/tests/queries/0_stateless/02249_parse_date_time_basic.reference b/tests/queries/0_stateless/02249_parse_date_time_basic.reference new file mode 100644 index 00000000000..d67e0ae15e0 --- /dev/null +++ b/tests/queries/0_stateless/02249_parse_date_time_basic.reference @@ -0,0 +1,3 @@ +2022-03-31 00:00:00 1 +2022-04-01 17:10:24 2 +2022-03-31 10:18:56 3 diff --git a/tests/queries/0_stateless/02249_parse_date_time_basic.sql b/tests/queries/0_stateless/02249_parse_date_time_basic.sql new file mode 100644 index 00000000000..dd2306d99aa --- /dev/null +++ b/tests/queries/0_stateless/02249_parse_date_time_basic.sql @@ -0,0 +1,6 @@ +drop table if exists t; +CREATE TABLE t (a DateTime, b String, c String, d String, e Int32) ENGINE = Memory; +INSERT INTO t(a, b, c, d ,e) VALUES ('2022-03-31','','','',1); +INSERT INTO t(a, b, c, d ,e) VALUES (1648804224,'','','',2); +INSERT INTO t(a, b, c, d ,e) VALUES ('2022-03-31 10:18:56','','','',3); +select a, e from t; \ No newline at end of file From 3cae0c74d9fd18e76a2d02b1178aa7f533600900 Mon Sep 17 00:00:00 2001 From: zzsmdfj Date: Fri, 1 Apr 2022 20:16:07 +0800 Subject: [PATCH 083/239] to #34966_fix_dateTime_deserialize --- src/IO/ReadHelpers.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/IO/ReadHelpers.h b/src/IO/ReadHelpers.h index 8296b8db4d7..48c291d8fcc 100644 --- a/src/IO/ReadHelpers.h +++ b/src/IO/ReadHelpers.h @@ -880,7 +880,7 @@ inline ReturnType readDateTimeTextImpl(time_t & datetime, ReadBuffer & buf, cons datetime = date_lut.makeDateTime(year, month, day, hour, minute, second); if (s[10] == ' ') - buf.position() += DateTimeStringInputSize; + buf.position() += DateTimeStringInputSize; else buf.position() += DateStringInputSize; return ReturnType(true); From 79c75d8a712a3a49807164a0c618d04cfadbbb9c Mon Sep 17 00:00:00 2001 From: zzsmdfj Date: Fri, 1 Apr 2022 20:18:28 +0800 Subject: [PATCH 084/239] to #34966_fix_dateTime_deserialize --- tests/queries/0_stateless/02249_parse_date_time_basic.sql | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/tests/queries/0_stateless/02249_parse_date_time_basic.sql b/tests/queries/0_stateless/02249_parse_date_time_basic.sql index dd2306d99aa..2cea41874d5 100644 --- a/tests/queries/0_stateless/02249_parse_date_time_basic.sql +++ b/tests/queries/0_stateless/02249_parse_date_time_basic.sql @@ -3,4 +3,4 @@ CREATE TABLE t (a DateTime, b String, c String, d String, e Int32) ENGINE = Memo INSERT INTO t(a, b, c, d ,e) VALUES ('2022-03-31','','','',1); INSERT INTO t(a, b, c, d ,e) VALUES (1648804224,'','','',2); INSERT INTO t(a, b, c, d ,e) VALUES ('2022-03-31 10:18:56','','','',3); -select a, e from t; \ No newline at end of file +select a, e from t; From 3c14d00aea9a0e926d128da588c6356a7300b50c Mon Sep 17 00:00:00 2001 From: rfraposa Date: Fri, 1 Apr 2022 10:18:17 -0600 Subject: [PATCH 085/239] Delete docs_release.yml --- .github/workflows/docs_release.yml | 121 ----------------------------- 1 file changed, 121 deletions(-) delete mode 100644 .github/workflows/docs_release.yml 
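The `readDateTimeTextImpl` change in the DateTime-deserialization patch above accepts a bare date where a DateTime is expected. A small sketch of the resulting behaviour, in the spirit of the 02249 test that patch adds (the table name here is illustrative):

```sql
-- A 10-character 'YYYY-MM-DD' value now parses into DateTime with the time part set to 00:00:00;
-- the 19-character 'YYYY-MM-DD hh:mm:ss' form is consumed as before.
CREATE TABLE t_dt_parse (a DateTime, e Int32) ENGINE = Memory;
INSERT INTO t_dt_parse VALUES ('2022-03-31', 1);
INSERT INTO t_dt_parse VALUES ('2022-03-31 10:18:56', 2);
SELECT a, e FROM t_dt_parse ORDER BY e;
-- expected:
-- 2022-03-31 00:00:00    1
-- 2022-03-31 10:18:56    2
```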
diff --git a/.github/workflows/docs_release.yml b/.github/workflows/docs_release.yml deleted file mode 100644 index 66838a05552..00000000000 --- a/.github/workflows/docs_release.yml +++ /dev/null @@ -1,121 +0,0 @@ -name: DocsReleaseChecks - -env: - # Force the stdout and stderr streams to be unbuffered - PYTHONUNBUFFERED: 1 - -concurrency: - group: master-release - cancel-in-progress: true -on: # yamllint disable-line rule:truthy - push: - branches: - - master - paths: - - 'docs/**' - - 'website/**' - - 'benchmark/**' - - 'docker/**' - - '.github/**' - workflow_dispatch: -jobs: - DockerHubPushAarch64: - runs-on: [self-hosted, style-checker-aarch64] - steps: - - name: Clear repository - run: | - sudo rm -fr "$GITHUB_WORKSPACE" && mkdir "$GITHUB_WORKSPACE" - - name: Check out repository code - uses: actions/checkout@v2 - - name: Images check - run: | - cd "$GITHUB_WORKSPACE/tests/ci" - python3 docker_images_check.py --suffix aarch64 - - name: Upload images files to artifacts - uses: actions/upload-artifact@v2 - with: - name: changed_images_aarch64 - path: ${{ runner.temp }}/docker_images_check/changed_images_aarch64.json - DockerHubPushAmd64: - runs-on: [self-hosted, style-checker] - steps: - - name: Clear repository - run: | - sudo rm -fr "$GITHUB_WORKSPACE" && mkdir "$GITHUB_WORKSPACE" - - name: Check out repository code - uses: actions/checkout@v2 - - name: Images check - run: | - cd "$GITHUB_WORKSPACE/tests/ci" - python3 docker_images_check.py --suffix amd64 - - name: Upload images files to artifacts - uses: actions/upload-artifact@v2 - with: - name: changed_images_amd64 - path: ${{ runner.temp }}/docker_images_check/changed_images_amd64.json - DockerHubPush: - needs: [DockerHubPushAmd64, DockerHubPushAarch64] - runs-on: [self-hosted, style-checker] - steps: - - name: Clear repository - run: | - sudo rm -fr "$GITHUB_WORKSPACE" && mkdir "$GITHUB_WORKSPACE" - - name: Check out repository code - uses: actions/checkout@v2 - - name: Download changed aarch64 images - uses: actions/download-artifact@v2 - with: - name: changed_images_aarch64 - path: ${{ runner.temp }} - - name: Download changed amd64 images - uses: actions/download-artifact@v2 - with: - name: changed_images_amd64 - path: ${{ runner.temp }} - - name: Images check - run: | - cd "$GITHUB_WORKSPACE/tests/ci" - python3 docker_manifests_merge.py --suffix amd64 --suffix aarch64 - - name: Upload images files to artifacts - uses: actions/upload-artifact@v2 - with: - name: changed_images - path: ${{ runner.temp }}/changed_images.json - DocsRelease: - needs: DockerHubPush - runs-on: [self-hosted, func-tester] - steps: - - name: Set envs - # https://docs.github.com/en/actions/learn-github-actions/workflow-commands-for-github-actions#multiline-strings - run: | - cat >> "$GITHUB_ENV" << 'EOF' - TEMP_PATH=${{runner.temp}}/docs_release - REPO_COPY=${{runner.temp}}/docs_release/ClickHouse - CLOUDFLARE_TOKEN=${{secrets.CLOUDFLARE}} - ROBOT_CLICKHOUSE_SSH_KEY< Date: Fri, 1 Apr 2022 11:15:41 -0600 Subject: [PATCH 086/239] Fixing conflicts with source repo --- docs/en/engines/table-engines/mergetree-family/mergetree.md | 2 +- docs/en/sql-reference/aggregate-functions/reference/index.md | 1 - .../external-dictionaries/external-dicts-dict-lifetime.md | 2 +- 3 files changed, 2 insertions(+), 3 deletions(-) diff --git a/docs/en/engines/table-engines/mergetree-family/mergetree.md b/docs/en/engines/table-engines/mergetree-family/mergetree.md index 095adc32505..1195ee55dc7 100644 --- a/docs/en/engines/table-engines/mergetree-family/mergetree.md +++ 
b/docs/en/engines/table-engines/mergetree-family/mergetree.md @@ -693,7 +693,7 @@ Tags: - `volume_name_N` — Volume name. Volume names must be unique. - `disk` — a disk within a volume. - `max_data_part_size_bytes` — the maximum size of a part that can be stored on any of the volume’s disks. If the a size of a merged part estimated to be bigger than `max_data_part_size_bytes` then this part will be written to a next volume. Basically this feature allows to keep new/small parts on a hot (SSD) volume and move them to a cold (HDD) volume when they reach large size. Do not use this setting if your policy has only one volume. -- `move_factor` — when the amount of available space gets lower than this factor, data automatically start to move on the next volume if any (by default, 0.1). +- `move_factor` — when the amount of available space gets lower than this factor, data automatically starts to move on the next volume if any (by default, 0.1). ClickHouse sorts existing parts by size from largest to smallest (in descending order) and selects parts with the total size that is sufficient to meet the `move_factor` condition. If the total size of all parts is insufficient, all parts will be moved. - `prefer_not_to_merge` — Disables merging of data parts on this volume. When this setting is enabled, merging data on this volume is not allowed. This allows controlling how ClickHouse works with slow disks. Cofiguration examples: diff --git a/docs/en/sql-reference/aggregate-functions/reference/index.md b/docs/en/sql-reference/aggregate-functions/reference/index.md index 4bbd00043a8..cd71bca2556 100644 --- a/docs/en/sql-reference/aggregate-functions/reference/index.md +++ b/docs/en/sql-reference/aggregate-functions/reference/index.md @@ -35,7 +35,6 @@ ClickHouse-specific aggregate functions: - [groupArrayInsertAt](../../../sql-reference/aggregate-functions/reference/grouparrayinsertat.md) - [groupArrayMovingAvg](../../../sql-reference/aggregate-functions/reference/grouparraymovingavg.md) - [groupArrayMovingSum](../../../sql-reference/aggregate-functions/reference/grouparraymovingsum.md) -- [groupArraySorted](../../../sql-reference/aggregate-functions/reference/grouparraysorted.md) - [groupBitAnd](../../../sql-reference/aggregate-functions/reference/groupbitand.md) - [groupBitOr](../../../sql-reference/aggregate-functions/reference/groupbitor.md) - [groupBitXor](../../../sql-reference/aggregate-functions/reference/groupbitxor.md) diff --git a/docs/en/sql-reference/dictionaries/external-dictionaries/external-dicts-dict-lifetime.md b/docs/en/sql-reference/dictionaries/external-dictionaries/external-dicts-dict-lifetime.md index 83814781005..ab83017f263 100644 --- a/docs/en/sql-reference/dictionaries/external-dictionaries/external-dicts-dict-lifetime.md +++ b/docs/en/sql-reference/dictionaries/external-dictionaries/external-dicts-dict-lifetime.md @@ -5,7 +5,7 @@ sidebar_label: Dictionary Updates # Dictionary Updates -ClickHouse periodically updates the dictionaries. The update interval for fully downloaded dictionaries and the invalidation interval for cached dictionaries are defined in the `` tag in seconds. +ClickHouse periodically updates the dictionaries. The update interval for fully downloaded dictionaries and the invalidation interval for cached dictionaries are defined in the `lifetime` tag in seconds. Dictionary updates (other than loading for first use) do not block queries. During updates, the old version of a dictionary is used. 
If an error occurs during an update, the error is written to the server log, and queries continue using the old version of dictionaries. From 59e1ef9577083f369d847e0dda9421274bf7ab17 Mon Sep 17 00:00:00 2001 From: rfraposa Date: Fri, 1 Apr 2022 11:18:06 -0600 Subject: [PATCH 087/239] Update grouparraysorted.md --- .../aggregate-functions/reference/grouparraysorted.md | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/docs/en/sql-reference/aggregate-functions/reference/grouparraysorted.md b/docs/en/sql-reference/aggregate-functions/reference/grouparraysorted.md index 0237885bcb6..e34fcbc5788 100644 --- a/docs/en/sql-reference/aggregate-functions/reference/grouparraysorted.md +++ b/docs/en/sql-reference/aggregate-functions/reference/grouparraysorted.md @@ -1,5 +1,5 @@ --- -toc_priority: 108 +sidebar_position: 108 --- # groupArraySorted {#groupArraySorted} From e5ee39541e94cd5b3dff881904cdaf03f217435a Mon Sep 17 00:00:00 2001 From: rfraposa Date: Fri, 1 Apr 2022 11:34:35 -0600 Subject: [PATCH 088/239] Fixing conflicts with source repo --- docs/en/operations/named-collections.md | 230 ++++++++++++++++++ .../sql-reference/statements/create/table.md | 2 +- 2 files changed, 231 insertions(+), 1 deletion(-) create mode 100644 docs/en/operations/named-collections.md diff --git a/docs/en/operations/named-collections.md b/docs/en/operations/named-collections.md new file mode 100644 index 00000000000..52520ba76b7 --- /dev/null +++ b/docs/en/operations/named-collections.md @@ -0,0 +1,230 @@ +--- +sidebar_position: 69 +sidebar_label: "Named connections" +--- + +# Storing details for connecting to external sources in configuration files {#named-collections} + +Details for connecting to external sources (dictionaries, tables, table functions) can be saved +in configuration files and thus simplify the creation of objects and hide credentials +from users with only SQL access. + +Parameters can be set in XML `CSV` and overridden in SQL `, format = 'TSV'`. +The parameters in SQL can be overridden using format `key` = `value`: `compression_method = 'gzip'`. + +Named connections are stored in the `config.xml` file of the ClickHouse server in the `` section and are applied when ClickHouse starts. + +Example of configuration: +```xml +$ cat /etc/clickhouse-server/config.d/named_collections.xml + + + ... + + +``` + +## Named connections for accessing S3. + +The description of parameters see [s3 Table Function](../sql-reference/table-functions/s3.md). + +Example of configuration: +```xml + + + + AKIAIOSFODNN7EXAMPLE + wJalrXUtnFEMI/K7MDENG/bPxRfiCYEXAMPLEKEY + CSV + https://s3.us-east-1.amazonaws.com/yourbucket/mydata/ + + + +``` + +### Example of using named connections with the s3 function + +```sql +INSERT INTO FUNCTION s3(s3_mydata, filename = 'test_file.tsv.gz', + format = 'TSV', structure = 'number UInt64', compression_method = 'gzip') +SELECT * FROM numbers(10000); + +SELECT count() +FROM s3(s3_mydata, filename = 'test_file.tsv.gz') + +┌─count()─┐ +│ 10000 │ +└─────────┘ +1 rows in set. Elapsed: 0.279 sec. Processed 10.00 thousand rows, 90.00 KB (35.78 thousand rows/s., 322.02 KB/s.) 
+``` + +### Example of using named connections with an S3 table + +```sql +CREATE TABLE s3_engine_table (number Int64) +ENGINE=S3(s3_mydata, url='https://s3.us-east-1.amazonaws.com/yourbucket/mydata/test_file.tsv.gz', format = 'TSV') +SETTINGS input_format_with_names_use_header = 0; + +SELECT * FROM s3_engine_table LIMIT 3; +┌─number─┐ +│ 0 │ +│ 1 │ +│ 2 │ +└────────┘ +``` + +## Named connections for accessing MySQL database + +The description of parameters see [mysql](../sql-reference/table-functions/mysql.md). + +Example of configuration: +```xml + + + + myuser + mypass + 127.0.0.1 + 3306 + test + 8 + 1 + 1 + + + +``` + +### Example of using named connections with the mysql function + +```sql +SELECT count() FROM mysql(mymysql, table = 'test'); + +┌─count()─┐ +│ 3 │ +└─────────┘ +``` + +### Example of using named connections with an MySQL table + +```sql +CREATE TABLE mytable(A Int64) ENGINE = MySQL(mymysql, table = 'test', connection_pool_size=3, replace_query=0); +SELECT count() FROM mytable; + +┌─count()─┐ +│ 3 │ +└─────────┘ +``` + +### Example of using named connections with database with engine MySQL + +```sql +CREATE DATABASE mydatabase ENGINE = MySQL(mymysql); + +SHOW TABLES FROM mydatabase; + +┌─name───┐ +│ source │ +│ test │ +└────────┘ +``` + +### Example of using named connections with an external dictionary with source MySQL + +```sql +CREATE DICTIONARY dict (A Int64, B String) +PRIMARY KEY A +SOURCE(MYSQL(NAME mymysql TABLE 'source')) +LIFETIME(MIN 1 MAX 2) +LAYOUT(HASHED()); + +SELECT dictGet('dict', 'B', 2); + +┌─dictGet('dict', 'B', 2)─┐ +│ two │ +└─────────────────────────┘ +``` + +## Named connections for accessing PostgreSQL database + +The description of parameters see [postgresql](../sql-reference/table-functions/postgresql.md). + +Example of configuration: +```xml + + + + pguser + jw8s0F4 + 127.0.0.1 + 5432 + test + test_schema + 8 + + + +``` + +### Example of using named connections with the postgresql function + +```sql +SELECT * FROM postgresql(mypg, table = 'test'); + +┌─a─┬─b───┐ +│ 2 │ two │ +│ 1 │ one │ +└───┴─────┘ + + +SELECT * FROM postgresql(mypg, table = 'test', schema = 'public'); + +┌─a─┐ +│ 1 │ +│ 2 │ +│ 3 │ +└───┘ +``` + + +### Example of using named connections with database with engine PostgreSQL + +```sql +CREATE TABLE mypgtable (a Int64) ENGINE = PostgreSQL(mypg, table = 'test', schema = 'public'); + +SELECT * FROM mypgtable; + +┌─a─┐ +│ 1 │ +│ 2 │ +│ 3 │ +└───┘ +``` + +### Example of using named connections with database with engine PostgreSQL + +```sql +CREATE DATABASE mydatabase ENGINE = PostgreSQL(mypg); + +SHOW TABLES FROM mydatabase + +┌─name─┐ +│ test │ +└──────┘ +``` + +### Example of using named connections with an external dictionary with source POSTGRESQL + +```sql +CREATE DICTIONARY dict (a Int64, b String) +PRIMARY KEY a +SOURCE(POSTGRESQL(NAME mypg TABLE test)) +LIFETIME(MIN 1 MAX 2) +LAYOUT(HASHED()); + +SELECT dictGet('dict', 'b', 2); + +┌─dictGet('dict', 'b', 2)─┐ +│ two │ +└─────────────────────────┘ +``` \ No newline at end of file diff --git a/docs/en/sql-reference/statements/create/table.md b/docs/en/sql-reference/statements/create/table.md index 82aad344117..ea98796427e 100644 --- a/docs/en/sql-reference/statements/create/table.md +++ b/docs/en/sql-reference/statements/create/table.md @@ -116,7 +116,7 @@ In addition, this column is not substituted when using an asterisk in a SELECT q `EPHEMERAL expr` -Ephemeral column. 
Such a column isn't stored in the table and cannot be SELECTed, but can be referenced in the defaults of CREATE statement. +Ephemeral column. Such a column isn't stored in the table and cannot be SELECTed, but can be referenced in the defaults of CREATE statement. If `expr` is omitted type for column is required. INSERT without list of columns will skip such column, so SELECT/INSERT invariant is preserved - the dump obtained using `SELECT *` can be inserted back into the table using INSERT without specifying the list of columns. ### ALIAS {#alias} From a565a937409120993cdc450884ff1ac8915086d5 Mon Sep 17 00:00:00 2001 From: shuchaome Date: Mon, 28 Mar 2022 12:23:51 +0800 Subject: [PATCH 089/239] reduce mutex scope when setenv LIBHDFS3_CONF --- src/Storages/HDFS/HDFSCommon.cpp | 23 ++++++++++++++--------- src/Storages/HDFS/ReadBufferFromHDFS.cpp | 8 -------- 2 files changed, 14 insertions(+), 17 deletions(-) diff --git a/src/Storages/HDFS/HDFSCommon.cpp b/src/Storages/HDFS/HDFSCommon.cpp index b186808a2db..da08f429457 100644 --- a/src/Storages/HDFS/HDFSCommon.cpp +++ b/src/Storages/HDFS/HDFSCommon.cpp @@ -25,6 +25,8 @@ namespace ErrorCodes const String HDFSBuilderWrapper::CONFIG_PREFIX = "hdfs"; const String HDFS_URL_REGEXP = "^hdfs://[^/]*/.*"; +std::once_flag init_libhdfs3_conf_flag; + void HDFSBuilderWrapper::loadFromConfig(const Poco::Util::AbstractConfiguration & config, const String & prefix, bool isUser) { @@ -123,19 +125,22 @@ HDFSBuilderWrapper createHDFSBuilder(const String & uri_str, const Poco::Util::A throw Exception("Illegal HDFS URI: " + uri.toString(), ErrorCodes::BAD_ARGUMENTS); // Shall set env LIBHDFS3_CONF *before* HDFSBuilderWrapper construction. - String libhdfs3_conf = config.getString(HDFSBuilderWrapper::CONFIG_PREFIX + ".libhdfs3_conf", ""); - if (!libhdfs3_conf.empty()) + std::call_once(init_libhdfs3_conf_flag, [&config]() { - if (std::filesystem::path{libhdfs3_conf}.is_relative() && !std::filesystem::exists(libhdfs3_conf)) + String libhdfs3_conf = config.getString(HDFSBuilderWrapper::CONFIG_PREFIX + ".libhdfs3_conf", ""); + if (!libhdfs3_conf.empty()) { - const String config_path = config.getString("config-file", "config.xml"); - const auto config_dir = std::filesystem::path{config_path}.remove_filename(); - if (std::filesystem::exists(config_dir / libhdfs3_conf)) - libhdfs3_conf = std::filesystem::absolute(config_dir / libhdfs3_conf); + if (std::filesystem::path{libhdfs3_conf}.is_relative() && !std::filesystem::exists(libhdfs3_conf)) + { + const String config_path = config.getString("config-file", "config.xml"); + const auto config_dir = std::filesystem::path{config_path}.remove_filename(); + if (std::filesystem::exists(config_dir / libhdfs3_conf)) + libhdfs3_conf = std::filesystem::absolute(config_dir / libhdfs3_conf); + } + setenv("LIBHDFS3_CONF", libhdfs3_conf.c_str(), 1); } + }); - setenv("LIBHDFS3_CONF", libhdfs3_conf.c_str(), 1); - } HDFSBuilderWrapper builder; if (builder.get() == nullptr) throw Exception("Unable to create builder to connect to HDFS: " + diff --git a/src/Storages/HDFS/ReadBufferFromHDFS.cpp b/src/Storages/HDFS/ReadBufferFromHDFS.cpp index 902307fc828..1bafa49e55b 100644 --- a/src/Storages/HDFS/ReadBufferFromHDFS.cpp +++ b/src/Storages/HDFS/ReadBufferFromHDFS.cpp @@ -22,8 +22,6 @@ ReadBufferFromHDFS::~ReadBufferFromHDFS() = default; struct ReadBufferFromHDFS::ReadBufferFromHDFSImpl : public BufferWithOwnMemory { - /// HDFS create/open functions are not thread safe - static std::mutex hdfs_init_mutex; String hdfs_uri; String hdfs_file_path; 
@@ -46,8 +44,6 @@ struct ReadBufferFromHDFS::ReadBufferFromHDFSImpl : public BufferWithOwnMemory Date: Sat, 2 Apr 2022 11:45:48 +0000 Subject: [PATCH 090/239] Update test reference file --- .../02149_schema_inference.reference | 20 +++++++++---------- 1 file changed, 10 insertions(+), 10 deletions(-) diff --git a/tests/queries/0_stateless/02149_schema_inference.reference b/tests/queries/0_stateless/02149_schema_inference.reference index f46e3bee101..3e1d272bbf7 100644 --- a/tests/queries/0_stateless/02149_schema_inference.reference +++ b/tests/queries/0_stateless/02149_schema_inference.reference @@ -38,32 +38,32 @@ JSONCompactEachRow c1 Nullable(Float64) c2 Array(Tuple(Nullable(Float64), Nullable(String))) c3 Map(String, Nullable(Float64)) -c4 Nullable(UInt8) -42.42 [(1,'String'),(2,'abcd')] {'key':42,'key2':24} 1 +c4 Nullable(Bool) +42.42 [(1,'String'),(2,'abcd')] {'key':42,'key2':24} true c1 Nullable(Float64) c2 Array(Tuple(Nullable(Float64), Nullable(String))) c3 Map(String, Nullable(Float64)) -c4 Nullable(UInt8) +c4 Nullable(Bool) \N [(1,'String'),(2,NULL)] {'key':NULL,'key2':24} \N -32 [(2,'String 2'),(3,'hello')] {'key3':4242,'key4':2424} 1 +32 [(2,'String 2'),(3,'hello')] {'key3':4242,'key4':2424} true JSONCompactEachRowWithNames a Nullable(Float64) b Array(Tuple(Nullable(Float64), Nullable(String))) c Map(String, Nullable(Float64)) -d Nullable(UInt8) -42.42 [(1,'String'),(2,'abcd')] {'key':42,'key2':24} 1 +d Nullable(Bool) +42.42 [(1,'String'),(2,'abcd')] {'key':42,'key2':24} true JSONEachRow -d Nullable(UInt8) +d Nullable(Bool) b Array(Tuple(Nullable(Float64), Nullable(String))) c Map(String, Nullable(Float64)) a Nullable(Float64) -1 [(1,'String'),(2,'abcd')] {'key':42,'key2':24} 42.42 -d Nullable(UInt8) +true [(1,'String'),(2,'abcd')] {'key':42,'key2':24} 42.42 +d Nullable(Bool) b Array(Tuple(Nullable(Float64), Nullable(String))) c Map(String, Nullable(Float64)) a Nullable(Float64) \N [(1,'String'),(2,NULL)] {'key':NULL,'key2':24} \N -1 [(2,'String 2'),(3,'hello')] {'key3':4242,'key4':2424} 32 +true [(2,'String 2'),(3,'hello')] {'key3':4242,'key4':2424} 32 b Nullable(String) c Array(Nullable(Float64)) a Nullable(Float64) From f7902ae2d1029a7127cc0fcdefa705f69db96c21 Mon Sep 17 00:00:00 2001 From: Kruglov Pavel <48961922+Avogar@users.noreply.github.com> Date: Sat, 2 Apr 2022 13:53:30 +0200 Subject: [PATCH 091/239] Fix test --- .../02222_create_table_without_columns_metadata.reference | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/tests/queries/0_stateless/02222_create_table_without_columns_metadata.reference b/tests/queries/0_stateless/02222_create_table_without_columns_metadata.reference index 6c1dc4ebeb9..f32b0eb8a92 100644 --- a/tests/queries/0_stateless/02222_create_table_without_columns_metadata.reference +++ b/tests/queries/0_stateless/02222_create_table_without_columns_metadata.reference @@ -1,3 +1,3 @@ -CREATE TABLE test.test\n(\n `x` Nullable(Float64),\n `y` Nullable(String)\n)\nENGINE = File(\'JSONEachRow\', \'data.jsonl\') +CREATE TABLE default.test\n(\n `x` Nullable(Float64),\n `y` Nullable(String)\n)\nENGINE = File(\'JSONEachRow\', \'data.jsonl\') OK OK From 60c2b54dc96229786d6816f3d9419b0627ba0ad1 Mon Sep 17 00:00:00 2001 From: Kruglov Pavel <48961922+Avogar@users.noreply.github.com> Date: Sat, 2 Apr 2022 13:55:37 +0200 Subject: [PATCH 092/239] Fix test --- .../0_stateless/02247_names_order_in_json_and_tskv.reference | 2 ++ 1 file changed, 2 insertions(+) diff --git a/tests/queries/0_stateless/02247_names_order_in_json_and_tskv.reference 
b/tests/queries/0_stateless/02247_names_order_in_json_and_tskv.reference index e49ad3f8d93..a275a9f6bed 100644 --- a/tests/queries/0_stateless/02247_names_order_in_json_and_tskv.reference +++ b/tests/queries/0_stateless/02247_names_order_in_json_and_tskv.reference @@ -4,6 +4,7 @@ c Nullable(String) 1 s1 \N 2 } [2] \N \N \N +\N \N \N \N \N [3] b Nullable(String) a Nullable(String) @@ -28,5 +29,6 @@ e Nullable(Float64) 1 \N \N \N \N 2 3 \N \N \N \N \N +\N \N \N \N \N \N \N 3 3 3 1 \N From 3ae36ac4831c2cd8f31b764a4622bf341bab9ca2 Mon Sep 17 00:00:00 2001 From: avogar Date: Sat, 2 Apr 2022 11:59:32 +0000 Subject: [PATCH 093/239] Fix tests --- .../0_stateless/02247_names_order_in_json_and_tskv.reference | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/tests/queries/0_stateless/02247_names_order_in_json_and_tskv.reference b/tests/queries/0_stateless/02247_names_order_in_json_and_tskv.reference index a275a9f6bed..49a285dc11a 100644 --- a/tests/queries/0_stateless/02247_names_order_in_json_and_tskv.reference +++ b/tests/queries/0_stateless/02247_names_order_in_json_and_tskv.reference @@ -12,6 +12,7 @@ c Nullable(String) e Nullable(String) 1 \N \N \N \N 2 3 \N +\N \N \N \N \N \N \N 3 3 3 1 \N a Nullable(Float64) @@ -29,6 +30,5 @@ e Nullable(Float64) 1 \N \N \N \N 2 3 \N \N \N \N \N -\N \N \N \N \N \N \N 3 3 3 1 \N From 860b1a1b1bd8a1dec6d1176396400937d0346977 Mon Sep 17 00:00:00 2001 From: Kruglov Pavel <48961922+Avogar@users.noreply.github.com> Date: Sat, 2 Apr 2022 14:04:04 +0200 Subject: [PATCH 094/239] Update 02248_nullable_custom_types_to_string.sql --- .../0_stateless/02248_nullable_custom_types_to_string.sql | 1 + 1 file changed, 1 insertion(+) diff --git a/tests/queries/0_stateless/02248_nullable_custom_types_to_string.sql b/tests/queries/0_stateless/02248_nullable_custom_types_to_string.sql index 313f703fd03..b6032f7741b 100644 --- a/tests/queries/0_stateless/02248_nullable_custom_types_to_string.sql +++ b/tests/queries/0_stateless/02248_nullable_custom_types_to_string.sql @@ -1,3 +1,4 @@ +-- Tags: no-backward-compatibility-check:22.3.2.2 select toString(toNullable(true)); select toString(CAST(NULL, 'Nullable(Bool)')); select toString(toNullable(toIPv4('0.0.0.0'))); From 195b4c47ea8d45e1ddaf993b819d5a17eca69064 Mon Sep 17 00:00:00 2001 From: Azat Khuzhin Date: Thu, 31 Mar 2022 20:40:33 +0300 Subject: [PATCH 095/239] Avoid processing per-column TTL multiple times Before this patch ttl.txt will not be written for per-column TTLs, and hence it will be calculated every time after server restart of DETACH/ATTACH cycle (note, that it will work w/o restart since in-memory representation will avoid this). v2: convert test to .sh to get correct current database over default for MV v3: extract UUID to avoid error like in [1]: [ 490 ] {} void DB::SystemLog::flushImpl(const std::vector &, uint64_t) []: Code: 349. DB::Exception: Cannot convert NULL value to non-Nullable type: While processing query_id LIKE concat('%', CAST(_CAST(NULL, 'Nullable(UUID)') AS uuid, 'String'), '%'): while pushing to view test_0hc2ro.this_text_log (c64e5af4-059e-4330-a728-354ecf83c031). 
(CANNOT_INSERT_NULL_IN_ORDINARY_COLUMN) [1]: https://s3.amazonaws.com/clickhouse-test-reports/35820/a512d322b024d37d2f1082c4833f59f86057555f/stateless_tests_flaky_check__address__actions_.html v4: add no-parallel to avoid issues with disappeared underlying table while pushing to text_log Signed-off-by: Azat Khuzhin --- .../MergeTree/MergeTreeDataPartTTLInfo.h | 2 +- .../0_stateless/02262_column_ttl.reference | 1 + tests/queries/0_stateless/02262_column_ttl.sh | 51 +++++++++++++++++++ 3 files changed, 53 insertions(+), 1 deletion(-) create mode 100644 tests/queries/0_stateless/02262_column_ttl.reference create mode 100755 tests/queries/0_stateless/02262_column_ttl.sh diff --git a/src/Storages/MergeTree/MergeTreeDataPartTTLInfo.h b/src/Storages/MergeTree/MergeTreeDataPartTTLInfo.h index 2b79ad1aac5..c60a7eec09a 100644 --- a/src/Storages/MergeTree/MergeTreeDataPartTTLInfo.h +++ b/src/Storages/MergeTree/MergeTreeDataPartTTLInfo.h @@ -81,7 +81,7 @@ struct MergeTreeDataPartTTLInfos bool empty() const { /// part_min_ttl in minimum of rows, rows_where and group_by TTLs - return !part_min_ttl && moves_ttl.empty() && recompression_ttl.empty(); + return !part_min_ttl && moves_ttl.empty() && recompression_ttl.empty() && columns_ttl.empty(); } }; diff --git a/tests/queries/0_stateless/02262_column_ttl.reference b/tests/queries/0_stateless/02262_column_ttl.reference new file mode 100644 index 00000000000..f59cb48c5f5 --- /dev/null +++ b/tests/queries/0_stateless/02262_column_ttl.reference @@ -0,0 +1 @@ +1 0 diff --git a/tests/queries/0_stateless/02262_column_ttl.sh b/tests/queries/0_stateless/02262_column_ttl.sh new file mode 100755 index 00000000000..affb0c802ff --- /dev/null +++ b/tests/queries/0_stateless/02262_column_ttl.sh @@ -0,0 +1,51 @@ +#!/usr/bin/env bash +# Tags: no-parallel +# ^^^^^^^^^^^ +# Since the underlying view may disappears while flushing log, and leads to: +# +# DB::Exception: Table test_x449vo..inner_id.9c14fb82-e6b1-4d1a-85a6-935c3a2a2029 is dropped. (TABLE_IS_DROPPED) +# + +CUR_DIR=$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd) +# shellcheck source=../shell_config.sh +. "$CUR_DIR"/../shell_config.sh + +# regression test for columns TTLs +# note, that this should be written in .sh since we need $CLICKHOUSE_DATABASE +# not 'default' to catch text_log + +$CLICKHOUSE_CLIENT -nm -q " + drop table if exists ttl_02262; + drop table if exists this_text_log; + + create table ttl_02262 (date Date, key Int, value String TTL date + interval 1 month) engine=MergeTree order by key; + insert into ttl_02262 values ('2010-01-01', 2010, 'foo'); + optimize table ttl_02262 final; + + detach table ttl_02262; + attach table ttl_02262; + + -- create system.text_log + system flush logs; +" + +ttl_02262_uuid=$($CLICKHOUSE_CLIENT -q "select uuid from system.tables where database = '$CLICKHOUSE_DATABASE' and name = 'ttl_02262'") + +$CLICKHOUSE_CLIENT -nm -q " + -- OPTIMIZE TABLE x FINAL will be done in background + -- attach to it's log, via table UUID in query_id (see merger/mutator code). + create materialized view this_text_log engine=Memory() as + select * from system.text_log where query_id like '%${ttl_02262_uuid}%'; + + optimize table ttl_02262 final; + system flush logs; + -- If TTL will be applied again (during OPTIMIZE TABLE FINAL) it will produce the following message: + -- + -- Some TTL values were not calculated for part 201701_487_641_3. Will calculate them forcefully during merge. 
+ -- + -- Let's ensure that this is not happen anymore: + select count()>0, countIf(message LIKE '%TTL%') from this_text_log; + + drop table ttl_02262; + drop table this_text_log; +" From c33a6ced7b2a656dc7b76446ce9c54cf02c32e05 Mon Sep 17 00:00:00 2001 From: rfraposa Date: Sat, 2 Apr 2022 10:16:20 -0600 Subject: [PATCH 096/239] Fixed category links --- docs/en/engines/_category_.yml | 3 ++- docs/en/example-datasets/_category_.yml | 3 ++- 2 files changed, 4 insertions(+), 2 deletions(-) diff --git a/docs/en/engines/_category_.yml b/docs/en/engines/_category_.yml index f8554057fdc..0c462323df4 100644 --- a/docs/en/engines/_category_.yml +++ b/docs/en/engines/_category_.yml @@ -4,4 +4,5 @@ collapsible: true collapsed: true link: type: generated-index - title: Database & Table Engines \ No newline at end of file + title: Database & Table Engines + slug: /en/table-engines \ No newline at end of file diff --git a/docs/en/example-datasets/_category_.yml b/docs/en/example-datasets/_category_.yml index 5824de77e1d..310ce834a92 100644 --- a/docs/en/example-datasets/_category_.yml +++ b/docs/en/example-datasets/_category_.yml @@ -4,4 +4,5 @@ collapsible: true collapsed: true link: type: generated-index - title: Example Datasets \ No newline at end of file + title: Example Datasets + slug: /en/example-datasets \ No newline at end of file From afe0563856589c1c1daaa74813746744b41a6dc8 Mon Sep 17 00:00:00 2001 From: rfraposa Date: Sat, 2 Apr 2022 15:39:11 -0600 Subject: [PATCH 097/239] Adding the playground back in --- docs/en/playground.md | 45 +++++++++++++++++++++++++++++++++++++++++++ 1 file changed, 45 insertions(+) create mode 100644 docs/en/playground.md diff --git a/docs/en/playground.md b/docs/en/playground.md new file mode 100644 index 00000000000..6b42107af3a --- /dev/null +++ b/docs/en/playground.md @@ -0,0 +1,45 @@ +--- +sidebar_label: Playground +sidebar_position: 2 +keywords: [clickhouse, playground, getting, started, docs] +description: The ClickHouse Playground allows people to experiment with ClickHouse by running queries instantly, without setting up their server or cluster. +--- + +# ClickHouse Playground {#clickhouse-playground} + +[ClickHouse Playground](https://play.clickhouse.com/play?user=play) allows people to experiment with ClickHouse by running queries instantly, without setting up their server or cluster. +Several example datasets are available in Playground. + +You can make queries to Playground using any HTTP client, for example [curl](https://curl.haxx.se) or [wget](https://www.gnu.org/software/wget/), or set up a connection using [JDBC](../interfaces/jdbc.md) or [ODBC](../interfaces/odbc.md) drivers. More information about software products that support ClickHouse is available [here](../interfaces/index.md). + +## Credentials {#credentials} + +| Parameter | Value | +|:--------------------|:-----------------------------------| +| HTTPS endpoint | `https://play.clickhouse.com:443/` | +| Native TCP endpoint | `play.clickhouse.com:9440` | +| User | `explorer` or `play` | +| Password | (empty) | + +## Limitations {#limitations} + +The queries are executed as a read-only user. It implies some limitations: + +- DDL queries are not allowed +- INSERT queries are not allowed + +The service also have quotas on its usage. 
+ +## Examples {#examples} + +HTTPS endpoint example with `curl`: + +``` bash +curl "https://play.clickhouse.com/?user=explorer" --data-binary "SELECT 'Play ClickHouse'" +``` + +TCP endpoint example with [CLI](../interfaces/cli.md): + +``` bash +clickhouse client --secure --host play.clickhouse.com --user explorer +``` From 5d4a87778583560b56189a326a050332035fc423 Mon Sep 17 00:00:00 2001 From: rfraposa Date: Sat, 2 Apr 2022 15:44:53 -0600 Subject: [PATCH 098/239] Fixed slug for playground and install pages --- docs/en/install.md | 1 + docs/en/playground.md | 1 + 2 files changed, 2 insertions(+) diff --git a/docs/en/install.md b/docs/en/install.md index a5405143d77..036452f3697 100644 --- a/docs/en/install.md +++ b/docs/en/install.md @@ -3,6 +3,7 @@ sidebar_label: Installation sidebar_position: 1 keywords: [clickhouse, install, installation, docs] description: ClickHouse can run on any Linux, FreeBSD, or Mac OS X with x86_64, AArch64, or PowerPC64LE CPU architecture. +slug: /en/getting-started/install --- # Installation {#installation} diff --git a/docs/en/playground.md b/docs/en/playground.md index 6b42107af3a..fee687dd856 100644 --- a/docs/en/playground.md +++ b/docs/en/playground.md @@ -3,6 +3,7 @@ sidebar_label: Playground sidebar_position: 2 keywords: [clickhouse, playground, getting, started, docs] description: The ClickHouse Playground allows people to experiment with ClickHouse by running queries instantly, without setting up their server or cluster. +slug: /en/getting-started/playground --- # ClickHouse Playground {#clickhouse-playground} From 72a00e2c6253153f2695eb31a8762f32573f31e1 Mon Sep 17 00:00:00 2001 From: rfraposa Date: Sat, 2 Apr 2022 17:38:54 -0600 Subject: [PATCH 099/239] Fixed broken links --- docs/en/development/continuous-integration.md | 2 +- docs/en/engines/table-engines/integrations/kafka.md | 2 +- docs/en/example-datasets/cell-towers.md | 2 +- docs/en/example-datasets/menus.md | 10 +++++----- docs/en/example-datasets/metrica.md | 2 +- docs/en/example-datasets/opensky.md | 4 ++-- docs/en/example-datasets/recipes.md | 8 ++++---- docs/en/example-datasets/uk-price-paid.md | 10 +++++----- docs/en/install.md | 6 +++--- docs/en/interfaces/cli.md | 2 +- .../optimizing-performance/sampling-query-profiler.md | 2 +- docs/en/operations/performance-test.md | 2 +- docs/en/operations/requirements.md | 4 ++-- docs/en/operations/troubleshooting.md | 4 ++-- docs/en/playground.md | 4 ++-- .../aggregate-functions/reference/anyheavy.md | 2 +- .../aggregate-functions/reference/topk.md | 2 +- .../dictionaries/external-dictionaries/_category_.yml | 3 ++- .../external-dicts-dict-hierarchical.md | 4 ++-- .../external-dicts-dict-sources.md | 2 +- .../external-dicts-dict-structure.md | 2 +- docs/en/sql-reference/functions/other-functions.md | 2 +- docs/en/sql-reference/statements/create/table.md | 2 +- .../en/sql-reference/statements/select/into-outfile.md | 2 +- docs/en/sql-reference/statements/system.md | 2 +- 25 files changed, 44 insertions(+), 43 deletions(-) diff --git a/docs/en/development/continuous-integration.md b/docs/en/development/continuous-integration.md index 379b78a2c42..b3172d103f0 100644 --- a/docs/en/development/continuous-integration.md +++ b/docs/en/development/continuous-integration.md @@ -151,7 +151,7 @@ checks page](../development/build.md#you-dont-have-to-build-clickhouse), or buil ## Functional Stateful Tests -Runs [stateful functional tests](tests.md#functional-tests). Treat them in the same way as the functional stateless tests. 
The difference is that they require `hits` and `visits` tables from the [Yandex.Metrica dataset](../getting-started/example-datasets/metrica.md) to run. +Runs [stateful functional tests](tests.md#functional-tests). Treat them in the same way as the functional stateless tests. The difference is that they require `hits` and `visits` tables from the [clickstream dataset](../example-datasets/metrica.md) to run. ## Integration Tests diff --git a/docs/en/engines/table-engines/integrations/kafka.md b/docs/en/engines/table-engines/integrations/kafka.md index 90e0925f531..3a8d98e1ca9 100644 --- a/docs/en/engines/table-engines/integrations/kafka.md +++ b/docs/en/engines/table-engines/integrations/kafka.md @@ -134,7 +134,7 @@ Example: SELECT level, sum(total) FROM daily GROUP BY level; ``` -To improve performance, received messages are grouped into blocks the size of [max_insert_block_size](../../../operations/settings/settings/#settings-max_insert_block_size). If the block wasn’t formed within [stream_flush_interval_ms](../../../operations/settings/settings/#stream-flush-interval-ms) milliseconds, the data will be flushed to the table regardless of the completeness of the block. +To improve performance, received messages are grouped into blocks the size of [max_insert_block_size](../../../operations/settings/settings.md#settings-max_insert_block_size). If the block wasn’t formed within [stream_flush_interval_ms](../../../operations/settings/settings.md/#stream-flush-interval-ms) milliseconds, the data will be flushed to the table regardless of the completeness of the block. To stop receiving topic data or to change the conversion logic, detach the materialized view: diff --git a/docs/en/example-datasets/cell-towers.md b/docs/en/example-datasets/cell-towers.md index 7a35a28faa6..6c3201ff2b2 100644 --- a/docs/en/example-datasets/cell-towers.md +++ b/docs/en/example-datasets/cell-towers.md @@ -95,7 +95,7 @@ SELECT mcc, count() FROM cell_towers GROUP BY mcc ORDER BY count() DESC LIMIT 10 So, the top countries are: the USA, Germany, and Russia. -You may want to create an [External Dictionary](../../sql-reference/dictionaries/external-dictionaries/external-dicts.md) in ClickHouse to decode these values. +You may want to create an [External Dictionary](../sql-reference/dictionaries/external-dictionaries/external-dicts.md) in ClickHouse to decode these values. ## Use case {#use-case} diff --git a/docs/en/example-datasets/menus.md b/docs/en/example-datasets/menus.md index c572dcdb491..c41195223a2 100644 --- a/docs/en/example-datasets/menus.md +++ b/docs/en/example-datasets/menus.md @@ -39,7 +39,7 @@ The data is normalized consisted of four tables: ## Create the Tables {#create-tables} -We use [Decimal](../../sql-reference/data-types/decimal.md) data type to store prices. +We use [Decimal](../sql-reference/data-types/decimal.md) data type to store prices. ```sql CREATE TABLE dish @@ -115,17 +115,17 @@ clickhouse-client --format_csv_allow_single_quotes 0 --input_format_null_as_defa clickhouse-client --format_csv_allow_single_quotes 0 --input_format_null_as_default 0 --date_time_input_format best_effort --query "INSERT INTO menu_item FORMAT CSVWithNames" < MenuItem.csv ``` -We use [CSVWithNames](../../interfaces/formats.md#csvwithnames) format as the data is represented by CSV with header. +We use [CSVWithNames](../interfaces/formats.md#csvwithnames) format as the data is represented by CSV with header. 
We disable `format_csv_allow_single_quotes` as only double quotes are used for data fields and single quotes can be inside the values and should not confuse the CSV parser. -We disable [input_format_null_as_default](../../operations/settings/settings.md#settings-input-format-null-as-default) as our data does not have [NULL](../../sql-reference/syntax.md#null-literal). Otherwise ClickHouse will try to parse `\N` sequences and can be confused with `\` in data. +We disable [input_format_null_as_default](../operations/settings/settings.md#settings-input-format-null-as-default) as our data does not have [NULL](../sql-reference/syntax.md#null-literal). Otherwise ClickHouse will try to parse `\N` sequences and can be confused with `\` in data. -The setting [date_time_input_format best_effort](../../operations/settings/settings.md#settings-date_time_input_format) allows to parse [DateTime](../../sql-reference/data-types/datetime.md) fields in wide variety of formats. For example, ISO-8601 without seconds like '2000-01-01 01:02' will be recognized. Without this setting only fixed DateTime format is allowed. +The setting [date_time_input_format best_effort](../operations/settings/settings.md#settings-date_time_input_format) allows to parse [DateTime](../sql-reference/data-types/datetime.md) fields in wide variety of formats. For example, ISO-8601 without seconds like '2000-01-01 01:02' will be recognized. Without this setting only fixed DateTime format is allowed. ## Denormalize the Data {#denormalize-data} -Data is presented in multiple tables in [normalized form](https://en.wikipedia.org/wiki/Database_normalization#Normal_forms). It means you have to perform [JOIN](../../sql-reference/statements/select/join.md#select-join) if you want to query, e.g. dish names from menu items. +Data is presented in multiple tables in [normalized form](https://en.wikipedia.org/wiki/Database_normalization#Normal_forms). It means you have to perform [JOIN](../sql-reference/statements/select/join.md#select-join) if you want to query, e.g. dish names from menu items. For typical analytical tasks it is way more efficient to deal with pre-JOINed data to avoid doing `JOIN` every time. It is called "denormalized" data. We will create a table `menu_item_denorm` where will contain all the data JOINed together: diff --git a/docs/en/example-datasets/metrica.md b/docs/en/example-datasets/metrica.md index 2194ad85091..c5ef74750a6 100644 --- a/docs/en/example-datasets/metrica.md +++ b/docs/en/example-datasets/metrica.md @@ -73,6 +73,6 @@ clickhouse-client --query "SELECT COUNT(*) FROM datasets.visits_v1" ## Example Queries {#example-queries} -[The ClickHouse tutorial](../../getting-started/tutorial.md) is based on this web analytics dataset, and the recommended way to get started with this dataset is to go through the tutorial. +[The ClickHouse tutorial](../../tutorial.md) is based on this web analytics dataset, and the recommended way to get started with this dataset is to go through the tutorial. Additional examples of queries to these tables can be found among [stateful tests](https://github.com/ClickHouse/ClickHouse/tree/master/tests/queries/1_stateful) of ClickHouse (they are named `test.hits` and `test.visits` there). 
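As an aside, the three import settings explained in the menus.md hunk above can also be exercised straight from SQL. This is only a sketch, not part of the patch: the file name is a placeholder, and it assumes the `file()` table function and CSV schema inference are available on the server in use.

```sql
-- Preview the raw CSV using the same parser settings as the import commands above.
SELECT *
FROM file('MenuItem.csv', 'CSVWithNames')
LIMIT 5
SETTINGS
    format_csv_allow_single_quotes = 0,
    input_format_null_as_default = 0,
    date_time_input_format = 'best_effort';
```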
diff --git a/docs/en/example-datasets/opensky.md b/docs/en/example-datasets/opensky.md index f55ebc79590..719f32d7c3e 100644 --- a/docs/en/example-datasets/opensky.md +++ b/docs/en/example-datasets/opensky.md @@ -60,9 +60,9 @@ ls -1 flightlist_*.csv.gz | xargs -P100 -I{} bash -c 'gzip -c -d "{}" | clickhou `xargs -P100` specifies to use up to 100 parallel workers but as we only have 30 files, the number of workers will be only 30. - For every file, `xargs` will run a script with `bash -c`. The script has substitution in form of `{}` and the `xargs` command will substitute the filename to it (we have asked it for `xargs` with `-I{}`). - The script will decompress the file (`gzip -c -d "{}"`) to standard output (`-c` parameter) and the output is redirected to `clickhouse-client`. -- We also asked to parse [DateTime](../../sql-reference/data-types/datetime.md) fields with extended parser ([--date_time_input_format best_effort](../../operations/settings/settings.md#settings-date_time_input_format)) to recognize ISO-8601 format with timezone offsets. +- We also asked to parse [DateTime](../sql-reference/data-types/datetime.md) fields with extended parser ([--date_time_input_format best_effort](../operations/settings/settings.md#settings-date_time_input_format)) to recognize ISO-8601 format with timezone offsets. -Finally, `clickhouse-client` will do insertion. It will read input data in [CSVWithNames](../../interfaces/formats.md#csvwithnames) format. +Finally, `clickhouse-client` will do insertion. It will read input data in [CSVWithNames](../interfaces/formats.md#csvwithnames) format. Parallel upload takes 24 seconds. diff --git a/docs/en/example-datasets/recipes.md b/docs/en/example-datasets/recipes.md index 9a27255e6a8..b01efc8de26 100644 --- a/docs/en/example-datasets/recipes.md +++ b/docs/en/example-datasets/recipes.md @@ -50,13 +50,13 @@ clickhouse-client --query " This is a showcase how to parse custom CSV, as it requires multiple tunes. Explanation: -- The dataset is in CSV format, but it requires some preprocessing on insertion; we use table function [input](../../sql-reference/table-functions/input.md) to perform preprocessing; +- The dataset is in CSV format, but it requires some preprocessing on insertion; we use table function [input](../sql-reference/table-functions/input.md) to perform preprocessing; - The structure of CSV file is specified in the argument of the table function `input`; - The field `num` (row number) is unneeded - we parse it from file and ignore; - We use `FORMAT CSVWithNames` but the header in CSV will be ignored (by command line parameter `--input_format_with_names_use_header 0`), because the header does not contain the name for the first field; - File is using only double quotes to enclose CSV strings; some strings are not enclosed in double quotes, and single quote must not be parsed as the string enclosing - that's why we also add the `--format_csv_allow_single_quote 0` parameter; - Some strings from CSV cannot parse, because they contain `\M/` sequence at the beginning of the value; the only value starting with backslash in CSV can be `\N` that is parsed as SQL NULL. 
We add `--input_format_allow_errors_num 10` parameter and up to ten malformed records can be skipped; -- There are arrays for ingredients, directions and NER fields; these arrays are represented in unusual form: they are serialized into string as JSON and then placed in CSV - we parse them as String and then use [JSONExtract](../../sql-reference/functions/json-functions/) function to transform it to Array. +- There are arrays for ingredients, directions and NER fields; these arrays are represented in unusual form: they are serialized into string as JSON and then placed in CSV - we parse them as String and then use [JSONExtract](../sql-reference/functions/json-functions/) function to transform it to Array. ## Validate the Inserted Data @@ -80,7 +80,7 @@ Result: ### Top Components by the Number of Recipes: -In this example we learn how to use [arrayJoin](../../sql-reference/functions/array-join/) function to expand an array into a set of rows. +In this example we learn how to use [arrayJoin](../sql-reference/functions/array-join/) function to expand an array into a set of rows. Query: @@ -185,7 +185,7 @@ Result: 10 rows in set. Elapsed: 0.215 sec. Processed 2.23 million rows, 1.48 GB (10.35 million rows/s., 6.86 GB/s.) ``` -In this example, we involve [has](../../sql-reference/functions/array-functions/#hasarr-elem) function to filter by array elements and sort by the number of directions. +In this example, we involve [has](../sql-reference/functions/array-functions/#hasarr-elem) function to filter by array elements and sort by the number of directions. There is a wedding cake that requires the whole 126 steps to produce! Show that directions: diff --git a/docs/en/example-datasets/uk-price-paid.md b/docs/en/example-datasets/uk-price-paid.md index e0f20639aea..e19e801dcf9 100644 --- a/docs/en/example-datasets/uk-price-paid.md +++ b/docs/en/example-datasets/uk-price-paid.md @@ -54,9 +54,9 @@ In this example, we define the structure of source data from the CSV file and sp The preprocessing is: - splitting the postcode to two different columns `postcode1` and `postcode2` that is better for storage and queries; - coverting the `time` field to date as it only contains 00:00 time; -- ignoring the [UUid](../../sql-reference/data-types/uuid.md) field because we don't need it for analysis; -- transforming `type` and `duration` to more readable Enum fields with function [transform](../../sql-reference/functions/other-functions.md#transform); -- transforming `is_new` and `category` fields from single-character string (`Y`/`N` and `A`/`B`) to [UInt8](../../sql-reference/data-types/int-uint.md#uint8-uint16-uint32-uint64-uint256-int8-int16-int32-int64-int128-int256) field with 0 and 1. +- ignoring the [UUid](../sql-reference/data-types/uuid.md) field because we don't need it for analysis; +- transforming `type` and `duration` to more readable Enum fields with function [transform](../sql-reference/functions/other-functions.md#transform); +- transforming `is_new` and `category` fields from single-character string (`Y`/`N` and `A`/`B`) to [UInt8](../sql-reference/data-types/int-uint.md#uint8-uint16-uint32-uint64-uint256-int8-int16-int32-int64-int128-int256) field with 0 and 1. Preprocessed data is piped directly to `clickhouse-client` to be inserted into ClickHouse table in streaming fashion. 
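On the uk-price-paid preprocessing notes just above: the two field conversions can be tried in isolation. The sketch below is illustrative only; the letter codes and labels are examples rather than the dataset's full mapping, and the real import command is not shown in this hunk.

```sql
-- Standalone illustration of the conversions described in the list above.
SELECT
    transform('T',
              ['D', 'S', 'T', 'F', 'O'],
              ['detached', 'semi-detached', 'terraced', 'flat', 'other'],
              'unknown') AS type,   -- single-letter code mapped to a readable value
    'Y' = 'Y' AS is_new;            -- a Y/N flag compared against 'Y' yields UInt8 0/1
```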
@@ -352,7 +352,7 @@ Result: ## Let's Speed Up Queries Using Projections {#speedup-with-projections} -[Projections](../../sql-reference/statements/alter/projection.md) allow to improve queries speed by storing pre-aggregated data. +[Projections](../sql-reference/statements/alter/projection.md) allow to improve queries speed by storing pre-aggregated data. ### Build a Projection {#build-projection} @@ -388,7 +388,7 @@ SETTINGS mutations_sync = 1; Let's run the same 3 queries. -[Enable](../../operations/settings/settings.md#allow-experimental-projection-optimization) projections for selects: +[Enable](../operations/settings/settings.md#allow-experimental-projection-optimization) projections for selects: ```sql SET allow_experimental_projection_optimization = 1; diff --git a/docs/en/install.md b/docs/en/install.md index 036452f3697..37cb113bc4a 100644 --- a/docs/en/install.md +++ b/docs/en/install.md @@ -216,7 +216,7 @@ Use the `clickhouse client` to connect to the server, or `clickhouse local` to p ### From Sources {#from-sources} -To manually compile ClickHouse, follow the instructions for [Linux](../development/build.md) or [Mac OS X](../development/build-osx.md). +To manually compile ClickHouse, follow the instructions for [Linux](./development/build.md) or [Mac OS X](./development/build-osx.md). You can compile packages and install them or use programs without installing packages. Also by building manually you can disable SSE 4.2 requirement or build for AArch64 CPUs. @@ -271,7 +271,7 @@ If the configuration file is in the current directory, you do not need to specif ClickHouse supports access restriction settings. They are located in the `users.xml` file (next to `config.xml`). By default, access is allowed from anywhere for the `default` user, without a password. See `user/default/networks`. -For more information, see the section [“Configuration Files”](../operations/configuration-files.md). +For more information, see the section [“Configuration Files”](./operations/configuration-files.md). After launching server, you can use the command-line client to connect to it: @@ -282,7 +282,7 @@ $ clickhouse-client By default, it connects to `localhost:9000` on behalf of the user `default` without a password. It can also be used to connect to a remote server using `--host` argument. The terminal must use UTF-8 encoding. -For more information, see the section [“Command-line client”](../interfaces/cli.md). +For more information, see the section [“Command-line client”](./interfaces/cli.md). Example: diff --git a/docs/en/interfaces/cli.md b/docs/en/interfaces/cli.md index 2e78bad6445..9ef1cea280a 100644 --- a/docs/en/interfaces/cli.md +++ b/docs/en/interfaces/cli.md @@ -7,7 +7,7 @@ sidebar_label: Command-Line Client ClickHouse provides a native command-line client: `clickhouse-client`. The client supports command-line options and configuration files. For more information, see [Configuring](#interfaces_cli_configuration). -[Install](../getting-started/index.md) it from the `clickhouse-client` package and run it with the command `clickhouse-client`. +[Install](../../quick-start.mdx) it from the `clickhouse-client` package and run it with the command `clickhouse-client`. 
``` bash $ clickhouse-client diff --git a/docs/en/operations/optimizing-performance/sampling-query-profiler.md b/docs/en/operations/optimizing-performance/sampling-query-profiler.md index 35e0157df6b..39e83545506 100644 --- a/docs/en/operations/optimizing-performance/sampling-query-profiler.md +++ b/docs/en/operations/optimizing-performance/sampling-query-profiler.md @@ -21,7 +21,7 @@ The default sampling frequency is one sample per second and both CPU and real ti To analyze the `trace_log` system table: -- Install the `clickhouse-common-static-dbg` package. See [Install from DEB Packages](../../getting-started/install.md#install-from-deb-packages). +- Install the `clickhouse-common-static-dbg` package. See [Install from DEB Packages](../../install.md#install-from-deb-packages). - Allow introspection functions by the [allow_introspection_functions](../../operations/settings/settings.md#settings-allow_introspection_functions) setting. diff --git a/docs/en/operations/performance-test.md b/docs/en/operations/performance-test.md index 47827f331c7..0ba3a9908a5 100644 --- a/docs/en/operations/performance-test.md +++ b/docs/en/operations/performance-test.md @@ -59,7 +59,7 @@ wget https://raw.githubusercontent.com/ClickHouse/ClickHouse/master/benchmark/cl chmod a+x benchmark-new.sh wget https://raw.githubusercontent.com/ClickHouse/ClickHouse/master/benchmark/clickhouse/queries.sql ``` -3. Download the [web analytics dataset](../getting-started/example-datasets/metrica.md) (“hits” table containing 100 million rows). +3. Download the [web analytics dataset](../example-datasets/metrica.md) (“hits” table containing 100 million rows). ```bash wget https://datasets.clickhouse.com/hits/partitions/hits_100m_obfuscated_v1.tar.xz tar xvf hits_100m_obfuscated_v1.tar.xz -C . diff --git a/docs/en/operations/requirements.md b/docs/en/operations/requirements.md index 698603dfb84..4b13033096f 100644 --- a/docs/en/operations/requirements.md +++ b/docs/en/operations/requirements.md @@ -3,7 +3,7 @@ sidebar_position: 44 sidebar_label: Requirements --- -# Requirements {#requirements} +# Requirements ## CPU {#cpu} @@ -56,4 +56,4 @@ The network bandwidth is critical for processing distributed queries with a larg ClickHouse is developed primarily for the Linux family of operating systems. The recommended Linux distribution is Ubuntu. The `tzdata` package should be installed in the system. -ClickHouse can also work in other operating system families. See details in the [Getting started](../getting-started/index.md) section of the documentation. +ClickHouse can also work in other operating system families. See details in the [install guide](../install.md) section of the documentation. diff --git a/docs/en/operations/troubleshooting.md b/docs/en/operations/troubleshooting.md index e0efe4f57f5..b67282c8aa1 100644 --- a/docs/en/operations/troubleshooting.md +++ b/docs/en/operations/troubleshooting.md @@ -3,7 +3,7 @@ sidebar_position: 46 sidebar_label: Troubleshooting --- -# Troubleshooting {#troubleshooting} +# Troubleshooting - [Installation](#troubleshooting-installation-errors) - [Connecting to the server](#troubleshooting-accepts-no-connections) @@ -15,7 +15,7 @@ sidebar_label: Troubleshooting ### You Cannot Get Deb Packages from ClickHouse Repository with Apt-get {#you-cannot-get-deb-packages-from-clickhouse-repository-with-apt-get} - Check firewall settings. 
-- If you cannot access the repository for any reason, download packages as described in the [Getting started](../getting-started/index.md) article and install them manually using the `sudo dpkg -i ` command. You will also need the `tzdata` package. +- If you cannot access the repository for any reason, download packages as described in the [install guide](../install.md) article and install them manually using the `sudo dpkg -i ` command. You will also need the `tzdata` package. ## Connecting to the Server {#troubleshooting-accepts-no-connections} diff --git a/docs/en/playground.md b/docs/en/playground.md index fee687dd856..ea7b2ccf2c5 100644 --- a/docs/en/playground.md +++ b/docs/en/playground.md @@ -11,7 +11,7 @@ slug: /en/getting-started/playground [ClickHouse Playground](https://play.clickhouse.com/play?user=play) allows people to experiment with ClickHouse by running queries instantly, without setting up their server or cluster. Several example datasets are available in Playground. -You can make queries to Playground using any HTTP client, for example [curl](https://curl.haxx.se) or [wget](https://www.gnu.org/software/wget/), or set up a connection using [JDBC](../interfaces/jdbc.md) or [ODBC](../interfaces/odbc.md) drivers. More information about software products that support ClickHouse is available [here](../interfaces/index.md). +You can make queries to Playground using any HTTP client, for example [curl](https://curl.haxx.se) or [wget](https://www.gnu.org/software/wget/), or set up a connection using [JDBC](./interfaces/jdbc.md) or [ODBC](./interfaces/odbc.md) drivers. More information about software products that support ClickHouse is available [here](./interfaces/index.md). ## Credentials {#credentials} @@ -39,7 +39,7 @@ HTTPS endpoint example with `curl`: curl "https://play.clickhouse.com/?user=explorer" --data-binary "SELECT 'Play ClickHouse'" ``` -TCP endpoint example with [CLI](../interfaces/cli.md): +TCP endpoint example with [CLI](./interfaces/cli.md): ``` bash clickhouse client --secure --host play.clickhouse.com --user explorer diff --git a/docs/en/sql-reference/aggregate-functions/reference/anyheavy.md b/docs/en/sql-reference/aggregate-functions/reference/anyheavy.md index 491754453e3..29144ee2f50 100644 --- a/docs/en/sql-reference/aggregate-functions/reference/anyheavy.md +++ b/docs/en/sql-reference/aggregate-functions/reference/anyheavy.md @@ -16,7 +16,7 @@ anyHeavy(column) **Example** -Take the [OnTime](../../../getting-started/example-datasets/ontime.md) data set and select any frequently occurring value in the `AirlineID` column. +Take the [OnTime](../../../example-datasets/ontime.md) data set and select any frequently occurring value in the `AirlineID` column. ``` sql SELECT anyHeavy(AirlineID) AS res diff --git a/docs/en/sql-reference/aggregate-functions/reference/topk.md b/docs/en/sql-reference/aggregate-functions/reference/topk.md index 19e98262899..d0f445c6710 100644 --- a/docs/en/sql-reference/aggregate-functions/reference/topk.md +++ b/docs/en/sql-reference/aggregate-functions/reference/topk.md @@ -28,7 +28,7 @@ If the parameter is omitted, default value 10 is used. **Example** -Take the [OnTime](../../../getting-started/example-datasets/ontime.md) data set and select the three most frequently occurring values in the `AirlineID` column. +Take the [OnTime](../../../example-datasets/ontime.md) data set and select the three most frequently occurring values in the `AirlineID` column. 
``` sql SELECT topK(3)(AirlineID) AS res diff --git a/docs/en/sql-reference/dictionaries/external-dictionaries/_category_.yml b/docs/en/sql-reference/dictionaries/external-dictionaries/_category_.yml index 77f42ba74d1..6708e987968 100644 --- a/docs/en/sql-reference/dictionaries/external-dictionaries/_category_.yml +++ b/docs/en/sql-reference/dictionaries/external-dictionaries/_category_.yml @@ -4,4 +4,5 @@ collapsible: true collapsed: true link: type: generated-index - title: External Dictionaries \ No newline at end of file + title: External Dictionaries + slug: /en/sql-reference/dictionaries/external-dictionaries \ No newline at end of file diff --git a/docs/en/sql-reference/dictionaries/external-dictionaries/external-dicts-dict-hierarchical.md b/docs/en/sql-reference/dictionaries/external-dictionaries/external-dicts-dict-hierarchical.md index c48ad217431..6df7cf231b9 100644 --- a/docs/en/sql-reference/dictionaries/external-dictionaries/external-dicts-dict-hierarchical.md +++ b/docs/en/sql-reference/dictionaries/external-dictionaries/external-dicts-dict-hierarchical.md @@ -5,7 +5,7 @@ sidebar_label: Hierarchical dictionaries # Hierarchical Dictionaries -ClickHouse supports hierarchical dictionaries with a [numeric key](../../../sql-reference/dictionaries/external-dictionaries/external-dicts-dict-structure.md#ext_dict-numeric-key). +ClickHouse supports hierarchical dictionaries with a [numeric key](../../dictionaries/external-dictionaries/external-dicts-dict-structure.md#numeric-key). Look at the following hierarchical structure: @@ -35,7 +35,7 @@ This hierarchy can be expressed as the following dictionary table. This table contains a column `parent_region` that contains the key of the nearest parent for the element. -ClickHouse supports the [hierarchical](../../../sql-reference/dictionaries/external-dictionaries/external-dicts-dict-structure.md#hierarchical-dict-attr) property for [external dictionary](../../../sql-reference/dictionaries/external-dictionaries/index.md) attributes. This property allows you to configure the hierarchical dictionary similar to described above. +ClickHouse supports the [hierarchical](../../../sql-reference/dictionaries/external-dictionaries/external-dicts-dict-structure.md#hierarchical-dict-attr) property for [external dictionary](../../../sql-reference/dictionaries/external-dictionaries/) attributes. This property allows you to configure the hierarchical dictionary similar to described above. The [dictGetHierarchy](../../../sql-reference/functions/ext-dict-functions.md#dictgethierarchy) function allows you to get the parent chain of an element. diff --git a/docs/en/sql-reference/dictionaries/external-dictionaries/external-dicts-dict-sources.md b/docs/en/sql-reference/dictionaries/external-dictionaries/external-dicts-dict-sources.md index e5502a17a3a..443f72ad72b 100644 --- a/docs/en/sql-reference/dictionaries/external-dictionaries/external-dicts-dict-sources.md +++ b/docs/en/sql-reference/dictionaries/external-dictionaries/external-dicts-dict-sources.md @@ -478,7 +478,7 @@ The `table` and `query` fields cannot be used together. And either one of the `t ClickHouse receives quoting symbols from ODBC-driver and quote all settings in queries to driver, so it’s necessary to set table name accordingly to table name case in database. -If you have a problems with encodings when using Oracle, see the corresponding [F.A.Q.](../../../faq/integration/oracle-odbc.md) item. 
+If you have a problems with encodings when using Oracle, see the corresponding [FAQ](../../../../faq/integration/oracle-odbc.md) item. ### Mysql {#dicts-external_dicts_dict_sources-mysql} diff --git a/docs/en/sql-reference/dictionaries/external-dictionaries/external-dicts-dict-structure.md b/docs/en/sql-reference/dictionaries/external-dictionaries/external-dicts-dict-structure.md index 2712bbf6911..cc98c9e666a 100644 --- a/docs/en/sql-reference/dictionaries/external-dictionaries/external-dicts-dict-structure.md +++ b/docs/en/sql-reference/dictionaries/external-dictionaries/external-dicts-dict-structure.md @@ -60,7 +60,7 @@ An xml structure can contain either `` or ``. DDL-query must contain si You must not describe key as an attribute. ::: -### Numeric Key {#ext_dict-numeric-key} +### Numeric Key {#numeric-key} Type: `UInt64`. diff --git a/docs/en/sql-reference/functions/other-functions.md b/docs/en/sql-reference/functions/other-functions.md index 45e9ef43c6a..14cd7337d76 100644 --- a/docs/en/sql-reference/functions/other-functions.md +++ b/docs/en/sql-reference/functions/other-functions.md @@ -1216,7 +1216,7 @@ SELECT * FROM table WHERE indexHint() **Example** -Here is the example of test data from the table [ontime](../../getting-started/example-datasets/ontime.md). +Here is the example of test data from the table [ontime](../../example-datasets/ontime.md). Input table: diff --git a/docs/en/sql-reference/statements/create/table.md b/docs/en/sql-reference/statements/create/table.md index 82c26630e54..c477e41ba02 100644 --- a/docs/en/sql-reference/statements/create/table.md +++ b/docs/en/sql-reference/statements/create/table.md @@ -239,7 +239,7 @@ Codecs: High compression levels are useful for asymmetric scenarios, like compress once, decompress repeatedly. Higher levels mean better compression and higher CPU usage. -### Specialized Codecs {#create-query-specialized-codecs} +### Specialized Codecs {#specialized-codecs} These codecs are designed to make compression more effective by using specific features of data. Some of these codecs do not compress data themself. Instead, they prepare the data for a common purpose codec, which compresses it better than without this preparation. diff --git a/docs/en/sql-reference/statements/select/into-outfile.md b/docs/en/sql-reference/statements/select/into-outfile.md index 08f53348cd3..b37285cb0cc 100644 --- a/docs/en/sql-reference/statements/select/into-outfile.md +++ b/docs/en/sql-reference/statements/select/into-outfile.md @@ -2,7 +2,7 @@ sidebar_label: INTO OUTFILE --- -# INTO OUTFILE Clause {#into-outfile-clause} +# INTO OUTFILE Clause `INTO OUTFILE` clause redirects the result of a `SELECT` query to a file on the **client** side. diff --git a/docs/en/sql-reference/statements/system.md b/docs/en/sql-reference/statements/system.md index 14eed981381..1d638ab3965 100644 --- a/docs/en/sql-reference/statements/system.md +++ b/docs/en/sql-reference/statements/system.md @@ -67,7 +67,7 @@ SELECT name, status FROM system.dictionaries; ## RELOAD MODELS {#query_language-system-reload-models} -Reloads all [CatBoost](../../guides/apply-catboost-model.md#applying-catboost-model-in-clickhouse) models if the configuration was updated without restarting the server. +Reloads all [CatBoost](../../../guides/developer/apply-catboost-model.md) models if the configuration was updated without restarting the server. 
**Syntax** From 6780cd2d8293f7eccb06c32c8c27acdfb2cc72ca Mon Sep 17 00:00:00 2001 From: Alexey Milovidov Date: Sun, 3 Apr 2022 04:52:54 +0200 Subject: [PATCH 100/239] Update URL in test visualizer --- utils/tests-visualizer/index.html | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/utils/tests-visualizer/index.html b/utils/tests-visualizer/index.html index 00076f683fa..15ee221aa8e 100644 --- a/utils/tests-visualizer/index.html +++ b/utils/tests-visualizer/index.html @@ -144,7 +144,7 @@ let test_names_query = ` async function loadDataByQuery(query) { const response = await fetch( - "https://play-ci.clickhouse.com?user=play&add_http_cors_header=1", + "https://play.clickhouse.com?user=play&add_http_cors_header=1", { method: "POST", body: query } ) if (!response.ok) throw new Error(`Data download failed\nHTTP status ${response.status}`); From 808c802caeedf8ca74c02250606255dc8d38a7be Mon Sep 17 00:00:00 2001 From: Alexey Milovidov Date: Sun, 3 Apr 2022 05:07:37 +0200 Subject: [PATCH 101/239] Disable session_log --- programs/server/config.xml | 9 ++++++--- 1 file changed, 6 insertions(+), 3 deletions(-) diff --git a/programs/server/config.xml b/programs/server/config.xml index 4e4cabdb03b..3b035fb39ac 100644 --- a/programs/server/config.xml +++ b/programs/server/config.xml @@ -1030,14 +1030,17 @@ 1000 - - + + diff --git a/docs/en/development/style.md b/docs/en/development/style.md index 82cd9273680..03121880555 100644 --- a/docs/en/development/style.md +++ b/docs/en/development/style.md @@ -1,10 +1,9 @@ --- -sidebar_position: 69 -sidebar_label: C++ Guide -description: A list of recommendations regarding coding style, naming convention, formatting and more +toc_priority: 69 +toc_title: C++ Guide --- -# How to Write C++ Code +# How to Write C++ Code {#how-to-write-c-code} ## General Recommendations {#general-recommendations} diff --git a/docs/en/development/tests.md b/docs/en/development/tests.md index 29b69f0b697..be9fc7907af 100644 --- a/docs/en/development/tests.md +++ b/docs/en/development/tests.md @@ -1,12 +1,11 @@ --- -sidebar_position: 70 -sidebar_label: Testing -description: Most of ClickHouse features can be tested with functional tests and they are mandatory to use for every change in ClickHouse code that can be tested that way. +toc_priority: 70 +toc_title: Testing --- -# ClickHouse Testing +# ClickHouse Testing {#clickhouse-testing} -## Functional Tests +## Functional Tests {#functional-tests} Functional tests are the most simple and convenient to use. Most of ClickHouse features can be tested with functional tests and they are mandatory to use for every change in ClickHouse code that can be tested that way. 
diff --git a/docs/en/engines/_category_.yml b/docs/en/engines/_category_.yml deleted file mode 100644 index 0c462323df4..00000000000 --- a/docs/en/engines/_category_.yml +++ /dev/null @@ -1,8 +0,0 @@ -position: 30 -label: 'Database & Table Engines' -collapsible: true -collapsed: true -link: - type: generated-index - title: Database & Table Engines - slug: /en/table-engines \ No newline at end of file diff --git a/docs/en/engines/database-engines/atomic.md b/docs/en/engines/database-engines/atomic.md index 878307121aa..1e555a0a502 100644 --- a/docs/en/engines/database-engines/atomic.md +++ b/docs/en/engines/database-engines/atomic.md @@ -1,9 +1,9 @@ --- -sidebar_label: Atomic -sidebar_position: 10 +toc_priority: 32 +toc_title: Atomic --- -# Atomic +# Atomic {#atomic} It supports non-blocking [DROP TABLE](#drop-detach-table) and [RENAME TABLE](#rename-table) queries and atomic [EXCHANGE TABLES](#exchange-tables) queries. `Atomic` database engine is used by default. @@ -18,21 +18,14 @@ CREATE DATABASE test [ENGINE = Atomic]; ### Table UUID {#table-uuid} All tables in database `Atomic` have persistent [UUID](../../sql-reference/data-types/uuid.md) and store data in directory `/clickhouse_path/store/xxx/xxxyyyyy-yyyy-yyyy-yyyy-yyyyyyyyyyyy/`, where `xxxyyyyy-yyyy-yyyy-yyyy-yyyyyyyyyyyy` is UUID of the table. -Usually, the UUID is generated automatically, but the user can also explicitly specify the UUID in the same way when creating the table (this is not recommended). - -For example: +Usually, the UUID is generated automatically, but the user can also explicitly specify the UUID in the same way when creating the table (this is not recommended). To display the `SHOW CREATE` query with the UUID you can use setting [show_table_uuid_in_table_create_query_if_not_nil](../../operations/settings/settings.md#show_table_uuid_in_table_create_query_if_not_nil). For example: ```sql CREATE TABLE name UUID '28f1c61c-2970-457a-bffe-454156ddcfef' (n UInt64) ENGINE = ...; ``` - -:::note -You can use the [show_table_uuid_in_table_create_query_if_not_nil](../../operations/settings/settings.md#show_table_uuid_in_table_create_query_if_not_nil) setting to display the UUID with the `SHOW CREATE` query. -::: - ### RENAME TABLE {#rename-table} -[RENAME](../../sql-reference/statements/rename.md) queries are performed without changing the UUID or moving table data. These queries do not wait for the completion of queries using the table and are executed instantly. +[RENAME](../../sql-reference/statements/rename.md) queries are performed without changing UUID and moving table data. These queries do not wait for the completion of queries using the table and are executed instantly. ### DROP/DETACH TABLE {#drop-detach-table} diff --git a/docs/en/engines/database-engines/index.md b/docs/en/engines/database-engines/index.md index 0cee580abcd..dd8959d2700 100644 --- a/docs/en/engines/database-engines/index.md +++ b/docs/en/engines/database-engines/index.md @@ -6,11 +6,11 @@ toc_title: Introduction # Database Engines {#database-engines} -Database engines allow you to work with tables. By default, ClickHouse uses the [Atomic](../../engines/database-engines/atomic.md) database engine, which provides configurable [table engines](../../engines/table-engines/index.md) and an [SQL dialect](../../sql-reference/syntax.md). +Database engines allow you to work with tables. -Here is a complete list of available database engines. 
Follow the links for more details: +By default, ClickHouse uses database engine [Atomic](../../engines/database-engines/atomic.md). It provides configurable [table engines](../../engines/table-engines/index.md) and an [SQL dialect](../../sql-reference/syntax.md). -- [Atomic](../../engines/database-engines/atomic.md) +You can also use the following database engines: - [MySQL](../../engines/database-engines/mysql.md) @@ -18,6 +18,8 @@ Here is a complete list of available database engines. Follow the links for more - [Lazy](../../engines/database-engines/lazy.md) +- [Atomic](../../engines/database-engines/atomic.md) + - [PostgreSQL](../../engines/database-engines/postgresql.md) - [Replicated](../../engines/database-engines/replicated.md) diff --git a/docs/en/engines/database-engines/lazy.md b/docs/en/engines/database-engines/lazy.md index b95ade19df4..ecd4b94f579 100644 --- a/docs/en/engines/database-engines/lazy.md +++ b/docs/en/engines/database-engines/lazy.md @@ -1,6 +1,6 @@ --- -sidebar_label: Lazy -sidebar_position: 20 +toc_priority: 31 +toc_title: Lazy --- # Lazy {#lazy} diff --git a/docs/en/engines/database-engines/materialized-mysql.md b/docs/en/engines/database-engines/materialized-mysql.md index df072682097..d7dcf21cb02 100644 --- a/docs/en/engines/database-engines/materialized-mysql.md +++ b/docs/en/engines/database-engines/materialized-mysql.md @@ -1,15 +1,16 @@ --- -sidebar_label: MaterializedMySQL -sidebar_position: 70 +toc_priority: 29 +toc_title: MaterializedMySQL --- -# [experimental] MaterializedMySQL +# [experimental] MaterializedMySQL {#materialized-mysql} -:::warning -This is an experimental feature that should not be used in production. -::: +!!! warning "Warning" + This is an experimental feature that should not be used in production. -Creates a ClickHouse database with all the tables existing in MySQL, and all the data in those tables. The ClickHouse server works as MySQL replica. It reads `binlog` and performs DDL and DML queries. +Creates ClickHouse database with all the tables existing in MySQL, and all the data in those tables. + +ClickHouse server works as MySQL replica. It reads binlog and performs DDL and DML queries. ## Creating a Database {#creating-a-database} @@ -30,6 +31,8 @@ ENGINE = MaterializedMySQL('host:port', ['database' | database], 'user', 'passwo - `max_rows_in_buffer` — Maximum number of rows that data is allowed to cache in memory (for single table and the cache data unable to query). When this number is exceeded, the data will be materialized. Default: `65 505`. - `max_bytes_in_buffer` — Maximum number of bytes that data is allowed to cache in memory (for single table and the cache data unable to query). When this number is exceeded, the data will be materialized. Default: `1 048 576`. +- `max_rows_in_buffers` — Maximum number of rows that data is allowed to cache in memory (for database and the cache data unable to query). When this number is exceeded, the data will be materialized. Default: `65 505`. +- `max_bytes_in_buffers` — Maximum number of bytes that data is allowed to cache in memory (for database and the cache data unable to query). When this number is exceeded, the data will be materialized. Default: `1 048 576`. - `max_flush_data_time` — Maximum number of milliseconds that data is allowed to cache in memory (for database and the cache data unable to query). When this time is exceeded, the data will be materialized. Default: `1000`. - `max_wait_time_when_mysql_unavailable` — Retry interval when MySQL is not available (milliseconds). 
Negative value disables retry. Default: `1000`. - `allows_query_when_mysql_lost` — Allows to query a materialized table when MySQL is lost. Default: `0` (`false`). @@ -49,9 +52,8 @@ For the correct work of `MaterializedMySQL`, there are few mandatory `MySQL`-sid - `default_authentication_plugin = mysql_native_password` since `MaterializedMySQL` can only authorize with this method. - `gtid_mode = on` since GTID based logging is a mandatory for providing correct `MaterializedMySQL` replication. -:::note -While turning on `gtid_mode` you should also specify `enforce_gtid_consistency = on`. -::: +!!! attention "Attention" + While turning on `gtid_mode` you should also specify `enforce_gtid_consistency = on`. ## Virtual Columns {#virtual-columns} @@ -74,7 +76,7 @@ When working with the `MaterializedMySQL` database engine, [ReplacingMergeTree]( | FLOAT | [Float32](../../sql-reference/data-types/float.md) | | DOUBLE | [Float64](../../sql-reference/data-types/float.md) | | DECIMAL, NEWDECIMAL | [Decimal](../../sql-reference/data-types/decimal.md) | -| DATE, NEWDATE | [Date](../../sql-reference/data-types/date.md) | +| DATE, NEWDATE | [Date32](../../sql-reference/data-types/date32.md) | | DATETIME, TIMESTAMP | [DateTime](../../sql-reference/data-types/datetime.md) | | DATETIME2, TIMESTAMP2 | [DateTime64](../../sql-reference/data-types/datetime64.md) | | YEAR | [UInt16](../../sql-reference/data-types/int-uint.md) | @@ -218,14 +220,13 @@ extra care needs to be taken. You may specify overrides for tables that do not exist yet. -:::warning -It is easy to break replication with table overrides if not used with care. For example: +!!! warning "Warning" + It is easy to break replication with table overrides if not used with care. For example: -* If an ALIAS column is added with a table override, and a column with the same name is later added to the source - MySQL table, the converted ALTER TABLE query in ClickHouse will fail and replication stops. -* It is currently possible to add overrides that reference nullable columns where not-nullable are required, such as in - `ORDER BY` or `PARTITION BY`. This will cause CREATE TABLE queries that will fail, also causing replication to stop. -::: + * If an ALIAS column is added with a table override, and a column with the same name is later added to the source + MySQL table, the converted ALTER TABLE query in ClickHouse will fail and replication stops. + * It is currently possible to add overrides that reference nullable columns where not-nullable are required, such as in + `ORDER BY` or `PARTITION BY`. This will cause CREATE TABLE queries that will fail, also causing replication to stop. ## Examples of Use {#examples-of-use} diff --git a/docs/en/engines/database-engines/materialized-postgresql.md b/docs/en/engines/database-engines/materialized-postgresql.md index ff8f7b192e0..56793435fac 100644 --- a/docs/en/engines/database-engines/materialized-postgresql.md +++ b/docs/en/engines/database-engines/materialized-postgresql.md @@ -1,6 +1,6 @@ --- -sidebar_label: MaterializedPostgreSQL -sidebar_position: 60 +toc_priority: 30 +toc_title: MaterializedPostgreSQL --- # [experimental] MaterializedPostgreSQL {#materialize-postgresql} @@ -46,9 +46,7 @@ After `MaterializedPostgreSQL` database is created, it does not automatically de ATTACH TABLE postgres_database.new_table; ``` -:::warning -Before version 22.1, adding a table to replication left an unremoved temporary replication slot (named `{db_name}_ch_replication_slot_tmp`). 
If attaching tables in ClickHouse version before 22.1, make sure to delete it manually (`SELECT pg_drop_replication_slot('{db_name}_ch_replication_slot_tmp')`). Otherwise disk usage will grow. This issue is fixed in 22.1. -::: +Warning: before version 22.1 adding table to replication left unremoved temprorary replication slot (named `{db_name}_ch_replication_slot_tmp`). If attaching tables in clickhouse version before 22.1, make sure to delete it manually (`SELECT pg_drop_replication_slot('{db_name}_ch_replication_slot_tmp')`). Otherwise disk usage will grow. Issue is fixed in 22.1. ## Dynamically removing tables from replication {#dynamically-removing-table-from-replication} @@ -137,70 +135,69 @@ FROM pg_class WHERE oid = 'postgres_table'::regclass; ``` -:::warning -Replication of [**TOAST**](https://www.postgresql.org/docs/9.5/storage-toast.html) values is not supported. The default value for the data type will be used. -::: +!!! warning "Warning" + Replication of [**TOAST**](https://www.postgresql.org/docs/9.5/storage-toast.html) values is not supported. The default value for the data type will be used. ## Settings {#settings} -1. `materialized_postgresql_tables_list` {#materialized-postgresql-tables-list} +1. materialized_postgresql_tables_list {#materialized-postgresql-tables-list} - Sets a comma-separated list of PostgreSQL database tables, which will be replicated via [MaterializedPostgreSQL](../../engines/database-engines/materialized-postgresql.md) database engine. +Sets a comma-separated list of PostgreSQL database tables, which will be replicated via [MaterializedPostgreSQL](../../engines/database-engines/materialized-postgresql.md) database engine. - Default value: empty list — means whole PostgreSQL database will be replicated. +Default value: empty list — means whole PostgreSQL database will be replicated. -2. `materialized_postgresql_schema` {#materialized-postgresql-schema} +2. materialized_postgresql_schema {#materialized-postgresql-schema} - Default value: empty string. (Default schema is used) +Default value: empty string. (Default schema is used) -3. `materialized_postgresql_schema_list` {#materialized-postgresql-schema-list} +3. materialized_postgresql_schema_list {#materialized-postgresql-schema-list} - Default value: empty list. (Default schema is used) +Default value: empty list. (Default schema is used) -4. `materialized_postgresql_allow_automatic_update` {#materialized-postgresql-allow-automatic-update} +4. materialized_postgresql_allow_automatic_update {#materialized-postgresql-allow-automatic-update} - Do not use this setting before 22.1 version. +Do not use this setting before 22.1 version. - Allows reloading table in the background, when schema changes are detected. DDL queries on the PostgreSQL side are not replicated via ClickHouse [MaterializedPostgreSQL](../../engines/database-engines/materialized-postgresql.md) engine, because it is not allowed with PostgreSQL logical replication protocol, but the fact of DDL changes is detected transactionally. In this case, the default behaviour is to stop replicating those tables once DDL is detected. However, if this setting is enabled, then, instead of stopping the replication of those tables, they will be reloaded in the background via database snapshot without data losses and replication will continue for them. +Allows reloading table in the background, when schema changes are detected. 
DDL queries on the PostgreSQL side are not replicated via ClickHouse [MaterializedPostgreSQL](../../engines/database-engines/materialized-postgresql.md) engine, because it is not allowed with PostgreSQL logical replication protocol, but the fact of DDL changes is detected transactionally. In this case, the default behaviour is to stop replicating those tables once DDL is detected. However, if this setting is enabled, then, instead of stopping the replication of those tables, they will be reloaded in the background via database snapshot without data losses and replication will continue for them. - Possible values: +Possible values: - - 0 — The table is not automatically updated in the background, when schema changes are detected. - - 1 — The table is automatically updated in the background, when schema changes are detected. +- 0 — The table is not automatically updated in the background, when schema changes are detected. +- 1 — The table is automatically updated in the background, when schema changes are detected. - Default value: `0`. +Default value: `0`. -5. `materialized_postgresql_max_block_size` {#materialized-postgresql-max-block-size} +5. materialized_postgresql_max_block_size {#materialized-postgresql-max-block-size} - Sets the number of rows collected in memory before flushing data into PostgreSQL database table. +Sets the number of rows collected in memory before flushing data into PostgreSQL database table. - Possible values: +Possible values: - - Positive integer. +- Positive integer. - Default value: `65536`. +Default value: `65536`. -6. `materialized_postgresql_replication_slot` {#materialized-postgresql-replication-slot} +6. materialized_postgresql_replication_slot {#materialized-postgresql-replication-slot} - A user-created replication slot. Must be used together with `materialized_postgresql_snapshot`. +A user-created replication slot. Must be used together with `materialized_postgresql_snapshot`. -7. `materialized_postgresql_snapshot` {#materialized-postgresql-snapshot} +7. materialized_postgresql_snapshot {#materialized-postgresql-snapshot} - A text string identifying a snapshot, from which [initial dump of PostgreSQL tables](../../engines/database-engines/materialized-postgresql.md) will be performed. Must be used together with `materialized_postgresql_replication_slot`. +A text string identifying a snapshot, from which [initial dump of PostgreSQL tables](../../engines/database-engines/materialized-postgresql.md) will be performed. Must be used together with `materialized_postgresql_replication_slot`. - ``` sql - CREATE DATABASE database1 - ENGINE = MaterializedPostgreSQL('postgres1:5432', 'postgres_database', 'postgres_user', 'postgres_password') - SETTINGS materialized_postgresql_tables_list = 'table1,table2,table3'; +``` sql +CREATE DATABASE database1 +ENGINE = MaterializedPostgreSQL('postgres1:5432', 'postgres_database', 'postgres_user', 'postgres_password') +SETTINGS materialized_postgresql_tables_list = 'table1,table2,table3'; - SELECT * FROM database1.table1; - ``` +SELECT * FROM database1.table1; +``` - The settings can be changed, if necessary, using a DDL query. But it is impossible to change the setting `materialized_postgresql_tables_list`. To update the list of tables in this setting use the `ATTACH TABLE` query. +The settings can be changed, if necessary, using a DDL query. But it is impossible to change the setting `materialized_postgresql_tables_list`. To update the list of tables in this setting use the `ATTACH TABLE` query. 
- ``` sql - ALTER DATABASE postgres_database MODIFY SETTING materialized_postgresql_max_block_size = ; - ``` +``` sql +ALTER DATABASE postgres_database MODIFY SETTING materialized_postgresql_max_block_size = ; +``` ## Notes {#notes} @@ -216,47 +213,47 @@ Please note that this should be used only if it is actually needed. If there is 1. Configure replication slot in PostgreSQL. - ```yaml - apiVersion: "acid.zalan.do/v1" - kind: postgresql - metadata: - name: acid-demo-cluster - spec: - numberOfInstances: 2 - postgresql: - parameters: - wal_level: logical - patroni: - slots: - clickhouse_sync: - type: logical - database: demodb - plugin: pgoutput - ``` +```yaml +apiVersion: "acid.zalan.do/v1" +kind: postgresql +metadata: + name: acid-demo-cluster +spec: + numberOfInstances: 2 + postgresql: + parameters: + wal_level: logical + patroni: + slots: + clickhouse_sync: + type: logical + database: demodb + plugin: pgoutput +``` 2. Wait for replication slot to be ready, then begin a transaction and export the transaction snapshot identifier: - ```sql - BEGIN; - SELECT pg_export_snapshot(); - ``` +```sql +BEGIN; +SELECT pg_export_snapshot(); +``` 3. In ClickHouse create database: - ```sql - CREATE DATABASE demodb - ENGINE = MaterializedPostgreSQL('postgres1:5432', 'postgres_database', 'postgres_user', 'postgres_password') - SETTINGS - materialized_postgresql_replication_slot = 'clickhouse_sync', - materialized_postgresql_snapshot = '0000000A-0000023F-3', - materialized_postgresql_tables_list = 'table1,table2,table3'; - ``` +```sql +CREATE DATABASE demodb +ENGINE = MaterializedPostgreSQL('postgres1:5432', 'postgres_database', 'postgres_user', 'postgres_password') +SETTINGS + materialized_postgresql_replication_slot = 'clickhouse_sync', + materialized_postgresql_snapshot = '0000000A-0000023F-3', + materialized_postgresql_tables_list = 'table1,table2,table3'; +``` 4. End the PostgreSQL transaction once replication to ClickHouse DB is confirmed. Verify that replication continues after failover: - ```bash - kubectl exec acid-demo-cluster-0 -c postgres -- su postgres -c 'patronictl failover --candidate acid-demo-cluster-1 --force' - ``` +```bash +kubectl exec acid-demo-cluster-0 -c postgres -- su postgres -c 'patronictl failover --candidate acid-demo-cluster-1 --force' +``` ### Required permissions diff --git a/docs/en/engines/database-engines/mysql.md b/docs/en/engines/database-engines/mysql.md index 89a0786a9ec..df4965b1f8c 100644 --- a/docs/en/engines/database-engines/mysql.md +++ b/docs/en/engines/database-engines/mysql.md @@ -1,9 +1,9 @@ --- -sidebar_position: 50 -sidebar_label: MySQL +toc_priority: 30 +toc_title: MySQL --- -# MySQL +# MySQL {#mysql} Allows to connect to databases on a remote MySQL server and perform `INSERT` and `SELECT` queries to exchange data between ClickHouse and MySQL. @@ -49,6 +49,8 @@ ENGINE = MySQL('host:port', ['database' | database], 'user', 'password') All other MySQL data types are converted into [String](../../sql-reference/data-types/string.md). +Because the ClickHouse Date type has a different range than the MySQL date type, MySQL dates that fall outside the ClickHouse Date range need special handling. You can use the setting mysql_datatypes_support_level to modify the mapping from the MySQL date type to a ClickHouse date type: date2Date32 (convert MySQL's date type to ClickHouse Date32), date2String (convert MySQL's date type to ClickHouse String; this is usually used when your MySQL data contains dates before 1925), or default (convert MySQL's date type to ClickHouse Date).
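For illustration only, here is a minimal sketch of how the mapping described above could be switched, assuming `mysql_datatypes_support_level` can be applied as a session-level setting before the database is created (the host, database and credentials below are invented for the example):

``` sql
-- Assumed example: map MySQL DATE to ClickHouse Date32 instead of Date.
SET mysql_datatypes_support_level = 'date2Date32';

CREATE DATABASE mysql_db
ENGINE = MySQL('mysql-host:3306', 'source_db', 'user', 'password');

-- MySQL dates outside the ClickHouse Date range would now arrive as Date32 values.
SELECT * FROM mysql_db.some_table LIMIT 10;
```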
+ [Nullable](../../sql-reference/data-types/nullable.md) is supported. ## Global Variables Support {#global-variables-support} @@ -59,9 +61,8 @@ These variables are supported: - `version` - `max_allowed_packet` -:::warning -By now these variables are stubs and don't correspond to anything. -::: +!!! warning "Warning" + By now these variables are stubs and don't correspond to anything. Example: diff --git a/docs/en/engines/database-engines/postgresql.md b/docs/en/engines/database-engines/postgresql.md index bc5e93d0923..76ef484e773 100644 --- a/docs/en/engines/database-engines/postgresql.md +++ b/docs/en/engines/database-engines/postgresql.md @@ -1,6 +1,6 @@ --- -sidebar_position: 40 -sidebar_label: PostgreSQL +toc_priority: 35 +toc_title: PostgreSQL --- # PostgreSQL {#postgresql} diff --git a/docs/en/engines/database-engines/replicated.md b/docs/en/engines/database-engines/replicated.md index 63d955dc889..bdc17d32393 100644 --- a/docs/en/engines/database-engines/replicated.md +++ b/docs/en/engines/database-engines/replicated.md @@ -1,6 +1,6 @@ --- -sidebar_position: 30 -sidebar_label: Replicated +toc_priority: 36 +toc_title: Replicated --- # [experimental] Replicated {#replicated} @@ -20,9 +20,8 @@ One ClickHouse server can have multiple replicated databases running and updatin - `shard_name` — Shard name. Database replicas are grouped into shards by `shard_name`. - `replica_name` — Replica name. Replica names must be different for all replicas of the same shard. -:::warning -For [ReplicatedMergeTree](../table-engines/mergetree-family/replication.md#table_engines-replication) tables if no arguments provided, then default arguments are used: `/clickhouse/tables/{uuid}/{shard}` and `{replica}`. These can be changed in the server settings [default_replica_path](../../operations/server-configuration-parameters/settings.md#default_replica_path) and [default_replica_name](../../operations/server-configuration-parameters/settings.md#default_replica_name). Macro `{uuid}` is unfolded to table's uuid, `{shard}` and `{replica}` are unfolded to values from server config, not from database engine arguments. But in the future, it will be possible to use `shard_name` and `replica_name` of Replicated database. -::: +!!! note "Warning" + For [ReplicatedMergeTree](../table-engines/mergetree-family/replication.md#table_engines-replication) tables if no arguments provided, then default arguments are used: `/clickhouse/tables/{uuid}/{shard}` and `{replica}`. These can be changed in the server settings [default_replica_path](../../operations/server-configuration-parameters/settings.md#default_replica_path) and [default_replica_name](../../operations/server-configuration-parameters/settings.md#default_replica_name). Macro `{uuid}` is unfolded to table's uuid, `{shard}` and `{replica}` are unfolded to values from server config, not from database engine arguments. But in the future, it will be possible to use `shard_name` and `replica_name` of Replicated database. 
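As a rough sketch of the note above (all names are hypothetical, and it assumes the engine arguments are a ZooKeeper path plus the `shard_name` and `replica_name` described earlier), a table created without explicit `ReplicatedMergeTree` arguments falls back to the defaults from the server configuration:

``` sql
CREATE DATABASE replicated_demo
ENGINE = Replicated('/clickhouse/databases/replicated_demo', '{shard}', '{replica}');

-- No explicit arguments: the defaults /clickhouse/tables/{uuid}/{shard} and {replica}
-- from the server configuration are used for this table.
CREATE TABLE replicated_demo.events
(
    id UInt64,
    ts DateTime
)
ENGINE = ReplicatedMergeTree
ORDER BY id;
```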
## Specifics and Recommendations {#specifics-and-recommendations} diff --git a/docs/en/engines/database-engines/sqlite.md b/docs/en/engines/database-engines/sqlite.md index 2f8b44c9a09..ee9db90859f 100644 --- a/docs/en/engines/database-engines/sqlite.md +++ b/docs/en/engines/database-engines/sqlite.md @@ -1,6 +1,6 @@ --- -sidebar_position: 55 -sidebar_label: SQLite +toc_priority: 32 +toc_title: SQLite --- # SQLite {#sqlite} diff --git a/docs/en/engines/index.md b/docs/en/engines/index.md new file mode 100644 index 00000000000..b3f4a4f7b69 --- /dev/null +++ b/docs/en/engines/index.md @@ -0,0 +1,15 @@ +--- +toc_folder_title: Engines +toc_hidden: true +toc_priority: 25 +toc_title: hidden +--- + +# ClickHouse Engines {#clickhouse-engines} + +There are two key engine kinds in ClickHouse: + +- [Table engines](../engines/table-engines/index.md) +- [Database engines](../engines/database-engines/index.md) + +{## [Original article](https://clickhouse.com/docs/en/engines/) ##} diff --git a/docs/en/engines/table-engines/integrations/ExternalDistributed.md b/docs/en/engines/table-engines/integrations/ExternalDistributed.md index c9aae1934db..0ecbc5383e1 100644 --- a/docs/en/engines/table-engines/integrations/ExternalDistributed.md +++ b/docs/en/engines/table-engines/integrations/ExternalDistributed.md @@ -1,6 +1,6 @@ --- -sidebar_position: 12 -sidebar_label: ExternalDistributed +toc_priority: 12 +toc_title: ExternalDistributed --- # ExternalDistributed {#externaldistributed} @@ -51,6 +51,3 @@ You can specify any number of shards and any number of replicas for each shard. - [MySQL table engine](../../../engines/table-engines/integrations/mysql.md) - [PostgreSQL table engine](../../../engines/table-engines/integrations/postgresql.md) - [Distributed table engine](../../../engines/table-engines/special/distributed.md) - - -[Original article](https://clickhouse.com/docs/en/engines/table-engines/integrations/ExternalDistributed/) diff --git a/docs/en/engines/table-engines/integrations/embedded-rocksdb.md b/docs/en/engines/table-engines/integrations/embedded-rocksdb.md index 701d190f022..385abeb83ad 100644 --- a/docs/en/engines/table-engines/integrations/embedded-rocksdb.md +++ b/docs/en/engines/table-engines/integrations/embedded-rocksdb.md @@ -1,6 +1,6 @@ --- -sidebar_position: 9 -sidebar_label: EmbeddedRocksDB +toc_priority: 9 +toc_title: EmbeddedRocksDB --- # EmbeddedRocksDB Engine {#EmbeddedRocksDB-engine} diff --git a/docs/en/engines/table-engines/integrations/hdfs.md b/docs/en/engines/table-engines/integrations/hdfs.md index 503bd779abf..0d6d90f9d31 100644 --- a/docs/en/engines/table-engines/integrations/hdfs.md +++ b/docs/en/engines/table-engines/integrations/hdfs.md @@ -1,6 +1,6 @@ --- -sidebar_position: 6 -sidebar_label: HDFS +toc_priority: 6 +toc_title: HDFS --- # HDFS {#table_engines-hdfs} @@ -98,9 +98,8 @@ Table consists of all the files in both directories (all files should satisfy fo CREATE TABLE table_with_asterisk (name String, value UInt32) ENGINE = HDFS('hdfs://hdfs1:9000/{some,another}_dir/*', 'TSV') ``` -:::warning -If the listing of files contains number ranges with leading zeros, use the construction with braces for each digit separately or use `?`. -::: +!!! warning "Warning" + If the listing of files contains number ranges with leading zeros, use the construction with braces for each digit separately or use `?`. 
**Example** diff --git a/docs/en/engines/table-engines/integrations/hive.md b/docs/en/engines/table-engines/integrations/hive.md index 6731f0e7559..61147467690 100644 --- a/docs/en/engines/table-engines/integrations/hive.md +++ b/docs/en/engines/table-engines/integrations/hive.md @@ -1,6 +1,6 @@ --- -sidebar_position: 4 -sidebar_label: Hive +toc_priority: 4 +toc_title: Hive --- # Hive {#hive} @@ -137,7 +137,7 @@ CREATE TABLE test.test_orc `f_array_array_float` Array(Array(Float32)), `day` String ) -ENGINE = Hive('thrift://202.168.117.26:9083', 'test', 'test_orc') +ENGINE = Hive('thrift://localhost:9083', 'test', 'test_orc') PARTITION BY day ``` @@ -406,5 +406,3 @@ f_char: hello world f_bool: true day: 2021-09-18 ``` - -[Original article](https://clickhouse.com/docs/en/engines/table-engines/integrations/hive/) diff --git a/docs/en/engines/table-engines/integrations/index.md b/docs/en/engines/table-engines/integrations/index.md index 9230ad624ba..a06b4c78394 100644 --- a/docs/en/engines/table-engines/integrations/index.md +++ b/docs/en/engines/table-engines/integrations/index.md @@ -1,6 +1,6 @@ --- -sidebar_position: 40 -sidebar_label: Integrations +toc_folder_title: Integrations +toc_priority: 1 --- # Table Engines for Integrations {#table-engines-for-integrations} diff --git a/docs/en/engines/table-engines/integrations/jdbc.md b/docs/en/engines/table-engines/integrations/jdbc.md index 0ce31f36070..2f442fd7753 100644 --- a/docs/en/engines/table-engines/integrations/jdbc.md +++ b/docs/en/engines/table-engines/integrations/jdbc.md @@ -1,6 +1,6 @@ --- -sidebar_position: 3 -sidebar_label: JDBC +toc_priority: 3 +toc_title: JDBC --- # JDBC {#table-engine-jdbc} diff --git a/docs/en/engines/table-engines/integrations/kafka.md b/docs/en/engines/table-engines/integrations/kafka.md index 3a8d98e1ca9..1d80f143098 100644 --- a/docs/en/engines/table-engines/integrations/kafka.md +++ b/docs/en/engines/table-engines/integrations/kafka.md @@ -1,6 +1,6 @@ --- -sidebar_position: 8 -sidebar_label: Kafka +toc_priority: 8 +toc_title: Kafka --- # Kafka {#kafka} @@ -87,9 +87,8 @@ Examples: Deprecated Method for Creating a Table -:::warning -Do not use this method in new projects. If possible, switch old projects to the method described above. -::: +!!! attention "Attention" + Do not use this method in new projects. If possible, switch old projects to the method described above. ``` sql Kafka(kafka_broker_list, kafka_topic_list, kafka_group_name, kafka_format @@ -134,7 +133,7 @@ Example: SELECT level, sum(total) FROM daily GROUP BY level; ``` -To improve performance, received messages are grouped into blocks the size of [max_insert_block_size](../../../operations/settings/settings.md#settings-max_insert_block_size). If the block wasn’t formed within [stream_flush_interval_ms](../../../operations/settings/settings.md/#stream-flush-interval-ms) milliseconds, the data will be flushed to the table regardless of the completeness of the block. +To improve performance, received messages are grouped into blocks the size of [max_insert_block_size](../../../operations/settings/settings/#settings-max_insert_block_size). If the block wasn’t formed within [stream_flush_interval_ms](../../../operations/settings/settings/#stream-flush-interval-ms) milliseconds, the data will be flushed to the table regardless of the completeness of the block. 
To stop receiving topic data or to change the conversion logic, detach the materialized view: diff --git a/docs/en/engines/table-engines/integrations/materialized-postgresql.md b/docs/en/engines/table-engines/integrations/materialized-postgresql.md index 61f97961ddb..fa349e49af5 100644 --- a/docs/en/engines/table-engines/integrations/materialized-postgresql.md +++ b/docs/en/engines/table-engines/integrations/materialized-postgresql.md @@ -1,6 +1,6 @@ --- -sidebar_position: 12 -sidebar_label: MaterializedPostgreSQL +toc_priority: 12 +toc_title: MaterializedPostgreSQL --- # MaterializedPostgreSQL {#materialize-postgresql} @@ -52,8 +52,5 @@ PRIMARY KEY key; SELECT key, value, _version FROM postgresql_db.postgresql_replica; ``` -:::warning -Replication of [**TOAST**](https://www.postgresql.org/docs/9.5/storage-toast.html) values is not supported. The default value for the data type will be used. -::: - -[Original article](https://clickhouse.com/docs/en/engines/table-engines/integrations/materialized-postgresql) +!!! warning "Warning" + Replication of [**TOAST**](https://www.postgresql.org/docs/9.5/storage-toast.html) values is not supported. The default value for the data type will be used. diff --git a/docs/en/engines/table-engines/integrations/mongodb.md b/docs/en/engines/table-engines/integrations/mongodb.md index d212ab4720f..475416ffb94 100644 --- a/docs/en/engines/table-engines/integrations/mongodb.md +++ b/docs/en/engines/table-engines/integrations/mongodb.md @@ -1,6 +1,6 @@ --- -sidebar_position: 5 -sidebar_label: MongoDB +toc_priority: 5 +toc_title: MongoDB --- # MongoDB {#mongodb} diff --git a/docs/en/engines/table-engines/integrations/mysql.md b/docs/en/engines/table-engines/integrations/mysql.md index e962db58873..7f28f16aa27 100644 --- a/docs/en/engines/table-engines/integrations/mysql.md +++ b/docs/en/engines/table-engines/integrations/mysql.md @@ -1,6 +1,6 @@ --- -sidebar_position: 4 -sidebar_label: MySQL +toc_priority: 4 +toc_title: MySQL --- # MySQL {#mysql} @@ -148,5 +148,3 @@ Default value: `16`. - [The mysql table function](../../../sql-reference/table-functions/mysql.md) - [Using MySQL as a source of external dictionary](../../../sql-reference/dictionaries/external-dictionaries/external-dicts-dict-sources.md#dicts-external_dicts_dict_sources-mysql) - -[Original article](https://clickhouse.com/docs/en/engines/table-engines/integrations/mysql/) diff --git a/docs/en/engines/table-engines/integrations/odbc.md b/docs/en/engines/table-engines/integrations/odbc.md index ed2b77d7ca3..0ef21d8565a 100644 --- a/docs/en/engines/table-engines/integrations/odbc.md +++ b/docs/en/engines/table-engines/integrations/odbc.md @@ -1,6 +1,6 @@ --- -sidebar_position: 2 -sidebar_label: ODBC +toc_priority: 2 +toc_title: ODBC --- # ODBC {#table-engine-odbc} diff --git a/docs/en/engines/table-engines/integrations/postgresql.md b/docs/en/engines/table-engines/integrations/postgresql.md index d6826000a1a..789759ec521 100644 --- a/docs/en/engines/table-engines/integrations/postgresql.md +++ b/docs/en/engines/table-engines/integrations/postgresql.md @@ -1,6 +1,6 @@ --- -sidebar_position: 11 -sidebar_label: PostgreSQL +toc_priority: 11 +toc_title: PostgreSQL --- # PostgreSQL {#postgresql} @@ -73,9 +73,8 @@ All joins, aggregations, sorting, `IN [ array ]` conditions and the `LIMIT` samp PostgreSQL `Array` types are converted into ClickHouse arrays. 
-:::warning -Be careful - in PostgreSQL an array data, created like a `type_name[]`, may contain multi-dimensional arrays of different dimensions in different table rows in same column. But in ClickHouse it is only allowed to have multidimensional arrays of the same count of dimensions in all table rows in same column. -::: +!!! info "Note" + Be careful - in PostgreSQL an array data, created like a `type_name[]`, may contain multi-dimensional arrays of different dimensions in different table rows in same column. But in ClickHouse it is only allowed to have multidimensional arrays of the same count of dimensions in all table rows in same column. Supports multiple replicas that must be listed by `|`. For example: diff --git a/docs/en/engines/table-engines/integrations/rabbitmq.md b/docs/en/engines/table-engines/integrations/rabbitmq.md index 6653b76594a..78c144ac76f 100644 --- a/docs/en/engines/table-engines/integrations/rabbitmq.md +++ b/docs/en/engines/table-engines/integrations/rabbitmq.md @@ -1,6 +1,6 @@ --- -sidebar_position: 10 -sidebar_label: RabbitMQ +toc_priority: 10 +toc_title: RabbitMQ --- # RabbitMQ Engine {#rabbitmq-engine} diff --git a/docs/en/engines/table-engines/integrations/s3.md b/docs/en/engines/table-engines/integrations/s3.md index 42abc2a0b1e..c7301a55bf0 100644 --- a/docs/en/engines/table-engines/integrations/s3.md +++ b/docs/en/engines/table-engines/integrations/s3.md @@ -1,6 +1,6 @@ --- -sidebar_position: 7 -sidebar_label: S3 +toc_priority: 7 +toc_title: S3 --- # S3 Table Engine {#table-engine-s3} @@ -66,9 +66,8 @@ For more information about virtual columns see [here](../../../engines/table-eng Constructions with `{}` are similar to the [remote](../../../sql-reference/table-functions/remote.md) table function. -:::warning -If the listing of files contains number ranges with leading zeros, use the construction with braces for each digit separately or use `?`. -::: +!!! warning "Warning" + If the listing of files contains number ranges with leading zeros, use the construction with braces for each digit separately or use `?`. 
**Example with wildcards 1** @@ -159,5 +158,3 @@ The following settings can be specified in configuration file for given endpoint ## See also - [s3 table function](../../../sql-reference/table-functions/s3.md) - -[Original article](https://clickhouse.com/docs/en/engines/table-engines/integrations/s3/) diff --git a/docs/en/engines/table-engines/integrations/sqlite.md b/docs/en/engines/table-engines/integrations/sqlite.md index 45cc1cfc28a..391f1696291 100644 --- a/docs/en/engines/table-engines/integrations/sqlite.md +++ b/docs/en/engines/table-engines/integrations/sqlite.md @@ -1,6 +1,6 @@ --- -sidebar_position: 7 -sidebar_label: SQLite +toc_priority: 7 +toc_title: SQLite --- # SQLite {#sqlite} @@ -56,7 +56,4 @@ SELECT * FROM sqlite_db.table2 ORDER BY col1; **See Also** - [SQLite](../../../engines/database-engines/sqlite.md) engine -- [sqlite](../../../sql-reference/table-functions/sqlite.md) table function - - -[Original article](https://clickhouse.com/docs/en/engines/table-engines/integrations/sqlite/) +- [sqlite](../../../sql-reference/table-functions/sqlite.md) table function \ No newline at end of file diff --git a/docs/en/engines/table-engines/log-family/index.md b/docs/en/engines/table-engines/log-family/index.md index 89eb08ad7b9..910df09e67f 100644 --- a/docs/en/engines/table-engines/log-family/index.md +++ b/docs/en/engines/table-engines/log-family/index.md @@ -1,6 +1,7 @@ --- -sidebar_position: 20 -sidebar_label: Log Family +toc_folder_title: Log Family +toc_priority: 29 +toc_title: Introduction --- # Log Engine Family {#log-engine-family} diff --git a/docs/en/engines/table-engines/log-family/log.md b/docs/en/engines/table-engines/log-family/log.md index 8858699f045..2aeef171128 100644 --- a/docs/en/engines/table-engines/log-family/log.md +++ b/docs/en/engines/table-engines/log-family/log.md @@ -10,6 +10,3 @@ The engine belongs to the family of `Log` engines. See the common properties of `Log` differs from [TinyLog](../../../engines/table-engines/log-family/tinylog.md) in that a small file of "marks" resides with the column files. These marks are written on every data block and contain offsets that indicate where to start reading the file in order to skip the specified number of rows. This makes it possible to read table data in multiple threads. For concurrent data access, the read operations can be performed simultaneously, while write operations block reads and each other. The `Log` engine does not support indexes. Similarly, if writing to a table failed, the table is broken, and reading from it returns an error. The `Log` engine is appropriate for temporary data, write-once tables, and for testing or demonstration purposes. 
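To make the description above concrete, here is a small illustrative example of a `Log` table (table and column names are invented):

``` sql
-- A throwaway Log table: column files plus a small "marks" file, no indexes;
-- reads can run in parallel, while writes block readers and each other.
CREATE TABLE log_demo
(
    event_time DateTime,
    message String
)
ENGINE = Log;

INSERT INTO log_demo VALUES (now(), 'first'), (now(), 'second');

SELECT * FROM log_demo ORDER BY event_time;
```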
- -[Original article](https://clickhouse.com/docs/en/engines/table-engines/log-family/log/) - diff --git a/docs/en/engines/table-engines/mergetree-family/aggregatingmergetree.md b/docs/en/engines/table-engines/mergetree-family/aggregatingmergetree.md index 7be10cec2f5..8c9f8dd8ce3 100644 --- a/docs/en/engines/table-engines/mergetree-family/aggregatingmergetree.md +++ b/docs/en/engines/table-engines/mergetree-family/aggregatingmergetree.md @@ -1,6 +1,6 @@ --- -sidebar_position: 60 -sidebar_label: AggregatingMergeTree +toc_priority: 35 +toc_title: AggregatingMergeTree --- # AggregatingMergeTree {#aggregatingmergetree} @@ -42,9 +42,8 @@ When creating a `AggregatingMergeTree` table the same [clauses](../../../engines Deprecated Method for Creating a Table -:::warning -Do not use this method in new projects and, if possible, switch the old projects to the method described above. -::: +!!! attention "Attention" + Do not use this method in new projects and, if possible, switch the old projects to the method described above. ``` sql CREATE TABLE [IF NOT EXISTS] [db.]table_name [ON CLUSTER cluster] diff --git a/docs/en/engines/table-engines/mergetree-family/collapsingmergetree.md b/docs/en/engines/table-engines/mergetree-family/collapsingmergetree.md index 22863611e79..271b8b20fdb 100644 --- a/docs/en/engines/table-engines/mergetree-family/collapsingmergetree.md +++ b/docs/en/engines/table-engines/mergetree-family/collapsingmergetree.md @@ -1,6 +1,6 @@ --- -sidebar_position: 70 -sidebar_label: CollapsingMergeTree +toc_priority: 36 +toc_title: CollapsingMergeTree --- # CollapsingMergeTree {#table_engine-collapsingmergetree} @@ -42,9 +42,8 @@ When creating a `CollapsingMergeTree` table, the same [query clauses](../../../e Deprecated Method for Creating a Table -:::warning -Do not use this method in new projects and, if possible, switch old projects to the method described above. -::: +!!! attention "Attention" + Do not use this method in new projects and, if possible, switch the old projects to the method described above. ``` sql CREATE TABLE [IF NOT EXISTS] [db.]table_name [ON CLUSTER cluster] diff --git a/docs/en/engines/table-engines/mergetree-family/custom-partitioning-key.md b/docs/en/engines/table-engines/mergetree-family/custom-partitioning-key.md index 716528f8d77..b58e90a3d92 100644 --- a/docs/en/engines/table-engines/mergetree-family/custom-partitioning-key.md +++ b/docs/en/engines/table-engines/mergetree-family/custom-partitioning-key.md @@ -1,15 +1,12 @@ --- -sidebar_position: 30 -sidebar_label: Custom Partitioning Key +toc_priority: 32 +toc_title: Custom Partitioning Key --- # Custom Partitioning Key {#custom-partitioning-key} -:::warning -In most cases you do not need a partition key, and in most other cases you do not need a partition key more granular than by months. Partitioning does not speed up queries (in contrast to the ORDER BY expression). - -You should never use too granular of partitioning. Don't partition your data by client identifiers or names. Instead, make a client identifier or name the first column in the ORDER BY expression. -::: +!!! warning "Warning" + In most cases you don't need partition key, and in most other cases you don't need partition key more granular than by months. Partitioning does not speed up queries (in contrast to the ORDER BY expression). You should never use too granular partitioning. Don't partition your data by client identifiers or names (instead make client identifier or name the first column in the ORDER BY expression). 
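As a sketch of the guideline above (the schema is illustrative only), a coarse monthly partition key combined with a client identifier at the front of the sorting key could look like this:

``` sql
CREATE TABLE visits_demo
(
    client_id UInt64,
    visit_date Date,
    hits UInt32
)
ENGINE = MergeTree
-- Coarse, monthly partitions instead of partitioning by client_id.
PARTITION BY toYYYYMM(visit_date)
-- The client identifier leads the sorting key, which is what speeds up per-client queries.
ORDER BY (client_id, visit_date);
```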
Partitioning is available for the [MergeTree](../../../engines/table-engines/mergetree-family/mergetree.md) family tables (including [replicated](../../../engines/table-engines/mergetree-family/replication.md) tables). [Materialized views](../../../engines/table-engines/special/materializedview.md#materializedview) based on MergeTree tables support partitioning, as well. @@ -43,9 +40,8 @@ By default, the floating-point partition key is not supported. To use it enable When inserting new data to a table, this data is stored as a separate part (chunk) sorted by the primary key. In 10-15 minutes after inserting, the parts of the same partition are merged into the entire part. -:::info -A merge only works for data parts that have the same value for the partitioning expression. This means **you shouldn’t make overly granular partitions** (more than about a thousand partitions). Otherwise, the `SELECT` query performs poorly because of an unreasonably large number of files in the file system and open file descriptors. -::: +!!! info "Info" + A merge only works for data parts that have the same value for the partitioning expression. This means **you shouldn’t make overly granular partitions** (more than about a thousand partitions). Otherwise, the `SELECT` query performs poorly because of an unreasonably large number of files in the file system and open file descriptors. Use the [system.parts](../../../operations/system-tables/parts.md#system_tables-parts) table to view the table parts and partitions. For example, let’s assume that we have a `visits` table with partitioning by month. Let’s perform the `SELECT` query for the `system.parts` table: @@ -82,9 +78,8 @@ Let’s break down the name of the part: `201901_1_9_2_11`: - `2` is the chunk level (the depth of the merge tree it is formed from). - `11` is the mutation version (if a part mutated) -:::info -The parts of old-type tables have the name: `20190117_20190123_2_2_0` (minimum date - maximum date - minimum block number - maximum block number - level). -::: +!!! info "Info" + The parts of old-type tables have the name: `20190117_20190123_2_2_0` (minimum date - maximum date - minimum block number - maximum block number - level). The `active` column shows the status of the part. `1` is active; `0` is inactive. The inactive parts are, for example, source parts remaining after merging to a larger part. The corrupted data parts are also indicated as inactive. diff --git a/docs/en/engines/table-engines/mergetree-family/graphitemergetree.md b/docs/en/engines/table-engines/mergetree-family/graphitemergetree.md index 35f3f99d5a9..e1d571c909c 100644 --- a/docs/en/engines/table-engines/mergetree-family/graphitemergetree.md +++ b/docs/en/engines/table-engines/mergetree-family/graphitemergetree.md @@ -1,6 +1,6 @@ --- -sidebar_position: 90 -sidebar_label: GraphiteMergeTree +toc_priority: 38 +toc_title: GraphiteMergeTree --- # GraphiteMergeTree {#graphitemergetree} @@ -54,9 +54,8 @@ When creating a `GraphiteMergeTree` table, the same [clauses](../../../engines/t Deprecated Method for Creating a Table -:::warning -Do not use this method in new projects and, if possible, switch old projects to the method described above. -::: +!!! attention "Attention" + Do not use this method in new projects and, if possible, switch the old projects to the method described above. ``` sql CREATE TABLE [IF NOT EXISTS] [db.]table_name [ON CLUSTER cluster] @@ -120,13 +119,12 @@ default ... ``` -:::warning -Patterns must be strictly ordered: +!!! 
warning "Attention" + Patterns must be strictly ordered: -1. Patterns without `function` or `retention`. -1. Patterns with both `function` and `retention`. -1. Pattern `default`. -::: + 1. Patterns without `function` or `retention`. + 1. Patterns with both `function` and `retention`. + 1. Pattern `default`. When processing a row, ClickHouse checks the rules in the `pattern` sections. Each of `pattern` (including `default`) sections can contain `function` parameter for aggregation, `retention` parameters or both. If the metric name matches the `regexp`, the rules from the `pattern` section (or sections) are applied; otherwise, the rules from the `default` section are used. @@ -255,6 +253,7 @@ Valid values: ``` -:::warning -Data rollup is performed during merges. Usually, for old partitions, merges are not started, so for rollup it is necessary to trigger an unscheduled merge using [optimize](../../../sql-reference/statements/optimize.md). Or use additional tools, for example [graphite-ch-optimizer](https://github.com/innogames/graphite-ch-optimizer). -::: +!!! warning "Warning" + Data rollup is performed during merges. Usually, for old partitions, merges are not started, so for rollup it is necessary to trigger an unscheduled merge using [optimize](../../../sql-reference/statements/optimize.md). Or use additional tools, for example [graphite-ch-optimizer](https://github.com/innogames/graphite-ch-optimizer). + +[Original article](https://clickhouse.com/docs/en/operations/table_engines/graphitemergetree/) diff --git a/docs/en/engines/table-engines/mergetree-family/index.md b/docs/en/engines/table-engines/mergetree-family/index.md index 37e7bf5b589..32796a252ac 100644 --- a/docs/en/engines/table-engines/mergetree-family/index.md +++ b/docs/en/engines/table-engines/mergetree-family/index.md @@ -1,6 +1,7 @@ --- -sidebar_position: 10 -sidebar_label: MergeTree Family +toc_folder_title: MergeTree Family +toc_priority: 28 +toc_title: Introduction --- # MergeTree Engine Family {#mergetree-engine-family} diff --git a/docs/en/engines/table-engines/mergetree-family/mergetree.md b/docs/en/engines/table-engines/mergetree-family/mergetree.md index 1195ee55dc7..b70cd225cdd 100644 --- a/docs/en/engines/table-engines/mergetree-family/mergetree.md +++ b/docs/en/engines/table-engines/mergetree-family/mergetree.md @@ -1,6 +1,6 @@ --- -sidebar_position: 11 -sidebar_label: MergeTree +toc_priority: 30 +toc_title: MergeTree --- # MergeTree {#table_engines-mergetree} @@ -27,9 +27,8 @@ Main features: If necessary, you can set the data sampling method in the table. -:::info -The [Merge](../../../engines/table-engines/special/merge.md#merge) engine does not belong to the `*MergeTree` family. -::: +!!! info "Info" + The [Merge](../../../engines/table-engines/special/merge.md#merge) engine does not belong to the `*MergeTree` family. ## Creating a Table {#table_engine-mergetree-creating-a-table} @@ -128,9 +127,8 @@ The `index_granularity` setting can be omitted because 8192 is the default value Deprecated Method for Creating a Table -:::warning -Do not use this method in new projects. If possible, switch old projects to the method described above. -::: +!!! attention "Attention" + Do not use this method in new projects. If possible, switch old projects to the method described above. 
``` sql CREATE TABLE [IF NOT EXISTS] [db.]table_name [ON CLUSTER cluster] @@ -306,8 +304,8 @@ CREATE TABLE table_name Indices from the example can be used by ClickHouse to reduce the amount of data to read from disk in the following queries: ``` sql -SELECT count() FROM table WHERE s < 'z' -SELECT count() FROM table WHERE u64 * i32 == 10 AND u64 * length(s) >= 1234 +SELECT count() FROM table WHERE s < 'z' +SELECT count() FROM table WHERE u64 * i32 == 10 AND u64 * length(s) >= 1234 ``` #### Available Types of Indices {#available-types-of-indices} @@ -366,7 +364,7 @@ The `set` index can be used with all functions. Function subsets for other index | Function (operator) / Index | primary key | minmax | ngrambf_v1 | tokenbf_v1 | bloom_filter | |------------------------------------------------------------------------------------------------------------|-------------|--------|-------------|-------------|---------------| | [equals (=, ==)](../../../sql-reference/functions/comparison-functions.md#function-equals) | ✔ | ✔ | ✔ | ✔ | ✔ | -| [notEquals(!=, <>)](../../../sql-reference/functions/comparison-functions.md#function-notequals) | ✔ | ✔ | ✔ | ✔ | ✔ | +| [notEquals(!=, <>)](../../../sql-reference/functions/comparison-functions.md#function-notequals) | ✔ | ✔ | ✔ | ✔ | ✔ | | [like](../../../sql-reference/functions/string-search-functions.md#function-like) | ✔ | ✔ | ✔ | ✔ | ✗ | | [notLike](../../../sql-reference/functions/string-search-functions.md#function-notlike) | ✔ | ✔ | ✔ | ✔ | ✗ | | [startsWith](../../../sql-reference/functions/string-functions.md#startswith) | ✔ | ✔ | ✔ | ✔ | ✗ | @@ -384,10 +382,8 @@ The `set` index can be used with all functions. Function subsets for other index Functions with a constant argument that is less than ngram size can’t be used by `ngrambf_v1` for query optimization. -:::note -Bloom filters can have false positive matches, so the `ngrambf_v1`, `tokenbf_v1`, and `bloom_filter` indexes can not be used for optimizing queries where the result of a function is expected to be false. - -For example: +!!! note "Note" + Bloom filters can have false positive matches, so the `ngrambf_v1`, `tokenbf_v1`, and `bloom_filter` indexes can’t be used for optimizing queries where the result of a function is expected to be false, for example: - Can be optimized: - `s LIKE '%test%'` @@ -395,13 +391,12 @@ For example: - `s = 1` - `NOT s != 1` - `startsWith(s, 'test')` -- Can not be optimized: +- Can’t be optimized: - `NOT s LIKE '%test%'` - `s NOT LIKE '%test%'` - `NOT s = 1` - `s != 1` - `NOT startsWith(s, 'test')` -::: ## Projections {#projections} Projections are like [materialized views](../../../sql-reference/statements/create/view.md#materialized) but defined in part-level. It provides consistency guarantees along with automatic usage in queries. diff --git a/docs/en/engines/table-engines/mergetree-family/replacingmergetree.md b/docs/en/engines/table-engines/mergetree-family/replacingmergetree.md index 47651527f99..ca0db24e640 100644 --- a/docs/en/engines/table-engines/mergetree-family/replacingmergetree.md +++ b/docs/en/engines/table-engines/mergetree-family/replacingmergetree.md @@ -1,6 +1,6 @@ --- -sidebar_position: 40 -sidebar_label: ReplacingMergeTree +toc_priority: 33 +toc_title: ReplacingMergeTree --- # ReplacingMergeTree {#replacingmergetree} @@ -29,9 +29,8 @@ CREATE TABLE [IF NOT EXISTS] [db.]table_name [ON CLUSTER cluster] For a description of request parameters, see [statement description](../../../sql-reference/statements/create/table.md). 
-:::warning -Uniqueness of rows is determined by the `ORDER BY` table section, not `PRIMARY KEY`. -::: +!!! note "Attention" + Uniqueness of rows is determined by the `ORDER BY` table section, not `PRIMARY KEY`. **ReplacingMergeTree Parameters** @@ -50,9 +49,8 @@ When creating a `ReplacingMergeTree` table the same [clauses](../../../engines/t Deprecated Method for Creating a Table -:::warning -Do not use this method in new projects and, if possible, switch old projects to the method described above. -::: +!!! attention "Attention" + Do not use this method in new projects and, if possible, switch the old projects to the method described above. ``` sql CREATE TABLE [IF NOT EXISTS] [db.]table_name [ON CLUSTER cluster] diff --git a/docs/en/engines/table-engines/mergetree-family/replication.md b/docs/en/engines/table-engines/mergetree-family/replication.md index 67c503854a9..d574bd9449e 100644 --- a/docs/en/engines/table-engines/mergetree-family/replication.md +++ b/docs/en/engines/table-engines/mergetree-family/replication.md @@ -1,6 +1,6 @@ --- -sidebar_position: 20 -sidebar_label: Data Replication +toc_priority: 31 +toc_title: Data Replication --- # Data Replication {#table_engines-replication} @@ -31,9 +31,8 @@ ClickHouse uses [Apache ZooKeeper](https://zookeeper.apache.org) for storing rep To use replication, set parameters in the [zookeeper](../../../operations/server-configuration-parameters/settings.md#server-settings_zookeeper) server configuration section. -:::warning -Don’t neglect the security setting. ClickHouse supports the `digest` [ACL scheme](https://zookeeper.apache.org/doc/current/zookeeperProgrammers.html#sc_ZooKeeperAccessControl) of the ZooKeeper security subsystem. -::: +!!! attention "Attention" + Don’t neglect the security setting. ClickHouse supports the `digest` [ACL scheme](https://zookeeper.apache.org/doc/current/zookeeperProgrammers.html#sc_ZooKeeperAccessControl) of the ZooKeeper security subsystem. Example of setting the addresses of the ZooKeeper cluster: diff --git a/docs/en/engines/table-engines/mergetree-family/summingmergetree.md b/docs/en/engines/table-engines/mergetree-family/summingmergetree.md index 5d180782ed3..5726acf000e 100644 --- a/docs/en/engines/table-engines/mergetree-family/summingmergetree.md +++ b/docs/en/engines/table-engines/mergetree-family/summingmergetree.md @@ -1,6 +1,6 @@ --- -sidebar_position: 50 -sidebar_label: SummingMergeTree +toc_priority: 34 +toc_title: SummingMergeTree --- # SummingMergeTree {#summingmergetree} @@ -41,9 +41,8 @@ When creating a `SummingMergeTree` table the same [clauses](../../../engines/tab Deprecated Method for Creating a Table -:::warning -Do not use this method in new projects and, if possible, switch the old projects to the method described above. -::: +!!! attention "Attention" + Do not use this method in new projects and, if possible, switch the old projects to the method described above. 
``` sql CREATE TABLE [IF NOT EXISTS] [db.]table_name [ON CLUSTER cluster] diff --git a/docs/en/engines/table-engines/mergetree-family/versionedcollapsingmergetree.md b/docs/en/engines/table-engines/mergetree-family/versionedcollapsingmergetree.md index 77cf192dcda..8266bf34876 100644 --- a/docs/en/engines/table-engines/mergetree-family/versionedcollapsingmergetree.md +++ b/docs/en/engines/table-engines/mergetree-family/versionedcollapsingmergetree.md @@ -1,6 +1,6 @@ --- -sidebar_position: 80 -sidebar_label: VersionedCollapsingMergeTree +toc_priority: 37 +toc_title: VersionedCollapsingMergeTree --- # VersionedCollapsingMergeTree {#versionedcollapsingmergetree} @@ -53,9 +53,8 @@ When creating a `VersionedCollapsingMergeTree` table, the same [clauses](../../. Deprecated Method for Creating a Table -:::warning -Do not use this method in new projects. If possible, switch old projects to the method described above. -::: +!!! attention "Attention" + Do not use this method in new projects. If possible, switch the old projects to the method described above. ``` sql CREATE TABLE [IF NOT EXISTS] [db.]table_name [ON CLUSTER cluster] diff --git a/docs/en/engines/table-engines/special/buffer.md b/docs/en/engines/table-engines/special/buffer.md index a0aff2ec813..d1f92d347a4 100644 --- a/docs/en/engines/table-engines/special/buffer.md +++ b/docs/en/engines/table-engines/special/buffer.md @@ -1,6 +1,6 @@ --- -sidebar_position: 120 -sidebar_label: Buffer +toc_priority: 45 +toc_title: Buffer --- # Buffer Table Engine {#buffer} @@ -54,9 +54,8 @@ If the set of columns in the Buffer table does not match the set of columns in a If the types do not match for one of the columns in the Buffer table and a subordinate table, an error message is entered in the server log, and the buffer is cleared. The same thing happens if the subordinate table does not exist when the buffer is flushed. -:::warning -Running ALTER on the Buffer table in releases made before 26 Oct 2021 will cause a `Block structure mismatch` error (see [#15117](https://github.com/ClickHouse/ClickHouse/issues/15117) and [#30565](https://github.com/ClickHouse/ClickHouse/pull/30565)), so deleting the Buffer table and then recreating is the only option. It is advisable to check that this error is fixed in your release before trying to run ALTER on the Buffer table. -::: +!!! attention "Attention" + Running ALTER on the Buffer table in releases made before 26 Oct 2021 will cause a `Block structure mismatch` error (see [#15117](https://github.com/ClickHouse/ClickHouse/issues/15117) and [#30565](https://github.com/ClickHouse/ClickHouse/pull/30565)), so deleting the Buffer table and then recreating is the only option. It is advisable to check that this error is fixed in your release before trying to run ALTER on the Buffer table. If the server is restarted abnormally, the data in the buffer is lost. @@ -74,4 +73,4 @@ A Buffer table is used when too many INSERTs are received from a large number of Note that it does not make sense to insert data one row at a time, even for Buffer tables. This will only produce a speed of a few thousand rows per second, while inserting larger blocks of data can produce over a million rows per second (see the section “Performance”). 
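A sketch of the pattern discussed above, buffering many small inserts in front of a destination table (the names and thresholds are illustrative, not recommendations):

``` sql
-- Destination table that actually stores the data.
CREATE TABLE metrics_dest
(
    ts DateTime,
    value Float64
)
ENGINE = MergeTree
ORDER BY ts;

-- Buffer table with 16 layers; data is flushed to metrics_dest when the time, rows
-- or bytes thresholds (min_time, max_time, min_rows, max_rows, min_bytes, max_bytes) are reached.
CREATE TABLE metrics_buffer AS metrics_dest
ENGINE = Buffer(currentDatabase(), 'metrics_dest', 16, 10, 100, 10000, 1000000, 10000000, 100000000);

-- Writers insert into the buffer; reads see both buffered and already flushed rows.
INSERT INTO metrics_buffer VALUES (now(), 1.0);
SELECT count() FROM metrics_buffer;
```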
-[Original article](https://clickhouse.com/docs/en/engines/table-engines/special/buffer/) +[Original article](https://clickhouse.com/docs/en/operations/table_engines/buffer/) diff --git a/docs/en/engines/table-engines/special/dictionary.md b/docs/en/engines/table-engines/special/dictionary.md index 67b97e37d44..d76adebe01e 100644 --- a/docs/en/engines/table-engines/special/dictionary.md +++ b/docs/en/engines/table-engines/special/dictionary.md @@ -1,6 +1,6 @@ --- -sidebar_position: 20 -sidebar_label: Dictionary +toc_priority: 35 +toc_title: Dictionary --- # Dictionary Table Engine {#dictionary} @@ -97,5 +97,3 @@ select * from products limit 1; **See Also** - [Dictionary function](../../../sql-reference/table-functions/dictionary.md#dictionary-function) - -[Original article](https://clickhouse.com/docs/en/engines/table-engines/special/dictionary/) diff --git a/docs/en/engines/table-engines/special/distributed.md b/docs/en/engines/table-engines/special/distributed.md index db89175e4d9..5072465687e 100644 --- a/docs/en/engines/table-engines/special/distributed.md +++ b/docs/en/engines/table-engines/special/distributed.md @@ -1,6 +1,6 @@ --- -sidebar_position: 10 -sidebar_label: Distributed +toc_priority: 33 +toc_title: Distributed --- # Distributed Table Engine {#distributed} @@ -64,19 +64,19 @@ CREATE TABLE [IF NOT EXISTS] [db.]table_name [ON CLUSTER cluster] AS [db2.]name2 - `monitor_max_sleep_time_ms` - same as [distributed_directory_monitor_max_sleep_time_ms](../../../operations/settings/settings.md#distributed_directory_monitor_max_sleep_time_ms) -:::note -**Durability settings** (`fsync_...`): +!!! note "Note" -- Affect only asynchronous INSERTs (i.e. `insert_distributed_sync=false`) when data first stored on the initiator node disk and later asynchronously send to shards. -- May significantly decrease the inserts' performance -- Affect writing the data stored inside Distributed table folder into the **node which accepted your insert**. If you need to have guarantees of writing data to underlying MergeTree tables - see durability settings (`...fsync...`) in `system.merge_tree_settings` + **Durability settings** (`fsync_...`): -For **Insert limit settings** (`..._insert`) see also: + - Affect only asynchronous INSERTs (i.e. `insert_distributed_sync=false`) when data first stored on the initiator node disk and later asynchronously send to shards. + - May significantly decrease the inserts' performance + - Affect writing the data stored inside Distributed table folder into the **node which accepted your insert**. 
If you need to have guarantees of writing data to underlying MergeTree tables - see durability settings (`...fsync...`) in `system.merge_tree_settings` -- [insert_distributed_sync](../../../operations/settings/settings.md#insert_distributed_sync) setting -- [prefer_localhost_replica](../../../operations/settings/settings.md#settings-prefer-localhost-replica) setting -- `bytes_to_throw_insert` handled before `bytes_to_delay_insert`, so you should not set it to the value less then `bytes_to_delay_insert` -::: + For **Insert limit settings** (`..._insert`) see also: + + - [insert_distributed_sync](../../../operations/settings/settings.md#insert_distributed_sync) setting + - [prefer_localhost_replica](../../../operations/settings/settings.md#settings-prefer-localhost-replica) setting + - `bytes_to_throw_insert` handled before `bytes_to_delay_insert`, so you should not set it to the value less then `bytes_to_delay_insert` **Example** @@ -215,9 +215,8 @@ To learn more about how distibuted `in` and `global in` queries are processed, r - `_shard_num` — Contains the `shard_num` value from the table `system.clusters`. Type: [UInt32](../../../sql-reference/data-types/int-uint.md). -:::note -Since [remote](../../../sql-reference/table-functions/remote.md) and [cluster](../../../sql-reference/table-functions/cluster.md) table functions internally create temporary Distributed table, `_shard_num` is available there too. -::: +!!! note "Note" + Since [remote](../../../sql-reference/table-functions/remote.md) and [cluster](../../../sql-reference/table-functions/cluster.md) table functions internally create temporary Distributed table, `_shard_num` is available there too. **See Also** @@ -226,4 +225,3 @@ Since [remote](../../../sql-reference/table-functions/remote.md) and [cluster](. - [shardNum()](../../../sql-reference/functions/other-functions.md#shard-num) and [shardCount()](../../../sql-reference/functions/other-functions.md#shard-count) functions -[Original article](https://clickhouse.com/docs/en/engines/table-engines/special/distributed/) diff --git a/docs/en/engines/table-engines/special/external-data.md b/docs/en/engines/table-engines/special/external-data.md index 1f4336c74fe..4ec90905fe5 100644 --- a/docs/en/engines/table-engines/special/external-data.md +++ b/docs/en/engines/table-engines/special/external-data.md @@ -1,6 +1,6 @@ --- -sidebar_position: 130 -sidebar_label: External Data +toc_priority: 45 +toc_title: External Data --- # External Data for Query Processing {#external-data-for-query-processing} @@ -63,3 +63,4 @@ $ curl -F 'passwd=@passwd.tsv;' 'http://localhost:8123/?query=SELECT+shell,+coun For distributed query processing, the temporary tables are sent to all the remote servers. +[Original article](https://clickhouse.com/docs/en/operations/table_engines/external_data/) diff --git a/docs/en/engines/table-engines/special/file.md b/docs/en/engines/table-engines/special/file.md index 6e4449bf1a9..7673f45ca8d 100644 --- a/docs/en/engines/table-engines/special/file.md +++ b/docs/en/engines/table-engines/special/file.md @@ -1,6 +1,6 @@ --- -sidebar_position: 40 -sidebar_label: File +toc_priority: 37 +toc_title: File --- # File Table Engine {#table_engines-file} @@ -30,9 +30,8 @@ When creating table using `File(Format)` it creates empty subdirectory in that f You may manually create this subfolder and file in server filesystem and then [ATTACH](../../../sql-reference/statements/attach.md) it to table information with matching name, so you can query data from that file. 
-:::warning -Be careful with this functionality, because ClickHouse does not keep track of external changes to such files. The result of simultaneous writes via ClickHouse and outside of ClickHouse is undefined. -::: +!!! warning "Warning" + Be careful with this functionality, because ClickHouse does not keep track of external changes to such files. The result of simultaneous writes via ClickHouse and outside of ClickHouse is undefined. ## Example {#example} @@ -86,4 +85,4 @@ $ echo -e "1,2\n3,4" | clickhouse-local -q "CREATE TABLE table (a Int64, b Int64 - Indices - Replication -[Original article](https://clickhouse.com/docs/en/operations/table_engines/special/file/) +[Original article](https://clickhouse.com/docs/en/operations/table_engines/file/) diff --git a/docs/en/engines/table-engines/special/generate.md b/docs/en/engines/table-engines/special/generate.md index 453f3b5db0b..fabe31897bb 100644 --- a/docs/en/engines/table-engines/special/generate.md +++ b/docs/en/engines/table-engines/special/generate.md @@ -1,6 +1,6 @@ --- -sidebar_position: 140 -sidebar_label: GenerateRandom +toc_priority: 46 +toc_title: GenerateRandom --- # GenerateRandom Table Engine {#table_engines-generate} @@ -56,4 +56,4 @@ SELECT * FROM generate_engine_table LIMIT 3 - Indices - Replication -[Original article](https://clickhouse.com/docs/en/engines/table-engines/special/generate/) +[Original article](https://clickhouse.com/docs/en/operations/table_engines/generate/) diff --git a/docs/en/engines/table-engines/special/index.md b/docs/en/engines/table-engines/special/index.md index f87cd86c891..872c01385e0 100644 --- a/docs/en/engines/table-engines/special/index.md +++ b/docs/en/engines/table-engines/special/index.md @@ -1,6 +1,6 @@ --- -sidebar_position: 50 -sidebar_label: Special +toc_folder_title: Special +toc_priority: 31 --- # Special Table Engines {#special-table-engines} diff --git a/docs/en/engines/table-engines/special/join.md b/docs/en/engines/table-engines/special/join.md index 7d6f6e99b9f..4e4a5e9fc03 100644 --- a/docs/en/engines/table-engines/special/join.md +++ b/docs/en/engines/table-engines/special/join.md @@ -1,15 +1,14 @@ --- -sidebar_position: 70 -sidebar_label: Join +toc_priority: 40 +toc_title: Join --- # Join Table Engine {#join} Optional prepared data structure for usage in [JOIN](../../../sql-reference/statements/select/join.md#select-join) operations. -:::note -This is not an article about the [JOIN clause](../../../sql-reference/statements/select/join.md#select-join) itself. -::: +!!! note "Note" + This is not an article about the [JOIN clause](../../../sql-reference/statements/select/join.md#select-join) itself. ## Creating a Table {#creating-a-table} @@ -126,5 +125,3 @@ ALTER TABLE id_val_join DELETE WHERE id = 3; │ 1 │ 21 │ └────┴─────┘ ``` - -[Original article](https://clickhouse.com/docs/en/operations/table_engines/special/join/) diff --git a/docs/en/engines/table-engines/special/materializedview.md b/docs/en/engines/table-engines/special/materializedview.md index 6c9a5e84f60..75161829a7e 100644 --- a/docs/en/engines/table-engines/special/materializedview.md +++ b/docs/en/engines/table-engines/special/materializedview.md @@ -1,10 +1,10 @@ --- -sidebar_position: 100 -sidebar_label: MaterializedView +toc_priority: 43 +toc_title: MaterializedView --- # MaterializedView Table Engine {#materializedview} Used for implementing materialized views (for more information, see [CREATE VIEW](../../../sql-reference/statements/create/view.md#materialized)). 
For storing data, it uses a different engine that was specified when creating the view. When reading from a table, it just uses that engine. -[Original article](https://clickhouse.com/docs/en/engines/table-engines/special/materializedview/) +[Original article](https://clickhouse.com/docs/en/operations/table_engines/materializedview/) diff --git a/docs/en/engines/table-engines/special/memory.md b/docs/en/engines/table-engines/special/memory.md index 1e154a323d1..eb557d36c50 100644 --- a/docs/en/engines/table-engines/special/memory.md +++ b/docs/en/engines/table-engines/special/memory.md @@ -1,6 +1,6 @@ --- -sidebar_position: 110 -sidebar_label: Memory +toc_priority: 44 +toc_title: Memory --- # Memory Table Engine {#memory} @@ -15,4 +15,4 @@ Normally, using this table engine is not justified. However, it can be used for The Memory engine is used by the system for temporary tables with external query data (see the section “External data for processing a query”), and for implementing `GLOBAL IN` (see the section “IN operators”). -[Original article](https://clickhouse.com/docs/en/engines/table-engines/special/memory/) +[Original article](https://clickhouse.com/docs/en/operations/table_engines/memory/) diff --git a/docs/en/engines/table-engines/special/merge.md b/docs/en/engines/table-engines/special/merge.md index bcad7a0c1f6..27f783a3cea 100644 --- a/docs/en/engines/table-engines/special/merge.md +++ b/docs/en/engines/table-engines/special/merge.md @@ -1,6 +1,6 @@ --- -sidebar_position: 30 -sidebar_label: Merge +toc_priority: 36 +toc_title: Merge --- # Merge Table Engine {#merge} @@ -12,7 +12,7 @@ Reading is automatically parallelized. Writing to a table is not supported. When ## Creating a Table {#creating-a-table} ``` sql -CREATE TABLE ... Engine=Merge(db_name, tables_regexp) + CREATE TABLE ... Engine=Merge(db_name, tables_regexp) ``` **Engine Parameters** @@ -81,5 +81,3 @@ SELECT * FROM WatchLog; - [Virtual columns](../../../engines/table-engines/special/index.md#table_engines-virtual_columns) - [merge](../../../sql-reference/table-functions/merge.md) table function - -[Original article](https://clickhouse.com/docs/en/operations/table_engines/special/merge/) diff --git a/docs/en/engines/table-engines/special/null.md b/docs/en/engines/table-engines/special/null.md index 309b09ba779..39ed9c1c1a6 100644 --- a/docs/en/engines/table-engines/special/null.md +++ b/docs/en/engines/table-engines/special/null.md @@ -1,15 +1,13 @@ --- -sidebar_position: 50 -sidebar_label: 'Null' +toc_priority: 38 +toc_title: 'Null' --- # Null Table Engine {#null} When writing to a `Null` table, data is ignored. When reading from a `Null` table, the response is empty. -:::note -If you are wondering why this is useful, note that you can create a materialized view on a `Null` table. So the data written to the table will end up affecting the view, but original raw data will still be discarded. -::: +!!! info "Hint" + However, you can create a materialized view on a `Null` table. So the data written to the table will end up affecting the view, but original raw data will still be discarded. 
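For illustration, a minimal sketch of that hint (names are invented): rows are inserted into a `Null` table, and only what the materialized view derives from them is kept.

``` sql
CREATE TABLE raw_events
(
    user_id UInt64,
    value UInt32
)
ENGINE = Null;

-- The view receives every block inserted into raw_events; the Null table itself keeps nothing.
CREATE MATERIALIZED VIEW events_per_user
ENGINE = SummingMergeTree()
ORDER BY user_id
AS SELECT user_id, count() AS events
FROM raw_events
GROUP BY user_id;

INSERT INTO raw_events VALUES (1, 10), (1, 20), (2, 5);

SELECT * FROM raw_events;       -- empty: the raw rows were discarded
SELECT * FROM events_per_user;  -- the rows produced by the view are kept
```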
- -[Original article](https://clickhouse.com/docs/en/operations/table_engines/special/null/) +[Original article](https://clickhouse.com/docs/en/operations/table_engines/null/) diff --git a/docs/en/engines/table-engines/special/set.md b/docs/en/engines/table-engines/special/set.md index 5fd80ba55fe..c38c2418093 100644 --- a/docs/en/engines/table-engines/special/set.md +++ b/docs/en/engines/table-engines/special/set.md @@ -1,6 +1,6 @@ --- -sidebar_position: 60 -sidebar_label: Set +toc_priority: 39 +toc_title: Set --- # Set Table Engine {#set} @@ -20,4 +20,4 @@ When creating a table, the following settings are applied: - [persistent](../../../operations/settings/settings.md#persistent) -[Original article](https://clickhouse.com/docs/en/operations/table_engines/special/set/) +[Original article](https://clickhouse.com/docs/en/operations/table_engines/set/) diff --git a/docs/en/engines/table-engines/special/url.md b/docs/en/engines/table-engines/special/url.md index 64642623f88..26d928085ce 100644 --- a/docs/en/engines/table-engines/special/url.md +++ b/docs/en/engines/table-engines/special/url.md @@ -1,6 +1,6 @@ --- -sidebar_position: 80 -sidebar_label: URL +toc_priority: 41 +toc_title: URL --- # URL Table Engine {#table_engines-url} @@ -89,4 +89,4 @@ SELECT * FROM url_engine_table - Indexes. - Replication. -[Original article](https://clickhouse.com/docs/en/operations/table_engines/special/url/) +[Original article](https://clickhouse.com/docs/en/operations/table_engines/url/) diff --git a/docs/en/engines/table-engines/special/view.md b/docs/en/engines/table-engines/special/view.md index 455c301fb01..9b847a0e2d5 100644 --- a/docs/en/engines/table-engines/special/view.md +++ b/docs/en/engines/table-engines/special/view.md @@ -1,10 +1,10 @@ --- -sidebar_position: 90 -sidebar_label: View +toc_priority: 42 +toc_title: View --- # View Table Engine {#table_engines-view} Used for implementing views (for more information, see the `CREATE VIEW query`). It does not store data, but only stores the specified `SELECT` query. When reading from a table, it runs this query (and deletes all unnecessary columns from the query). -[Original article](https://clickhouse.com/docs/en/operations/table_engines/special/view/) +[Original article](https://clickhouse.com/docs/en/operations/table_engines/view/) diff --git a/docs/en/example-datasets/_category_.yml b/docs/en/example-datasets/_category_.yml deleted file mode 100644 index 310ce834a92..00000000000 --- a/docs/en/example-datasets/_category_.yml +++ /dev/null @@ -1,8 +0,0 @@ -position: 10 -label: 'Example Datasets' -collapsible: true -collapsed: true -link: - type: generated-index - title: Example Datasets - slug: /en/example-datasets \ No newline at end of file diff --git a/docs/en/faq/general/columnar-database.md b/docs/en/faq/general/columnar-database.md new file mode 100644 index 00000000000..11bbd2e63f6 --- /dev/null +++ b/docs/en/faq/general/columnar-database.md @@ -0,0 +1,25 @@ +--- +title: What is a columnar database? +toc_hidden: true +toc_priority: 101 +--- + +# What Is a Columnar Database? {#what-is-a-columnar-database} + +A columnar database stores data of each column independently. This allows to read data from disks only for those columns that are used in any given query. The cost is that operations that affect whole rows become proportionally more expensive. The synonym for a columnar database is a column-oriented database management system. ClickHouse is a typical example of such a system. 
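As a rough illustration before listing the advantages (the table and columns below are made up), a column-oriented engine only needs to read the column files that a query actually references:

``` sql
CREATE TABLE orders
(
    order_id UInt64,
    user_id  UInt64,
    revenue  Decimal(18, 2),
    comment  String
    -- a real fact table would typically carry dozens or hundreds more columns
)
ENGINE = MergeTree
ORDER BY order_id;

-- Only the user_id and revenue column files are read from disk.
SELECT user_id, sum(revenue) FROM orders GROUP BY user_id;
```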
+ +Key columnar database advantages are: + +- Queries that use only a few columns out of many. +- Aggregating queries against large volumes of data. +- Column-wise data compression. + +Here is the illustration of the difference between traditional row-oriented systems and columnar databases when building reports: + +**Traditional row-oriented** +![Traditional row-oriented](https://clickhouse.com/docs/en/images/row-oriented.gif#) + +**Columnar** +![Columnar](https://clickhouse.com/docs/en/images/column-oriented.gif#) + +A columnar database is a preferred choice for analytical applications because it allows to have many columns in a table just in case, but do not pay the cost for unused columns on read query execution time. Column-oriented databases are designed for big data processing and data warehousing, because they often natively scale using distributed clusters of low-cost hardware to increase throughput. ClickHouse does it with combination of [distributed](../../engines/table-engines/special/distributed.md) and [replicated](../../engines/table-engines/mergetree-family/replication.md) tables. diff --git a/docs/en/faq/general/dbms-naming.md b/docs/en/faq/general/dbms-naming.md new file mode 100644 index 00000000000..d4e87ff450a --- /dev/null +++ b/docs/en/faq/general/dbms-naming.md @@ -0,0 +1,17 @@ +--- +title: "What does \u201CClickHouse\u201D mean?" +toc_hidden: true +toc_priority: 10 +--- + +# What Does “ClickHouse” Mean? {#what-does-clickhouse-mean} + +It’s a combination of “**Click**stream” and “Data ware**House**”. It comes from the original use case at Yandex.Metrica, where ClickHouse was supposed to keep records of all clicks by people from all over the Internet, and it still does the job. You can read more about this use case on [ClickHouse history](../../introduction/history.md) page. + +This two-part meaning has two consequences: + +- The only correct way to write Click**H**ouse is with capital H. +- If you need to abbreviate it, use **CH**. For some historical reasons, abbreviating as CK is also popular in China, mostly because one of the first talks about ClickHouse in Chinese used this form. + +!!! info "Fun fact" + Many years after ClickHouse got its name, this approach of combining two words that are meaningful on their own has been highlighted as the best way to name a database in a [research by Andy Pavlo](https://www.cs.cmu.edu/~pavlo/blog/2020/03/on-naming-a-database-management-system.html), an Associate Professor of Databases at Carnegie Mellon University. ClickHouse shared his “best database name of all time” award with Postgres. diff --git a/docs/en/faq/general/how-do-i-contribute-code-to-clickhouse.md b/docs/en/faq/general/how-do-i-contribute-code-to-clickhouse.md new file mode 100644 index 00000000000..731dc9dface --- /dev/null +++ b/docs/en/faq/general/how-do-i-contribute-code-to-clickhouse.md @@ -0,0 +1,15 @@ +--- +title: How do I contribute code to ClickHouse? +toc_hidden: true +toc_priority: 120 +--- + +# How do I contribute code to ClickHouse? {#how-do-i-contribute-code-to-clickhouse} + +ClickHouse is an open-source project [developed on GitHub](https://github.com/ClickHouse/ClickHouse). + +As customary, contribution instructions are published in [CONTRIBUTING.md](https://github.com/ClickHouse/ClickHouse/blob/master/CONTRIBUTING.md) file in the root of the source code repository. 
+ +If you want to suggest a substantial change to ClickHouse, consider [opening a GitHub issue](https://github.com/ClickHouse/ClickHouse/issues/new/choose) explaining what you want to do, to discuss it with maintainers and community first. [Examples of such RFC issues](https://github.com/ClickHouse/ClickHouse/issues?q=is%3Aissue+is%3Aopen+rfc). + +If your contributions are security related, please check out [our security policy](https://github.com/ClickHouse/ClickHouse/security/policy/) too. diff --git a/docs/en/faq/general/index.md b/docs/en/faq/general/index.md new file mode 100644 index 00000000000..51fff9a53ae --- /dev/null +++ b/docs/en/faq/general/index.md @@ -0,0 +1,25 @@ +--- +title: General questions about ClickHouse +toc_hidden_folder: true +toc_priority: 1 +toc_title: General +--- + +# General Questions About ClickHouse {#general-questions} + +Questions: + +- [What is ClickHouse?](../../index.md#what-is-clickhouse) +- [Why ClickHouse is so fast?](../../faq/general/why-clickhouse-is-so-fast.md) +- [Who is using ClickHouse?](../../faq/general/who-is-using-clickhouse.md) +- [What does “ClickHouse” mean?](../../faq/general/dbms-naming.md) +- [What does “Не тормозит” mean?](../../faq/general/ne-tormozit.md) +- [What is OLAP?](../../faq/general/olap.md) +- [What is a columnar database?](../../faq/general/columnar-database.md) +- [Why not use something like MapReduce?](../../faq/general/mapreduce.md) +- [How do I contribute code to ClickHouse?](../../faq/general/how-do-i-contribute-code-to-clickhouse.md) + +!!! info "Don’t see what you were looking for?" + Check out [other F.A.Q. categories](../../faq/index.md) or browse around main documentation articles found in the left sidebar. + +{## [Original article](https://clickhouse.com/docs/en/faq/general/) ##} diff --git a/docs/en/faq/general/mapreduce.md b/docs/en/faq/general/mapreduce.md new file mode 100644 index 00000000000..30cae65cba2 --- /dev/null +++ b/docs/en/faq/general/mapreduce.md @@ -0,0 +1,13 @@ +--- +title: Why not use something like MapReduce? +toc_hidden: true +toc_priority: 110 +--- + +# Why Not Use Something Like MapReduce? {#why-not-use-something-like-mapreduce} + +We can refer to systems like MapReduce as distributed computing systems in which the reduce operation is based on distributed sorting. The most common open-source solution in this class is [Apache Hadoop](http://hadoop.apache.org). Large IT companies often have proprietary in-house solutions. + +These systems aren’t appropriate for online queries due to their high latency. In other words, they can’t be used as the back-end for a web interface. These types of systems aren’t useful for real-time data updates. Distributed sorting isn’t the best way to perform reduce operations if the result of the operation and all the intermediate results (if there are any) are located in the RAM of a single server, which is usually the case for online queries. In such a case, a hash table is an optimal way to perform reduce operations. A common approach to optimizing map-reduce tasks is pre-aggregation (partial reduce) using a hash table in RAM. The user performs this optimization manually. Distributed sorting is one of the main causes of reduced performance when running simple map-reduce tasks. + +Most MapReduce implementations allow you to execute arbitrary code on a cluster. But a declarative query language is better suited to OLAP to run experiments quickly. For example, Hadoop has Hive and Pig. 
Also consider Cloudera Impala or Shark (outdated) for Spark, as well as Spark SQL, Presto, and Apache Drill. Performance when running such tasks is highly sub-optimal compared to specialized systems, but relatively high latency makes it unrealistic to use these systems as the backend for a web interface. diff --git a/docs/en/faq/general/ne-tormozit.md b/docs/en/faq/general/ne-tormozit.md new file mode 100644 index 00000000000..e8dc7388eff --- /dev/null +++ b/docs/en/faq/general/ne-tormozit.md @@ -0,0 +1,26 @@ +--- +title: "What does \u201C\u043D\u0435 \u0442\u043E\u0440\u043C\u043E\u0437\u0438\u0442\ + \u201D mean?" +toc_hidden: true +toc_priority: 11 +--- + +# What Does “Не тормозит” Mean? {#what-does-ne-tormozit-mean} + +This question usually arises when people see official ClickHouse t-shirts. They have large words **“ClickHouse не тормозит”** on the front. + +Before ClickHouse became open-source, it has been developed as an in-house storage system by the largest Russian IT company, Yandex. That’s why it initially got its slogan in Russian, which is “не тормозит” (pronounced as “ne tormozit”). After the open-source release we first produced some of those t-shirts for events in Russia and it was a no-brainer to use the slogan as-is. + +One of the following batches of those t-shirts was supposed to be given away on events outside of Russia and we tried to make the English version of the slogan. Unfortunately, the Russian language is kind of elegant in terms of expressing stuff and there was a restriction of limited space on a t-shirt, so we failed to come up with good enough translation (most options appeared to be either long or inaccurate) and decided to keep the slogan in Russian even on t-shirts produced for international events. It appeared to be a great decision because people all over the world get positively surprised and curious when they see it. + +So, what does it mean? Here are some ways to translate *“не тормозит”*: + +- If you translate it literally, it’d be something like *“ClickHouse does not press the brake pedal”*. +- If you’d want to express it as close to how it sounds to a Russian person with IT background, it’d be something like *“If your larger system lags, it’s not because it uses ClickHouse”*. +- Shorter, but not so precise versions could be *“ClickHouse is not slow”*, *“ClickHouse does not lag”* or just *“ClickHouse is fast”*. + +If you haven’t seen one of those t-shirts in person, you can check them out online in many ClickHouse-related videos. For example, this one: + +![iframe](https://www.youtube.com/embed/bSyQahMVZ7w) + +P.S. These t-shirts are not for sale, they are given away for free on most [ClickHouse Meetups](https://clickhouse.com/#meet), usually for best questions or other forms of active participation. diff --git a/docs/en/faq/general/olap.md b/docs/en/faq/general/olap.md new file mode 100644 index 00000000000..1f6df183f8c --- /dev/null +++ b/docs/en/faq/general/olap.md @@ -0,0 +1,39 @@ +--- +title: What is OLAP? +toc_hidden: true +toc_priority: 100 +--- + +# What Is OLAP? {#what-is-olap} + +[OLAP](https://en.wikipedia.org/wiki/Online_analytical_processing) stands for Online Analytical Processing. It is a broad term that can be looked at from two perspectives: technical and business. But at the very high level, you can just read these words backward: + +Processing +: Some source data is processed… + +Analytical +: …to produce some analytical reports and insights… + +Online +: …in real-time. 
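For example, a typical OLAP-style query scans a large volume of source events and reduces them to a small analytical result in one pass; the `page_views` table and its columns here are hypothetical:

``` sql
SELECT
    toStartOfDay(event_time) AS day,
    count() AS pageviews,
    uniq(user_id) AS visitors
FROM page_views
WHERE event_time >= now() - INTERVAL 30 DAY
GROUP BY day
ORDER BY day;
```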
+ +## OLAP from the Business Perspective {#olap-from-the-business-perspective} + +In recent years, business people started to realize the value of data. Companies who make their decisions blindly, more often than not fail to keep up with the competition. The data-driven approach of successful companies forces them to collect all data that might be remotely useful for making business decisions and need mechanisms to timely analyze them. Here’s where OLAP database management systems (DBMS) come in. + +In a business sense, OLAP allows companies to continuously plan, analyze, and report operational activities, thus maximizing efficiency, reducing expenses, and ultimately conquering the market share. It could be done either in an in-house system or outsourced to SaaS providers like web/mobile analytics services, CRM services, etc. OLAP is the technology behind many BI applications (Business Intelligence). + +ClickHouse is an OLAP database management system that is pretty often used as a backend for those SaaS solutions for analyzing domain-specific data. However, some businesses are still reluctant to share their data with third-party providers and an in-house data warehouse scenario is also viable. + +## OLAP from the Technical Perspective {#olap-from-the-technical-perspective} + +All database management systems could be classified into two groups: OLAP (Online **Analytical** Processing) and OLTP (Online **Transactional** Processing). Former focuses on building reports, each based on large volumes of historical data, but doing it not so frequently. While the latter usually handle a continuous stream of transactions, constantly modifying the current state of data. + +In practice OLAP and OLTP are not categories, it’s more like a spectrum. Most real systems usually focus on one of them but provide some solutions or workarounds if the opposite kind of workload is also desired. This situation often forces businesses to operate multiple storage systems integrated, which might be not so big deal but having more systems make it more expensive to maintain. So the trend of recent years is HTAP (**Hybrid Transactional/Analytical Processing**) when both kinds of the workload are handled equally well by a single database management system. + +Even if a DBMS started as a pure OLAP or pure OLTP, they are forced to move towards that HTAP direction to keep up with their competition. And ClickHouse is no exception, initially, it has been designed as [fast-as-possible OLAP system](../../faq/general/why-clickhouse-is-so-fast.md) and it still does not have full-fledged transaction support, but some features like consistent read/writes and mutations for updating/deleting data had to be added. + +The fundamental trade-off between OLAP and OLTP systems remains: + +- To build analytical reports efficiently it’s crucial to be able to read columns separately, thus most OLAP databases are [columnar](../../faq/general/columnar-database.md), +- While storing columns separately increases costs of operations on rows, like append or in-place modification, proportionally to the number of columns (which can be huge if the systems try to collect all details of an event just in case). Thus, most OLTP systems store data arranged by rows. diff --git a/docs/en/faq/general/who-is-using-clickhouse.md b/docs/en/faq/general/who-is-using-clickhouse.md new file mode 100644 index 00000000000..b7ff867d726 --- /dev/null +++ b/docs/en/faq/general/who-is-using-clickhouse.md @@ -0,0 +1,19 @@ +--- +title: Who is using ClickHouse? 
+toc_hidden: true +toc_priority: 9 +--- + +# Who Is Using ClickHouse? {#who-is-using-clickhouse} + +Being an open-source product makes this question not so straightforward to answer. You do not have to tell anyone if you want to start using ClickHouse, you just go grab source code or pre-compiled packages. There’s no contract to sign and the [Apache 2.0 license](https://github.com/ClickHouse/ClickHouse/blob/master/LICENSE) allows for unconstrained software distribution. + +Also, the technology stack is often in a grey zone of what’s covered by an NDA. Some companies consider technologies they use as a competitive advantage even if they are open-source and do not allow employees to share any details publicly. Some see some PR risks and allow employees to share implementation details only with their PR department approval. + +So how to tell who is using ClickHouse? + +One way is to **ask around**. If it’s not in writing, people are much more willing to share what technologies are used in their companies, what the use cases are, what kind of hardware is used, data volumes, etc. We’re talking with users regularly on [ClickHouse Meetups](https://www.youtube.com/channel/UChtmrD-dsdpspr42P_PyRAw/playlists) all over the world and have heard stories about 1000+ companies that use ClickHouse. Unfortunately, that’s not reproducible and we try to treat such stories as if they were told under NDA to avoid any potential troubles. But you can come to any of our future meetups and talk with other users on your own. There are multiple ways how meetups are announced, for example, you can subscribe to [our Twitter](http://twitter.com/ClickHouseDB/). + +The second way is to look for companies **publicly saying** that they use ClickHouse. It’s more substantial because there’s usually some hard evidence like a blog post, talk video recording, slide deck, etc. We collect the collection of links to such evidence on our **[Adopters](../../introduction/adopters.md)** page. Feel free to contribute the story of your employer or just some links you’ve stumbled upon (but try not to violate your NDA in the process). + +You can find names of very large companies in the adopters list, like Bloomberg, Cisco, China Telecom, Tencent, or Uber, but with the first approach, we found that there are many more. For example, if you take [the list of largest IT companies by Forbes (2020)](https://www.forbes.com/sites/hanktucker/2020/05/13/worlds-largest-technology-companies-2020-apple-stays-on-top-zoom-and-uber-debut/) over half of them are using ClickHouse in some way. Also, it would be unfair not to mention [Yandex](../../introduction/history.md), the company which initially open-sourced ClickHouse in 2016 and happens to be one of the largest IT companies in Europe. diff --git a/docs/en/faq/general/why-clickhouse-is-so-fast.md b/docs/en/faq/general/why-clickhouse-is-so-fast.md new file mode 100644 index 00000000000..1ccf2595768 --- /dev/null +++ b/docs/en/faq/general/why-clickhouse-is-so-fast.md @@ -0,0 +1,63 @@ +--- +title: Why ClickHouse is so fast? +toc_hidden: true +toc_priority: 8 +--- + +# Why ClickHouse Is So Fast? {#why-clickhouse-is-so-fast} + +It was designed to be fast. Query execution performance has always been a top priority during the development process, but other important characteristics like user-friendliness, scalability, and security were also considered so ClickHouse could become a real production system. 
+ +ClickHouse was initially built as a prototype to do just a single task well: to filter and aggregate data as fast as possible. That’s what needs to be done to build a typical analytical report and that’s what a typical [GROUP BY](../../sql-reference/statements/select/group-by.md) query does. ClickHouse team has made several high-level decisions that combined made achieving this task possible: + +Column-oriented storage +: Source data often contain hundreds or even thousands of columns, while a report can use just a few of them. The system needs to avoid reading unnecessary columns, or most expensive disk read operations would be wasted. + +Indexes +: ClickHouse keeps data structures in memory that allows reading not only used columns but only necessary row ranges of those columns. + +Data compression +: Storing different values of the same column together often leads to better compression ratios (compared to row-oriented systems) because in real data column often has the same or not so many different values for neighboring rows. In addition to general-purpose compression, ClickHouse supports [specialized codecs](../../sql-reference/statements/create/table.md#create-query-specialized-codecs) that can make data even more compact. + +Vectorized query execution +: ClickHouse not only stores data in columns but also processes data in columns. It leads to better CPU cache utilization and allows for [SIMD](https://en.wikipedia.org/wiki/SIMD) CPU instructions usage. + +Scalability +: ClickHouse can leverage all available CPU cores and disks to execute even a single query. Not only on a single server but all CPU cores and disks of a cluster as well. + +But many other database management systems use similar techniques. What really makes ClickHouse stand out is **attention to low-level details**. Most programming languages provide implementations for most common algorithms and data structures, but they tend to be too generic to be effective. Every task can be considered as a landscape with various characteristics, instead of just throwing in random implementation. For example, if you need a hash table, here are some key questions to consider: + +- Which hash function to choose? +- Collision resolution algorithm: [open addressing](https://en.wikipedia.org/wiki/Open_addressing) vs [chaining](https://en.wikipedia.org/wiki/Hash_table#Separate_chaining)? +- Memory layout: one array for keys and values or separate arrays? Will it store small or large values? +- Fill factor: when and how to resize? How to move values around on resize? +- Will values be removed and which algorithm will work better if they will? +- Will we need fast probing with bitmaps, inline placement of string keys, support for non-movable values, prefetch, and batching? + +Hash table is a key data structure for `GROUP BY` implementation and ClickHouse automatically chooses one of [30+ variations](https://github.com/ClickHouse/ClickHouse/blob/master/src/Interpreters/Aggregator.h) for each specific query. + +The same goes for algorithms, for example, in sorting you might consider: + +- What will be sorted: an array of numbers, tuples, strings, or structures? +- Is all data available completely in RAM? +- Do we need a stable sort? +- Do we need a full sort? Maybe partial sort or n-th element will suffice? +- How to implement comparisons? +- Are we sorting data that has already been partially sorted? + +Algorithms that they rely on characteristics of data they are working with can often do better than their generic counterparts. 
If it is not really known in advance, the system can try various implementations and choose the one that works best in runtime. For example, see an [article on how LZ4 decompression is implemented in ClickHouse](https://habr.com/en/company/yandex/blog/457612/). + +Last but not least, the ClickHouse team always monitors the Internet on people claiming that they came up with the best implementation, algorithm, or data structure to do something and tries it out. Those claims mostly appear to be false, but from time to time you’ll indeed find a gem. + +!!! info "Tips for building your own high-performance software" + + + - Keep in mind low-level details when designing your system. + - Design based on hardware capabilities. + - Choose data structures and abstractions based on the needs of the task. + - Provide specializations for special cases. + - Try new, “best” algorithms, that you read about yesterday. + - Choose an algorithm in runtime based on statistics. + - Benchmark on real datasets. + - Test for performance regressions in CI. + - Measure and observe everything. diff --git a/docs/en/faq/index.md b/docs/en/faq/index.md new file mode 100644 index 00000000000..891e1ea464e --- /dev/null +++ b/docs/en/faq/index.md @@ -0,0 +1,47 @@ +--- +toc_folder_title: F.A.Q. +toc_hidden: true +toc_priority: 76 +--- + +# ClickHouse F.A.Q {#clickhouse-f-a-q} + +This section of the documentation is a place to collect answers to ClickHouse-related questions that arise often. + +Categories: + +- **[General](../faq/general/index.md)** + - [What is ClickHouse?](../index.md#what-is-clickhouse) + - [Why ClickHouse is so fast?](../faq/general/why-clickhouse-is-so-fast.md) + - [Who is using ClickHouse?](../faq/general/who-is-using-clickhouse.md) + - [What does “ClickHouse” mean?](../faq/general/dbms-naming.md) + - [What does “Не тормозит” mean?](../faq/general/ne-tormozit.md) + - [What is OLAP?](../faq/general/olap.md) + - [What is a columnar database?](../faq/general/columnar-database.md) + - [Why not use something like MapReduce?](../faq/general/mapreduce.md) +- **[Use Cases](../faq/use-cases/index.md)** + - [Can I use ClickHouse as a time-series database?](../faq/use-cases/time-series.md) + - [Can I use ClickHouse as a key-value storage?](../faq/use-cases/key-value.md) +- **[Operations](../faq/operations/index.md)** + - [Which ClickHouse version to use in production?](../faq/operations/production.md) + - [Is it possible to delete old records from a ClickHouse table?](../faq/operations/delete-old-data.md) + - [Does ClickHouse support multi-region replication?](../faq/operations/multi-region-replication.md) +- **[Integration](../faq/integration/index.md)** + - [How do I export data from ClickHouse to a file?](../faq/integration/file-export.md) + - [What if I have a problem with encodings when connecting to Oracle via ODBC?](../faq/integration/oracle-odbc.md) + +{## TODO +Question candidates: +- How to choose a primary key? +- How to add a column in ClickHouse? +- Too many parts +- How to filter ClickHouse table by an array column contents? +- How to insert all rows from one table to another of identical structure? +- How to kill a process (query) in ClickHouse? +- How to implement pivot (like in pandas)? +- How to remove the default ClickHouse user through users.d? 
+- Importing MySQL dump to ClickHouse +- Window function workarounds (row_number, lag/lead, running diff/sum/average) +##} + +{## [Original article](https://clickhouse.com/docs/en/faq) ##} diff --git a/docs/en/faq/integration/file-export.md b/docs/en/faq/integration/file-export.md new file mode 100644 index 00000000000..f8f458929f9 --- /dev/null +++ b/docs/en/faq/integration/file-export.md @@ -0,0 +1,37 @@ +--- +title: How do I export data from ClickHouse to a file? +toc_hidden: true +toc_priority: 10 +--- + +# How Do I Export Data from ClickHouse to a File? {#how-to-export-to-file} + +## Using INTO OUTFILE Clause {#using-into-outfile-clause} + +Add an [INTO OUTFILE](../../sql-reference/statements/select/into-outfile.md#into-outfile-clause) clause to your query. + +For example: + +``` sql +SELECT * FROM table INTO OUTFILE 'file' +``` + +By default, ClickHouse uses the [TabSeparated](../../interfaces/formats.md#tabseparated) format for output data. To select the [data format](../../interfaces/formats.md), use the [FORMAT clause](../../sql-reference/statements/select/format.md#format-clause). + +For example: + +``` sql +SELECT * FROM table INTO OUTFILE 'file' FORMAT CSV +``` + +## Using a File-Engine Table {#using-a-file-engine-table} + +See [File](../../engines/table-engines/special/file.md) table engine. + +## Using Command-Line Redirection {#using-command-line-redirection} + +``` bash +$ clickhouse-client --query "SELECT * from table" --format FormatName > result.txt +``` + +See [clickhouse-client](../../interfaces/cli.md). diff --git a/docs/en/faq/integration/index.md b/docs/en/faq/integration/index.md new file mode 100644 index 00000000000..51a2593b751 --- /dev/null +++ b/docs/en/faq/integration/index.md @@ -0,0 +1,19 @@ +--- +title: Questions about integrating ClickHouse and other systems +toc_hidden_folder: true +toc_priority: 4 +toc_title: Integration +--- + +# Questions About Integrating ClickHouse and Other Systems {#question-about-integrating-clickhouse-and-other-systems} + +Questions: + +- [How do I export data from ClickHouse to a file?](../../faq/integration/file-export.md) +- [How to import JSON into ClickHouse?](../../faq/integration/json-import.md) +- [What if I have a problem with encodings when connecting to Oracle via ODBC?](../../faq/integration/oracle-odbc.md) + +!!! info "Don’t see what you were looking for?" + Check out [other F.A.Q. categories](../../faq/index.md) or browse around main documentation articles found in the left sidebar. + +{## [Original article](https://clickhouse.com/docs/en/faq/integration/) ##} diff --git a/docs/en/faq/integration/json-import.md b/docs/en/faq/integration/json-import.md new file mode 100644 index 00000000000..3fa026c794a --- /dev/null +++ b/docs/en/faq/integration/json-import.md @@ -0,0 +1,33 @@ +--- +title: How to import JSON into ClickHouse? +toc_hidden: true +toc_priority: 11 +--- + +# How to Import JSON Into ClickHouse? {#how-to-import-json-into-clickhouse} + +ClickHouse supports a wide range of [data formats for input and output](../../interfaces/formats.md). There are multiple JSON variations among them, but the most commonly used for data ingestion is [JSONEachRow](../../interfaces/formats.md#jsoneachrow). It expects one JSON object per row, each object separated by a newline. 
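The examples below insert into a table named `test`; assuming the single-field `{"foo":"bar"}` payload used there, a minimal matching table could be sketched as:

``` sql
CREATE TABLE test
(
    foo String
)
ENGINE = MergeTree
ORDER BY tuple();
```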
+ +## Examples {#examples} + +Using [HTTP interface](../../interfaces/http.md): + +``` bash +$ echo '{"foo":"bar"}' | curl 'http://localhost:8123/?query=INSERT%20INTO%20test%20FORMAT%20JSONEachRow' --data-binary @- +``` + +Using [CLI interface](../../interfaces/cli.md): + +``` bash +$ echo '{"foo":"bar"}' | clickhouse-client --query="INSERT INTO test FORMAT JSONEachRow" +``` + +Instead of inserting data manually, you might consider to use one of [client libraries](../../interfaces/index.md) instead. + +## Useful Settings {#useful-settings} + +- `input_format_skip_unknown_fields` allows to insert JSON even if there were additional fields not present in table schema (by discarding them). +- `input_format_import_nested_json` allows to insert nested JSON objects into columns of [Nested](../../sql-reference/data-types/nested-data-structures/nested.md) type. + +!!! note "Note" + Settings are specified as `GET` parameters for the HTTP interface or as additional command-line arguments prefixed with `--` for the `CLI` interface. diff --git a/docs/en/faq/integration/oracle-odbc.md b/docs/en/faq/integration/oracle-odbc.md new file mode 100644 index 00000000000..91265a3daa2 --- /dev/null +++ b/docs/en/faq/integration/oracle-odbc.md @@ -0,0 +1,15 @@ +--- +title: What if I have a problem with encodings when using Oracle via ODBC? +toc_hidden: true +toc_priority: 20 +--- + +# What If I Have a Problem with Encodings When Using Oracle Via ODBC? {#oracle-odbc-encodings} + +If you use Oracle as a source of ClickHouse external dictionaries via Oracle ODBC driver, you need to set the correct value for the `NLS_LANG` environment variable in `/etc/default/clickhouse`. For more information, see the [Oracle NLS_LANG FAQ](https://www.oracle.com/technetwork/products/globalization/nls-lang-099431.html). + +**Example** + +``` sql +NLS_LANG=RUSSIAN_RUSSIA.UTF8 +``` diff --git a/docs/en/faq/operations/delete-old-data.md b/docs/en/faq/operations/delete-old-data.md new file mode 100644 index 00000000000..32fc485e98a --- /dev/null +++ b/docs/en/faq/operations/delete-old-data.md @@ -0,0 +1,42 @@ +--- +title: Is it possible to delete old records from a ClickHouse table? +toc_hidden: true +toc_priority: 20 +--- + +# Is It Possible to Delete Old Records from a ClickHouse Table? {#is-it-possible-to-delete-old-records-from-a-clickhouse-table} + +The short answer is “yes”. ClickHouse has multiple mechanisms that allow freeing up disk space by removing old data. Each mechanism is aimed for different scenarios. + +## TTL {#ttl} + +ClickHouse allows to automatically drop values when some condition happens. This condition is configured as an expression based on any columns, usually just static offset for any timestamp column. + +The key advantage of this approach is that it does not need any external system to trigger, once TTL is configured, data removal happens automatically in background. + +!!! note "Note" + TTL can also be used to move data not only to [/dev/null](https://en.wikipedia.org/wiki/Null_device), but also between different storage systems, like from SSD to HDD. + +More details on [configuring TTL](../../engines/table-engines/mergetree-family/mergetree.md#table_engine-mergetree-ttl). + +## ALTER DELETE {#alter-delete} + +ClickHouse does not have real-time point deletes like in [OLTP](https://en.wikipedia.org/wiki/Online_transaction_processing) databases. The closest thing to them are mutations. They are issued as `ALTER ... DELETE` or `ALTER ... 
UPDATE` queries to distinguish from normal `DELETE` or `UPDATE` as they are asynchronous batch operations, not immediate modifications. The rest of syntax after `ALTER TABLE` prefix is similar. + +`ALTER DELETE` can be issued to flexibly remove old data. If you need to do it regularly, the main downside will be the need to have an external system to submit the query. There are also some performance considerations since mutation rewrite complete parts even there’s only a single row to be deleted. + +This is the most common approach to make your system based on ClickHouse [GDPR](https://gdpr-info.eu)-compliant. + +More details on [mutations](../../sql-reference/statements/alter/index.md#alter-mutations). + +## DROP PARTITION {#drop-partition} + +`ALTER TABLE ... DROP PARTITION` provides a cost-efficient way to drop a whole partition. It’s not that flexible and needs proper partitioning scheme configured on table creation, but still covers most common cases. Like mutations need to be executed from an external system for regular use. + +More details on [manipulating partitions](../../sql-reference/statements/alter/partition.md#alter_drop-partition). + +## TRUNCATE {#truncate} + +It’s rather radical to drop all data from a table, but in some cases it might be exactly what you need. + +More details on [table truncation](../../sql-reference/statements/truncate.md). diff --git a/docs/en/faq/operations/index.md b/docs/en/faq/operations/index.md new file mode 100644 index 00000000000..81aec18b9cf --- /dev/null +++ b/docs/en/faq/operations/index.md @@ -0,0 +1,19 @@ +--- +title: Question about operating ClickHouse servers and clusters +toc_hidden_folder: true +toc_priority: 3 +toc_title: Operations +--- + +# Question About Operating ClickHouse Servers and Clusters {#question-about-operating-clickhouse-servers-and-clusters} + +Questions: + +- [Which ClickHouse version to use in production?](../../faq/operations/production.md) +- [Is it possible to delete old records from a ClickHouse table?](../../faq/operations/delete-old-data.md) +- [Does ClickHouse support multi-region replication?](../../faq/operations/multi-region-replication.md) + +!!! info "Don’t see what you were looking for?" + Check out [other F.A.Q. categories](../../faq/index.md) or browse around main documentation articles found in the left sidebar. + +{## [Original article](https://clickhouse.com/docs/en/faq/production/) ##} diff --git a/docs/en/faq/operations/multi-region-replication.md b/docs/en/faq/operations/multi-region-replication.md new file mode 100644 index 00000000000..7d78737544a --- /dev/null +++ b/docs/en/faq/operations/multi-region-replication.md @@ -0,0 +1,13 @@ +--- +title: Does ClickHouse support multi-region replication? +toc_hidden: true +toc_priority: 30 +--- + +# Does ClickHouse support multi-region replication? {#does-clickhouse-support-multi-region-replication} + +The short answer is "yes". However, we recommend keeping latency between all regions/datacenters in two-digit range, otherwise write performance will suffer as it goes through distributed consensus protocol. For example, replication between US coasts will likely work fine, but between the US and Europe won't. + +Configuration-wise there's no difference compared to single-region replication, simply use hosts that are located in different locations for replicas. + +For more information, see [full article on data replication](../../engines/table-engines/mergetree-family/replication.md). 
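In other words, the table definition itself does not change when replicas are spread across regions. A minimal sketch, assuming the `{shard}` and `{replica}` macros are defined in each server's configuration (the ZooKeeper path and table name are made up for illustration):

``` sql
CREATE TABLE events_replicated
(
    event_date Date,
    event_id   UInt64
)
ENGINE = ReplicatedMergeTree('/clickhouse/tables/{shard}/events_replicated', '{replica}')
PARTITION BY toYYYYMM(event_date)
ORDER BY event_id;
```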
diff --git a/docs/en/faq/operations/production.md b/docs/en/faq/operations/production.md new file mode 100644 index 00000000000..52ca300ced0 --- /dev/null +++ b/docs/en/faq/operations/production.md @@ -0,0 +1,70 @@ +--- +title: Which ClickHouse version to use in production? +toc_hidden: true +toc_priority: 10 +--- + +# Which ClickHouse Version to Use in Production? {#which-clickhouse-version-to-use-in-production} + +First of all, let’s discuss why people ask this question in the first place. There are two key reasons: + +1. ClickHouse is developed with pretty high velocity and usually, there are 10+ stable releases per year. It makes a wide range of releases to choose from, which is not so trivial choice. +2. Some users want to avoid spending time figuring out which version works best for their use case and just follow someone else’s advice. + +The second reason is more fundamental, so we’ll start with it and then get back to navigating through various ClickHouse releases. + +## Which ClickHouse Version Do You Recommend? {#which-clickhouse-version-do-you-recommend} + +It’s tempting to hire consultants or trust some known experts to get rid of responsibility for your production environment. You install some specific ClickHouse version that someone else recommended, now if there’s some issue with it - it’s not your fault, it’s someone else’s. This line of reasoning is a big trap. No external person knows better what’s going on in your company’s production environment. + +So how to properly choose which ClickHouse version to upgrade to? Or how to choose your first ClickHouse version? First of all, you need to invest in setting up a **realistic pre-production environment**. In an ideal world, it could be a completely identical shadow copy, but that’s usually expensive. + +Here’re some key points to get reasonable fidelity in a pre-production environment with not so high costs: + +- Pre-production environment needs to run an as close set of queries as you intend to run in production: + - Don’t make it read-only with some frozen data. + - Don’t make it write-only with just copying data without building some typical reports. + - Don’t wipe it clean instead of applying schema migrations. +- Use a sample of real production data and queries. Try to choose a sample that’s still representative and makes `SELECT` queries return reasonable results. Use obfuscation if your data is sensitive and internal policies do not allow it to leave the production environment. +- Make sure that pre-production is covered by your monitoring and alerting software the same way as your production environment does. +- If your production spans across multiple datacenters or regions, make your pre-production does the same. +- If your production uses complex features like replication, distributed table, cascading materialize views, make sure they are configured similarly in pre-production. +- There’s a trade-off on using the roughly same number of servers or VMs in pre-production as in production, but of smaller size, or much less of them, but of the same size. The first option might catch extra network-related issues, while the latter is easier to manage. + +The second area to invest in is **automated testing infrastructure**. Don’t assume that if some kind of query has executed successfully once, it’ll continue to do so forever. 
It’s ok to have some unit tests where ClickHouse is mocked but make sure your product has a reasonable set of automated tests that are run against real ClickHouse and check that all important use cases are still working as expected. + +Extra step forward could be contributing those automated tests to [ClickHouse’s open-source test infrastructure](https://github.com/ClickHouse/ClickHouse/tree/master/tests) that’s continuously used in its day-to-day development. It definitely will take some additional time and effort to learn [how to run it](../../development/tests.md) and then how to adapt your tests to this framework, but it’ll pay off by ensuring that ClickHouse releases are already tested against them when they are announced stable, instead of repeatedly losing time on reporting the issue after the fact and then waiting for a bugfix to be implemented, backported and released. Some companies even have such test contributions to infrastructure by its use as an internal policy, most notably it’s called [Beyonce’s Rule](https://www.oreilly.com/library/view/software-engineering-at/9781492082781/ch01.html#policies_that_scale_well) at Google. + +When you have your pre-production environment and testing infrastructure in place, choosing the best version is straightforward: + +1. Routinely run your automated tests against new ClickHouse releases. You can do it even for ClickHouse releases that are marked as `testing`, but going forward to the next steps with them is not recommended. +2. Deploy the ClickHouse release that passed the tests to pre-production and check that all processes are running as expected. +3. Report any issues you discovered to [ClickHouse GitHub Issues](https://github.com/ClickHouse/ClickHouse/issues). +4. If there were no major issues, it should be safe to start deploying ClickHouse release to your production environment. Investing in gradual release automation that implements an approach similar to [canary releases](https://martinfowler.com/bliki/CanaryRelease.html) or [green-blue deployments](https://martinfowler.com/bliki/BlueGreenDeployment.html) might further reduce the risk of issues in production. + +As you might have noticed, there’s nothing specific to ClickHouse in the approach described above, people do that for any piece of infrastructure they rely on if they take their production environment seriously. + +## How to Choose Between ClickHouse Releases? {#how-to-choose-between-clickhouse-releases} + +If you look into contents of ClickHouse package repository, you’ll see four kinds of packages: + +1. `testing` +2. `prestable` +3. `stable` +4. `lts` (long-term support) + +As was mentioned earlier, `testing` is good mostly to notice issues early, running them in production is not recommended because each of them is not tested as thoroughly as other kinds of packages. + +`prestable` is a release candidate which generally looks promising and is likely to become announced as `stable` soon. You can try them out in pre-production and report issues if you see any. + +For production use, there are two key options: `stable` and `lts`. Here is some guidance on how to choose between them: + +- `stable` is the kind of package we recommend by default. They are released roughly monthly (and thus provide new features with reasonable delay) and three latest stable releases are supported in terms of diagnostics and backporting of bugfixes. +- `lts` are released twice a year and are supported for a year after their initial release. 
You might prefer them over `stable` in the following cases: + - Your company has some internal policies that do not allow for frequent upgrades or using non-LTS software. + - You are using ClickHouse in some secondary products that either does not require any complex ClickHouse features and do not have enough resources to keep it updated. + +Many teams who initially thought that `lts` is the way to go, often switch to `stable` anyway because of some recent feature that’s important for their product. + +!!! warning "Important" + One more thing to keep in mind when upgrading ClickHouse: we’re always keeping eye on compatibility across releases, but sometimes it’s not reasonable to keep and some minor details might change. So make sure you check the [changelog](../../whats-new/changelog/index.md) before upgrading to see if there are any notes about backward-incompatible changes. diff --git a/docs/en/faq/use-cases/index.md b/docs/en/faq/use-cases/index.md new file mode 100644 index 00000000000..aac5493b105 --- /dev/null +++ b/docs/en/faq/use-cases/index.md @@ -0,0 +1,18 @@ +--- +title: Questions about ClickHouse use cases +toc_hidden_folder: true +toc_priority: 2 +toc_title: Use Cases +--- + +# Questions About ClickHouse Use Cases {#questions-about-clickhouse-use-cases} + +Questions: + +- [Can I use ClickHouse as a time-series database?](../../faq/use-cases/time-series.md) +- [Can I use ClickHouse as a key-value storage?](../../faq/use-cases/key-value.md) + +!!! info "Don’t see what you were looking for?" + Check out [other F.A.Q. categories](../../faq/index.md) or browse around main documentation articles found in the left sidebar. + +{## [Original article](https://clickhouse.com/docs/en/faq/use-cases/) ##} diff --git a/docs/en/faq/use-cases/key-value.md b/docs/en/faq/use-cases/key-value.md new file mode 100644 index 00000000000..2827dd2fa58 --- /dev/null +++ b/docs/en/faq/use-cases/key-value.md @@ -0,0 +1,17 @@ +--- +title: Can I use ClickHouse as a key-value storage? +toc_hidden: true +toc_priority: 101 +--- + +# Can I Use ClickHouse As a Key-Value Storage? {#can-i-use-clickhouse-as-a-key-value-storage} + +The short answer is **“no”**. The key-value workload is among top positions in the list of cases when **NOT**{.text-danger} to use ClickHouse. It’s an [OLAP](../../faq/general/olap.md) system after all, while there are many excellent key-value storage systems out there. + +However, there might be situations where it still makes sense to use ClickHouse for key-value-like queries. Usually, it’s some low-budget products where the main workload is analytical in nature and fits ClickHouse well, but there’s also some secondary process that needs a key-value pattern with not so high request throughput and without strict latency requirements. If you had an unlimited budget, you would have installed a secondary key-value database for thus secondary workload, but in reality, there’s an additional cost of maintaining one more storage system (monitoring, backups, etc.) which might be desirable to avoid. + +If you decide to go against recommendations and run some key-value-like queries against ClickHouse, here’re some tips: + +- The key reason why point queries are expensive in ClickHouse is its sparse primary index of main [MergeTree table engine family](../../engines/table-engines/mergetree-family/mergetree.md). 
This index can’t point to each specific row of data; instead, it points to every N-th row, and the system has to scan from the nearest indexed row to the desired one, reading excess data along the way. In a key-value scenario, it might be useful to reduce the value of N with the `index_granularity` setting. +- ClickHouse keeps each column in a separate set of files, so to assemble one complete row it needs to go through each of those files. Their count increases linearly with the number of columns, so in the key-value scenario, it might be worth avoiding many columns and putting all your payload in a single `String` column encoded in some serialization format like JSON, Protobuf or whatever makes sense. +- There’s an alternative approach that uses the [Join](../../engines/table-engines/special/join.md) table engine instead of normal `MergeTree` tables and the [joinGet](../../sql-reference/functions/other-functions.md#joinget) function to retrieve the data. It can provide better query performance but might have some usability and reliability issues. Here’s a [usage example](https://github.com/ClickHouse/ClickHouse/blob/master/tests/queries/0_stateless/00800_versatile_storage_join.sql#L49-L51). diff --git a/docs/en/faq/use-cases/time-series.md b/docs/en/faq/use-cases/time-series.md new file mode 100644 index 00000000000..bf97ac4b1e2 --- /dev/null +++ b/docs/en/faq/use-cases/time-series.md @@ -0,0 +1,15 @@ +--- +title: Can I use ClickHouse as a time-series database? +toc_hidden: true +toc_priority: 101 +--- + +# Can I Use ClickHouse As a Time-Series Database? {#can-i-use-clickhouse-as-a-time-series-database} + +ClickHouse is a generic data storage solution for [OLAP](../../faq/general/olap.md) workloads, while there are many specialized time-series database management systems. Nevertheless, ClickHouse’s [focus on query execution speed](../../faq/general/why-clickhouse-is-so-fast.md) allows it to outperform specialized systems in many cases. There are many independent benchmarks on this topic out there, so we’re not going to conduct one here. Instead, let’s focus on the ClickHouse features that are important for this use case. + +First of all, there are **[specialized codecs](../../sql-reference/statements/create/table.md#create-query-specialized-codecs)** which make typical time-series data much more compact: either common algorithms like `DoubleDelta` and `Gorilla`, or codecs specific to ClickHouse like `T64`. + +Second, time-series queries often hit only recent data, like one day or one week old. It makes sense to use servers that have both fast NVMe/SSD drives and high-capacity HDD drives. The ClickHouse [TTL](../../engines/table-engines/mergetree-family/mergetree.md#table_engine-mergetree-multiple-volumes) feature allows you to keep fresh, hot data on fast drives and gradually move it to slower drives as it ages. Rollup or removal of even older data is also possible if your requirements demand it. + +Even though it’s against the ClickHouse philosophy of storing and processing raw data, you can use [materialized views](../../sql-reference/statements/create/view.md) to fit into even tighter latency or cost requirements.
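Putting the codec and TTL points above together, a time-series table could be sketched as follows; the table name, columns and retention period are made up for illustration:

``` sql
CREATE TABLE metrics
(
    metric LowCardinality(String),
    ts     DateTime CODEC(DoubleDelta, LZ4),
    value  Float64  CODEC(Gorilla, LZ4)
)
ENGINE = MergeTree
PARTITION BY toYYYYMM(ts)
ORDER BY (metric, ts)
TTL ts + INTERVAL 6 MONTH DELETE;
```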
diff --git a/docs/en/example-datasets/amplab-benchmark.md b/docs/en/getting-started/example-datasets/amplab-benchmark.md similarity index 96% rename from docs/en/example-datasets/amplab-benchmark.md rename to docs/en/getting-started/example-datasets/amplab-benchmark.md index a87ac53e2e3..b410a3595ec 100644 --- a/docs/en/example-datasets/amplab-benchmark.md +++ b/docs/en/getting-started/example-datasets/amplab-benchmark.md @@ -1,6 +1,6 @@ --- -sidebar_label: AMPLab Big Data Benchmark -description: A benchmark dataset used for comparing the performance of data warehousing solutions. +toc_priority: 19 +toc_title: AMPLab Big Data Benchmark --- # AMPLab Big Data Benchmark {#amplab-big-data-benchmark} diff --git a/docs/en/example-datasets/brown-benchmark.md b/docs/en/getting-started/example-datasets/brown-benchmark.md similarity index 99% rename from docs/en/example-datasets/brown-benchmark.md rename to docs/en/getting-started/example-datasets/brown-benchmark.md index 0960756dbe9..93049d1f76a 100644 --- a/docs/en/example-datasets/brown-benchmark.md +++ b/docs/en/getting-started/example-datasets/brown-benchmark.md @@ -1,6 +1,6 @@ --- -sidebar_label: Brown University Benchmark -description: A new analytical benchmark for machine-generated log data +toc_priority: 20 +toc_title: Brown University Benchmark --- # Brown University Benchmark diff --git a/docs/en/example-datasets/cell-towers.md b/docs/en/getting-started/example-datasets/cell-towers.md similarity index 98% rename from docs/en/example-datasets/cell-towers.md rename to docs/en/getting-started/example-datasets/cell-towers.md index 6c3201ff2b2..1f681fc32d8 100644 --- a/docs/en/example-datasets/cell-towers.md +++ b/docs/en/getting-started/example-datasets/cell-towers.md @@ -1,8 +1,9 @@ --- -sidebar_label: Cell Towers +toc_priority: 21 +toc_title: Cell Towers --- -# Cell Towers +# Cell Towers {#cell-towers} This dataset is from [OpenCellid](https://www.opencellid.org/) - The world's largest Open Database of Cell Towers. @@ -95,7 +96,7 @@ SELECT mcc, count() FROM cell_towers GROUP BY mcc ORDER BY count() DESC LIMIT 10 So, the top countries are: the USA, Germany, and Russia. -You may want to create an [External Dictionary](../sql-reference/dictionaries/external-dictionaries/external-dicts.md) in ClickHouse to decode these values. +You may want to create an [External Dictionary](../../sql-reference/dictionaries/external-dictionaries/external-dicts.md) in ClickHouse to decode these values. 
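One way to do that decoding, sketched under the assumption that an `mcc,country` CSV file (with a header row) has been placed in the server's `user_files` directory:

``` sql
CREATE DICTIONARY mcc_names
(
    mcc     UInt64,
    country String
)
PRIMARY KEY mcc
SOURCE(FILE(path '/var/lib/clickhouse/user_files/mcc.csv' format 'CSVWithNames'))
LAYOUT(FLAT())
LIFETIME(MIN 0 MAX 3600);

-- Repeat the top-countries query, but with readable names instead of raw MCC codes.
SELECT dictGet('mcc_names', 'country', toUInt64(mcc)) AS country, count()
FROM cell_towers
GROUP BY country
ORDER BY count() DESC
LIMIT 10;
```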
## Use case {#use-case} diff --git a/docs/en/example-datasets/criteo.md b/docs/en/getting-started/example-datasets/criteo.md similarity index 96% rename from docs/en/example-datasets/criteo.md rename to docs/en/getting-started/example-datasets/criteo.md index 2d1c700d15c..08298172c70 100644 --- a/docs/en/example-datasets/criteo.md +++ b/docs/en/getting-started/example-datasets/criteo.md @@ -1,8 +1,9 @@ --- -sidebar_label: Terabyte Click Logs from Criteo +toc_priority: 18 +toc_title: Terabyte Click Logs from Criteo --- -# Terabyte of Click Logs from Criteo +# Terabyte of Click Logs from Criteo {#terabyte-of-click-logs-from-criteo} Download the data from http://labs.criteo.com/downloads/download-terabyte-click-logs/ diff --git a/docs/en/example-datasets/github-events.md b/docs/en/getting-started/example-datasets/github-events.md similarity index 89% rename from docs/en/example-datasets/github-events.md rename to docs/en/getting-started/example-datasets/github-events.md index 3a0cbc3324d..e470e88b182 100644 --- a/docs/en/example-datasets/github-events.md +++ b/docs/en/getting-started/example-datasets/github-events.md @@ -1,5 +1,6 @@ --- -sidebar_label: GitHub Events +toc_priority: 11 +toc_title: GitHub Events --- # GitHub Events Dataset diff --git a/docs/en/getting-started/example-datasets/index.md b/docs/en/getting-started/example-datasets/index.md new file mode 100644 index 00000000000..d4c9bab2441 --- /dev/null +++ b/docs/en/getting-started/example-datasets/index.md @@ -0,0 +1,28 @@ +--- +toc_folder_title: Example Datasets +toc_priority: 10 +toc_title: Introduction +--- + +# Example Datasets {#example-datasets} + +This section describes how to obtain example datasets and import them into ClickHouse. For some datasets example queries are also available. + +The list of documented datasets: + +- [GitHub Events](../../getting-started/example-datasets/github-events.md) +- [Anonymized Web Analytics Dataset](../../getting-started/example-datasets/metrica.md) +- [Recipes](../../getting-started/example-datasets/recipes.md) +- [Star Schema Benchmark](../../getting-started/example-datasets/star-schema.md) +- [WikiStat](../../getting-started/example-datasets/wikistat.md) +- [Terabyte of Click Logs from Criteo](../../getting-started/example-datasets/criteo.md) +- [AMPLab Big Data Benchmark](../../getting-started/example-datasets/amplab-benchmark.md) +- [Brown University Benchmark](../../getting-started/example-datasets/brown-benchmark.md) +- [New York Taxi Data](../../getting-started/example-datasets/nyc-taxi.md) +- [OpenSky](../../getting-started/example-datasets/opensky.md) +- [UK Property Price Paid](../../getting-started/example-datasets/uk-price-paid.md) +- [Cell Towers](../../getting-started/example-datasets/cell-towers.md) +- [What's on the Menu?](../../getting-started/example-datasets/menus.md) +- [OnTime](../../getting-started/example-datasets/ontime.md) + +[Original article](https://clickhouse.com/docs/en/getting_started/example_datasets) diff --git a/docs/en/example-datasets/menus.md b/docs/en/getting-started/example-datasets/menus.md similarity index 93% rename from docs/en/example-datasets/menus.md rename to docs/en/getting-started/example-datasets/menus.md index c41195223a2..665944b3e6f 100644 --- a/docs/en/example-datasets/menus.md +++ b/docs/en/getting-started/example-datasets/menus.md @@ -1,8 +1,9 @@ --- -sidebar_label: New York Public Library "What's on the Menu?" Dataset +toc_priority: 21 +toc_title: Menus --- -# New York Public Library "What's on the Menu?" 
Dataset +# New York Public Library "What's on the Menu?" Dataset {#menus-dataset} The dataset is created by the New York Public Library. It contains historical data on the menus of hotels, restaurants and cafes with the dishes along with their prices. @@ -39,7 +40,7 @@ The data is normalized consisted of four tables: ## Create the Tables {#create-tables} -We use [Decimal](../sql-reference/data-types/decimal.md) data type to store prices. +We use [Decimal](../../sql-reference/data-types/decimal.md) data type to store prices. ```sql CREATE TABLE dish @@ -115,17 +116,17 @@ clickhouse-client --format_csv_allow_single_quotes 0 --input_format_null_as_defa clickhouse-client --format_csv_allow_single_quotes 0 --input_format_null_as_default 0 --date_time_input_format best_effort --query "INSERT INTO menu_item FORMAT CSVWithNames" < MenuItem.csv ``` -We use [CSVWithNames](../interfaces/formats.md#csvwithnames) format as the data is represented by CSV with header. +We use [CSVWithNames](../../interfaces/formats.md#csvwithnames) format as the data is represented by CSV with header. We disable `format_csv_allow_single_quotes` as only double quotes are used for data fields and single quotes can be inside the values and should not confuse the CSV parser. -We disable [input_format_null_as_default](../operations/settings/settings.md#settings-input-format-null-as-default) as our data does not have [NULL](../sql-reference/syntax.md#null-literal). Otherwise ClickHouse will try to parse `\N` sequences and can be confused with `\` in data. +We disable [input_format_null_as_default](../../operations/settings/settings.md#settings-input-format-null-as-default) as our data does not have [NULL](../../sql-reference/syntax.md#null-literal). Otherwise ClickHouse will try to parse `\N` sequences and can be confused with `\` in data. -The setting [date_time_input_format best_effort](../operations/settings/settings.md#settings-date_time_input_format) allows to parse [DateTime](../sql-reference/data-types/datetime.md) fields in wide variety of formats. For example, ISO-8601 without seconds like '2000-01-01 01:02' will be recognized. Without this setting only fixed DateTime format is allowed. +The setting [date_time_input_format best_effort](../../operations/settings/settings.md#settings-date_time_input_format) allows to parse [DateTime](../../sql-reference/data-types/datetime.md) fields in wide variety of formats. For example, ISO-8601 without seconds like '2000-01-01 01:02' will be recognized. Without this setting only fixed DateTime format is allowed. ## Denormalize the Data {#denormalize-data} -Data is presented in multiple tables in [normalized form](https://en.wikipedia.org/wiki/Database_normalization#Normal_forms). It means you have to perform [JOIN](../sql-reference/statements/select/join.md#select-join) if you want to query, e.g. dish names from menu items. +Data is presented in multiple tables in [normalized form](https://en.wikipedia.org/wiki/Database_normalization#Normal_forms). It means you have to perform [JOIN](../../sql-reference/statements/select/join.md#select-join) if you want to query, e.g. dish names from menu items. For typical analytical tasks it is way more efficient to deal with pre-JOINed data to avoid doing `JOIN` every time. It is called "denormalized" data. 
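For example, getting dish names together with their prices and the menu event would require a query roughly like the following sketch (the join keys follow the table definitions earlier on this page):

``` sql
-- A sketch of the JOIN that the normalized layout forces you to write;
-- column names are taken from the CREATE TABLE statements above.
SELECT
    dish.name AS dish_name,
    menu_item.price AS price,
    menu.event AS event
FROM menu_item
INNER JOIN dish ON menu_item.dish_id = dish.id
INNER JOIN menu_page ON menu_item.menu_page_id = menu_page.id
INNER JOIN menu ON menu_page.menu_id = menu.id
LIMIT 10;
```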
We will create a table `menu_item_denorm` where will contain all the data JOINed together: diff --git a/docs/en/example-datasets/metrica.md b/docs/en/getting-started/example-datasets/metrica.md similarity index 97% rename from docs/en/example-datasets/metrica.md rename to docs/en/getting-started/example-datasets/metrica.md index c5ef74750a6..d9d8beb0181 100644 --- a/docs/en/example-datasets/metrica.md +++ b/docs/en/getting-started/example-datasets/metrica.md @@ -1,9 +1,9 @@ --- -sidebar_label: Web Analytics Data -description: Dataset consists of two tables containing anonymized web analytics data with hits and visits +toc_priority: 15 +toc_title: Web Analytics Data --- -# Anonymized Web Analytics Data +# Anonymized Web Analytics Data {#anonymized-web-analytics-data} Dataset consists of two tables containing anonymized web analytics data with hits (`hits_v1`) and visits (`visits_v1`). @@ -73,6 +73,6 @@ clickhouse-client --query "SELECT COUNT(*) FROM datasets.visits_v1" ## Example Queries {#example-queries} -[The ClickHouse tutorial](../../tutorial.md) is based on this web analytics dataset, and the recommended way to get started with this dataset is to go through the tutorial. +[The ClickHouse tutorial](../../getting-started/tutorial.md) is based on this web analytics dataset, and the recommended way to get started with this dataset is to go through the tutorial. Additional examples of queries to these tables can be found among [stateful tests](https://github.com/ClickHouse/ClickHouse/tree/master/tests/queries/1_stateful) of ClickHouse (they are named `test.hits` and `test.visits` there). diff --git a/docs/en/example-datasets/nyc-taxi.md b/docs/en/getting-started/example-datasets/nyc-taxi.md similarity index 99% rename from docs/en/example-datasets/nyc-taxi.md rename to docs/en/getting-started/example-datasets/nyc-taxi.md index 270aeb4929c..a7825988695 100644 --- a/docs/en/example-datasets/nyc-taxi.md +++ b/docs/en/getting-started/example-datasets/nyc-taxi.md @@ -1,9 +1,9 @@ --- -sidebar_label: New York Taxi Data -description: Data for billions of taxi and for-hire vehicle (Uber, Lyft, etc.) trips originating in New York City since 2009 +toc_priority: 20 +toc_title: New York Taxi Data --- -# New York Taxi Data +# New York Taxi Data {#new-york-taxi-data} This dataset can be obtained in two ways: @@ -290,9 +290,8 @@ $ sudo service clickhouse-server restart $ clickhouse-client --query "select count(*) from datasets.trips_mergetree" ``` -:::info -If you will run the queries described below, you have to use the full table name, `datasets.trips_mergetree`. -::: +!!! info "Info" + If you will run the queries described below, you have to use the full table name, `datasets.trips_mergetree`. 
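For example, a simple aggregation written with the fully qualified name looks like this (a sketch; `passenger_count` and `total_amount` are columns of the trips table):

``` sql
-- Note the database-qualified table name datasets.trips_mergetree.
SELECT
    passenger_count,
    avg(total_amount) AS avg_total
FROM datasets.trips_mergetree
GROUP BY passenger_count
ORDER BY passenger_count;
```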
## Results on Single Server {#results-on-single-server} diff --git a/docs/en/example-datasets/ontime.md b/docs/en/getting-started/example-datasets/ontime.md similarity index 97% rename from docs/en/example-datasets/ontime.md rename to docs/en/getting-started/example-datasets/ontime.md index bb3c3644972..efc807b75fa 100644 --- a/docs/en/example-datasets/ontime.md +++ b/docs/en/getting-started/example-datasets/ontime.md @@ -1,9 +1,9 @@ --- -sidebar_label: OnTime Airline Flight Data -description: Dataset containing the on-time performance of airline flights +toc_priority: 21 +toc_title: OnTime --- -# OnTime +# OnTime {#ontime} This dataset can be obtained in two ways: @@ -156,9 +156,8 @@ $ sudo service clickhouse-server restart $ clickhouse-client --query "select count(*) from datasets.ontime" ``` -:::note -If you will run the queries described below, you have to use the full table name, `datasets.ontime`. -::: +!!! info "Info" + If you will run the queries described below, you have to use the full table name, `datasets.ontime`. ## Queries {#queries} diff --git a/docs/en/example-datasets/opensky.md b/docs/en/getting-started/example-datasets/opensky.md similarity index 98% rename from docs/en/example-datasets/opensky.md rename to docs/en/getting-started/example-datasets/opensky.md index 719f32d7c3e..2d901397cb2 100644 --- a/docs/en/example-datasets/opensky.md +++ b/docs/en/getting-started/example-datasets/opensky.md @@ -1,11 +1,11 @@ --- -sidebar_label: Air Traffic Data -description: The data in this dataset is derived and cleaned from the full OpenSky dataset to illustrate the development of air traffic during the COVID-19 pandemic. +toc_priority: 20 +toc_title: OpenSky --- -# Crowdsourced air traffic data from The OpenSky Network 2020 +# Crowdsourced air traffic data from The OpenSky Network 2020 {#opensky} -The data in this dataset is derived and cleaned from the full OpenSky dataset to illustrate the development of air traffic during the COVID-19 pandemic. It spans all flights seen by the network's more than 2500 members since 1 January 2019. More data will be periodically included in the dataset until the end of the COVID-19 pandemic. +"The data in this dataset is derived and cleaned from the full OpenSky dataset to illustrate the development of air traffic during the COVID-19 pandemic. It spans all flights seen by the network's more than 2500 members since 1 January 2019. More data will be periodically included in the dataset until the end of the COVID-19 pandemic". Source: https://zenodo.org/record/5092942#.YRBCyTpRXYd @@ -60,9 +60,9 @@ ls -1 flightlist_*.csv.gz | xargs -P100 -I{} bash -c 'gzip -c -d "{}" | clickhou `xargs -P100` specifies to use up to 100 parallel workers but as we only have 30 files, the number of workers will be only 30. - For every file, `xargs` will run a script with `bash -c`. The script has substitution in form of `{}` and the `xargs` command will substitute the filename to it (we have asked it for `xargs` with `-I{}`). - The script will decompress the file (`gzip -c -d "{}"`) to standard output (`-c` parameter) and the output is redirected to `clickhouse-client`. -- We also asked to parse [DateTime](../sql-reference/data-types/datetime.md) fields with extended parser ([--date_time_input_format best_effort](../operations/settings/settings.md#settings-date_time_input_format)) to recognize ISO-8601 format with timezone offsets. 
+- We also asked to parse [DateTime](../../sql-reference/data-types/datetime.md) fields with extended parser ([--date_time_input_format best_effort](../../operations/settings/settings.md#settings-date_time_input_format)) to recognize ISO-8601 format with timezone offsets. -Finally, `clickhouse-client` will do insertion. It will read input data in [CSVWithNames](../interfaces/formats.md#csvwithnames) format. +Finally, `clickhouse-client` will do insertion. It will read input data in [CSVWithNames](../../interfaces/formats.md#csvwithnames) format. Parallel upload takes 24 seconds. diff --git a/docs/en/example-datasets/recipes.md b/docs/en/getting-started/example-datasets/recipes.md similarity index 98% rename from docs/en/example-datasets/recipes.md rename to docs/en/getting-started/example-datasets/recipes.md index b01efc8de26..70a56a0547f 100644 --- a/docs/en/example-datasets/recipes.md +++ b/docs/en/getting-started/example-datasets/recipes.md @@ -1,5 +1,6 @@ --- -sidebar_label: Recipes Dataset +toc_priority: 16 +toc_title: Recipes Dataset --- # Recipes Dataset @@ -50,13 +51,13 @@ clickhouse-client --query " This is a showcase how to parse custom CSV, as it requires multiple tunes. Explanation: -- The dataset is in CSV format, but it requires some preprocessing on insertion; we use table function [input](../sql-reference/table-functions/input.md) to perform preprocessing; +- The dataset is in CSV format, but it requires some preprocessing on insertion; we use table function [input](../../sql-reference/table-functions/input.md) to perform preprocessing; - The structure of CSV file is specified in the argument of the table function `input`; - The field `num` (row number) is unneeded - we parse it from file and ignore; - We use `FORMAT CSVWithNames` but the header in CSV will be ignored (by command line parameter `--input_format_with_names_use_header 0`), because the header does not contain the name for the first field; - File is using only double quotes to enclose CSV strings; some strings are not enclosed in double quotes, and single quote must not be parsed as the string enclosing - that's why we also add the `--format_csv_allow_single_quote 0` parameter; - Some strings from CSV cannot parse, because they contain `\M/` sequence at the beginning of the value; the only value starting with backslash in CSV can be `\N` that is parsed as SQL NULL. We add `--input_format_allow_errors_num 10` parameter and up to ten malformed records can be skipped; -- There are arrays for ingredients, directions and NER fields; these arrays are represented in unusual form: they are serialized into string as JSON and then placed in CSV - we parse them as String and then use [JSONExtract](../sql-reference/functions/json-functions/) function to transform it to Array. +- There are arrays for ingredients, directions and NER fields; these arrays are represented in unusual form: they are serialized into string as JSON and then placed in CSV - we parse them as String and then use [JSONExtract](../../sql-reference/functions/json-functions/) function to transform it to Array. ## Validate the Inserted Data @@ -80,7 +81,7 @@ Result: ### Top Components by the Number of Recipes: -In this example we learn how to use [arrayJoin](../sql-reference/functions/array-join/) function to expand an array into a set of rows. +In this example we learn how to use [arrayJoin](../../sql-reference/functions/array-join/) function to expand an array into a set of rows. Query: @@ -185,7 +186,7 @@ Result: 10 rows in set. Elapsed: 0.215 sec. 
Processed 2.23 million rows, 1.48 GB (10.35 million rows/s., 6.86 GB/s.) ``` -In this example, we involve [has](../sql-reference/functions/array-functions/#hasarr-elem) function to filter by array elements and sort by the number of directions. +In this example, we involve [has](../../sql-reference/functions/array-functions/#hasarr-elem) function to filter by array elements and sort by the number of directions. There is a wedding cake that requires the whole 126 steps to produce! Show that directions: diff --git a/docs/en/example-datasets/star-schema.md b/docs/en/getting-started/example-datasets/star-schema.md similarity index 96% rename from docs/en/example-datasets/star-schema.md rename to docs/en/getting-started/example-datasets/star-schema.md index 35ff492c360..14fa7cef654 100644 --- a/docs/en/example-datasets/star-schema.md +++ b/docs/en/getting-started/example-datasets/star-schema.md @@ -1,11 +1,9 @@ --- -sidebar_label: Star Schema Benchmark -description: "Dataset based on the TPC-H dbgen source. The coding style and architecture -follows the TPCH dbgen." +toc_priority: 16 +toc_title: Star Schema Benchmark --- -# Star Schema Benchmark - +# Star Schema Benchmark {#star-schema-benchmark} Compiling dbgen: @@ -17,9 +15,8 @@ $ make Generating data: -:::warning -With `-s 100` dbgen generates 600 million rows (67 GB), while while `-s 1000` it generates 6 billion rows (which takes a lot of time) -::: +!!! warning "Attention" + With `-s 100` dbgen generates 600 million rows (67 GB), while while `-s 1000` it generates 6 billion rows (which takes a lot of time) ``` bash $ ./dbgen -s 1000 -T c diff --git a/docs/en/example-datasets/uk-price-paid.md b/docs/en/getting-started/example-datasets/uk-price-paid.md similarity index 98% rename from docs/en/example-datasets/uk-price-paid.md rename to docs/en/getting-started/example-datasets/uk-price-paid.md index e19e801dcf9..4b0ba25907d 100644 --- a/docs/en/example-datasets/uk-price-paid.md +++ b/docs/en/getting-started/example-datasets/uk-price-paid.md @@ -1,8 +1,9 @@ --- -sidebar_label: UK Property Price Paid +toc_priority: 20 +toc_title: UK Property Price Paid --- -# UK Property Price Paid +# UK Property Price Paid {#uk-property-price-paid} The dataset contains data about prices paid for real-estate property in England and Wales. The data is available since year 1995. The size of the dataset in uncompressed form is about 4 GiB and it will take about 278 MiB in ClickHouse. @@ -54,9 +55,9 @@ In this example, we define the structure of source data from the CSV file and sp The preprocessing is: - splitting the postcode to two different columns `postcode1` and `postcode2` that is better for storage and queries; - coverting the `time` field to date as it only contains 00:00 time; -- ignoring the [UUid](../sql-reference/data-types/uuid.md) field because we don't need it for analysis; -- transforming `type` and `duration` to more readable Enum fields with function [transform](../sql-reference/functions/other-functions.md#transform); -- transforming `is_new` and `category` fields from single-character string (`Y`/`N` and `A`/`B`) to [UInt8](../sql-reference/data-types/int-uint.md#uint8-uint16-uint32-uint64-uint256-int8-int16-int32-int64-int128-int256) field with 0 and 1. 
+- ignoring the [UUid](../../sql-reference/data-types/uuid.md) field because we don't need it for analysis; +- transforming `type` and `duration` to more readable Enum fields with function [transform](../../sql-reference/functions/other-functions.md#transform); +- transforming `is_new` and `category` fields from single-character string (`Y`/`N` and `A`/`B`) to [UInt8](../../sql-reference/data-types/int-uint.md#uint8-uint16-uint32-uint64-uint256-int8-int16-int32-int64-int128-int256) field with 0 and 1. Preprocessed data is piped directly to `clickhouse-client` to be inserted into ClickHouse table in streaming fashion. @@ -352,7 +353,7 @@ Result: ## Let's Speed Up Queries Using Projections {#speedup-with-projections} -[Projections](../sql-reference/statements/alter/projection.md) allow to improve queries speed by storing pre-aggregated data. +[Projections](../../sql-reference/statements/alter/projection.md) allow to improve queries speed by storing pre-aggregated data. ### Build a Projection {#build-projection} @@ -388,7 +389,7 @@ SETTINGS mutations_sync = 1; Let's run the same 3 queries. -[Enable](../operations/settings/settings.md#allow-experimental-projection-optimization) projections for selects: +[Enable](../../operations/settings/settings.md#allow-experimental-projection-optimization) projections for selects: ```sql SET allow_experimental_projection_optimization = 1; diff --git a/docs/en/example-datasets/wikistat.md b/docs/en/getting-started/example-datasets/wikistat.md similarity index 91% rename from docs/en/example-datasets/wikistat.md rename to docs/en/getting-started/example-datasets/wikistat.md index 1185338a1da..3e3f7b164ce 100644 --- a/docs/en/example-datasets/wikistat.md +++ b/docs/en/getting-started/example-datasets/wikistat.md @@ -1,10 +1,11 @@ --- -sidebar_label: WikiStat +toc_priority: 17 +toc_title: WikiStat --- -# WikiStat +# WikiStat {#wikistat} -See http://dumps.wikimedia.org/other/pagecounts-raw/ for details. +See: http://dumps.wikimedia.org/other/pagecounts-raw/ Creating a table: diff --git a/docs/en/getting-started/index.md b/docs/en/getting-started/index.md new file mode 100644 index 00000000000..372e8d7bd64 --- /dev/null +++ b/docs/en/getting-started/index.md @@ -0,0 +1,15 @@ +--- +toc_folder_title: Getting Started +toc_hidden: true +toc_priority: 8 +toc_title: hidden +--- + +# Getting Started {#getting-started} + +If you are new to ClickHouse and want to get a hands-on feeling of its performance, first of all, you need to go through the [installation process](../getting-started/install.md). After that you can: + +- [Go through detailed tutorial](../getting-started/tutorial.md) +- [Experiment with example datasets](../getting-started/example-datasets/ontime.md) + +[Original article](https://clickhouse.com/docs/en/getting_started/) diff --git a/docs/en/install.md b/docs/en/getting-started/install.md similarity index 58% rename from docs/en/install.md rename to docs/en/getting-started/install.md index 37cb113bc4a..cd734d4dc8b 100644 --- a/docs/en/install.md +++ b/docs/en/getting-started/install.md @@ -1,9 +1,6 @@ --- -sidebar_label: Installation -sidebar_position: 1 -keywords: [clickhouse, install, installation, docs] -description: ClickHouse can run on any Linux, FreeBSD, or Mac OS X with x86_64, AArch64, or PowerPC64LE CPU architecture. 
-slug: /en/getting-started/install +toc_priority: 11 +toc_title: Installation --- # Installation {#installation} @@ -27,36 +24,15 @@ To run ClickHouse on processors that do not support SSE 4.2 or have AArch64 or P It is recommended to use official pre-compiled `deb` packages for Debian or Ubuntu. Run these commands to install packages: ``` bash -sudo apt-get install apt-transport-https ca-certificates dirmngr -sudo apt-key adv --keyserver hkp://keyserver.ubuntu.com:80 --recv E0C56BD4 - -echo "deb https://repo.clickhouse.com/deb/stable/ main/" | sudo tee \ - /etc/apt/sources.list.d/clickhouse.list -sudo apt-get update - -sudo apt-get install -y clickhouse-server clickhouse-client - -sudo service clickhouse-server start -clickhouse-client # or "clickhouse-client --password" if you set up a password. +{% include 'install/deb.sh' %} ``` -
+
+ Deprecated Method for installing deb-packages - ``` bash -sudo apt-get install apt-transport-https ca-certificates dirmngr -sudo apt-key adv --keyserver hkp://keyserver.ubuntu.com:80 --recv E0C56BD4 - -echo "deb https://repo.clickhouse.com/deb/stable/ main/" | sudo tee \ - /etc/apt/sources.list.d/clickhouse.list -sudo apt-get update - -sudo apt-get install -y clickhouse-server clickhouse-client - -sudo service clickhouse-server start -clickhouse-client # or "clickhouse-client --password" if you set up a password. +{% include 'install/deb_repo.sh' %} ``` -
You can replace `stable` with `lts` or `testing` to use different [release trains](../faq/operations/production.md) based on your needs. @@ -70,10 +46,9 @@ You can also download and install packages manually from [here](https://packages - `clickhouse-client` — Creates a symbolic link for `clickhouse-client` and other client-related tools. and installs client configuration files. - `clickhouse-common-static-dbg` — Installs ClickHouse compiled binary files with debug info. -:::info -If you need to install specific version of ClickHouse you have to install all packages with the same version: -`sudo apt-get install clickhouse-server=21.8.5.7 clickhouse-client=21.8.5.7 clickhouse-common-static=21.8.5.7` -::: +!!! attention "Attention" + If you need to install specific version of ClickHouse you have to install all packages with the same version: + `sudo apt-get install clickhouse-server=21.8.5.7 clickhouse-client=21.8.5.7 clickhouse-common-static=21.8.5.7` ### From RPM Packages {#from-rpm-packages} @@ -82,28 +57,15 @@ It is recommended to use official pre-compiled `rpm` packages for CentOS, RedHat First, you need to add the official repository: ``` bash -sudo yum install -y yum-utils -sudo yum-config-manager --add-repo https://packages.clickhouse.com/rpm/clickhouse.repo -sudo yum install -y clickhouse-server clickhouse-client - -sudo /etc/init.d/clickhouse-server start -clickhouse-client # or "clickhouse-client --password" if you set up a password. +{% include 'install/rpm.sh' %} ```
Deprecated Method for installing rpm-packages - ``` bash -sudo yum install yum-utils -sudo rpm --import https://repo.clickhouse.com/CLICKHOUSE-KEY.GPG -sudo yum-config-manager --add-repo https://repo.clickhouse.com/rpm/clickhouse.repo -sudo yum install clickhouse-server clickhouse-client - -sudo /etc/init.d/clickhouse-server start -clickhouse-client # or "clickhouse-client --password" if you set up a password. +{% include 'install/rpm_repo.sh' %} ``` -
If you want to use the most recent version, replace `stable` with `testing` (this is recommended for your testing environments). `prestable` is sometimes also available. @@ -124,52 +86,14 @@ The required version can be downloaded with `curl` or `wget` from repository htt After that downloaded archives should be unpacked and installed with installation scripts. Example for the latest stable version: ``` bash -LATEST_VERSION=$(curl -s https://packages.clickhouse.com/tgz/stable/ | \ - grep -Eo '[0-9]+\.[0-9]+\.[0-9]+\.[0-9]+' | sort -V -r | head -n 1) -export LATEST_VERSION -curl -O "https://packages.clickhouse.com/tgz/stable/clickhouse-common-static-$LATEST_VERSION.tgz" -curl -O "https://packages.clickhouse.com/tgz/stable/clickhouse-common-static-dbg-$LATEST_VERSION.tgz" -curl -O "https://packages.clickhouse.com/tgz/stable/clickhouse-server-$LATEST_VERSION.tgz" -curl -O "https://packages.clickhouse.com/tgz/stable/clickhouse-client-$LATEST_VERSION.tgz" - -tar -xzvf "clickhouse-common-static-$LATEST_VERSION.tgz" -sudo "clickhouse-common-static-$LATEST_VERSION/install/doinst.sh" - -tar -xzvf "clickhouse-common-static-dbg-$LATEST_VERSION.tgz" -sudo "clickhouse-common-static-dbg-$LATEST_VERSION/install/doinst.sh" - -tar -xzvf "clickhouse-server-$LATEST_VERSION.tgz" -sudo "clickhouse-server-$LATEST_VERSION/install/doinst.sh" -sudo /etc/init.d/clickhouse-server start - -tar -xzvf "clickhouse-client-$LATEST_VERSION.tgz" -sudo "clickhouse-client-$LATEST_VERSION/install/doinst.sh" +{% include 'install/tgz.sh' %} ```
Deprecated Method for installing tgz archives - ``` bash -export LATEST_VERSION=$(curl -s https://repo.clickhouse.com/tgz/stable/ | \ - grep -Eo '[0-9]+\.[0-9]+\.[0-9]+\.[0-9]+' | sort -V -r | head -n 1) -curl -O https://repo.clickhouse.com/tgz/stable/clickhouse-common-static-$LATEST_VERSION.tgz -curl -O https://repo.clickhouse.com/tgz/stable/clickhouse-common-static-dbg-$LATEST_VERSION.tgz -curl -O https://repo.clickhouse.com/tgz/stable/clickhouse-server-$LATEST_VERSION.tgz -curl -O https://repo.clickhouse.com/tgz/stable/clickhouse-client-$LATEST_VERSION.tgz - -tar -xzvf clickhouse-common-static-$LATEST_VERSION.tgz -sudo clickhouse-common-static-$LATEST_VERSION/install/doinst.sh - -tar -xzvf clickhouse-common-static-dbg-$LATEST_VERSION.tgz -sudo clickhouse-common-static-dbg-$LATEST_VERSION/install/doinst.sh - -tar -xzvf clickhouse-server-$LATEST_VERSION.tgz -sudo clickhouse-server-$LATEST_VERSION/install/doinst.sh -sudo /etc/init.d/clickhouse-server start - -tar -xzvf clickhouse-client-$LATEST_VERSION.tgz -sudo clickhouse-client-$LATEST_VERSION/install/doinst.sh +{% include 'install/tgz_repo.sh' %} ```
@@ -190,33 +114,22 @@ sudo ./clickhouse install ### From Precompiled Binaries for Non-Standard Environments {#from-binaries-non-linux} -For non-Linux operating systems and for AArch64 CPU arhitecture, ClickHouse builds are provided as a cross-compiled binary from the latest commit of the `master` branch (with a few hours delay). +For non-Linux operating systems and for AArch64 CPU arhitecture, ClickHouse builds are provided as a cross-compiled binary from the latest commit of the `master` branch (with a few hours delay). +- [MacOS x86_64](https://builds.clickhouse.com/master/macos/clickhouse) — `curl -O 'https://builds.clickhouse.com/master/macos/clickhouse' && chmod a+x ./clickhouse` +- [MacOS Aarch64 (Apple Silicon)](https://builds.clickhouse.com/master/macos-aarch64/clickhouse) — `curl -O 'https://builds.clickhouse.com/master/macos-aarch64/clickhouse' && chmod a+x ./clickhouse` +- [FreeBSD x86_64](https://builds.clickhouse.com/master/freebsd/clickhouse) — `curl -O 'https://builds.clickhouse.com/master/freebsd/clickhouse' && chmod a+x ./clickhouse` +- [Linux AArch64](https://builds.clickhouse.com/master/aarch64/clickhouse) — `curl -O 'https://builds.clickhouse.com/master/aarch64/clickhouse' && chmod a+x ./clickhouse` -- [MacOS x86_64](https://builds.clickhouse.com/master/macos/clickhouse) - ```bash - curl -O 'https://builds.clickhouse.com/master/macos/clickhouse' && chmod a+x ./clickhouse - ``` -- [MacOS Aarch64 (Apple Silicon)](https://builds.clickhouse.com/master/macos-aarch64/clickhouse) - ```bash - curl -O 'https://builds.clickhouse.com/master/macos-aarch64/clickhouse' && chmod a+x ./clickhouse - ``` -- [FreeBSD x86_64](https://builds.clickhouse.com/master/freebsd/clickhouse) - ```bash - curl -O 'https://builds.clickhouse.com/master/freebsd/clickhouse' && chmod a+x ./clickhouse - ``` -- [Linux AArch64](https://builds.clickhouse.com/master/aarch64/clickhouse) - ```bash - curl -O 'https://builds.clickhouse.com/master/aarch64/clickhouse' && chmod a+x ./clickhouse - ``` +After downloading, you can use the `clickhouse client` to connect to the server, or `clickhouse local` to process local data. -Run `sudo ./clickhouse install` to install ClickHouse system-wide (also with needed configuration files, configuring users etc.). Then run `clickhouse start` commands to start the clickhouse-server and `clickhouse-client` to connect to it. +Run `sudo ./clickhouse install` if you want to install clickhouse system-wide (also with needed configuration files, configuring users etc.). After that run `clickhouse start` commands to start the clickhouse-server and `clickhouse-client` to connect to it. -Use the `clickhouse client` to connect to the server, or `clickhouse local` to process local data. +These builds are not recommended for use in production environments because they are less thoroughly tested, but you can do so on your own risk. They also have only a subset of ClickHouse features available. ### From Sources {#from-sources} -To manually compile ClickHouse, follow the instructions for [Linux](./development/build.md) or [Mac OS X](./development/build-osx.md). +To manually compile ClickHouse, follow the instructions for [Linux](../development/build.md) or [Mac OS X](../development/build-osx.md). You can compile packages and install them or use programs without installing packages. Also by building manually you can disable SSE 4.2 requirement or build for AArch64 CPUs. 
@@ -271,7 +184,7 @@ If the configuration file is in the current directory, you do not need to specif ClickHouse supports access restriction settings. They are located in the `users.xml` file (next to `config.xml`). By default, access is allowed from anywhere for the `default` user, without a password. See `user/default/networks`. -For more information, see the section [“Configuration Files”](./operations/configuration-files.md). +For more information, see the section [“Configuration Files”](../operations/configuration-files.md). After launching server, you can use the command-line client to connect to it: @@ -282,7 +195,7 @@ $ clickhouse-client By default, it connects to `localhost:9000` on behalf of the user `default` without a password. It can also be used to connect to a remote server using `--host` argument. The terminal must use UTF-8 encoding. -For more information, see the section [“Command-line client”](./interfaces/cli.md). +For more information, see the section [“Command-line client”](../interfaces/cli.md). Example: diff --git a/docs/en/playground.md b/docs/en/getting-started/playground.md similarity index 69% rename from docs/en/playground.md rename to docs/en/getting-started/playground.md index ea7b2ccf2c5..01d7dd5b69f 100644 --- a/docs/en/playground.md +++ b/docs/en/getting-started/playground.md @@ -1,9 +1,6 @@ --- -sidebar_label: Playground -sidebar_position: 2 -keywords: [clickhouse, playground, getting, started, docs] -description: The ClickHouse Playground allows people to experiment with ClickHouse by running queries instantly, without setting up their server or cluster. -slug: /en/getting-started/playground +toc_priority: 14 +toc_title: Playground --- # ClickHouse Playground {#clickhouse-playground} @@ -11,7 +8,7 @@ slug: /en/getting-started/playground [ClickHouse Playground](https://play.clickhouse.com/play?user=play) allows people to experiment with ClickHouse by running queries instantly, without setting up their server or cluster. Several example datasets are available in Playground. -You can make queries to Playground using any HTTP client, for example [curl](https://curl.haxx.se) or [wget](https://www.gnu.org/software/wget/), or set up a connection using [JDBC](./interfaces/jdbc.md) or [ODBC](./interfaces/odbc.md) drivers. More information about software products that support ClickHouse is available [here](./interfaces/index.md). +You can make queries to Playground using any HTTP client, for example [curl](https://curl.haxx.se) or [wget](https://www.gnu.org/software/wget/), or set up a connection using [JDBC](../interfaces/jdbc.md) or [ODBC](../interfaces/odbc.md) drivers. More information about software products that support ClickHouse is available [here](../interfaces/index.md). ## Credentials {#credentials} @@ -39,7 +36,7 @@ HTTPS endpoint example with `curl`: curl "https://play.clickhouse.com/?user=explorer" --data-binary "SELECT 'Play ClickHouse'" ``` -TCP endpoint example with [CLI](./interfaces/cli.md): +TCP endpoint example with [CLI](../interfaces/cli.md): ``` bash clickhouse client --secure --host play.clickhouse.com --user explorer diff --git a/docs/en/getting-started/tutorial.md b/docs/en/getting-started/tutorial.md new file mode 100644 index 00000000000..9f43cc8769d --- /dev/null +++ b/docs/en/getting-started/tutorial.md @@ -0,0 +1,662 @@ +--- +toc_priority: 12 +toc_title: Tutorial +--- + +# ClickHouse Tutorial {#clickhouse-tutorial} + +## What to Expect from This Tutorial? 
{#what-to-expect-from-this-tutorial} + +By going through this tutorial, you’ll learn how to set up a simple ClickHouse cluster. It’ll be small, but fault-tolerant and scalable. Then we will use one of the example datasets to fill it with data and execute some demo queries. + +## Single Node Setup {#single-node-setup} + +To postpone the complexities of a distributed environment, we’ll start with deploying ClickHouse on a single server or virtual machine. ClickHouse is usually installed from [deb](../getting-started/install.md#install-from-deb-packages) or [rpm](../getting-started/install.md#from-rpm-packages) packages, but there are [alternatives](../getting-started/install.md#from-docker-image) for the operating systems that do not support them. + +For example, you have chosen `deb` packages and executed: + +``` bash +{% include 'install/deb.sh' %} +``` + +What do we have in the packages that got installed: + +- `clickhouse-client` package contains [clickhouse-client](../interfaces/cli.md) application, interactive ClickHouse console client. +- `clickhouse-common` package contains a ClickHouse executable file. +- `clickhouse-server` package contains configuration files to run ClickHouse as a server. + +Server config files are located in `/etc/clickhouse-server/`. Before going further, please notice the `` element in `config.xml`. Path determines the location for data storage, so it should be located on volume with large disk capacity; the default value is `/var/lib/clickhouse/`. If you want to adjust the configuration, it’s not handy to directly edit `config.xml` file, considering it might get rewritten on future package updates. The recommended way to override the config elements is to create [files in config.d directory](../operations/configuration-files.md) which serve as “patches” to config.xml. + +As you might have noticed, `clickhouse-server` is not launched automatically after package installation. It won’t be automatically restarted after updates, either. The way you start the server depends on your init system, usually, it is: + +``` bash +sudo service clickhouse-server start +``` + +or + +``` bash +sudo /etc/init.d/clickhouse-server start +``` + +The default location for server logs is `/var/log/clickhouse-server/`. The server is ready to handle client connections once it logs the `Ready for connections` message. + +Once the `clickhouse-server` is up and running, we can use `clickhouse-client` to connect to the server and run some test queries like `SELECT "Hello, world!";`. + +
+ +Quick tips for clickhouse-client + +Interactive mode: + +``` bash +clickhouse-client +clickhouse-client --host=... --port=... --user=... --password=... +``` + +Enable multiline queries: + +``` bash +clickhouse-client -m +clickhouse-client --multiline +``` + +Run queries in batch-mode: + +``` bash +clickhouse-client --query='SELECT 1' +echo 'SELECT 1' | clickhouse-client +clickhouse-client <<< 'SELECT 1' +``` + +Insert data from a file in specified format: + +``` bash +clickhouse-client --query='INSERT INTO table VALUES' < data.txt +clickhouse-client --query='INSERT INTO table FORMAT TabSeparated' < data.tsv +``` + +
+ +## Import Sample Dataset {#import-sample-dataset} + +Now it’s time to fill our ClickHouse server with some sample data. In this tutorial, we’ll use some anonymized web analytics data. There are [multiple ways to import the dataset](../getting-started/example-datasets/metrica.md), and for the sake of the tutorial, we’ll go with the most realistic one. + +### Download and Extract Table Data {#download-and-extract-table-data} + +``` bash +curl https://datasets.clickhouse.com/hits/tsv/hits_v1.tsv.xz | unxz --threads=`nproc` > hits_v1.tsv +curl https://datasets.clickhouse.com/visits/tsv/visits_v1.tsv.xz | unxz --threads=`nproc` > visits_v1.tsv +``` + +The extracted files are about 10GB in size. + +### Create Tables {#create-tables} + +As in most databases management systems, ClickHouse logically groups tables into “databases”. There’s a `default` database, but we’ll create a new one named `tutorial`: + +``` bash +clickhouse-client --query "CREATE DATABASE IF NOT EXISTS tutorial" +``` + +Syntax for creating tables is way more complicated compared to databases (see [reference](../sql-reference/statements/create/table.md). In general `CREATE TABLE` statement has to specify three key things: + +1. Name of table to create. +2. Table schema, i.e. list of columns and their [data types](../sql-reference/data-types/index.md). +3. [Table engine](../engines/table-engines/index.md) and its settings, which determines all the details on how queries to this table will be physically executed. + +There are two tables to create: + +- `hits` is a table with each action done by all users on all websites covered by the service. +- `visits` is a table that contains pre-built sessions instead of individual actions. + +Let’s see and execute the real create table queries for these tables: + +``` sql +CREATE TABLE tutorial.hits_v1 +( + `WatchID` UInt64, + `JavaEnable` UInt8, + `Title` String, + `GoodEvent` Int16, + `EventTime` DateTime, + `EventDate` Date, + `CounterID` UInt32, + `ClientIP` UInt32, + `ClientIP6` FixedString(16), + `RegionID` UInt32, + `UserID` UInt64, + `CounterClass` Int8, + `OS` UInt8, + `UserAgent` UInt8, + `URL` String, + `Referer` String, + `URLDomain` String, + `RefererDomain` String, + `Refresh` UInt8, + `IsRobot` UInt8, + `RefererCategories` Array(UInt16), + `URLCategories` Array(UInt16), + `URLRegions` Array(UInt32), + `RefererRegions` Array(UInt32), + `ResolutionWidth` UInt16, + `ResolutionHeight` UInt16, + `ResolutionDepth` UInt8, + `FlashMajor` UInt8, + `FlashMinor` UInt8, + `FlashMinor2` String, + `NetMajor` UInt8, + `NetMinor` UInt8, + `UserAgentMajor` UInt16, + `UserAgentMinor` FixedString(2), + `CookieEnable` UInt8, + `JavascriptEnable` UInt8, + `IsMobile` UInt8, + `MobilePhone` UInt8, + `MobilePhoneModel` String, + `Params` String, + `IPNetworkID` UInt32, + `TraficSourceID` Int8, + `SearchEngineID` UInt16, + `SearchPhrase` String, + `AdvEngineID` UInt8, + `IsArtifical` UInt8, + `WindowClientWidth` UInt16, + `WindowClientHeight` UInt16, + `ClientTimeZone` Int16, + `ClientEventTime` DateTime, + `SilverlightVersion1` UInt8, + `SilverlightVersion2` UInt8, + `SilverlightVersion3` UInt32, + `SilverlightVersion4` UInt16, + `PageCharset` String, + `CodeVersion` UInt32, + `IsLink` UInt8, + `IsDownload` UInt8, + `IsNotBounce` UInt8, + `FUniqID` UInt64, + `HID` UInt32, + `IsOldCounter` UInt8, + `IsEvent` UInt8, + `IsParameter` UInt8, + `DontCountHits` UInt8, + `WithHash` UInt8, + `HitColor` FixedString(1), + `UTCEventTime` DateTime, + `Age` UInt8, + `Sex` UInt8, + `Income` UInt8, + `Interests` 
UInt16, + `Robotness` UInt8, + `GeneralInterests` Array(UInt16), + `RemoteIP` UInt32, + `RemoteIP6` FixedString(16), + `WindowName` Int32, + `OpenerName` Int32, + `HistoryLength` Int16, + `BrowserLanguage` FixedString(2), + `BrowserCountry` FixedString(2), + `SocialNetwork` String, + `SocialAction` String, + `HTTPError` UInt16, + `SendTiming` Int32, + `DNSTiming` Int32, + `ConnectTiming` Int32, + `ResponseStartTiming` Int32, + `ResponseEndTiming` Int32, + `FetchTiming` Int32, + `RedirectTiming` Int32, + `DOMInteractiveTiming` Int32, + `DOMContentLoadedTiming` Int32, + `DOMCompleteTiming` Int32, + `LoadEventStartTiming` Int32, + `LoadEventEndTiming` Int32, + `NSToDOMContentLoadedTiming` Int32, + `FirstPaintTiming` Int32, + `RedirectCount` Int8, + `SocialSourceNetworkID` UInt8, + `SocialSourcePage` String, + `ParamPrice` Int64, + `ParamOrderID` String, + `ParamCurrency` FixedString(3), + `ParamCurrencyID` UInt16, + `GoalsReached` Array(UInt32), + `OpenstatServiceName` String, + `OpenstatCampaignID` String, + `OpenstatAdID` String, + `OpenstatSourceID` String, + `UTMSource` String, + `UTMMedium` String, + `UTMCampaign` String, + `UTMContent` String, + `UTMTerm` String, + `FromTag` String, + `HasGCLID` UInt8, + `RefererHash` UInt64, + `URLHash` UInt64, + `CLID` UInt32, + `YCLID` UInt64, + `ShareService` String, + `ShareURL` String, + `ShareTitle` String, + `ParsedParams` Nested( + Key1 String, + Key2 String, + Key3 String, + Key4 String, + Key5 String, + ValueDouble Float64), + `IslandID` FixedString(16), + `RequestNum` UInt32, + `RequestTry` UInt8 +) +ENGINE = MergeTree() +PARTITION BY toYYYYMM(EventDate) +ORDER BY (CounterID, EventDate, intHash32(UserID)) +SAMPLE BY intHash32(UserID) +``` + +``` sql +CREATE TABLE tutorial.visits_v1 +( + `CounterID` UInt32, + `StartDate` Date, + `Sign` Int8, + `IsNew` UInt8, + `VisitID` UInt64, + `UserID` UInt64, + `StartTime` DateTime, + `Duration` UInt32, + `UTCStartTime` DateTime, + `PageViews` Int32, + `Hits` Int32, + `IsBounce` UInt8, + `Referer` String, + `StartURL` String, + `RefererDomain` String, + `StartURLDomain` String, + `EndURL` String, + `LinkURL` String, + `IsDownload` UInt8, + `TraficSourceID` Int8, + `SearchEngineID` UInt16, + `SearchPhrase` String, + `AdvEngineID` UInt8, + `PlaceID` Int32, + `RefererCategories` Array(UInt16), + `URLCategories` Array(UInt16), + `URLRegions` Array(UInt32), + `RefererRegions` Array(UInt32), + `IsYandex` UInt8, + `GoalReachesDepth` Int32, + `GoalReachesURL` Int32, + `GoalReachesAny` Int32, + `SocialSourceNetworkID` UInt8, + `SocialSourcePage` String, + `MobilePhoneModel` String, + `ClientEventTime` DateTime, + `RegionID` UInt32, + `ClientIP` UInt32, + `ClientIP6` FixedString(16), + `RemoteIP` UInt32, + `RemoteIP6` FixedString(16), + `IPNetworkID` UInt32, + `SilverlightVersion3` UInt32, + `CodeVersion` UInt32, + `ResolutionWidth` UInt16, + `ResolutionHeight` UInt16, + `UserAgentMajor` UInt16, + `UserAgentMinor` UInt16, + `WindowClientWidth` UInt16, + `WindowClientHeight` UInt16, + `SilverlightVersion2` UInt8, + `SilverlightVersion4` UInt16, + `FlashVersion3` UInt16, + `FlashVersion4` UInt16, + `ClientTimeZone` Int16, + `OS` UInt8, + `UserAgent` UInt8, + `ResolutionDepth` UInt8, + `FlashMajor` UInt8, + `FlashMinor` UInt8, + `NetMajor` UInt8, + `NetMinor` UInt8, + `MobilePhone` UInt8, + `SilverlightVersion1` UInt8, + `Age` UInt8, + `Sex` UInt8, + `Income` UInt8, + `JavaEnable` UInt8, + `CookieEnable` UInt8, + `JavascriptEnable` UInt8, + `IsMobile` UInt8, + `BrowserLanguage` UInt16, + `BrowserCountry` UInt16, + 
`Interests` UInt16, + `Robotness` UInt8, + `GeneralInterests` Array(UInt16), + `Params` Array(String), + `Goals` Nested( + ID UInt32, + Serial UInt32, + EventTime DateTime, + Price Int64, + OrderID String, + CurrencyID UInt32), + `WatchIDs` Array(UInt64), + `ParamSumPrice` Int64, + `ParamCurrency` FixedString(3), + `ParamCurrencyID` UInt16, + `ClickLogID` UInt64, + `ClickEventID` Int32, + `ClickGoodEvent` Int32, + `ClickEventTime` DateTime, + `ClickPriorityID` Int32, + `ClickPhraseID` Int32, + `ClickPageID` Int32, + `ClickPlaceID` Int32, + `ClickTypeID` Int32, + `ClickResourceID` Int32, + `ClickCost` UInt32, + `ClickClientIP` UInt32, + `ClickDomainID` UInt32, + `ClickURL` String, + `ClickAttempt` UInt8, + `ClickOrderID` UInt32, + `ClickBannerID` UInt32, + `ClickMarketCategoryID` UInt32, + `ClickMarketPP` UInt32, + `ClickMarketCategoryName` String, + `ClickMarketPPName` String, + `ClickAWAPSCampaignName` String, + `ClickPageName` String, + `ClickTargetType` UInt16, + `ClickTargetPhraseID` UInt64, + `ClickContextType` UInt8, + `ClickSelectType` Int8, + `ClickOptions` String, + `ClickGroupBannerID` Int32, + `OpenstatServiceName` String, + `OpenstatCampaignID` String, + `OpenstatAdID` String, + `OpenstatSourceID` String, + `UTMSource` String, + `UTMMedium` String, + `UTMCampaign` String, + `UTMContent` String, + `UTMTerm` String, + `FromTag` String, + `HasGCLID` UInt8, + `FirstVisit` DateTime, + `PredLastVisit` Date, + `LastVisit` Date, + `TotalVisits` UInt32, + `TraficSource` Nested( + ID Int8, + SearchEngineID UInt16, + AdvEngineID UInt8, + PlaceID UInt16, + SocialSourceNetworkID UInt8, + Domain String, + SearchPhrase String, + SocialSourcePage String), + `Attendance` FixedString(16), + `CLID` UInt32, + `YCLID` UInt64, + `NormalizedRefererHash` UInt64, + `SearchPhraseHash` UInt64, + `RefererDomainHash` UInt64, + `NormalizedStartURLHash` UInt64, + `StartURLDomainHash` UInt64, + `NormalizedEndURLHash` UInt64, + `TopLevelDomain` UInt64, + `URLScheme` UInt64, + `OpenstatServiceNameHash` UInt64, + `OpenstatCampaignIDHash` UInt64, + `OpenstatAdIDHash` UInt64, + `OpenstatSourceIDHash` UInt64, + `UTMSourceHash` UInt64, + `UTMMediumHash` UInt64, + `UTMCampaignHash` UInt64, + `UTMContentHash` UInt64, + `UTMTermHash` UInt64, + `FromHash` UInt64, + `WebVisorEnabled` UInt8, + `WebVisorActivity` UInt32, + `ParsedParams` Nested( + Key1 String, + Key2 String, + Key3 String, + Key4 String, + Key5 String, + ValueDouble Float64), + `Market` Nested( + Type UInt8, + GoalID UInt32, + OrderID String, + OrderPrice Int64, + PP UInt32, + DirectPlaceID UInt32, + DirectOrderID UInt32, + DirectBannerID UInt32, + GoodID String, + GoodName String, + GoodQuantity Int32, + GoodPrice Int64), + `IslandID` FixedString(16) +) +ENGINE = CollapsingMergeTree(Sign) +PARTITION BY toYYYYMM(StartDate) +ORDER BY (CounterID, StartDate, intHash32(UserID), VisitID) +SAMPLE BY intHash32(UserID) +``` + +You can execute those queries using the interactive mode of `clickhouse-client` (just launch it in a terminal without specifying a query in advance) or try some [alternative interface](../interfaces/index.md) if you want. + +As we can see, `hits_v1` uses the [basic MergeTree engine](../engines/table-engines/mergetree-family/mergetree.md), while the `visits_v1` uses the [Collapsing](../engines/table-engines/mergetree-family/collapsingmergetree.md) variant. + +### Import Data {#import-data} + +Data import to ClickHouse is done via [INSERT INTO](../sql-reference/statements/insert-into.md) query like in many other SQL databases. 
However, data is usually provided in one of the [supported serialization formats](../interfaces/formats.md) instead of `VALUES` clause (which is also supported). + +The files we downloaded earlier are in tab-separated format, so here’s how to import them via console client: + +``` bash +clickhouse-client --query "INSERT INTO tutorial.hits_v1 FORMAT TSV" --max_insert_block_size=100000 < hits_v1.tsv +clickhouse-client --query "INSERT INTO tutorial.visits_v1 FORMAT TSV" --max_insert_block_size=100000 < visits_v1.tsv +``` + +ClickHouse has a lot of [settings to tune](../operations/settings/index.md) and one way to specify them in console client is via arguments, as we can see with `--max_insert_block_size`. The easiest way to figure out what settings are available, what do they mean and what the defaults are is to query the `system.settings` table: + +``` sql +SELECT name, value, changed, description +FROM system.settings +WHERE name LIKE '%max_insert_b%' +FORMAT TSV + +max_insert_block_size 1048576 0 "The maximum block size for insertion, if we control the creation of blocks for insertion." +``` + +Optionally you can [OPTIMIZE](../sql-reference/statements/optimize.md) the tables after import. Tables that are configured with an engine from MergeTree-family always do merges of data parts in the background to optimize data storage (or at least check if it makes sense). These queries force the table engine to do storage optimization right now instead of some time later: + +``` bash +clickhouse-client --query "OPTIMIZE TABLE tutorial.hits_v1 FINAL" +clickhouse-client --query "OPTIMIZE TABLE tutorial.visits_v1 FINAL" +``` + +These queries start an I/O and CPU intensive operation, so if the table consistently receives new data, it’s better to leave it alone and let merges run in the background. + +Now we can check if the table import was successful: + +``` bash +clickhouse-client --query "SELECT COUNT(*) FROM tutorial.hits_v1" +clickhouse-client --query "SELECT COUNT(*) FROM tutorial.visits_v1" +``` + +## Example Queries {#example-queries} + +``` sql +SELECT + StartURL AS URL, + AVG(Duration) AS AvgDuration +FROM tutorial.visits_v1 +WHERE StartDate BETWEEN '2014-03-23' AND '2014-03-30' +GROUP BY URL +ORDER BY AvgDuration DESC +LIMIT 10 +``` + +``` sql +SELECT + sum(Sign) AS visits, + sumIf(Sign, has(Goals.ID, 1105530)) AS goal_visits, + (100. * goal_visits) / visits AS goal_percent +FROM tutorial.visits_v1 +WHERE (CounterID = 912887) AND (toYYYYMM(StartDate) = 201403) AND (domain(StartURL) = 'yandex.ru') +``` + +## Cluster Deployment {#cluster-deployment} + +ClickHouse cluster is a homogenous cluster. Steps to set up: + +1. Install ClickHouse server on all machines of the cluster +2. Set up cluster configs in configuration files +3. Create local tables on each instance +4. Create a [Distributed table](../engines/table-engines/special/distributed.md) + +[Distributed table](../engines/table-engines/special/distributed.md) is actually a kind of “view” to local tables of ClickHouse cluster. SELECT query from a distributed table executes using resources of all cluster’s shards. You may specify configs for multiple clusters and create multiple distributed tables providing views to different clusters. 
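For example, once such a table exists (one named `tutorial.hits_all` is created further down), it is queried exactly like a local table, and ClickHouse fans the work out to every shard; a sketch:

``` sql
-- Runs on the server you are connected to, but the aggregation is
-- executed on all shards of the cluster and merged at the end.
SELECT
    CounterID,
    count() AS hits
FROM tutorial.hits_all
GROUP BY CounterID
ORDER BY hits DESC
LIMIT 10;
```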
+ +Example config for a cluster with three shards, one replica each: + +``` xml + + + + + example-perftest01j + 9000 + + + + + example-perftest02j + 9000 + + + + + example-perftest03j + 9000 + + + + +``` + +For further demonstration, let’s create a new local table with the same `CREATE TABLE` query that we used for `hits_v1`, but different table name: + +``` sql +CREATE TABLE tutorial.hits_local (...) ENGINE = MergeTree() ... +``` + +Creating a distributed table providing a view into local tables of the cluster: + +``` sql +CREATE TABLE tutorial.hits_all AS tutorial.hits_local +ENGINE = Distributed(perftest_3shards_1replicas, tutorial, hits_local, rand()); +``` + +A common practice is to create similar Distributed tables on all machines of the cluster. It allows running distributed queries on any machine of the cluster. Also there’s an alternative option to create temporary distributed table for a given SELECT query using [remote](../sql-reference/table-functions/remote.md) table function. + +Let’s run [INSERT SELECT](../sql-reference/statements/insert-into.md) into the Distributed table to spread the table to multiple servers. + +``` sql +INSERT INTO tutorial.hits_all SELECT * FROM tutorial.hits_v1; +``` + +!!! warning "Notice" + This approach is not suitable for the sharding of large tables. There’s a separate tool [clickhouse-copier](../operations/utilities/clickhouse-copier.md) that can re-shard arbitrary large tables. + +As you could expect, computationally heavy queries run N times faster if they utilize 3 servers instead of one. + +In this case, we have used a cluster with 3 shards, and each contains a single replica. + +To provide resilience in a production environment, we recommend that each shard should contain 2-3 replicas spread between multiple availability zones or datacenters (or at least racks). Note that ClickHouse supports an unlimited number of replicas. + +Example config for a cluster of one shard containing three replicas: + +``` xml + + ... + + + + example-perftest01j + 9000 + + + example-perftest02j + 9000 + + + example-perftest03j + 9000 + + + + +``` + +To enable native replication [ZooKeeper](http://zookeeper.apache.org/) is required. ClickHouse takes care of data consistency on all replicas and runs restore procedure after failure automatically. It’s recommended to deploy the ZooKeeper cluster on separate servers (where no other processes including ClickHouse are running). + +!!! note "Note" + ZooKeeper is not a strict requirement: in some simple cases, you can duplicate the data by writing it into all the replicas from your application code. This approach is **not** recommended, in this case, ClickHouse won’t be able to guarantee data consistency on all replicas. Thus it becomes the responsibility of your application. + +ZooKeeper locations are specified in the configuration file: + +``` xml + + + zoo01 + 2181 + + + zoo02 + 2181 + + + zoo03 + 2181 + + +``` + +Also, we need to set macros for identifying each shard and replica which are used on table creation: + +``` xml + + 01 + 01 + +``` + +If there are no replicas at the moment on replicated table creation, a new first replica is instantiated. If there are already live replicas, the new replica clones data from existing ones. You have an option to create all replicated tables first, and then insert data to it. Another option is to create some replicas and add the others after or during data insertion. + +``` sql +CREATE TABLE tutorial.hits_replica (...) 
+ENGINE = ReplicatedMergeTree( + '/clickhouse_perftest/tables/{shard}/hits', + '{replica}' +) +... +``` + +Here we use [ReplicatedMergeTree](../engines/table-engines/mergetree-family/replication.md) table engine. In parameters we specify ZooKeeper path containing shard and replica identifiers. + +``` sql +INSERT INTO tutorial.hits_replica SELECT * FROM tutorial.hits_local; +``` + +Replication operates in multi-master mode. Data can be loaded into any replica, and the system then syncs it with other instances automatically. Replication is asynchronous so at a given moment, not all replicas may contain recently inserted data. At least one replica should be up to allow data ingestion. Others will sync up data and repair consistency once they will become active again. Note that this approach allows for the low possibility of a loss of recently inserted data. + +[Original article](https://clickhouse.com/docs/en/getting_started/tutorial/) diff --git a/docs/en/guides/apply-catboost-model.md b/docs/en/guides/apply-catboost-model.md new file mode 100644 index 00000000000..859703a31df --- /dev/null +++ b/docs/en/guides/apply-catboost-model.md @@ -0,0 +1,242 @@ +--- +toc_priority: 41 +toc_title: Applying CatBoost Models +--- + +# Applying a Catboost Model in ClickHouse {#applying-catboost-model-in-clickhouse} + +[CatBoost](https://catboost.ai) is a free and open-source gradient boosting library developed at Yandex for machine learning. + +With this instruction, you will learn to apply pre-trained models in ClickHouse by running model inference from SQL. + +To apply a CatBoost model in ClickHouse: + +1. [Create a Table](#create-table). +2. [Insert the Data to the Table](#insert-data-to-table). +3. [Integrate CatBoost into ClickHouse](#integrate-catboost-into-clickhouse) (Optional step). +4. [Run the Model Inference from SQL](#run-model-inference). + +For more information about training CatBoost models, see [Training and applying models](https://catboost.ai/docs/features/training.html#training). + +You can reload CatBoost models if the configuration was updated without restarting the server using [RELOAD MODEL](../sql-reference/statements/system.md#query_language-system-reload-model) and [RELOAD MODELS](../sql-reference/statements/system.md#query_language-system-reload-models) system queries. + +## Prerequisites {#prerequisites} + +If you do not have the [Docker](https://docs.docker.com/install/) yet, install it. + +!!! note "Note" + [Docker](https://www.docker.com) is a software platform that allows you to create containers that isolate a CatBoost and ClickHouse installation from the rest of the system. + +Before applying a CatBoost model: + +**1.** Pull the [Docker image](https://hub.docker.com/r/yandex/tutorial-catboost-clickhouse) from the registry: + +``` bash +$ docker pull yandex/tutorial-catboost-clickhouse +``` + +This Docker image contains everything you need to run CatBoost and ClickHouse: code, runtime, libraries, environment variables, and configuration files. + +**2.** Make sure the Docker image has been successfully pulled: + +``` bash +$ docker image ls +REPOSITORY TAG IMAGE ID CREATED SIZE +yandex/tutorial-catboost-clickhouse latest 622e4d17945b 22 hours ago 1.37GB +``` + +**3.** Start a Docker container based on this image: + +``` bash +$ docker run -it -p 8888:8888 yandex/tutorial-catboost-clickhouse +``` + +## 1. 
Create a Table {#create-table} + +To create a ClickHouse table for the training sample: + +**1.** Start ClickHouse console client in the interactive mode: + +``` bash +$ clickhouse client +``` + +!!! note "Note" + The ClickHouse server is already running inside the Docker container. + +**2.** Create the table using the command: + +``` sql +:) CREATE TABLE amazon_train +( + date Date MATERIALIZED today(), + ACTION UInt8, + RESOURCE UInt32, + MGR_ID UInt32, + ROLE_ROLLUP_1 UInt32, + ROLE_ROLLUP_2 UInt32, + ROLE_DEPTNAME UInt32, + ROLE_TITLE UInt32, + ROLE_FAMILY_DESC UInt32, + ROLE_FAMILY UInt32, + ROLE_CODE UInt32 +) +ENGINE = MergeTree ORDER BY date +``` + +**3.** Exit from ClickHouse console client: + +``` sql +:) exit +``` + +## 2. Insert the Data to the Table {#insert-data-to-table} + +To insert the data: + +**1.** Run the following command: + +``` bash +$ clickhouse client --host 127.0.0.1 --query 'INSERT INTO amazon_train FORMAT CSVWithNames' < ~/amazon/train.csv +``` + +**2.** Start ClickHouse console client in the interactive mode: + +``` bash +$ clickhouse client +``` + +**3.** Make sure the data has been uploaded: + +``` sql +:) SELECT count() FROM amazon_train + +SELECT count() +FROM amazon_train + ++-count()-+ +| 65538 | ++-------+ +``` + +## 3. Integrate CatBoost into ClickHouse {#integrate-catboost-into-clickhouse} + +!!! note "Note" + **Optional step.** The Docker image contains everything you need to run CatBoost and ClickHouse. + +To integrate CatBoost into ClickHouse: + +**1.** Build the evaluation library. + +The fastest way to evaluate a CatBoost model is compile `libcatboostmodel.` library. For more information about how to build the library, see [CatBoost documentation](https://catboost.ai/docs/concepts/c-plus-plus-api_dynamic-c-pluplus-wrapper.html). + +**2.** Create a new directory anywhere and with any name, for example, `data` and put the created library in it. The Docker image already contains the library `data/libcatboostmodel.so`. + +**3.** Create a new directory for config model anywhere and with any name, for example, `models`. + +**4.** Create a model configuration file with any name, for example, `models/amazon_model.xml`. + +**5.** Describe the model configuration: + +``` xml + + + + catboost + + amazon + + /home/catboost/tutorial/catboost_model.bin + + 0 + + +``` + +**6.** Add the path to CatBoost and the model configuration to the ClickHouse configuration: + +``` xml + +/home/catboost/data/libcatboostmodel.so +/home/catboost/models/*_model.xml +``` + +!!! note "Note" + You can change path to the CatBoost model configuration later without restarting server. + +## 4. Run the Model Inference from SQL {#run-model-inference} + +For test model run the ClickHouse client `$ clickhouse client`. + +Let’s make sure that the model is working: + +``` sql +:) SELECT + modelEvaluate('amazon', + RESOURCE, + MGR_ID, + ROLE_ROLLUP_1, + ROLE_ROLLUP_2, + ROLE_DEPTNAME, + ROLE_TITLE, + ROLE_FAMILY_DESC, + ROLE_FAMILY, + ROLE_CODE) > 0 AS prediction, + ACTION AS target +FROM amazon_train +LIMIT 10 +``` + +!!! note "Note" + Function [modelEvaluate](../sql-reference/functions/other-functions.md#function-modelevaluate) returns tuple with per-class raw predictions for multiclass models. + +Let’s predict the probability: + +``` sql +:) SELECT + modelEvaluate('amazon', + RESOURCE, + MGR_ID, + ROLE_ROLLUP_1, + ROLE_ROLLUP_2, + ROLE_DEPTNAME, + ROLE_TITLE, + ROLE_FAMILY_DESC, + ROLE_FAMILY, + ROLE_CODE) AS prediction, + 1. 
/ (1 + exp(-prediction)) AS probability, + ACTION AS target +FROM amazon_train +LIMIT 10 +``` + +!!! note "Note" + More info about [exp()](../sql-reference/functions/math-functions.md) function. + +Let’s calculate LogLoss on the sample: + +``` sql +:) SELECT -avg(tg * log(prob) + (1 - tg) * log(1 - prob)) AS logloss +FROM +( + SELECT + modelEvaluate('amazon', + RESOURCE, + MGR_ID, + ROLE_ROLLUP_1, + ROLE_ROLLUP_2, + ROLE_DEPTNAME, + ROLE_TITLE, + ROLE_FAMILY_DESC, + ROLE_FAMILY, + ROLE_CODE) AS prediction, + 1. / (1. + exp(-prediction)) AS prob, + ACTION AS tg + FROM amazon_train +) +``` + +!!! note "Note" + More info about [avg()](../sql-reference/aggregate-functions/reference/avg.md#agg_function-avg) and [log()](../sql-reference/functions/math-functions.md) functions. + +[Original article](https://clickhouse.com/docs/en/guides/apply_catboost_model/) diff --git a/docs/en/guides/index.md b/docs/en/guides/index.md new file mode 100644 index 00000000000..eb4ca9af367 --- /dev/null +++ b/docs/en/guides/index.md @@ -0,0 +1,14 @@ +--- +toc_folder_title: Guides +toc_priority: 38 +toc_title: Overview +--- + +# ClickHouse Guides {#clickhouse-guides} + +List of detailed step-by-step instructions that help to solve various tasks using ClickHouse: + +- [Tutorial on simple cluster set-up](../getting-started/tutorial.md) +- [Applying a CatBoost model in ClickHouse](../guides/apply-catboost-model.md) + +[Original article](https://clickhouse.com/docs/en/guides/) diff --git a/docs/en/index.md b/docs/en/index.md new file mode 100644 index 00000000000..532be035bbc --- /dev/null +++ b/docs/en/index.md @@ -0,0 +1,95 @@ +--- +toc_priority: 0 +toc_title: Overview +--- + +# What Is ClickHouse? {#what-is-clickhouse} + +ClickHouse® is a column-oriented database management system (DBMS) for online analytical processing of queries (OLAP). + +In a “normal” row-oriented DBMS, data is stored in this order: + +| Row | WatchID | JavaEnable | Title | GoodEvent | EventTime | +|-----|-------------|------------|--------------------|-----------|---------------------| +| #0 | 89354350662 | 1 | Investor Relations | 1 | 2016-05-18 05:19:20 | +| #1 | 90329509958 | 0 | Contact us | 1 | 2016-05-18 08:10:20 | +| #2 | 89953706054 | 1 | Mission | 1 | 2016-05-18 07:38:00 | +| #N | … | … | … | … | … | + +In other words, all the values related to a row are physically stored next to each other. + +Examples of a row-oriented DBMS are MySQL, Postgres, and MS SQL Server. + +In a column-oriented DBMS, data is stored like this: + +| Row: | #0 | #1 | #2 | #N | +|-------------|---------------------|---------------------|---------------------|-----| +| WatchID: | 89354350662 | 90329509958 | 89953706054 | … | +| JavaEnable: | 1 | 0 | 1 | … | +| Title: | Investor Relations | Contact us | Mission | … | +| GoodEvent: | 1 | 1 | 1 | … | +| EventTime: | 2016-05-18 05:19:20 | 2016-05-18 08:10:20 | 2016-05-18 07:38:00 | … | + +These examples only show the order that data is arranged in. The values from different columns are stored separately, and data from the same column is stored together. + +Examples of a column-oriented DBMS: Vertica, Paraccel (Actian Matrix and Amazon Redshift), Sybase IQ, Exasol, Infobright, InfiniDB, MonetDB (VectorWise and Actian Vector), LucidDB, SAP HANA, Google Dremel, Google PowerDrill, Druid, and kdb+. + +Different orders for storing data are better suited to different scenarios. 
The data access scenario refers to what queries are made, how often, and in what proportion; how much data is read for each type of query – rows, columns, and bytes; the relationship between reading and updating data; the working size of the data and how locally it is used; whether transactions are used, and how isolated they are; requirements for data replication and logical integrity; requirements for latency and throughput for each type of query, and so on. + +The higher the load on the system, the more important it is to customize the system set up to match the requirements of the usage scenario, and the more fine grained this customization becomes. There is no system that is equally well-suited to significantly different scenarios. If a system is adaptable to a wide set of scenarios, under a high load, the system will handle all the scenarios equally poorly, or will work well for just one or few of possible scenarios. + +## Key Properties of OLAP Scenario {#key-properties-of-olap-scenario} + +- The vast majority of requests are for read access. +- Data is updated in fairly large batches (\> 1000 rows), not by single rows; or it is not updated at all. +- Data is added to the DB but is not modified. +- For reads, quite a large number of rows are extracted from the DB, but only a small subset of columns. +- Tables are “wide,” meaning they contain a large number of columns. +- Queries are relatively rare (usually hundreds of queries per server or less per second). +- For simple queries, latencies around 50 ms are allowed. +- Column values are fairly small: numbers and short strings (for example, 60 bytes per URL). +- Requires high throughput when processing a single query (up to billions of rows per second per server). +- Transactions are not necessary. +- Low requirements for data consistency. +- There is one large table per query. All tables are small, except for one. +- A query result is significantly smaller than the source data. In other words, data is filtered or aggregated, so the result fits in a single server’s RAM. + +It is easy to see that the OLAP scenario is very different from other popular scenarios (such as OLTP or Key-Value access). So it does not make sense to try to use OLTP or a Key-Value DB for processing analytical queries if you want to get decent performance. For example, if you try to use MongoDB or Redis for analytics, you will get very poor performance compared to OLAP databases. + +## Why Column-Oriented Databases Work Better in the OLAP Scenario {#why-column-oriented-databases-work-better-in-the-olap-scenario} + +Column-oriented databases are better suited to OLAP scenarios: they are at least 100 times faster in processing most queries. The reasons are explained in detail below, but the fact is easier to demonstrate visually: + +**Row-oriented DBMS** + +![Row-oriented](images/row-oriented.gif#) + +**Column-oriented DBMS** + +![Column-oriented](images/column-oriented.gif#) + +See the difference? + +### Input/output {#inputoutput} + +1. For an analytical query, only a small number of table columns need to be read. In a column-oriented database, you can read just the data you need. For example, if you need 5 columns out of 100, you can expect a 20-fold reduction in I/O. +2. Since data is read in packets, it is easier to compress. Data in columns is also easier to compress. This further reduces the I/O volume. +3. Due to the reduced I/O, more data fits in the system cache. 
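+
+As an illustration, a typical analytical query reads only a handful of columns; a minimal sketch, assuming a hypothetical `hits` table with `CounterID` and `UserID` columns:
+
+``` sql
+-- Only the three referenced columns are read from disk,
+-- regardless of how many columns the table has.
+SELECT
+    CounterID,
+    count(),
+    uniq(UserID)
+FROM hits
+GROUP BY CounterID
+ORDER BY count() DESC
+LIMIT 20
+```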
+ +For example, the query “count the number of records for each advertising platform” requires reading one “advertising platform ID” column, which takes up 1 byte uncompressed. If most of the traffic was not from advertising platforms, you can expect at least 10-fold compression of this column. When using a quick compression algorithm, data decompression is possible at a speed of at least several gigabytes of uncompressed data per second. In other words, this query can be processed at a speed of approximately several billion rows per second on a single server. This speed is actually achieved in practice. + +### CPU {#cpu} + +Since executing a query requires processing a large number of rows, it helps to dispatch all operations for entire vectors instead of for separate rows, or to implement the query engine so that there is almost no dispatching cost. If you do not do this, with any half-decent disk subsystem, the query interpreter inevitably stalls the CPU. It makes sense to both store data in columns and process it, when possible, by columns. + +There are two ways to do this: + +1. A vector engine. All operations are written for vectors, instead of for separate values. This means you do not need to call operations very often, and dispatching costs are negligible. Operation code contains an optimized internal cycle. + +2. Code generation. The code generated for the query has all the indirect calls in it. + +This is not done in “normal” databases, because it does not make sense when running simple queries. However, there are exceptions. For example, MemSQL uses code generation to reduce latency when processing SQL queries. (For comparison, analytical DBMSs require optimization of throughput, not latency.) + +Note that for CPU efficiency, the query language must be declarative (SQL or MDX), or at least a vector (J, K). The query should only contain implicit loops, allowing for optimization. + +{## [Original article](https://clickhouse.com/docs/en/) ##} diff --git a/docs/en/interfaces/cli.md b/docs/en/interfaces/cli.md index 9ef1cea280a..eaf7a96ce42 100644 --- a/docs/en/interfaces/cli.md +++ b/docs/en/interfaces/cli.md @@ -1,13 +1,13 @@ --- -sidebar_position: 17 -sidebar_label: Command-Line Client +toc_priority: 17 +toc_title: Command-Line Client --- # Command-line Client {#command-line-client} ClickHouse provides a native command-line client: `clickhouse-client`. The client supports command-line options and configuration files. For more information, see [Configuring](#interfaces_cli_configuration). -[Install](../../quick-start.mdx) it from the `clickhouse-client` package and run it with the command `clickhouse-client`. +[Install](../getting-started/index.md) it from the `clickhouse-client` package and run it with the command `clickhouse-client`. 
``` bash $ clickhouse-client diff --git a/docs/en/interfaces/cpp.md b/docs/en/interfaces/cpp.md index a7b4188799e..dcd1228ea0f 100644 --- a/docs/en/interfaces/cpp.md +++ b/docs/en/interfaces/cpp.md @@ -1,6 +1,6 @@ --- -sidebar_position: 24 -sidebar_label: C++ Client Library +toc_priority: 24 +toc_title: C++ Client Library --- # C++ Client Library {#c-client-library} diff --git a/docs/en/interfaces/formats.md b/docs/en/interfaces/formats.md index 801b7c1a14f..a7066fca087 100644 --- a/docs/en/interfaces/formats.md +++ b/docs/en/interfaces/formats.md @@ -1,6 +1,6 @@ --- -sidebar_position: 21 -sidebar_label: Input and Output Formats +toc_priority: 21 +toc_title: Input and Output Formats --- # Formats for Input and Output Data {#formats} @@ -51,6 +51,7 @@ The supported formats are: | [PrettySpace](#prettyspace) | ✗ | ✔ | | [Protobuf](#protobuf) | ✔ | ✔ | | [ProtobufSingle](#protobufsingle) | ✔ | ✔ | +| [ProtobufList](#protobuflist) | ✔ | ✔ | | [Avro](#data-format-avro) | ✔ | ✔ | | [AvroConfluent](#data-format-avro-confluent) | ✔ | ✗ | | [Parquet](#data-format-parquet) | ✔ | ✔ | @@ -64,7 +65,7 @@ The supported formats are: | [Null](#null) | ✗ | ✔ | | [XML](#xml) | ✗ | ✔ | | [CapnProto](#capnproto) | ✔ | ✔ | -| [LineAsString](#lineasstring) | ✔ | ✗ | +| [LineAsString](#lineasstring) | ✔ | ✔ | | [Regexp](#data-format-regexp) | ✔ | ✗ | | [RawBLOB](#rawblob) | ✔ | ✔ | | [MsgPack](#msgpack) | ✔ | ✔ | @@ -401,7 +402,7 @@ Parsing allows the presence of the additional field `tskv` without the equal sig Comma Separated Values format ([RFC](https://tools.ietf.org/html/rfc4180)). -When formatting, rows are enclosed in double-quotes. A double quote inside a string is output as two double quotes in a row. There are no other rules for escaping characters. Date and date-time are enclosed in double-quotes. Numbers are output without quotes. Values are separated by a delimiter character, which is `,` by default. The delimiter character is defined in the setting [format_csv_delimiter](../operations/settings/settings.md#settings-format_csv_delimiter). Rows are separated using the Unix line feed (LF). Arrays are serialized in CSV as follows: first, the array is serialized to a string as in TabSeparated format, and then the resulting string is output to CSV in double-quotes. Tuples in CSV format are serialized as separate columns (that is, their nesting in the tuple is lost). +When formatting, strings are enclosed in double-quotes. A double quote inside a string is output as two double quotes in a row. There are no other rules for escaping characters. Date and date-time are enclosed in double-quotes. Numbers are output without quotes. Values are separated by a delimiter character, which is `,` by default. The delimiter character is defined in the setting [format_csv_delimiter](../operations/settings/settings.md#settings-format_csv_delimiter). Rows are separated using the Unix line feed (LF). Arrays are serialized in CSV as follows: first, the array is serialized to a string as in TabSeparated format, and then the resulting string is output to CSV in double-quotes. Tuples in CSV format are serialized as separate columns (that is, their nesting in the tuple is lost). ``` bash $ clickhouse-client --format_csv_delimiter="|" --query="INSERT INTO test.csv FORMAT CSV" < data.csv @@ -409,7 +410,7 @@ $ clickhouse-client --format_csv_delimiter="|" --query="INSERT INTO test.csv FOR \*By default, the delimiter is `,`. 
See the [format_csv_delimiter](../operations/settings/settings.md#settings-format_csv_delimiter) setting for more information. -When parsing, all values can be parsed either with or without quotes. Both double and single quotes are supported. Rows can also be arranged without quotes. In this case, they are parsed up to the delimiter character or line feed (CR or LF). In violation of the RFC, when parsing rows without quotes, the leading and trailing spaces and tabs are ignored. For the line feed, Unix (LF), Windows (CR LF) and Mac OS Classic (CR LF) types are all supported. +When parsing, all values can be parsed either with or without quotes. Both double and single quotes are supported. Strings can also be arranged without quotes. In this case, they are parsed up to the delimiter character or line feed (CR or LF). In violation of the RFC, when parsing strings without quotes, the leading and trailing spaces and tabs are ignored. For the line feed, Unix (LF), Windows (CR LF) and Mac OS Classic (CR LF) types are all supported. If setting [input_format_csv_empty_as_default](../operations/settings/settings.md#settings-input_format_csv_empty_as_default) is enabled, empty unquoted input values are replaced with default values. For complex default expressions [input_format_defaults_for_omitted_fields](../operations/settings/settings.md#settings-input_format_defaults_for_omitted_fields) must be enabled too. @@ -764,9 +765,8 @@ CREATE TABLE IF NOT EXISTS example_table - If `input_format_defaults_for_omitted_fields = 0`, then the default value for `x` and `a` equals `0` (as the default value for the `UInt32` data type). - If `input_format_defaults_for_omitted_fields = 1`, then the default value for `x` equals `0`, but the default value of `a` equals `x * 2`. -:::warning -When inserting data with `input_format_defaults_for_omitted_fields = 1`, ClickHouse consumes more computational resources, compared to insertion with `input_format_defaults_for_omitted_fields = 0`. -::: +!!! note "Warning" + When inserting data with `input_format_defaults_for_omitted_fields = 1`, ClickHouse consumes more computational resources, compared to insertion with `input_format_defaults_for_omitted_fields = 0`. ### Selecting Data {#selecting-data} @@ -788,9 +788,8 @@ The query `SELECT * FROM UserActivity FORMAT JSONEachRow` returns: Unlike the [JSON](#json) format, there is no substitution of invalid UTF-8 sequences. Values are escaped in the same way as for `JSON`. -:::info -Any set of bytes can be output in the strings. Use the `JSONEachRow` format if you are sure that the data in the table can be formatted as JSON without losing any information. -::: +!!! note "Note" + Any set of bytes can be output in the strings. Use the `JSONEachRow` format if you are sure that the data in the table can be formatted as JSON without losing any information. ### Usage of Nested Structures {#jsoneachrow-nested} @@ -1232,7 +1231,38 @@ See also [how to read/write length-delimited protobuf messages in popular langua ## ProtobufSingle {#protobufsingle} -Same as [Protobuf](#protobuf) but for storing/parsing single Protobuf message without length delimiters. +Same as [Protobuf](#protobuf) but for storing/parsing a single Protobuf message without length delimiter. +As a result, only a single table row can be written/read. + +## ProtobufList {#protobuflist} + +Similar to Protobuf but rows are represented as a sequence of sub-messages contained in a message with fixed name "Envelope". 
+ +Usage example: + +``` sql +SELECT * FROM test.table FORMAT ProtobufList SETTINGS format_schema = 'schemafile:MessageType' +``` + +``` bash +cat protobuflist_messages.bin | clickhouse-client --query "INSERT INTO test.table FORMAT ProtobufList SETTINGS format_schema='schemafile:MessageType'" +``` + +where the file `schemafile.proto` looks like this: + +``` capnp +syntax = "proto3"; + +message Envelope { + message MessageType { + string name = 1; + string surname = 2; + uint32 birthDate = 3; + repeated string phoneNumbers = 4; + }; + MessageType row = 1; +}; +``` ## Avro {#data-format-avro} @@ -1342,9 +1372,8 @@ SET format_avro_schema_registry_url = 'http://schema-registry'; SELECT * FROM topic1_stream; ``` -:::warning -Setting `format_avro_schema_registry_url` needs to be configured in `users.xml` to maintain it’s value after a restart. Also you can use the `format_avro_schema_registry_url` setting of the `Kafka` table engine. -::: +!!! note "Warning" + Setting `format_avro_schema_registry_url` needs to be configured in `users.xml` to maintain it’s value after a restart. Also you can use the `format_avro_schema_registry_url` setting of the `Kafka` table engine. ## Parquet {#data-format-parquet} @@ -1367,7 +1396,8 @@ The table below shows supported data types and how they match ClickHouse [data t | `FLOAT`, `HALF_FLOAT` | [Float32](../sql-reference/data-types/float.md) | `FLOAT` | | `DOUBLE` | [Float64](../sql-reference/data-types/float.md) | `DOUBLE` | | `DATE32` | [Date](../sql-reference/data-types/date.md) | `UINT16` | -| `DATE64`, `TIMESTAMP` | [DateTime](../sql-reference/data-types/datetime.md) | `UINT32` | +| `DATE64` | [DateTime](../sql-reference/data-types/datetime.md) | `UINT32` | +| `TIMESTAMP` | [DateTime64](../sql-reference/data-types/datetime64.md) | `TIMESTAMP` | | `STRING`, `BINARY` | [String](../sql-reference/data-types/string.md) | `BINARY` | | — | [FixedString](../sql-reference/data-types/fixedstring.md) | `BINARY` | | `DECIMAL` | [Decimal](../sql-reference/data-types/decimal.md) | `DECIMAL` | @@ -1424,7 +1454,8 @@ The table below shows supported data types and how they match ClickHouse [data t | `FLOAT`, `HALF_FLOAT` | [Float32](../sql-reference/data-types/float.md) | `FLOAT32` | | `DOUBLE` | [Float64](../sql-reference/data-types/float.md) | `FLOAT64` | | `DATE32` | [Date](../sql-reference/data-types/date.md) | `UINT16` | -| `DATE64`, `TIMESTAMP` | [DateTime](../sql-reference/data-types/datetime.md) | `UINT32` | +| `DATE64` | [DateTime](../sql-reference/data-types/datetime.md) | `UINT32` | +| `TIMESTAMP` | [DateTime64](../sql-reference/data-types/datetime64.md) | `TIMESTAMP` | | `STRING`, `BINARY` | [String](../sql-reference/data-types/string.md) | `BINARY` | | `STRING`, `BINARY` | [FixedString](../sql-reference/data-types/fixedstring.md) | `BINARY` | | `DECIMAL` | [Decimal](../sql-reference/data-types/decimal.md) | `DECIMAL` | @@ -1486,7 +1517,8 @@ The table below shows supported data types and how they match ClickHouse [data t | `FLOAT`, `HALF_FLOAT` | [Float32](../sql-reference/data-types/float.md) | `FLOAT` | | `DOUBLE` | [Float64](../sql-reference/data-types/float.md) | `DOUBLE` | | `DATE32` | [Date](../sql-reference/data-types/date.md) | `DATE32` | -| `DATE64`, `TIMESTAMP` | [DateTime](../sql-reference/data-types/datetime.md) | `TIMESTAMP` | +| `DATE64` | [DateTime](../sql-reference/data-types/datetime.md) | `UINT32` | +| `TIMESTAMP` | [DateTime64](../sql-reference/data-types/datetime64.md) | `TIMESTAMP` | | `STRING`, `BINARY` | 
[String](../sql-reference/data-types/string.md) | `BINARY` | | `DECIMAL` | [Decimal](../sql-reference/data-types/decimal.md) | `DECIMAL` | | `LIST` | [Array](../sql-reference/data-types/array.md) | `LIST` | diff --git a/docs/en/interfaces/grpc.md b/docs/en/interfaces/grpc.md index 6ada38c6220..b30715082ec 100644 --- a/docs/en/interfaces/grpc.md +++ b/docs/en/interfaces/grpc.md @@ -1,6 +1,6 @@ --- -sidebar_position: 19 -sidebar_label: gRPC Interface +toc_priority: 19 +toc_title: gRPC Interface --- # gRPC Interface {#grpc-interface} diff --git a/docs/en/interfaces/http.md b/docs/en/interfaces/http.md index a97cf6671b2..d72fb4d6f17 100644 --- a/docs/en/interfaces/http.md +++ b/docs/en/interfaces/http.md @@ -1,6 +1,6 @@ --- -sidebar_position: 19 -sidebar_label: HTTP Interface +toc_priority: 19 +toc_title: HTTP Interface --- # HTTP Interface {#http-interface} @@ -178,9 +178,8 @@ You can also choose to use [HTTP compression](https://en.wikipedia.org/wiki/HTTP To send a compressed `POST` request, append the request header `Content-Encoding: compression_method`. In order for ClickHouse to compress the response, enable compression with [enable_http_compression](../operations/settings/settings.md#settings-enable_http_compression) setting and append `Accept-Encoding: compression_method` header to the request. You can configure the data compression level in the [http_zlib_compression_level](../operations/settings/settings.md#settings-http_zlib_compression_level) setting for all compression methods. -:::info -Some HTTP clients might decompress data from the server by default (with `gzip` and `deflate`) and you might get decompressed data even if you use the compression settings correctly. -::: +!!! note "Note" + Some HTTP clients might decompress data from the server by default (with `gzip` and `deflate`) and you might get decompressed data even if you use the compression settings correctly. **Examples** @@ -440,9 +439,8 @@ Next are the configuration methods for different `type`. The following example defines the values of [max_threads](../operations/settings/settings.md#settings-max_threads) and `max_final_threads` settings, then queries the system table to check whether these settings were set successfully. -:::warning -To keep the default `handlers` such as` query`, `play`,` ping`, add the `` rule. -::: +!!! note "Warning" + To keep the default `handlers` such as` query`, `play`,` ping`, use the `` rule. Example: @@ -471,9 +469,8 @@ $ curl -H 'XXX:TEST_HEADER_VALUE' -H 'PARAMS_XXX:max_threads' 'http://localhost: max_final_threads 2 ``` -:::warning -In one `predefined_query_handler` only supports one `query` of an insert type. -::: +!!! note "caution" + In one `predefined_query_handler` only supports one `query` of an insert type. 
### dynamic_query_handler {#dynamic_query_handler} diff --git a/docs/en/interfaces/index.md b/docs/en/interfaces/index.md index 16e97ed7c62..e747b93a1a6 100644 --- a/docs/en/interfaces/index.md +++ b/docs/en/interfaces/index.md @@ -1,8 +1,7 @@ --- -sidebar_label: Interfaces -sidebar_position: 34 -keywords: [clickhouse, network, interfaces, http, tcp, grpc, command-line, client, jdbc, odbc, driver] -description: ClickHouse provides three network interfaces +toc_folder_title: Interfaces +toc_priority: 14 +toc_title: Introduction --- # Interfaces {#interfaces} diff --git a/docs/en/interfaces/jdbc.md b/docs/en/interfaces/jdbc.md index 4bea0600a2a..cf97568a8de 100644 --- a/docs/en/interfaces/jdbc.md +++ b/docs/en/interfaces/jdbc.md @@ -1,12 +1,11 @@ --- -sidebar_position: 22 -sidebar_label: JDBC Driver +toc_priority: 22 +toc_title: JDBC Driver --- # JDBC Driver {#jdbc-driver} -Use the [official JDBC driver](https://github.com/ClickHouse/clickhouse-jdbc) (and Java client) to access ClickHouse from your Java applications. - +- **[Official driver](https://github.com/ClickHouse/clickhouse-jdbc)** - Third-party drivers: - [ClickHouse-Native-JDBC](https://github.com/housepower/ClickHouse-Native-JDBC) - [clickhouse4j](https://github.com/blynkkk/clickhouse4j) diff --git a/docs/en/interfaces/mysql.md b/docs/en/interfaces/mysql.md index df8ef38d671..9932e6b6cb3 100644 --- a/docs/en/interfaces/mysql.md +++ b/docs/en/interfaces/mysql.md @@ -1,6 +1,6 @@ --- -sidebar_position: 20 -sidebar_label: MySQL Interface +toc_priority: 20 +toc_title: MySQL Interface --- # MySQL Interface {#mysql-interface} diff --git a/docs/en/interfaces/odbc.md b/docs/en/interfaces/odbc.md index 4c807654c28..fa58ed8b43e 100644 --- a/docs/en/interfaces/odbc.md +++ b/docs/en/interfaces/odbc.md @@ -1,12 +1,10 @@ --- -sidebar_position: 23 -sidebar_label: ODBC Driver +toc_priority: 23 +toc_title: ODBC Driver --- # ODBC Driver {#odbc-driver} -Use the [official ODBC driver](https://github.com/ClickHouse/clickhouse-odbc) for accessing ClickHouse as a data source. - - +- [Official driver](https://github.com/ClickHouse/clickhouse-odbc) [Original article](https://clickhouse.com/docs/en/interfaces/odbc/) diff --git a/docs/en/interfaces/tcp.md b/docs/en/interfaces/tcp.md index 5f2f400799f..b23f8110320 100644 --- a/docs/en/interfaces/tcp.md +++ b/docs/en/interfaces/tcp.md @@ -1,6 +1,6 @@ --- -sidebar_position: 18 -sidebar_label: Native Interface (TCP) +toc_priority: 18 +toc_title: Native Interface (TCP) --- # Native Interface (TCP) {#native-interface-tcp} diff --git a/docs/en/interfaces/third-party/client-libraries.md b/docs/en/interfaces/third-party/client-libraries.md index 885e9f430f2..8d1ff12cf0a 100644 --- a/docs/en/interfaces/third-party/client-libraries.md +++ b/docs/en/interfaces/third-party/client-libraries.md @@ -1,13 +1,12 @@ --- -sidebar_position: 26 -sidebar_label: Client Libraries +toc_priority: 26 +toc_title: Client Libraries --- # Client Libraries from Third-party Developers {#client-libraries-from-third-party-developers} -:::warning -ClickHouse Inc does **not** maintain the libraries listed below and hasn’t done any extensive testing to ensure their quality. -::: +!!! warning "Disclaimer" + ClickHouse Inc does **not** maintain the libraries listed below and hasn’t done any extensive testing to ensure their quality. 
- Python - [infi.clickhouse_orm](https://github.com/Infinidat/infi.clickhouse_orm) diff --git a/docs/en/interfaces/third-party/gui.md b/docs/en/interfaces/third-party/gui.md index 92d00f2812c..c0e270b7207 100644 --- a/docs/en/interfaces/third-party/gui.md +++ b/docs/en/interfaces/third-party/gui.md @@ -1,6 +1,6 @@ --- -sidebar_position: 28 -sidebar_label: Visual Interfaces +toc_priority: 28 +toc_title: Visual Interfaces --- # Visual Interfaces from Third-party Developers {#visual-interfaces-from-third-party-developers} diff --git a/docs/en/interfaces/third-party/index.md b/docs/en/interfaces/third-party/index.md index c9be2b6ada9..caf100681b4 100644 --- a/docs/en/interfaces/third-party/index.md +++ b/docs/en/interfaces/third-party/index.md @@ -1,6 +1,6 @@ --- toc_folder_title: Third-Party -sidebar_position: 24 +toc_priority: 24 --- # Third-Party Interfaces {#third-party-interfaces} @@ -12,6 +12,5 @@ This is a collection of links to third-party tools that provide some sort of int - [GUI](../../interfaces/third-party/gui.md) - [Proxies](../../interfaces/third-party/proxy.md) -:::note -Generic tools that support common API like [ODBC](../../interfaces/odbc.md) or [JDBC](../../interfaces/jdbc.md) usually can work with ClickHouse as well, but are not listed here because there are way too many of them. -::: \ No newline at end of file +!!! note "Note" + Generic tools that support common API like [ODBC](../../interfaces/odbc.md) or [JDBC](../../interfaces/jdbc.md) usually can work with ClickHouse as well, but are not listed here because there are way too many of them. diff --git a/docs/en/interfaces/third-party/integrations.md b/docs/en/interfaces/third-party/integrations.md index ae055d63a9d..3aac78f0878 100644 --- a/docs/en/interfaces/third-party/integrations.md +++ b/docs/en/interfaces/third-party/integrations.md @@ -1,13 +1,12 @@ --- -sidebar_position: 27 -sidebar_label: Integrations +toc_priority: 27 +toc_title: Integrations --- # Integration Libraries from Third-party Developers {#integration-libraries-from-third-party-developers} -:::warning Disclaimer -ClickHouse, Inc. does **not** maintain the tools and libraries listed below and haven’t done extensive testing to ensure their quality. -::: +!!! warning "Disclaimer" + ClickHouse, Inc. does **not** maintain the tools and libraries listed below and haven’t done extensive testing to ensure their quality. ## Infrastructure Products {#infrastructure-products} diff --git a/docs/en/interfaces/third-party/proxy.md b/docs/en/interfaces/third-party/proxy.md index 45077cb6a89..31a2d5afae9 100644 --- a/docs/en/interfaces/third-party/proxy.md +++ b/docs/en/interfaces/third-party/proxy.md @@ -1,6 +1,6 @@ --- -sidebar_position: 29 -sidebar_label: Proxies +toc_priority: 29 +toc_title: Proxies --- # Proxy Servers from Third-party Developers {#proxy-servers-from-third-party-developers} diff --git a/docs/en/introduction/adopters.md b/docs/en/introduction/adopters.md new file mode 100644 index 00000000000..ad199ce452e --- /dev/null +++ b/docs/en/introduction/adopters.md @@ -0,0 +1,200 @@ +--- +toc_priority: 8 +toc_title: Adopters +--- + +# ClickHouse Adopters {#clickhouse-adopters} + +!!! warning "Disclaimer" + The following list of companies using ClickHouse and their success stories is assembled from public sources, thus might differ from current reality. 
We’d appreciate it if you share the story of adopting ClickHouse in your company and [add it to the list](https://github.com/ClickHouse/ClickHouse/edit/master/docs/en/introduction/adopters.md), but please make sure you won’t have any NDA issues by doing so. Providing updates with publications from other companies is also useful. + +| Company | Industry | Usecase | Cluster Size | (Un)Compressed Data Size\* | Reference | +|---------|----------|---------|--------------|------------------------------------------------------------------------------|-----------| +| 2gis | Maps | Monitoring | — | — | [Talk in Russian, July 2019](https://youtu.be/58sPkXfq6nw) | +| Adapty | Subscription Analytics | Main product | — | — | [Tweet, November 2021](https://twitter.com/iwitaly/status/1462698148061659139) | +| Admiral | Martech | Engagement Management | — | — | [Webinar Slides, June 2020](https://altinity.com/presentations/2020/06/16/big-data-in-real-time-how-clickhouse-powers-admirals-visitor-relationships-for-publishers) | +| AdScribe | Ads | TV Analytics | — | — | [A quote from CTO](https://altinity.com/24x7-support/) | +| Ahrefs | SEO | Analytics | — | — | [Job listing](https://ahrefs.com/jobs/data-scientist-search) | +| Alibaba Cloud | Cloud | Managed Service | — | — | [Official Website](https://help.aliyun.com/product/144466.html) | +| Alibaba Cloud | Cloud | E-MapReduce | — | — | [Official Website](https://help.aliyun.com/document_detail/212195.html) | +| Aloha Browser | Mobile App | Browser backend | — | — | [Slides in Russian, May 2019](https://presentations.clickhouse.com/meetup22/aloha.pdf) | +| Altinity | Cloud, SaaS | Main product | — | — | [Official Website](https://altinity.com/) | +| Amadeus | Travel | Analytics | — | — | [Press Release, April 2018](https://www.altinity.com/blog/2018/4/5/amadeus-technologies-launches-investment-and-insights-tool-based-on-machine-learning-and-strategy-algorithms) | +| ApiRoad | API marketplace | Analytics | — | — | [Blog post, November 2018, March 2020](https://pixeljets.com/blog/clickhouse-vs-elasticsearch/) | +| Appsflyer | Mobile analytics | Main product | — | — | [Talk in Russian, July 2019](https://www.youtube.com/watch?v=M3wbRlcpBbY) | +| ArenaData | Data Platform | Main product | — | — | [Slides in Russian, December 2019](https://github.com/ClickHouse/clickhouse-presentations/blob/master/meetup38/indexes.pdf) | +| Argedor | ClickHouse support | — | — | — | [Official website](https://www.argedor.com/en/clickhouse/) | +| Avito | Classifieds | Monitoring | — | — | [Meetup, April 2020](https://www.youtube.com/watch?v=n1tm4j4W8ZQ) | +| Badoo | Dating | Timeseries | — | 1.6 mln events/sec (2018) | [Slides in Russian, December 2019](https://presentations.clickhouse.com/meetup38/forecast.pdf) | +| Beeline | Telecom | Data Platform | — | — | [Blog post, July 2021](https://habr.com/en/company/beeline/blog/567508/) | +| Benocs | Network Telemetry and Analytics | Main Product | — | — | [Slides in English, October 2017](https://github.com/ClickHouse/clickhouse-presentations/blob/master/meetup9/lpm.pdf) | +| BIGO | Video | Computing Platform | — | — | [Blog Article, August 2020](https://www.programmersought.com/article/44544895251/) | +| BiliBili | Video sharing | — | — | — | [Blog post, June 2021](https://chowdera.com/2021/06/20210622012241476b.html) | +| Bloomberg | Finance, Media | Monitoring | — | — | [Job opening, September 2021](https://careers.bloomberg.com/job/detail/94913), [slides, May 
2018](https://www.slideshare.net/Altinity/http-analytics-for-6m-requests-per-second-using-clickhouse-by-alexander-bocharov) | +| Bloxy | Blockchain | Analytics | — | — | [Slides in Russian, August 2018](https://github.com/ClickHouse/clickhouse-presentations/blob/master/meetup17/4_bloxy.pptx) | +| Bytedance | Social platforms | — | — | — | [The ClickHouse Meetup East, October 2020](https://www.youtube.com/watch?v=ckChUkC3Pns) | +| CardsMobile | Finance | Analytics | — | — | [VC.ru](https://vc.ru/s/cardsmobile/143449-rukovoditel-gruppy-analiza-dannyh) | +| CARTO | Business Intelligence | Geo analytics | — | — | [Geospatial processing with ClickHouse](https://carto.com/blog/geospatial-processing-with-clickhouse/) | +| CERN | Research | Experiment | — | — | [Press release, April 2012](https://www.yandex.com/company/press_center/press_releases/2012/2012-04-10/) | +| Checkly | Software Development | Analytics | — | — | [Tweet, October 2021](https://twitter.com/tim_nolet/status/1445810665743081474?s=20) | +| ChelPipe Group | Analytics | — | — | — | [Blog post, June 2021](https://vc.ru/trade/253172-tyazhelomu-proizvodstvu-user-friendly-sayt-internet-magazin-trub-dlya-chtpz) | +| Cisco | Networking | Traffic analysis | — | — | [Lightning talk, October 2019](https://youtu.be/-hI1vDR2oPY?t=5057) | +| Citadel Securities | Finance | — | — | — | [Contribution, March 2019](https://github.com/ClickHouse/ClickHouse/pull/4774) | +| Citymobil | Taxi | Analytics | — | — | [Blog Post in Russian, March 2020](https://habr.com/en/company/citymobil/blog/490660/) | +| Cloudflare | CDN | Traffic analysis | 36 servers | — | [Blog post, May 2017](https://blog.cloudflare.com/how-cloudflare-analyzes-1m-dns-queries-per-second/), [Blog post, March 2018](https://blog.cloudflare.com/http-analytics-for-6m-requests-per-second-using-clickhouse/) | +| Comcast | Media | CDN Traffic Analysis | — | — | [ApacheCon 2019 Talk](https://www.youtube.com/watch?v=e9TZ6gFDjNg) | +| ContentSquare | Web analytics | Main product | — | — | [Blog post in French, November 2018](http://souslecapot.net/2018/11/21/patrick-chatain-vp-engineering-chez-contentsquare-penser-davantage-amelioration-continue-que-revolution-constante/) | +| Corunet | Analytics | Main product | — | — | [Slides in English, April 2019](https://github.com/ClickHouse/clickhouse-presentations/blob/master/meetup21/predictive_models.pdf) | +| CraiditX 氪信 | Finance AI | Analysis | — | — | [Slides in English, November 2019](https://github.com/ClickHouse/clickhouse-presentations/blob/master/meetup33/udf.pptx) | +| Crazypanda | Games | | — | — | Live session on ClickHouse meetup | +| Criteo | Retail | Main product | — | — | [Slides in English, October 2018](https://github.com/ClickHouse/clickhouse-presentations/blob/master/meetup18/3_storetail.pptx) | +| Cryptology | Digital Assets Trading Platform | — | — | — | [Job advertisement, March 2021](https://career.habr.com/companies/cryptology/vacancies) | +| Dataliance for China Telecom | Telecom | Analytics | — | — | [Slides in Chinese, January 2018](https://github.com/ClickHouse/clickhouse-presentations/blob/master/meetup12/telecom.pdf) | +| Deutsche Bank | Finance | BI Analytics | — | — | [Slides in English, October 2019](https://bigdatadays.ru/wp-content/uploads/2019/10/D2-H3-3_Yakunin-Goihburg.pdf) | +| Deepl | Machine Learning | — | — | — | [Video, October 2021](https://www.youtube.com/watch?v=WIYJiPwxXdM&t=1182s) | +| Deeplay | Gaming Analytics | — | — | — | [Job advertisement, 2020](https://career.habr.com/vacancies/1000062568) | 
+| Diva-e | Digital consulting | Main Product | — | — | [Slides in English, September 2019](https://github.com/ClickHouse/clickhouse-presentations/blob/master/meetup29/ClickHouse-MeetUp-Unusual-Applications-sd-2019-09-17.pdf) | +| Ecommpay | Payment Processing | Logs | — | — | [Video, Nov 2019](https://www.youtube.com/watch?v=d3GdZTOWGLk) | +| Ecwid | E-commerce SaaS | Metrics, Logging | — | — | [Slides in Russian, April 2019](https://nastachku.ru/var/files/1/presentation/backend/2_Backend_6.pdf) | +| eBay | E-commerce | Logs, Metrics and Events | — | — | [Official website, Sep 2020](https://tech.ebayinc.com/engineering/ou-online-analytical-processing/) | +| Exness | Trading | Metrics, Logging | — | — | [Talk in Russian, May 2019](https://youtu.be/_rpU-TvSfZ8?t=3215) | +| EventBunker.io | Serverless Data Processing | — | — | — | [Tweet, April 2021](https://twitter.com/Halil_D_/status/1379839133472985091) | +| FastNetMon | DDoS Protection | Main Product | | — | [Official website](https://fastnetmon.com/docs-fnm-advanced/fastnetmon-advanced-traffic-persistency/) | +| Firebolt | Analytics | Main product | - | - | [YouTube Tech Talk](https://www.youtube.com/watch?v=9rW9uEJ15tU) | +| Flipkart | e-Commerce | — | — | — | [Talk in English, July 2020](https://youtu.be/GMiXCMFDMow?t=239) | +| FunCorp | Games | | — | 14 bn records/day as of Jan 2021 | [Article](https://www.altinity.com/blog/migrating-from-redshift-to-clickhouse) | +| Futurra Group | Analytics | — | — | — | [Article in Russian, December 2021](https://dou.ua/forums/topic/35587/) | +| Geniee | Ad network | Main product | — | — | [Blog post in Japanese, July 2017](https://tech.geniee.co.jp/entry/2017/07/20/160100) | +| Genotek | Bioinformatics | Main product | — | — | [Video, August 2020](https://youtu.be/v3KyZbz9lEE) | +| Gigapipe | Managed ClickHouse | Main product | — | — | [Official website](https://gigapipe.com/) | +| Gigasheet | Analytics | Main product | — | — | Direct Reference, February 2022| +| Glaber | Monitoring | Main product | — | — | [Website](https://glaber.io/) | +| GraphCDN | CDN | Traffic Analytics | — | — | [Blog Post in English, August 2021](https://altinity.com/blog/delivering-insight-on-graphql-apis-with-clickhouse-at-graphcdn/) | +| Grouparoo | Data Warehouse Integrations | Main product | — | — | [Official Website, November 2021](https://www.grouparoo.com/integrations) | +| HUYA | Video Streaming | Analytics | — | — | [Slides in Chinese, October 2018](https://github.com/ClickHouse/clickhouse-presentations/blob/master/meetup19/7.%20ClickHouse万亿数据分析实践%20李本旺(sundy-li)%20虎牙.pdf) | +| Hydrolix | Cloud data platform | Main product | — | — | [Documentation](https://docs.hydrolix.io/guide/query) | +| Hystax | Cloud Operations | Observability Analytics | - | - | [Blog](https://hystax.com/clickhouse-for-real-time-cost-saving-analytics-how-to-stop-hammering-screws-and-use-an-electric-screwdriver/) | +| ICA | FinTech | Risk Management | — | — | [Blog Post in English, Sep 2020](https://altinity.com/blog/clickhouse-vs-redshift-performance-for-fintech-risk-management?utm_campaign=ClickHouse%20vs%20RedShift&utm_content=143520807&utm_medium=social&utm_source=twitter&hss_channel=tw-3894792263) | +| Idealista | Real Estate | Analytics | — | — | [Blog Post in English, April 2019](https://clickhouse.com/blog/en/clickhouse-meetup-in-madrid-on-april-2-2019) | +| Infobaleen | AI markting tool | Analytics | — | — | [Official site](https://infobaleen.com) | +| Infovista | Networks | Analytics | — | — | [Slides in English, October 
2019](https://github.com/ClickHouse/clickhouse-presentations/blob/master/meetup30/infovista.pdf) | +| InnoGames | Games | Metrics, Logging | — | — | [Slides in Russian, September 2019](https://github.com/ClickHouse/clickhouse-presentations/blob/master/meetup28/graphite_and_clickHouse.pdf) | +| Instabug | APM Platform | Main product | — | — | [A quote from Co-Founder](https://altinity.com/) | +| Instana | APM Platform | Main product | — | — | [Twitter post](https://twitter.com/mieldonkers/status/1248884119158882304) | +| Integros | Platform for video services | Analytics | — | — | [Slides in Russian, May 2019](https://github.com/ClickHouse/clickhouse-presentations/blob/master/meetup22/strategies.pdf) | +| Ippon Technologies | Technology Consulting | — | — | — | [Talk in English, July 2020](https://youtu.be/GMiXCMFDMow?t=205) | +| Ivi | Online Cinema | Analytics, Monitoring | — | — | [Article in Russian, Jan 2018](https://habr.com/en/company/ivi/blog/347408/) | +| Jinshuju 金数据 | BI Analytics | Main product | — | — | [Slides in Chinese, October 2019](https://github.com/ClickHouse/clickhouse-presentations/blob/master/meetup24/3.%20金数据数据架构调整方案Public.pdf) | +| Jitsu | Cloud Software | Data Pipeline | — | — | [Documentation](https://jitsu.com/docs/destinations-configuration/clickhouse-destination), [Hacker News post](https://news.ycombinator.com/item?id=29106082) | +| JuiceFS | Storage | Shopping Cart | - | - | [Blog](https://juicefs.com/blog/en/posts/shopee-clickhouse-with-juicefs/) | +| kakaocorp | Internet company | — | — | — | [if(kakao)2020](https://tv.kakao.com/channel/3693125/cliplink/414129353), [if(kakao)2021](https://if.kakao.com/session/24) | +| Kodiak Data | Clouds | Main product | — | — | [Slides in Engish, April 2018](https://github.com/ClickHouse/clickhouse-presentations/blob/master/meetup13/kodiak_data.pdf) | +| Kontur | Software Development | Metrics | — | — | [Talk in Russian, November 2018](https://www.youtube.com/watch?v=U4u4Bd0FtrY) | +| Kuaishou | Video | — | — | — | [ClickHouse Meetup, October 2018](https://clickhouse.com/blog/en/2018/clickhouse-community-meetup-in-beijing-on-october-28-2018/) | +| KGK Global | Vehicle monitoring | — | — | — | [Press release, June 2021](https://zoom.cnews.ru/news/item/530921) | +| LANCOM Systems | Network Solutions | Traffic analysis | - | - | [ClickHouse Operator for Kubernetes](https://www.lancom-systems.com/), [Hacker News post] (https://news.ycombinator.com/item?id=29413660) | +| Lawrence Berkeley National Laboratory | Research | Traffic analysis | 5 servers | 55 TiB | [Slides in English, April 2019](https://www.smitasin.com/presentations/2019-04-17_DOE-NSM.pdf) | +| Lever | Talent Management | Recruiting | - | - | [Hacker News post](https://news.ycombinator.com/item?id=29558544) | +| LifeStreet | Ad network | Main product | 75 servers (3 replicas) | 5.27 PiB | [Blog post in Russian, February 2017](https://habr.com/en/post/322620/) | +| Lookforsale | E-Commerce | — | — | — | [Job Posting, December 2021](https://telegram.me/javascript_jobs/587318) | +| Mail.ru Cloud Solutions | Cloud services | Main product | — | — | [Article in Russian](https://mcs.mail.ru/help/db-create/clickhouse#) | +| MAXILECT | Ad Tech, Blockchain, ML, AI | — | — | — | [Job advertisement, 2021](https://www.linkedin.com/feed/update/urn:li:activity:6780842017229430784/) | +| Marilyn | Advertising | Statistics | — | — | [Talk in Russian, June 2017](https://www.youtube.com/watch?v=iXlIgx2khwc) | +| Mello | Marketing | Analytics | 1 server | — | [Article, October 
2020](https://vc.ru/marketing/166180-razrabotka-tipovogo-otcheta-skvoznoy-analitiki) | +| MessageBird | Telecommunications | Statistics | — | — | [Slides in English, November 2018](https://github.com/ClickHouse/clickhouse-presentations/blob/master/meetup20/messagebird.pdf) | +| Microsoft | Web Analytics | Clarity (Main Product) | — | — | [A question on GitHub](https://github.com/ClickHouse/ClickHouse/issues/21556) | +| MindsDB | Machine Learning | Main Product | — | — | [Official Website](https://www.mindsdb.com/blog/machine-learning-models-as-tables-in-ch) | +| MUX | Online Video | Video Analytics | — | — | [Talk in English, August 2019](https://altinity.com/presentations/2019/8/13/how-clickhouse-became-the-default-analytics-database-for-mux/) | +| MGID | Ad network | Web-analytics | — | — | [Blog post in Russian, April 2020](http://gs-studio.com/news-about-it/32777----clickhouse---c) | +| Muse Group | Music Software | Performance Monitoring | — | — | [Blog post in Russian, January 2021](https://habr.com/en/post/647079/) | +| Netskope | Network Security | — | — | — | [Job advertisement, March 2021](https://www.mendeley.com/careers/job/senior-software-developer-backend-developer-1346348) | +| NIC Labs | Network Monitoring | RaTA-DNS | — | — | [Blog post, March 2021](https://niclabs.cl/ratadns/2021/03/Clickhouse) | +| NLMK | Steel | Monitoring | — | — | [Article in Russian, Jan 2022](https://habr.com/en/company/nlmk/blog/645943/) | +| NOC Project | Network Monitoring | Analytics | Main Product | — | [Official Website](https://getnoc.com/features/big-data/) | +| Noction | Network Technology | Main Product | — | — | [Official Website](https://www.noction.com/news/irp-3-11-remote-triggered-blackholing-capability) +| ntop | Network Monitoning | Monitoring | — | — | [Official website, Jan 2022](https://www.ntop.org/ntop/historical-traffic-analysis-at-scale-using-clickhouse-with-ntopng/) | +| Nuna Inc. 
| Health Data Analytics | — | — | — | [Talk in English, July 2020](https://youtu.be/GMiXCMFDMow?t=170) | +| Ok.ru | Social Network | — | 72 servers | 810 TB compressed, 50bn rows/day, 1.5 TB/day | [SmartData conference, October 2021](https://assets.ctfassets.net/oxjq45e8ilak/4JPHkbJenLgZhBGGyyonFP/57472ec6987003ec4078d0941740703b/____________________ClickHouse_______________________.pdf) | +| Omnicomm | Transportation Monitoring | — | — | — | [Facebook post, October 2021](https://www.facebook.com/OmnicommTeam/posts/2824479777774500) | +| OneAPM | Monitoring and Data Analysis | Main product | — | — | [Slides in Chinese, October 2018](https://github.com/ClickHouse/clickhouse-presentations/blob/master/meetup19/8.%20clickhouse在OneAPM的应用%20杜龙.pdf) | +| Opensee | Financial Analytics | Main product | - | - | [Blog](https://opensee.io/news/from-moscow-to-wall-street-the-remarkable-journey-of-clickhouse/) | +| Open Targets | Genome Research | Genome Search | — | — | [Tweet, October 2021](https://twitter.com/OpenTargets/status/1452570865342758913?s=20), [Blog](https://blog.opentargets.org/graphql/) | +| OZON | E-commerce | — | — | — | [Official website](https://job.ozon.ru/vacancy/razrabotchik-clickhouse-ekspluatatsiya-40991870/) | +| Panelbear | Analytics | Monitoring and Analytics | — | — | [Tech Stack, November 2020](https://panelbear.com/blog/tech-stack/) | +| Percent 百分点 | Analytics | Main Product | — | — | [Slides in Chinese, June 2019](https://github.com/ClickHouse/clickhouse-presentations/blob/master/meetup24/4.%20ClickHouse万亿数据双中心的设计与实践%20.pdf) | +| Percona | Performance analysis | Percona Monitoring and Management | — | — | [Official website, Mar 2020](https://www.percona.com/blog/2020/03/30/advanced-query-analysis-in-percona-monitoring-and-management-with-direct-clickhouse-access/) | +| Plausible | Analytics | Main Product | — | — | [Blog post, June 2020](https://twitter.com/PlausibleHQ/status/1273889629087969280) | +| PostHog | Product Analytics | Main Product | — | — | [Release Notes, October 2020](https://posthog.com/blog/the-posthog-array-1-15-0), [Blog, November 2021](https://posthog.com/blog/how-we-turned-clickhouse-into-our-eventmansion) | +| Postmates | Delivery | — | — | — | [Talk in English, July 2020](https://youtu.be/GMiXCMFDMow?t=188) | +| Pragma Innovation | Telemetry and Big Data Analysis | Main product | — | — | [Slides in English, October 2018](https://github.com/ClickHouse/clickhouse-presentations/blob/master/meetup18/4_pragma_innovation.pdf) | +| PRANA | Industrial predictive analytics | Main product | — | — | [News (russian), Feb 2021](https://habr.com/en/news/t/541392/) | +| QINGCLOUD | Cloud services | Main product | — | — | [Slides in Chinese, October 2018](https://github.com/ClickHouse/clickhouse-presentations/blob/master/meetup19/4.%20Cloud%20%2B%20TSDB%20for%20ClickHouse%20张健%20QingCloud.pdf) | +| Qrator | DDoS protection | Main product | — | — | [Blog Post, March 2019](https://blog.qrator.net/en/clickhouse-ddos-mitigation_37/) | +| R-Vision | Information Security | — | — | — | [Article in Russian, December 2021](https://www.anti-malware.ru/reviews/R-Vision-SENSE-15) | +| Raiffeisenbank | Banking | Analytics | — | — | [Lecture in Russian, December 2020](https://cs.hse.ru/announcements/421965599.html) | +| Rambler | Internet services | Analytics | — | — | [Talk in Russian, April 2018](https://medium.com/@ramblertop/разработка-api-clickhouse-для-рамблер-топ-100-f4c7e56f3141) | +| Replica | Urban Planning | Analytics | — | — | [Job 
advertisement](https://boards.greenhouse.io/replica/jobs/5547732002?gh_jid=5547732002) | +| Retell | Speech synthesis | Analytics | — | — | [Blog Article, August 2020](https://vc.ru/services/153732-kak-sozdat-audiostati-na-vashem-sayte-i-zachem-eto-nuzhno) | +| Rollbar | Software Development | Main Product | — | — | [Official Website](https://www.rollbar.com) | +| Rspamd | Antispam | Analytics | — | — | [Official Website](https://rspamd.com/doc/modules/clickhouse.html) | +| RuSIEM | SIEM | Main Product | — | — | [Official Website](https://rusiem.com/en/products/architecture) | +| S7 Airlines | Airlines | Metrics, Logging | — | — | [Talk in Russian, March 2019](https://www.youtube.com/watch?v=nwG68klRpPg&t=15s) | +| Sber | Banking, Fintech, Retail, Cloud, Media | — | 128 servers | >1 PB | [Job advertisement, March 2021](https://career.habr.com/vacancies/1000073536) | +| scireum GmbH | e-Commerce | Main product | — | — | [Talk in German, February 2020](https://www.youtube.com/watch?v=7QWAn5RbyR4) | +| Segment | Data processing | Main product | 9 * i3en.3xlarge nodes 7.5TB NVME SSDs, 96GB Memory, 12 vCPUs | — | [Slides, 2019](https://slides.com/abraithwaite/segment-clickhouse) | +| sembot.io | Shopping Ads | — | — | — | A comment on LinkedIn, 2020 | +| SEMrush | Marketing | Main product | — | — | [Slides in Russian, August 2018](https://github.com/ClickHouse/clickhouse-presentations/blob/master/meetup17/5_semrush.pdf) | +| Sentry | Software Development | Main product | — | — | [Blog Post in English, May 2019](https://blog.sentry.io/2019/05/16/introducing-snuba-sentrys-new-search-infrastructure) | +| seo.do | Analytics | Main product | — | — | [Slides in English, November 2019](https://github.com/ClickHouse/clickhouse-presentations/blob/master/meetup35/CH%20Presentation-%20Metehan%20Çetinkaya.pdf) | +| SGK | Government Social Security | Analytics | — | — | [Slides in English, November 2019](https://github.com/ClickHouse/clickhouse-presentations/blob/master/meetup35/ClickHouse%20Meetup-Ramazan%20POLAT.pdf) | +| SigNoz | Observability Platform | Main Product | — | — | [Source code](https://github.com/SigNoz/signoz) | +| Sina | News | — | — | — | [Slides in Chinese, October 2018](https://github.com/ClickHouse/clickhouse-presentations/blob/master/meetup19/6.%20ClickHouse最佳实践%20高鹏_新浪.pdf) | +| Sipfront | Software Development | Analytics | — | — | [Tweet, October 2021](https://twitter.com/andreasgranig/status/1446404332337913895?s=20) | +| SMI2 | News | Analytics | — | — | [Blog Post in Russian, November 2017](https://habr.com/ru/company/smi2/blog/314558/) | +| Spark New Zealand | Telecommunications | Security Operations | — | — | [Blog Post, Feb 2020](https://blog.n0p.me/2020/02/2020-02-05-dnsmonster/) | +| Splitbee | Analytics | Main Product | — | — | [Blog Post, Mai 2021](https://splitbee.io/blog/new-pricing) | +| Splunk | Business Analytics | Main product | — | — | [Slides in English, January 2018](https://github.com/ClickHouse/clickhouse-presentations/blob/master/meetup12/splunk.pdf) | +| Spotify | Music | Experimentation | — | — | [Slides, July 2018](https://www.slideshare.net/glebus/using-clickhouse-for-experimentation-104247173) | +| Staffcop | Information Security | Main Product | — | — | [Official website, Documentation](https://www.staffcop.ru/sce43) | +| Suning | E-Commerce | User behaviour analytics | — | — | [Blog article](https://www.sohu.com/a/434152235_411876) | +| Superwall | Monetization Tooling | Main product | — | — | [Word of mouth, Jan 
2022](https://github.com/ClickHouse/ClickHouse/pull/33573) | +| Teralytics | Mobility | Analytics | — | — | [Tech blog](https://www.teralytics.net/knowledge-hub/visualizing-mobility-data-the-scalability-challenge) | +| Tencent | Big Data | Data processing | — | — | [Slides in Chinese, October 2018](https://github.com/ClickHouse/clickhouse-presentations/blob/master/meetup19/5.%20ClickHouse大数据集群应用_李俊飞腾讯网媒事业部.pdf) | +| Tencent | Messaging | Logging | — | — | [Talk in Chinese, November 2019](https://youtu.be/T-iVQRuw-QY?t=5050) | +| Tencent Music Entertainment (TME) | BigData | Data processing | — | — | [Blog in Chinese, June 2020](https://cloud.tencent.com/developer/article/1637840) | +| Tesla | Electric vehicle and clean energy company | — | — | — | [Vacancy description, March 2021](https://news.ycombinator.com/item?id=26306170) | +| Timeflow | Software | Analytics | — | — | [Blog](https://timeflow.systems/why-we-moved-from-druid-to-clickhouse/ ) | +| Tinybird | Real-time Data Products | Data processing | — | — | [Official website](https://www.tinybird.co/) | +| Traffic Stars | AD network | — | 300 servers in Europe/US | 1.8 PiB, 700 000 insert rps (as of 2021) | [Slides in Russian, May 2018](https://github.com/ClickHouse/clickhouse-presentations/blob/master/meetup15/lightning/ninja.pdf) | +| Uber | Taxi | Logging | — | — | [Slides, February 2020](https://presentations.clickhouse.com/meetup40/uber.pdf) | +| UseTech | Software Development | — | — | — | [Job Posting, December 2021](https://vk.com/wall136266658_2418) | +| UTMSTAT | Analytics | Main product | — | — | [Blog post, June 2020](https://vc.ru/tribuna/133956-striming-dannyh-iz-servisa-skvoznoy-analitiki-v-clickhouse) | +| Vercel | Traffic and Performance Analytics | — | — | — | Direct reference, October 2021 | +| VKontakte | Social Network | Statistics, Logging | — | — | [Slides in Russian, August 2018](https://github.com/ClickHouse/clickhouse-presentations/blob/master/meetup17/3_vk.pdf) | +| VMware | Cloud | VeloCloud, SDN | — | — | [Product documentation](https://docs.vmware.com/en/vRealize-Operations-Manager/8.3/com.vmware.vcom.metrics.doc/GUID-A9AD72E1-C948-4CA2-971B-919385AB3CA8.html) | +| Walmart Labs | Internet, Retail | — | — | — | [Talk in English, July 2020](https://youtu.be/GMiXCMFDMow?t=144) | +| Wargaming | Games | | — | — | [Interview](https://habr.com/en/post/496954/) | +| Wildberries | E-commerce | | — | — | [Official website](https://it.wildberries.ru/) | +| Wisebits | IT Solutions | Analytics | — | — | [Slides in Russian, May 2019](https://github.com/ClickHouse/clickhouse-presentations/blob/master/meetup22/strategies.pdf) | +| Workato | Automation Software | — | — | — | [Talk in English, July 2020](https://youtu.be/GMiXCMFDMow?t=334) | +| Xenoss | Marketing, Advertising | — | — | — | [Instagram, March 2021](https://www.instagram.com/p/CNATV7qBgB1/) | +| Xiaoxin Tech | Education | Common purpose | — | — | [Slides in English, November 2019](https://github.com/ClickHouse/clickhouse-presentations/blob/master/meetup33/sync-clickhouse-with-mysql-mongodb.pptx) | +| Ximalaya | Audio sharing | OLAP | — | — | [Slides in English, November 2019](https://github.com/ClickHouse/clickhouse-presentations/blob/master/meetup33/ximalaya.pdf) | +| Yandex Cloud | Public Cloud | Main product | — | — | [Talk in Russian, December 2019](https://www.youtube.com/watch?v=pgnak9e_E0o) | +| Yandex DataLens | Business Intelligence | Main product | — | — | [Slides in Russian, December 2019](https://presentations.clickhouse.com/meetup38/datalens.pdf) 
| +| Yandex Market | e-Commerce | Metrics, Logging | — | — | [Talk in Russian, January 2019](https://youtu.be/_l1qP0DyBcA?t=478) | +| Yandex Metrica | Web analytics | Main product | 630 servers in one cluster, 360 servers in another cluster, 1862 servers in one department | 133 PiB / 8.31 PiB / 120 trillion records | [Slides, February 2020](https://presentations.clickhouse.com/meetup40/introduction/#13) | +| | Analytics | Main product | - | - | [Integration](https://www.yellowfinbi.com/campaign/yellowfin-9-whats-new#el-30219e0e) | +| Yotascale | Cloud | Data pipeline | — | 2 bn records/day | [LinkedIn (Accomplishments)](https://www.linkedin.com/in/adilsaleem/) | +| Your Analytics | Product Analytics | Main Product | — | - | [Tweet, November 2021](https://twitter.com/mikenikles/status/1459737241165565953) | +| Zagrava Trading | — | — | — | — | [Job offer, May 2021](https://twitter.com/datastackjobs/status/1394707267082063874) | +| ЦВТ | Software Development | Metrics, Logging | — | — | [Blog Post, March 2019, in Russian](https://vc.ru/dev/62715-kak-my-stroili-monitoring-na-prometheus-clickhouse-i-elk) | +| МКБ | Bank | Web-system monitoring | — | — | [Slides in Russian, September 2019](https://github.com/ClickHouse/clickhouse-presentations/blob/master/meetup28/mkb.pdf) | +| ЦФТ | Banking, Financial products, Payments | — | — | — | [Meetup in Russian, April 2020](https://team.cft.ru/events/162) | +| Цифровой Рабочий | Industrial IoT, Analytics | — | — | — | [Blog post in Russian, March 2021](https://habr.com/en/company/croc/blog/548018/) | +| ООО «МПЗ Богородский» | Agriculture | — | — | — | [Article in Russian, November 2020](https://cloud.yandex.ru/cases/okraina) | +| ДомКлик | Real Estate | — | — | — | [Article in Russian, October 2021](https://habr.com/ru/company/domclick/blog/585936/) | +| АС "Стрела" | Transportation | — | — | — | [Job posting, Jan 2022](https://vk.com/topic-111905078_35689124?post=3553) | +| Piwik PRO | Web Analytics | — | — | — | [Official website, Dec 2018](https://piwik.pro/blog/piwik-pro-clickhouse-faster-efficient-reports/) | + +[Original article](https://clickhouse.com/docs/en/introduction/adopters/) diff --git a/docs/en/introduction/distinctive-features.md b/docs/en/introduction/distinctive-features.md new file mode 100644 index 00000000000..951a8a9d3e5 --- /dev/null +++ b/docs/en/introduction/distinctive-features.md @@ -0,0 +1,96 @@ +--- +toc_priority: 4 +toc_title: Distinctive Features +--- + +# Distinctive Features of ClickHouse {#distinctive-features-of-clickhouse} + +## True Column-Oriented Database Management System {#true-column-oriented-dbms} + +In a real column-oriented DBMS, no extra data is stored with the values. Among other things, this means that constant-length values must be supported, to avoid storing their length “number” next to the values. For example, a billion UInt8-type values should consume around 1 GB uncompressed, or this strongly affects the CPU use. It is essential to store data compactly (without any “garbage”) even when uncompressed since the speed of decompression (CPU usage) depends mainly on the volume of uncompressed data. + +It is worth noting because there are systems that can store values of different columns separately, but that can’t effectively process analytical queries due to their optimization for other scenarios. Examples are HBase, BigTable, Cassandra, and HyperTable. You would get throughput around a hundred thousand rows per second in these systems, but not hundreds of millions of rows per second. 
+ +It’s also worth noting that ClickHouse is a database management system, not a single database. ClickHouse allows creating tables and databases in runtime, loading data, and running queries without reconfiguring and restarting the server. + +## Data Compression {#data-compression} + +Some column-oriented DBMSs do not use data compression. However, data compression does play a key role in achieving excellent performance. + +In addition to efficient general-purpose compression codecs with different trade-offs between disk space and CPU consumption, ClickHouse provides [specialized codecs](../sql-reference/statements/create/table.md#create-query-specialized-codecs) for specific kinds of data, which allow ClickHouse to compete with and outperform more niche databases, like time-series ones. + +## Disk Storage of Data {#disk-storage-of-data} + +Keeping data physically sorted by primary key makes it possible to extract data for its specific values or value ranges with low latency, less than a few dozen milliseconds. Some column-oriented DBMSs (such as SAP HANA and Google PowerDrill) can only work in RAM. This approach encourages the allocation of a larger hardware budget than is necessary for real-time analysis. + +ClickHouse is designed to work on regular hard drives, which means the cost per GB of data storage is low, but SSD and additional RAM are also fully used if available. + +## Parallel Processing on Multiple Cores {#parallel-processing-on-multiple-cores} + +Large queries are parallelized naturally, taking all the necessary resources available on the current server. + +## Distributed Processing on Multiple Servers {#distributed-processing-on-multiple-servers} + +Almost none of the columnar DBMSs mentioned above have support for distributed query processing. + +In ClickHouse, data can reside on different shards. Each shard can be a group of replicas used for fault tolerance. All shards are used to run a query in parallel, transparently for the user. + +## SQL Support {#sql-support} + +ClickHouse supports a [declarative query language based on SQL](../sql-reference/index.md) that is identical to the ANSI SQL standard in [many cases](../sql-reference/ansi.md). + +Supported queries include [GROUP BY](../sql-reference/statements/select/group-by.md), [ORDER BY](../sql-reference/statements/select/order-by.md), subqueries in [FROM](../sql-reference/statements/select/from.md), [JOIN](../sql-reference/statements/select/join.md) clause, [IN](../sql-reference/operators/in.md) operator, [window functions](../sql-reference/window-functions/index.md) and scalar subqueries. + +Correlated (dependent) subqueries are not supported at the time of writing but might become available in the future. + +## Vector Computation Engine {#vector-engine} + +Data is not only stored by columns but is processed by vectors (parts of columns), which allows achieving high CPU efficiency. + +## Real-time Data Updates {#real-time-data-updates} + +ClickHouse supports tables with a primary key. To quickly perform queries on the range of the primary key, the data is sorted incrementally using the merge tree. Due to this, data can continually be added to the table. No locks are taken when new data is ingested. + +## Primary Index {#primary-index} + +Having a data physically sorted by primary key makes it possible to extract data for its specific values or value ranges with low latency, less than a few dozen milliseconds. 
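To make the idea concrete, here is a minimal sketch (the table and column names are hypothetical): a MergeTree table sorted by `(CounterID, EventDate)` and a query that restricts both leading key columns, so only a narrow range of the sorted data has to be read.

``` sql
-- Hypothetical table: data parts are stored sorted by (CounterID, EventDate),
-- and a sparse primary index is built over that order.
CREATE TABLE hits_example
(
    `CounterID` UInt32,
    `EventDate` Date,
    `UserID`    UInt64
)
ENGINE = MergeTree
ORDER BY (CounterID, EventDate);

-- The range condition on the leading key columns lets the server read only
-- the matching index granules instead of scanning the whole table.
SELECT count()
FROM hits_example
WHERE CounterID = 111 AND EventDate BETWEEN '2022-01-01' AND '2022-01-31';
```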
+ +## Secondary Indexes {#secondary-indexes} + +Unlike other database management systems, secondary indexes in ClickHouse do not point to specific rows or row ranges. Instead, they allow the database to know in advance that all rows in some data parts wouldn’t match the query filtering conditions and do not read them at all, thus they are called [data skipping indexes](../engines/table-engines/mergetree-family/mergetree.md#table_engine-mergetree-data_skipping-indexes). + +## Suitable for Online Queries {#suitable-for-online-queries} + +Most OLAP database management systems do not aim for online queries with sub-second latencies. In alternative systems, report building time of tens of seconds or even minutes is often considered acceptable. Sometimes it takes even longer, which forces users to prepare reports offline (in advance or by responding with “come back later”). + +In ClickHouse, low latency means that queries can be processed without delay and without trying to prepare an answer in advance, right at the same moment while the user interface page is loading. In other words, online. + +## Support for Approximated Calculations {#support-for-approximated-calculations} + +ClickHouse provides various ways to trade accuracy for performance: + +1. Aggregate functions for approximated calculation of the number of distinct values, medians, and quantiles. +2. Running a query based on a part (sample) of data and getting an approximated result. In this case, proportionally less data is retrieved from the disk. +3. Running an aggregation for a limited number of random keys, instead of for all keys. Under certain conditions for key distribution in the data, this provides a reasonably accurate result while using fewer resources. + +## Adaptive Join Algorithm {#adaptive-join-algorithm} + +ClickHouse adaptively chooses how to [JOIN](../sql-reference/statements/select/join.md) multiple tables, by preferring the hash-join algorithm and falling back to the merge-join algorithm if there’s more than one large table. + +## Data Replication and Data Integrity Support {#data-replication-and-data-integrity-support} + +ClickHouse uses asynchronous multi-master replication. After being written to any available replica, all the remaining replicas retrieve their copy in the background. The system maintains identical data on different replicas. Recovery after most failures is performed automatically, or semi-automatically in complex cases. + +For more information, see the section [Data replication](../engines/table-engines/mergetree-family/replication.md). + +## Role-Based Access Control {#role-based-access-control} + +ClickHouse implements user account management using SQL queries and allows for [role-based access control configuration](../operations/access-rights.md) similar to what can be found in the ANSI SQL standard and popular relational database management systems. + +## Features that Can Be Considered Disadvantages {#clickhouse-features-that-can-be-considered-disadvantages} + +1. No full-fledged transactions. +2. Lack of ability to modify or delete already inserted data with a high rate and low latency. Batch deletes and updates are available to clean up or modify data, for example, to comply with [GDPR](https://gdpr-info.eu); a sketch of such a mutation is shown after this list. +3. The sparse index makes ClickHouse less efficient for point queries that retrieve single rows by their keys.
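A minimal sketch of the batch-style clean-up mentioned in point 2 above (the `visits` table and its columns are hypothetical). Such mutations run asynchronously and are intended for occasional maintenance, not for frequent low-latency changes:

``` sql
-- Delete the rows of one user, e.g. to satisfy a GDPR erasure request.
ALTER TABLE visits DELETE WHERE UserID = 12345;

-- Overwrite a column value in old rows.
ALTER TABLE visits UPDATE Referer = '' WHERE EventDate < '2020-01-01';
```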
+ +[Original article](https://clickhouse.com/docs/en/introduction/distinctive-features/) diff --git a/docs/en/introduction/history.md b/docs/en/introduction/history.md new file mode 100644 index 00000000000..d192eff80ea --- /dev/null +++ b/docs/en/introduction/history.md @@ -0,0 +1,54 @@ +--- +toc_priority: 7 +toc_title: History +--- + +# ClickHouse History {#clickhouse-history} + +ClickHouse has been developed initially to power [Yandex.Metrica](https://metrica.yandex.com/), [the second largest web analytics platform in the world](http://w3techs.com/technologies/overview/traffic_analysis/all), and continues to be the core component of this system. With more than 13 trillion records in the database and more than 20 billion events daily, ClickHouse allows generating custom reports on the fly directly from non-aggregated data. This article briefly covers the goals of ClickHouse in the early stages of its development. + +Yandex.Metrica builds customized reports on the fly based on hits and sessions, with arbitrary segments defined by the user. Doing so often requires building complex aggregates, such as the number of unique users. New data for building a report arrives in real-time. + +As of April 2014, Yandex.Metrica was tracking about 12 billion events (page views and clicks) daily. All these events must be stored to build custom reports. A single query may require scanning millions of rows within a few hundred milliseconds, or hundreds of millions of rows in just a few seconds. + +## Usage in Yandex.Metrica and Other Yandex Services {#usage-in-yandex-metrica-and-other-yandex-services} + +ClickHouse serves multiple purposes in Yandex.Metrica. +Its main task is to build reports in online mode using non-aggregated data. It uses a cluster of 374 servers, which store over 20.3 trillion rows in the database. The volume of compressed data is about 2 PB, without accounting for duplicates and replicas. The volume of uncompressed data (in TSV format) would be approximately 17 PB. + +ClickHouse also plays a key role in the following processes: + +- Storing data for Session Replay from Yandex.Metrica. +- Processing intermediate data. +- Building global reports with Analytics. +- Running queries for debugging the Yandex.Metrica engine. +- Analyzing logs from the API and the user interface. + +Nowadays, there are multiple dozen ClickHouse installations in other Yandex services and departments: search verticals, e-commerce, advertisement, business analytics, mobile development, personal services, and others. + +## Aggregated and Non-aggregated Data {#aggregated-and-non-aggregated-data} + +There is a widespread opinion that to calculate statistics effectively, you must aggregate data since this reduces the volume of data. + +But data aggregation comes with a lot of limitations: + +- You must have a pre-defined list of required reports. +- The user can’t make custom reports. +- When aggregating over a large number of distinct keys, the data volume is barely reduced, so aggregation is useless. +- For a large number of reports, there are too many aggregation variations (combinatorial explosion). +- When aggregating keys with high cardinality (such as URLs), the volume of data is not reduced by much (less than twofold). +- For this reason, the volume of data with aggregation might grow instead of shrink. +- Users do not view all the reports we generate for them. A large portion of those calculations is useless. +- The logical integrity of data may be violated for various aggregations. 
+ +If we do not aggregate anything and work with non-aggregated data, this might reduce the volume of calculations. + +However, with aggregation, a significant part of the work is taken offline and completed relatively calmly. In contrast, online calculations require calculating as fast as possible, since the user is waiting for the result. + +Yandex.Metrica has a specialized system for aggregating data called Metrage, which was used for the majority of reports. +Starting in 2009, Yandex.Metrica also used a specialized OLAP database for non-aggregated data called OLAPServer, which was previously used for the report builder. +OLAPServer worked well for non-aggregated data, but it had many restrictions that did not allow it to be used for all reports as desired. These included the lack of support for data types (only numbers), and the inability to incrementally update data in real-time (it could only be done by rewriting data daily). OLAPServer is not a DBMS, but a specialized DB. + +The initial goal for ClickHouse was to remove the limitations of OLAPServer and solve the problem of working with non-aggregated data for all reports, but over the years, it has grown into a general-purpose database management system suitable for a wide range of analytical tasks. + +[Original article](https://clickhouse.com/docs/en/introduction/history/) diff --git a/docs/en/introduction/index.md b/docs/en/introduction/index.md new file mode 100644 index 00000000000..ba80f9c2640 --- /dev/null +++ b/docs/en/introduction/index.md @@ -0,0 +1,6 @@ +--- +toc_folder_title: Introduction +toc_priority: 1 +--- + + diff --git a/docs/en/introduction/performance.md b/docs/en/introduction/performance.md new file mode 100644 index 00000000000..684b4ee4179 --- /dev/null +++ b/docs/en/introduction/performance.md @@ -0,0 +1,30 @@ +--- +toc_priority: 6 +toc_title: Performance +--- + +# Performance {#performance} + +ClickHouse shows the best performance (both the highest throughput for long queries and the lowest latency on short queries) for comparable operating scenarios among systems of its class that were available for testing. You can view the test results on a [separate page](https://clickhouse.com/benchmark/dbms/). + +Numerous independent benchmarks came to similar conclusions. They are not difficult to find using an internet search, or you can see [our small collection of related links](https://clickhouse.com/#independent-benchmarks). + +## Throughput for a Single Large Query {#throughput-for-a-single-large-query} + +Throughput can be measured in rows per second or megabytes per second. If the data is placed in the page cache, a query that is not too complex is processed on modern hardware at a speed of approximately 2-10 GB/s of uncompressed data on a single server (for the most straightforward cases, the speed may reach 30 GB/s). If data is not placed in the page cache, the speed depends on the disk subsystem and the data compression rate. For example, if the disk subsystem allows reading data at 400 MB/s, and the data compression rate is 3, the speed is expected to be around 1.2 GB/s. To get the speed in rows per second, divide the speed in bytes per second by the total size of the columns used in the query. For example, if 10 bytes of columns are extracted, the speed is expected to be around 100-200 million rows per second. + +The processing speed increases almost linearly for distributed processing, but only if the number of rows resulting from aggregation or sorting is not too large. 
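The back-of-the-envelope estimate above can be reproduced directly; the query below only restates the example figures from the text (400 MB/s of disk reads, a compression ratio of 3, 10 bytes of selected columns per row):

``` sql
SELECT
    400 * 3                  AS uncompressed_MB_per_second, -- ≈ 1.2 GB/s
    (400 * 3 * 1000000) / 10 AS rows_per_second             -- ≈ 120 million rows/s
```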
+ +## Latency When Processing Short Queries {#latency-when-processing-short-queries} + +If a query uses a primary key and does not select too many columns and rows to process (hundreds of thousands), you can expect less than 50 milliseconds of latency (single digits of milliseconds in the best case) if data is placed in the page cache. Otherwise, latency is mostly dominated by the number of seeks. If you use rotating disk drives, for a system that is not overloaded, the latency can be estimated with this formula: `seek time (10 ms) * count of columns queried * count of data parts`. + +## Throughput When Processing a Large Quantity of Short Queries {#throughput-when-processing-a-large-quantity-of-short-queries} + +Under the same conditions, ClickHouse can handle several hundred queries per second on a single server (up to several thousand in the best case). Since this scenario is not typical for analytical DBMSs, we recommend expecting a maximum of 100 queries per second. + +## Performance When Inserting Data {#performance-when-inserting-data} + +We recommend inserting data in packets of at least 1000 rows, or no more than a single request per second. When inserting to a MergeTree table from a tab-separated dump, the insertion speed can be from 50 to 200 MB/s. If the inserted rows are around 1 KB in size, the speed will be from 50,000 to 200,000 rows per second. If the rows are small, the performance can be higher in rows per second (on Banner System data -`>` 500,000 rows per second; on Graphite data -`>` 1,000,000 rows per second). To improve performance, you can make multiple INSERT queries in parallel, which scales linearly. + +{## [Original article](https://clickhouse.com/docs/en/introduction/performance/) ##} diff --git a/docs/en/operations/_category_.yml b/docs/en/operations/_category_.yml deleted file mode 100644 index 9d6dd1247db..00000000000 --- a/docs/en/operations/_category_.yml +++ /dev/null @@ -1,4 +0,0 @@ -position: 70 -label: 'Operations' -collapsible: true -collapsed: true diff --git a/docs/en/operations/access-rights.md b/docs/en/operations/access-rights.md index 7d75c47df2b..52f7fb87ffd 100644 --- a/docs/en/operations/access-rights.md +++ b/docs/en/operations/access-rights.md @@ -1,6 +1,6 @@ --- -sidebar_position: 48 -sidebar_label: Access Control and Account Management +toc_priority: 48 +toc_title: Access Control and Account Management --- # Access Control and Account Management {#access-control} @@ -24,9 +24,8 @@ You can configure access entities using: We recommend using SQL-driven workflow. Both of the configuration methods work simultaneously, so if you use the server configuration files for managing accounts and access rights, you can smoothly switch to SQL-driven workflow. -:::warning -You can’t manage the same access entity by both configuration methods simultaneously. -::: +!!! note "Warning" + You can’t manage the same access entity by both configuration methods simultaneously. To see all users, roles, profiles, etc. and all their grants use [SHOW ACCESS](../sql-reference/statements/show.md#show-access-statement) statement. @@ -102,9 +101,8 @@ Privileges can be granted to a role by the [GRANT](../sql-reference/statements/g Row policy is a filter that defines which of the rows are available to a user or a role. Row policy contains filters for one particular table, as well as a list of roles and/or users which should use this row policy. -:::warning -Row policies makes sense only for users with readonly access. 
If user can modify table or copy partitions between tables, it defeats the restrictions of row policies. -::: +!!! note "Warning" + Row policies makes sense only for users with readonly access. If user can modify table or copy partitions between tables, it defeats the restrictions of row policies. Management queries: diff --git a/docs/en/operations/backup.md b/docs/en/operations/backup.md index c39658aa4b0..7f0ed48928a 100644 --- a/docs/en/operations/backup.md +++ b/docs/en/operations/backup.md @@ -1,6 +1,6 @@ --- -sidebar_position: 49 -sidebar_label: Data Backup +toc_priority: 49 +toc_title: Data Backup --- # Data Backup {#data-backup} @@ -11,9 +11,8 @@ In order to effectively mitigate possible human errors, you should carefully pre Each company has different resources available and business requirements, so there’s no universal solution for ClickHouse backups and restores that will fit every situation. What works for one gigabyte of data likely won’t work for tens of petabytes. There are a variety of possible approaches with their own pros and cons, which will be discussed below. It is a good idea to use several approaches instead of just one in order to compensate for their various shortcomings. -:::note -Keep in mind that if you backed something up and never tried to restore it, chances are that restore will not work properly when you actually need it (or at least it will take longer than business can tolerate). So whatever backup approach you choose, make sure to automate the restore process as well, and practice it on a spare ClickHouse cluster regularly. -::: +!!! note "Note" + Keep in mind that if you backed something up and never tried to restore it, chances are that restore will not work properly when you actually need it (or at least it will take longer than business can tolerate). So whatever backup approach you choose, make sure to automate the restore process as well, and practice it on a spare ClickHouse cluster regularly. ## Duplicating Source Data Somewhere Else {#duplicating-source-data-somewhere-else} diff --git a/docs/en/operations/caches.md b/docs/en/operations/caches.md index f2427810184..9aa6419d89c 100644 --- a/docs/en/operations/caches.md +++ b/docs/en/operations/caches.md @@ -1,11 +1,11 @@ --- -sidebar_position: 65 -sidebar_label: Caches +toc_priority: 65 +toc_title: Caches --- # Cache Types {#cache-types} -When performing queries, ClichHouse uses different caches. +When performing queries, ClickHouse uses different caches. Main cache types: diff --git a/docs/en/operations/clickhouse-keeper.md b/docs/en/operations/clickhouse-keeper.md index 81547736441..a8ca2079070 100644 --- a/docs/en/operations/clickhouse-keeper.md +++ b/docs/en/operations/clickhouse-keeper.md @@ -1,15 +1,14 @@ --- -sidebar_position: 66 -sidebar_label: ClickHouse Keeper +toc_priority: 66 +toc_title: ClickHouse Keeper --- # [pre-production] ClickHouse Keeper {#clickHouse-keeper} ClickHouse server uses [ZooKeeper](https://zookeeper.apache.org/) coordination system for data [replication](../engines/table-engines/mergetree-family/replication.md) and [distributed DDL](../sql-reference/distributed-ddl.md) queries execution. ClickHouse Keeper is an alternative coordination system compatible with ZooKeeper. -:::warning -This feature is currently in the pre-production stage. We test it in our CI and on small internal installations. -::: +!!! warning "Warning" + This feature is currently in the pre-production stage. We test it in our CI and on small internal installations. 
## Implementation details {#implementation-details} @@ -19,9 +18,8 @@ By default, ClickHouse Keeper provides the same guarantees as ZooKeeper (lineari ClickHouse Keeper supports Access Control List (ACL) the same way as [ZooKeeper](https://zookeeper.apache.org/doc/r3.1.2/zookeeperProgrammers.html#sc_ZooKeeperAccessControl) does. ClickHouse Keeper supports the same set of permissions and has the identical built-in schemes: `world`, `auth`, `digest`, `host` and `ip`. Digest authentication scheme uses pair `username:password`. Password is encoded in Base64. -:::note -External integrations are not supported. -::: +!!! info "Note" + External integrations are not supported. ## Configuration {#configuration} @@ -57,7 +55,7 @@ Internal coordination settings are located in `..` section and contain servers description. @@ -123,7 +121,7 @@ clickhouse keeper --config /etc/your_path_to_config/config.xml ClickHouse Keeper also provides 4lw commands which are almost the same with Zookeeper. Each command is composed of four letters such as `mntr`, `stat` etc. There are some more interesting commands: `stat` gives some general information about the server and connected clients, while `srvr` and `cons` give extended details on server and connections respectively. -The 4lw commands has a white list configuration `four_letter_word_white_list` which has default value "conf,cons,crst,envi,ruok,srst,srvr,stat,wchc,wchs,dirs,mntr,isro". +The 4lw commands has a allow list configuration `four_letter_word_allow_list` which has default value "conf,cons,crst,envi,ruok,srst,srvr,stat,wchs,dirs,mntr,isro". You can issue the commands to ClickHouse Keeper via telnet or nc, at the client port. @@ -203,7 +201,7 @@ Server stats reset. ``` server_id=1 tcp_port=2181 -four_letter_word_white_list=* +four_letter_word_allow_list=* log_storage_path=./coordination/logs snapshot_storage_path=./coordination/snapshots max_requests_batch_size=100 diff --git a/docs/en/operations/configuration-files.md b/docs/en/operations/configuration-files.md index 582e90544e0..cbc139dd958 100644 --- a/docs/en/operations/configuration-files.md +++ b/docs/en/operations/configuration-files.md @@ -1,6 +1,6 @@ --- -sidebar_position: 50 -sidebar_label: Configuration Files +toc_priority: 50 +toc_title: Configuration Files --- # Configuration Files {#configuration_files} diff --git a/docs/en/operations/external-authenticators/index.md b/docs/en/operations/external-authenticators/index.md index af2ba713ec1..850b6594b71 100644 --- a/docs/en/operations/external-authenticators/index.md +++ b/docs/en/operations/external-authenticators/index.md @@ -1,9 +1,10 @@ --- -sidebar_position: 48 -sidebar_label: External User Authenticators and Directories +toc_folder_title: External User Authenticators and Directories +toc_priority: 48 +toc_title: Introduction --- -# External User Authenticators and Directories +# External User Authenticators and Directories {#external-authenticators} ClickHouse supports authenticating and managing users using external services. diff --git a/docs/en/operations/external-authenticators/kerberos.md b/docs/en/operations/external-authenticators/kerberos.md index 3711bac79c3..da84c1f6a89 100644 --- a/docs/en/operations/external-authenticators/kerberos.md +++ b/docs/en/operations/external-authenticators/kerberos.md @@ -51,13 +51,12 @@ With filtering by realm: ``` -:::warning -You can define only one `kerberos` section. The presence of multiple `kerberos` sections will force ClickHouse to disable Kerberos authentication. -::: +!!! 
warning "Note" + You can define only one `kerberos` section. The presence of multiple `kerberos` sections will force ClickHouse to disable Kerberos authentication. + +!!! warning "Note" + `principal` and `realm` sections cannot be specified at the same time. The presence of both `principal` and `realm` sections will force ClickHouse to disable Kerberos authentication. -:::warning -`principal` and `realm` sections cannot be specified at the same time. The presence of both `principal` and `realm` sections will force ClickHouse to disable Kerberos authentication. -::: ## Kerberos as an external authenticator for existing users {#kerberos-as-an-external-authenticator-for-existing-users} @@ -95,13 +94,11 @@ Example (goes into `users.xml`): ``` -:::warning -Note that Kerberos authentication cannot be used alongside with any other authentication mechanism. The presence of any other sections like `password` alongside `kerberos` will force ClickHouse to shutdown. -::: +!!! warning "Warning" + Note that Kerberos authentication cannot be used alongside with any other authentication mechanism. The presence of any other sections like `password` alongside `kerberos` will force ClickHouse to shutdown. -:::info Reminder -Note, that now, once user `my_user` uses `kerberos`, Kerberos must be enabled in the main `config.xml` file as described previously. -::: +!!! info "Reminder" + Note, that now, once user `my_user` uses `kerberos`, Kerberos must be enabled in the main `config.xml` file as described previously. ### Enabling Kerberos using SQL {#enabling-kerberos-using-sql} diff --git a/docs/en/operations/index.md b/docs/en/operations/index.md index 824e851e997..b78633f2d6b 100644 --- a/docs/en/operations/index.md +++ b/docs/en/operations/index.md @@ -1,6 +1,7 @@ --- -sidebar_position: 41 -sidebar_label: Operations +toc_folder_title: Operations +toc_priority: 41 +toc_title: Introduction --- # Operations {#operations} @@ -22,4 +23,4 @@ ClickHouse operations manual consists of the following major sections: - [Settings](../operations/settings/index.md) - [Utilities](../operations/utilities/index.md) -[Original article](https://clickhouse.com/docs/en/operations/) +{## [Original article](https://clickhouse.com/docs/en/operations/) ##} diff --git a/docs/en/operations/monitoring.md b/docs/en/operations/monitoring.md index 437122e106d..ffcdae16c4d 100644 --- a/docs/en/operations/monitoring.md +++ b/docs/en/operations/monitoring.md @@ -1,6 +1,6 @@ --- -sidebar_position: 45 -sidebar_label: Monitoring +toc_priority: 45 +toc_title: Monitoring --- # Monitoring {#monitoring} diff --git a/docs/en/operations/named-collections.md b/docs/en/operations/named-collections.md index 52520ba76b7..ab972c72345 100644 --- a/docs/en/operations/named-collections.md +++ b/docs/en/operations/named-collections.md @@ -1,6 +1,6 @@ --- -sidebar_position: 69 -sidebar_label: "Named connections" +toc_priority: 69 +toc_title: "Named connections" --- # Storing details for connecting to external sources in configuration files {#named-collections} @@ -227,4 +227,4 @@ SELECT dictGet('dict', 'b', 2); ┌─dictGet('dict', 'b', 2)─┐ │ two │ └─────────────────────────┘ -``` \ No newline at end of file +``` diff --git a/docs/en/operations/opentelemetry.md b/docs/en/operations/opentelemetry.md index 740537d88bc..ec27ecfd6b2 100644 --- a/docs/en/operations/opentelemetry.md +++ b/docs/en/operations/opentelemetry.md @@ -1,15 +1,14 @@ --- -sidebar_position: 62 -sidebar_label: OpenTelemetry Support +toc_priority: 62 +toc_title: OpenTelemetry Support --- # 
[experimental] OpenTelemetry Support [OpenTelemetry](https://opentelemetry.io/) is an open standard for collecting traces and metrics from the distributed application. ClickHouse has some support for OpenTelemetry. -:::warning -This is an experimental feature that will change in backwards-incompatible ways in future releases. -::: +!!! warning "Warning" + This is an experimental feature that will change in backwards-incompatible ways in future releases. ## Supplying Trace Context to ClickHouse diff --git a/docs/en/operations/optimizing-performance/index.md b/docs/en/operations/optimizing-performance/index.md index ef9c6a4b664..142d3b2f976 100644 --- a/docs/en/operations/optimizing-performance/index.md +++ b/docs/en/operations/optimizing-performance/index.md @@ -1,6 +1,7 @@ --- -sidebar_label: Optimizing Performance -sidebar_position: 52 +toc_folder_title: Optimizing Performance +toc_hidden: true +toc_priority: 52 --- # Optimizing Performance {#optimizing-performance} diff --git a/docs/en/operations/optimizing-performance/sampling-query-profiler.md b/docs/en/operations/optimizing-performance/sampling-query-profiler.md index 39e83545506..72cfa59b8b2 100644 --- a/docs/en/operations/optimizing-performance/sampling-query-profiler.md +++ b/docs/en/operations/optimizing-performance/sampling-query-profiler.md @@ -1,6 +1,6 @@ --- -sidebar_position: 54 -sidebar_label: Query Profiling +toc_priority: 54 +toc_title: Query Profiling --- # Sampling Query Profiler {#sampling-query-profiler} @@ -21,7 +21,7 @@ The default sampling frequency is one sample per second and both CPU and real ti To analyze the `trace_log` system table: -- Install the `clickhouse-common-static-dbg` package. See [Install from DEB Packages](../../install.md#install-from-deb-packages). +- Install the `clickhouse-common-static-dbg` package. See [Install from DEB Packages](../../getting-started/install.md#install-from-deb-packages). - Allow introspection functions by the [allow_introspection_functions](../../operations/settings/settings.md#settings-allow_introspection_functions) setting. diff --git a/docs/en/operations/performance-test.md b/docs/en/operations/performance-test.md index 0ba3a9908a5..e410b1b2dfd 100644 --- a/docs/en/operations/performance-test.md +++ b/docs/en/operations/performance-test.md @@ -1,6 +1,6 @@ --- -sidebar_position: 54 -sidebar_label: Testing Hardware +toc_priority: 54 +toc_title: Testing Hardware --- # How to Test Your Hardware with ClickHouse {#how-to-test-your-hardware-with-clickhouse} @@ -59,7 +59,7 @@ wget https://raw.githubusercontent.com/ClickHouse/ClickHouse/master/benchmark/cl chmod a+x benchmark-new.sh wget https://raw.githubusercontent.com/ClickHouse/ClickHouse/master/benchmark/clickhouse/queries.sql ``` -3. Download the [web analytics dataset](../example-datasets/metrica.md) (“hits” table containing 100 million rows). +3. Download the [web analytics dataset](../getting-started/example-datasets/metrica.md) (“hits” table containing 100 million rows). ```bash wget https://datasets.clickhouse.com/hits/partitions/hits_100m_obfuscated_v1.tar.xz tar xvf hits_100m_obfuscated_v1.tar.xz -C . 
diff --git a/docs/en/operations/quotas.md b/docs/en/operations/quotas.md index 77b0697d483..6c6fbbf9cfb 100644 --- a/docs/en/operations/quotas.md +++ b/docs/en/operations/quotas.md @@ -1,6 +1,6 @@ --- -sidebar_position: 51 -sidebar_label: Quotas +toc_priority: 51 +toc_title: Quotas --- # Quotas {#quotas} @@ -101,7 +101,7 @@ Quotas can use the “quota key” feature to report on resources for multiple k diff --git a/docs/en/sql-reference/functions/geo/index.md b/docs/en/sql-reference/functions/geo/index.md index f76c3a3f731..65bf2ab83cb 100644 --- a/docs/en/sql-reference/functions/geo/index.md +++ b/docs/en/sql-reference/functions/geo/index.md @@ -1,8 +1,8 @@ --- -sidebar_label: Geo -sidebar_position: 62 +toc_title: hidden +toc_priority: 62 +toc_folder_title: Geo --- -# Geo Functions [Original article](https://clickhouse.com/docs/en/sql-reference/functions/geo/) diff --git a/docs/en/sql-reference/functions/geo/s2.md b/docs/en/sql-reference/functions/geo/s2.md index c3d95d2f0a9..f8736bcc61a 100644 --- a/docs/en/sql-reference/functions/geo/s2.md +++ b/docs/en/sql-reference/functions/geo/s2.md @@ -1,5 +1,5 @@ --- -sidebar_label: S2 Geometry +toc_title: S2 Geometry --- # Functions for Working with S2 Index {#s2index} diff --git a/docs/en/sql-reference/functions/hash-functions.md b/docs/en/sql-reference/functions/hash-functions.md index e4b1fdd3bbb..c892b814957 100644 --- a/docs/en/sql-reference/functions/hash-functions.md +++ b/docs/en/sql-reference/functions/hash-functions.md @@ -1,6 +1,6 @@ --- -sidebar_position: 50 -sidebar_label: Hash +toc_priority: 50 +toc_title: Hash --- # Hash Functions {#hash-functions} diff --git a/docs/en/sql-reference/functions/in-functions.md b/docs/en/sql-reference/functions/in-functions.md index ab8ba93daba..c8936e74954 100644 --- a/docs/en/sql-reference/functions/in-functions.md +++ b/docs/en/sql-reference/functions/in-functions.md @@ -1,6 +1,6 @@ --- -sidebar_position: 60 -sidebar_label: IN Operator +toc_priority: 60 +toc_title: IN Operator --- # Functions for Implementing the IN Operator {#functions-for-implementing-the-in-operator} diff --git a/docs/en/sql-reference/functions/index.md b/docs/en/sql-reference/functions/index.md index 261cf908e07..7cceec889bd 100644 --- a/docs/en/sql-reference/functions/index.md +++ b/docs/en/sql-reference/functions/index.md @@ -1,9 +1,10 @@ --- -sidebar_position: 32 -sidebar_label: Functions +toc_folder_title: Functions +toc_priority: 32 +toc_title: Introduction --- -# Functions +# Functions {#functions} There are at least\* two types of functions - regular functions (they are just called “functions”) and aggregate functions. These are completely different concepts. Regular functions work as if they are applied to each row separately (for each row, the result of the function does not depend on the other rows). Aggregate functions accumulate a set of values from various rows (i.e. they depend on the entire set of rows). 
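As a small illustration of the difference (the `users` table and its `Name` column are hypothetical):

``` sql
-- Regular function: evaluated for every row independently.
SELECT lower(Name) FROM users;

-- Aggregate functions: accumulate values across all selected rows
-- and return a single result (per group).
SELECT count(), uniq(Name) FROM users;
```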
diff --git a/docs/en/sql-reference/functions/introspection.md b/docs/en/sql-reference/functions/introspection.md index 694d07f18dc..1be68c6bdd4 100644 --- a/docs/en/sql-reference/functions/introspection.md +++ b/docs/en/sql-reference/functions/introspection.md @@ -1,15 +1,14 @@ --- -sidebar_position: 65 -sidebar_label: Introspection +toc_priority: 65 +toc_title: Introspection --- # Introspection Functions {#introspection-functions} You can use functions described in this chapter to introspect [ELF](https://en.wikipedia.org/wiki/Executable_and_Linkable_Format) and [DWARF](https://en.wikipedia.org/wiki/DWARF) for query profiling. -:::warning -These functions are slow and may impose security considerations. -::: +!!! warning "Warning" + These functions are slow and may impose security considerations. For proper operation of introspection functions: diff --git a/docs/en/sql-reference/functions/ip-address-functions.md b/docs/en/sql-reference/functions/ip-address-functions.md index c293c1ff317..cf3f92580aa 100644 --- a/docs/en/sql-reference/functions/ip-address-functions.md +++ b/docs/en/sql-reference/functions/ip-address-functions.md @@ -1,6 +1,6 @@ --- -sidebar_position: 55 -sidebar_label: IP Addresses +toc_priority: 55 +toc_title: IP Addresses --- # Functions for Working with IPv4 and IPv6 Addresses {#functions-for-working-with-ip-addresses} @@ -13,10 +13,18 @@ Alias: `INET_NTOA`. ## IPv4StringToNum(s) {#ipv4stringtonums} -The reverse function of IPv4NumToString. If the IPv4 address has an invalid format, it returns 0. +The reverse function of IPv4NumToString. If the IPv4 address has an invalid format, it throws exception. Alias: `INET_ATON`. +## IPv4StringToNumOrDefault(s) {#ipv4stringtonums} + +Same as `IPv4StringToNum`, but if the IPv4 address has an invalid format, it returns 0. + +## IPv4StringToNumOrNull(s) {#ipv4stringtonums} + +Same as `IPv4StringToNum`, but if the IPv4 address has an invalid format, it returns null. + ## IPv4NumToStringClassC(num) {#ipv4numtostringclasscnum} Similar to IPv4NumToString, but using xxx instead of the last octet. @@ -123,7 +131,7 @@ LIMIT 10 ## IPv6StringToNum {#ipv6stringtonums} -The reverse function of [IPv6NumToString](#ipv6numtostringx). If the IPv6 address has an invalid format, it returns a string of null bytes. +The reverse function of [IPv6NumToString](#ipv6numtostringx). If the IPv6 address has an invalid format, it throws exception. If the input string contains a valid IPv4 address, returns its IPv6 equivalent. HEX can be uppercase or lowercase. @@ -168,6 +176,14 @@ Result: - [cutIPv6](#cutipv6x-bytestocutforipv6-bytestocutforipv4). +## IPv6StringToNumOrDefault(s) {#ipv6stringtonums} + +Same as `IPv6StringToNum`, but if the IPv6 address has an invalid format, it returns 0. + +## IPv6StringToNumOrNull(s) {#ipv6stringtonums} + +Same as `IPv6StringToNum`, but if the IPv6 address has an invalid format, it returns null. + ## IPv4ToIPv6(x) {#ipv4toipv6x} Takes a `UInt32` number. Interprets it as an IPv4 address in [big endian](https://en.wikipedia.org/wiki/Endianness). Returns a `FixedString(16)` value containing the IPv6 address in binary format. Examples: @@ -261,6 +277,14 @@ SELECT └───────────────────────────────────┴──────────────────────────┘ ``` +## toIPv4OrDefault(string) {#toipv4ordefaultstring} + +Same as `toIPv4`, but if the IPv4 address has an invalid format, it returns 0. + +## toIPv4OrNull(string) {#toipv4ornullstring} + +Same as `toIPv4`, but if the IPv4 address has an invalid format, it returns null. 
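A short sketch contrasting the throwing and non-throwing variants described above (the literal strings are arbitrary examples):

``` sql
SELECT
    toIPv4('192.168.0.1')                      AS parsed,           -- valid address
    toIPv4OrNull('not an address')             AS invalid_as_null,  -- NULL instead of an exception
    IPv4StringToNumOrDefault('not an address') AS invalid_as_zero   -- 0 instead of an exception
```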
+ ## toIPv6 {#toipv6string} Converts a string form of IPv6 address to [IPv6](../../sql-reference/data-types/domains/ipv6.md) type. If the IPv6 address has an invalid format, returns an empty value. @@ -317,6 +341,14 @@ Result: └─────────────────────┘ ``` +## IPv6StringToNumOrDefault(s) {#toipv6ordefaultstring} + +Same as `toIPv6`, but if the IPv6 address has an invalid format, it returns 0. + +## IPv6StringToNumOrNull(s) {#toipv6ornullstring} + +Same as `toIPv6`, but if the IPv6 address has an invalid format, it returns null. + ## isIPv4String {#isipv4string} Determines whether the input string is an IPv4 address or not. If `string` is IPv6 address returns `0`. diff --git a/docs/en/sql-reference/functions/json-functions.md b/docs/en/sql-reference/functions/json-functions.md index be69b7b4f2b..8270864de74 100644 --- a/docs/en/sql-reference/functions/json-functions.md +++ b/docs/en/sql-reference/functions/json-functions.md @@ -1,13 +1,11 @@ --- -sidebar_position: 56 -sidebar_label: JSON +toc_priority: 56 +toc_title: JSON --- # Functions for Working with JSON {#functions-for-working-with-json} -ClickHouse has special functions for working with this JSON. All the JSON functions are based on strong assumptions about what the JSON can be, but they try to do as little as possible to get the job done. - -The following assumptions are made: +ClickHouse has special functions for working with this JSON. The `visitParam` functions make strong assumptions about what the JSON can be, but they try to do as little as possible to get the job done. The following assumptions are made: 1. The field name (function argument) must be a constant. 2. The field name is somehow canonically encoded in JSON. For example: `visitParamHas('{"abc":"def"}', 'abc') = 1`, but `visitParamHas('{"\\u0061\\u0062\\u0063":"def"}', 'abc') = 0` @@ -359,9 +357,8 @@ SELECT JSON_EXISTS('{"hello":["world"]}', '$.hello[*]'); SELECT JSON_EXISTS('{"hello":["world"]}', '$.hello[0]'); ``` -:::note -Before version 21.11 the order of arguments was wrong, i.e. JSON_EXISTS(path, json) -::: +!!! note "Note" + before version 21.11 the order of arguments was wrong, i.e. JSON_EXISTS(path, json) ## JSON_QUERY(json, path) {#json-query} @@ -386,9 +383,8 @@ Result: [2] String ``` -:::note -Before version 21.11 the order of arguments was wrong, i.e. JSON_QUERY(path, json) -::: +!!! note "Note" + before version 21.11 the order of arguments was wrong, i.e. JSON_QUERY(path, json) ## JSON_VALUE(json, path) {#json-value} @@ -414,9 +410,8 @@ Result: String ``` -:::note -Before version 21.11 the order of arguments was wrong, i.e. JSON_VALUE(path, json) -::: +!!! note "Note" + before version 21.11 the order of arguments was wrong, i.e. 
JSON_VALUE(path, json) ## toJSONString {#tojsonstring} diff --git a/docs/en/sql-reference/functions/logical-functions.md b/docs/en/sql-reference/functions/logical-functions.md index 0055e253951..dcdb01e2059 100644 --- a/docs/en/sql-reference/functions/logical-functions.md +++ b/docs/en/sql-reference/functions/logical-functions.md @@ -1,6 +1,6 @@ --- -sidebar_position: 37 -sidebar_label: Logical +toc_priority: 37 +toc_title: Logical --- # Logical Functions {#logical-functions} diff --git a/docs/en/sql-reference/functions/machine-learning-functions.md b/docs/en/sql-reference/functions/machine-learning-functions.md index 5b3e8b87e34..b823340058e 100644 --- a/docs/en/sql-reference/functions/machine-learning-functions.md +++ b/docs/en/sql-reference/functions/machine-learning-functions.md @@ -1,6 +1,6 @@ --- -sidebar_position: 64 -sidebar_label: Machine Learning +toc_priority: 64 +toc_title: Machine Learning --- # Machine Learning Functions {#machine-learning-functions} diff --git a/docs/en/sql-reference/functions/math-functions.md b/docs/en/sql-reference/functions/math-functions.md index 645587b4f5c..a5fc07cf687 100644 --- a/docs/en/sql-reference/functions/math-functions.md +++ b/docs/en/sql-reference/functions/math-functions.md @@ -1,6 +1,6 @@ --- -sidebar_position: 44 -sidebar_label: Mathematical +toc_priority: 44 +toc_title: Mathematical --- # Mathematical Functions {#mathematical-functions} diff --git a/docs/en/sql-reference/functions/nlp-functions.md b/docs/en/sql-reference/functions/nlp-functions.md index 5a00252f56c..8a1a44cf079 100644 --- a/docs/en/sql-reference/functions/nlp-functions.md +++ b/docs/en/sql-reference/functions/nlp-functions.md @@ -1,13 +1,12 @@ --- -sidebar_position: 67 -sidebar_label: NLP +toc_priority: 67 +toc_title: NLP --- # [experimental] Natural Language Processing functions {#nlp-functions} -:::warning -This is an experimental feature that is currently in development and is not ready for general use. It will change in unpredictable backwards-incompatible ways in future releases. Set `allow_experimental_nlp_functions = 1` to enable it. -::: +!!! warning "Warning" + This is an experimental feature that is currently in development and is not ready for general use. It will change in unpredictable backwards-incompatible ways in future releases. Set `allow_experimental_nlp_functions = 1` to enable it. ## stem {#stem} diff --git a/docs/en/sql-reference/functions/other-functions.md b/docs/en/sql-reference/functions/other-functions.md index 14cd7337d76..bce3f9144b1 100644 --- a/docs/en/sql-reference/functions/other-functions.md +++ b/docs/en/sql-reference/functions/other-functions.md @@ -1,6 +1,6 @@ --- -sidebar_position: 67 -sidebar_label: Other +toc_priority: 67 +toc_title: Other --- # Other Functions {#other-functions} @@ -729,9 +729,8 @@ neighbor(column, offset[, default_value]) The result of the function depends on the affected data blocks and the order of data in the block. -:::warning -It can reach the neighbor rows only inside the currently processed data block. -::: +!!! warning "Warning" + It can reach the neighbor rows only inside the currently processed data block. The rows order used during the calculation of `neighbor` can differ from the order of rows returned to the user. To prevent that you can make a subquery with [ORDER BY](../../sql-reference/statements/select/order-by.md) and call the function from outside the subquery. @@ -839,9 +838,8 @@ Result: Calculates the difference between successive row values ​​in the data block. 
Returns 0 for the first row and the difference from the previous row for each subsequent row. -:::warning -It can reach the previous row only inside the currently processed data block. -::: +!!! warning "Warning" + It can reach the previous row only inside the currently processed data block. The result of the function depends on the affected data blocks and the order of data in the block. @@ -923,9 +921,9 @@ Each event has a start time and an end time. The start time is included in the e The function calculates the total number of active (concurrent) events for each event start time. -:::warning -Events must be ordered by the start time in ascending order. If this requirement is violated the function raises an exception. Every data block is processed separately. If events from different data blocks overlap then they can not be processed correctly. -::: +!!! warning "Warning" + Events must be ordered by the start time in ascending order. If this requirement is violated the function raises an exception. + Every data block is processed separately. If events from different data blocks overlap then they can not be processed correctly. **Syntax** @@ -1216,7 +1214,7 @@ SELECT * FROM table WHERE indexHint() **Example** -Here is the example of test data from the table [ontime](../../example-datasets/ontime.md). +Here is the example of test data from the table [ontime](../../getting-started/example-datasets/ontime.md). Input table: @@ -1611,9 +1609,8 @@ Result: Accumulates states of an aggregate function for each row of a data block. -:::warning -The state is reset for each new data block. -::: +!!! warning "Warning" + The state is reset for each new data block. **Syntax** @@ -2071,9 +2068,8 @@ Number of digits. Type: [UInt8](../../sql-reference/data-types/int-uint.md#uint-ranges). -:::note -For `Decimal` values takes into account their scales: calculates result over underlying integer type which is `(value * scale)`. For example: `countDigits(42) = 2`, `countDigits(42.000) = 5`, `countDigits(0.04200) = 4`. I.e. you may check decimal overflow for `Decimal64` with `countDecimal(x) > 18`. It's a slow variant of [isDecimalOverflow](#is-decimal-overflow). -::: + !!! note "Note" + For `Decimal` values takes into account their scales: calculates result over underlying integer type which is `(value * scale)`. For example: `countDigits(42) = 2`, `countDigits(42.000) = 5`, `countDigits(0.04200) = 4`. I.e. you may check decimal overflow for `Decimal64` with `countDecimal(x) > 18`. It's a slow variant of [isDecimalOverflow](#is-decimal-overflow). **Example** diff --git a/docs/en/sql-reference/functions/random-functions.md b/docs/en/sql-reference/functions/random-functions.md index 5e20a93da1f..aab9483de45 100644 --- a/docs/en/sql-reference/functions/random-functions.md +++ b/docs/en/sql-reference/functions/random-functions.md @@ -1,15 +1,14 @@ --- -sidebar_position: 51 -sidebar_label: Pseudo-Random Numbers +toc_priority: 51 +toc_title: Pseudo-Random Numbers --- # Functions for Generating Pseudo-Random Numbers {#functions-for-generating-pseudo-random-numbers} All the functions accept zero arguments or one argument. If an argument is passed, it can be any type, and its value is not used for anything. The only purpose of this argument is to prevent common subexpression elimination, so that two different instances of the same function return different columns with different random numbers. -:::note -Non-cryptographic generators of pseudo-random numbers are used. -::: +!!! 
note "Note" + Non-cryptographic generators of pseudo-random numbers are used. ## rand, rand32 {#rand} diff --git a/docs/en/sql-reference/functions/rounding-functions.md b/docs/en/sql-reference/functions/rounding-functions.md index a469318e623..c9044c62ca4 100644 --- a/docs/en/sql-reference/functions/rounding-functions.md +++ b/docs/en/sql-reference/functions/rounding-functions.md @@ -1,6 +1,6 @@ --- -sidebar_position: 45 -sidebar_label: Rounding +toc_priority: 45 +toc_title: Rounding --- # Rounding Functions {#rounding-functions} @@ -189,7 +189,7 @@ Accepts a number. If the number is less than one, it returns 0. Otherwise, it ro ## roundDuration(num) {#rounddurationnum} -Accepts a number. If the number is less than one, it returns 0. Otherwise, it rounds the number down to numbers from the set: 1, 10, 30, 60, 120, 180, 240, 300, 600, 1200, 1800, 3600, 7200, 18000, 36000. +Accepts a number. If the number is less than one, it returns 0. Otherwise, it rounds the number down to numbers from the set: 1, 10, 30, 60, 120, 180, 240, 300, 600, 1200, 1800, 3600, 7200, 18000, 36000. This function was specifically implemented for a web analytics use case for reporting on session lengths. ## roundAge(num) {#roundagenum} diff --git a/docs/en/sql-reference/functions/splitting-merging-functions.md b/docs/en/sql-reference/functions/splitting-merging-functions.md index 7e94c225f6b..7a4e04bbf6c 100644 --- a/docs/en/sql-reference/functions/splitting-merging-functions.md +++ b/docs/en/sql-reference/functions/splitting-merging-functions.md @@ -1,6 +1,6 @@ --- -sidebar_position: 47 -sidebar_label: Splitting and Merging Strings and Arrays +toc_priority: 47 +toc_title: Splitting and Merging Strings and Arrays --- # Functions for Splitting and Merging Strings and Arrays {#functions-for-splitting-and-merging-strings-and-arrays} diff --git a/docs/en/sql-reference/functions/statistics.md b/docs/en/sql-reference/functions/statistics.md new file mode 100644 index 00000000000..3f337b05cbc --- /dev/null +++ b/docs/en/sql-reference/functions/statistics.md @@ -0,0 +1,48 @@ +--- +toc_priority: 69 +toc_title: Statistics +--- + +# Functions for Working with Statistics {#functions-for-working-with-statistics} + +# proportionsZTest {#proportionsztest} + +Applies proportion z-test to samples from two populations (X and Y). The alternative is 'two-sided'. + +**Syntax** + +``` sql +proportionsZTest(successes_x, successes_y, trials_x, trials_y, significance_level, usevar) +``` + +**Arguments** + +- `successes_x` — The number of successes for X in trials. +- `successes_y` — The number of successes for X in trials. +- `trials_x` — The number of trials for X. +- `trials_y` — The number of trials for Y. +- `significance_level` +- `usevar` - It can be `'pooled'` or `'unpooled'`. + - `'pooled'` - The variance of the two populations are assumed to be equal. + - `'unpooled'` - The assumption of equal variances is dropped. + +**Returned value** + +- A tuple with the (z-statistic, p-value, confidence-interval-lower, confidence-interval-upper). + +Type: [Tuple](../../sql-reference/data-types/tuple.md). 
+ +**Example** + +Query: + +``` sql +SELECT proportionsZTest(10, 11, 100, 101, 0.95, 'unpooled'); +``` + +Result: + +``` text +(-0.20656724435948853,0.8363478437079654,-0.09345975390115283,0.07563797172293502) +``` + diff --git a/docs/en/sql-reference/functions/string-functions.md b/docs/en/sql-reference/functions/string-functions.md index d63e466a836..a30cacde519 100644 --- a/docs/en/sql-reference/functions/string-functions.md +++ b/docs/en/sql-reference/functions/string-functions.md @@ -1,13 +1,12 @@ --- -sidebar_position: 40 -sidebar_label: Strings +toc_priority: 40 +toc_title: Strings --- # Functions for Working with Strings {#functions-for-working-with-strings} -:::note -Functions for [searching](../../sql-reference/functions/string-search-functions.md) and [replacing](../../sql-reference/functions/string-replace-functions.md) in strings are described separately. -::: +!!! note "Note" + Functions for [searching](../../sql-reference/functions/string-search-functions.md) and [replacing](../../sql-reference/functions/string-replace-functions.md) in strings are described separately. ## empty {#empty} diff --git a/docs/en/sql-reference/functions/string-replace-functions.md b/docs/en/sql-reference/functions/string-replace-functions.md index 1df8bfd0c44..144b4fbc1da 100644 --- a/docs/en/sql-reference/functions/string-replace-functions.md +++ b/docs/en/sql-reference/functions/string-replace-functions.md @@ -1,13 +1,12 @@ --- -sidebar_position: 42 -sidebar_label: For Replacing in Strings +toc_priority: 42 +toc_title: For Replacing in Strings --- # Functions for Searching and Replacing in Strings {#functions-for-searching-and-replacing-in-strings} -:::note -Functions for [searching](../../sql-reference/functions/string-search-functions.md) and [other manipulations with strings](../../sql-reference/functions/string-functions.md) are described separately. -::: +!!! note "Note" + Functions for [searching](../../sql-reference/functions/string-search-functions.md) and [other manipulations with strings](../../sql-reference/functions/string-functions.md) are described separately. ## replaceOne(haystack, pattern, replacement) {#replaceonehaystack-pattern-replacement} diff --git a/docs/en/sql-reference/functions/string-search-functions.md b/docs/en/sql-reference/functions/string-search-functions.md index 985d9f1e63a..a0c0116a058 100644 --- a/docs/en/sql-reference/functions/string-search-functions.md +++ b/docs/en/sql-reference/functions/string-search-functions.md @@ -1,15 +1,14 @@ --- -sidebar_position: 41 -sidebar_label: For Searching in Strings +toc_priority: 41 +toc_title: For Searching in Strings --- # Functions for Searching in Strings {#functions-for-searching-strings} The search is case-sensitive by default in all these functions. There are separate variants for case insensitive search. -:::note -Functions for [replacing](../../sql-reference/functions/string-replace-functions.md) and [other manipulations with strings](../../sql-reference/functions/string-functions.md) are described separately. -::: +!!! note "Note" + Functions for [replacing](../../sql-reference/functions/string-replace-functions.md) and [other manipulations with strings](../../sql-reference/functions/string-functions.md) are described separately. ## position(haystack, needle), locate(haystack, needle) {#position} @@ -31,9 +30,8 @@ position(needle IN haystack) Alias: `locate(haystack, needle[, start_pos])`. 
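A minimal illustration of the two equivalent call forms described above:

``` sql
SELECT
    position('Hello, world!', '!')   AS by_arguments,  -- 13, positions are 1-based
    position('!' IN 'Hello, world!') AS sql_compatible -- same result
```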
-:::note -Syntax of `position(needle IN haystack)` provides SQL-compatibility, the function works the same way as to `position(haystack, needle)`. -::: +!!! note "Note" + Syntax of `position(needle IN haystack)` provides SQL-compatibility, the function works the same way as to `position(haystack, needle)`. **Arguments** @@ -344,9 +342,8 @@ Returns 1, if at least one string needlei matches the string `haystac For a case-insensitive search or/and in UTF-8 format use functions `multiSearchAnyCaseInsensitive, multiSearchAnyUTF8, multiSearchAnyCaseInsensitiveUTF8`. -:::note -In all `multiSearch*` functions the number of needles should be less than 28 because of implementation specification. -::: +!!! note "Note" + In all `multiSearch*` functions the number of needles should be less than 28 because of implementation specification. ## match(haystack, pattern) {#matchhaystack-pattern} @@ -361,9 +358,8 @@ For patterns to search for substrings in a string, it is better to use LIKE or The same as `match`, but returns 0 if none of the regular expressions are matched and 1 if any of the patterns matches. It uses [hyperscan](https://github.com/intel/hyperscan) library. For patterns to search substrings in a string, it is better to use `multiSearchAny` since it works much faster. -:::note -The length of any of the `haystack` string must be less than 232 bytes otherwise the exception is thrown. This restriction takes place because of hyperscan API. -::: +!!! note "Note" + The length of any of the `haystack` string must be less than 232 bytes otherwise the exception is thrown. This restriction takes place because of hyperscan API. ## multiMatchAnyIndex(haystack, \[pattern1, pattern2, …, patternn\]) {#multimatchanyindexhaystack-pattern1-pattern2-patternn} @@ -385,13 +381,11 @@ The same as `multiFuzzyMatchAny`, but returns any index that matches the haystac The same as `multiFuzzyMatchAny`, but returns the array of all indices in any order that match the haystack within a constant edit distance. -:::note -`multiFuzzyMatch*` functions do not support UTF-8 regular expressions, and such expressions are treated as bytes because of hyperscan restriction. -::: +!!! note "Note" + `multiFuzzyMatch*` functions do not support UTF-8 regular expressions, and such expressions are treated as bytes because of hyperscan restriction. -:::note -To turn off all functions that use hyperscan, use setting `SET allow_hyperscan = 0;`. -::: +!!! note "Note" + To turn off all functions that use hyperscan, use setting `SET allow_hyperscan = 0;`. ## extract(haystack, pattern) {#extracthaystack-pattern} @@ -405,9 +399,8 @@ Extracts all the fragments of a string using a regular expression. If ‘haystac Matches all groups of the `haystack` string using the `pattern` regular expression. Returns an array of arrays, where the first array includes all fragments matching the first group, the second array - matching the second group, etc. -:::note -`extractAllGroupsHorizontal` function is slower than [extractAllGroupsVertical](#extractallgroups-vertical). -::: +!!! note "Note" + `extractAllGroupsHorizontal` function is slower than [extractAllGroupsVertical](#extractallgroups-vertical). **Syntax** @@ -577,9 +570,8 @@ Same as `ngramDistance` but calculates the non-symmetric difference between `nee For case-insensitive search or/and in UTF-8 format use functions `ngramSearchCaseInsensitive, ngramSearchUTF8, ngramSearchCaseInsensitiveUTF8`. -:::note -For UTF-8 case we use 3-gram distance. All these are not perfectly fair n-gram distances. 
We use 2-byte hashes to hash n-grams and then calculate the (non-)symmetric difference between these hash tables – collisions may occur. With UTF-8 case-insensitive format we do not use fair `tolower` function – we zero the 5-th bit (starting from zero) of each codepoint byte and first bit of zeroth byte if bytes more than one – this works for Latin and mostly for all Cyrillic letters. -::: +!!! note "Note" + For UTF-8 case we use 3-gram distance. All these are not perfectly fair n-gram distances. We use 2-byte hashes to hash n-grams and then calculate the (non-)symmetric difference between these hash tables – collisions may occur. With UTF-8 case-insensitive format we do not use fair `tolower` function – we zero the 5-th bit (starting from zero) of each codepoint byte and first bit of zeroth byte if bytes more than one – this works for Latin and mostly for all Cyrillic letters. ## countSubstrings {#countSubstrings} diff --git a/docs/en/sql-reference/functions/time-window-functions.md b/docs/en/sql-reference/functions/time-window-functions.md index b45866cf931..2ea44a6e585 100644 --- a/docs/en/sql-reference/functions/time-window-functions.md +++ b/docs/en/sql-reference/functions/time-window-functions.md @@ -1,6 +1,6 @@ --- -sidebar_position: 68 -sidebar_label: Time Window +toc_priority: 68 +toc_title: Time Window --- # Time Window Functions {#time-window-functions} diff --git a/docs/en/sql-reference/functions/tuple-functions.md b/docs/en/sql-reference/functions/tuple-functions.md index cfce02f4d31..96bceb8958c 100644 --- a/docs/en/sql-reference/functions/tuple-functions.md +++ b/docs/en/sql-reference/functions/tuple-functions.md @@ -1,6 +1,6 @@ --- -sidebar_position: 66 -sidebar_label: Tuples +toc_priority: 66 +toc_title: Tuples --- # Functions for Working with Tuples {#tuple-functions} diff --git a/docs/en/sql-reference/functions/tuple-map-functions.md b/docs/en/sql-reference/functions/tuple-map-functions.md index a0d62ff5ecb..8ead8c58c7a 100644 --- a/docs/en/sql-reference/functions/tuple-map-functions.md +++ b/docs/en/sql-reference/functions/tuple-map-functions.md @@ -1,6 +1,6 @@ --- -sidebar_position: 46 -sidebar_label: Working with maps +toc_priority: 46 +toc_title: Working with maps --- # Functions for maps {#functions-for-working-with-tuple-maps} diff --git a/docs/en/sql-reference/functions/type-conversion-functions.md b/docs/en/sql-reference/functions/type-conversion-functions.md index de6ca769589..18cc3d98561 100644 --- a/docs/en/sql-reference/functions/type-conversion-functions.md +++ b/docs/en/sql-reference/functions/type-conversion-functions.md @@ -1,6 +1,6 @@ --- -sidebar_position: 38 -sidebar_label: Type Conversion +toc_priority: 38 +toc_title: Type Conversion --- # Type Conversion Functions {#type-conversion-functions} @@ -689,9 +689,8 @@ x::t - Converted value. -:::note -If the input value does not fit the bounds of the target type, the result overflows. For example, `CAST(-1, 'UInt8')` returns `255`. -::: +!!! note "Note" + If the input value does not fit the bounds of the target type, the result overflows. For example, `CAST(-1, 'UInt8')` returns `255`. **Examples** @@ -1433,9 +1432,8 @@ Result: Converts a `DateTime64` to a `Int64` value with fixed sub-second precision. Input value is scaled up or down appropriately depending on it precision. -:::note -The output value is a timestamp in UTC, not in the timezone of `DateTime64`. -::: +!!! info "Note" + The output value is a timestamp in UTC, not in the timezone of `DateTime64`. 
**Syntax** diff --git a/docs/en/sql-reference/functions/url-functions.md b/docs/en/sql-reference/functions/url-functions.md index c91029c4fce..5a305aa5033 100644 --- a/docs/en/sql-reference/functions/url-functions.md +++ b/docs/en/sql-reference/functions/url-functions.md @@ -1,6 +1,6 @@ --- -sidebar_position: 54 -sidebar_label: URLs +toc_priority: 54 +toc_title: URLs --- # Functions for Working with URLs {#functions-for-working-with-urls} diff --git a/docs/en/sql-reference/functions/uuid-functions.md b/docs/en/sql-reference/functions/uuid-functions.md index d23b505a93f..3616b587bf7 100644 --- a/docs/en/sql-reference/functions/uuid-functions.md +++ b/docs/en/sql-reference/functions/uuid-functions.md @@ -1,6 +1,6 @@ --- -sidebar_position: 53 -sidebar_label: UUID +toc_priority: 53 +toc_title: UUID --- # Functions for Working with UUID {#functions-for-working-with-uuid} diff --git a/docs/en/sql-reference/functions/ym-dict-functions.md b/docs/en/sql-reference/functions/ym-dict-functions.md index 85215957443..1e6c9cbd0b4 100644 --- a/docs/en/sql-reference/functions/ym-dict-functions.md +++ b/docs/en/sql-reference/functions/ym-dict-functions.md @@ -1,6 +1,6 @@ --- -sidebar_position: 59 -sidebar_label: Embedded Dictionaries +toc_priority: 59 +toc_title: Embedded Dictionaries --- # Functions for Working with Embedded Dictionaries diff --git a/docs/en/sql-reference/index.md b/docs/en/sql-reference/index.md index 1123c8533a9..e8fe092e622 100644 --- a/docs/en/sql-reference/index.md +++ b/docs/en/sql-reference/index.md @@ -1,6 +1,8 @@ --- -sidebar_position: 28 -sidebar_label: SQL Reference +toc_folder_title: SQL Reference +toc_hidden: true +toc_priority: 28 +toc_title: hidden --- # SQL Reference {#sql-reference} diff --git a/docs/en/sql-reference/operators/exists.md b/docs/en/sql-reference/operators/exists.md index 25413790801..ee0c7317637 100644 --- a/docs/en/sql-reference/operators/exists.md +++ b/docs/en/sql-reference/operators/exists.md @@ -4,9 +4,8 @@ The `EXISTS` operator checks how many records are in the result of a subquery. I `EXISTS` can be used in a [WHERE](../../sql-reference/statements/select/where.md) clause. -:::warning -References to main query tables and columns are not supported in a subquery. -::: +!!! warning "Warning" + References to main query tables and columns are not supported in a subquery. **Syntax** diff --git a/docs/en/sql-reference/operators/in.md b/docs/en/sql-reference/operators/in.md index 5dda097e799..d8468370f3e 100644 --- a/docs/en/sql-reference/operators/in.md +++ b/docs/en/sql-reference/operators/in.md @@ -119,9 +119,8 @@ FROM t_null There are two options for IN-s with subqueries (similar to JOINs): normal `IN` / `JOIN` and `GLOBAL IN` / `GLOBAL JOIN`. They differ in how they are run for distributed query processing. -:::note -Remember that the algorithms described below may work differently depending on the [settings](../../operations/settings/settings.md) `distributed_product_mode` setting. -::: +!!! attention "Attention" + Remember that the algorithms described below may work differently depending on the [settings](../../operations/settings/settings.md) `distributed_product_mode` setting. When using the regular IN, the query is sent to remote servers, and each of them runs the subqueries in the `IN` or `JOIN` clause. 
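A minimal sketch of the difference, assuming hypothetical tables `distributed_table` (a Distributed table) and `local_table` (the per-shard local table it points to):

``` sql
-- Regular IN: each remote server runs the subquery against its own local data.
SELECT uniq(UserID) FROM distributed_table
WHERE UserID IN (SELECT UserID FROM local_table WHERE CounterID = 34);

-- GLOBAL IN: the subquery runs once on the initiator and its result set
-- is shipped to every remote server as a temporary table.
SELECT uniq(UserID) FROM distributed_table
WHERE UserID GLOBAL IN (SELECT UserID FROM distributed_table WHERE CounterID = 34);
```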
diff --git a/docs/en/sql-reference/operators/index.md b/docs/en/sql-reference/operators/index.md index 4761f46ec05..a64dcd70c6c 100644 --- a/docs/en/sql-reference/operators/index.md +++ b/docs/en/sql-reference/operators/index.md @@ -1,6 +1,6 @@ --- -sidebar_position: 38 -sidebar_label: Operators +toc_priority: 38 +toc_title: Operators --- # Operators {#operators} @@ -210,9 +210,8 @@ Types of intervals: You can also use a string literal when setting the `INTERVAL` value. For example, `INTERVAL 1 HOUR` is identical to the `INTERVAL '1 hour'` or `INTERVAL '1' hour`. -:::warning -Intervals with different types can’t be combined. You can’t use expressions like `INTERVAL 4 DAY 1 HOUR`. Specify intervals in units that are smaller or equal to the smallest unit of the interval, for example, `INTERVAL 25 HOUR`. You can use consecutive operations, like in the example below. -::: +!!! warning "Warning" + Intervals with different types can’t be combined. You can’t use expressions like `INTERVAL 4 DAY 1 HOUR`. Specify intervals in units that are smaller or equal to the smallest unit of the interval, for example, `INTERVAL 25 HOUR`. You can use consecutive operations, like in the example below. Examples: @@ -248,9 +247,9 @@ SELECT now() AS current_date_time, current_date_time + INTERVAL '4' day + INTERV You can work with dates without using `INTERVAL`, just by adding or subtracting seconds, minutes, and hours. For example, an interval of one day can be set by adding `60*60*24`. -:::note -The `INTERVAL` syntax or `addDays` function are always preferred. Simple addition or subtraction (syntax like `now() + ...`) doesn't consider time settings. For example, daylight saving time. -::: +!!! note "Note" + The `INTERVAL` syntax or `addDays` function are always preferred. Simple addition or subtraction (syntax like `now() + ...`) doesn't consider time settings. For example, daylight saving time. + Examples: diff --git a/docs/en/sql-reference/statements/alter/column.md b/docs/en/sql-reference/statements/alter/column.md index 3d22146a56b..6bb63ea06a6 100644 --- a/docs/en/sql-reference/statements/alter/column.md +++ b/docs/en/sql-reference/statements/alter/column.md @@ -1,6 +1,6 @@ --- -sidebar_position: 37 -sidebar_label: COLUMN +toc_priority: 37 +toc_title: COLUMN --- # Column Manipulations {#column-manipulations} @@ -75,9 +75,8 @@ Deletes the column with the name `name`. If the `IF EXISTS` clause is specified, Deletes data from the file system. Since this deletes entire files, the query is completed almost instantly. -:::warning -You can’t delete a column if it is referenced by [materialized view](../../../sql-reference/statements/create/view.md#materialized). Otherwise, it returns an error. -::: +!!! warning "Warning" + You can’t delete a column if it is referenced by [materialized view](../../../sql-reference/statements/create/view.md#materialized). Otherwise, it returns an error. 
Example: diff --git a/docs/en/sql-reference/statements/alter/comment.md b/docs/en/sql-reference/statements/alter/comment.md index af57adcf31c..67a17fc8974 100644 --- a/docs/en/sql-reference/statements/alter/comment.md +++ b/docs/en/sql-reference/statements/alter/comment.md @@ -1,6 +1,6 @@ --- -sidebar_position: 51 -sidebar_label: COMMENT +toc_priority: 51 +toc_title: COMMENT --- # ALTER TABLE … MODIFY COMMENT {#alter-modify-comment} diff --git a/docs/en/sql-reference/statements/alter/constraint.md b/docs/en/sql-reference/statements/alter/constraint.md index c9517981ae7..8f4ce57b905 100644 --- a/docs/en/sql-reference/statements/alter/constraint.md +++ b/docs/en/sql-reference/statements/alter/constraint.md @@ -1,6 +1,6 @@ --- -sidebar_position: 43 -sidebar_label: CONSTRAINT +toc_priority: 43 +toc_title: CONSTRAINT --- # Manipulating Constraints {#manipulations-with-constraints} @@ -16,8 +16,7 @@ See more on [constraints](../../../sql-reference/statements/create/table.md#cons Queries will add or remove metadata about constraints from table so they are processed immediately. -:::warning -Constraint check **will not be executed** on existing data if it was added. -::: +!!! warning "Warning" + Constraint check **will not be executed** on existing data if it was added. All changes on replicated tables are broadcasted to ZooKeeper and will be applied on other replicas as well. diff --git a/docs/en/sql-reference/statements/alter/delete.md b/docs/en/sql-reference/statements/alter/delete.md index 21ae091f9e7..6c638c0a3ac 100644 --- a/docs/en/sql-reference/statements/alter/delete.md +++ b/docs/en/sql-reference/statements/alter/delete.md @@ -1,6 +1,6 @@ --- -sidebar_position: 39 -sidebar_label: DELETE +toc_priority: 39 +toc_title: DELETE --- # ALTER TABLE … DELETE Statement {#alter-mutations} @@ -11,9 +11,8 @@ ALTER TABLE [db.]table [ON CLUSTER cluster] DELETE WHERE filter_expr Deletes data matching the specified filtering expression. Implemented as a [mutation](../../../sql-reference/statements/alter/index.md#mutations). -:::note -The `ALTER TABLE` prefix makes this syntax different from most other systems supporting SQL. It is intended to signify that unlike similar queries in OLTP databases this is a heavy operation not designed for frequent use. -::: +!!! note "Note" + The `ALTER TABLE` prefix makes this syntax different from most other systems supporting SQL. It is intended to signify that unlike similar queries in OLTP databases this is a heavy operation not designed for frequent use. The `filter_expr` must be of type `UInt8`. The query deletes rows in the table for which this expression takes a non-zero value. 
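A minimal sketch, assuming a hypothetical `visits` table with a `duration` column:

``` sql
-- Wait for the mutation to finish on all replicas instead of returning immediately.
SET mutations_sync = 2;

-- Delete every row for which the filter expression evaluates to a non-zero value.
ALTER TABLE visits DELETE WHERE duration < 5;
```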
diff --git a/docs/en/sql-reference/statements/alter/index.md b/docs/en/sql-reference/statements/alter/index.md index 536da948218..0d5909518ed 100644 --- a/docs/en/sql-reference/statements/alter/index.md +++ b/docs/en/sql-reference/statements/alter/index.md @@ -1,9 +1,9 @@ --- -sidebar_position: 35 -sidebar_label: ALTER +toc_priority: 35 +toc_title: ALTER --- -# ALTER +## ALTER {#query_language_queries_alter} Most `ALTER TABLE` queries modify table settings or data: @@ -16,9 +16,8 @@ Most `ALTER TABLE` queries modify table settings or data: - [CONSTRAINT](../../../sql-reference/statements/alter/constraint.md) - [TTL](../../../sql-reference/statements/alter/ttl.md) -:::note -Most `ALTER TABLE` queries are supported only for [\*MergeTree](../../../engines/table-engines/mergetree-family/index.md) tables, as well as [Merge](../../../engines/table-engines/special/merge.md) and [Distributed](../../../engines/table-engines/special/distributed.md). -::: +!!! note "Note" + Most `ALTER TABLE` queries are supported only for [\*MergeTree](../../../engines/table-engines/mergetree-family/index.md) tables, as well as [Merge](../../../engines/table-engines/special/merge.md) and [Distributed](../../../engines/table-engines/special/distributed.md). These `ALTER` statements manipulate views: @@ -55,8 +54,7 @@ For all `ALTER` queries, you can use the [replication_alter_partitions_sync](../ You can specify how long (in seconds) to wait for inactive replicas to execute all `ALTER` queries with the [replication_wait_for_inactive_replica_timeout](../../../operations/settings/settings.md#replication-wait-for-inactive-replica-timeout) setting. -:::note -For all `ALTER` queries, if `replication_alter_partitions_sync = 2` and some replicas are not active for more than the time, specified in the `replication_wait_for_inactive_replica_timeout` setting, then an exception `UNFINISHED` is thrown. -::: +!!! info "Note" + For all `ALTER` queries, if `replication_alter_partitions_sync = 2` and some replicas are not active for more than the time, specified in the `replication_wait_for_inactive_replica_timeout` setting, then an exception `UNFINISHED` is thrown. For `ALTER TABLE ... UPDATE|DELETE` queries the synchronicity is defined by the [mutations_sync](../../../operations/settings/settings.md#mutations_sync) setting. diff --git a/docs/en/sql-reference/statements/alter/index/index.md b/docs/en/sql-reference/statements/alter/index/index.md index 92f55792a70..4e2943d37f3 100644 --- a/docs/en/sql-reference/statements/alter/index/index.md +++ b/docs/en/sql-reference/statements/alter/index/index.md @@ -1,7 +1,7 @@ --- toc_hidden_folder: true -sidebar_position: 42 -sidebar_label: INDEX +toc_priority: 42 +toc_title: INDEX --- # Manipulating Data Skipping Indices {#manipulations-with-data-skipping-indices} @@ -18,6 +18,5 @@ The first two commands are lightweight in a sense that they only change metadata Also, they are replicated, syncing indices metadata via ZooKeeper. -:::note -Index manipulation is supported only for tables with [`*MergeTree`](../../../../engines/table-engines/mergetree-family/mergetree.md) engine (including [replicated](../../../../engines/table-engines/mergetree-family/replication.md) variants). -::: \ No newline at end of file +!!! note "Note" + Index manipulation is supported only for tables with [`*MergeTree`](../../../../engines/table-engines/mergetree-family/mergetree.md) engine (including [replicated](../../../../engines/table-engines/mergetree-family/replication.md) variants). 
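A minimal sketch of the three commands, assuming a hypothetical MergeTree table `events` with a `user_id` column:

``` sql
-- Adds the index description to the table metadata (lightweight).
ALTER TABLE events ADD INDEX idx_user user_id TYPE minmax GRANULARITY 4;

-- Builds the index for data parts that already exist on disk.
ALTER TABLE events MATERIALIZE INDEX idx_user;

-- Removes the index description and deletes the index files from disk.
ALTER TABLE events DROP INDEX idx_user;
```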
diff --git a/docs/en/sql-reference/statements/alter/order-by.md b/docs/en/sql-reference/statements/alter/order-by.md index 84d29ae8e11..16f9ace206d 100644 --- a/docs/en/sql-reference/statements/alter/order-by.md +++ b/docs/en/sql-reference/statements/alter/order-by.md @@ -1,6 +1,6 @@ --- -sidebar_position: 41 -sidebar_label: ORDER BY +toc_priority: 41 +toc_title: ORDER BY --- # Manipulating Key Expressions {#manipulations-with-key-expressions} @@ -13,6 +13,5 @@ The command changes the [sorting key](../../../engines/table-engines/mergetree-f The command is lightweight in a sense that it only changes metadata. To keep the property that data part rows are ordered by the sorting key expression you cannot add expressions containing existing columns to the sorting key (only columns added by the `ADD COLUMN` command in the same `ALTER` query, without default column value). -:::note -It only works for tables in the [`MergeTree`](../../../engines/table-engines/mergetree-family/mergetree.md) family (including [replicated](../../../engines/table-engines/mergetree-family/replication.md) tables). -::: \ No newline at end of file +!!! note "Note" + It only works for tables in the [`MergeTree`](../../../engines/table-engines/mergetree-family/mergetree.md) family (including [replicated](../../../engines/table-engines/mergetree-family/replication.md) tables). diff --git a/docs/en/sql-reference/statements/alter/partition.md b/docs/en/sql-reference/statements/alter/partition.md index 453d1bd7bf6..12737624ecb 100644 --- a/docs/en/sql-reference/statements/alter/partition.md +++ b/docs/en/sql-reference/statements/alter/partition.md @@ -1,6 +1,6 @@ --- -sidebar_position: 38 -sidebar_label: PARTITION +toc_priority: 38 +toc_title: PARTITION --- # Manipulating Partitions and Parts {#alter_manipulations-with-partitions} @@ -160,9 +160,8 @@ ALTER TABLE table_name FREEZE [PARTITION partition_expr] [WITH NAME 'backup_name This query creates a local backup of a specified partition. If the `PARTITION` clause is omitted, the query creates the backup of all partitions at once. -:::note -The entire backup process is performed without stopping the server. -::: +!!! note "Note" + The entire backup process is performed without stopping the server. Note that for old-styled tables you can specify the prefix of the partition name (for example, `2019`) - then the query creates the backup for all the corresponding partitions. Read about setting the partition expression in a section [How to specify the partition expression](#alter-how-to-specify-part-expr). @@ -172,9 +171,8 @@ At the time of execution, for a data snapshot, the query creates hardlinks to a - `N` is the incremental number of the backup. - if the `WITH NAME` parameter is specified, then the value of the `'backup_name'` parameter is used instead of the incremental number. -:::note -If you use [a set of disks for data storage in a table](../../../engines/table-engines/mergetree-family/mergetree.md#table_engine-mergetree-multiple-volumes), the `shadow/N` directory appears on every disk, storing data parts that matched by the `PARTITION` expression. -::: +!!! note "Note" + If you use [a set of disks for data storage in a table](../../../engines/table-engines/mergetree-family/mergetree.md#table_engine-mergetree-multiple-volumes), the `shadow/N` directory appears on every disk, storing data parts that matched by the `PARTITION` expression. The same structure of directories is created inside the backup as inside `/var/lib/clickhouse/`. 
The query performs `chmod` for all files, forbidding writing into them. diff --git a/docs/en/sql-reference/statements/alter/projection.md b/docs/en/sql-reference/statements/alter/projection.md index 5ccf33d2d2f..c7ebc83c496 100644 --- a/docs/en/sql-reference/statements/alter/projection.md +++ b/docs/en/sql-reference/statements/alter/projection.md @@ -1,6 +1,6 @@ --- -sidebar_position: 49 -sidebar_label: PROJECTION +toc_priority: 49 +toc_title: PROJECTION --- # Manipulating Projections {#manipulations-with-projections} @@ -20,6 +20,5 @@ The commands `ADD`, `DROP` and `CLEAR` are lightweight in a sense that they only Also, they are replicated, syncing projections metadata via ZooKeeper. -:::note -Projection manipulation is supported only for tables with [`*MergeTree`](../../../engines/table-engines/mergetree-family/mergetree.md) engine (including [replicated](../../../engines/table-engines/mergetree-family/replication.md) variants). -::: \ No newline at end of file +!!! note "Note" + Projection manipulation is supported only for tables with [`*MergeTree`](../../../engines/table-engines/mergetree-family/mergetree.md) engine (including [replicated](../../../engines/table-engines/mergetree-family/replication.md) variants). diff --git a/docs/en/sql-reference/statements/alter/quota.md b/docs/en/sql-reference/statements/alter/quota.md index 2398a57502c..05130a569ab 100644 --- a/docs/en/sql-reference/statements/alter/quota.md +++ b/docs/en/sql-reference/statements/alter/quota.md @@ -1,6 +1,6 @@ --- -sidebar_position: 46 -sidebar_label: QUOTA +toc_priority: 46 +toc_title: QUOTA --- # ALTER QUOTA {#alter-quota-statement} diff --git a/docs/en/sql-reference/statements/alter/role.md b/docs/en/sql-reference/statements/alter/role.md index d3cb28a1705..ea6d3c61820 100644 --- a/docs/en/sql-reference/statements/alter/role.md +++ b/docs/en/sql-reference/statements/alter/role.md @@ -1,6 +1,6 @@ --- -sidebar_position: 46 -sidebar_label: ROLE +toc_priority: 46 +toc_title: ROLE --- ## ALTER ROLE {#alter-role-statement} diff --git a/docs/en/sql-reference/statements/alter/row-policy.md b/docs/en/sql-reference/statements/alter/row-policy.md index 47207d29287..bbf9f317737 100644 --- a/docs/en/sql-reference/statements/alter/row-policy.md +++ b/docs/en/sql-reference/statements/alter/row-policy.md @@ -1,6 +1,6 @@ --- -sidebar_position: 47 -sidebar_label: ROW POLICY +toc_priority: 47 +toc_title: ROW POLICY --- # ALTER ROW POLICY {#alter-row-policy-statement} diff --git a/docs/en/sql-reference/statements/alter/sample-by.md b/docs/en/sql-reference/statements/alter/sample-by.md index 08e4fe1066b..21b20be8b78 100644 --- a/docs/en/sql-reference/statements/alter/sample-by.md +++ b/docs/en/sql-reference/statements/alter/sample-by.md @@ -1,6 +1,6 @@ --- -sidebar_position: 41 -sidebar_label: SAMPLE BY +toc_priority: 41 +toc_title: SAMPLE BY --- # Manipulating Sampling-Key Expressions {#manipulations-with-sampling-key-expressions} @@ -15,6 +15,5 @@ The command changes the [sampling key](../../../engines/table-engines/mergetree- The command is lightweight in the sense that it only changes metadata. The primary key must contain the new sample key. -:::note -It only works for tables in the [MergeTree](../../../engines/table-engines/mergetree-family/mergetree.md) family (including [replicated](../../../engines/table-engines/mergetree-family/replication.md) tables). -::: \ No newline at end of file +!!! 
note "Note" + It only works for tables in the [MergeTree](../../../engines/table-engines/mergetree-family/mergetree.md) family (including [replicated](../../../engines/table-engines/mergetree-family/replication.md) tables). diff --git a/docs/en/sql-reference/statements/alter/setting.md b/docs/en/sql-reference/statements/alter/setting.md index bb361e2ee6f..90747bc1919 100644 --- a/docs/en/sql-reference/statements/alter/setting.md +++ b/docs/en/sql-reference/statements/alter/setting.md @@ -1,6 +1,6 @@ --- -sidebar_position: 38 -sidebar_label: SETTING +toc_priority: 38 +toc_title: SETTING --- # Table Settings Manipulations {#table_settings_manipulations} @@ -14,9 +14,9 @@ If a setting with the specified name does not exist, then the query raises an ex ALTER TABLE [db].name [ON CLUSTER cluster] MODIFY|RESET SETTING ... ``` -:::note -These queries can be applied to [MergeTree](../../../engines/table-engines/mergetree-family/mergetree.md) tables only. -::: +!!! note "Note" + These queries can be applied to [MergeTree](../../../engines/table-engines/mergetree-family/mergetree.md) tables only. + ## MODIFY SETTING {#alter_modify_setting} diff --git a/docs/en/sql-reference/statements/alter/settings-profile.md b/docs/en/sql-reference/statements/alter/settings-profile.md index b1728f21c08..57d12142c48 100644 --- a/docs/en/sql-reference/statements/alter/settings-profile.md +++ b/docs/en/sql-reference/statements/alter/settings-profile.md @@ -1,6 +1,6 @@ --- -sidebar_position: 48 -sidebar_label: SETTINGS PROFILE +toc_priority: 48 +toc_title: SETTINGS PROFILE --- ## ALTER SETTINGS PROFILE {#alter-settings-profile-statement} diff --git a/docs/en/sql-reference/statements/alter/ttl.md b/docs/en/sql-reference/statements/alter/ttl.md index f2cf8724197..9cd63d3b8fe 100644 --- a/docs/en/sql-reference/statements/alter/ttl.md +++ b/docs/en/sql-reference/statements/alter/ttl.md @@ -1,6 +1,6 @@ --- -sidebar_position: 44 -sidebar_label: TTL +toc_priority: 44 +toc_title: TTL --- # Manipulations with Table TTL {#manipulations-with-table-ttl} diff --git a/docs/en/sql-reference/statements/alter/update.md b/docs/en/sql-reference/statements/alter/update.md index aeff7cfa1b2..13ea1b2a8db 100644 --- a/docs/en/sql-reference/statements/alter/update.md +++ b/docs/en/sql-reference/statements/alter/update.md @@ -1,6 +1,6 @@ --- -sidebar_position: 40 -sidebar_label: UPDATE +toc_priority: 40 +toc_title: UPDATE --- # ALTER TABLE … UPDATE Statements {#alter-table-update-statements} @@ -11,9 +11,8 @@ ALTER TABLE [db.]table UPDATE column1 = expr1 [, ...] WHERE filter_expr Manipulates data matching the specified filtering expression. Implemented as a [mutation](../../../sql-reference/statements/alter/index.md#mutations). -:::note -The `ALTER TABLE` prefix makes this syntax different from most other systems supporting SQL. It is intended to signify that unlike similar queries in OLTP databases this is a heavy operation not designed for frequent use. -::: +!!! note "Note" + The `ALTER TABLE` prefix makes this syntax different from most other systems supporting SQL. It is intended to signify that unlike similar queries in OLTP databases this is a heavy operation not designed for frequent use. The `filter_expr` must be of type `UInt8`. This query updates values of specified columns to the values of corresponding expressions in rows for which the `filter_expr` takes a non-zero value. Values are casted to the column type using the `CAST` operator. 
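A minimal sketch, assuming a hypothetical `visits` table with `duration` and `user_id` columns:

``` sql
-- Rows where the filter expression is non-zero receive the new values, cast to the column type.
ALTER TABLE visits UPDATE duration = 0 WHERE user_id = 42;
```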
Updating columns that are used in the calculation of the primary or the partition key is not supported. diff --git a/docs/en/sql-reference/statements/alter/user.md b/docs/en/sql-reference/statements/alter/user.md index f9b90349dab..4873982e2a1 100644 --- a/docs/en/sql-reference/statements/alter/user.md +++ b/docs/en/sql-reference/statements/alter/user.md @@ -1,6 +1,6 @@ --- -sidebar_position: 45 -sidebar_label: USER +toc_priority: 45 +toc_title: USER --- # ALTER USER {#alter-user-statement} diff --git a/docs/en/sql-reference/statements/alter/view.md b/docs/en/sql-reference/statements/alter/view.md index 71e89aaefe8..0fb1c4be0ff 100644 --- a/docs/en/sql-reference/statements/alter/view.md +++ b/docs/en/sql-reference/statements/alter/view.md @@ -1,6 +1,6 @@ --- -sidebar_position: 50 -sidebar_label: VIEW +toc_priority: 50 +toc_title: VIEW --- # ALTER TABLE … MODIFY QUERY Statement {#alter-modify-query} diff --git a/docs/en/sql-reference/statements/attach.md b/docs/en/sql-reference/statements/attach.md index bc7b2be333f..2949ac6db38 100644 --- a/docs/en/sql-reference/statements/attach.md +++ b/docs/en/sql-reference/statements/attach.md @@ -1,6 +1,6 @@ --- -sidebar_position: 40 -sidebar_label: ATTACH +toc_priority: 40 +toc_title: ATTACH --- # ATTACH Statement {#attach} diff --git a/docs/en/sql-reference/statements/check-table.md b/docs/en/sql-reference/statements/check-table.md index 1164a8b8be6..c9ad40860f7 100644 --- a/docs/en/sql-reference/statements/check-table.md +++ b/docs/en/sql-reference/statements/check-table.md @@ -1,6 +1,6 @@ --- -sidebar_position: 41 -sidebar_label: CHECK +toc_priority: 41 +toc_title: CHECK --- # CHECK TABLE Statement {#check-table} diff --git a/docs/en/sql-reference/statements/create/database.md b/docs/en/sql-reference/statements/create/database.md index 18ed94bef79..787bbc02346 100644 --- a/docs/en/sql-reference/statements/create/database.md +++ b/docs/en/sql-reference/statements/create/database.md @@ -1,6 +1,6 @@ --- -sidebar_position: 35 -sidebar_label: DATABASE +toc_priority: 35 +toc_title: DATABASE --- # CREATE DATABASE {#query-language-create-database} diff --git a/docs/en/sql-reference/statements/create/dictionary.md b/docs/en/sql-reference/statements/create/dictionary.md index 246625cc901..86ab8f977b0 100644 --- a/docs/en/sql-reference/statements/create/dictionary.md +++ b/docs/en/sql-reference/statements/create/dictionary.md @@ -1,6 +1,6 @@ --- -sidebar_position: 38 -sidebar_label: DICTIONARY +toc_priority: 38 +toc_title: DICTIONARY --- # CREATE DICTIONARY {#create-dictionary-query} diff --git a/docs/en/sql-reference/statements/create/function.md b/docs/en/sql-reference/statements/create/function.md index a87d3d70e54..ddfcdfef521 100644 --- a/docs/en/sql-reference/statements/create/function.md +++ b/docs/en/sql-reference/statements/create/function.md @@ -1,6 +1,6 @@ --- -sidebar_position: 38 -sidebar_label: FUNCTION +toc_priority: 38 +toc_title: FUNCTION --- # CREATE FUNCTION {#create-function} diff --git a/docs/en/sql-reference/statements/create/index.md b/docs/en/sql-reference/statements/create/index.md index 666a2c66d2f..3df62869e2b 100644 --- a/docs/en/sql-reference/statements/create/index.md +++ b/docs/en/sql-reference/statements/create/index.md @@ -1,9 +1,10 @@ --- -sidebar_position: 34 -sidebar_label: CREATE +toc_folder_title: CREATE +toc_priority: 34 +toc_title: Overview --- -# CREATE Queries +# CREATE Queries {#create-queries} Create queries make a new entity of one of the following kinds: diff --git 
a/docs/en/sql-reference/statements/create/quota.md b/docs/en/sql-reference/statements/create/quota.md index 931da165a73..767846ead52 100644 --- a/docs/en/sql-reference/statements/create/quota.md +++ b/docs/en/sql-reference/statements/create/quota.md @@ -1,6 +1,6 @@ --- -sidebar_position: 42 -sidebar_label: QUOTA +toc_priority: 42 +toc_title: QUOTA --- # CREATE QUOTA {#create-quota-statement} diff --git a/docs/en/sql-reference/statements/create/role.md b/docs/en/sql-reference/statements/create/role.md index 5f7db960f27..e0e58f7a0f6 100644 --- a/docs/en/sql-reference/statements/create/role.md +++ b/docs/en/sql-reference/statements/create/role.md @@ -1,6 +1,6 @@ --- -sidebar_position: 40 -sidebar_label: ROLE +toc_priority: 40 +toc_title: ROLE --- # CREATE ROLE {#create-role-statement} diff --git a/docs/en/sql-reference/statements/create/row-policy.md b/docs/en/sql-reference/statements/create/row-policy.md index 58b7b1e2cb9..3f88d794619 100644 --- a/docs/en/sql-reference/statements/create/row-policy.md +++ b/docs/en/sql-reference/statements/create/row-policy.md @@ -1,15 +1,14 @@ --- -sidebar_position: 41 -sidebar_label: ROW POLICY +toc_priority: 41 +toc_title: ROW POLICY --- # CREATE ROW POLICY {#create-row-policy-statement} Creates a [row policy](../../../operations/access-rights.md#row-policy-management), i.e. a filter used to determine which rows a user can read from a table. -:::warning -Row policies makes sense only for users with readonly access. If user can modify table or copy partitions between tables, it defeats the restrictions of row policies. -::: +!!! note "Warning" + Row policies makes sense only for users with readonly access. If user can modify table or copy partitions between tables, it defeats the restrictions of row policies. Syntax: @@ -31,17 +30,16 @@ In the section `TO` you can provide a list of users and roles this policy should Keyword `ALL` means all the ClickHouse users including current user. Keyword `ALL EXCEPT` allow to exclude some users from the all users list, for example, `CREATE ROW POLICY ... TO ALL EXCEPT accountant, john@localhost` -:::note -If there are no row policies defined for a table then any user can `SELECT` all the row from the table. Defining one or more row policies for the table makes the access to the table depending on the row policies no matter if those row policies are defined for the current user or not. For example, the following policy +!!! note "Note" + If there are no row policies defined for a table then any user can `SELECT` all the row from the table. Defining one or more row policies for the table makes the access to the table depending on the row policies no matter if those row policies are defined for the current user or not. For example, the following policy -`CREATE ROW POLICY pol1 ON mydb.table1 USING b=1 TO mira, peter` + `CREATE ROW POLICY pol1 ON mydb.table1 USING b=1 TO mira, peter` -forbids the users `mira` and `peter` to see the rows with `b != 1`, and any non-mentioned user (e.g., the user `paul`) will see no rows from `mydb.table1` at all. + forbids the users `mira` and `peter` to see the rows with `b != 1`, and any non-mentioned user (e.g., the user `paul`) will see no rows from `mydb.table1` at all. 
-If that's not desirable it can't be fixed by adding one more row policy, like the following: + If that's not desirable it can't be fixed by adding one more row policy, like the following: -`CREATE ROW POLICY pol2 ON mydb.table1 USING 1 TO ALL EXCEPT mira, peter` -::: + `CREATE ROW POLICY pol2 ON mydb.table1 USING 1 TO ALL EXCEPT mira, peter` ## AS Clause {#create-row-policy-as} diff --git a/docs/en/sql-reference/statements/create/settings-profile.md b/docs/en/sql-reference/statements/create/settings-profile.md index 0cc633d9770..07bb54c9da3 100644 --- a/docs/en/sql-reference/statements/create/settings-profile.md +++ b/docs/en/sql-reference/statements/create/settings-profile.md @@ -1,6 +1,6 @@ --- -sidebar_position: 43 -sidebar_label: SETTINGS PROFILE +toc_priority: 43 +toc_title: SETTINGS PROFILE --- # CREATE SETTINGS PROFILE {#create-settings-profile-statement} diff --git a/docs/en/sql-reference/statements/create/table.md b/docs/en/sql-reference/statements/create/table.md index c477e41ba02..ee663c92695 100644 --- a/docs/en/sql-reference/statements/create/table.md +++ b/docs/en/sql-reference/statements/create/table.md @@ -1,6 +1,6 @@ --- -sidebar_position: 36 -sidebar_label: TABLE +toc_priority: 36 +toc_title: TABLE --- # CREATE TABLE {#create-table-query} @@ -159,9 +159,8 @@ ENGINE = engine PRIMARY KEY(expr1[, expr2,...]); ``` -:::warning -You can't combine both ways in one query. -::: +!!! warning "Warning" + You can't combine both ways in one query. ## Constraints {#constraints} @@ -215,9 +214,8 @@ ALTER TABLE codec_example MODIFY COLUMN float_value CODEC(Default); Codecs can be combined in a pipeline, for example, `CODEC(Delta, Default)`. -:::warning -You can’t decompress ClickHouse database files with external utilities like `lz4`. Instead, use the special [clickhouse-compressor](https://github.com/ClickHouse/ClickHouse/tree/master/programs/compressor) utility. -::: +!!! warning "Warning" + You can’t decompress ClickHouse database files with external utilities like `lz4`. Instead, use the special [clickhouse-compressor](https://github.com/ClickHouse/ClickHouse/tree/master/programs/compressor) utility. Compression is supported for the following table engines: @@ -239,7 +237,7 @@ Codecs: High compression levels are useful for asymmetric scenarios, like compress once, decompress repeatedly. Higher levels mean better compression and higher CPU usage. -### Specialized Codecs {#specialized-codecs} +### Specialized Codecs {#create-query-specialized-codecs} These codecs are designed to make compression more effective by using specific features of data. Some of these codecs do not compress data themself. Instead, they prepare the data for a common purpose codec, which compresses it better than without this preparation. @@ -273,13 +271,11 @@ Encryption codecs: These codecs use a fixed nonce and encryption is therefore deterministic. This makes it compatible with deduplicating engines such as [ReplicatedMergeTree](../../../engines/table-engines/mergetree-family/replication.md) but has a weakness: when the same data block is encrypted twice, the resulting ciphertext will be exactly the same so an adversary who can read the disk can see this equivalence (although only the equivalence, without getting its content). -:::warning -Most engines including the "*MergeTree" family create index files on disk without applying codecs. This means plaintext will appear on disk if an encrypted column is indexed. -::: +!!! 
attention "Attention" + Most engines including the "*MergeTree" family create index files on disk without applying codecs. This means plaintext will appear on disk if an encrypted column is indexed. -:::warning -If you perform a SELECT query mentioning a specific value in an encrypted column (such as in its WHERE clause), the value may appear in [system.query_log](../../../operations/system-tables/query_log.md). You may want to disable the logging. -::: +!!! attention "Attention" + If you perform a SELECT query mentioning a specific value in an encrypted column (such as in its WHERE clause), the value may appear in [system.query_log](../../../operations/system-tables/query_log.md). You may want to disable the logging. **Example** @@ -291,9 +287,8 @@ CREATE TABLE mytable ENGINE = MergeTree ORDER BY x; ``` -:::note -If compression needs to be applied, it must be explicitly specified. Otherwise, only encryption will be applied to data. -::: +!!!note "Note" + If compression needs to be applied, it must be explicitly specified. Otherwise, only encryption will be applied to data. **Example** @@ -335,9 +330,8 @@ It’s possible to use tables with [ENGINE = Memory](../../../engines/table-engi 'REPLACE' query allows you to update the table atomically. -:::note -This query is supported only for [Atomic](../../../engines/database-engines/atomic.md) database engine. -::: +!!!note "Note" + This query is supported only for [Atomic](../../../engines/database-engines/atomic.md) database engine. If you need to delete some data from a table, you can create a new table and fill it with a `SELECT` statement that does not retrieve unwanted data, then drop the old table and rename the new one: @@ -411,9 +405,8 @@ SELECT * FROM base.t1; You can add a comment to the table when you creating it. -:::note -The comment is supported for all table engines except [Kafka](../../../engines/table-engines/integrations/kafka.md), [RabbitMQ](../../../engines/table-engines/integrations/rabbitmq.md) and [EmbeddedRocksDB](../../../engines/table-engines/integrations/embedded-rocksdb.md). -::: +!!!note "Note" + The comment is supported for all table engines except [Kafka](../../../engines/table-engines/integrations/kafka.md), [RabbitMQ](../../../engines/table-engines/integrations/rabbitmq.md) and [EmbeddedRocksDB](../../../engines/table-engines/integrations/embedded-rocksdb.md). **Syntax** diff --git a/docs/en/sql-reference/statements/create/user.md b/docs/en/sql-reference/statements/create/user.md index 0aad0961a8b..5dfcf891439 100644 --- a/docs/en/sql-reference/statements/create/user.md +++ b/docs/en/sql-reference/statements/create/user.md @@ -1,6 +1,6 @@ --- -sidebar_position: 39 -sidebar_label: USER +toc_priority: 39 +toc_title: USER --- # CREATE USER {#create-user-statement} @@ -52,9 +52,9 @@ Another way of specifying host is to use `@` syntax following the username. Exam - `CREATE USER mira@'localhost'` — Equivalent to the `HOST LOCAL` syntax. - `CREATE USER mira@'192.168.%.%'` — Equivalent to the `HOST LIKE` syntax. -:::warning -ClickHouse treats `user_name@'address'` as a username as a whole. Thus, technically you can create multiple users with the same `user_name` and different constructions after `@`. However, we do not recommend to do so. -::: +!!! info "Warning" + ClickHouse treats `user_name@'address'` as a username as a whole. Thus, technically you can create multiple users with the same `user_name` and different constructions after `@`. However, we do not recommend to do so. 
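A minimal sketch of the two equivalent spellings (hypothetical user name and password; use one form or the other):

``` sql
-- The @ shorthand ...
CREATE USER mira@'localhost' IDENTIFIED BY 'password';

-- ... expresses the same restriction as the explicit HOST clause.
CREATE USER mira IDENTIFIED BY 'password' HOST LOCAL;
```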
+ ## GRANTEES Clause {#grantees} diff --git a/docs/en/sql-reference/statements/create/view.md b/docs/en/sql-reference/statements/create/view.md index e31d1b4473f..f7d3a6d697a 100644 --- a/docs/en/sql-reference/statements/create/view.md +++ b/docs/en/sql-reference/statements/create/view.md @@ -1,6 +1,6 @@ --- -sidebar_position: 37 -sidebar_label: VIEW +toc_priority: 37 +toc_title: VIEW --- # CREATE VIEW {#create-view} @@ -49,11 +49,10 @@ When creating a materialized view with `TO [db].[table]`, you must not use `POPU A materialized view is implemented as follows: when inserting data to the table specified in `SELECT`, part of the inserted data is converted by this `SELECT` query, and the result is inserted in the view. -:::note -Materialized views in ClickHouse use **column names** instead of column order during insertion into destination table. If some column names are not present in the `SELECT` query result, ClickHouse uses a default value, even if the column is not [Nullable](../../data-types/nullable.md). A safe practice would be to add aliases for every column when using Materialized views. +!!! important "Important" + Materialized views in ClickHouse use **column names** instead of column order during insertion into destination table. If some column names are not present in the `SELECT` query result, ClickHouse uses a default value, even if the column is not [Nullable](../../data-types/nullable.md). A safe practice would be to add aliases for every column when using Materialized views. -Materialized views in ClickHouse are implemented more like insert triggers. If there’s some aggregation in the view query, it’s applied only to the batch of freshly inserted data. Any changes to existing data of source table (like update, delete, drop partition, etc.) does not change the materialized view. -::: + Materialized views in ClickHouse are implemented more like insert triggers. If there’s some aggregation in the view query, it’s applied only to the batch of freshly inserted data. Any changes to existing data of source table (like update, delete, drop partition, etc.) does not change the materialized view. If you specify `POPULATE`, the existing table data is inserted into the view when creating it, as if making a `CREATE TABLE ... AS SELECT ...` . Otherwise, the query contains only the data inserted in the table after creating the view. We **do not recommend** using `POPULATE`, since data inserted in the table during the view creation will not be inserted in it. @@ -69,9 +68,10 @@ To delete a view, use [DROP VIEW](../../../sql-reference/statements/drop.md#drop ## Live View [Experimental] {#live-view} -:::note -This is an experimental feature that may change in backwards-incompatible ways in the future releases. Enable usage of live views and `WATCH` query using [allow_experimental_live_view](../../../operations/settings/settings.md#allow-experimental-live-view) setting. Input the command `set allow_experimental_live_view = 1`. -::: +!!! important "Important" + This is an experimental feature that may change in backwards-incompatible ways in the future releases. + Enable usage of live views and `WATCH` query using [allow_experimental_live_view](../../../operations/settings/settings.md#allow-experimental-live-view) setting. Input the command `set allow_experimental_live_view = 1`. + ```sql CREATE LIVE VIEW [IF NOT EXISTS] [db.]table_name [WITH [TIMEOUT [value_in_sec] [AND]] [REFRESH [value_in_sec]]] AS SELECT ... 
@@ -83,15 +83,14 @@ Live views are triggered by insert into the innermost table specified in the que Live views work similarly to how a query in a distributed table works. But instead of combining partial results from different servers they combine partial result from current data with partial result from the new data. When a live view query includes a subquery then the cached partial result is only stored for the innermost subquery. -:::info -- [Table function](../../../sql-reference/table-functions/index.md) is not supported as the innermost table. -- Tables that do not have inserts such as a [dictionary](../../../sql-reference/dictionaries/index.md), [system table](../../../operations/system-tables/index.md), a [normal view](#normal), or a [materialized view](#materialized) will not trigger a live view. -- Only queries where one can combine partial result from the old data plus partial result from the new data will work. Live view will not work for queries that require the complete data set to compute the final result or aggregations where the state of the aggregation must be preserved. -- Does not work with replicated or distributed tables where inserts are performed on different nodes. -- Can't be triggered by multiple tables. +!!! info "Limitations" + - [Table function](../../../sql-reference/table-functions/index.md) is not supported as the innermost table. + - Tables that do not have inserts such as a [dictionary](../../../sql-reference/dictionaries/index.md), [system table](../../../operations/system-tables/index.md), a [normal view](#normal), or a [materialized view](#materialized) will not trigger a live view. + - Only queries where one can combine partial result from the old data plus partial result from the new data will work. Live view will not work for queries that require the complete data set to compute the final result or aggregations where the state of the aggregation must be preserved. + - Does not work with replicated or distributed tables where inserts are performed on different nodes. + - Can't be triggered by multiple tables. -See [WITH REFRESH](#live-view-with-refresh) to force periodic updates of a live view that in some cases can be used as a workaround. -::: + See [WITH REFRESH](#live-view-with-refresh) to force periodic updates of a live view that in some cases can be used as a workaround. ### Monitoring Live View Changes {#live-view-monitoring} @@ -247,9 +246,9 @@ Most common uses of live view tables include: ## Window View [Experimental] {#window-view} -:::info -This is an experimental feature that may change in backwards-incompatible ways in the future releases. Enable usage of window views and `WATCH` query using [allow_experimental_window_view](../../../operations/settings/settings.md#allow-experimental-window-view) setting. Input the command `set allow_experimental_window_view = 1`. -::: +!!! important "Important" + This is an experimental feature that may change in backwards-incompatible ways in the future releases. + Enable usage of window views and `WATCH` query using [allow_experimental_window_view](../../../operations/settings/settings.md#allow-experimental-window-view) setting. Input the command `set allow_experimental_window_view = 1`. ``` sql CREATE WINDOW VIEW [IF NOT EXISTS] [db.]table_name [TO [db.]table_name] [ENGINE = engine] [WATERMARK = strategy] [ALLOWED_LATENESS = interval_function] AS SELECT ... 
GROUP BY time_window_function diff --git a/docs/en/sql-reference/statements/describe-table.md b/docs/en/sql-reference/statements/describe-table.md index 7fbe5bd2790..823a31ed313 100644 --- a/docs/en/sql-reference/statements/describe-table.md +++ b/docs/en/sql-reference/statements/describe-table.md @@ -1,6 +1,6 @@ --- -sidebar_position: 42 -sidebar_label: DESCRIBE +toc_priority: 42 +toc_title: DESCRIBE --- # DESCRIBE TABLE {#misc-describe-table} diff --git a/docs/en/sql-reference/statements/detach.md b/docs/en/sql-reference/statements/detach.md index bf20f7b3461..b77bcbc00fb 100644 --- a/docs/en/sql-reference/statements/detach.md +++ b/docs/en/sql-reference/statements/detach.md @@ -1,6 +1,6 @@ --- -sidebar_position: 43 -sidebar_label: DETACH +toc_priority: 43 +toc_title: DETACH --- # DETACH Statement {#detach} diff --git a/docs/en/sql-reference/statements/drop.md b/docs/en/sql-reference/statements/drop.md index 0d3e1f7860d..552a7b5f1a9 100644 --- a/docs/en/sql-reference/statements/drop.md +++ b/docs/en/sql-reference/statements/drop.md @@ -1,6 +1,6 @@ --- -sidebar_position: 44 -sidebar_label: DROP +toc_priority: 44 +toc_title: DROP --- # DROP Statements {#drop} diff --git a/docs/en/sql-reference/statements/exchange.md b/docs/en/sql-reference/statements/exchange.md index abe3d40950e..91b0c48ddcf 100644 --- a/docs/en/sql-reference/statements/exchange.md +++ b/docs/en/sql-reference/statements/exchange.md @@ -1,6 +1,6 @@ --- -sidebar_position: 49 -sidebar_label: EXCHANGE +toc_priority: 49 +toc_title: EXCHANGE --- # EXCHANGE Statement {#exchange} @@ -8,9 +8,8 @@ sidebar_label: EXCHANGE Exchanges the names of two tables or dictionaries atomically. This task can also be accomplished with a [RENAME](./rename.md) query using a temporary name, but the operation is not atomic in that case. -:::note -The `EXCHANGE` query is supported by the [Atomic](../../engines/database-engines/atomic.md) database engine only. -::: +!!! note "Note" + The `EXCHANGE` query is supported by the [Atomic](../../engines/database-engines/atomic.md) database engine only. **Syntax** diff --git a/docs/en/sql-reference/statements/exists.md b/docs/en/sql-reference/statements/exists.md index 7c6cc812665..b7c4a487791 100644 --- a/docs/en/sql-reference/statements/exists.md +++ b/docs/en/sql-reference/statements/exists.md @@ -1,6 +1,6 @@ --- -sidebar_position: 45 -sidebar_label: EXISTS +toc_priority: 45 +toc_title: EXISTS --- # EXISTS Statement {#exists-statement} diff --git a/docs/en/sql-reference/statements/explain.md b/docs/en/sql-reference/statements/explain.md index 80f8961a3e9..9c74c069f02 100644 --- a/docs/en/sql-reference/statements/explain.md +++ b/docs/en/sql-reference/statements/explain.md @@ -1,6 +1,6 @@ --- -sidebar_position: 39 -sidebar_label: EXPLAIN +toc_priority: 39 +toc_title: EXPLAIN --- # EXPLAIN Statement {#explain} @@ -138,9 +138,8 @@ Union ReadFromStorage (SystemNumbers) ``` -:::note -Step and query cost estimation is not supported. -::: +!!! note "Note" + Step and query cost estimation is not supported. When `json = 1`, the query plan is represented in JSON format. Every node is a dictionary that always has the keys `Node Type` and `Plans`. `Node Type` is a string with a step name. `Plans` is an array with child step descriptions. Other optional keys may be added depending on node type and settings. 
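A minimal sketch of the JSON output mode (the query itself is illustrative):

``` sql
-- json = 1 prints the plan as JSON; description = 0 drops the per-step description strings.
-- FORMAT TSVRaw keeps the JSON output unescaped.
EXPLAIN json = 1, description = 0 SELECT number FROM numbers(10) FORMAT TSVRaw;
```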
@@ -447,8 +446,8 @@ Result: └─────────────────────────────────────────────────────────┘ ``` -:::note -The validation is not complete, so a successfull query does not guarantee that the override would not cause issues. -::: +!!! note "Note" + The validation is not complete, so a successfull query does not guarantee that the override would + not cause issues. [Оriginal article](https://clickhouse.com/docs/en/sql-reference/statements/explain/) diff --git a/docs/en/sql-reference/statements/grant.md b/docs/en/sql-reference/statements/grant.md index 1ee330061b5..1b2b63ba0e7 100644 --- a/docs/en/sql-reference/statements/grant.md +++ b/docs/en/sql-reference/statements/grant.md @@ -1,6 +1,6 @@ --- -sidebar_position: 38 -sidebar_label: GRANT +toc_priority: 38 +toc_title: GRANT --- # GRANT Statement {#grant} diff --git a/docs/en/sql-reference/statements/index.md b/docs/en/sql-reference/statements/index.md index ab51cbb330c..a317e4a47de 100644 --- a/docs/en/sql-reference/statements/index.md +++ b/docs/en/sql-reference/statements/index.md @@ -1,9 +1,10 @@ --- -sidebar_position: 31 -sidebar_label: Statements +toc_folder_title: Statements +toc_hidden: true +toc_priority: 31 --- -# ClickHouse SQL Statements +# ClickHouse SQL Statements {#clickhouse-sql-statements} Statements represent various kinds of action you can perform using SQL queries. Each kind of statement has it’s own syntax and usage details that are described separately: diff --git a/docs/en/sql-reference/statements/insert-into.md b/docs/en/sql-reference/statements/insert-into.md index 17d6ce1809b..f8eefad7051 100644 --- a/docs/en/sql-reference/statements/insert-into.md +++ b/docs/en/sql-reference/statements/insert-into.md @@ -1,9 +1,9 @@ --- -sidebar_position: 33 -sidebar_label: INSERT INTO +toc_priority: 33 +toc_title: INSERT INTO --- -# INSERT INTO Statement +## INSERT INTO Statement {#insert} Inserts data into a table. diff --git a/docs/en/sql-reference/statements/kill.md b/docs/en/sql-reference/statements/kill.md index 9fe207f24b2..eab6f602c4a 100644 --- a/docs/en/sql-reference/statements/kill.md +++ b/docs/en/sql-reference/statements/kill.md @@ -1,6 +1,6 @@ --- -sidebar_position: 46 -sidebar_label: KILL +toc_priority: 46 +toc_title: KILL --- # KILL Statements {#kill-statements} diff --git a/docs/en/sql-reference/statements/misc.md b/docs/en/sql-reference/statements/misc.md index 2751c5296c2..c553ef37f8d 100644 --- a/docs/en/sql-reference/statements/misc.md +++ b/docs/en/sql-reference/statements/misc.md @@ -1,6 +1,6 @@ --- toc_hidden: true -sidebar_position: 70 +toc_priority: 41 --- # Miscellaneous Statements {#miscellaneous-queries} diff --git a/docs/en/sql-reference/statements/optimize.md b/docs/en/sql-reference/statements/optimize.md index 773284a1b30..30899cc2940 100644 --- a/docs/en/sql-reference/statements/optimize.md +++ b/docs/en/sql-reference/statements/optimize.md @@ -1,15 +1,14 @@ --- -sidebar_position: 47 -sidebar_label: OPTIMIZE +toc_priority: 47 +toc_title: OPTIMIZE --- # OPTIMIZE Statement {#misc_operations-optimize} This query tries to initialize an unscheduled merge of data parts for tables. -:::warning -`OPTIMIZE` can’t fix the `Too many parts` error. -::: +!!! warning "Warning" + `OPTIMIZE` can’t fix the `Too many parts` error. 
**Syntax** @@ -28,19 +27,16 @@ When `OPTIMIZE` is used with the [ReplicatedMergeTree](../../engines/table-engin You can specify how long (in seconds) to wait for inactive replicas to execute `OPTIMIZE` queries by the [replication_wait_for_inactive_replica_timeout](../../operations/settings/settings.md#replication-wait-for-inactive-replica-timeout) setting. -:::note -If the `replication_alter_partitions_sync` is set to `2` and some replicas are not active for more than the time, specified by the `replication_wait_for_inactive_replica_timeout` setting, then an exception `UNFINISHED` is thrown. -::: +!!! info "Note" + If the `replication_alter_partitions_sync` is set to `2` and some replicas are not active for more than the time, specified by the `replication_wait_for_inactive_replica_timeout` setting, then an exception `UNFINISHED` is thrown. ## BY expression {#by-expression} If you want to perform deduplication on custom set of columns rather than on all, you can specify list of columns explicitly or use any combination of [`*`](../../sql-reference/statements/select/index.md#asterisk), [`COLUMNS`](../../sql-reference/statements/select/index.md#columns-expression) or [`EXCEPT`](../../sql-reference/statements/select/index.md#except-modifier) expressions. The explictly written or implicitly expanded list of columns must include all columns specified in row ordering expression (both primary and sorting keys) and partitioning expression (partitioning key). -:::note -Notice that `*` behaves just like in `SELECT`: [MATERIALIZED](../../sql-reference/statements/create/table.md#materialized) and [ALIAS](../../sql-reference/statements/create/table.md#alias) columns are not used for expansion. - -Also, it is an error to specify empty list of columns, or write an expression that results in an empty list of columns, or deduplicate by an `ALIAS` column. -::: +!!! note "Note" + Notice that `*` behaves just like in `SELECT`: [MATERIALIZED](../../sql-reference/statements/create/table.md#materialized) and [ALIAS](../../sql-reference/statements/create/table.md#alias) columns are not used for expansion. + Also, it is an error to specify empty list of columns, or write an expression that results in an empty list of columns, or deduplicate by an `ALIAS` column. **Syntax** diff --git a/docs/en/sql-reference/statements/rename.md b/docs/en/sql-reference/statements/rename.md index b3bea3e3c37..c2192f1a6e1 100644 --- a/docs/en/sql-reference/statements/rename.md +++ b/docs/en/sql-reference/statements/rename.md @@ -1,6 +1,6 @@ --- -sidebar_position: 48 -sidebar_label: RENAME +toc_priority: 48 +toc_title: RENAME --- # RENAME Statement {#misc_operations-rename} @@ -8,9 +8,8 @@ sidebar_label: RENAME Renames databases, tables, or dictionaries. Several entities can be renamed in a single query. Note that the `RENAME` query with several entities is non-atomic operation. To swap entities names atomically, use the [EXCHANGE](./exchange.md) statement. -:::note -The `RENAME` query is supported by the [Atomic](../../engines/database-engines/atomic.md) database engine only. -::: +!!! note "Note" + The `RENAME` query is supported by the [Atomic](../../engines/database-engines/atomic.md) database engine only. 
**Syntax** diff --git a/docs/en/sql-reference/statements/revoke.md b/docs/en/sql-reference/statements/revoke.md index 4ffa8a21027..75005260c4a 100644 --- a/docs/en/sql-reference/statements/revoke.md +++ b/docs/en/sql-reference/statements/revoke.md @@ -1,6 +1,6 @@ --- -sidebar_position: 39 -sidebar_label: REVOKE +toc_priority: 39 +toc_title: REVOKE --- # REVOKE Statement {#revoke} diff --git a/docs/en/sql-reference/statements/select/all.md b/docs/en/sql-reference/statements/select/all.md index 6b35678fd92..ba66f63b447 100644 --- a/docs/en/sql-reference/statements/select/all.md +++ b/docs/en/sql-reference/statements/select/all.md @@ -1,5 +1,5 @@ --- -sidebar_label: ALL +toc_title: ALL --- # ALL Clause {#select-all} diff --git a/docs/en/sql-reference/statements/select/array-join.md b/docs/en/sql-reference/statements/select/array-join.md index f7fc08ae9ba..f138bcc45c7 100644 --- a/docs/en/sql-reference/statements/select/array-join.md +++ b/docs/en/sql-reference/statements/select/array-join.md @@ -1,5 +1,5 @@ --- -sidebar_label: ARRAY JOIN +toc_title: ARRAY JOIN --- # ARRAY JOIN Clause {#select-array-join-clause} diff --git a/docs/en/sql-reference/statements/select/distinct.md b/docs/en/sql-reference/statements/select/distinct.md index 898de4730ae..390afa46248 100644 --- a/docs/en/sql-reference/statements/select/distinct.md +++ b/docs/en/sql-reference/statements/select/distinct.md @@ -1,5 +1,5 @@ --- -sidebar_label: DISTINCT +toc_title: DISTINCT --- # DISTINCT Clause {#select-distinct} diff --git a/docs/en/sql-reference/statements/select/except.md b/docs/en/sql-reference/statements/select/except.md index dcaefd67ca9..e6d9b365a91 100644 --- a/docs/en/sql-reference/statements/select/except.md +++ b/docs/en/sql-reference/statements/select/except.md @@ -1,5 +1,5 @@ --- -sidebar_label: EXCEPT +toc_title: EXCEPT --- # EXCEPT Clause {#except-clause} diff --git a/docs/en/sql-reference/statements/select/format.md b/docs/en/sql-reference/statements/select/format.md index a7936509ad5..c3104bd12fe 100644 --- a/docs/en/sql-reference/statements/select/format.md +++ b/docs/en/sql-reference/statements/select/format.md @@ -1,5 +1,5 @@ --- -sidebar_label: FORMAT +toc_title: FORMAT --- # FORMAT Clause {#format-clause} diff --git a/docs/en/sql-reference/statements/select/from.md b/docs/en/sql-reference/statements/select/from.md index 9d5147db13c..df30a0fb0d2 100644 --- a/docs/en/sql-reference/statements/select/from.md +++ b/docs/en/sql-reference/statements/select/from.md @@ -1,5 +1,5 @@ --- -sidebar_label: FROM +toc_title: FROM --- # FROM Clause {#select-from} diff --git a/docs/en/sql-reference/statements/select/group-by.md b/docs/en/sql-reference/statements/select/group-by.md index b08647271f1..969a39ce51f 100644 --- a/docs/en/sql-reference/statements/select/group-by.md +++ b/docs/en/sql-reference/statements/select/group-by.md @@ -1,5 +1,5 @@ --- -sidebar_label: GROUP BY +toc_title: GROUP BY --- # GROUP BY Clause {#select-group-by-clause} @@ -12,9 +12,8 @@ sidebar_label: GROUP BY When you want to group data in the table by column numbers instead of column names, enable the setting [enable_positional_arguments](../../../operations/settings/settings.md#enable-positional-arguments). -:::note -There’s an additional way to run aggregation over a table. If a query contains table columns only inside aggregate functions, the `GROUP BY clause` can be omitted, and aggregation by an empty set of keys is assumed. Such queries always return exactly one row. -::: +!!! 
note "Note" + There’s an additional way to run aggregation over a table. If a query contains table columns only inside aggregate functions, the `GROUP BY clause` can be omitted, and aggregation by an empty set of keys is assumed. Such queries always return exactly one row. ## NULL Processing {#null-processing} @@ -56,9 +55,8 @@ The subtotals are calculated in the reverse order: at first subtotals are calcul In the subtotals rows the values of already "grouped" key expressions are set to `0` or empty line. -:::note -Mind that [HAVING](../../../sql-reference/statements/select/having.md) clause can affect the subtotals results. -::: +!!! note "Note" + Mind that [HAVING](../../../sql-reference/statements/select/having.md) clause can affect the subtotals results. **Example** @@ -116,9 +114,8 @@ As `GROUP BY` section has three key expressions, the result contains four tables In the subtotals rows the values of all "grouped" key expressions are set to `0` or empty line. -:::note -Mind that [HAVING](../../../sql-reference/statements/select/having.md) clause can affect the subtotals results. -::: +!!! note "Note" + Mind that [HAVING](../../../sql-reference/statements/select/having.md) clause can affect the subtotals results. **Example** @@ -209,9 +206,8 @@ This extra row is only produced in `JSON*`, `TabSeparated*`, and `Pretty*` forma - In `Pretty*` formats, the row is output as a separate table after the main result. - In the other formats it is not available. -:::note -totals is output in the results of `SELECT` queries, and is not output in `INSERT INTO ... SELECT`. -::: +!!! note "Note" + totals is output in the results of `SELECT` queries, and is not output in `INSERT INTO ... SELECT`. `WITH TOTALS` can be run in different ways when [HAVING](../../../sql-reference/statements/select/having.md) is present. The behavior depends on the `totals_mode` setting. diff --git a/docs/en/sql-reference/statements/select/having.md b/docs/en/sql-reference/statements/select/having.md index 9aee0cf4d63..93d56097b11 100644 --- a/docs/en/sql-reference/statements/select/having.md +++ b/docs/en/sql-reference/statements/select/having.md @@ -1,5 +1,5 @@ --- -sidebar_label: HAVING +toc_title: HAVING --- # HAVING Clause {#having-clause} diff --git a/docs/en/sql-reference/statements/select/index.md b/docs/en/sql-reference/statements/select/index.md index 50dd8fecf3a..33644133153 100644 --- a/docs/en/sql-reference/statements/select/index.md +++ b/docs/en/sql-reference/statements/select/index.md @@ -1,9 +1,11 @@ --- -sidebar_position: 32 -sidebar_label: SELECT +title: SELECT Query +toc_folder_title: SELECT +toc_priority: 32 +toc_title: Overview --- -# SELECT Query +# SELECT Query {#select-queries-syntax} `SELECT` queries perform data retrieval. By default, the requested data is returned to the client, while in conjunction with [INSERT INTO](../../../sql-reference/statements/insert-into.md) it can be forwarded to a different table. 
diff --git a/docs/en/sql-reference/statements/select/intersect.md b/docs/en/sql-reference/statements/select/intersect.md index ef9868daebb..2243a35e4d8 100644 --- a/docs/en/sql-reference/statements/select/intersect.md +++ b/docs/en/sql-reference/statements/select/intersect.md @@ -1,5 +1,5 @@ --- -sidebar_label: INTERSECT +toc_title: INTERSECT --- # INTERSECT Clause {#intersect-clause} diff --git a/docs/en/sql-reference/statements/select/into-outfile.md b/docs/en/sql-reference/statements/select/into-outfile.md index b37285cb0cc..b949b9c83c0 100644 --- a/docs/en/sql-reference/statements/select/into-outfile.md +++ b/docs/en/sql-reference/statements/select/into-outfile.md @@ -1,8 +1,8 @@ --- -sidebar_label: INTO OUTFILE +toc_title: INTO OUTFILE --- -# INTO OUTFILE Clause +# INTO OUTFILE Clause {#into-outfile-clause} `INTO OUTFILE` clause redirects the result of a `SELECT` query to a file on the **client** side. diff --git a/docs/en/sql-reference/statements/select/join.md b/docs/en/sql-reference/statements/select/join.md index 0cf58d0b90f..3d302be561a 100644 --- a/docs/en/sql-reference/statements/select/join.md +++ b/docs/en/sql-reference/statements/select/join.md @@ -1,5 +1,5 @@ --- -sidebar_label: JOIN +toc_title: JOIN --- # JOIN Clause {#select-join} @@ -36,9 +36,8 @@ Additional join types available in ClickHouse: - `LEFT ANY JOIN`, `RIGHT ANY JOIN` and `INNER ANY JOIN`, partially (for opposite side of `LEFT` and `RIGHT`) or completely (for `INNER` and `FULL`) disables the cartesian product for standard `JOIN` types. - `ASOF JOIN` and `LEFT ASOF JOIN`, joining sequences with a non-exact match. `ASOF JOIN` usage is described below. -:::note -When [join_algorithm](../../../operations/settings/settings.md#settings-join_algorithm) is set to `partial_merge`, `RIGHT JOIN` and `FULL JOIN` are supported only with `ALL` strictness (`SEMI`, `ANTI`, `ANY`, and `ASOF` are not supported). -::: +!!! note "Note" + When [join_algorithm](../../../operations/settings/settings.md#settings-join_algorithm) is set to `partial_merge`, `RIGHT JOIN` and `FULL JOIN` are supported only with `ALL` strictness (`SEMI`, `ANTI`, `ANY`, and `ASOF` are not supported). ## Settings {#join-settings} @@ -64,9 +63,8 @@ Rows are joined if the whole complex condition is met. If the conditions are not The `OR` operator inside the `ON` clause works using the hash join algorithm — for each `OR` argument with join keys for `JOIN`, a separate hash table is created, so memory consumption and query execution time grow linearly with an increase in the number of expressions `OR` of the `ON` clause. -:::note -If a condition refers columns from different tables, then only the equality operator (`=`) is supported so far. -::: +!!! note "Note" + If a condition refers columns from different tables, then only the equality operator (`=`) is supported so far. **Example** @@ -199,9 +197,8 @@ For example, consider the following tables: `ASOF JOIN` can take the timestamp of a user event from `table_1` and find an event in `table_2` where the timestamp is closest to the timestamp of the event from `table_1` corresponding to the closest match condition. Equal timestamp values are the closest if available. Here, the `user_id` column can be used for joining on equality and the `ev_time` column can be used for joining on the closest match. In our example, `event_1_1` can be joined with `event_2_1` and `event_1_2` can be joined with `event_2_3`, but `event_2_2` can’t be joined. 
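A minimal sketch of that closest-match join, reusing the `table_1`/`table_2`, `user_id` and `ev_time` names from the example above (the selected column list is an assumption):

``` sql
-- For each row of table_1, take the table_2 row with the largest ev_time not exceeding table_1.ev_time.
SELECT table_1.user_id, table_1.ev_time, table_2.ev_time AS matched_time
FROM table_1
ASOF JOIN table_2
    ON table_1.user_id = table_2.user_id
    AND table_1.ev_time >= table_2.ev_time;
```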
-:::note -`ASOF` join is **not** supported in the [Join](../../../engines/table-engines/special/join.md) table engine. -::: +!!! note "Note" + `ASOF` join is **not** supported in the [Join](../../../engines/table-engines/special/join.md) table engine. ## Distributed JOIN {#global-join} diff --git a/docs/en/sql-reference/statements/select/limit-by.md b/docs/en/sql-reference/statements/select/limit-by.md index 913b7b40338..68b459a46e8 100644 --- a/docs/en/sql-reference/statements/select/limit-by.md +++ b/docs/en/sql-reference/statements/select/limit-by.md @@ -1,5 +1,5 @@ --- -sidebar_label: LIMIT BY +toc_title: LIMIT BY --- # LIMIT BY Clause {#limit-by-clause} @@ -13,9 +13,8 @@ ClickHouse supports the following syntax variants: During query processing, ClickHouse selects data ordered by sorting key. The sorting key is set explicitly using an [ORDER BY](order-by.md#select-order-by) clause or implicitly as a property of the table engine (row order is only guaranteed when using [ORDER BY](order-by.md#select-order-by), otherwise the row blocks will not be ordered due to multi-threading). Then ClickHouse applies `LIMIT n BY expressions` and returns the first `n` rows for each distinct combination of `expressions`. If `OFFSET` is specified, then for each data block that belongs to a distinct combination of `expressions`, ClickHouse skips `offset_value` number of rows from the beginning of the block and returns a maximum of `n` rows as a result. If `offset_value` is bigger than the number of rows in the data block, ClickHouse returns zero rows from the block. -:::note -`LIMIT BY` is not related to [LIMIT](../../../sql-reference/statements/select/limit.md). They can both be used in the same query. -::: +!!! note "Note" + `LIMIT BY` is not related to [LIMIT](../../../sql-reference/statements/select/limit.md). They can both be used in the same query. If you want to use column numbers instead of column names in the `LIMIT BY` clause, enable the setting [enable_positional_arguments](../../../operations/settings/settings.md#enable-positional-arguments). diff --git a/docs/en/sql-reference/statements/select/limit.md b/docs/en/sql-reference/statements/select/limit.md index 6b1c90041fe..6ed38b2dd64 100644 --- a/docs/en/sql-reference/statements/select/limit.md +++ b/docs/en/sql-reference/statements/select/limit.md @@ -1,5 +1,5 @@ --- -sidebar_label: LIMIT +toc_title: LIMIT --- # LIMIT Clause {#limit-clause} @@ -12,9 +12,8 @@ sidebar_label: LIMIT If there is no [ORDER BY](../../../sql-reference/statements/select/order-by.md) clause that explicitly sorts results, the choice of rows for the result may be arbitrary and non-deterministic. -:::note -The number of rows in the result set can also depend on the [limit](../../../operations/settings/settings.md#limit) setting. -::: +!!! note "Note" + The number of rows in the result set can also depend on the [limit](../../../operations/settings/settings.md#limit) setting. 
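Since `LIMIT BY` and `LIMIT` are independent and may appear in the same query (as noted above), here is a short sketch over a hypothetical `page_views(domain, page, hits)` table:

``` sql
-- At most 2 top pages per domain, and no more than 10 rows in total.
SELECT domain, page, hits
FROM page_views
ORDER BY domain ASC, hits DESC
LIMIT 2 BY domain
LIMIT 10;
```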
## LIMIT … WITH TIES Modifier {#limit-with-ties} diff --git a/docs/en/sql-reference/statements/select/offset.md b/docs/en/sql-reference/statements/select/offset.md index e120845dbc6..20ebd972a24 100644 --- a/docs/en/sql-reference/statements/select/offset.md +++ b/docs/en/sql-reference/statements/select/offset.md @@ -1,5 +1,5 @@ --- -sidebar_label: OFFSET +toc_title: OFFSET --- # OFFSET FETCH Clause {#offset-fetch} @@ -30,13 +30,11 @@ SELECT * FROM test_fetch ORDER BY a LIMIT 3 OFFSET 1; The `WITH TIES` option is used to return any additional rows that tie for the last place in the result set according to the `ORDER BY` clause. For example, if `fetch_row_count` is set to 5 but two additional rows match the values of the `ORDER BY` columns in the fifth row, the result set will contain seven rows. -:::note -According to the standard, the `OFFSET` clause must come before the `FETCH` clause if both are present. -::: +!!! note "Note" + According to the standard, the `OFFSET` clause must come before the `FETCH` clause if both are present. -:::note -The real offset can also depend on the [offset](../../../operations/settings/settings.md#offset) setting. -::: +!!! note "Note" + The real offset can also depend on the [offset](../../../operations/settings/settings.md#offset) setting. ## Examples {#examples} diff --git a/docs/en/sql-reference/statements/select/order-by.md b/docs/en/sql-reference/statements/select/order-by.md index 46e483dddf4..b24f0213e4e 100644 --- a/docs/en/sql-reference/statements/select/order-by.md +++ b/docs/en/sql-reference/statements/select/order-by.md @@ -1,5 +1,5 @@ --- -sidebar_label: ORDER BY +toc_title: ORDER BY --- # ORDER BY Clause {#select-order-by} diff --git a/docs/en/sql-reference/statements/select/prewhere.md b/docs/en/sql-reference/statements/select/prewhere.md index c3aa2e14384..646bb83e692 100644 --- a/docs/en/sql-reference/statements/select/prewhere.md +++ b/docs/en/sql-reference/statements/select/prewhere.md @@ -1,5 +1,5 @@ --- -sidebar_label: PREWHERE +toc_title: PREWHERE --- # PREWHERE Clause {#prewhere-clause} @@ -18,9 +18,8 @@ If the [optimize_move_to_prewhere](../../../operations/settings/settings.md#opti If query has [FINAL](from.md#select-from-final) modifier, the `PREWHERE` optimization is not always correct. It is enabled only if both settings [optimize_move_to_prewhere](../../../operations/settings/settings.md#optimize_move_to_prewhere) and [optimize_move_to_prewhere_if_final](../../../operations/settings/settings.md#optimize_move_to_prewhere_if_final) are turned on. -:::note -The `PREWHERE` section is executed before `FINAL`, so the results of `FROM ... FINAL` queries may be skewed when using `PREWHERE` with fields not in the `ORDER BY` section of a table. -::: +!!! note "Attention" + The `PREWHERE` section is executed before `FINAL`, so the results of `FROM ... FINAL` queries may be skewed when using `PREWHERE` with fields not in the `ORDER BY` section of a table. ## Limitations {#limitations} diff --git a/docs/en/sql-reference/statements/select/sample.md b/docs/en/sql-reference/statements/select/sample.md index 3673a49a9e9..2405cb0a03c 100644 --- a/docs/en/sql-reference/statements/select/sample.md +++ b/docs/en/sql-reference/statements/select/sample.md @@ -1,5 +1,5 @@ --- -sidebar_label: SAMPLE +toc_title: SAMPLE --- # SAMPLE Clause {#select-sample-clause} @@ -14,9 +14,8 @@ Approximated query processing can be useful in the following cases: - When your raw data is not accurate, so approximation does not noticeably degrade the quality. 
- Business requirements target approximate results (for cost-effectiveness, or to market exact results to premium users). -:::note -You can only use sampling with the tables in the [MergeTree](../../../engines/table-engines/mergetree-family/mergetree.md) family, and only if the sampling expression was specified during table creation (see [MergeTree engine](../../../engines/table-engines/mergetree-family/mergetree.md#table_engine-mergetree-creating-a-table)). -::: +!!! note "Note" + You can only use sampling with the tables in the [MergeTree](../../../engines/table-engines/mergetree-family/mergetree.md) family, and only if the sampling expression was specified during table creation (see [MergeTree engine](../../../engines/table-engines/mergetree-family/mergetree.md#table_engine-mergetree-creating-a-table)). The features of data sampling are listed below: @@ -26,12 +25,11 @@ The features of data sampling are listed below: For the `SAMPLE` clause the following syntax is supported: -| SAMPLE Clause Syntax | Description | -|----------------------|------------------------------| -| `SAMPLE k` | Here `k` is the number from 0 to 1. The query is executed on `k` fraction of data. For example, `SAMPLE 0.1` runs the query on 10% of data. [Read more](#select-sample-k) | -| `SAMPLE n` | Here `n` is a sufficiently large integer. The query is executed on a sample of at least `n` rows (but not significantly more than this). For example, `SAMPLE 10000000` runs the query on a minimum of 10,000,000 rows. [Read more](#select-sample-n) | -| `SAMPLE k OFFSET m` | Here `k` and `m` are the numbers from 0 to 1. The query is executed on a sample of `k` fraction of the data. The data used for the sample is offset by `m` fraction. [Read more](#select-sample-offset) | - +| SAMPLE Clause Syntax | Description | +|----------------------|-----------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------| +| `SAMPLE k` | Here `k` is the number from 0 to 1.
The query is executed on `k` fraction of data. For example, `SAMPLE 0.1` runs the query on 10% of data. [Read more](#select-sample-k) | +| `SAMPLE n` | Here `n` is a sufficiently large integer.
The query is executed on a sample of at least `n` rows (but not significantly more than this). For example, `SAMPLE 10000000` runs the query on a minimum of 10,000,000 rows. [Read more](#select-sample-n) | +| `SAMPLE k OFFSET m` | Here `k` and `m` are the numbers from 0 to 1.
The query is executed on a sample of `k` fraction of the data. The data used for the sample is offset by `m` fraction. [Read more](#select-sample-offset) | ## SAMPLE K {#select-sample-k} diff --git a/docs/en/sql-reference/statements/select/union.md b/docs/en/sql-reference/statements/select/union.md index 8a1c7a770c9..6dfe554edf0 100644 --- a/docs/en/sql-reference/statements/select/union.md +++ b/docs/en/sql-reference/statements/select/union.md @@ -1,5 +1,5 @@ --- -sidebar_label: UNION +toc_title: UNION --- # UNION Clause {#union-clause} diff --git a/docs/en/sql-reference/statements/select/where.md b/docs/en/sql-reference/statements/select/where.md index c68f9d39d09..348b869e2db 100644 --- a/docs/en/sql-reference/statements/select/where.md +++ b/docs/en/sql-reference/statements/select/where.md @@ -1,5 +1,5 @@ --- -sidebar_label: WHERE +toc_title: WHERE --- # WHERE Clause {#select-where} @@ -10,9 +10,8 @@ If there is a `WHERE` clause, it must contain an expression with the `UInt8` typ `WHERE` expression is evaluated on the ability to use indexes and partition pruning, if the underlying table engine supports that. -:::note -There is a filtering optimization called [PREWHERE](../../../sql-reference/statements/select/prewhere.md). -::: +!!! note "Note" + There is a filtering optimization called [PREWHERE](../../../sql-reference/statements/select/prewhere.md). If you need to test a value for [NULL](../../../sql-reference/syntax.md#null-literal), use [IS NULL](../../operators/index.md#operator-is-null) and [IS NOT NULL](../../operators/index.md#is-not-null) operators or [isNull](../../../sql-reference/functions/functions-for-nulls.md#isnull) and [isNotNull](../../../sql-reference/functions/functions-for-nulls.md#isnotnull) functions. Otherwise an expression with `NULL` never passes. diff --git a/docs/en/sql-reference/statements/select/with.md b/docs/en/sql-reference/statements/select/with.md index 39fcb752980..d6c8da261cb 100644 --- a/docs/en/sql-reference/statements/select/with.md +++ b/docs/en/sql-reference/statements/select/with.md @@ -1,5 +1,5 @@ --- -sidebar_label: WITH +toc_title: WITH --- # WITH Clause {#with-clause} diff --git a/docs/en/sql-reference/statements/set-role.md b/docs/en/sql-reference/statements/set-role.md index cac7ca28b92..cf14a9c6d75 100644 --- a/docs/en/sql-reference/statements/set-role.md +++ b/docs/en/sql-reference/statements/set-role.md @@ -1,6 +1,6 @@ --- -sidebar_position: 51 -sidebar_label: SET ROLE +toc_priority: 51 +toc_title: SET ROLE --- # SET ROLE Statement {#set-role-statement} diff --git a/docs/en/sql-reference/statements/set.md b/docs/en/sql-reference/statements/set.md index d2a1d30c797..e5de5c41284 100644 --- a/docs/en/sql-reference/statements/set.md +++ b/docs/en/sql-reference/statements/set.md @@ -1,6 +1,6 @@ --- -sidebar_position: 50 -sidebar_label: SET +toc_priority: 50 +toc_title: SET --- # SET Statement {#query-set} diff --git a/docs/en/sql-reference/statements/show.md b/docs/en/sql-reference/statements/show.md index 75c5c121946..96cbee0b04d 100644 --- a/docs/en/sql-reference/statements/show.md +++ b/docs/en/sql-reference/statements/show.md @@ -1,6 +1,6 @@ --- -sidebar_position: 37 -sidebar_label: SHOW +toc_priority: 37 +toc_title: SHOW --- # SHOW Statements {#show-queries} @@ -361,9 +361,8 @@ SHOW ACCESS Returns a list of clusters. All available clusters are listed in the [system.clusters](../../operations/system-tables/clusters.md) table. -:::note -`SHOW CLUSTER name` query displays the contents of system.clusters table for this cluster. -::: +!!! 
info "Note" + `SHOW CLUSTER name` query displays the contents of system.clusters table for this cluster. ### Syntax {#show-cluster-syntax} diff --git a/docs/en/sql-reference/statements/system.md b/docs/en/sql-reference/statements/system.md index 1d638ab3965..b71853f29dd 100644 --- a/docs/en/sql-reference/statements/system.md +++ b/docs/en/sql-reference/statements/system.md @@ -1,6 +1,6 @@ --- -sidebar_position: 36 -sidebar_label: SYSTEM +toc_priority: 36 +toc_title: SYSTEM --- # SYSTEM Statements {#query-language-system} @@ -67,7 +67,7 @@ SELECT name, status FROM system.dictionaries; ## RELOAD MODELS {#query_language-system-reload-models} -Reloads all [CatBoost](../../../guides/developer/apply-catboost-model.md) models if the configuration was updated without restarting the server. +Reloads all [CatBoost](../../guides/apply-catboost-model.md#applying-catboost-model-in-clickhouse) models if the configuration was updated without restarting the server. **Syntax** @@ -191,9 +191,8 @@ Provides possibility to stop background merges for tables in the MergeTree famil SYSTEM STOP MERGES [ON VOLUME | [db.]merge_tree_family_table_name] ``` -:::note -`DETACH / ATTACH` table will start background merges for the table even in case when merges have been stopped for all MergeTree tables before. -::: +!!! note "Note" + `DETACH / ATTACH` table will start background merges for the table even in case when merges have been stopped for all MergeTree tables before. ### START MERGES {#query_language-system-start-merges} @@ -327,9 +326,8 @@ One may execute query after: Replica attaches locally found parts and sends info about them to Zookeeper. Parts present on a replica before metadata loss are not re-fetched from other ones if not being outdated (so replica restoration does not mean re-downloading all data over the network). -:::warning -Parts in all states are moved to `detached/` folder. Parts active before data loss (committed) are attached. -::: +!!! warning "Warning" + Parts in all states are moved to `detached/` folder. Parts active before data loss (committed) are attached. **Syntax** diff --git a/docs/en/sql-reference/statements/truncate.md b/docs/en/sql-reference/statements/truncate.md index 393ba82b3cd..b5354196fa4 100644 --- a/docs/en/sql-reference/statements/truncate.md +++ b/docs/en/sql-reference/statements/truncate.md @@ -1,6 +1,6 @@ --- -sidebar_position: 52 -sidebar_label: TRUNCATE +toc_priority: 52 +toc_title: TRUNCATE --- # TRUNCATE Statement {#truncate-statement} @@ -17,6 +17,5 @@ You can use the [replication_alter_partitions_sync](../../operations/settings/se You can specify how long (in seconds) to wait for inactive replicas to execute `TRUNCATE` queries with the [replication_wait_for_inactive_replica_timeout](../../operations/settings/settings.md#replication-wait-for-inactive-replica-timeout) setting. -:::note -If the `replication_alter_partitions_sync` is set to `2` and some replicas are not active for more than the time, specified by the `replication_wait_for_inactive_replica_timeout` setting, then an exception `UNFINISHED` is thrown. -::: \ No newline at end of file +!!! info "Note" + If the `replication_alter_partitions_sync` is set to `2` and some replicas are not active for more than the time, specified by the `replication_wait_for_inactive_replica_timeout` setting, then an exception `UNFINISHED` is thrown. 
diff --git a/docs/en/sql-reference/statements/use.md b/docs/en/sql-reference/statements/use.md index 869bf44fdeb..841c23d333d 100644 --- a/docs/en/sql-reference/statements/use.md +++ b/docs/en/sql-reference/statements/use.md @@ -1,6 +1,6 @@ --- -sidebar_position: 53 -sidebar_label: USE +toc_priority: 53 +toc_title: USE --- # USE Statement {#use} diff --git a/docs/en/sql-reference/statements/watch.md b/docs/en/sql-reference/statements/watch.md index 688cf21e23c..be793d30f3d 100644 --- a/docs/en/sql-reference/statements/watch.md +++ b/docs/en/sql-reference/statements/watch.md @@ -1,13 +1,14 @@ --- -sidebar_position: 53 -sidebar_label: WATCH +toc_priority: 53 +toc_title: WATCH --- # WATCH Statement (Experimental) {#watch} -:::warning -This is an experimental feature that may change in backwards-incompatible ways in the future releases. Enable live views and `WATCH` query using `set allow_experimental_live_view = 1`. -::: +!!! important "Important" + This is an experimental feature that may change in backwards-incompatible ways in the future releases. + Enable live views and `WATCH` query using `set allow_experimental_live_view = 1`. + ``` sql WATCH [db.]live_view @@ -104,6 +105,5 @@ WATCH lv EVENTS LIMIT 1; The `FORMAT` clause works the same way as for the [SELECT](../../sql-reference/statements/select/format.md#format-clause). -:::note -The [JSONEachRowWithProgress](../../interfaces/formats.md#jsoneachrowwithprogress) format should be used when watching [LIVE VIEW](./create/view.md#live-view) tables over the HTTP interface. The progress messages will be added to the output to keep the long-lived HTTP connection alive until the query result changes. The interval between progress messages is controlled using the [live_view_heartbeat_interval](./create/view.md#live-view-settings) setting. -::: \ No newline at end of file +!!! info "Note" + The [JSONEachRowWithProgress](../../interfaces/formats.md#jsoneachrowwithprogress) format should be used when watching [LIVE VIEW](./create/view.md#live-view) tables over the HTTP interface. The progress messages will be added to the output to keep the long-lived HTTP connection alive until the query result changes. The interval between progress messages is controlled using the [live_view_heartbeat_interval](./create/view.md#live-view-settings) setting. diff --git a/docs/en/sql-reference/syntax.md b/docs/en/sql-reference/syntax.md index 10664549329..19efef3dc6a 100644 --- a/docs/en/sql-reference/syntax.md +++ b/docs/en/sql-reference/syntax.md @@ -1,6 +1,6 @@ --- -sidebar_position: 31 -sidebar_label: Syntax +toc_priority: 31 +toc_title: Syntax --- # Syntax {#syntax} diff --git a/docs/en/sql-reference/table-functions/cluster.md b/docs/en/sql-reference/table-functions/cluster.md index 5954ed1b439..a02c2a10fb7 100644 --- a/docs/en/sql-reference/table-functions/cluster.md +++ b/docs/en/sql-reference/table-functions/cluster.md @@ -1,6 +1,6 @@ --- -sidebar_position: 50 -sidebar_label: cluster +toc_priority: 50 +toc_title: cluster --- # cluster, clusterAllReplicas {#cluster-clusterallreplicas} @@ -9,9 +9,8 @@ Allows to access all shards in an existing cluster which configured in `remote_s `clusterAllReplicas` function — same as `cluster`, but all replicas are queried. Each replica in a cluster is used as a separate shard/connection. -:::note -All available clusters are listed in the [system.clusters](../../operations/system-tables/clusters.md) table. -::: +!!! 
note "Note" + All available clusters are listed in the [system.clusters](../../operations/system-tables/clusters.md) table. **Syntax** diff --git a/docs/en/sql-reference/table-functions/dictionary.md b/docs/en/sql-reference/table-functions/dictionary.md index f04a4b6eb24..ad30cb30adf 100644 --- a/docs/en/sql-reference/table-functions/dictionary.md +++ b/docs/en/sql-reference/table-functions/dictionary.md @@ -1,6 +1,6 @@ --- -sidebar_position: 54 -sidebar_label: dictionary function +toc_priority: 54 +toc_title: dictionary function --- # dictionary {#dictionary-function} diff --git a/docs/en/sql-reference/table-functions/file.md b/docs/en/sql-reference/table-functions/file.md index 4b72b0d84f5..f7c2a9e6d5b 100644 --- a/docs/en/sql-reference/table-functions/file.md +++ b/docs/en/sql-reference/table-functions/file.md @@ -1,6 +1,6 @@ --- -sidebar_position: 37 -sidebar_label: file +toc_priority: 37 +toc_title: file --- # file {#file} @@ -106,9 +106,8 @@ Query the number of rows in all files of these two directories: SELECT count(*) FROM file('{some,another}_dir/*', 'TSV', 'name String, value UInt32'); ``` -:::warning -If your listing of files contains number ranges with leading zeros, use the construction with braces for each digit separately or use `?`. -::: +!!! warning "Warning" + If your listing of files contains number ranges with leading zeros, use the construction with braces for each digit separately or use `?`. **Example** diff --git a/docs/en/sql-reference/table-functions/generate.md b/docs/en/sql-reference/table-functions/generate.md index bb9ad3f7551..ae22e1a1b88 100644 --- a/docs/en/sql-reference/table-functions/generate.md +++ b/docs/en/sql-reference/table-functions/generate.md @@ -1,6 +1,6 @@ --- -sidebar_position: 47 -sidebar_label: generateRandom +toc_priority: 47 +toc_title: generateRandom --- # generateRandom {#generaterandom} diff --git a/docs/en/sql-reference/table-functions/hdfs.md b/docs/en/sql-reference/table-functions/hdfs.md index 7f7dc53d27e..a7c3baca299 100644 --- a/docs/en/sql-reference/table-functions/hdfs.md +++ b/docs/en/sql-reference/table-functions/hdfs.md @@ -1,6 +1,6 @@ --- -sidebar_position: 45 -sidebar_label: hdfs +toc_priority: 45 +toc_title: hdfs --- # hdfs {#hdfs} @@ -78,9 +78,8 @@ SELECT count(*) FROM hdfs('hdfs://hdfs1:9000/{some,another}_dir/*', 'TSV', 'name String, value UInt32') ``` -:::warning -If your listing of files contains number ranges with leading zeros, use the construction with braces for each digit separately or use `?`. -::: +!!! warning "Warning" + If your listing of files contains number ranges with leading zeros, use the construction with braces for each digit separately or use `?`. **Example** diff --git a/docs/en/sql-reference/table-functions/hdfsCluster.md b/docs/en/sql-reference/table-functions/hdfsCluster.md index b46b8e64a1a..6183fe83c38 100644 --- a/docs/en/sql-reference/table-functions/hdfsCluster.md +++ b/docs/en/sql-reference/table-functions/hdfsCluster.md @@ -1,6 +1,6 @@ --- -sidebar_position: 55 -sidebar_label: hdfsCluster +toc_priority: 55 +toc_title: hdfsCluster --- # hdfsCluster Table Function {#hdfsCluster-table-function} @@ -49,9 +49,8 @@ SELECT count(*) FROM hdfsCluster('cluster_simple', 'hdfs://hdfs1:9000/{some,another}_dir/*', 'TSV', 'name String, value UInt32') ``` -:::warning -If your listing of files contains number ranges with leading zeros, use the construction with braces for each digit separately or use `?`. -::: +!!! 
warning "Warning" + If your listing of files contains number ranges with leading zeros, use the construction with braces for each digit separately or use `?`. **See Also** diff --git a/docs/en/sql-reference/table-functions/index.md b/docs/en/sql-reference/table-functions/index.md index a51312324f0..24d67e31fa8 100644 --- a/docs/en/sql-reference/table-functions/index.md +++ b/docs/en/sql-reference/table-functions/index.md @@ -1,9 +1,10 @@ --- -sidebar_label: Table Functions -sidebar_position: 34 +toc_folder_title: Table Functions +toc_priority: 34 +toc_title: Introduction --- -# Table Functions +# Table Functions {#table-functions} Table functions are methods for constructing tables. @@ -19,9 +20,8 @@ You can use table functions in: - [INSERT INTO TABLE FUNCTION](../../sql-reference/statements/insert-into.md#inserting-into-table-function) query. -:::warning -You can’t use table functions if the [allow_ddl](../../operations/settings/permissions-for-queries.md#settings_allow_ddl) setting is disabled. -::: +!!! warning "Warning" + You can’t use table functions if the [allow_ddl](../../operations/settings/permissions-for-queries.md#settings_allow_ddl) setting is disabled. | Function | Description | |------------------------------------------------------------------|----------------------------------------------------------------------------------------------------------------------------------------| diff --git a/docs/en/sql-reference/table-functions/input.md b/docs/en/sql-reference/table-functions/input.md index 916abb890ff..17707b798d6 100644 --- a/docs/en/sql-reference/table-functions/input.md +++ b/docs/en/sql-reference/table-functions/input.md @@ -1,6 +1,6 @@ --- -sidebar_position: 46 -sidebar_label: input +toc_priority: 46 +toc_title: input --- # input {#input} diff --git a/docs/en/sql-reference/table-functions/jdbc.md b/docs/en/sql-reference/table-functions/jdbc.md index 57128f7d146..9fe1333fc94 100644 --- a/docs/en/sql-reference/table-functions/jdbc.md +++ b/docs/en/sql-reference/table-functions/jdbc.md @@ -1,6 +1,6 @@ --- -sidebar_position: 43 -sidebar_label: jdbc +toc_priority: 43 +toc_title: jdbc --- # jdbc {#table-function-jdbc} diff --git a/docs/en/sql-reference/table-functions/merge.md b/docs/en/sql-reference/table-functions/merge.md index 301f0a69caf..c89f0f4cc5a 100644 --- a/docs/en/sql-reference/table-functions/merge.md +++ b/docs/en/sql-reference/table-functions/merge.md @@ -1,6 +1,6 @@ --- -sidebar_position: 38 -sidebar_label: merge +toc_priority: 38 +toc_title: merge --- # merge {#merge} diff --git a/docs/en/sql-reference/table-functions/mysql.md b/docs/en/sql-reference/table-functions/mysql.md index c6983d8fba1..b45ab86f60f 100644 --- a/docs/en/sql-reference/table-functions/mysql.md +++ b/docs/en/sql-reference/table-functions/mysql.md @@ -1,6 +1,6 @@ --- -sidebar_position: 42 -sidebar_label: mysql +toc_priority: 42 +toc_title: mysql --- # mysql {#mysql} @@ -55,9 +55,8 @@ SELECT name FROM mysql(`mysql1:3306|mysql2:3306|mysql3:3306`, 'mysql_database', A table object with the same columns as the original MySQL table. -:::note -In the `INSERT` query to distinguish table function `mysql(...)` from table name with column names list, you must use keywords `FUNCTION` or `TABLE FUNCTION`. See examples below. -::: +!!! info "Note" + In the `INSERT` query to distinguish table function `mysql(...)` from table name with column names list, you must use keywords `FUNCTION` or `TABLE FUNCTION`. See examples below. 
**Examples** diff --git a/docs/en/sql-reference/table-functions/null.md b/docs/en/sql-reference/table-functions/null.md index 48df12bfece..4a8d221d620 100644 --- a/docs/en/sql-reference/table-functions/null.md +++ b/docs/en/sql-reference/table-functions/null.md @@ -1,6 +1,6 @@ --- -sidebar_position: 53 -sidebar_label: null function +toc_priority: 53 +toc_title: null function --- # null {#null-function} diff --git a/docs/en/sql-reference/table-functions/numbers.md b/docs/en/sql-reference/table-functions/numbers.md index c15c47cf725..f9735056b05 100644 --- a/docs/en/sql-reference/table-functions/numbers.md +++ b/docs/en/sql-reference/table-functions/numbers.md @@ -1,6 +1,6 @@ --- -sidebar_position: 39 -sidebar_label: numbers +toc_priority: 39 +toc_title: numbers --- # numbers {#numbers} diff --git a/docs/en/sql-reference/table-functions/odbc.md b/docs/en/sql-reference/table-functions/odbc.md index d2614337cdd..a8481fbfd68 100644 --- a/docs/en/sql-reference/table-functions/odbc.md +++ b/docs/en/sql-reference/table-functions/odbc.md @@ -1,6 +1,6 @@ --- -sidebar_position: 44 -sidebar_label: odbc +toc_priority: 44 +toc_title: odbc --- # odbc {#table-functions-odbc} diff --git a/docs/en/sql-reference/table-functions/postgresql.md b/docs/en/sql-reference/table-functions/postgresql.md index 6a30b1f3f0c..b2bdc2495e5 100644 --- a/docs/en/sql-reference/table-functions/postgresql.md +++ b/docs/en/sql-reference/table-functions/postgresql.md @@ -1,6 +1,6 @@ --- -sidebar_position: 42 -sidebar_label: postgresql +toc_priority: 42 +toc_title: postgresql --- # postgresql {#postgresql} @@ -26,9 +26,8 @@ postgresql('host:port', 'database', 'table', 'user', 'password'[, `schema`]) A table object with the same columns as the original PostgreSQL table. -:::note -In the `INSERT` query to distinguish table function `postgresql(...)` from table name with column names list you must use keywords `FUNCTION` or `TABLE FUNCTION`. See examples below. -::: +!!! info "Note" + In the `INSERT` query to distinguish table function `postgresql(...)` from table name with column names list you must use keywords `FUNCTION` or `TABLE FUNCTION`. See examples below. ## Implementation Details {#implementation-details} @@ -42,9 +41,8 @@ All joins, aggregations, sorting, `IN [ array ]` conditions and the `LIMIT` samp PostgreSQL Array types converts into ClickHouse arrays. -:::note -Be careful, in PostgreSQL an array data type column like Integer[] may contain arrays of different dimensions in different rows, but in ClickHouse it is only allowed to have multidimensional arrays of the same dimension in all rows. -::: +!!! info "Note" + Be careful, in PostgreSQL an array data type column like Integer[] may contain arrays of different dimensions in different rows, but in ClickHouse it is only allowed to have multidimensional arrays of the same dimension in all rows. Supports multiple replicas that must be listed by `|`. 
For example: diff --git a/docs/en/sql-reference/table-functions/remote.md b/docs/en/sql-reference/table-functions/remote.md index 0eae00564ba..9effbb03553 100644 --- a/docs/en/sql-reference/table-functions/remote.md +++ b/docs/en/sql-reference/table-functions/remote.md @@ -1,6 +1,6 @@ --- -sidebar_position: 40 -sidebar_label: remote +toc_priority: 40 +toc_title: remote --- # remote, remoteSecure {#remote-remotesecure} diff --git a/docs/en/sql-reference/table-functions/s3.md b/docs/en/sql-reference/table-functions/s3.md index 61dda209ee6..7dffd252dc9 100644 --- a/docs/en/sql-reference/table-functions/s3.md +++ b/docs/en/sql-reference/table-functions/s3.md @@ -1,6 +1,6 @@ --- -sidebar_position: 45 -sidebar_label: s3 +toc_priority: 45 +toc_title: s3 --- # s3 Table Function {#s3-table-function} @@ -95,9 +95,8 @@ FROM s3('https://clickhouse-public-datasets.s3.amazonaws.com/my-test-bucket-768/ └─────────┘ ``` -:::warning -If your listing of files contains number ranges with leading zeros, use the construction with braces for each digit separately or use `?`. -::: +!!! warning "Warning" + If your listing of files contains number ranges with leading zeros, use the construction with braces for each digit separately or use `?`. Count the total amount of rows in files named `file-000.csv`, `file-001.csv`, … , `file-999.csv`: diff --git a/docs/en/sql-reference/table-functions/s3Cluster.md b/docs/en/sql-reference/table-functions/s3Cluster.md index dbd3538c692..65565aa92cb 100644 --- a/docs/en/sql-reference/table-functions/s3Cluster.md +++ b/docs/en/sql-reference/table-functions/s3Cluster.md @@ -1,6 +1,6 @@ --- -sidebar_position: 55 -sidebar_label: s3Cluster +toc_priority: 55 +toc_title: s3Cluster --- # s3Cluster Table Function {#s3Cluster-table-function} @@ -39,9 +39,8 @@ Count the total amount of rows in all files in the cluster `cluster_simple`: SELECT count(*) FROM s3Cluster('cluster_simple', 'http://minio1:9001/root/data/{clickhouse,database}/*', 'minio', 'minio123', 'CSV', 'name String, value UInt32, polygon Array(Array(Tuple(Float64, Float64)))'); ``` -:::warning -If your listing of files contains number ranges with leading zeros, use the construction with braces for each digit separately or use `?`. -::: +!!! warning "Warning" + If your listing of files contains number ranges with leading zeros, use the construction with braces for each digit separately or use `?`. 
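Expanding on that warning, a sketch that enumerates `file-000.csv` … `file-999.csv` with one brace range per digit; the endpoint, credentials and structure are the same hypothetical values as in the example above:

``` sql
-- {0..9}{0..9}{0..9} sidesteps the leading-zero pitfall of a single {000..999} range.
SELECT count(*)
FROM s3Cluster('cluster_simple', 'http://minio1:9001/root/data/file-{0..9}{0..9}{0..9}.csv',
    'minio', 'minio123', 'CSV', 'name String, value UInt32');
```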
**See Also** diff --git a/docs/en/sql-reference/table-functions/sqlite.md b/docs/en/sql-reference/table-functions/sqlite.md index 6058843ae61..be7bd92d7e7 100644 --- a/docs/en/sql-reference/table-functions/sqlite.md +++ b/docs/en/sql-reference/table-functions/sqlite.md @@ -1,6 +1,6 @@ --- -sidebar_position: 55 -sidebar_label: sqlite +toc_priority: 55 +toc_title: sqlite --- ## sqlite {#sqlite} diff --git a/docs/en/sql-reference/table-functions/url.md b/docs/en/sql-reference/table-functions/url.md index 3f2f9c6a710..bfad7a67e0d 100644 --- a/docs/en/sql-reference/table-functions/url.md +++ b/docs/en/sql-reference/table-functions/url.md @@ -1,6 +1,6 @@ --- -sidebar_position: 41 -sidebar_label: url +toc_priority: 41 +toc_title: url --- # url {#url} diff --git a/docs/en/sql-reference/table-functions/view.md b/docs/en/sql-reference/table-functions/view.md index 727cc04e5a2..f78120c370e 100644 --- a/docs/en/sql-reference/table-functions/view.md +++ b/docs/en/sql-reference/table-functions/view.md @@ -1,6 +1,6 @@ --- -sidebar_position: 51 -sidebar_label: view +toc_priority: 51 +toc_title: view --- ## view {#view} diff --git a/docs/en/sql-reference/window-functions/index.md b/docs/en/sql-reference/window-functions/index.md index e9a15995a16..0a55eafc7ab 100644 --- a/docs/en/sql-reference/window-functions/index.md +++ b/docs/en/sql-reference/window-functions/index.md @@ -1,6 +1,6 @@ --- -sidebar_position: 62 -sidebar_label: Window Functions +toc_priority: 62 +toc_title: Window Functions --- # Window Functions diff --git a/docs/en/whats-new/changelog/2017.md b/docs/en/whats-new/changelog/2017.md index 6a9f599daa0..af82c69386a 100644 --- a/docs/en/whats-new/changelog/2017.md +++ b/docs/en/whats-new/changelog/2017.md @@ -1,6 +1,6 @@ --- -sidebar_label: 2017 -sidebar_position: 26 +toc_priority: 79 +toc_title: '2017' --- ### ClickHouse Release 1.1.54327, 2017-12-21 {#clickhouse-release-1-1-54327-2017-12-21} diff --git a/docs/en/whats-new/changelog/2018.md b/docs/en/whats-new/changelog/2018.md index d4edca54e52..db09bcd8a03 100644 --- a/docs/en/whats-new/changelog/2018.md +++ b/docs/en/whats-new/changelog/2018.md @@ -1,6 +1,6 @@ --- -sidebar_label: 2018 -sidebar_position: 25 +toc_priority: 78 +toc_title: '2018' --- ## ClickHouse Release 18.16 {#clickhouse-release-18-16} diff --git a/docs/en/whats-new/changelog/2019.md b/docs/en/whats-new/changelog/2019.md index c41041705d9..aa06f5cb1e3 100644 --- a/docs/en/whats-new/changelog/2019.md +++ b/docs/en/whats-new/changelog/2019.md @@ -1,6 +1,6 @@ --- -sidebar_label: 2019 -sidebar_position: 22 +toc_priority: 77 +toc_title: '2019' --- ## ClickHouse Release 19.17 {#clickhouse-release-v19-17} diff --git a/docs/en/whats-new/changelog/2020.md b/docs/en/whats-new/changelog/2020.md index 7ec37c51eb1..e0afe256777 100644 --- a/docs/en/whats-new/changelog/2020.md +++ b/docs/en/whats-new/changelog/2020.md @@ -1,6 +1,6 @@ --- -sidebar_label: 2020 -sidebar_position: 21 +toc_priority: 76 +toc_title: '2020' --- ### ClickHouse release 20.12 diff --git a/docs/en/whats-new/changelog/2021.md b/docs/en/whats-new/changelog/2021.md index e4c430342ce..2e81d981990 100644 --- a/docs/en/whats-new/changelog/2021.md +++ b/docs/en/whats-new/changelog/2021.md @@ -1,8 +1,6 @@ --- -sidebar_label: 2021 -sidebar_position: 20 -keywords: [clickhouse, changelog] -description: Changelog +toc_priority: 75 +toc_title: '2021' --- ### ClickHouse release v21.12, 2021-12-15 diff --git a/docs/en/whats-new/changelog/index.md b/docs/en/whats-new/changelog/index.md index 22f6a30452d..517ea16f3e7 100644 --- 
a/docs/en/whats-new/changelog/index.md +++ b/docs/en/whats-new/changelog/index.md @@ -1,498 +1,7 @@ --- -sidebar_label: Changelog -sidebar_position: 1 -keywords: [clickhouse, changelog] -description: Changelog +toc_folder_title: Changelog +toc_priority: 74 +toc_title: '2022' --- -# ClickHouse Changelog - -### Table of Contents -**[ClickHouse release v22.3-lts, 2022-03-17](#223)**
-**[ClickHouse release v22.2, 2022-02-17](#222)**
-**[ClickHouse release v22.1, 2022-01-18](#221)**
-**[Changelog for 2021](https://github.com/ClickHouse/ClickHouse/blob/master/docs/en/whats-new/changelog/2021.md)**
- - -## ClickHouse release v22.3-lts, 2022-03-17 - -#### Backward Incompatible Change - -* Make `arrayCompact` function behave as other higher-order functions: perform compaction not of lambda function results but on the original array. If you're using nontrivial lambda functions in arrayCompact you may restore old behaviour by wrapping `arrayCompact` arguments into `arrayMap`. Closes [#34010](https://github.com/ClickHouse/ClickHouse/issues/34010) [#18535](https://github.com/ClickHouse/ClickHouse/issues/18535) [#14778](https://github.com/ClickHouse/ClickHouse/issues/14778). [#34795](https://github.com/ClickHouse/ClickHouse/pull/34795) ([Alexandre Snarskii](https://github.com/snar)). -* Change implementation specific behavior on overflow of function `toDatetime`. It will be saturated to the nearest min/max supported instant of datetime instead of wraparound. This change is highlighted as "backward incompatible" because someone may unintentionally rely on the old behavior. [#32898](https://github.com/ClickHouse/ClickHouse/pull/32898) ([HaiBo Li](https://github.com/marising)). -* Make function `cast(value, 'IPv4')`, `cast(value, 'IPv6')` behave same as `toIPv4`, `toIPv6` functions. Changed behavior of incorrect IP address passed into functions `toIPv4`,` toIPv6`, now if invalid IP address passes into this functions exception will be raised, before this function return default value. Added functions `IPv4StringToNumOrDefault`, `IPv4StringToNumOrNull`, `IPv6StringToNumOrDefault`, `IPv6StringOrNull` `toIPv4OrDefault`, `toIPv4OrNull`, `toIPv6OrDefault`, `toIPv6OrNull`. Functions `IPv4StringToNumOrDefault `, `toIPv4OrDefault `, `toIPv6OrDefault ` should be used if previous logic relied on `IPv4StringToNum`, `toIPv4`, `toIPv6` returning default value for invalid address. Added setting `cast_ipv4_ipv6_default_on_conversion_error`, if this setting enabled, then IP address conversion functions will behave as before. Closes [#22825](https://github.com/ClickHouse/ClickHouse/issues/22825). Closes [#5799](https://github.com/ClickHouse/ClickHouse/issues/5799). Closes [#35156](https://github.com/ClickHouse/ClickHouse/issues/35156). [#35240](https://github.com/ClickHouse/ClickHouse/pull/35240) ([Maksim Kita](https://github.com/kitaisreal)). - -#### New Feature - -* Support for caching data locally for remote filesystems. It can be enabled for `s3` disks. Closes [#28961](https://github.com/ClickHouse/ClickHouse/issues/28961). [#33717](https://github.com/ClickHouse/ClickHouse/pull/33717) ([Kseniia Sumarokova](https://github.com/kssenii)). In the meantime, we enabled the test suite on s3 filesystem and no more known issues exist, so it is started to be production ready. -* Add new table function `hive`. It can be used as follows `hive('', '', '', '', '')` for example `SELECT * FROM hive('thrift://hivetest:9083', 'test', 'demo', 'id Nullable(String), score Nullable(Int32), day Nullable(String)', 'day')`. [#34946](https://github.com/ClickHouse/ClickHouse/pull/34946) ([lgbo](https://github.com/lgbo-ustc)). -* Support authentication of users connected via SSL by their X.509 certificate. [#31484](https://github.com/ClickHouse/ClickHouse/pull/31484) ([eungenue](https://github.com/eungenue)). -* Support schema inference for inserting into table functions `file`/`hdfs`/`s3`/`url`. [#34732](https://github.com/ClickHouse/ClickHouse/pull/34732) ([Kruglov Pavel](https://github.com/Avogar)). -* Now you can read `system.zookeeper` table without restrictions on path or using `like` expression. 
This reads can generate quite heavy load for zookeeper so to enable this ability you have to enable setting `allow_unrestricted_reads_from_keeper`. [#34609](https://github.com/ClickHouse/ClickHouse/pull/34609) ([Sergei Trifonov](https://github.com/serxa)). -* Display CPU and memory metrics in clickhouse-local. Close [#34545](https://github.com/ClickHouse/ClickHouse/issues/34545). [#34605](https://github.com/ClickHouse/ClickHouse/pull/34605) ([李扬](https://github.com/taiyang-li)). -* Implement `startsWith` and `endsWith` function for arrays, closes [#33982](https://github.com/ClickHouse/ClickHouse/issues/33982). [#34368](https://github.com/ClickHouse/ClickHouse/pull/34368) ([usurai](https://github.com/usurai)). -* Add three functions for Map data type: 1. `mapReplace(map1, map2)` - replaces values for keys in map1 with the values of the corresponding keys in map2; adds keys from map2 that don't exist in map1. 2. `mapFilter` 3. `mapMap`. mapFilter and mapMap are higher order functions, accepting two arguments, the first argument is a lambda function with k, v pair as arguments, the second argument is a column of type Map. [#33698](https://github.com/ClickHouse/ClickHouse/pull/33698) ([hexiaoting](https://github.com/hexiaoting)). -* Allow getting default user and password for clickhouse-client from the `CLICKHOUSE_USER` and `CLICKHOUSE_PASSWORD` environment variables. Close [#34538](https://github.com/ClickHouse/ClickHouse/issues/34538). [#34947](https://github.com/ClickHouse/ClickHouse/pull/34947) ([DR](https://github.com/freedomDR)). - -#### Experimental Feature - -* New data type `Object()`, which supports storing of semi-structured data (for now JSON only). Data is written to such types as string. Then all paths are extracted according to format of semi-structured data and written as separate columns in most optimal types, that can store all their values. Those columns can be queried by names that match paths in source data. E.g `data.key1.key2` or with cast operator `data.key1.key2::Int64`. -* Add `database_replicated_allow_only_replicated_engine` setting. When enabled, it only allowed to only create `Replicated` tables or tables with stateless engines in `Replicated` databases. [#35214](https://github.com/ClickHouse/ClickHouse/pull/35214) ([Nikolai Kochetov](https://github.com/KochetovNicolai)). Note that `Replicated` database is still an experimental feature. - -#### Performance Improvement - -* Improve performance of insertion into `MergeTree` tables by optimizing sorting. Up to 2x improvement is observed on realistic benchmarks. [#34750](https://github.com/ClickHouse/ClickHouse/pull/34750) ([Maksim Kita](https://github.com/kitaisreal)). -* Columns pruning when reading Parquet, ORC and Arrow files from URL and S3. Closes [#34163](https://github.com/ClickHouse/ClickHouse/issues/34163). [#34849](https://github.com/ClickHouse/ClickHouse/pull/34849) ([Kseniia Sumarokova](https://github.com/kssenii)). -* Columns pruning when reading Parquet, ORC and Arrow files from Hive. [#34954](https://github.com/ClickHouse/ClickHouse/pull/34954) ([lgbo](https://github.com/lgbo-ustc)). -* A bunch of performance optimizations from a performance superhero. Improve performance of processing queries with large `IN` section. Improve performance of `direct` dictionary if its source is `ClickHouse`. Improve performance of `detectCharset `, `detectLanguageUnknown ` functions. [#34888](https://github.com/ClickHouse/ClickHouse/pull/34888) ([Maksim Kita](https://github.com/kitaisreal)). 
-* Improve performance of `any` aggregate function by using more batching. [#34760](https://github.com/ClickHouse/ClickHouse/pull/34760) ([Raúl Marín](https://github.com/Algunenano)). -* Multiple improvements for performance of `clickhouse-keeper`: less locking [#35010](https://github.com/ClickHouse/ClickHouse/pull/35010) ([zhanglistar](https://github.com/zhanglistar)), lower memory usage by streaming reading and writing of snapshot instead of full copy. [#34584](https://github.com/ClickHouse/ClickHouse/pull/34584) ([zhanglistar](https://github.com/zhanglistar)), optimizing compaction of log store in the RAFT implementation. [#34534](https://github.com/ClickHouse/ClickHouse/pull/34534) ([zhanglistar](https://github.com/zhanglistar)), versioning of the internal data structure [#34486](https://github.com/ClickHouse/ClickHouse/pull/34486) ([zhanglistar](https://github.com/zhanglistar)). - -#### Improvement - -* Allow asynchronous inserts to table functions. Fixes [#34864](https://github.com/ClickHouse/ClickHouse/issues/34864). [#34866](https://github.com/ClickHouse/ClickHouse/pull/34866) ([Anton Popov](https://github.com/CurtizJ)). -* Implicit type casting of the key argument for functions `dictGetHierarchy`, `dictIsIn`, `dictGetChildren`, `dictGetDescendants`. Closes [#34970](https://github.com/ClickHouse/ClickHouse/issues/34970). [#35027](https://github.com/ClickHouse/ClickHouse/pull/35027) ([Maksim Kita](https://github.com/kitaisreal)). -* `EXPLAIN AST` query can output AST in form of a graph in Graphviz format: `EXPLAIN AST graph = 1 SELECT * FROM system.parts`. [#35173](https://github.com/ClickHouse/ClickHouse/pull/35173) ([李扬](https://github.com/taiyang-li)). -* When large files were written with `s3` table function or table engine, the content type on the files was mistakenly set to `application/xml` due to a bug in the AWS SDK. This closes [#33964](https://github.com/ClickHouse/ClickHouse/issues/33964). [#34433](https://github.com/ClickHouse/ClickHouse/pull/34433) ([Alexey Milovidov](https://github.com/alexey-milovidov)). -* Change restrictive row policies a bit to make them an easier alternative to permissive policies in easy cases. If for a particular table only restrictive policies exist (without permissive policies) users will be able to see some rows. Also `SHOW CREATE ROW POLICY` will always show `AS permissive` or `AS restrictive` in row policy's definition. [#34596](https://github.com/ClickHouse/ClickHouse/pull/34596) ([Vitaly Baranov](https://github.com/vitlibar)). -* Improve schema inference with globs in File/S3/HDFS/URL engines. Try to use the next path for schema inference in case of error. [#34465](https://github.com/ClickHouse/ClickHouse/pull/34465) ([Kruglov Pavel](https://github.com/Avogar)). -* Play UI now correctly detects the preferred light/dark theme from the OS. [#35068](https://github.com/ClickHouse/ClickHouse/pull/35068) ([peledni](https://github.com/peledni)). -* Added `date_time_input_format = 'best_effort_us'`. Closes [#34799](https://github.com/ClickHouse/ClickHouse/issues/34799). [#34982](https://github.com/ClickHouse/ClickHouse/pull/34982) ([WenYao](https://github.com/Cai-Yao)). -* A new settings called `allow_plaintext_password` and `allow_no_password` are added in server configuration which turn on/off authentication types that can be potentially insecure in some environments. They are allowed by default. [#34738](https://github.com/ClickHouse/ClickHouse/pull/34738) ([Heena Bansal](https://github.com/HeenaBansal2009)). 
-* Support for `DateTime64` data type in `Arrow` format, closes [#8280](https://github.com/ClickHouse/ClickHouse/issues/8280) and closes [#28574](https://github.com/ClickHouse/ClickHouse/issues/28574). [#34561](https://github.com/ClickHouse/ClickHouse/pull/34561) ([李扬](https://github.com/taiyang-li)). -* Reload `remote_url_allow_hosts` (filtering of outgoing connections) on config update. [#35294](https://github.com/ClickHouse/ClickHouse/pull/35294) ([Nikolai Kochetov](https://github.com/KochetovNicolai)). -* Support `--testmode` parameter for `clickhouse-local`. This parameter enables interpretation of test hints that we use in functional tests. [#35264](https://github.com/ClickHouse/ClickHouse/pull/35264) ([Kseniia Sumarokova](https://github.com/kssenii)). -* Add `distributed_depth` to query log. It is like a more detailed variant of `is_initial_query` [#35207](https://github.com/ClickHouse/ClickHouse/pull/35207) ([李扬](https://github.com/taiyang-li)). -* Respect `remote_url_allow_hosts` for `MySQL` and `PostgreSQL` table functions. [#35191](https://github.com/ClickHouse/ClickHouse/pull/35191) ([Heena Bansal](https://github.com/HeenaBansal2009)). -* Added `disk_name` field to `system.part_log`. [#35178](https://github.com/ClickHouse/ClickHouse/pull/35178) ([Artyom Yurkov](https://github.com/Varinara)). -* Do not retry non-rertiable errors when querying remote URLs. Closes [#35161](https://github.com/ClickHouse/ClickHouse/issues/35161). [#35172](https://github.com/ClickHouse/ClickHouse/pull/35172) ([Kseniia Sumarokova](https://github.com/kssenii)). -* Support distributed INSERT SELECT queries (the setting `parallel_distributed_insert_select`) table function `view()`. [#35132](https://github.com/ClickHouse/ClickHouse/pull/35132) ([Azat Khuzhin](https://github.com/azat)). -* More precise memory tracking during `INSERT` into `Buffer` with `AggregateFunction`. [#35072](https://github.com/ClickHouse/ClickHouse/pull/35072) ([Azat Khuzhin](https://github.com/azat)). -* Avoid division by zero in Query Profiler if Linux kernel has a bug. Closes [#34787](https://github.com/ClickHouse/ClickHouse/issues/34787). [#35032](https://github.com/ClickHouse/ClickHouse/pull/35032) ([Alexey Milovidov](https://github.com/alexey-milovidov)). -* Add more sanity checks for keeper configuration: now mixing of localhost and non-local servers is not allowed, also add checks for same value of internal raft port and keeper client port. [#35004](https://github.com/ClickHouse/ClickHouse/pull/35004) ([alesapin](https://github.com/alesapin)). -* Currently, if the user changes the settings of the system tables there will be tons of logs and ClickHouse will rename the tables every minute. This fixes [#34929](https://github.com/ClickHouse/ClickHouse/issues/34929). [#34949](https://github.com/ClickHouse/ClickHouse/pull/34949) ([Nikita Mikhaylov](https://github.com/nikitamikhaylov)). -* Use connection pool for Hive metastore client. [#34940](https://github.com/ClickHouse/ClickHouse/pull/34940) ([lgbo](https://github.com/lgbo-ustc)). -* Ignore per-column `TTL` in `CREATE TABLE AS` if new table engine does not support it (i.e. if the engine is not of `MergeTree` family). [#34938](https://github.com/ClickHouse/ClickHouse/pull/34938) ([Azat Khuzhin](https://github.com/azat)). -* Allow `LowCardinality` strings for `ngrambf_v1`/`tokenbf_v1` indexes. Closes [#21865](https://github.com/ClickHouse/ClickHouse/issues/21865). [#34911](https://github.com/ClickHouse/ClickHouse/pull/34911) ([Lars Hiller Eidnes](https://github.com/larspars)). 
-* Allow opening empty sqlite db if the file doesn't exist. Closes [#33367](https://github.com/ClickHouse/ClickHouse/issues/33367). [#34907](https://github.com/ClickHouse/ClickHouse/pull/34907) ([Kseniia Sumarokova](https://github.com/kssenii)). -* Implement memory statistics for FreeBSD - this is required for `max_server_memory_usage` to work correctly. [#34902](https://github.com/ClickHouse/ClickHouse/pull/34902) ([Alexandre Snarskii](https://github.com/snar)). -* In previous versions the progress bar in clickhouse-client can jump forward near 50% for no reason. This closes [#34324](https://github.com/ClickHouse/ClickHouse/issues/34324). [#34801](https://github.com/ClickHouse/ClickHouse/pull/34801) ([Alexey Milovidov](https://github.com/alexey-milovidov)). -* Now `ALTER TABLE DROP COLUMN columnX` queries for `MergeTree` table engines will work instantly when `columnX` is an `ALIAS` column. Fixes [#34660](https://github.com/ClickHouse/ClickHouse/issues/34660). [#34786](https://github.com/ClickHouse/ClickHouse/pull/34786) ([alesapin](https://github.com/alesapin)). -* Show hints when user mistyped the name of a data skipping index. Closes [#29698](https://github.com/ClickHouse/ClickHouse/issues/29698). [#34764](https://github.com/ClickHouse/ClickHouse/pull/34764) ([flynn](https://github.com/ucasfl)). -* Support `remote()`/`cluster()` table functions for `parallel_distributed_insert_select`. [#34728](https://github.com/ClickHouse/ClickHouse/pull/34728) ([Azat Khuzhin](https://github.com/azat)). -* Do not reset logging that configured via `--log-file`/`--errorlog-file` command line options in case of empty configuration in the config file. [#34718](https://github.com/ClickHouse/ClickHouse/pull/34718) ([Amos Bird](https://github.com/amosbird)). -* Extract schema only once on table creation and prevent reading from local files/external sources to extract schema on each server startup. [#34684](https://github.com/ClickHouse/ClickHouse/pull/34684) ([Kruglov Pavel](https://github.com/Avogar)). -* Allow specifying argument names for executable UDFs. This is necessary for formats where argument name is part of serialization, like `Native`, `JSONEachRow`. Closes [#34604](https://github.com/ClickHouse/ClickHouse/issues/34604). [#34653](https://github.com/ClickHouse/ClickHouse/pull/34653) ([Maksim Kita](https://github.com/kitaisreal)). -* `MaterializedMySQL` (experimental feature) now supports `materialized_mysql_tables_list` (a comma-separated list of MySQL database tables, which will be replicated by the MaterializedMySQL database engine. Default value: empty list — means all the tables will be replicated), mentioned at [#32977](https://github.com/ClickHouse/ClickHouse/issues/32977). [#34487](https://github.com/ClickHouse/ClickHouse/pull/34487) ([zzsmdfj](https://github.com/zzsmdfj)). -* Improve OpenTelemetry span logs for INSERT operation on distributed table. [#34480](https://github.com/ClickHouse/ClickHouse/pull/34480) ([Frank Chen](https://github.com/FrankChen021)). -* Make the znode `ctime` and `mtime` consistent between servers in ClickHouse Keeper. [#33441](https://github.com/ClickHouse/ClickHouse/pull/33441) ([小路](https://github.com/nicelulu)). - -#### Build/Testing/Packaging Improvement - -* Package repository is migrated to JFrog Artifactory (**Mikhail f. Shiryaev**). -* Randomize some settings in functional tests, so more possible combinations of settings will be tested. This is yet another fuzzing method to ensure better test coverage. 
This closes [#32268](https://github.com/ClickHouse/ClickHouse/issues/32268). [#34092](https://github.com/ClickHouse/ClickHouse/pull/34092) ([Kruglov Pavel](https://github.com/Avogar)). -* Drop PVS-Studio from our CI. [#34680](https://github.com/ClickHouse/ClickHouse/pull/34680) ([Mikhail f. Shiryaev](https://github.com/Felixoid)). -* Add an ability to build stripped binaries with CMake. In previous versions it was performed by dh-tools. [#35196](https://github.com/ClickHouse/ClickHouse/pull/35196) ([alesapin](https://github.com/alesapin)). -* Smaller "fat-free" `clickhouse-keeper` build. [#35031](https://github.com/ClickHouse/ClickHouse/pull/35031) ([alesapin](https://github.com/alesapin)). -* Use @robot-clickhouse as an author and committer for PRs like https://github.com/ClickHouse/ClickHouse/pull/34685. [#34793](https://github.com/ClickHouse/ClickHouse/pull/34793) ([Mikhail f. Shiryaev](https://github.com/Felixoid)). -* Limit the DWARF version for debug info to 4 at most, because our internal stack symbolizer cannot parse DWARF version 5. This makes sense if you compile ClickHouse with clang-15. [#34777](https://github.com/ClickHouse/ClickHouse/pull/34777) ([Alexey Milovidov](https://github.com/alexey-milovidov)). -* Remove `clickhouse-test` debian package as unneeded complication. CI uses tests from the repository, and standalone testing via the deb package is no longer supported. [#34606](https://github.com/ClickHouse/ClickHouse/pull/34606) ([Ilya Yatsishin](https://github.com/qoega)). - -#### Bug Fix (user-visible misbehaviour in official stable or prestable release) - -* A fix for HDFS integration: when the inner buffer size is too small, NEED_MORE_INPUT in `HadoopSnappyDecoder` will run multiple times (>=3) for one compressed block. This causes the input data to be copied into the wrong place in `HadoopSnappyDecoder::buffer`. [#35116](https://github.com/ClickHouse/ClickHouse/pull/35116) ([lgbo](https://github.com/lgbo-ustc)). -* Ignore obsolete grants in ATTACH GRANT statements. This PR fixes [#34815](https://github.com/ClickHouse/ClickHouse/issues/34815). [#34855](https://github.com/ClickHouse/ClickHouse/pull/34855) ([Vitaly Baranov](https://github.com/vitlibar)). -* Fix segfault in Postgres database when getting create table query if database was created using named collections. Closes [#35312](https://github.com/ClickHouse/ClickHouse/issues/35312). [#35313](https://github.com/ClickHouse/ClickHouse/pull/35313) ([Kseniia Sumarokova](https://github.com/kssenii)). -* Fix partial merge join duplicate rows bug, close [#31009](https://github.com/ClickHouse/ClickHouse/issues/31009). [#35311](https://github.com/ClickHouse/ClickHouse/pull/35311) ([Vladimir C](https://github.com/vdimir)). -* Fix possible `Assertion 'position() != working_buffer.end()' failed` while using bzip2 compression with a small `max_read_buffer_size` setting value. The bug was found in https://github.com/ClickHouse/ClickHouse/pull/35047. [#35300](https://github.com/ClickHouse/ClickHouse/pull/35300) ([Kruglov Pavel](https://github.com/Avogar)). The same fix applies while using lz4 compression with a small `max_read_buffer_size` setting value. [#35296](https://github.com/ClickHouse/ClickHouse/pull/35296) ([Kruglov Pavel](https://github.com/Avogar)). While using lzma compression with a small `max_read_buffer_size` setting value. [#35295](https://github.com/ClickHouse/ClickHouse/pull/35295) ([Kruglov Pavel](https://github.com/Avogar)). And while using `brotli` compression with a small `max_read_buffer_size` setting value.
The bug was found in https://github.com/ClickHouse/ClickHouse/pull/35047. [#35281](https://github.com/ClickHouse/ClickHouse/pull/35281) ([Kruglov Pavel](https://github.com/Avogar)). -* Fix possible segfault in `JSONEachRow` schema inference. [#35291](https://github.com/ClickHouse/ClickHouse/pull/35291) ([Kruglov Pavel](https://github.com/Avogar)). -* Fix `CHECK TABLE` query in the case when sparse columns are enabled in the table. [#35274](https://github.com/ClickHouse/ClickHouse/pull/35274) ([Anton Popov](https://github.com/CurtizJ)). -* Avoid std::terminate in case of an exception while reading from remote VFS. [#35257](https://github.com/ClickHouse/ClickHouse/pull/35257) ([Azat Khuzhin](https://github.com/azat)). -* Fix reading port from config, close [#34776](https://github.com/ClickHouse/ClickHouse/issues/34776). [#35193](https://github.com/ClickHouse/ClickHouse/pull/35193) ([Vladimir C](https://github.com/vdimir)). -* Fix error in query with `WITH TOTALS` in case `HAVING` returned an empty result. This fixes [#33711](https://github.com/ClickHouse/ClickHouse/issues/33711). [#35186](https://github.com/ClickHouse/ClickHouse/pull/35186) ([Amos Bird](https://github.com/amosbird)). -* Fix a corner case of `replaceRegexpAll`, close [#35117](https://github.com/ClickHouse/ClickHouse/issues/35117). [#35182](https://github.com/ClickHouse/ClickHouse/pull/35182) ([Vladimir C](https://github.com/vdimir)). -* Schema inference didn't work properly in the case of `INSERT INTO FUNCTION s3(...) FROM ...`: it tried to read the schema from the s3 file instead of from the select query. [#35176](https://github.com/ClickHouse/ClickHouse/pull/35176) ([Kruglov Pavel](https://github.com/Avogar)). -* Fix MaterializedPostgreSQL (experimental feature) `table overrides` for partition by, etc. Closes [#35048](https://github.com/ClickHouse/ClickHouse/issues/35048). [#35162](https://github.com/ClickHouse/ClickHouse/pull/35162) ([Kseniia Sumarokova](https://github.com/kssenii)). -* Fix MaterializedPostgreSQL (experimental feature) adding new table to replication (ATTACH TABLE) after manually removing (DETACH TABLE). Closes [#33800](https://github.com/ClickHouse/ClickHouse/issues/33800). Closes [#34922](https://github.com/ClickHouse/ClickHouse/issues/34922). Closes [#34315](https://github.com/ClickHouse/ClickHouse/issues/34315). [#35158](https://github.com/ClickHouse/ClickHouse/pull/35158) ([Kseniia Sumarokova](https://github.com/kssenii)). -* Fix partition pruning error when a non-monotonic function is used with the IN operator. This fixes [#35136](https://github.com/ClickHouse/ClickHouse/issues/35136). [#35146](https://github.com/ClickHouse/ClickHouse/pull/35146) ([Amos Bird](https://github.com/amosbird)). -* Fixed slightly incorrect translation of YAML configs to XML. [#35135](https://github.com/ClickHouse/ClickHouse/pull/35135) ([Miel Donkers](https://github.com/mdonkers)). -* Fix `optimize_skip_unused_shards_rewrite_in` for signed columns and negative values. [#35134](https://github.com/ClickHouse/ClickHouse/pull/35134) ([Azat Khuzhin](https://github.com/azat)). -* The `update_lag` external dictionary configuration option was unusable, showing the error message ``Unexpected key `update_lag` in dictionary source configuration`` (see the example below). [#35089](https://github.com/ClickHouse/ClickHouse/pull/35089) ([Jason Chu](https://github.com/1lann)). -* Avoid possible deadlock on server shutdown. [#35081](https://github.com/ClickHouse/ClickHouse/pull/35081) ([Azat Khuzhin](https://github.com/azat)).
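As a rough illustration of the `update_lag` entry above, the option is specified inside the dictionary source definition; the dictionary, table and column names below are placeholders and the values are only a sketch, not part of the PR.

```sql
-- Hypothetical dictionary using update_field/update_lag in its ClickHouse source.
CREATE DICTIONARY example_dict
(
    id UInt64,
    value String
)
PRIMARY KEY id
SOURCE(CLICKHOUSE(TABLE 'dict_source' UPDATE_FIELD 'updated_at' UPDATE_LAG 15))
LAYOUT(FLAT())
LIFETIME(MIN 60 MAX 120);
```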
-* Fix missing alias after function is optimized to a subcolumn when setting `optimize_functions_to_subcolumns` is enabled. Closes [#33798](https://github.com/ClickHouse/ClickHouse/issues/33798). [#35079](https://github.com/ClickHouse/ClickHouse/pull/35079) ([qieqieplus](https://github.com/qieqieplus)). -* Fix reading from `system.asynchronous_inserts` table if there exists an asynchronous insert into a table function. [#35050](https://github.com/ClickHouse/ClickHouse/pull/35050) ([Anton Popov](https://github.com/CurtizJ)). -* Fix possible exception `Reading for MergeTree family tables must be done with last position boundary` (relevant to operation on remote VFS). Closes [#34979](https://github.com/ClickHouse/ClickHouse/issues/34979). [#35001](https://github.com/ClickHouse/ClickHouse/pull/35001) ([Kseniia Sumarokova](https://github.com/kssenii)). -* Fix unexpected result when using a `-State` type aggregate function in a window frame. [#34999](https://github.com/ClickHouse/ClickHouse/pull/34999) ([metahys](https://github.com/metahys)). -* Fix possible segfault in FileLog (experimental feature). Closes [#30749](https://github.com/ClickHouse/ClickHouse/issues/30749). [#34996](https://github.com/ClickHouse/ClickHouse/pull/34996) ([Kseniia Sumarokova](https://github.com/kssenii)). -* Fix possible rare error `Cannot push block to port which already has data`. [#34993](https://github.com/ClickHouse/ClickHouse/pull/34993) ([Nikolai Kochetov](https://github.com/KochetovNicolai)). -* Fix wrong schema inference for unquoted dates in CSV. Closes [#34768](https://github.com/ClickHouse/ClickHouse/issues/34768). [#34961](https://github.com/ClickHouse/ClickHouse/pull/34961) ([Kruglov Pavel](https://github.com/Avogar)). -* Integration with Hive: fix unexpected result when using `in` in `where` in a Hive query. [#34945](https://github.com/ClickHouse/ClickHouse/pull/34945) ([lgbo](https://github.com/lgbo-ustc)). -* Avoid busy polling in ClickHouse Keeper while searching for changelog files to delete. [#34931](https://github.com/ClickHouse/ClickHouse/pull/34931) ([Azat Khuzhin](https://github.com/azat)). -* Fix DateTime64 conversion from PostgreSQL. Closes [#33364](https://github.com/ClickHouse/ClickHouse/issues/33364). [#34910](https://github.com/ClickHouse/ClickHouse/pull/34910) ([Kseniia Sumarokova](https://github.com/kssenii)). -* Fix possible "Part directory doesn't exist" during `INSERT` into a MergeTree table backed by VFS over s3. [#34876](https://github.com/ClickHouse/ClickHouse/pull/34876) ([Azat Khuzhin](https://github.com/azat)). -* Support executing DDLs like `CREATE USER` on a cross-replicated cluster (see the example below). [#34860](https://github.com/ClickHouse/ClickHouse/pull/34860) ([Jianmei Zhang](https://github.com/zhangjmruc)). -* Fix bugs with multiple-column GROUP BY in `WindowView` (experimental feature). [#34859](https://github.com/ClickHouse/ClickHouse/pull/34859) ([vxider](https://github.com/Vxider)). -* Fix possible failures in S2 functions when queries contain const columns. [#34745](https://github.com/ClickHouse/ClickHouse/pull/34745) ([Bharat Nallan](https://github.com/bharatnc)). -* Fix a bug for H3 functions with const columns which caused queries to fail. [#34743](https://github.com/ClickHouse/ClickHouse/pull/34743) ([Bharat Nallan](https://github.com/bharatnc)). -* Fix `No such file or directory` with enabled `fsync_part_directory` and vertical merge. [#34739](https://github.com/ClickHouse/ClickHouse/pull/34739) ([Azat Khuzhin](https://github.com/azat)).
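To illustrate the `CREATE USER` entry above, here is a sketch of an access-management DDL propagated to a cluster; the cluster name, user name and password are placeholders, not taken from the PR.

```sql
-- Hypothetical example: the DDL is executed on every replica of the named cluster.
CREATE USER IF NOT EXISTS report_reader ON CLUSTER my_cluster
    IDENTIFIED WITH sha256_password BY 'change-me';
```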
-* Fix serialization/printing for system queries `RELOAD MODEL`, `RELOAD FUNCTION`, `RESTART DISK` when used with `ON CLUSTER`. Closes [#34514](https://github.com/ClickHouse/ClickHouse/issues/34514). [#34696](https://github.com/ClickHouse/ClickHouse/pull/34696) ([Maksim Kita](https://github.com/kitaisreal)). -* Fix `allow_experimental_projection_optimization` with `enable_global_with_statement` (before, it could lead to a `Stack size too large` error in case of multiple expressions in the `WITH` clause, and it also executed scalar subqueries again and again, so now it will be more optimal). [#34650](https://github.com/ClickHouse/ClickHouse/pull/34650) ([Azat Khuzhin](https://github.com/azat)). -* Stop selecting a part for mutation when the other replica has already updated the transaction log for the `ReplicatedMergeTree` engine. [#34633](https://github.com/ClickHouse/ClickHouse/pull/34633) ([Jianmei Zhang](https://github.com/zhangjmruc)). -* Fix incorrect result of trivial count query when the part movement feature is used [#34089](https://github.com/ClickHouse/ClickHouse/issues/34089). [#34385](https://github.com/ClickHouse/ClickHouse/pull/34385) ([nvartolomei](https://github.com/nvartolomei)). -* Fix inconsistency of `max_query_size` limitation in distributed subqueries. [#34078](https://github.com/ClickHouse/ClickHouse/pull/34078) ([Chao Ma](https://github.com/godliness)). - - -### ClickHouse release v22.2, 2022-02-17 - -#### Upgrade Notes - -* Applying data skipping indexes for queries with FINAL may produce an incorrect result. In this release we disabled data skipping indexes by default for queries with FINAL (a new setting `use_skip_indexes_if_final` is introduced and disabled by default). [#34243](https://github.com/ClickHouse/ClickHouse/pull/34243) ([Azat Khuzhin](https://github.com/azat)). - -#### New Feature - -* Projections are production ready. Set `allow_experimental_projection_optimization` by default and deprecate this setting. [#34456](https://github.com/ClickHouse/ClickHouse/pull/34456) ([Nikolai Kochetov](https://github.com/KochetovNicolai)). -* An option to create new files on insert for `File`/`S3`/`HDFS` engines. Allow to overwrite a file in `HDFS`. Throw an exception on an attempt to overwrite a file in `S3` by default. Throw an exception on an attempt to append data to a file in formats that have a suffix (and thus don't support appends, like `Parquet`, `ORC`). Closes [#31640](https://github.com/ClickHouse/ClickHouse/issues/31640) Closes [#31622](https://github.com/ClickHouse/ClickHouse/issues/31622) Closes [#23862](https://github.com/ClickHouse/ClickHouse/issues/23862) Closes [#15022](https://github.com/ClickHouse/ClickHouse/issues/15022) Closes [#16674](https://github.com/ClickHouse/ClickHouse/issues/16674). [#33302](https://github.com/ClickHouse/ClickHouse/pull/33302) ([Kruglov Pavel](https://github.com/Avogar)). -* Add a setting that allows a user to provide their own deduplication semantics in `MergeTree`/`ReplicatedMergeTree`. If provided, it's used instead of data digest to generate block ID. So, for example, by providing a unique value for the setting in each INSERT statement, the user can avoid the same inserted data being deduplicated. This closes: [#7461](https://github.com/ClickHouse/ClickHouse/issues/7461). [#32304](https://github.com/ClickHouse/ClickHouse/pull/32304) ([Igor Nikonov](https://github.com/devcrafter)). -* Add support of `DEFAULT` keyword for INSERT statements. Closes [#6331](https://github.com/ClickHouse/ClickHouse/issues/6331).
[#33141](https://github.com/ClickHouse/ClickHouse/pull/33141) ([Andrii Buriachevskyi](https://github.com/1over)). -* `EPHEMERAL` column specifier is added to `CREATE TABLE` query. Closes [#9436](https://github.com/ClickHouse/ClickHouse/issues/9436). [#34424](https://github.com/ClickHouse/ClickHouse/pull/34424) ([yakov-olkhovskiy](https://github.com/yakov-olkhovskiy)). -* Support `IF EXISTS` clause for `TTL expr TO [DISK|VOLUME] [IF EXISTS] 'xxx'` feature. Parts will be moved to a disk or volume only if it exists on the replica, so `MOVE TTL` rules will be able to behave differently on replicas according to the existing storage policies. Resolves [#34455](https://github.com/ClickHouse/ClickHouse/issues/34455). [#34504](https://github.com/ClickHouse/ClickHouse/pull/34504) ([Anton Popov](https://github.com/CurtizJ)). -* Allow setting a default table engine and creating tables without specifying ENGINE. [#34187](https://github.com/ClickHouse/ClickHouse/pull/34187) ([Ilya Yatsishin](https://github.com/qoega)). -* Add table function `format(format_name, data)`. [#34125](https://github.com/ClickHouse/ClickHouse/pull/34125) ([Kruglov Pavel](https://github.com/Avogar)). -* Detect format in `clickhouse-local` by file name even when it is passed to stdin. [#33829](https://github.com/ClickHouse/ClickHouse/pull/33829) ([Kruglov Pavel](https://github.com/Avogar)). -* Add schema inference for `values` table function. Closes [#33811](https://github.com/ClickHouse/ClickHouse/issues/33811). [#34017](https://github.com/ClickHouse/ClickHouse/pull/34017) ([Kruglov Pavel](https://github.com/Avogar)). -* Dynamic reload of server TLS certificates on config reload. Closes [#15764](https://github.com/ClickHouse/ClickHouse/issues/15764). [#15765](https://github.com/ClickHouse/ClickHouse/pull/15765) ([johnskopis](https://github.com/johnskopis)). [#31257](https://github.com/ClickHouse/ClickHouse/pull/31257) ([Filatenkov Artur](https://github.com/FArthur-cmd)). -* Now ReplicatedMergeTree can recover data when some of its disks are broken. [#13544](https://github.com/ClickHouse/ClickHouse/pull/13544) ([Amos Bird](https://github.com/amosbird)). -* Fault-tolerant connections in clickhouse-client: `clickhouse-client ... --host host1 --host host2 --port port2 --host host3 --port port --host host4`. [#34490](https://github.com/ClickHouse/ClickHouse/pull/34490) ([Kruglov Pavel](https://github.com/Avogar)). [#33824](https://github.com/ClickHouse/ClickHouse/pull/33824) ([Filippov Denis](https://github.com/DF5HSE)). -* Add `DEGREES` and `RADIANS` functions for MySQL compatibility. [#33769](https://github.com/ClickHouse/ClickHouse/pull/33769) ([Bharat Nallan](https://github.com/bharatnc)). -* Add `h3ToCenterChild` function. [#33313](https://github.com/ClickHouse/ClickHouse/pull/33313) ([Bharat Nallan](https://github.com/bharatnc)). Add new h3 miscellaneous functions: `edgeLengthKm`,`exactEdgeLengthKm`,`exactEdgeLengthM`,`exactEdgeLengthRads`,`numHexagons`. [#33621](https://github.com/ClickHouse/ClickHouse/pull/33621) ([Bharat Nallan](https://github.com/bharatnc)). -* Add function `bitSlice` to extract bit subsequences from String/FixedString. [#33360](https://github.com/ClickHouse/ClickHouse/pull/33360) ([RogerYK](https://github.com/RogerYK)). -* Implemented `meanZTest` aggregate function. [#33354](https://github.com/ClickHouse/ClickHouse/pull/33354) ([achimbab](https://github.com/achimbab)). -* Add confidence intervals to T-tests aggregate functions.
[#33260](https://github.com/ClickHouse/ClickHouse/pull/33260) ([achimbab](https://github.com/achimbab)). -* Add function `addressToLineWithInlines`. Close [#26211](https://github.com/ClickHouse/ClickHouse/issues/26211). [#33467](https://github.com/ClickHouse/ClickHouse/pull/33467) ([SuperDJY](https://github.com/cmsxbc)). -* Added `#!` and `# ` as a recognised start of a single-line comment. Closes [#34138](https://github.com/ClickHouse/ClickHouse/issues/34138). [#34230](https://github.com/ClickHouse/ClickHouse/pull/34230) ([Aaron Katz](https://github.com/aaronstephenkatz)). - -#### Experimental Feature - -* Functions for text classification: language and charset detection. See [#23271](https://github.com/ClickHouse/ClickHouse/issues/23271). [#33314](https://github.com/ClickHouse/ClickHouse/pull/33314) ([Nikolay Degterinsky](https://github.com/evillique)). -* Add memory overcommit to `MemoryTracker`. Added `guaranteed` settings for memory limits which represent soft memory limits. When the hard memory limit is reached, `MemoryTracker` tries to cancel the most overcommitted query. New setting `memory_usage_overcommit_max_wait_microseconds` specifies how long queries may wait for another query to stop. Closes [#28375](https://github.com/ClickHouse/ClickHouse/issues/28375). [#31182](https://github.com/ClickHouse/ClickHouse/pull/31182) ([Dmitry Novik](https://github.com/novikd)). -* Enable stream to table join in WindowView. [#33729](https://github.com/ClickHouse/ClickHouse/pull/33729) ([vxider](https://github.com/Vxider)). -* Support `SET`, `YEAR`, `TIME` and `GEOMETRY` data types in `MaterializedMySQL` (experimental feature). Fixes [#18091](https://github.com/ClickHouse/ClickHouse/issues/18091), [#21536](https://github.com/ClickHouse/ClickHouse/issues/21536), [#26361](https://github.com/ClickHouse/ClickHouse/issues/26361). [#33429](https://github.com/ClickHouse/ClickHouse/pull/33429) ([zzsmdfj](https://github.com/zzsmdfj)). -* Fix various issues when projection is enabled by default. Each issue is described in a separate commit. This is for [#33678](https://github.com/ClickHouse/ClickHouse/issues/33678). This fixes [#34273](https://github.com/ClickHouse/ClickHouse/issues/34273). [#34305](https://github.com/ClickHouse/ClickHouse/pull/34305) ([Amos Bird](https://github.com/amosbird)). - -#### Performance Improvement - -* Support `optimize_read_in_order` if a prefix of the sorting key is already sorted. E.g. if we have sorting key `ORDER BY (a, b)` in the table and a query with `WHERE a = const ORDER BY b` clauses, reading in order of the sorting key will now be applied instead of a full sort. [#32748](https://github.com/ClickHouse/ClickHouse/pull/32748) ([Anton Popov](https://github.com/CurtizJ)). -* Improve performance of partitioned insert into table functions `URL`, `S3`, `File`, `HDFS`. Closes [#34348](https://github.com/ClickHouse/ClickHouse/issues/34348). [#34510](https://github.com/ClickHouse/ClickHouse/pull/34510) ([Maksim Kita](https://github.com/kitaisreal)). -* Multiple performance improvements of clickhouse-keeper. [#34484](https://github.com/ClickHouse/ClickHouse/pull/34484) [#34587](https://github.com/ClickHouse/ClickHouse/pull/34587) ([zhanglistar](https://github.com/zhanglistar)). -* `FlatDictionary`: improve performance of dictionary data load. [#33871](https://github.com/ClickHouse/ClickHouse/pull/33871) ([Maksim Kita](https://github.com/kitaisreal)). -* Improve performance of `mapPopulateSeries` function. Closes [#33944](https://github.com/ClickHouse/ClickHouse/issues/33944).
[#34318](https://github.com/ClickHouse/ClickHouse/pull/34318) ([Maksim Kita](https://github.com/kitaisreal)). -* `_file` and `_path` virtual columns (in file-like table engines) are made `LowCardinality` - it will make queries for multiple files faster. Closes [#34300](https://github.com/ClickHouse/ClickHouse/issues/34300). [#34317](https://github.com/ClickHouse/ClickHouse/pull/34317) ([flynn](https://github.com/ucasfl)). -* Speed up loading of data parts. It was not parallelized before: the setting `part_loading_threads` did not have an effect. See [#4699](https://github.com/ClickHouse/ClickHouse/issues/4699). [#34310](https://github.com/ClickHouse/ClickHouse/pull/34310) ([alexey-milovidov](https://github.com/alexey-milovidov)). -* Improve performance of `LineAsString` format. This closes [#34303](https://github.com/ClickHouse/ClickHouse/issues/34303). [#34306](https://github.com/ClickHouse/ClickHouse/pull/34306) ([alexey-milovidov](https://github.com/alexey-milovidov)). -* Optimize `quantilesExact{Low,High}` to use `nth_element` instead of `sort`. [#34287](https://github.com/ClickHouse/ClickHouse/pull/34287) ([Danila Kutenin](https://github.com/danlark1)). -* Slightly improve performance of `Regexp` format. [#34202](https://github.com/ClickHouse/ClickHouse/pull/34202) ([alexey-milovidov](https://github.com/alexey-milovidov)). -* Minor improvement for analysis of scalar subqueries. [#34128](https://github.com/ClickHouse/ClickHouse/pull/34128) ([Federico Rodriguez](https://github.com/fedrod)). -* Make ORDER BY tuple almost as fast as ORDER BY columns. We have special optimizations for multiple column ORDER BY: https://github.com/ClickHouse/ClickHouse/pull/10831 . It's beneficial to also apply them to tuple columns. [#34060](https://github.com/ClickHouse/ClickHouse/pull/34060) ([Amos Bird](https://github.com/amosbird)). -* Rework and reintroduce the scalar subqueries cache to Materialized Views execution. [#33958](https://github.com/ClickHouse/ClickHouse/pull/33958) ([Raúl Marín](https://github.com/Algunenano)). -* Slightly improve performance of `ORDER BY` by adding x86-64 AVX-512 support for `memcmpSmall` functions to accelerate memory comparison. It works only if you compile ClickHouse by yourself. [#33706](https://github.com/ClickHouse/ClickHouse/pull/33706) ([hanqf-git](https://github.com/hanqf-git)). -* Improve `range_hashed` dictionary performance if there are a lot of intervals for a key. Fixes [#23821](https://github.com/ClickHouse/ClickHouse/issues/23821). [#33516](https://github.com/ClickHouse/ClickHouse/pull/33516) ([Maksim Kita](https://github.com/kitaisreal)). -* For inserts and merges into S3, write files in parallel whenever possible (TODO: check if it's merged). [#33291](https://github.com/ClickHouse/ClickHouse/pull/33291) ([Nikolai Kochetov](https://github.com/KochetovNicolai)). -* Improve `clickhouse-keeper` performance and fix several memory leaks in NuRaft library. [#33329](https://github.com/ClickHouse/ClickHouse/pull/33329) ([alesapin](https://github.com/alesapin)). - -#### Improvement - -* Support asynchronous inserts in `clickhouse-client` for queries with inlined data. [#34267](https://github.com/ClickHouse/ClickHouse/pull/34267) ([Anton Popov](https://github.com/CurtizJ)). -* Functions `dictGet`, `dictHas` implicitly cast the key argument to the dictionary key structure, if they are different. [#33672](https://github.com/ClickHouse/ClickHouse/pull/33672) ([Maksim Kita](https://github.com/kitaisreal)). -* Improvements for `range_hashed` dictionaries.
Improve load time if there are multiple attributes. Allow to create a dictionary without attributes. Added option `convert_null_range_bound_to_open` to specify the strategy when intervals `start` and `end` have `Nullable` type; by default it is `true`. Closes [#29791](https://github.com/ClickHouse/ClickHouse/issues/29791). Allow to specify `Float`, `Decimal`, `DateTime64`, `Int128`, `Int256`, `UInt128`, `UInt256` as range types. `RangeHashedDictionary` added support for range values that extend `Int64` type. Closes [#28322](https://github.com/ClickHouse/ClickHouse/issues/28322). Added option `range_lookup_strategy` to specify the range lookup type (`min` or `max`); by default it is `min`. Closes [#21647](https://github.com/ClickHouse/ClickHouse/issues/21647). Fixed allocated bytes calculations. Fixed type name in `system.dictionaries` in case of `ComplexKeyHashedDictionary`. [#33927](https://github.com/ClickHouse/ClickHouse/pull/33927) ([Maksim Kita](https://github.com/kitaisreal)). -* `flat`, `hashed`, `hashed_array` dictionaries now support creating with empty attributes, with support of reading the keys and using `dictHas`. Fixes [#33820](https://github.com/ClickHouse/ClickHouse/issues/33820). [#33918](https://github.com/ClickHouse/ClickHouse/pull/33918) ([Maksim Kita](https://github.com/kitaisreal)). -* Added support for `DateTime64` data type in dictionaries. [#33914](https://github.com/ClickHouse/ClickHouse/pull/33914) ([Maksim Kita](https://github.com/kitaisreal)). -* Allow to write `s3(url, access_key_id, secret_access_key)` (autodetection of data format and table structure, but with explicit credentials); see the example below. [#34503](https://github.com/ClickHouse/ClickHouse/pull/34503) ([Kruglov Pavel](https://github.com/Avogar)). -* Added sending of the output format back to the client like it's done in the HTTP protocol, as suggested in [#34362](https://github.com/ClickHouse/ClickHouse/issues/34362). Closes [#34362](https://github.com/ClickHouse/ClickHouse/issues/34362). [#34499](https://github.com/ClickHouse/ClickHouse/pull/34499) ([Vitaly Baranov](https://github.com/vitlibar)). -* Send ProfileEvents statistics in case of INSERT SELECT query (to display query metrics in `clickhouse-client` for this type of query). [#34498](https://github.com/ClickHouse/ClickHouse/pull/34498) ([Dmitry Novik](https://github.com/novikd)). -* Recognize `.jsonl` extension for JSONEachRow format. [#34496](https://github.com/ClickHouse/ClickHouse/pull/34496) ([Kruglov Pavel](https://github.com/Avogar)). -* Improve schema inference in clickhouse-local. Allow to write just `clickhouse-local -q "select * from table" < data.format`. [#34495](https://github.com/ClickHouse/ClickHouse/pull/34495) ([Kruglov Pavel](https://github.com/Avogar)). -* Privileges CREATE/ALTER/DROP ROW POLICY can now be granted on a table or on `database.*` as well as globally (`*.*`). [#34489](https://github.com/ClickHouse/ClickHouse/pull/34489) ([Vitaly Baranov](https://github.com/vitlibar)). -* Allow exporting arbitrarily large files to `s3`. Add two new settings: `s3_upload_part_size_multiply_factor` and `s3_upload_part_size_multiply_parts_count_threshold`. Now each time `s3_upload_part_size_multiply_parts_count_threshold` parts are uploaded to S3 from a single query, `s3_min_upload_part_size` is multiplied by `s3_upload_part_size_multiply_factor`. Fixes [#34244](https://github.com/ClickHouse/ClickHouse/issues/34244). [#34422](https://github.com/ClickHouse/ClickHouse/pull/34422) ([alesapin](https://github.com/alesapin)).
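To illustrate the `s3(url, access_key_id, secret_access_key)` entry above, a sketch with a placeholder bucket URL and credentials; the format and table structure are autodetected, so they are not passed explicitly.

```sql
-- Hypothetical example: explicit credentials, format and structure autodetected.
SELECT *
FROM s3('https://my-bucket.s3.amazonaws.com/data/events.csv.gz',
        'MY_ACCESS_KEY_ID', 'MY_SECRET_ACCESS_KEY')
LIMIT 10;
```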
-* Allow skipping not-found (404) URLs for globs when using the URL storage / table function. Also closes [#34359](https://github.com/ClickHouse/ClickHouse/issues/34359). [#34392](https://github.com/ClickHouse/ClickHouse/pull/34392) ([Kseniia Sumarokova](https://github.com/kssenii)). -* Default input and output formats for `clickhouse-local` that can be overridden by `--input-format` and `--output-format`. Close [#30631](https://github.com/ClickHouse/ClickHouse/issues/30631). [#34352](https://github.com/ClickHouse/ClickHouse/pull/34352) ([李扬](https://github.com/taiyang-li)). -* Add options `max_query_size` and `max_parser_depth` for `clickhouse-format`, which closes [#30528](https://github.com/ClickHouse/ClickHouse/issues/30528). [#34349](https://github.com/ClickHouse/ClickHouse/pull/34349) ([李扬](https://github.com/taiyang-li)). -* Better handling of pre-inputs before client start. This is for [#34308](https://github.com/ClickHouse/ClickHouse/issues/34308). [#34336](https://github.com/ClickHouse/ClickHouse/pull/34336) ([Amos Bird](https://github.com/amosbird)). -* `REGEXP_MATCHES` and `REGEXP_REPLACE` function aliases for compatibility with PostgreSQL. Close [#30885](https://github.com/ClickHouse/ClickHouse/issues/30885). [#34334](https://github.com/ClickHouse/ClickHouse/pull/34334) ([李扬](https://github.com/taiyang-li)). -* Some servers expect a User-Agent header in their HTTP requests. A `User-Agent` header entry has been added to HTTP requests of the form: User-Agent: ClickHouse/VERSION_STRING. [#34330](https://github.com/ClickHouse/ClickHouse/pull/34330) ([Saad Ur Rahman](https://github.com/surahman)). -* Cancel merges before acquiring the table lock for a `TRUNCATE` query to avoid the `DEADLOCK_AVOIDED` error in some cases. Fixes [#34302](https://github.com/ClickHouse/ClickHouse/issues/34302). [#34304](https://github.com/ClickHouse/ClickHouse/pull/34304) ([tavplubix](https://github.com/tavplubix)). -* Change severity of the "Cancelled merging parts" message in logs, because it's not an error. This closes [#34148](https://github.com/ClickHouse/ClickHouse/issues/34148). [#34232](https://github.com/ClickHouse/ClickHouse/pull/34232) ([alexey-milovidov](https://github.com/alexey-milovidov)). -* Add ability to compose PostgreSQL-style cast operator `::` with expressions using `[]` and `.` operators (array and tuple indexing). [#34229](https://github.com/ClickHouse/ClickHouse/pull/34229) ([Nikolay Degterinsky](https://github.com/evillique)). -* Recognize `YYYYMMDD-hhmmss` format in `parseDateTimeBestEffort` function. This closes [#34206](https://github.com/ClickHouse/ClickHouse/issues/34206). [#34208](https://github.com/ClickHouse/ClickHouse/pull/34208) ([alexey-milovidov](https://github.com/alexey-milovidov)). -* Allow carriage return in the middle of the line while parsing by `Regexp` format. This closes [#34200](https://github.com/ClickHouse/ClickHouse/issues/34200). [#34205](https://github.com/ClickHouse/ClickHouse/pull/34205) ([alexey-milovidov](https://github.com/alexey-milovidov)). -* Allow to parse dictionary's `PRIMARY KEY` as `PRIMARY KEY (id, value)`; previously supported only `PRIMARY KEY id, value`. Closes [#34135](https://github.com/ClickHouse/ClickHouse/issues/34135). [#34141](https://github.com/ClickHouse/ClickHouse/pull/34141) ([Maksim Kita](https://github.com/kitaisreal)). -* An optional argument for `splitByChar` to limit the number of resulting elements. Close [#34081](https://github.com/ClickHouse/ClickHouse/issues/34081).
[#34140](https://github.com/ClickHouse/ClickHouse/pull/34140) ([李扬](https://github.com/taiyang-li)). -* Improve the experience of multi-line editing in clickhouse-client. This is a follow-up of [#31123](https://github.com/ClickHouse/ClickHouse/pull/31123). [#34114](https://github.com/ClickHouse/ClickHouse/pull/34114) ([Amos Bird](https://github.com/amosbird)). -* Add `UUID` support in `MsgPack` input/output format. [#34065](https://github.com/ClickHouse/ClickHouse/pull/34065) ([Kruglov Pavel](https://github.com/Avogar)). -* Tracing context (for OpenTelemetry) is now propagated from GRPC client metadata (this change is relevant for the GRPC client-server protocol). [#34064](https://github.com/ClickHouse/ClickHouse/pull/34064) ([andremarianiello](https://github.com/andremarianiello)). -* Support all types of `SYSTEM` queries with the `ON CLUSTER` clause. [#34005](https://github.com/ClickHouse/ClickHouse/pull/34005) ([小路](https://github.com/nicelulu)). -* Improve memory accounting for queries that are using less than `max_untracked_memory`. [#34001](https://github.com/ClickHouse/ClickHouse/pull/34001) ([Azat Khuzhin](https://github.com/azat)). -* Fixed UTF-8 string case-insensitive search when lowercase and uppercase characters are represented by a different number of bytes. Example is `ẞ` and `ß`. This closes [#7334](https://github.com/ClickHouse/ClickHouse/issues/7334). [#33992](https://github.com/ClickHouse/ClickHouse/pull/33992) ([Harry Lee](https://github.com/HarryLeeIBM)). -* Detect format and schema from stdin in `clickhouse-local`. [#33960](https://github.com/ClickHouse/ClickHouse/pull/33960) ([Kruglov Pavel](https://github.com/Avogar)). -* Correctly handle the case of misconfiguration when multiple disks are using the same path on the filesystem. [#29072](https://github.com/ClickHouse/ClickHouse/issues/29072). [#33905](https://github.com/ClickHouse/ClickHouse/pull/33905) ([zhongyuankai](https://github.com/zhongyuankai)). -* Try every resolved IP address while getting the S3 proxy. S3 proxies are rarely used, mostly in Yandex Cloud. [#33862](https://github.com/ClickHouse/ClickHouse/pull/33862) ([Nikolai Kochetov](https://github.com/KochetovNicolai)). -* Support the EXPLAIN AST CREATE FUNCTION query: `EXPLAIN AST CREATE FUNCTION mycast AS (n) -> cast(n as String)` will return `EXPLAIN AST CREATE FUNCTION mycast AS n -> CAST(n, 'String')`. [#33819](https://github.com/ClickHouse/ClickHouse/pull/33819) ([李扬](https://github.com/taiyang-li)). -* Added support for cast from `Map(Key, Value)` to `Array(Tuple(Key, Value))`. [#33794](https://github.com/ClickHouse/ClickHouse/pull/33794) ([Maksim Kita](https://github.com/kitaisreal)). -* Add some improvements and fixes for the `Bool` data type. Fixes [#33244](https://github.com/ClickHouse/ClickHouse/issues/33244). [#33737](https://github.com/ClickHouse/ClickHouse/pull/33737) ([Kruglov Pavel](https://github.com/Avogar)). -* Parse and store OpenTelemetry trace-id in big-endian order. [#33723](https://github.com/ClickHouse/ClickHouse/pull/33723) ([Frank Chen](https://github.com/FrankChen021)). -* Improvement for the `fromUnixTimestamp64` family of functions. They now accept any integer value that can be converted to `Int64`. This closes: [#14648](https://github.com/ClickHouse/ClickHouse/issues/14648). [#33505](https://github.com/ClickHouse/ClickHouse/pull/33505) ([Andrey Zvonov](https://github.com/zvonand)).
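A small sketch of the `fromUnixTimestamp64` change described in the last entry above; the timestamp values are arbitrary and the displayed results depend on the session time zone.

```sql
-- Hypothetical example: the argument no longer has to be exactly Int64.
SELECT
    fromUnixTimestamp64Milli(toInt64(1640995200000)) AS from_int64,
    fromUnixTimestamp64Milli(toUInt32(1234567890))   AS from_uint32; -- accepted after this change
```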
-* Reimplement `_shard_num` from constants (see [#7624](https://github.com/ClickHouse/ClickHouse/issues/7624)) with `shardNum()` function (see [#27020](https://github.com/ClickHouse/ClickHouse/issues/27020)), to avoid possible issues (like those that had been found in [#16947](https://github.com/ClickHouse/ClickHouse/issues/16947)). [#33392](https://github.com/ClickHouse/ClickHouse/pull/33392) ([Azat Khuzhin](https://github.com/azat)). -* Enable binary arithmetic (plus, minus, multiply, division, least, greatest) between Decimal and Float. [#33355](https://github.com/ClickHouse/ClickHouse/pull/33355) ([flynn](https://github.com/ucasfl)). -* Respect cgroups limits in max_threads autodetection. [#33342](https://github.com/ClickHouse/ClickHouse/pull/33342) ([JaySon](https://github.com/JaySon-Huang)). -* Add new clickhouse-keeper setting `min_session_timeout_ms`. Now clickhouse-keeper will determine the client session timeout according to the `min_session_timeout_ms` and `session_timeout_ms` settings. [#33288](https://github.com/ClickHouse/ClickHouse/pull/33288) ([JackyWoo](https://github.com/JackyWoo)). -* Added `UUID` data type support for functions `hex` and `bin`. [#32170](https://github.com/ClickHouse/ClickHouse/pull/32170) ([Frank Chen](https://github.com/FrankChen021)). -* Fix reading of subcolumns with dots in their names. In particular, fixed reading of `Nested` columns if their element names contain dots (e.g. ```Nested(`keys.name` String, `keys.id` UInt64, values UInt64)```). [#34228](https://github.com/ClickHouse/ClickHouse/pull/34228) ([Anton Popov](https://github.com/CurtizJ)). -* Fixes `parallel_view_processing = 0` not working when inserting into a table using `VALUES`. - Fixes `view_duration_ms` in the `query_views_log` not being set correctly for materialized views. [#34067](https://github.com/ClickHouse/ClickHouse/pull/34067) ([Raúl Marín](https://github.com/Algunenano)). -* Fix parsing table structure from ZooKeeper: metadata from ZooKeeper is now compared with local metadata in canonical form. It helps when canonical function names can change between ClickHouse versions. [#33933](https://github.com/ClickHouse/ClickHouse/pull/33933) ([sunny](https://github.com/sunny19930321)). -* Properly escape some characters for interaction with LDAP. [#33401](https://github.com/ClickHouse/ClickHouse/pull/33401) ([IlyaTsoi](https://github.com/IlyaTsoi)). - -#### Build/Testing/Packaging Improvement - -* Remove unbundled build support. [#33690](https://github.com/ClickHouse/ClickHouse/pull/33690) ([Azat Khuzhin](https://github.com/azat)). -* Ensure that tests don't depend on the result of non-stable sorting of equal elements. Added equal items ranges randomization in debug after sort to prevent issues when we rely on equal items sort order. [#34393](https://github.com/ClickHouse/ClickHouse/pull/34393) ([Maksim Kita](https://github.com/kitaisreal)). -* Add verbosity to a style check. [#34289](https://github.com/ClickHouse/ClickHouse/pull/34289) ([Mikhail f. Shiryaev](https://github.com/Felixoid)). -* Remove `clickhouse-test` debian package because it's obsolete. [#33948](https://github.com/ClickHouse/ClickHouse/pull/33948) ([Ilya Yatsishin](https://github.com/qoega)). -* Multiple improvements for the build system to remove the possibility of occasionally using packages from the OS and to enforce hermetic builds. [#33695](https://github.com/ClickHouse/ClickHouse/pull/33695) ([Amos Bird](https://github.com/amosbird)).
- -#### Bug Fix (user-visible misbehaviour in official stable or prestable release) - -* Fixed the assertion in case of using `allow_experimental_parallel_reading_from_replicas` with `max_parallel_replicas` equal to 1. This fixes [#34525](https://github.com/ClickHouse/ClickHouse/issues/34525). [#34613](https://github.com/ClickHouse/ClickHouse/pull/34613) ([Nikita Mikhaylov](https://github.com/nikitamikhaylov)). -* Fix a rare bug while reading empty arrays, which could lead to a `Data compressed with different methods` error. It can reproduce if you have mostly empty arrays (but not always) and reading is performed in the backward direction with ORDER BY ... DESC. This error is extremely unlikely to happen. [#34327](https://github.com/ClickHouse/ClickHouse/pull/34327) ([Anton Popov](https://github.com/CurtizJ)). -* Fix wrong result of `round`/`roundBankers` if integer values of small types are rounded. Closes [#33267](https://github.com/ClickHouse/ClickHouse/issues/33267). [#34562](https://github.com/ClickHouse/ClickHouse/pull/34562) ([李扬](https://github.com/taiyang-li)). -* Sometimes query cancellation did not work immediately when we were reading multiple files from s3 or HDFS. Fixes [#34301](https://github.com/ClickHouse/ClickHouse/issues/34301) Relates to [#34397](https://github.com/ClickHouse/ClickHouse/issues/34397). [#34539](https://github.com/ClickHouse/ClickHouse/pull/34539) ([Dmitry Novik](https://github.com/novikd)). -* Fix exception `Chunk should have AggregatedChunkInfo in MergingAggregatedTransform` (in case of `optimize_aggregation_in_order = 1` and `distributed_aggregation_memory_efficient = 0`). Fixes [#34526](https://github.com/ClickHouse/ClickHouse/issues/34526). [#34532](https://github.com/ClickHouse/ClickHouse/pull/34532) ([Anton Popov](https://github.com/CurtizJ)). -* Fix comparison between integers and floats in index analysis. Previously it could lead to skipping some granules for reading by mistake. Fixes [#34493](https://github.com/ClickHouse/ClickHouse/issues/34493). [#34528](https://github.com/ClickHouse/ClickHouse/pull/34528) ([Anton Popov](https://github.com/CurtizJ)). -* Fix compression support in URL engine. [#34524](https://github.com/ClickHouse/ClickHouse/pull/34524) ([Frank Chen](https://github.com/FrankChen021)). -* Fix possible error 'file_size: Operation not supported' in files' schema autodetection. [#34479](https://github.com/ClickHouse/ClickHouse/pull/34479) ([Kruglov Pavel](https://github.com/Avogar)). -* Fixes possible race with table deletion. [#34416](https://github.com/ClickHouse/ClickHouse/pull/34416) ([Kseniia Sumarokova](https://github.com/kssenii)). -* Fix possible error `Cannot convert column Function to mask` in short circuit function evaluation. Closes [#34171](https://github.com/ClickHouse/ClickHouse/issues/34171). [#34415](https://github.com/ClickHouse/ClickHouse/pull/34415) ([Kruglov Pavel](https://github.com/Avogar)). -* Fix potential crash when doing schema inference from a url source. Closes [#34147](https://github.com/ClickHouse/ClickHouse/issues/34147). [#34405](https://github.com/ClickHouse/ClickHouse/pull/34405) ([Kruglov Pavel](https://github.com/Avogar)). -* For UDFs, access permissions were checked at the database level instead of the global level, as they should be. Closes [#34281](https://github.com/ClickHouse/ClickHouse/issues/34281). [#34404](https://github.com/ClickHouse/ClickHouse/pull/34404) ([Maksim Kita](https://github.com/kitaisreal)).
-* Fix wrong engine syntax in result of `SHOW CREATE DATABASE` query for databases with engine `Memory`. This closes [#34335](https://github.com/ClickHouse/ClickHouse/issues/34335). [#34345](https://github.com/ClickHouse/ClickHouse/pull/34345) ([alexey-milovidov](https://github.com/alexey-milovidov)). -* Fixed a couple of extremely rare race conditions that might lead to a broken state of the replication queue and "intersecting parts" error. [#34297](https://github.com/ClickHouse/ClickHouse/pull/34297) ([tavplubix](https://github.com/tavplubix)). -* Fix progress bar width. It was incorrectly rounded to an integer number of characters. [#34275](https://github.com/ClickHouse/ClickHouse/pull/34275) ([alexey-milovidov](https://github.com/alexey-milovidov)). -* Fix current_user/current_address client information fields for inter-server communication (before this patch current_user/current_address would be preserved from the previous query). [#34263](https://github.com/ClickHouse/ClickHouse/pull/34263) ([Azat Khuzhin](https://github.com/azat)). -* Fix memory leak in case of some Exception during query processing with `optimize_aggregation_in_order=1`. [#34234](https://github.com/ClickHouse/ClickHouse/pull/34234) ([Azat Khuzhin](https://github.com/azat)). -* Fix metric `Query`, which shows the number of executing queries. In the last several releases it was always 0. [#34224](https://github.com/ClickHouse/ClickHouse/pull/34224) ([Anton Popov](https://github.com/CurtizJ)). -* Fix schema inference for table function `s3`. [#34186](https://github.com/ClickHouse/ClickHouse/pull/34186) ([Kruglov Pavel](https://github.com/Avogar)). -* Fix rare and benign race condition in `HDFS`, `S3` and `URL` storage engines which can lead to additional connections. [#34172](https://github.com/ClickHouse/ClickHouse/pull/34172) ([alesapin](https://github.com/alesapin)). -* Fix bug which can rarely lead to error "Cannot read all data" while reading LowCardinality columns of the MergeTree table engine family which stores data on a remote file system like S3 (virtual filesystem over s3 is an experimental feature that is not ready for production). [#34139](https://github.com/ClickHouse/ClickHouse/pull/34139) ([alesapin](https://github.com/alesapin)). -* Fix inserts to distributed tables in case of a change of native protocol. The last change was in version 22.1, so there may be some failures of inserts to distributed tables after an upgrade to that version. [#34132](https://github.com/ClickHouse/ClickHouse/pull/34132) ([Anton Popov](https://github.com/CurtizJ)). -* Fix possible data race in `File` table engine that was introduced in [#33960](https://github.com/ClickHouse/ClickHouse/pull/33960). Closes [#34111](https://github.com/ClickHouse/ClickHouse/issues/34111). [#34113](https://github.com/ClickHouse/ClickHouse/pull/34113) ([Kruglov Pavel](https://github.com/Avogar)). -* Fixed minor race condition that might cause "intersecting parts" error in extremely rare cases after ZooKeeper connection loss. [#34096](https://github.com/ClickHouse/ClickHouse/pull/34096) ([tavplubix](https://github.com/tavplubix)). -* Fix asynchronous inserts with `Native` format. [#34068](https://github.com/ClickHouse/ClickHouse/pull/34068) ([Anton Popov](https://github.com/CurtizJ)). -* Fix a bug which led to the server being unable to start when both replicated access storage and keeper (embedded in clickhouse-server) are used.
Introduced two settings for keeper socket timeout instead of settings from the default user: `keeper_server.socket_receive_timeout_sec` and `keeper_server.socket_send_timeout_sec`. Fixes [#33973](https://github.com/ClickHouse/ClickHouse/issues/33973). [#33988](https://github.com/ClickHouse/ClickHouse/pull/33988) ([alesapin](https://github.com/alesapin)). -* Fix segfault while parsing an ORC file with a corrupted footer. Closes [#33797](https://github.com/ClickHouse/ClickHouse/issues/33797). [#33984](https://github.com/ClickHouse/ClickHouse/pull/33984) ([Kruglov Pavel](https://github.com/Avogar)). -* Fix parsing IPv6 from query parameter (prepared statements) and fix IPv6 to string conversion. Closes [#33928](https://github.com/ClickHouse/ClickHouse/issues/33928). [#33971](https://github.com/ClickHouse/ClickHouse/pull/33971) ([Kruglov Pavel](https://github.com/Avogar)). -* Fix crash while reading nested tuples. Fixes [#33838](https://github.com/ClickHouse/ClickHouse/issues/33838). [#33956](https://github.com/ClickHouse/ClickHouse/pull/33956) ([Anton Popov](https://github.com/CurtizJ)). -* Fix usage of functions `array` and `tuple` with literal arguments in distributed queries. Previously it could lead to a `Not found columns` exception. [#33938](https://github.com/ClickHouse/ClickHouse/pull/33938) ([Anton Popov](https://github.com/CurtizJ)). -* Aggregate function combinator `-If` did not correctly process `Nullable` filter argument. This closes [#27073](https://github.com/ClickHouse/ClickHouse/issues/27073). [#33920](https://github.com/ClickHouse/ClickHouse/pull/33920) ([alexey-milovidov](https://github.com/alexey-milovidov)). -* Fix potential race condition when doing remote disk read (virtual filesystem over s3 is an experimental feature that is not ready for production). [#33912](https://github.com/ClickHouse/ClickHouse/pull/33912) ([Amos Bird](https://github.com/amosbird)). -* Fix crash if an SQL UDF is created with a lambda with non-identifier arguments. Closes [#33866](https://github.com/ClickHouse/ClickHouse/issues/33866). [#33868](https://github.com/ClickHouse/ClickHouse/pull/33868) ([Maksim Kita](https://github.com/kitaisreal)). -* Fix usage of sparse columns (which can be enabled by experimental setting `ratio_of_defaults_for_sparse_serialization`). [#33849](https://github.com/ClickHouse/ClickHouse/pull/33849) ([Anton Popov](https://github.com/CurtizJ)). -* Fixed `replica is not readonly` logical error on `SYSTEM RESTORE REPLICA` query when the replica is actually readonly. Fixes [#33806](https://github.com/ClickHouse/ClickHouse/issues/33806). [#33847](https://github.com/ClickHouse/ClickHouse/pull/33847) ([tavplubix](https://github.com/tavplubix)). -* Fix memory leak in `clickhouse-keeper` in case compression is used (the default). [#33840](https://github.com/ClickHouse/ClickHouse/pull/33840) ([Azat Khuzhin](https://github.com/azat)). -* Fix index analysis with no common types available. [#33833](https://github.com/ClickHouse/ClickHouse/pull/33833) ([Amos Bird](https://github.com/amosbird)). -* Fix schema inference for `JSONEachRow` and `JSONCompactEachRow`. [#33830](https://github.com/ClickHouse/ClickHouse/pull/33830) ([Kruglov Pavel](https://github.com/Avogar)). -* Fix usage of external dictionaries with a `redis` source and a large number of keys. [#33804](https://github.com/ClickHouse/ClickHouse/pull/33804) ([Anton Popov](https://github.com/CurtizJ)). -* Fix a bug in the client that led to 'Connection reset by peer' in the server. Closes [#33309](https://github.com/ClickHouse/ClickHouse/issues/33309).
[#33790](https://github.com/ClickHouse/ClickHouse/pull/33790) ([Kruglov Pavel](https://github.com/Avogar)). -* Fix parsing query INSERT INTO ... VALUES SETTINGS ... (...), ... [#33776](https://github.com/ClickHouse/ClickHouse/pull/33776) ([Kruglov Pavel](https://github.com/Avogar)). -* Fix a bug in check table when creating a data part with wide format and a projection. [#33774](https://github.com/ClickHouse/ClickHouse/pull/33774) ([李扬](https://github.com/taiyang-li)). -* Fix tiny race between count() and INSERT/merges/... in MergeTree (it is possible to return an incorrect number of rows for SELECT with optimize_trivial_count_query). [#33753](https://github.com/ClickHouse/ClickHouse/pull/33753) ([Azat Khuzhin](https://github.com/azat)). -* Throw an exception when a directory listing request has failed in storage HDFS. [#33724](https://github.com/ClickHouse/ClickHouse/pull/33724) ([LiuNeng](https://github.com/liuneng1994)). -* Fix mutation when table contains projections. This fixes [#33010](https://github.com/ClickHouse/ClickHouse/issues/33010). This fixes [#33275](https://github.com/ClickHouse/ClickHouse/issues/33275). [#33679](https://github.com/ClickHouse/ClickHouse/pull/33679) ([Amos Bird](https://github.com/amosbird)). -* Correctly determine current database if `CREATE TEMPORARY TABLE AS SELECT` is queried inside a named HTTP session. This is a very rare use case. This closes [#8340](https://github.com/ClickHouse/ClickHouse/issues/8340). [#33676](https://github.com/ClickHouse/ClickHouse/pull/33676) ([alexey-milovidov](https://github.com/alexey-milovidov)). -* Allow some queries with sorting, LIMIT BY, ARRAY JOIN and lambda functions. This closes [#7462](https://github.com/ClickHouse/ClickHouse/issues/7462). [#33675](https://github.com/ClickHouse/ClickHouse/pull/33675) ([alexey-milovidov](https://github.com/alexey-milovidov)). -* Fix a bug in "zero copy replication" (a feature that is under development and should not be used in production) which led to data duplication in case of TTL move. Fixes [#33643](https://github.com/ClickHouse/ClickHouse/issues/33643). [#33642](https://github.com/ClickHouse/ClickHouse/pull/33642) ([alesapin](https://github.com/alesapin)). -* Fix `Chunk should have AggregatedChunkInfo in GroupingAggregatedTransform` (in case of `optimize_aggregation_in_order = 1`). [#33637](https://github.com/ClickHouse/ClickHouse/pull/33637) ([Azat Khuzhin](https://github.com/azat)). -* Fix error `Bad cast from type ... to DB::DataTypeArray` which may happen when a table has a `Nested` column with dots in its name, and a default value is generated for it (e.g. during insert, when the column is not listed). Continuation of [#28762](https://github.com/ClickHouse/ClickHouse/issues/28762). [#33588](https://github.com/ClickHouse/ClickHouse/pull/33588) ([Alexey Pavlenko](https://github.com/alexeypavlenko)). -* Export into `lz4` files has been fixed. Closes [#31421](https://github.com/ClickHouse/ClickHouse/issues/31421). [#31862](https://github.com/ClickHouse/ClickHouse/pull/31862) ([Kruglov Pavel](https://github.com/Avogar)). -* Fix potential crash if `group_by_overflow_mode` was set to `any` (approximate GROUP BY) and aggregation was performed by a single column of type `LowCardinality`. [#34506](https://github.com/ClickHouse/ClickHouse/pull/34506) ([DR](https://github.com/freedomDR)). -* Fix inserting to temporary tables via gRPC client-server protocol. Fixes [#34347](https://github.com/ClickHouse/ClickHouse/issues/34347), issue `#2`.
[#34364](https://github.com/ClickHouse/ClickHouse/pull/34364) ([Vitaly Baranov](https://github.com/vitlibar)). -* Fix issue [#19429](https://github.com/ClickHouse/ClickHouse/issues/19429). [#34225](https://github.com/ClickHouse/ClickHouse/pull/34225) ([Vitaly Baranov](https://github.com/vitlibar)). -* Fix issue [#18206](https://github.com/ClickHouse/ClickHouse/issues/18206). [#33977](https://github.com/ClickHouse/ClickHouse/pull/33977) ([Vitaly Baranov](https://github.com/vitlibar)). -* This PR allows using multiple LDAP storages in the same list of user directories. It worked earlier but was broken because LDAP tests are disabled (they are part of the testflows tests). [#33574](https://github.com/ClickHouse/ClickHouse/pull/33574) ([Vitaly Baranov](https://github.com/vitlibar)). - - -### ClickHouse release v22.1, 2022-01-18 - -#### Upgrade Notes - -* The functions `left` and `right` were previously implemented in the parser and are now full-featured. Distributed queries with `left` or `right` functions without aliases may throw an exception if the cluster contains different versions of clickhouse-server. If you are upgrading your cluster and encounter this error, you should finish upgrading your cluster to ensure all nodes have the same version. Also you can add aliases (`AS something`) to the columns in your queries to avoid this issue. [#33407](https://github.com/ClickHouse/ClickHouse/pull/33407) ([alexey-milovidov](https://github.com/alexey-milovidov)). -* Resource usage by scalar subqueries is fully accounted since this version. With this change, rows read in scalar subqueries are now reported in the query_log. If the scalar subquery is cached (repeated or called for several rows) the rows read are only counted once. This change allows KILLing queries and reporting progress while they are executing scalar subqueries. [#32271](https://github.com/ClickHouse/ClickHouse/pull/32271) ([Raúl Marín](https://github.com/Algunenano)). - -#### New Feature - -* Implement data schema inference for input formats. Allow to skip structure (or write just `auto`) in table functions `file`, `url`, `s3`, `hdfs` and in parameters of `clickhouse-local`. Allow to skip structure in create query for table engines `File`, `HDFS`, `S3`, `URL`, `Merge`, `Buffer`, `Distributed` and `ReplicatedMergeTree` (if we add new replicas). [#32455](https://github.com/ClickHouse/ClickHouse/pull/32455) ([Kruglov Pavel](https://github.com/Avogar)). -* Detect format by file extension in `file`/`hdfs`/`s3`/`url` table functions and `HDFS`/`S3`/`URL` table engines and also for `SELECT INTO OUTFILE` and `INSERT FROM INFILE`. [#33565](https://github.com/ClickHouse/ClickHouse/pull/33565) ([Kruglov Pavel](https://github.com/Avogar)). Close [#30918](https://github.com/ClickHouse/ClickHouse/issues/30918). [#33443](https://github.com/ClickHouse/ClickHouse/pull/33443) ([OnePiece](https://github.com/zhongyuankai)). -* A tool for collecting diagnostics data if you need support. [#33175](https://github.com/ClickHouse/ClickHouse/pull/33175) ([Alexander Burmak](https://github.com/Alex-Burmak)). -* Automatic cluster discovery via Zoo/Keeper. It allows adding replicas to the cluster without changing the configuration on every server. [#31442](https://github.com/ClickHouse/ClickHouse/pull/31442) ([vdimir](https://github.com/vdimir)). -* Implement the Hive table engine to access Apache Hive from ClickHouse. This implements: [#29245](https://github.com/ClickHouse/ClickHouse/issues/29245).
[#31104](https://github.com/ClickHouse/ClickHouse/pull/31104) ([taiyang-li](https://github.com/taiyang-li)). -* Add aggregate functions `cramersV`, `cramersVBiasCorrected`, `theilsU` and `contingency`. These functions calculate dependency (measure of association) between categorical values. All these functions are using cross-tab (histogram on pairs) for implementation. You can imagine it like a correlation coefficient but for any discrete values (not necessarily numbers). [#33366](https://github.com/ClickHouse/ClickHouse/pull/33366) ([alexey-milovidov](https://github.com/alexey-milovidov)). Initial implementation by [Vanyok-All-is-OK](https://github.com/Vanyok-All-is-OK) and [antikvist](https://github.com/antikvist). -* Added table function `hdfsCluster` which allows processing files from HDFS in parallel from many nodes in a specified cluster, similarly to `s3Cluster`. [#32400](https://github.com/ClickHouse/ClickHouse/pull/32400) ([Zhichang Yu](https://github.com/yuzhichang)). -* Add support for disks backed by Azure Blob Storage, in a similar way to how it has been done for disks backed by AWS S3. [#31505](https://github.com/ClickHouse/ClickHouse/pull/31505) ([Jakub Kuklis](https://github.com/jkuklis)). -* Allow `COMMENT` in `CREATE VIEW` (for all VIEW kinds). [#31062](https://github.com/ClickHouse/ClickHouse/pull/31062) ([Vasily Nemkov](https://github.com/Enmk)). -* Dynamically reinitialize listening ports and protocols when configuration changes. [#30549](https://github.com/ClickHouse/ClickHouse/pull/30549) ([Kevin Michel](https://github.com/kmichel-aiven)). -* Added `left`, `right`, `leftUTF8`, `rightUTF8` functions. Fix error in implementation of `substringUTF8` function with negative offset (offset from the end of string). [#33407](https://github.com/ClickHouse/ClickHouse/pull/33407) ([alexey-milovidov](https://github.com/alexey-milovidov)). -* Add new functions for `H3` coordinate system: `h3HexAreaKm2`, `h3CellAreaM2`, `h3CellAreaRads2`. [#33479](https://github.com/ClickHouse/ClickHouse/pull/33479) ([Bharat Nallan](https://github.com/bharatnc)). -* Add `MONTHNAME` function. [#33436](https://github.com/ClickHouse/ClickHouse/pull/33436) ([usurai](https://github.com/usurai)). -* Added function `arrayLast`. Closes [#33390](https://github.com/ClickHouse/ClickHouse/issues/33390). [#33415](https://github.com/ClickHouse/ClickHouse/pull/33415) Added function `arrayLastIndex`. [#33465](https://github.com/ClickHouse/ClickHouse/pull/33465) ([Maksim Kita](https://github.com/kitaisreal)). -* Add function `decodeURLFormComponent`, slightly different from `decodeURLComponent`. Close [#10298](https://github.com/ClickHouse/ClickHouse/issues/10298). [#33451](https://github.com/ClickHouse/ClickHouse/pull/33451) ([SuperDJY](https://github.com/cmsxbc)). -* Allow splitting `GraphiteMergeTree` rollup rules for plain/tagged metrics (optional rule_type field). [#33494](https://github.com/ClickHouse/ClickHouse/pull/33494) ([Michail Safronov](https://github.com/msaf1980)). - - -#### Performance Improvement - -* Support moving conditions to `PREWHERE` (setting `optimize_move_to_prewhere`) for tables of the `Merge` engine if all of its underlying tables support `PREWHERE`. [#33300](https://github.com/ClickHouse/ClickHouse/pull/33300) ([Anton Popov](https://github.com/CurtizJ)). -* More efficient handling of globs for URL storage. Now you can easily query a million URLs in parallel with retries. Closes [#32866](https://github.com/ClickHouse/ClickHouse/issues/32866).
[#32907](https://github.com/ClickHouse/ClickHouse/pull/32907) ([Kseniia Sumarokova](https://github.com/kssenii)). -* Avoid exponential backtracking in parser. This closes [#20158](https://github.com/ClickHouse/ClickHouse/issues/20158). [#33481](https://github.com/ClickHouse/ClickHouse/pull/33481) ([alexey-milovidov](https://github.com/alexey-milovidov)). -* Abuse of `untuple` function was leading to exponential complexity of query analysis (found by fuzzer). This closes [#33297](https://github.com/ClickHouse/ClickHouse/issues/33297). [#33445](https://github.com/ClickHouse/ClickHouse/pull/33445) ([alexey-milovidov](https://github.com/alexey-milovidov)). -* Reduce allocated memory for dictionaries with string attributes. [#33466](https://github.com/ClickHouse/ClickHouse/pull/33466) ([Maksim Kita](https://github.com/kitaisreal)). -* Slight performance improvement of `reinterpret` function. [#32587](https://github.com/ClickHouse/ClickHouse/pull/32587) ([alexey-milovidov](https://github.com/alexey-milovidov)). -* Non significant change. In extremely rare cases when data part is lost on every replica, after merging of some data parts, the subsequent queries may skip less amount of partitions during partition pruning. This hardly affects anything. [#32220](https://github.com/ClickHouse/ClickHouse/pull/32220) ([Azat Khuzhin](https://github.com/azat)). -* Improve `clickhouse-keeper` writing performance by optimization the size calculation logic. [#32366](https://github.com/ClickHouse/ClickHouse/pull/32366) ([zhanglistar](https://github.com/zhanglistar)). -* Optimize single part projection materialization. This closes [#31669](https://github.com/ClickHouse/ClickHouse/issues/31669). [#31885](https://github.com/ClickHouse/ClickHouse/pull/31885) ([Amos Bird](https://github.com/amosbird)). -* Improve query performance of system tables. [#33312](https://github.com/ClickHouse/ClickHouse/pull/33312) ([OnePiece](https://github.com/zhongyuankai)). -* Optimize selecting of MergeTree parts that can be moved between volumes. [#33225](https://github.com/ClickHouse/ClickHouse/pull/33225) ([OnePiece](https://github.com/zhongyuankai)). -* Fix `sparse_hashed` dict performance with sequential keys (wrong hash function). [#32536](https://github.com/ClickHouse/ClickHouse/pull/32536) ([Azat Khuzhin](https://github.com/azat)). - - -#### Experimental Feature - -* Parallel reading from multiple replicas within a shard during distributed query without using sample key. To enable this, set `allow_experimental_parallel_reading_from_replicas = 1` and `max_parallel_replicas` to any number. This closes [#26748](https://github.com/ClickHouse/ClickHouse/issues/26748). [#29279](https://github.com/ClickHouse/ClickHouse/pull/29279) ([Nikita Mikhaylov](https://github.com/nikitamikhaylov)). -* Implemented sparse serialization. It can reduce usage of disk space and improve performance of some queries for columns, which contain a lot of default (zero) values. It can be enabled by setting `ratio_for_sparse_serialization`. Sparse serialization will be chosen dynamically for column, if it has ratio of number of default values to number of all values above that threshold. Serialization (default or sparse) will be fixed for every column in part, but may varies between parts. [#22535](https://github.com/ClickHouse/ClickHouse/pull/22535) ([Anton Popov](https://github.com/CurtizJ)). -* Add "TABLE OVERRIDE" feature for customizing MaterializedMySQL table schemas. 
[#32325](https://github.com/ClickHouse/ClickHouse/pull/32325) ([Stig Bakken](https://github.com/stigsb)). -* Add `EXPLAIN TABLE OVERRIDE` query. [#32836](https://github.com/ClickHouse/ClickHouse/pull/32836) ([Stig Bakken](https://github.com/stigsb)). -* Support TABLE OVERRIDE clause for MaterializedPostgreSQL. RFC: [#31480](https://github.com/ClickHouse/ClickHouse/issues/31480). [#32749](https://github.com/ClickHouse/ClickHouse/pull/32749) ([Kseniia Sumarokova](https://github.com/kssenii)). -* Change ZooKeeper path for zero-copy marks for shared data. Note that "zero-copy replication" is non-production feature (in early stages of development) that you shouldn't use anyway. But in case if you have used it, let you keep in mind this change. [#32061](https://github.com/ClickHouse/ClickHouse/pull/32061) ([ianton-ru](https://github.com/ianton-ru)). -* Events clause support for WINDOW VIEW watch query. [#32607](https://github.com/ClickHouse/ClickHouse/pull/32607) ([vxider](https://github.com/Vxider)). -* Fix ACL with explicit digit hash in `clickhouse-keeper`: now the behavior consistent with ZooKeeper and generated digest is always accepted. [#33249](https://github.com/ClickHouse/ClickHouse/pull/33249) ([小路](https://github.com/nicelulu)). [#33246](https://github.com/ClickHouse/ClickHouse/pull/33246). -* Fix unexpected projection removal when detaching parts. [#32067](https://github.com/ClickHouse/ClickHouse/pull/32067) ([Amos Bird](https://github.com/amosbird)). - - -#### Improvement - -* Now date time conversion functions that generates time before `1970-01-01 00:00:00` will be saturated to zero instead of overflow. [#29953](https://github.com/ClickHouse/ClickHouse/pull/29953) ([Amos Bird](https://github.com/amosbird)). It also fixes a bug in index analysis if date truncation function would yield result before the Unix epoch. -* Always display resource usage (total CPU usage, total RAM usage and max RAM usage per host) in client. [#33271](https://github.com/ClickHouse/ClickHouse/pull/33271) ([alexey-milovidov](https://github.com/alexey-milovidov)). -* Improve `Bool` type serialization and deserialization, check the range of values. [#32984](https://github.com/ClickHouse/ClickHouse/pull/32984) ([Kruglov Pavel](https://github.com/Avogar)). -* If an invalid setting is defined using the `SET` query or using the query parameters in the HTTP request, error message will contain suggestions that are similar to the invalid setting string (if any exists). [#32946](https://github.com/ClickHouse/ClickHouse/pull/32946) ([Antonio Andelic](https://github.com/antonio2368)). -* Support hints for mistyped setting names for clickhouse-client and clickhouse-local. Closes [#32237](https://github.com/ClickHouse/ClickHouse/issues/32237). [#32841](https://github.com/ClickHouse/ClickHouse/pull/32841) ([凌涛](https://github.com/lingtaolf)). -* Allow to use virtual columns in Materialized Views. Close [#11210](https://github.com/ClickHouse/ClickHouse/issues/11210). [#33482](https://github.com/ClickHouse/ClickHouse/pull/33482) ([OnePiece](https://github.com/zhongyuankai)). -* Add config to disable IPv6 in clickhouse-keeper if needed. This close [#33381](https://github.com/ClickHouse/ClickHouse/issues/33381). [#33450](https://github.com/ClickHouse/ClickHouse/pull/33450) ([Wu Xueyang](https://github.com/wuxueyang96)). -* Add more info to `system.build_options` about current git revision. [#33431](https://github.com/ClickHouse/ClickHouse/pull/33431) ([taiyang-li](https://github.com/taiyang-li)). 
-* `clickhouse-local`: track memory under `--max_memory_usage_in_client` option. [#33341](https://github.com/ClickHouse/ClickHouse/pull/33341) ([Azat Khuzhin](https://github.com/azat)). -* Allow negative intervals in function `intervalLengthSum`. Their length will be added as well. This closes [#33323](https://github.com/ClickHouse/ClickHouse/issues/33323). [#33335](https://github.com/ClickHouse/ClickHouse/pull/33335) ([alexey-milovidov](https://github.com/alexey-milovidov)). -* `LineAsString` can be used as output format. This closes [#30919](https://github.com/ClickHouse/ClickHouse/issues/30919). [#33331](https://github.com/ClickHouse/ClickHouse/pull/33331) ([Sergei Trifonov](https://github.com/serxa)). -* Support `` in cluster configuration, as an alternative form of `1`. Close [#33270](https://github.com/ClickHouse/ClickHouse/issues/33270). [#33330](https://github.com/ClickHouse/ClickHouse/pull/33330) ([SuperDJY](https://github.com/cmsxbc)). -* Pressing Ctrl+C twice will terminate `clickhouse-benchmark` immediately without waiting for in-flight queries. This closes [#32586](https://github.com/ClickHouse/ClickHouse/issues/32586). [#33303](https://github.com/ClickHouse/ClickHouse/pull/33303) ([alexey-milovidov](https://github.com/alexey-milovidov)). -* Support Unix timestamp with milliseconds in `parseDateTimeBestEffort` function. [#33276](https://github.com/ClickHouse/ClickHouse/pull/33276) ([Ben](https://github.com/benbiti)). -* Allow to cancel query while reading data from external table in the formats: `Arrow` / `Parquet` / `ORC` - it failed to be cancelled it case of big files and setting input_format_allow_seeks as false. Closes [#29678](https://github.com/ClickHouse/ClickHouse/issues/29678). [#33238](https://github.com/ClickHouse/ClickHouse/pull/33238) ([Kseniia Sumarokova](https://github.com/kssenii)). -* If table engine supports `SETTINGS` clause, allow to pass the settings as key-value or via config. Add this support for MySQL. [#33231](https://github.com/ClickHouse/ClickHouse/pull/33231) ([Kseniia Sumarokova](https://github.com/kssenii)). -* Correctly prevent Nullable primary keys if necessary. This is for [#32780](https://github.com/ClickHouse/ClickHouse/issues/32780). [#33218](https://github.com/ClickHouse/ClickHouse/pull/33218) ([Amos Bird](https://github.com/amosbird)). -* Add retry for `PostgreSQL` connections in case nothing has been fetched yet. Closes [#33199](https://github.com/ClickHouse/ClickHouse/issues/33199). [#33209](https://github.com/ClickHouse/ClickHouse/pull/33209) ([Kseniia Sumarokova](https://github.com/kssenii)). -* Validate config keys for external dictionaries. [#33095](https://github.com/ClickHouse/ClickHouse/issues/33095#issuecomment-1000577517). [#33130](https://github.com/ClickHouse/ClickHouse/pull/33130) ([Kseniia Sumarokova](https://github.com/kssenii)). -* Send profile info inside `clickhouse-local`. Closes [#33093](https://github.com/ClickHouse/ClickHouse/issues/33093). [#33097](https://github.com/ClickHouse/ClickHouse/pull/33097) ([Kseniia Sumarokova](https://github.com/kssenii)). -* Short circuit evaluation: support for function `throwIf`. Closes [#32969](https://github.com/ClickHouse/ClickHouse/issues/32969). [#32973](https://github.com/ClickHouse/ClickHouse/pull/32973) ([Maksim Kita](https://github.com/kitaisreal)). -* (This only happens in unofficial builds). Fixed segfault when inserting data into compressed Decimal, String, FixedString and Array columns. This closes [#32939](https://github.com/ClickHouse/ClickHouse/issues/32939). 
[#32940](https://github.com/ClickHouse/ClickHouse/pull/32940) ([N. Kolotov](https://github.com/nkolotov)). -* Added support for specifying subquery as SQL user defined function. Example: `CREATE FUNCTION test AS () -> (SELECT 1)`. Closes [#30755](https://github.com/ClickHouse/ClickHouse/issues/30755). [#32758](https://github.com/ClickHouse/ClickHouse/pull/32758) ([Maksim Kita](https://github.com/kitaisreal)). -* Improve gRPC compression support for [#28671](https://github.com/ClickHouse/ClickHouse/issues/28671). [#32747](https://github.com/ClickHouse/ClickHouse/pull/32747) ([Vitaly Baranov](https://github.com/vitlibar)). -* Flush all In-Memory data parts when WAL is not enabled while shutdown server or detaching table. [#32742](https://github.com/ClickHouse/ClickHouse/pull/32742) ([nauta](https://github.com/nautaa)). -* Allow to control connection timeouts for MySQL (previously was supported only for dictionary source). Closes [#16669](https://github.com/ClickHouse/ClickHouse/issues/16669). Previously default connect_timeout was rather small, now it is configurable. [#32734](https://github.com/ClickHouse/ClickHouse/pull/32734) ([Kseniia Sumarokova](https://github.com/kssenii)). -* Support `authSource` option for storage `MongoDB`. Closes [#32594](https://github.com/ClickHouse/ClickHouse/issues/32594). [#32702](https://github.com/ClickHouse/ClickHouse/pull/32702) ([Kseniia Sumarokova](https://github.com/kssenii)). -* Support `Date32` type in `genarateRandom` table function. [#32643](https://github.com/ClickHouse/ClickHouse/pull/32643) ([nauta](https://github.com/nautaa)). -* Add settings `max_concurrent_select_queries` and `max_concurrent_insert_queries` for control concurrent queries by query kind. Close [#3575](https://github.com/ClickHouse/ClickHouse/issues/3575). [#32609](https://github.com/ClickHouse/ClickHouse/pull/32609) ([SuperDJY](https://github.com/cmsxbc)). -* Improve handling nested structures with missing columns while reading data in `Protobuf` format. Follow-up to https://github.com/ClickHouse/ClickHouse/pull/31988. [#32531](https://github.com/ClickHouse/ClickHouse/pull/32531) ([Vitaly Baranov](https://github.com/vitlibar)). -* Allow empty credentials for `MongoDB` engine. Closes [#26267](https://github.com/ClickHouse/ClickHouse/issues/26267). [#32460](https://github.com/ClickHouse/ClickHouse/pull/32460) ([Kseniia Sumarokova](https://github.com/kssenii)). -* Disable some optimizations for window functions that may lead to exceptions. Closes [#31535](https://github.com/ClickHouse/ClickHouse/issues/31535). Closes [#31620](https://github.com/ClickHouse/ClickHouse/issues/31620). [#32453](https://github.com/ClickHouse/ClickHouse/pull/32453) ([Kseniia Sumarokova](https://github.com/kssenii)). -* Allows to connect to MongoDB 5.0. Closes [#31483](https://github.com/ClickHouse/ClickHouse/issues/31483),. [#32416](https://github.com/ClickHouse/ClickHouse/pull/32416) ([Kseniia Sumarokova](https://github.com/kssenii)). -* Enable comparison between `Decimal` and `Float`. Closes [#22626](https://github.com/ClickHouse/ClickHouse/issues/22626). [#31966](https://github.com/ClickHouse/ClickHouse/pull/31966) ([flynn](https://github.com/ucasFL)). -* Added settings `command_read_timeout`, `command_write_timeout` for `StorageExecutable`, `StorageExecutablePool`, `ExecutableDictionary`, `ExecutablePoolDictionary`, `ExecutableUserDefinedFunctions`. Setting `command_read_timeout` controls timeout for reading data from command stdout in milliseconds. 
Setting `command_write_timeout` timeout for writing data to command stdin in milliseconds. Added settings `command_termination_timeout` for `ExecutableUserDefinedFunction`, `ExecutableDictionary`, `StorageExecutable`. Added setting `execute_direct` for `ExecutableUserDefinedFunction`, by default true. Added setting `execute_direct` for `ExecutableDictionary`, `ExecutablePoolDictionary`, by default false. [#30957](https://github.com/ClickHouse/ClickHouse/pull/30957) ([Maksim Kita](https://github.com/kitaisreal)). -* Bitmap aggregate functions will give correct result for out of range argument instead of wraparound. [#33127](https://github.com/ClickHouse/ClickHouse/pull/33127) ([DR](https://github.com/freedomDR)). -* Fix parsing incorrect queries with `FROM INFILE` statement. [#33521](https://github.com/ClickHouse/ClickHouse/pull/33521) ([Kruglov Pavel](https://github.com/Avogar)). -* Don't allow to write into `S3` if path contains globs. [#33142](https://github.com/ClickHouse/ClickHouse/pull/33142) ([Kruglov Pavel](https://github.com/Avogar)). -* `--echo` option was not used by `clickhouse-client` in batch mode with single query. [#32843](https://github.com/ClickHouse/ClickHouse/pull/32843) ([N. Kolotov](https://github.com/nkolotov)). -* Use `--database` option for clickhouse-local. [#32797](https://github.com/ClickHouse/ClickHouse/pull/32797) ([Kseniia Sumarokova](https://github.com/kssenii)). -* Fix surprisingly bad code in SQL ordinary function `file`. Now it supports symlinks. [#32640](https://github.com/ClickHouse/ClickHouse/pull/32640) ([alexey-milovidov](https://github.com/alexey-milovidov)). -* Updating `modification_time` for data part in `system.parts` after part movement [#32964](https://github.com/ClickHouse/ClickHouse/issues/32964). [#32965](https://github.com/ClickHouse/ClickHouse/pull/32965) ([save-my-heart](https://github.com/save-my-heart)). -* Potential issue, cannot be exploited: integer overflow may happen in array resize. [#33024](https://github.com/ClickHouse/ClickHouse/pull/33024) ([varadarajkumar](https://github.com/varadarajkumar)). - - -#### Build/Testing/Packaging Improvement - -* Add packages, functional tests and Docker builds for AArch64 (ARM) version of ClickHouse. [#32911](https://github.com/ClickHouse/ClickHouse/pull/32911) ([Mikhail f. Shiryaev](https://github.com/Felixoid)). [#32415](https://github.com/ClickHouse/ClickHouse/pull/32415) -* Prepare ClickHouse to be built with musl-libc. It is not enabled by default. [#33134](https://github.com/ClickHouse/ClickHouse/pull/33134) ([alexey-milovidov](https://github.com/alexey-milovidov)). -* Make installation script working on FreeBSD. This closes [#33384](https://github.com/ClickHouse/ClickHouse/issues/33384). [#33418](https://github.com/ClickHouse/ClickHouse/pull/33418) ([alexey-milovidov](https://github.com/alexey-milovidov)). -* Add `actionlint` for GitHub Actions workflows and verify workflow files via `act --list` to check the correct workflow syntax. [#33612](https://github.com/ClickHouse/ClickHouse/pull/33612) ([Mikhail f. Shiryaev](https://github.com/Felixoid)). -* Add more tests for the nullable primary key feature. Add more tests with different types and merge tree kinds, plus randomly generated data. [#33228](https://github.com/ClickHouse/ClickHouse/pull/33228) ([Amos Bird](https://github.com/amosbird)). -* Add a simple tool to visualize flaky tests in web browser. [#33185](https://github.com/ClickHouse/ClickHouse/pull/33185) ([alexey-milovidov](https://github.com/alexey-milovidov)). 
-* Enable hermetic build for shared builds. This is mainly for developers. [#32968](https://github.com/ClickHouse/ClickHouse/pull/32968) ([Amos Bird](https://github.com/amosbird)). -* Update `libc++` and `libc++abi` to the latest. [#32484](https://github.com/ClickHouse/ClickHouse/pull/32484) ([Raúl Marín](https://github.com/Algunenano)). -* Added integration test for external .NET client ([ClickHouse.Client](https://github.com/DarkWanderer/ClickHouse.Client)). [#23230](https://github.com/ClickHouse/ClickHouse/pull/23230) ([Oleg V. Kozlyuk](https://github.com/DarkWanderer)). -* Inject git information into clickhouse binary file. So we can get source code revision easily from clickhouse binary file. [#33124](https://github.com/ClickHouse/ClickHouse/pull/33124) ([taiyang-li](https://github.com/taiyang-li)). -* Remove obsolete code from ConfigProcessor. Yandex specific code is not used anymore. The code contained one minor defect. This defect was reported by [Mallik Hassan](https://github.com/SadiHassan) in [#33032](https://github.com/ClickHouse/ClickHouse/issues/33032). This closes [#33032](https://github.com/ClickHouse/ClickHouse/issues/33032). [#33026](https://github.com/ClickHouse/ClickHouse/pull/33026) ([alexey-milovidov](https://github.com/alexey-milovidov)). - - -#### Bug Fix (user-visible misbehavior in official stable or prestable release) - -* Several fixes for format parsing. This is relevant if `clickhouse-server` is open for write access to adversary. Specifically crafted input data for `Native` format may lead to reading uninitialized memory or crash. This is relevant if `clickhouse-server` is open for write access to adversary. [#33050](https://github.com/ClickHouse/ClickHouse/pull/33050) ([Heena Bansal](https://github.com/HeenaBansal2009)). Fixed Apache Avro Union type index out of boundary issue in Apache Avro binary format. [#33022](https://github.com/ClickHouse/ClickHouse/pull/33022) ([Harry Lee](https://github.com/HarryLeeIBM)). Fix null pointer dereference in `LowCardinality` data when deserializing `LowCardinality` data in the Native format. [#33021](https://github.com/ClickHouse/ClickHouse/pull/33021) ([Harry Lee](https://github.com/HarryLeeIBM)). -* ClickHouse Keeper handler will correctly remove operation when response sent. [#32988](https://github.com/ClickHouse/ClickHouse/pull/32988) ([JackyWoo](https://github.com/JackyWoo)). -* Potential off-by-one miscalculation of quotas: quota limit was not reached, but the limit was exceeded. This fixes [#31174](https://github.com/ClickHouse/ClickHouse/issues/31174). [#31656](https://github.com/ClickHouse/ClickHouse/pull/31656) ([sunny](https://github.com/sunny19930321)). -* Fixed CASTing from String to IPv4 or IPv6 and back. Fixed error message in case of failed conversion. [#29224](https://github.com/ClickHouse/ClickHouse/pull/29224) ([Dmitry Novik](https://github.com/novikd)) [#27914](https://github.com/ClickHouse/ClickHouse/pull/27914) ([Vasily Nemkov](https://github.com/Enmk)). -* Fixed an exception like `Unknown aggregate function nothing` during an execution on a remote server. This fixes [#16689](https://github.com/ClickHouse/ClickHouse/issues/16689). [#26074](https://github.com/ClickHouse/ClickHouse/pull/26074) ([hexiaoting](https://github.com/hexiaoting)). -* Fix wrong database for JOIN without explicit database in distributed queries (Fixes: [#10471](https://github.com/ClickHouse/ClickHouse/issues/10471)). [#33611](https://github.com/ClickHouse/ClickHouse/pull/33611) ([Azat Khuzhin](https://github.com/azat)). 
-* Fix segfault in Apache `Avro` format that appears after the second insert into file. [#33566](https://github.com/ClickHouse/ClickHouse/pull/33566) ([Kruglov Pavel](https://github.com/Avogar)). -* Fix segfault in Apache `Arrow` format if schema contains `Dictionary` type. Closes [#33507](https://github.com/ClickHouse/ClickHouse/issues/33507). [#33529](https://github.com/ClickHouse/ClickHouse/pull/33529) ([Kruglov Pavel](https://github.com/Avogar)). -* Out of band `offset` and `limit` settings may be applied incorrectly for views. Close [#33289](https://github.com/ClickHouse/ClickHouse/issues/33289) [#33518](https://github.com/ClickHouse/ClickHouse/pull/33518) ([hexiaoting](https://github.com/hexiaoting)). -* Fix an exception `Block structure mismatch` which may happen during insertion into table with default nested `LowCardinality` column. Fixes [#33028](https://github.com/ClickHouse/ClickHouse/issues/33028). [#33504](https://github.com/ClickHouse/ClickHouse/pull/33504) ([Nikolai Kochetov](https://github.com/KochetovNicolai)). -* Fix dictionary expressions for `range_hashed` range min and range max attributes when created using DDL. Closes [#30809](https://github.com/ClickHouse/ClickHouse/issues/30809). [#33478](https://github.com/ClickHouse/ClickHouse/pull/33478) ([Maksim Kita](https://github.com/kitaisreal)). -* Fix possible use-after-free for INSERT into Materialized View with concurrent DROP ([Azat Khuzhin](https://github.com/azat)). -* Do not try to read pass EOF (to workaround for a bug in the Linux kernel), this bug can be reproduced on kernels (3.14..5.9), and requires `index_granularity_bytes=0` (i.e. turn off adaptive index granularity). [#33372](https://github.com/ClickHouse/ClickHouse/pull/33372) ([Azat Khuzhin](https://github.com/azat)). -* The commands `SYSTEM SUSPEND` and `SYSTEM ... THREAD FUZZER` missed access control. It is fixed. Author: Kevin Michel. [#33333](https://github.com/ClickHouse/ClickHouse/pull/33333) ([alexey-milovidov](https://github.com/alexey-milovidov)). -* Fix when `COMMENT` for dictionaries does not appear in `system.tables`, `system.dictionaries`. Allow to modify the comment for `Dictionary` engine. Closes [#33251](https://github.com/ClickHouse/ClickHouse/issues/33251). [#33261](https://github.com/ClickHouse/ClickHouse/pull/33261) ([Maksim Kita](https://github.com/kitaisreal)). -* Add asynchronous inserts (with enabled setting `async_insert`) to query log. Previously such queries didn't appear in the query log. [#33239](https://github.com/ClickHouse/ClickHouse/pull/33239) ([Anton Popov](https://github.com/CurtizJ)). -* Fix sending `WHERE 1 = 0` expressions for external databases query. Closes [#33152](https://github.com/ClickHouse/ClickHouse/issues/33152). [#33214](https://github.com/ClickHouse/ClickHouse/pull/33214) ([Kseniia Sumarokova](https://github.com/kssenii)). -* Fix DDL validation for MaterializedPostgreSQL. Fix setting `materialized_postgresql_allow_automatic_update`. Closes [#29535](https://github.com/ClickHouse/ClickHouse/issues/29535). [#33200](https://github.com/ClickHouse/ClickHouse/pull/33200) ([Kseniia Sumarokova](https://github.com/kssenii)). Make sure unused replication slots are always removed. Found in [#26952](https://github.com/ClickHouse/ClickHouse/issues/26952). [#33187](https://github.com/ClickHouse/ClickHouse/pull/33187) ([Kseniia Sumarokova](https://github.com/kssenii)). Fix MaterializedPostreSQL detach/attach (removing / adding to replication) tables with non-default schema. 
Found in [#29535](https://github.com/ClickHouse/ClickHouse/issues/29535). [#33179](https://github.com/ClickHouse/ClickHouse/pull/33179) ([Kseniia Sumarokova](https://github.com/kssenii)). Fix DROP MaterializedPostgreSQL database. [#33468](https://github.com/ClickHouse/ClickHouse/pull/33468) ([Kseniia Sumarokova](https://github.com/kssenii)). -* The metric `StorageBufferBytes` sometimes was miscalculated. [#33159](https://github.com/ClickHouse/ClickHouse/pull/33159) ([xuyatian](https://github.com/xuyatian)). -* Fix error `Invalid version for SerializationLowCardinality key column` in case of reading from `LowCardinality` column with `local_filesystem_read_prefetch` or `remote_filesystem_read_prefetch` enabled. [#33046](https://github.com/ClickHouse/ClickHouse/pull/33046) ([Nikolai Kochetov](https://github.com/KochetovNicolai)). -* Fix `s3` table function reading empty file. Closes [#33008](https://github.com/ClickHouse/ClickHouse/issues/33008). [#33037](https://github.com/ClickHouse/ClickHouse/pull/33037) ([Kseniia Sumarokova](https://github.com/kssenii)). -* Fix Context leak in case of cancel_http_readonly_queries_on_client_close (i.e. leaking of external tables that had been uploaded the the server and other resources). [#32982](https://github.com/ClickHouse/ClickHouse/pull/32982) ([Azat Khuzhin](https://github.com/azat)). -* Fix wrong tuple output in `CSV` format in case of custom csv delimiter. [#32981](https://github.com/ClickHouse/ClickHouse/pull/32981) ([Kruglov Pavel](https://github.com/Avogar)). -* Fix HDFS URL check that didn't allow using HA namenode address. Bug was introduced in https://github.com/ClickHouse/ClickHouse/pull/31042. [#32976](https://github.com/ClickHouse/ClickHouse/pull/32976) ([Kruglov Pavel](https://github.com/Avogar)). -* Fix throwing exception like positional argument out of bounds for non-positional arguments. Closes [#31173](https://github.com/ClickHouse/ClickHouse/issues/31173)#event-5789668239. [#32961](https://github.com/ClickHouse/ClickHouse/pull/32961) ([Kseniia Sumarokova](https://github.com/kssenii)). -* Fix UB in case of unexpected EOF during filling a set from HTTP query (i.e. if the client interrupted in the middle, i.e. `timeout 0.15s curl -Ss -F 's=@t.csv;' 'http://127.0.0.1:8123/?s_structure=key+Int&query=SELECT+dummy+IN+s'` and with large enough `t.csv`). [#32955](https://github.com/ClickHouse/ClickHouse/pull/32955) ([Azat Khuzhin](https://github.com/azat)). -* Fix a regression in `replaceRegexpAll` function. The function worked incorrectly when matched substring was empty. This closes [#32777](https://github.com/ClickHouse/ClickHouse/issues/32777). This closes [#30245](https://github.com/ClickHouse/ClickHouse/issues/30245). [#32945](https://github.com/ClickHouse/ClickHouse/pull/32945) ([alexey-milovidov](https://github.com/alexey-milovidov)). -* Fix `ORC` format stripe reading. [#32929](https://github.com/ClickHouse/ClickHouse/pull/32929) ([kreuzerkrieg](https://github.com/kreuzerkrieg)). -* `topKWeightedState` failed for some input types. [#32487](https://github.com/ClickHouse/ClickHouse/issues/32487). [#32914](https://github.com/ClickHouse/ClickHouse/pull/32914) ([vdimir](https://github.com/vdimir)). -* Fix exception `Single chunk is expected from view inner query (LOGICAL_ERROR)` in materialized view. Fixes [#31419](https://github.com/ClickHouse/ClickHouse/issues/31419). [#32862](https://github.com/ClickHouse/ClickHouse/pull/32862) ([Nikolai Kochetov](https://github.com/KochetovNicolai)). 
-* Fix optimization with lazy seek for async reads from remote filesystems. Closes [#32803](https://github.com/ClickHouse/ClickHouse/issues/32803). [#32835](https://github.com/ClickHouse/ClickHouse/pull/32835) ([Kseniia Sumarokova](https://github.com/kssenii)). -* `MergeTree` table engine might silently skip some mutations if there are too many running mutations or in case of high memory consumption, it's fixed. Fixes [#17882](https://github.com/ClickHouse/ClickHouse/issues/17882). [#32814](https://github.com/ClickHouse/ClickHouse/pull/32814) ([tavplubix](https://github.com/tavplubix)). -* Avoid reusing the scalar subquery cache when processing MV blocks. This fixes a bug when the scalar query reference the source table but it means that all subscalar queries in the MV definition will be calculated for each block. [#32811](https://github.com/ClickHouse/ClickHouse/pull/32811) ([Raúl Marín](https://github.com/Algunenano)). -* Server might fail to start if database with `MySQL` engine cannot connect to MySQL server, it's fixed. Fixes [#14441](https://github.com/ClickHouse/ClickHouse/issues/14441). [#32802](https://github.com/ClickHouse/ClickHouse/pull/32802) ([tavplubix](https://github.com/tavplubix)). -* Fix crash when used `fuzzBits` function, close [#32737](https://github.com/ClickHouse/ClickHouse/issues/32737). [#32755](https://github.com/ClickHouse/ClickHouse/pull/32755) ([SuperDJY](https://github.com/cmsxbc)). -* Fix error `Column is not under aggregate function` in case of MV with `GROUP BY (list of columns)` (which is pared as `GROUP BY tuple(...)`) over `Kafka`/`RabbitMQ`. Fixes [#32668](https://github.com/ClickHouse/ClickHouse/issues/32668) and [#32744](https://github.com/ClickHouse/ClickHouse/issues/32744). [#32751](https://github.com/ClickHouse/ClickHouse/pull/32751) ([Nikolai Kochetov](https://github.com/KochetovNicolai)). -* Fix `ALTER TABLE ... MATERIALIZE TTL` query with `TTL ... DELETE WHERE ...` and `TTL ... GROUP BY ...` modes. [#32695](https://github.com/ClickHouse/ClickHouse/pull/32695) ([Anton Popov](https://github.com/CurtizJ)). -* Fix `optimize_read_in_order` optimization in case when table engine is `Distributed` or `Merge` and its underlying `MergeTree` tables have monotonous function in prefix of sorting key. [#32670](https://github.com/ClickHouse/ClickHouse/pull/32670) ([Anton Popov](https://github.com/CurtizJ)). -* Fix LOGICAL_ERROR exception when the target of a materialized view is a JOIN or a SET table. [#32669](https://github.com/ClickHouse/ClickHouse/pull/32669) ([Raúl Marín](https://github.com/Algunenano)). -* Inserting into S3 with multipart upload to Google Cloud Storage may trigger abort. [#32504](https://github.com/ClickHouse/ClickHouse/issues/32504). [#32649](https://github.com/ClickHouse/ClickHouse/pull/32649) ([vdimir](https://github.com/vdimir)). -* Fix possible exception at `RabbitMQ` storage startup by delaying channel creation. [#32584](https://github.com/ClickHouse/ClickHouse/pull/32584) ([Kseniia Sumarokova](https://github.com/kssenii)). -* Fix table lifetime (i.e. possible use-after-free) in case of parallel DROP TABLE and INSERT. [#32572](https://github.com/ClickHouse/ClickHouse/pull/32572) ([Azat Khuzhin](https://github.com/azat)). -* Fix async inserts with formats `CustomSeparated`, `Template`, `Regexp`, `MsgPack` and `JSONAsString`. Previousely the async inserts with these formats didn't read any data. [#32530](https://github.com/ClickHouse/ClickHouse/pull/32530) ([Kruglov Pavel](https://github.com/Avogar)). 
-* Fix `groupBitmapAnd` function on distributed table. [#32529](https://github.com/ClickHouse/ClickHouse/pull/32529) ([minhthucdao](https://github.com/dmthuc)). -* Fix crash in JOIN found by fuzzer, close [#32458](https://github.com/ClickHouse/ClickHouse/issues/32458). [#32508](https://github.com/ClickHouse/ClickHouse/pull/32508) ([vdimir](https://github.com/vdimir)). -* Proper handling of the case with Apache Arrow column duplication. [#32507](https://github.com/ClickHouse/ClickHouse/pull/32507) ([Dmitriy Mokhnatkin](https://github.com/DMokhnatkin)). -* Fix issue with ambiguous query formatting in distributed queries that led to errors when some table columns were named `ALL` or `DISTINCT`. This closes [#32391](https://github.com/ClickHouse/ClickHouse/issues/32391). [#32490](https://github.com/ClickHouse/ClickHouse/pull/32490) ([alexey-milovidov](https://github.com/alexey-milovidov)). -* Fix failures in queries that are trying to use skipping indices, which are not materialized yet. Fixes [#32292](https://github.com/ClickHouse/ClickHouse/issues/32292) and [#30343](https://github.com/ClickHouse/ClickHouse/issues/30343). [#32359](https://github.com/ClickHouse/ClickHouse/pull/32359) ([Anton Popov](https://github.com/CurtizJ)). -* Fix broken select query when there are more than 2 row policies on same column, begin at second queries on the same session. [#31606](https://github.com/ClickHouse/ClickHouse/issues/31606). [#32291](https://github.com/ClickHouse/ClickHouse/pull/32291) ([SuperDJY](https://github.com/cmsxbc)). -* Fix fractional unix timestamp conversion to `DateTime64`, fractional part was reversed for negative unix timestamps (before 1970-01-01). [#32240](https://github.com/ClickHouse/ClickHouse/pull/32240) ([Ben](https://github.com/benbiti)). -* Some entries of replication queue might hang for `temporary_directories_lifetime` (1 day by default) with `Directory tmp_merge_` or `Part ... (state Deleting) already exists, but it will be deleted soon` or similar error. It's fixed. Fixes [#29616](https://github.com/ClickHouse/ClickHouse/issues/29616). [#32201](https://github.com/ClickHouse/ClickHouse/pull/32201) ([tavplubix](https://github.com/tavplubix)). -* Fix parsing of `APPLY lambda` column transformer which could lead to client/server crash. [#32138](https://github.com/ClickHouse/ClickHouse/pull/32138) ([Kruglov Pavel](https://github.com/Avogar)). -* Fix `base64Encode` adding trailing bytes on small strings. [#31797](https://github.com/ClickHouse/ClickHouse/pull/31797) ([Kevin Michel](https://github.com/kmichel-aiven)). -* Fix possible crash (or incorrect result) in case of `LowCardinality` arguments of window function. Fixes [#31114](https://github.com/ClickHouse/ClickHouse/issues/31114). [#31888](https://github.com/ClickHouse/ClickHouse/pull/31888) ([Nikolai Kochetov](https://github.com/KochetovNicolai)). -* Fix hang up with command `DROP TABLE system.query_log sync`. [#33293](https://github.com/ClickHouse/ClickHouse/pull/33293) ([zhanghuajie](https://github.com/zhanghuajieHIT)). 
- - -## [Changelog for 2021](https://github.com/ClickHouse/ClickHouse/blob/master/docs/en/whats-new/changelog/2021.md) \ No newline at end of file +{% include "content/changelog.md" %} diff --git a/docs/en/whats-new/index.md b/docs/en/whats-new/index.md index ac2b41a6637..8033fdf71d9 100644 --- a/docs/en/whats-new/index.md +++ b/docs/en/whats-new/index.md @@ -1,10 +1,8 @@ --- -sidebar_label: What's New -sidebar_position: 500 -keywords: [clickhouse, what's, new, roadmap, changelog] -description: What's New in ClickHouse +toc_folder_title: What's New +toc_priority: 82 --- -# What’s New in ClickHouse {#whats-new-in-clickhouse} +# What’s New in ClickHouse? {#whats-new-in-clickhouse} There’s a short high-level [roadmap](../whats-new/roadmap.md) and a detailed [changelog](../whats-new/changelog/index.md) for releases that have already been published. diff --git a/docs/en/whats-new/roadmap.md b/docs/en/whats-new/roadmap.md index be7298ccd79..54f8f9d68a3 100644 --- a/docs/en/whats-new/roadmap.md +++ b/docs/en/whats-new/roadmap.md @@ -7,3 +7,4 @@ toc_title: Roadmap The roadmap for the year 2022 is published for open discussion [here](https://github.com/ClickHouse/ClickHouse/issues/32513). +{## [Original article](https://clickhouse.com/docs/en/roadmap/) ##} diff --git a/docs/en/whats-new/security-changelog.md b/docs/en/whats-new/security-changelog.md index 0a5c926f227..685f1c6d21d 100644 --- a/docs/en/whats-new/security-changelog.md +++ b/docs/en/whats-new/security-changelog.md @@ -1,11 +1,50 @@ --- -sidebar_label: Security Changelog -sidebar_position: 100 -keywords: [clickhouse, security, changelog] -description: Security Changelog +toc_priority: 76 +toc_title: Security Changelog --- +## Fixed in ClickHouse 21.10.2.15, 2021-10-18 {#fixed-in-clickhouse-release-21-10-2-215-2021-10-18} -# Security Changelog +### CVE-2021-43304 {#cve-2021-43304} + +Heap buffer overflow in Clickhouse's LZ4 compression codec when parsing a malicious query. There is no verification that the copy operations in the LZ4::decompressImpl loop and especially the arbitrary copy operation wildCopy(op, ip, copy_end), don’t exceed the destination buffer’s limits. + +Credits: JFrog Security Research Team + +### CVE-2021-43305 {#cve-2021-43305} + +Heap buffer overflow in Clickhouse's LZ4 compression codec when parsing a malicious query. There is no verification that the copy operations in the LZ4::decompressImpl loop and especially the arbitrary copy operation wildCopy(op, ip, copy_end), don’t exceed the destination buffer’s limits. This issue is very similar to CVE-2021-43304, but the vulnerable copy operation is in a different wildCopy call. + +Credits: JFrog Security Research Team + +### CVE-2021-42387 {#cve-2021-42387} + +Heap out-of-bounds read in Clickhouse's LZ4 compression codec when parsing a malicious query. As part of the LZ4::decompressImpl() loop, a 16-bit unsigned user-supplied value ('offset') is read from the compressed data. The offset is later used in the length of a copy operation, without checking the upper bounds of the source of the copy operation. + +Credits: JFrog Security Research Team + +### CVE-2021-42388 {#cve-2021-42388} + +Heap out-of-bounds read in Clickhouse's LZ4 compression codec when parsing a malicious query. As part of the LZ4::decompressImpl() loop, a 16-bit unsigned user-supplied value ('offset') is read from the compressed data. The offset is later used in the length of a copy operation, without checking the lower bounds of the source of the copy operation. 
+ +Credits: JFrog Security Research Team + +### CVE-2021-42389 {#cve-2021-42389} + +Divide-by-zero in Clickhouse's Delta compression codec when parsing a malicious query. The first byte of the compressed buffer is used in a modulo operation without being checked for 0. + +Credits: JFrog Security Research Team + +### CVE-2021-42390 {#cve-2021-42390} + +Divide-by-zero in Clickhouse's DeltaDouble compression codec when parsing a malicious query. The first byte of the compressed buffer is used in a modulo operation without being checked for 0. + +Credits: JFrog Security Research Team + +### CVE-2021-42391 {#cve-2021-42391} + +Divide-by-zero in Clickhouse's Gorilla compression codec when parsing a malicious query. The first byte of the compressed buffer is used in a modulo operation without being checked for 0. + +Credits: JFrog Security Research Team ## Fixed in ClickHouse 21.4.3.21, 2021-04-12 {#fixed-in-clickhouse-release-21-4-3-21-2021-04-12} @@ -84,3 +123,5 @@ Credits: Andrey Krasichkov and Evgeny Sidorov of Yandex Information Security Tea Incorrect configuration in deb package could lead to the unauthorized use of the database. Credits: the UK’s National Cyber Security Centre (NCSC) + +{## [Original article](https://clickhouse.com/docs/en/security_changelog/) ##} From 4cd159746e8ae498978cbfb1fb34194965a7a864 Mon Sep 17 00:00:00 2001 From: Azat Khuzhin Date: Wed, 30 Mar 2022 18:13:01 +0300 Subject: [PATCH 107/239] Fix polling of socket with negative timeout (when poll() interrupted by EINTR) In case of EINTR the timeout will be adjusted, but this should not be done in case of negative timeout since it means infinite timeout, and in that adjustment block negative timeout will be reset to 0, which will make poll() return (since zero timeout means return immediatelly even if no fd is ready). This should also fix 02127_connection_drain flap on CI [1]. [1]: https://s3.amazonaws.com/clickhouse-test-reports/32928/ddd5bebe555ce8feebcdd339e47fc45184c20dd1/stateless_tests__release__wide_parts_enabled__actions_.html Refs: https://github.com/ClickHouse/poco/pull/55 Signed-off-by: Azat Khuzhin --- contrib/poco | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/contrib/poco b/contrib/poco index 520a90e02e3..008b1646947 160000 --- a/contrib/poco +++ b/contrib/poco @@ -1 +1 @@ -Subproject commit 520a90e02e3e5cb90afeae1846d161dbc508a6f1 +Subproject commit 008b16469471d55b176db181756c94e3f14dd2dc From 4547ed370a4bbe20260ccdd6cd020b4c5d8ba55a Mon Sep 17 00:00:00 2001 From: taiyang-li <654010905@qq.com> Date: Wed, 30 Mar 2022 20:54:33 +0800 Subject: [PATCH 108/239] add hints for column description --- src/Common/NamePrompter.h | 7 +++++++ src/Storages/ColumnsDescription.cpp | 21 ++++++++++++++++++--- src/Storages/ColumnsDescription.h | 7 +++++-- src/Storages/IndicesDescription.h | 1 - 4 files changed, 30 insertions(+), 6 deletions(-) diff --git a/src/Common/NamePrompter.h b/src/Common/NamePrompter.h index a88d4bdea8e..8e301dec8b7 100644 --- a/src/Common/NamePrompter.h +++ b/src/Common/NamePrompter.h @@ -2,6 +2,7 @@ #include #include +#include #include #include @@ -102,6 +103,12 @@ public: return prompter.getHints(name, getAllRegisteredNames()); } + String getHintsString(const String & name) const + { + const auto hints = getHints(name); + return !hints.empty() ? 
", may be you meant: " + toString(hints) : ""; + } + IHints() = default; IHints(const IHints &) = default; diff --git a/src/Storages/ColumnsDescription.cpp b/src/Storages/ColumnsDescription.cpp index 69ca6002c22..a694405665b 100644 --- a/src/Storages/ColumnsDescription.cpp +++ b/src/Storages/ColumnsDescription.cpp @@ -230,8 +230,8 @@ void ColumnsDescription::remove(const String & column_name) { auto range = getNameRange(columns, column_name); if (range.first == range.second) - throw Exception("There is no column " + column_name + " in table.", - ErrorCodes::NO_SUCH_COLUMN_IN_TABLE); + throw Exception( + "There is no column " + column_name + " in table" + getHintsString(column_name), ErrorCodes::NO_SUCH_COLUMN_IN_TABLE); for (auto list_it = range.first; list_it != range.second;) { @@ -244,7 +244,10 @@ void ColumnsDescription::rename(const String & column_from, const String & colum { auto it = columns.get<1>().find(column_from); if (it == columns.get<1>().end()) - throw Exception("Cannot find column " + column_from + " in ColumnsDescription", ErrorCodes::LOGICAL_ERROR); + { + throw Exception( + "Cannot find column " + column_from + " in ColumnsDescription" + getHintsString(column_from), ErrorCodes::LOGICAL_ERROR); + } columns.get<1>().modify_key(it, [&column_to] (String & old_name) { @@ -745,6 +748,18 @@ void ColumnsDescription::removeSubcolumns(const String & name_in_storage) subcolumns.get<1>().erase(range.first, range.second); } +std::vector ColumnsDescription::getAllRegisteredNames() const +{ + std::vector names; + names.reserve(columns.size()); + for (const auto & column : columns) + { + if (column.name.find('.') == std::string::npos) + names.push_back(column.name); + } + return names; +} + Block validateColumnsDefaultsAndGetSampleBlock(ASTPtr default_expr_list, const NamesAndTypesList & all_columns, ContextPtr context) { for (const auto & child : default_expr_list->children) diff --git a/src/Storages/ColumnsDescription.h b/src/Storages/ColumnsDescription.h index 4ae1dcfc2cd..affe2ef5a56 100644 --- a/src/Storages/ColumnsDescription.h +++ b/src/Storages/ColumnsDescription.h @@ -91,7 +91,7 @@ struct ColumnDescription /// Description of multiple table columns (in CREATE TABLE for example). -class ColumnsDescription +class ColumnsDescription : public IHints<2, ColumnsDescription> { public: ColumnsDescription() = default; @@ -149,7 +149,8 @@ public: { auto it = columns.get<1>().find(column_name); if (it == columns.get<1>().end()) - throw Exception("Cannot find column " + column_name + " in ColumnsDescription", ErrorCodes::LOGICAL_ERROR); + throw Exception( + "Cannot find column " + column_name + " in ColumnsDescription" + getHintsString(column_name), ErrorCodes::LOGICAL_ERROR); removeSubcolumns(it->name); if (!columns.get<1>().modify(it, std::forward(f))) @@ -196,6 +197,8 @@ public: return columns.empty(); } + std::vector getAllRegisteredNames() const override; + /// Keep the sequence of columns and allow to lookup by name. 
using ColumnsContainer = boost::multi_index_container< ColumnDescription, diff --git a/src/Storages/IndicesDescription.h b/src/Storages/IndicesDescription.h index 72e0748778f..862df6fe23c 100644 --- a/src/Storages/IndicesDescription.h +++ b/src/Storages/IndicesDescription.h @@ -74,7 +74,6 @@ struct IndicesDescription : public std::vector, IHints<1, Indi /// Return common expression for all stored indices ExpressionActionsPtr getSingleExpressionForIndices(const ColumnsDescription & columns, ContextPtr context) const; -public: Names getAllRegisteredNames() const override; }; From fd9a10ef5300ac4ad20eca03b7f213ba5b571e98 Mon Sep 17 00:00:00 2001 From: taiyang-li <654010905@qq.com> Date: Wed, 30 Mar 2022 21:33:23 +0800 Subject: [PATCH 109/239] add hints for projections --- src/Storages/ColumnsDescription.h | 2 +- src/Storages/ProjectionsDescription.cpp | 17 +++++++++++++++-- src/Storages/ProjectionsDescription.h | 4 +++- .../02250_hints_for_columns.reference | 3 +++ .../0_stateless/02250_hints_for_columns.sh | 17 +++++++++++++++++ .../02250_hints_for_projections.reference | 1 + .../0_stateless/02250_hints_for_projections.sh | 13 +++++++++++++ 7 files changed, 53 insertions(+), 4 deletions(-) create mode 100644 tests/queries/0_stateless/02250_hints_for_columns.reference create mode 100644 tests/queries/0_stateless/02250_hints_for_columns.sh create mode 100644 tests/queries/0_stateless/02250_hints_for_projections.reference create mode 100644 tests/queries/0_stateless/02250_hints_for_projections.sh diff --git a/src/Storages/ColumnsDescription.h b/src/Storages/ColumnsDescription.h index affe2ef5a56..81cb475a1f6 100644 --- a/src/Storages/ColumnsDescription.h +++ b/src/Storages/ColumnsDescription.h @@ -91,7 +91,7 @@ struct ColumnDescription /// Description of multiple table columns (in CREATE TABLE for example). 
-class ColumnsDescription : public IHints<2, ColumnsDescription> +class ColumnsDescription : public IHints<1, ColumnsDescription> { public: ColumnsDescription() = default; diff --git a/src/Storages/ProjectionsDescription.cpp b/src/Storages/ProjectionsDescription.cpp index 7c340cda739..70e312931cc 100644 --- a/src/Storages/ProjectionsDescription.cpp +++ b/src/Storages/ProjectionsDescription.cpp @@ -335,7 +335,9 @@ const ProjectionDescription & ProjectionsDescription::get(const String & project { auto it = map.find(projection_name); if (it == map.end()) - throw Exception("There is no projection " + projection_name + " in table", ErrorCodes::NO_SUCH_PROJECTION_IN_TABLE); + throw Exception( + "There is no projection " + projection_name + " in table" + getHintsString(projection_name), + ErrorCodes::NO_SUCH_PROJECTION_IN_TABLE); return *(it->second); } @@ -376,13 +378,24 @@ void ProjectionsDescription::remove(const String & projection_name, bool if_exis { if (if_exists) return; - throw Exception("There is no projection " + projection_name + " in table.", ErrorCodes::NO_SUCH_PROJECTION_IN_TABLE); + throw Exception( + "There is no projection " + projection_name + " in table" + getHintsString(projection_name), + ErrorCodes::NO_SUCH_PROJECTION_IN_TABLE); } projections.erase(it->second); map.erase(it); } +std::vector ProjectionsDescription::getAllRegisteredNames() const +{ + std::vector names; + names.reserve(map.size()); + for (const auto & pair : map) + names.push_back(pair.first); + return names; +} + ExpressionActionsPtr ProjectionsDescription::getSingleExpressionForProjections(const ColumnsDescription & columns, ContextPtr query_context) const { diff --git a/src/Storages/ProjectionsDescription.h b/src/Storages/ProjectionsDescription.h index 3e8d5e1a4f1..c48942eb0ec 100644 --- a/src/Storages/ProjectionsDescription.h +++ b/src/Storages/ProjectionsDescription.h @@ -106,7 +106,7 @@ struct ProjectionDescription using ProjectionDescriptionRawPtr = const ProjectionDescription *; /// All projections in storage -struct ProjectionsDescription +struct ProjectionsDescription : public IHints<1, ProjectionsDescription> { ProjectionsDescription() = default; ProjectionsDescription(ProjectionsDescription && other) = default; @@ -138,6 +138,8 @@ struct ProjectionsDescription add(ProjectionDescription && projection, const String & after_projection = String(), bool first = false, bool if_not_exists = false); void remove(const String & projection_name, bool if_exists); + std::vector getAllRegisteredNames() const override; + private: /// Keep the sequence of columns and allow to lookup by name. using Container = std::list; diff --git a/tests/queries/0_stateless/02250_hints_for_columns.reference b/tests/queries/0_stateless/02250_hints_for_columns.reference new file mode 100644 index 00000000000..0eabe367130 --- /dev/null +++ b/tests/queries/0_stateless/02250_hints_for_columns.reference @@ -0,0 +1,3 @@ +OK +OK +OK diff --git a/tests/queries/0_stateless/02250_hints_for_columns.sh b/tests/queries/0_stateless/02250_hints_for_columns.sh new file mode 100644 index 00000000000..e8fe1a7a160 --- /dev/null +++ b/tests/queries/0_stateless/02250_hints_for_columns.sh @@ -0,0 +1,17 @@ +#!/usr/bin/env bash + +CURDIR=$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd) +# shellcheck source=../shell_config.sh +. 
"$CURDIR"/../shell_config.sh + +$CLICKHOUSE_CLIENT --query="DROP TABLE IF EXISTS t" + +$CLICKHOUSE_CLIENT --query="CREATE TABLE t (CounterID UInt32, StartDate Date, UserID UInt32, VisitID UInt32, NestedColumn Nested(A UInt8, S String), ToDrop UInt32) ENGINE = MergeTree(StartDate, intHash32(UserID), (CounterID, StartDate, intHash32(UserID), VisitID), 8192)" + +$CLICKHOUSE_CLIENT --query="ALTER TABLE t DROP COLUMN ToDro" 2>&1 | grep -q "may be you meant: \['ToDrop'\]" && echo 'OK' || echo 'FAIL' + +$CLICKHOUSE_CLIENT --query="ALTER TABLE t MODIFY COLUMN ToDro UInt64" 2>&1 | grep -q "may be you meant: \['ToDrop'\]" && echo 'OK' || echo 'FAIL' + +$CLICKHOUSE_CLIENT --query="ALTER TABLE t RENAME COLUMN ToDro to ToDropp" 2>&1 | grep -q "may be you meant: \['ToDrop'\]" && echo 'OK' || echo 'FAIL' + +$CLICKHOUSE_CLIENT --query="DROP TABLE t" \ No newline at end of file diff --git a/tests/queries/0_stateless/02250_hints_for_projections.reference b/tests/queries/0_stateless/02250_hints_for_projections.reference new file mode 100644 index 00000000000..d86bac9de59 --- /dev/null +++ b/tests/queries/0_stateless/02250_hints_for_projections.reference @@ -0,0 +1 @@ +OK diff --git a/tests/queries/0_stateless/02250_hints_for_projections.sh b/tests/queries/0_stateless/02250_hints_for_projections.sh new file mode 100644 index 00000000000..57123b88bde --- /dev/null +++ b/tests/queries/0_stateless/02250_hints_for_projections.sh @@ -0,0 +1,13 @@ +#!/usr/bin/env bash + +CURDIR=$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd) +# shellcheck source=../shell_config.sh +. "$CURDIR"/../shell_config.sh + +$CLICKHOUSE_CLIENT --query="DROP TABLE IF EXISTS t" + +$CLICKHOUSE_CLIENT --query="create table t (x Int32, y Int32, projection pToDrop (select x, y order by x)) engine = MergeTree order by y;" + +$CLICKHOUSE_CLIENT --query="ALTER TABLE t DROP PROJECTION pToDro" 2>&1 | grep -q "may be you meant: \['pToDrop'\]" && echo 'OK' || echo 'FAIL' + +$CLICKHOUSE_CLIENT --query="DROP TABLE t" \ No newline at end of file From eda299b48b744d321d013885ecddcff0eb08fc0d Mon Sep 17 00:00:00 2001 From: taiyang-li <654010905@qq.com> Date: Thu, 31 Mar 2022 12:14:28 +0800 Subject: [PATCH 110/239] fix building --- src/Common/NamePrompter.h | 10 +++++++--- 1 file changed, 7 insertions(+), 3 deletions(-) diff --git a/src/Common/NamePrompter.h b/src/Common/NamePrompter.h index 8e301dec8b7..9663427ef12 100644 --- a/src/Common/NamePrompter.h +++ b/src/Common/NamePrompter.h @@ -1,8 +1,8 @@ #pragma once #include +#include #include -#include #include #include @@ -105,8 +105,12 @@ public: String getHintsString(const String & name) const { - const auto hints = getHints(name); - return !hints.empty() ? ", may be you meant: " + toString(hints) : ""; + auto hints = getHints(name); + + /// Note: we don't use toString because it will cause writeCString naming conflict in src/Dictionaries/MongoDBDictionarySource.cpp + for (auto & hint : hints) + hint = "'" + hint + "'"; + return !hints.empty() ? 
", may be you meant: " + boost::algorithm::join(hints, ",") : ""; } IHints() = default; From d6247338de5ba64f4180ef99dd7e003416ee04d7 Mon Sep 17 00:00:00 2001 From: taiyang-li <654010905@qq.com> Date: Thu, 31 Mar 2022 13:33:20 +0800 Subject: [PATCH 111/239] fix failed stateless tests --- src/Common/NamePrompter.h | 2 +- src/Storages/AlterCommands.cpp | 21 ++++++++++++++------- 2 files changed, 15 insertions(+), 8 deletions(-) diff --git a/src/Common/NamePrompter.h b/src/Common/NamePrompter.h index 9663427ef12..25206cbd25f 100644 --- a/src/Common/NamePrompter.h +++ b/src/Common/NamePrompter.h @@ -110,7 +110,7 @@ public: /// Note: we don't use toString because it will cause writeCString naming conflict in src/Dictionaries/MongoDBDictionarySource.cpp for (auto & hint : hints) hint = "'" + hint + "'"; - return !hints.empty() ? ", may be you meant: " + boost::algorithm::join(hints, ",") : ""; + return !hints.empty() ? ", may be you meant: [" + boost::algorithm::join(hints, ",") + "]" : ""; } IHints() = default; diff --git a/src/Storages/AlterCommands.cpp b/src/Storages/AlterCommands.cpp index 44f208adacc..3ddeec4fa47 100644 --- a/src/Storages/AlterCommands.cpp +++ b/src/Storages/AlterCommands.cpp @@ -1046,8 +1046,10 @@ void AlterCommands::validate(const StorageInMemoryMetadata & metadata, ContextPt if (!all_columns.has(column_name)) { if (!command.if_exists) - throw Exception{"Wrong column name. Cannot find column " + backQuote(column_name) + " to modify", - ErrorCodes::NOT_FOUND_COLUMN_IN_BLOCK}; + throw Exception{ + "Wrong column name. Cannot find column " + backQuote(column_name) + " to modify" + + all_columns.getHintsString(column_name), + ErrorCodes::NOT_FOUND_COLUMN_IN_BLOCK}; else continue; } @@ -1153,7 +1155,8 @@ void AlterCommands::validate(const StorageInMemoryMetadata & metadata, ContextPt } else if (!command.if_exists) throw Exception( - "Wrong column name. Cannot find column " + backQuote(command.column_name) + " to drop", + "Wrong column name. Cannot find column " + backQuote(command.column_name) + " to drop" + + all_columns.getHintsString(command.column_name), ErrorCodes::NOT_FOUND_COLUMN_IN_BLOCK); } else if (command.type == AlterCommand::COMMENT_COLUMN) @@ -1161,8 +1164,10 @@ void AlterCommands::validate(const StorageInMemoryMetadata & metadata, ContextPt if (!all_columns.has(command.column_name)) { if (!command.if_exists) - throw Exception{"Wrong column name. Cannot find column " + backQuote(command.column_name) + " to comment", - ErrorCodes::NOT_FOUND_COLUMN_IN_BLOCK}; + throw Exception{ + "Wrong column name. Cannot find column " + backQuote(command.column_name) + " to comment" + + all_columns.getHintsString(command.column_name), + ErrorCodes::NOT_FOUND_COLUMN_IN_BLOCK}; } } else if (command.type == AlterCommand::MODIFY_SETTING || command.type == AlterCommand::RESET_SETTING) @@ -1196,8 +1201,10 @@ void AlterCommands::validate(const StorageInMemoryMetadata & metadata, ContextPt if (!all_columns.has(command.column_name)) { if (!command.if_exists) - throw Exception{"Wrong column name. Cannot find column " + backQuote(command.column_name) + " to rename", - ErrorCodes::NOT_FOUND_COLUMN_IN_BLOCK}; + throw Exception{ + "Wrong column name. 
Cannot find column " + backQuote(command.column_name) + " to rename" + + all_columns.getHintsString(command.column_name), + ErrorCodes::NOT_FOUND_COLUMN_IN_BLOCK}; else continue; } From 6bc1786047e0adee4c3d5c121fa9d9b3b0626c1a Mon Sep 17 00:00:00 2001 From: taiyang-li <654010905@qq.com> Date: Thu, 31 Mar 2022 16:43:23 +0800 Subject: [PATCH 112/239] fix style --- src/Storages/AlterCommands.cpp | 9 +++------ 1 file changed, 3 insertions(+), 6 deletions(-) diff --git a/src/Storages/AlterCommands.cpp b/src/Storages/AlterCommands.cpp index 3ddeec4fa47..5b44a4676c6 100644 --- a/src/Storages/AlterCommands.cpp +++ b/src/Storages/AlterCommands.cpp @@ -1046,8 +1046,7 @@ void AlterCommands::validate(const StorageInMemoryMetadata & metadata, ContextPt if (!all_columns.has(column_name)) { if (!command.if_exists) - throw Exception{ - "Wrong column name. Cannot find column " + backQuote(column_name) + " to modify" + throw Exception{"Wrong column name. Cannot find column " + backQuote(column_name) + " to modify" + all_columns.getHintsString(column_name), ErrorCodes::NOT_FOUND_COLUMN_IN_BLOCK}; else @@ -1164,8 +1163,7 @@ void AlterCommands::validate(const StorageInMemoryMetadata & metadata, ContextPt if (!all_columns.has(command.column_name)) { if (!command.if_exists) - throw Exception{ - "Wrong column name. Cannot find column " + backQuote(command.column_name) + " to comment" + throw Exception{"Wrong column name. Cannot find column " + backQuote(command.column_name) + " to comment" + all_columns.getHintsString(command.column_name), ErrorCodes::NOT_FOUND_COLUMN_IN_BLOCK}; } @@ -1201,8 +1199,7 @@ void AlterCommands::validate(const StorageInMemoryMetadata & metadata, ContextPt if (!all_columns.has(command.column_name)) { if (!command.if_exists) - throw Exception{ - "Wrong column name. Cannot find column " + backQuote(command.column_name) + " to rename" + throw Exception{"Wrong column name. 
Cannot find column " + backQuote(command.column_name) + " to rename" + all_columns.getHintsString(command.column_name), ErrorCodes::NOT_FOUND_COLUMN_IN_BLOCK}; else From 9dd1a76fd85e3ed677d084a01c1a550255e19c9e Mon Sep 17 00:00:00 2001 From: taiyang-li <654010905@qq.com> Date: Thu, 31 Mar 2022 16:45:36 +0800 Subject: [PATCH 113/239] fix stateless tests --- tests/queries/0_stateless/02250_hints_for_columns.reference | 2 -- tests/queries/0_stateless/02250_hints_for_projections.reference | 1 - 2 files changed, 3 deletions(-) diff --git a/tests/queries/0_stateless/02250_hints_for_columns.reference b/tests/queries/0_stateless/02250_hints_for_columns.reference index 0eabe367130..d86bac9de59 100644 --- a/tests/queries/0_stateless/02250_hints_for_columns.reference +++ b/tests/queries/0_stateless/02250_hints_for_columns.reference @@ -1,3 +1 @@ OK -OK -OK diff --git a/tests/queries/0_stateless/02250_hints_for_projections.reference b/tests/queries/0_stateless/02250_hints_for_projections.reference index d86bac9de59..e69de29bb2d 100644 --- a/tests/queries/0_stateless/02250_hints_for_projections.reference +++ b/tests/queries/0_stateless/02250_hints_for_projections.reference @@ -1 +0,0 @@ -OK From 10bbb965127f6d3f1ae15ff2a6b0cfbbdee68a18 Mon Sep 17 00:00:00 2001 From: taiyang-li <654010905@qq.com> Date: Fri, 1 Apr 2022 11:56:43 +0800 Subject: [PATCH 114/239] fix stateless test --- tests/queries/0_stateless/02250_hints_for_columns.reference | 2 ++ tests/queries/0_stateless/02250_hints_for_projections.reference | 1 + 2 files changed, 3 insertions(+) diff --git a/tests/queries/0_stateless/02250_hints_for_columns.reference b/tests/queries/0_stateless/02250_hints_for_columns.reference index d86bac9de59..0eabe367130 100644 --- a/tests/queries/0_stateless/02250_hints_for_columns.reference +++ b/tests/queries/0_stateless/02250_hints_for_columns.reference @@ -1 +1,3 @@ OK +OK +OK diff --git a/tests/queries/0_stateless/02250_hints_for_projections.reference b/tests/queries/0_stateless/02250_hints_for_projections.reference index e69de29bb2d..d86bac9de59 100644 --- a/tests/queries/0_stateless/02250_hints_for_projections.reference +++ b/tests/queries/0_stateless/02250_hints_for_projections.reference @@ -0,0 +1 @@ +OK From f4772d3b8fe416355324cac849ab925ae6bdfbe3 Mon Sep 17 00:00:00 2001 From: taiyang-li <654010905@qq.com> Date: Fri, 1 Apr 2022 14:45:20 +0800 Subject: [PATCH 115/239] chmod a+x 02250_hints_for_columns/02250_hints_for_projections --- tests/queries/0_stateless/02250_hints_for_columns.sh | 0 tests/queries/0_stateless/02250_hints_for_projections.sh | 0 2 files changed, 0 insertions(+), 0 deletions(-) mode change 100644 => 100755 tests/queries/0_stateless/02250_hints_for_columns.sh mode change 100644 => 100755 tests/queries/0_stateless/02250_hints_for_projections.sh diff --git a/tests/queries/0_stateless/02250_hints_for_columns.sh b/tests/queries/0_stateless/02250_hints_for_columns.sh old mode 100644 new mode 100755 diff --git a/tests/queries/0_stateless/02250_hints_for_projections.sh b/tests/queries/0_stateless/02250_hints_for_projections.sh old mode 100644 new mode 100755 From d96b682a5562132186da9f3aaea9af2647877b5b Mon Sep 17 00:00:00 2001 From: Antonio Andelic Date: Fri, 1 Apr 2022 13:12:54 +0000 Subject: [PATCH 116/239] Refactor --- src/Common/NamePrompter.cpp | 15 +++++++++ src/Common/NamePrompter.h | 15 +++++---- src/Storages/AlterCommands.cpp | 32 ++++++++++++------- src/Storages/ColumnsDescription.cpp | 12 ++++--- src/Storages/ColumnsDescription.h | 7 ++-- src/Storages/ProjectionsDescription.cpp | 
15 +++++---- .../0_stateless/02250_hints_for_columns.sh | 8 ++--- .../02250_hints_for_projections.sh | 4 +-- 8 files changed, 71 insertions(+), 37 deletions(-) create mode 100644 src/Common/NamePrompter.cpp diff --git a/src/Common/NamePrompter.cpp b/src/Common/NamePrompter.cpp new file mode 100644 index 00000000000..c5a2224dcb4 --- /dev/null +++ b/src/Common/NamePrompter.cpp @@ -0,0 +1,15 @@ +#include +#include + +namespace DB::detail +{ +void appendHintsMessageImpl(String & message, const std::vector & hints) +{ + if (hints.empty()) + { + return; + } + + message += ". Maybe you meant: " + toString(hints); +} +} diff --git a/src/Common/NamePrompter.h b/src/Common/NamePrompter.h index 25206cbd25f..b3eb271c0f0 100644 --- a/src/Common/NamePrompter.h +++ b/src/Common/NamePrompter.h @@ -1,7 +1,6 @@ #pragma once #include -#include #include #include @@ -91,6 +90,10 @@ private: } }; +namespace detail +{ +void appendHintsMessageImpl(String & message, const std::vector & hints); +} template class IHints @@ -103,14 +106,10 @@ public: return prompter.getHints(name, getAllRegisteredNames()); } - String getHintsString(const String & name) const + void appendHintsMessage(String & message, const String & name) const { auto hints = getHints(name); - - /// Note: we don't use toString because it will cause writeCString naming conflict in src/Dictionaries/MongoDBDictionarySource.cpp - for (auto & hint : hints) - hint = "'" + hint + "'"; - return !hints.empty() ? ", may be you meant: [" + boost::algorithm::join(hints, ",") + "]" : ""; + detail::appendHintsMessageImpl(message, hints); } IHints() = default; @@ -126,4 +125,6 @@ private: NamePrompter prompter; }; +void appendHintsString(String & message, const std::vector & hints, const String & name); + } diff --git a/src/Storages/AlterCommands.cpp b/src/Storages/AlterCommands.cpp index 5b44a4676c6..2870dc42af7 100644 --- a/src/Storages/AlterCommands.cpp +++ b/src/Storages/AlterCommands.cpp @@ -1046,9 +1046,12 @@ void AlterCommands::validate(const StorageInMemoryMetadata & metadata, ContextPt if (!all_columns.has(column_name)) { if (!command.if_exists) - throw Exception{"Wrong column name. Cannot find column " + backQuote(column_name) + " to modify" - + all_columns.getHintsString(column_name), + { + String exception_message = fmt::format("Wrong column. Cannot find colum {} to modify", backQuote(column_name)); + all_columns.appendHintsMessage(exception_message, column_name); + throw Exception{exception_message, ErrorCodes::NOT_FOUND_COLUMN_IN_BLOCK}; + } else continue; } @@ -1153,19 +1156,22 @@ void AlterCommands::validate(const StorageInMemoryMetadata & metadata, ContextPt all_columns.remove(command.column_name); } else if (!command.if_exists) - throw Exception( - "Wrong column name. Cannot find column " + backQuote(command.column_name) + " to drop" - + all_columns.getHintsString(command.column_name), - ErrorCodes::NOT_FOUND_COLUMN_IN_BLOCK); + { + String exception_message = fmt::format("Wrong column name. Cannot find column {} to drop", backQuote(command.column_name)); + all_columns.appendHintsMessage(exception_message, command.column_name); + throw Exception(exception_message, ErrorCodes::NOT_FOUND_COLUMN_IN_BLOCK); + } } else if (command.type == AlterCommand::COMMENT_COLUMN) { if (!all_columns.has(command.column_name)) { if (!command.if_exists) - throw Exception{"Wrong column name. 
Cannot find column " + backQuote(command.column_name) + " to comment" - + all_columns.getHintsString(command.column_name), - ErrorCodes::NOT_FOUND_COLUMN_IN_BLOCK}; + { + String exception_message = fmt::format("Wrong column name. Cannot find column {} to comment", backQuote(command.column_name)); + all_columns.appendHintsMessage(exception_message, command.column_name); + throw Exception(exception_message, ErrorCodes::NOT_FOUND_COLUMN_IN_BLOCK); + } } } else if (command.type == AlterCommand::MODIFY_SETTING || command.type == AlterCommand::RESET_SETTING) @@ -1199,9 +1205,11 @@ void AlterCommands::validate(const StorageInMemoryMetadata & metadata, ContextPt if (!all_columns.has(command.column_name)) { if (!command.if_exists) - throw Exception{"Wrong column name. Cannot find column " + backQuote(command.column_name) + " to rename" - + all_columns.getHintsString(command.column_name), - ErrorCodes::NOT_FOUND_COLUMN_IN_BLOCK}; + { + String exception_message = fmt::format("Wrong column name. Cannot find column {} to rename", backQuote(command.column_name)); + all_columns.appendHintsMessage(exception_message, command.column_name); + throw Exception(exception_message, ErrorCodes::NOT_FOUND_COLUMN_IN_BLOCK); + } else continue; } diff --git a/src/Storages/ColumnsDescription.cpp b/src/Storages/ColumnsDescription.cpp index a694405665b..f3a939614c1 100644 --- a/src/Storages/ColumnsDescription.cpp +++ b/src/Storages/ColumnsDescription.cpp @@ -230,8 +230,11 @@ void ColumnsDescription::remove(const String & column_name) { auto range = getNameRange(columns, column_name); if (range.first == range.second) - throw Exception( - "There is no column " + column_name + " in table" + getHintsString(column_name), ErrorCodes::NO_SUCH_COLUMN_IN_TABLE); + { + String exception_message = fmt::format("There is no column {} in table", column_name); + appendHintsMessage(exception_message, column_name); + throw Exception(exception_message, ErrorCodes::NO_SUCH_COLUMN_IN_TABLE); + } for (auto list_it = range.first; list_it != range.second;) { @@ -245,8 +248,9 @@ void ColumnsDescription::rename(const String & column_from, const String & colum auto it = columns.get<1>().find(column_from); if (it == columns.get<1>().end()) { - throw Exception( - "Cannot find column " + column_from + " in ColumnsDescription" + getHintsString(column_from), ErrorCodes::LOGICAL_ERROR); + String exception_message = fmt::format("Cannot find column {} in ColumnsDescription", column_from); + appendHintsMessage(exception_message, column_from); + throw Exception(exception_message, ErrorCodes::LOGICAL_ERROR); } columns.get<1>().modify_key(it, [&column_to] (String & old_name) diff --git a/src/Storages/ColumnsDescription.h b/src/Storages/ColumnsDescription.h index 81cb475a1f6..d3d6f7f2ff5 100644 --- a/src/Storages/ColumnsDescription.h +++ b/src/Storages/ColumnsDescription.h @@ -149,8 +149,11 @@ public: { auto it = columns.get<1>().find(column_name); if (it == columns.get<1>().end()) - throw Exception( - "Cannot find column " + column_name + " in ColumnsDescription" + getHintsString(column_name), ErrorCodes::LOGICAL_ERROR); + { + String exception_message = fmt::format("Cannot find column {} in ColumnsDescription", column_name); + appendHintsMessage(exception_message, column_name); + throw Exception(exception_message, ErrorCodes::LOGICAL_ERROR); + } removeSubcolumns(it->name); if (!columns.get<1>().modify(it, std::forward(f))) diff --git a/src/Storages/ProjectionsDescription.cpp b/src/Storages/ProjectionsDescription.cpp index 70e312931cc..69d7c5f8ed6 100644 --- 
a/src/Storages/ProjectionsDescription.cpp +++ b/src/Storages/ProjectionsDescription.cpp @@ -335,9 +335,11 @@ const ProjectionDescription & ProjectionsDescription::get(const String & project { auto it = map.find(projection_name); if (it == map.end()) - throw Exception( - "There is no projection " + projection_name + " in table" + getHintsString(projection_name), - ErrorCodes::NO_SUCH_PROJECTION_IN_TABLE); + { + String exception_message = fmt::format("There is no projection {} in table", projection_name); + appendHintsMessage(exception_message, projection_name); + throw Exception(exception_message, ErrorCodes::NO_SUCH_PROJECTION_IN_TABLE); + } return *(it->second); } @@ -378,9 +380,10 @@ void ProjectionsDescription::remove(const String & projection_name, bool if_exis { if (if_exists) return; - throw Exception( - "There is no projection " + projection_name + " in table" + getHintsString(projection_name), - ErrorCodes::NO_SUCH_PROJECTION_IN_TABLE); + + String exception_message = fmt::format("There is no projection {} in table", projection_name); + appendHintsMessage(exception_message, projection_name); + throw Exception(exception_message, ErrorCodes::NO_SUCH_PROJECTION_IN_TABLE); } projections.erase(it->second); diff --git a/tests/queries/0_stateless/02250_hints_for_columns.sh b/tests/queries/0_stateless/02250_hints_for_columns.sh index e8fe1a7a160..45fd2f238b1 100755 --- a/tests/queries/0_stateless/02250_hints_for_columns.sh +++ b/tests/queries/0_stateless/02250_hints_for_columns.sh @@ -8,10 +8,10 @@ $CLICKHOUSE_CLIENT --query="DROP TABLE IF EXISTS t" $CLICKHOUSE_CLIENT --query="CREATE TABLE t (CounterID UInt32, StartDate Date, UserID UInt32, VisitID UInt32, NestedColumn Nested(A UInt8, S String), ToDrop UInt32) ENGINE = MergeTree(StartDate, intHash32(UserID), (CounterID, StartDate, intHash32(UserID), VisitID), 8192)" -$CLICKHOUSE_CLIENT --query="ALTER TABLE t DROP COLUMN ToDro" 2>&1 | grep -q "may be you meant: \['ToDrop'\]" && echo 'OK' || echo 'FAIL' +$CLICKHOUSE_CLIENT --query="ALTER TABLE t DROP COLUMN ToDro" 2>&1 | grep -q "Maybe you meant: \['ToDrop'\]" && echo 'OK' || echo 'FAIL' -$CLICKHOUSE_CLIENT --query="ALTER TABLE t MODIFY COLUMN ToDro UInt64" 2>&1 | grep -q "may be you meant: \['ToDrop'\]" && echo 'OK' || echo 'FAIL' +$CLICKHOUSE_CLIENT --query="ALTER TABLE t MODIFY COLUMN ToDro UInt64" 2>&1 | grep -q "Maybe you meant: \['ToDrop'\]" && echo 'OK' || echo 'FAIL' -$CLICKHOUSE_CLIENT --query="ALTER TABLE t RENAME COLUMN ToDro to ToDropp" 2>&1 | grep -q "may be you meant: \['ToDrop'\]" && echo 'OK' || echo 'FAIL' +$CLICKHOUSE_CLIENT --query="ALTER TABLE t RENAME COLUMN ToDro to ToDropp" 2>&1 | grep -q "Maybe you meant: \['ToDrop'\]" && echo 'OK' || echo 'FAIL' -$CLICKHOUSE_CLIENT --query="DROP TABLE t" \ No newline at end of file +$CLICKHOUSE_CLIENT --query="DROP TABLE t" diff --git a/tests/queries/0_stateless/02250_hints_for_projections.sh b/tests/queries/0_stateless/02250_hints_for_projections.sh index 57123b88bde..7db8b243ae4 100755 --- a/tests/queries/0_stateless/02250_hints_for_projections.sh +++ b/tests/queries/0_stateless/02250_hints_for_projections.sh @@ -8,6 +8,6 @@ $CLICKHOUSE_CLIENT --query="DROP TABLE IF EXISTS t" $CLICKHOUSE_CLIENT --query="create table t (x Int32, y Int32, projection pToDrop (select x, y order by x)) engine = MergeTree order by y;" -$CLICKHOUSE_CLIENT --query="ALTER TABLE t DROP PROJECTION pToDro" 2>&1 | grep -q "may be you meant: \['pToDrop'\]" && echo 'OK' || echo 'FAIL' +$CLICKHOUSE_CLIENT --query="ALTER TABLE t DROP PROJECTION pToDro" 2>&1 | grep 
-q "Maybe you meant: \['pToDrop'\]" && echo 'OK' || echo 'FAIL' -$CLICKHOUSE_CLIENT --query="DROP TABLE t" \ No newline at end of file +$CLICKHOUSE_CLIENT --query="DROP TABLE t" From a926bc19eabc3d739b5ed9bef0c324ac6d49ca62 Mon Sep 17 00:00:00 2001 From: Antonio Andelic Date: Mon, 4 Apr 2022 07:24:10 +0000 Subject: [PATCH 117/239] Address PR comments --- src/Common/NamePrompter.h | 3 --- src/Storages/AlterCommands.cpp | 2 +- 2 files changed, 1 insertion(+), 4 deletions(-) diff --git a/src/Common/NamePrompter.h b/src/Common/NamePrompter.h index b3eb271c0f0..962a89a8e76 100644 --- a/src/Common/NamePrompter.h +++ b/src/Common/NamePrompter.h @@ -124,7 +124,4 @@ public: private: NamePrompter prompter; }; - -void appendHintsString(String & message, const std::vector & hints, const String & name); - } diff --git a/src/Storages/AlterCommands.cpp b/src/Storages/AlterCommands.cpp index 2870dc42af7..76df6316fed 100644 --- a/src/Storages/AlterCommands.cpp +++ b/src/Storages/AlterCommands.cpp @@ -1047,7 +1047,7 @@ void AlterCommands::validate(const StorageInMemoryMetadata & metadata, ContextPt { if (!command.if_exists) { - String exception_message = fmt::format("Wrong column. Cannot find colum {} to modify", backQuote(column_name)); + String exception_message = fmt::format("Wrong column. Cannot find column {} to modify", backQuote(column_name)); all_columns.appendHintsMessage(exception_message, column_name); throw Exception{exception_message, ErrorCodes::NOT_FOUND_COLUMN_IN_BLOCK}; From 9a76efb8500e779f4925c4830e4f15d527084b7c Mon Sep 17 00:00:00 2001 From: Azat Khuzhin Date: Sun, 3 Apr 2022 23:39:59 +0300 Subject: [PATCH 118/239] Fix formatting of INSERT INFILE queries (missing quotes) Signed-off-by: Azat Khuzhin --- src/Parsers/ASTInsertQuery.cpp | 12 ++++++++++-- .../0_stateless/02165_insert_from_infile.reference | 4 ++-- .../0_stateless/02264_format_insert_infile.reference | 3 +++ .../0_stateless/02264_format_insert_infile.sql | 2 ++ 4 files changed, 17 insertions(+), 4 deletions(-) create mode 100644 tests/queries/0_stateless/02264_format_insert_infile.reference create mode 100644 tests/queries/0_stateless/02264_format_insert_infile.sql diff --git a/src/Parsers/ASTInsertQuery.cpp b/src/Parsers/ASTInsertQuery.cpp index 7e1d48d7f55..1d30c8f1bbd 100644 --- a/src/Parsers/ASTInsertQuery.cpp +++ b/src/Parsers/ASTInsertQuery.cpp @@ -81,9 +81,17 @@ void ASTInsertQuery::formatImpl(const FormatSettings & settings, FormatState & s if (infile) { - settings.ostr << (settings.hilite ? hilite_keyword : "") << " FROM INFILE " << (settings.hilite ? hilite_none : "") << infile->as().value.safeGet(); + settings.ostr + << (settings.hilite ? hilite_keyword : "") + << " FROM INFILE " + << (settings.hilite ? hilite_none : "") + << quoteString(infile->as().value.safeGet()); if (compression) - settings.ostr << (settings.hilite ? hilite_keyword : "") << " COMPRESSION " << (settings.hilite ? hilite_none : "") << compression->as().value.safeGet(); + settings.ostr + << (settings.hilite ? hilite_keyword : "") + << " COMPRESSION " + << (settings.hilite ? 
hilite_none : "") + << compression->as().value.safeGet(); } if (select) diff --git a/tests/queries/0_stateless/02165_insert_from_infile.reference b/tests/queries/0_stateless/02165_insert_from_infile.reference index 2a00a8faa31..f8c205ecc0f 100644 --- a/tests/queries/0_stateless/02165_insert_from_infile.reference +++ b/tests/queries/0_stateless/02165_insert_from_infile.reference @@ -1,5 +1,5 @@ -INSERT INTO test FROM INFILE data.file SELECT x +INSERT INTO test FROM INFILE \'data.file\' SELECT x FROM input(\'x UInt32\') -INSERT INTO test FROM INFILE data.file WITH number AS x +INSERT INTO test FROM INFILE \'data.file\' WITH number AS x SELECT number FROM input(\'number UInt32\') diff --git a/tests/queries/0_stateless/02264_format_insert_infile.reference b/tests/queries/0_stateless/02264_format_insert_infile.reference new file mode 100644 index 00000000000..338ea6fbfc6 --- /dev/null +++ b/tests/queries/0_stateless/02264_format_insert_infile.reference @@ -0,0 +1,3 @@ +-- { echo } +EXPLAIN SYNTAX INSERT INTO foo FROM INFILE '/dev/null'; +INSERT INTO foo FROM INFILE \'/dev/null\' diff --git a/tests/queries/0_stateless/02264_format_insert_infile.sql b/tests/queries/0_stateless/02264_format_insert_infile.sql new file mode 100644 index 00000000000..38ee39d932d --- /dev/null +++ b/tests/queries/0_stateless/02264_format_insert_infile.sql @@ -0,0 +1,2 @@ +-- { echo } +EXPLAIN SYNTAX INSERT INTO foo FROM INFILE '/dev/null'; From 93bbe9641aa162c65ea182279cfa10ab71e6d8cf Mon Sep 17 00:00:00 2001 From: Azat Khuzhin Date: Mon, 4 Apr 2022 10:30:31 +0300 Subject: [PATCH 119/239] Fix formatting of INSERT ... COMPRESSION Signed-off-by: Azat Khuzhin --- src/Parsers/ASTInsertQuery.cpp | 2 +- .../0_stateless/02264_format_insert_compression.reference | 3 +++ tests/queries/0_stateless/02264_format_insert_compression.sql | 2 ++ 3 files changed, 6 insertions(+), 1 deletion(-) create mode 100644 tests/queries/0_stateless/02264_format_insert_compression.reference create mode 100644 tests/queries/0_stateless/02264_format_insert_compression.sql diff --git a/src/Parsers/ASTInsertQuery.cpp b/src/Parsers/ASTInsertQuery.cpp index 1d30c8f1bbd..40e14c918ff 100644 --- a/src/Parsers/ASTInsertQuery.cpp +++ b/src/Parsers/ASTInsertQuery.cpp @@ -91,7 +91,7 @@ void ASTInsertQuery::formatImpl(const FormatSettings & settings, FormatState & s << (settings.hilite ? hilite_keyword : "") << " COMPRESSION " << (settings.hilite ? 
hilite_none : "") - << compression->as().value.safeGet(); + << quoteString(compression->as().value.safeGet()); } if (select) diff --git a/tests/queries/0_stateless/02264_format_insert_compression.reference b/tests/queries/0_stateless/02264_format_insert_compression.reference new file mode 100644 index 00000000000..107b7fcb3e9 --- /dev/null +++ b/tests/queries/0_stateless/02264_format_insert_compression.reference @@ -0,0 +1,3 @@ +-- { echo } +EXPLAIN SYNTAX INSERT INTO foo FROM INFILE '/dev/null' COMPRESSION 'gz'; +INSERT INTO foo FROM INFILE \'/dev/null\' COMPRESSION \'gz\' diff --git a/tests/queries/0_stateless/02264_format_insert_compression.sql b/tests/queries/0_stateless/02264_format_insert_compression.sql new file mode 100644 index 00000000000..c095a8fbbb7 --- /dev/null +++ b/tests/queries/0_stateless/02264_format_insert_compression.sql @@ -0,0 +1,2 @@ +-- { echo } +EXPLAIN SYNTAX INSERT INTO foo FROM INFILE '/dev/null' COMPRESSION 'gz'; From ae53aae1063b09beaa23b74abec5fe8b79565597 Mon Sep 17 00:00:00 2001 From: Anton Popov Date: Mon, 4 Apr 2022 08:48:31 +0000 Subject: [PATCH 120/239] fix clang-tidy --- src/Storages/MergeTree/MergeTreeDataWriter.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/Storages/MergeTree/MergeTreeDataWriter.h b/src/Storages/MergeTree/MergeTreeDataWriter.h index 33742d7e52a..7b6bf8fb1db 100644 --- a/src/Storages/MergeTree/MergeTreeDataWriter.h +++ b/src/Storages/MergeTree/MergeTreeDataWriter.h @@ -42,7 +42,7 @@ public: */ static BlocksWithPartition splitBlockIntoParts(const Block & block, size_t max_parts, const StorageMetadataPtr & metadata_snapshot, ContextPtr context); - void deduceTypesOfObjectColumns(const StorageSnapshotPtr & storage_snapshot, Block & block); + static void deduceTypesOfObjectColumns(const StorageSnapshotPtr & storage_snapshot, Block & block); /// This structure contains not completely written temporary part. /// Some writes may happen asynchronously, e.g. for blob storages. 
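
The two ASTInsertQuery::formatImpl fixes above wrap the INFILE path and the COMPRESSION method in quoteString because both clauses expect string literals; without the quotes the formatted statement (for example the EXPLAIN SYNTAX output) is not valid SQL. A minimal standalone sketch of that idea follows — quoteLiteral here is an illustrative stand-in, not ClickHouse's actual quoteString:

```cpp
// Standalone illustration of why the formatted query must quote these values:
// INFILE and COMPRESSION take string literals, so
//   INSERT INTO foo FROM INFILE /dev/null        -- does not parse back
//   INSERT INTO foo FROM INFILE '/dev/null'      -- does
#include <iostream>
#include <string>

/// Simplified stand-in for quoting a string literal: escape ' and \, wrap in single quotes.
static std::string quoteLiteral(const std::string & value)
{
    std::string result = "'";
    for (char c : value)
    {
        if (c == '\'' || c == '\\')
            result += '\\';
        result += c;
    }
    result += '\'';
    return result;
}

int main()
{
    const std::string infile = "/dev/null";
    const std::string compression = "gz";

    /// Before the fix (unquoted) vs. after the fix (quoted).
    std::cout << "INSERT INTO foo FROM INFILE " << infile << '\n';
    std::cout << "INSERT INTO foo FROM INFILE " << quoteLiteral(infile)
              << " COMPRESSION " << quoteLiteral(compression) << '\n';
}
```

Escaping the backslash and the quote character is what keeps the formatted literal round-trippable through the parser.
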
From 803a1a2a9c299fb2a87608f02b98e47515f66ac6 Mon Sep 17 00:00:00 2001 From: alesapin Date: Mon, 4 Apr 2022 11:19:02 +0200 Subject: [PATCH 121/239] Fix tests and check --- src/Storages/MergeTree/MergeTreeData.cpp | 3 ++- tests/queries/0_stateless/00980_merge_alter_settings.sql | 6 +++--- .../00980_zookeeper_merge_tree_alter_settings.sql | 4 ++-- 3 files changed, 7 insertions(+), 6 deletions(-) diff --git a/src/Storages/MergeTree/MergeTreeData.cpp b/src/Storages/MergeTree/MergeTreeData.cpp index f1af92e7763..4e4e555fb54 100644 --- a/src/Storages/MergeTree/MergeTreeData.cpp +++ b/src/Storages/MergeTree/MergeTreeData.cpp @@ -1909,6 +1909,7 @@ void MergeTreeData::checkAlterIsPossible(const AlterCommands & commands, Context StorageInMemoryMetadata old_metadata = getInMemoryMetadata(); const auto & settings = local_context->getSettingsRef(); + const auto & settings_from_storage = getSettings(); if (!settings.allow_non_metadata_alters) { @@ -2103,7 +2104,7 @@ void MergeTreeData::checkAlterIsPossible(const AlterCommands & commands, Context { for (const auto & reset_setting : command.settings_resets) { - if (!settings.has(reset_setting)) + if (!settings_from_storage->has(reset_setting)) throw Exception(ErrorCodes::BAD_ARGUMENTS, "Cannot reset setting '{}' because it doesn't exist for MergeTree engines family", reset_setting); } } diff --git a/tests/queries/0_stateless/00980_merge_alter_settings.sql b/tests/queries/0_stateless/00980_merge_alter_settings.sql index c0d18f6d453..f595a09970d 100644 --- a/tests/queries/0_stateless/00980_merge_alter_settings.sql +++ b/tests/queries/0_stateless/00980_merge_alter_settings.sql @@ -91,8 +91,8 @@ SHOW CREATE TABLE table_for_reset_setting; ALTER TABLE table_for_reset_setting RESET SETTING index_granularity; -- { serverError 472 } --- ignore undefined setting -ALTER TABLE table_for_reset_setting RESET SETTING merge_with_ttl_timeout, unknown_setting; +-- don't execute alter with incorrect setting +ALTER TABLE table_for_reset_setting RESET SETTING merge_with_ttl_timeout, unknown_setting; -- { serverError 36 } ALTER TABLE table_for_reset_setting MODIFY SETTING merge_with_ttl_timeout = 300, max_concurrent_queries = 1; @@ -102,4 +102,4 @@ ALTER TABLE table_for_reset_setting RESET SETTING max_concurrent_queries, merge_ SHOW CREATE TABLE table_for_reset_setting; -DROP TABLE IF EXISTS table_for_reset_setting; \ No newline at end of file +DROP TABLE IF EXISTS table_for_reset_setting; diff --git a/tests/queries/0_stateless/00980_zookeeper_merge_tree_alter_settings.sql b/tests/queries/0_stateless/00980_zookeeper_merge_tree_alter_settings.sql index dfb91eb3b0a..1b291bf84d2 100644 --- a/tests/queries/0_stateless/00980_zookeeper_merge_tree_alter_settings.sql +++ b/tests/queries/0_stateless/00980_zookeeper_merge_tree_alter_settings.sql @@ -108,8 +108,8 @@ ATTACH TABLE replicated_table_for_reset_setting1; SHOW CREATE TABLE replicated_table_for_reset_setting1; SHOW CREATE TABLE replicated_table_for_reset_setting2; --- ignore undefined setting -ALTER TABLE replicated_table_for_reset_setting1 RESET SETTING check_delay_period, unknown_setting; +-- don't execute alter with incorrect setting +ALTER TABLE replicated_table_for_reset_setting1 RESET SETTING check_delay_period, unknown_setting; -- { serverError 36 } ALTER TABLE replicated_table_for_reset_setting1 RESET SETTING merge_with_ttl_timeout; ALTER TABLE replicated_table_for_reset_setting2 RESET SETTING merge_with_ttl_timeout; From 14538f645606468e4d6026c3efb8b0a3dfb0125a Mon Sep 17 00:00:00 2001 From: Azat Khuzhin Date: Sat, 5 
Feb 2022 19:33:42 +0300 Subject: [PATCH 122/239] Add system.processors_profile_log This is the system table that will contain Processors level profiling. v2: one entry per Processor, not 3 (PortFull/NeedData/work()) v3: us over ms v4: Enable processors_profile_log table by default Signed-off-by: Azat Khuzhin --- .../system-tables/processors_profile_log.md | 73 +++++++++++++++++++ programs/server/config.xml | 9 +++ src/Common/SystemLogBase.cpp | 1 + src/Common/SystemLogBase.h | 1 + src/Interpreters/Context.cpp | 11 +++ src/Interpreters/Context.h | 2 + src/Interpreters/InterpreterSystemQuery.cpp | 4 +- src/Interpreters/ProcessorsProfileLog.cpp | 57 +++++++++++++++ src/Interpreters/ProcessorsProfileLog.h | 42 +++++++++++ src/Interpreters/SystemLog.cpp | 4 + src/Interpreters/SystemLog.h | 3 + src/Interpreters/executeQuery.cpp | 5 ++ 12 files changed, 211 insertions(+), 1 deletion(-) create mode 100644 docs/en/operations/system-tables/processors_profile_log.md create mode 100644 src/Interpreters/ProcessorsProfileLog.cpp create mode 100644 src/Interpreters/ProcessorsProfileLog.h diff --git a/docs/en/operations/system-tables/processors_profile_log.md b/docs/en/operations/system-tables/processors_profile_log.md new file mode 100644 index 00000000000..a2f851d6791 --- /dev/null +++ b/docs/en/operations/system-tables/processors_profile_log.md @@ -0,0 +1,73 @@ +# system.processors_profile_log {#system-processors_profile_log} + +This table contains profiling on processors level (that you can find in [`EXPLAIN PIPELINE`](../../sql-reference/statements/explain.md#explain-pipeline)). + +Columns: + +- `event_date` ([Date](../../sql-reference/data-types/date.md)) — The date when the event happened. +- `event_time` ([DateTime64](../../sql-reference/data-types/datetime64.md)) — The date and time when the event happened. +- `query_id` ([String](../../sql-reference/data-types/string.md)) — ID of the query +- `name` ([LowCardinality(String)](../../sql-reference/data-types/lowcardinality.md)) — Name of the processor. +- `elapsed_us` ([UInt64](../../sql-reference/data-types/int-uint.md)) — Number of microseconds this processor was executed. +- `need_data_elapsed_us` ([UInt64](../../sql-reference/data-types/int-uint.md)) — Number of microseconds this processor was waiting for data (from other processor). +- `port_full_elapsed_us` ([UInt64](../../sql-reference/data-types/int-uint.md)) — Number of microseconds this processor was waiting because output port was full. + +**Example** + +Query: + +``` sql +EXPLAIN PIPELINE +SELECT sleep(1) + +┌─explain─────────────────────────┐ +│ (Expression) │ +│ ExpressionTransform │ +│ (SettingQuotaAndLimits) │ +│ (ReadFromStorage) │ +│ SourceFromSingleChunk 0 → 1 │ +└─────────────────────────────────┘ + +SELECT sleep(1) +SETTINGS log_processors_profiles = 1 + +Query id: feb5ed16-1c24-4227-aa54-78c02b3b27d4 + +┌─sleep(1)─┐ +│ 0 │ +└──────────┘ + +1 rows in set. Elapsed: 1.018 sec. 
+ +SELECT + name, + elapsed_us, + need_data_elapsed_us, + port_full_elapsed_us +FROM system.processors_profile_log +WHERE query_id = 'feb5ed16-1c24-4227-aa54-78c02b3b27d4' +ORDER BY name ASC +``` + +Result: + +``` text +┌─name────────────────────┬─elapsed_us─┬─need_data_elapsed_us─┬─port_full_elapsed_us─┐ +│ ExpressionTransform │ 1000497 │ 2823 │ 197 │ +│ LazyOutputFormat │ 36 │ 1002188 │ 0 │ +│ LimitsCheckingTransform │ 10 │ 1002994 │ 106 │ +│ NullSource │ 5 │ 1002074 │ 0 │ +│ NullSource │ 1 │ 1002084 │ 0 │ +│ SourceFromSingleChunk │ 45 │ 4736 │ 1000819 │ +└─────────────────────────┴────────────┴──────────────────────┴──────────────────────┘ +``` + +Here you can see: + +- `ExpressionTransform` was executing `sleep(1)` function, so it `work` will takes 1e6, and so `elapsed_us` > 1e6. +- `SourceFromSingleChunk` need to wait, because `ExpressionTransform` does not accept any data during execution of `sleep(1)`, so it will be in `PortFull` state for 1e6 us, and so `port_full_elapsed_us` > 1e6. +- `LimitsCheckingTransform`/`NullSource`/`LazyOutputFormat` need to wait until `ExpressionTransform` will execute `sleep(1)` to process the result, so `need_data_elapsed_us` > 1e6. + +**See Also** + +- [`EXPLAIN PIPELINE`](../../sql-reference/statements/explain.md#explain-pipeline) diff --git a/programs/server/config.xml b/programs/server/config.xml index 3b035fb39ac..3bb26a3a368 100644 --- a/programs/server/config.xml +++ b/programs/server/config.xml @@ -1042,6 +1042,15 @@ 7500 --> + + + system + processors_profile_log
+ +        <partition_by>toYYYYMM(event_date)</partition_by> +        <flush_interval_milliseconds>7500</flush_interval_milliseconds> +    </processors_profile_log>
+ - INSERT INTO test_table_small SELECT groupArraySample(500)(number) AS set, groupArraySample(10)(number) AS subset FROM (SELECT * FROM numbers(500)) - INSERT INTO test_table_small2 SELECT groupArraySample(500)(number) AS set, groupArraySample(400)(number) AS subset FROM (SELECT * FROM numbers(500)) - INSERT INTO test_table_smallf SELECT groupArraySample(500)(number) AS set, groupArraySample(10)(number) AS subset FROM (SELECT * FROM numbers(5000000)) - - - INSERT INTO test_table_medium SELECT groupArraySample(1000000)(number) AS set, groupArraySample(10)(number) AS subset FROM (SELECT * FROM numbers(1000000)) - INSERT INTO test_table_medium2 SELECT groupArraySample(1000000)(number) AS set, groupArraySample(4000)(number) AS subset FROM (SELECT * FROM numbers(1000000)) - INSERT INTO test_table_mediumf SELECT groupArraySample(1000000)(number) AS set, groupArraySample(10)(number) AS subset FROM (SELECT * FROM numbers(100000000)) - - - INSERT INTO test_table_large SELECT groupArraySample(50000000)(number) AS set, groupArraySample(10)(number) AS subset FROM (SELECT * FROM numbers(50000000)) - INSERT INTO test_table_large2 SELECT groupArraySample(50000000)(number) AS set, groupArraySample(8000)(number) AS subset FROM (SELECT * FROM numbers(50000000)) - INSERT INTO test_table_largef SELECT groupArraySample(50000000)(number) AS set, groupArraySample(10)(number) AS subset FROM (SELECT * FROM numbers(1000000000)) - - select hasAll(set, subset) from test_table_small - select hasAll(set, subset) from test_table_small2 - select hasAll(set, subset) from test_table_smallf - - select hasAll(set, subset) from test_table_medium - select hasAll(set, subset) from test_table_medium2 - select hasAll(set, subset) from test_table_mediumf - - select hasAll(set, subset) from test_table_large - select hasAll(set, subset) from test_table_large2 Settings max_execution_time=300 - select hasAll(set, subset) from test_table_largef - - DROP TABLE IF EXISTS test_table_small - DROP TABLE IF EXISTS test_table_small2 - DROP TABLE IF EXISTS test_table_smallf - - DROP TABLE IF EXISTS test_table_medium - DROP TABLE IF EXISTS test_table_medium2 - DROP TABLE IF EXISTS test_table_mediumf - - DROP TABLE IF EXISTS test_table_large - DROP TABLE IF EXISTS test_table_large2 - DROP TABLE IF EXISTS test_table_largef - diff --git a/tests/performance/hasAll_simd_int32.xml b/tests/performance/hasAll_simd_int32.xml deleted file mode 100644 index 4543dea161b..00000000000 --- a/tests/performance/hasAll_simd_int32.xml +++ /dev/null @@ -1,52 +0,0 @@ - - CREATE TABLE test_table_small (`set` Array(Int32), `subset` Array (Int32)) ENGINE = MergeTree ORDER BY set - CREATE TABLE test_table_small2 (`set` Array(Int32), `subset` Array (Int32)) ENGINE = MergeTree ORDER BY set - CREATE TABLE test_table_smallf (`set` Array(Int32), `subset` Array (Int32)) ENGINE = MergeTree ORDER BY set - - CREATE TABLE test_table_medium (`set` Array(Int32), `subset` Array (Int32)) ENGINE = MergeTree ORDER BY set - CREATE TABLE test_table_medium2 (`set` Array(Int32), `subset` Array (Int32)) ENGINE = MergeTree ORDER BY set - CREATE TABLE test_table_mediumf (`set` Array(Int32), `subset` Array (Int32)) ENGINE = MergeTree ORDER BY set - - CREATE TABLE test_table_large (`set` Array(Int32), `subset` Array (Int32)) ENGINE = MergeTree ORDER BY set - CREATE TABLE test_table_large2 (`set` Array(Int32), `subset` Array (Int32)) ENGINE = MergeTree ORDER BY set - CREATE TABLE test_table_largef (`set` Array(Int32), `subset` Array (Int32)) ENGINE = MergeTree ORDER BY set - - - INSERT INTO 
test_table_small SELECT groupArraySample(500)(number) AS set, groupArraySample(10)(number) AS subset FROM (SELECT * FROM numbers(500)) - INSERT INTO test_table_small2 SELECT groupArraySample(500)(number) AS set, groupArraySample(400)(number) AS subset FROM (SELECT * FROM numbers(500)) - INSERT INTO test_table_smallf SELECT groupArraySample(500)(number) AS set, groupArraySample(10)(number) AS subset FROM (SELECT * FROM numbers(5000000)) - - - INSERT INTO test_table_medium SELECT groupArraySample(1000000)(number) AS set, groupArraySample(10)(number) AS subset FROM (SELECT * FROM numbers(1000000)) - INSERT INTO test_table_medium2 SELECT groupArraySample(1000000)(number) AS set, groupArraySample(4000)(number) AS subset FROM (SELECT * FROM numbers(1000000)) - INSERT INTO test_table_mediumf SELECT groupArraySample(1000000)(number) AS set, groupArraySample(10)(number) AS subset FROM (SELECT * FROM numbers(100000000)) - - - INSERT INTO test_table_large SELECT groupArraySample(50000000)(number) AS set, groupArraySample(10)(number) AS subset FROM (SELECT * FROM numbers(50000000)) - INSERT INTO test_table_large2 SELECT groupArraySample(50000000)(number) AS set, groupArraySample(4000)(number) AS subset FROM (SELECT * FROM numbers(50000000)) Settings max_execution_time=30 - INSERT INTO test_table_largef SELECT groupArraySample(50000000)(number) AS set, groupArraySample(10)(number) AS subset FROM (SELECT * FROM numbers(1000000000)) - - select hasAll(set, subset) from test_table_small - select hasAll(set, subset) from test_table_small2 - select hasAll(set, subset) from test_table_smallf - - select hasAll(set, subset) from test_table_medium - select hasAll(set, subset) from test_table_medium2 - select hasAll(set, subset) from test_table_mediumf - - select hasAll(set, subset) from test_table_large - select hasAll(set, subset) from test_table_large2 Settings max_execution_time=300 - select hasAll(set, subset) from test_table_largef - - DROP TABLE IF EXISTS test_table_small - DROP TABLE IF EXISTS test_table_small2 - DROP TABLE IF EXISTS test_table_smallf - - DROP TABLE IF EXISTS test_table_medium - DROP TABLE IF EXISTS test_table_medium2 - DROP TABLE IF EXISTS test_table_mediumf - - DROP TABLE IF EXISTS test_table_large - DROP TABLE IF EXISTS test_table_large2 - DROP TABLE IF EXISTS test_table_largef - diff --git a/tests/performance/hasAll_simd_int64.xml b/tests/performance/hasAll_simd_int64.xml deleted file mode 100644 index 07e52483bb1..00000000000 --- a/tests/performance/hasAll_simd_int64.xml +++ /dev/null @@ -1,52 +0,0 @@ - - CREATE TABLE test_table_small (`set` Array(Int64), `subset` Array (Int64)) ENGINE = MergeTree ORDER BY set - CREATE TABLE test_table_small2 (`set` Array(Int64), `subset` Array (Int64)) ENGINE = MergeTree ORDER BY set - CREATE TABLE test_table_smallf (`set` Array(Int64), `subset` Array (Int64)) ENGINE = MergeTree ORDER BY set - - CREATE TABLE test_table_medium (`set` Array(Int64), `subset` Array (Int64)) ENGINE = MergeTree ORDER BY set - CREATE TABLE test_table_medium2 (`set` Array(Int64), `subset` Array (Int64)) ENGINE = MergeTree ORDER BY set - CREATE TABLE test_table_mediumf (`set` Array(Int64), `subset` Array (Int64)) ENGINE = MergeTree ORDER BY set - - CREATE TABLE test_table_large (`set` Array(Int64), `subset` Array (Int64)) ENGINE = MergeTree ORDER BY set - CREATE TABLE test_table_large2 (`set` Array(Int64), `subset` Array (Int64)) ENGINE = MergeTree ORDER BY set - CREATE TABLE test_table_largef (`set` Array(Int64), `subset` Array (Int64)) ENGINE = MergeTree ORDER BY set - - - 
INSERT INTO test_table_small SELECT groupArraySample(500)(number) AS set, groupArraySample(10)(number) AS subset FROM (SELECT * FROM numbers(500)) - INSERT INTO test_table_small2 SELECT groupArraySample(500)(number) AS set, groupArraySample(400)(number) AS subset FROM (SELECT * FROM numbers(500)) - INSERT INTO test_table_smallf SELECT groupArraySample(500)(number) AS set, groupArraySample(10)(number) AS subset FROM (SELECT * FROM numbers(5000000)) - - - INSERT INTO test_table_medium SELECT groupArraySample(1000000)(number) AS set, groupArraySample(10)(number) AS subset FROM (SELECT * FROM numbers(1000000)) - INSERT INTO test_table_medium2 SELECT groupArraySample(1000000)(number) AS set, groupArraySample(4000)(number) AS subset FROM (SELECT * FROM numbers(1000000)) - INSERT INTO test_table_mediumf SELECT groupArraySample(1000000)(number) AS set, groupArraySample(10)(number) AS subset FROM (SELECT * FROM numbers(100000000)) - - - INSERT INTO test_table_large SELECT groupArraySample(50000000)(number) AS set, groupArraySample(10)(number) AS subset FROM (SELECT * FROM numbers(50000000)) - INSERT INTO test_table_large2 SELECT groupArraySample(50000000)(number) AS set, groupArraySample(2000)(number) AS subset FROM (SELECT * FROM numbers(50000000)) Settings max_execution_time=30 - INSERT INTO test_table_largef SELECT groupArraySample(50000000)(number) AS set, groupArraySample(10)(number) AS subset FROM (SELECT * FROM numbers(1000000000)) - - select hasAll(set, subset) from test_table_small - select hasAll(set, subset) from test_table_small2 - select hasAll(set, subset) from test_table_smallf - - select hasAll(set, subset) from test_table_medium - select hasAll(set, subset) from test_table_medium2 - select hasAll(set, subset) from test_table_mediumf - - select hasAll(set, subset) from test_table_large - select hasAll(set, subset) from test_table_large2 Settings max_execution_time=300 - select hasAll(set, subset) from test_table_largef - - DROP TABLE IF EXISTS test_table_small - DROP TABLE IF EXISTS test_table_small2 - DROP TABLE IF EXISTS test_table_smallf - - DROP TABLE IF EXISTS test_table_medium - DROP TABLE IF EXISTS test_table_medium2 - DROP TABLE IF EXISTS test_table_mediumf - - DROP TABLE IF EXISTS test_table_large - DROP TABLE IF EXISTS test_table_large2 - DROP TABLE IF EXISTS test_table_largef - diff --git a/tests/performance/hasAll_simd_int8.xml b/tests/performance/hasAll_simd_int8.xml deleted file mode 100644 index 5ddc84aa5bd..00000000000 --- a/tests/performance/hasAll_simd_int8.xml +++ /dev/null @@ -1,52 +0,0 @@ - - CREATE TABLE test_table_small (`set` Array(Int8), `subset` Array (Int8)) ENGINE = MergeTree ORDER BY set - CREATE TABLE test_table_small2 (`set` Array(Int8), `subset` Array (Int8)) ENGINE = MergeTree ORDER BY set - CREATE TABLE test_table_smallf (`set` Array(Int8), `subset` Array (Int8)) ENGINE = MergeTree ORDER BY set - - CREATE TABLE test_table_medium (`set` Array(Int8), `subset` Array (Int8)) ENGINE = MergeTree ORDER BY set - CREATE TABLE test_table_medium2 (`set` Array(Int8), `subset` Array (Int8)) ENGINE = MergeTree ORDER BY set - CREATE TABLE test_table_mediumf (`set` Array(Int8), `subset` Array (Int8)) ENGINE = MergeTree ORDER BY set - - CREATE TABLE test_table_large (`set` Array(Int8), `subset` Array (Int8)) ENGINE = MergeTree ORDER BY set - CREATE TABLE test_table_large2 (`set` Array(Int8), `subset` Array (Int8)) ENGINE = MergeTree ORDER BY set - CREATE TABLE test_table_largef (`set` Array(Int8), `subset` Array (Int8)) ENGINE = MergeTree ORDER BY set - - - INSERT 
INTO test_table_small SELECT groupArraySample(500)(number) AS set, groupArraySample(10)(number) AS subset FROM (SELECT * FROM numbers(500)) - INSERT INTO test_table_small2 SELECT groupArraySample(500)(number) AS set, groupArraySample(400)(number) AS subset FROM (SELECT * FROM numbers(500)) - INSERT INTO test_table_smallf SELECT groupArraySample(500)(number) AS set, groupArraySample(10)(number) AS subset FROM (SELECT * FROM numbers(5000000)) - - - INSERT INTO test_table_medium SELECT groupArraySample(1000000)(number) AS set, groupArraySample(10)(number) AS subset FROM (SELECT * FROM numbers(1000000)) - INSERT INTO test_table_medium2 SELECT groupArraySample(1000000)(number) AS set, groupArraySample(4000)(number) AS subset FROM (SELECT * FROM numbers(1000000)) - INSERT INTO test_table_mediumf SELECT groupArraySample(1000000)(number) AS set, groupArraySample(10)(number) AS subset FROM (SELECT * FROM numbers(100000000)) - - - INSERT INTO test_table_large SELECT groupArraySample(50000000)(number) AS set, groupArraySample(10)(number) AS subset FROM (SELECT * FROM numbers(50000000)) - INSERT INTO test_table_large2 SELECT groupArraySample(50000000)(number) AS set, groupArraySample(4000)(number) AS subset FROM (SELECT * FROM numbers(50000000)) - INSERT INTO test_table_largef SELECT groupArraySample(50000000)(number) AS set, groupArraySample(10)(number) AS subset FROM (SELECT * FROM numbers(1000000000)) - - select hasAll(set, subset) from test_table_small - select hasAll(set, subset) from test_table_small2 - select hasAll(set, subset) from test_table_smallf - - select hasAll(set, subset) from test_table_medium - select hasAll(set, subset) from test_table_medium2 - select hasAll(set, subset) from test_table_mediumf - - select hasAll(set, subset) from test_table_large - select hasAll(set, subset) from test_table_large2 Settings max_execution_time=300 - select hasAll(set, subset) from test_table_largef - - DROP TABLE IF EXISTS test_table_small - DROP TABLE IF EXISTS test_table_small2 - DROP TABLE IF EXISTS test_table_smallf - - DROP TABLE IF EXISTS test_table_medium - DROP TABLE IF EXISTS test_table_medium2 - DROP TABLE IF EXISTS test_table_mediumf - - DROP TABLE IF EXISTS test_table_large - DROP TABLE IF EXISTS test_table_large2 - DROP TABLE IF EXISTS test_table_largef - diff --git a/tests/performance/has_all.xml b/tests/performance/has_all.xml new file mode 100644 index 00000000000..331442cbfee --- /dev/null +++ b/tests/performance/has_all.xml @@ -0,0 +1,53 @@ + + + + array_type + + Int8 + Int16 + Int32 + Int64 + + + + + + CREATE TABLE test_table_small_{array_type} + ( + `set` Array({array_type}), + `subset` Array ({array_type}) + ) + ENGINE = MergeTree ORDER BY set; + + + + CREATE TABLE test_table_medium_{array_type} + ( + `set` Array({array_type}), + `subset` Array ({array_type}) + ) + ENGINE = MergeTree ORDER BY set; + + + + CREATE TABLE test_table_large_{array_type} + ( + `set` Array({array_type}), + `subset` Array ({array_type}) + ) + ENGINE = MergeTree ORDER BY set; + + + + INSERT INTO test_table_small_{array_type} SELECT groupArraySample(5000)(rand64()) AS set, groupArraySample(500)(rand64()) AS subset FROM numbers(10000000) GROUP BY number % 5000; + INSERT INTO test_table_medium_{array_type} SELECT groupArraySample(50000)(rand64()) AS set, groupArraySample(5000)(rand64()) AS subset FROM numbers(25000000) GROUP BY number % 50000; + INSERT INTO test_table_large_{array_type} SELECT groupArraySample(500000)(rand64()) AS set, groupArraySample(500000)(rand64()) AS subset FROM numbers(50000000) GROUP 
BY number % 500000; + + SELECT hasAll(set, subset) FROM test_table_small_{array_type} FORMAT Null + SELECT hasAll(set, subset) FROM test_table_medium_{array_type} FORMAT Null + SELECT hasAll(set, subset) FROM test_table_large_{array_type} FORMAT Null + + DROP TABLE IF EXISTS test_table_small_{array_type} + DROP TABLE IF EXISTS test_table_medium_{array_type} + DROP TABLE IF EXISTS test_table_large_{array_type} + From 09c04e4993ff357fa10d352f394dbe6204f4ee96 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?=E4=BD=95=E6=9D=8E=E5=A4=AB?= Date: Mon, 4 Apr 2022 19:56:41 +0800 Subject: [PATCH 132/239] Improve the pipeline description for JOIN (#35612) Improve the pipeline description for JOIN --- .../QueryPlan/ITransformingStep.cpp | 5 +++++ src/Processors/QueryPlan/ITransformingStep.h | 3 +++ src/QueryPipeline/QueryPipelineBuilder.cpp | 15 ++++++++++++++- .../02236_explain_pipeline_join.reference | 19 +++++++++++++++++++ .../02236_explain_pipeline_join.sql | 10 ++++++++++ 5 files changed, 51 insertions(+), 1 deletion(-) create mode 100644 tests/queries/0_stateless/02236_explain_pipeline_join.reference create mode 100644 tests/queries/0_stateless/02236_explain_pipeline_join.sql diff --git a/src/Processors/QueryPlan/ITransformingStep.cpp b/src/Processors/QueryPlan/ITransformingStep.cpp index 629fb89be1e..9b9797b6540 100644 --- a/src/Processors/QueryPlan/ITransformingStep.cpp +++ b/src/Processors/QueryPlan/ITransformingStep.cpp @@ -70,4 +70,9 @@ void ITransformingStep::describePipeline(FormatSettings & settings) const IQueryPlanStep::describePipeline(processors, settings); } +void ITransformingStep::appendExtraProcessors(const Processors & extra_processors) +{ + processors.insert(processors.end(), extra_processors.begin(), extra_processors.end()); +} + } diff --git a/src/Processors/QueryPlan/ITransformingStep.h b/src/Processors/QueryPlan/ITransformingStep.h index d87ca05d4bc..8f3641dd5bd 100644 --- a/src/Processors/QueryPlan/ITransformingStep.h +++ b/src/Processors/QueryPlan/ITransformingStep.h @@ -57,6 +57,9 @@ public: void describePipeline(FormatSettings & settings) const override; + /// Append extra processors for this step. + void appendExtraProcessors(const Processors & extra_processors); + protected: /// Clear distinct_columns if res_header doesn't contain all of them. static void updateDistinctColumns(const Block & res_header, NameSet & distinct_columns); diff --git a/src/QueryPipeline/QueryPipelineBuilder.cpp b/src/QueryPipeline/QueryPipelineBuilder.cpp index fcd3105a422..9f392b51cf0 100644 --- a/src/QueryPipeline/QueryPipelineBuilder.cpp +++ b/src/QueryPipeline/QueryPipelineBuilder.cpp @@ -1,5 +1,6 @@ #include +#include #include #include #include @@ -307,7 +308,15 @@ std::unique_ptr QueryPipelineBuilder::joinPipelines( right->pipe.dropExtremes(); left->pipe.collected_processors = collected_processors; - right->pipe.collected_processors = collected_processors; + + /// Collect the NEW processors for the right pipeline. + QueryPipelineProcessorsCollector collector(*right); + /// Remember the last step of the right pipeline. + ExpressionStep* step = typeid_cast(right->pipe.processors.back()->getQueryPlanStep()); + if (!step) + { + throw Exception(ErrorCodes::LOGICAL_ERROR, "The top step of the right pipeline should be ExpressionStep"); + } /// In case joined subquery has totals, and we don't, add default chunk to totals. 
bool default_totals = false; @@ -377,6 +386,10 @@ std::unique_ptr QueryPipelineBuilder::joinPipelines( left->pipe.processors.emplace_back(std::move(joining)); } + /// Move the collected processors to the last step in the right pipeline. + Processors processors = collector.detachProcessors(); + step->appendExtraProcessors(processors); + left->pipe.processors.insert(left->pipe.processors.end(), right->pipe.processors.begin(), right->pipe.processors.end()); left->pipe.holder = std::move(right->pipe.holder); left->pipe.header = left->pipe.output_ports.front()->getHeader(); diff --git a/tests/queries/0_stateless/02236_explain_pipeline_join.reference b/tests/queries/0_stateless/02236_explain_pipeline_join.reference new file mode 100644 index 00000000000..ed993e2a1e7 --- /dev/null +++ b/tests/queries/0_stateless/02236_explain_pipeline_join.reference @@ -0,0 +1,19 @@ +(Expression) +ExpressionTransform + (Join) + JoiningTransform 2 → 1 + (Expression) + ExpressionTransform + (SettingQuotaAndLimits) + (Limit) + Limit + (ReadFromStorage) + Numbers 0 → 1 + (Expression) + FillingRightJoinSide + ExpressionTransform + (SettingQuotaAndLimits) + (Limit) + Limit + (ReadFromStorage) + Numbers 0 → 1 diff --git a/tests/queries/0_stateless/02236_explain_pipeline_join.sql b/tests/queries/0_stateless/02236_explain_pipeline_join.sql new file mode 100644 index 00000000000..de885ed74ee --- /dev/null +++ b/tests/queries/0_stateless/02236_explain_pipeline_join.sql @@ -0,0 +1,10 @@ +EXPLAIN PIPELINE +SELECT * FROM +( + SELECT * FROM system.numbers LIMIT 10 +) t1 +ALL LEFT JOIN +( + SELECT * FROM system.numbers LIMIT 10 +) t2 +USING number; From 9b75ef6ce957b550aaf3fdb0ae2403227a4cfceb Mon Sep 17 00:00:00 2001 From: alesapin Date: Mon, 4 Apr 2022 14:03:16 +0200 Subject: [PATCH 133/239] Fix build --- src/Storages/MergeTree/MergeTreeData.cpp | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/Storages/MergeTree/MergeTreeData.cpp b/src/Storages/MergeTree/MergeTreeData.cpp index f77480dbaaf..8419f07ae73 100644 --- a/src/Storages/MergeTree/MergeTreeData.cpp +++ b/src/Storages/MergeTree/MergeTreeData.cpp @@ -2966,7 +2966,7 @@ void MergeTreeData::tryRemovePartImmediately(DataPartPtr && part) if (!((*it)->getState() == DataPartState::Outdated && it->unique())) { - if (!(*it)->getState() == DataPartState::Outdated) + if ((*it)->getState() != DataPartState::Outdated) LOG_WARNING("Cannot immediately remove part {} because it's not in Outdated state " "usage counter {}", part_name_with_state, it->use_count()); From 47528de78ba2d317f6532dab3bb07461f469049c Mon Sep 17 00:00:00 2001 From: Maksim Kita Date: Mon, 4 Apr 2022 14:07:05 +0200 Subject: [PATCH 134/239] Fix build --- src/CMakeLists.txt | 12 ++++++++++++ src/{Functions => Common}/TargetSpecific.cpp | 2 +- src/{Functions => Common}/TargetSpecific.h | 0 src/Functions/CMakeLists.txt | 11 ----------- src/Functions/FunctionStartsEndsWith.h | 2 +- src/Functions/FunctionsHashing.h | 2 +- src/Functions/FunctionsRandom.h | 2 +- src/Functions/GatherUtils/sliceHasImplAnyAll.h | 3 ++- src/Functions/PerformanceAdaptors.h | 2 +- src/Functions/greatCircleDistance.cpp | 2 +- 10 files changed, 20 insertions(+), 18 deletions(-) rename src/{Functions => Common}/TargetSpecific.cpp (96%) rename src/{Functions => Common}/TargetSpecific.h (100%) diff --git a/src/CMakeLists.txt b/src/CMakeLists.txt index 145015ad0f2..851c276cd10 100644 --- a/src/CMakeLists.txt +++ b/src/CMakeLists.txt @@ -49,6 +49,18 @@ if (COMPILER_GCC) add_definitions ("-fno-tree-loop-distribute-patterns") endif () 
+# ClickHouse developers may use platform-dependent code under some macro (e.g. `#ifdef ENABLE_MULTITARGET`). +# If turned ON, this option defines such macro. +# See `src/Common/TargetSpecific.h` +option(ENABLE_MULTITARGET_CODE "Enable platform-dependent code" ON) + +if (ENABLE_MULTITARGET_CODE) + add_definitions(-DENABLE_MULTITARGET_CODE=1) +else() + add_definitions(-DENABLE_MULTITARGET_CODE=0) +endif() + + add_subdirectory (Access) add_subdirectory (Backups) add_subdirectory (Columns) diff --git a/src/Functions/TargetSpecific.cpp b/src/Common/TargetSpecific.cpp similarity index 96% rename from src/Functions/TargetSpecific.cpp rename to src/Common/TargetSpecific.cpp index 830611fea7a..43319eff44b 100644 --- a/src/Functions/TargetSpecific.cpp +++ b/src/Common/TargetSpecific.cpp @@ -1,4 +1,4 @@ -#include +#include #include diff --git a/src/Functions/TargetSpecific.h b/src/Common/TargetSpecific.h similarity index 100% rename from src/Functions/TargetSpecific.h rename to src/Common/TargetSpecific.h diff --git a/src/Functions/CMakeLists.txt b/src/Functions/CMakeLists.txt index 2596b10503f..debe7fac8a5 100644 --- a/src/Functions/CMakeLists.txt +++ b/src/Functions/CMakeLists.txt @@ -96,17 +96,6 @@ if (TARGET ch_contrib::rapidjson) target_link_libraries(clickhouse_functions PRIVATE ch_contrib::rapidjson) endif() -# ClickHouse developers may use platform-dependent code under some macro (e.g. `#ifdef ENABLE_MULTITARGET`). -# If turned ON, this option defines such macro. -# See `src/Functions/TargetSpecific.h` -option(ENABLE_MULTITARGET_CODE "Enable platform-dependent code" ON) - -if (ENABLE_MULTITARGET_CODE) - add_definitions(-DENABLE_MULTITARGET_CODE=1) -else() - add_definitions(-DENABLE_MULTITARGET_CODE=0) -endif() - add_subdirectory(GatherUtils) target_link_libraries(clickhouse_functions PRIVATE clickhouse_functions_gatherutils) diff --git a/src/Functions/FunctionStartsEndsWith.h b/src/Functions/FunctionStartsEndsWith.h index bbe1631fdf9..f6e0d6375c6 100644 --- a/src/Functions/FunctionStartsEndsWith.h +++ b/src/Functions/FunctionStartsEndsWith.h @@ -1,12 +1,12 @@ #pragma once #include +#include #include #include #include #include #include -#include #include #include #include diff --git a/src/Functions/FunctionsHashing.h b/src/Functions/FunctionsHashing.h index 88a0e9524b3..b78ecb5c72a 100644 --- a/src/Functions/FunctionsHashing.h +++ b/src/Functions/FunctionsHashing.h @@ -38,8 +38,8 @@ #include #include #include -#include #include +#include #include #include diff --git a/src/Functions/FunctionsRandom.h b/src/Functions/FunctionsRandom.h index 2dacd6d6db9..937bc9d36dd 100644 --- a/src/Functions/FunctionsRandom.h +++ b/src/Functions/FunctionsRandom.h @@ -1,9 +1,9 @@ #pragma once +#include #include #include #include -#include #include #include diff --git a/src/Functions/GatherUtils/sliceHasImplAnyAll.h b/src/Functions/GatherUtils/sliceHasImplAnyAll.h index ec8daceb990..68f31006b4f 100644 --- a/src/Functions/GatherUtils/sliceHasImplAnyAll.h +++ b/src/Functions/GatherUtils/sliceHasImplAnyAll.h @@ -14,7 +14,8 @@ #include #endif -#include +#include + namespace DB::GatherUtils { diff --git a/src/Functions/PerformanceAdaptors.h b/src/Functions/PerformanceAdaptors.h index bcc195e988e..5b690d83805 100644 --- a/src/Functions/PerformanceAdaptors.h +++ b/src/Functions/PerformanceAdaptors.h @@ -1,8 +1,8 @@ #pragma once -#include #include +#include #include #include diff --git a/src/Functions/greatCircleDistance.cpp b/src/Functions/greatCircleDistance.cpp index f0743486584..9b0d2625914 100644 --- 
a/src/Functions/greatCircleDistance.cpp +++ b/src/Functions/greatCircleDistance.cpp @@ -6,8 +6,8 @@ #include #include #include -#include #include +#include #include #include From bd89fcafdbc44b4b41f1c7458af5eeedec062774 Mon Sep 17 00:00:00 2001 From: Nikita Taranov Date: Mon, 4 Apr 2022 14:17:15 +0200 Subject: [PATCH 135/239] Make `SortDescription::column_name` always non-empty (#35805) --- src/Core/Block.cpp | 3 +- src/Core/SortCursor.h | 32 ++++------- src/Core/SortDescription.cpp | 35 +++--------- src/Core/SortDescription.h | 54 +++++++++++-------- src/Interpreters/InterpreterSelectQuery.cpp | 4 -- src/Interpreters/MutationsInterpreter.cpp | 2 +- src/Interpreters/Set.cpp | 4 +- src/Interpreters/sortBlock.cpp | 4 +- src/Processors/LimitTransform.cpp | 7 +-- .../Algorithms/AggregatingSortedAlgorithm.cpp | 11 ++-- .../Algorithms/CollapsingSortedAlgorithm.cpp | 8 +-- .../FinishAggregatingInOrderAlgorithm.cpp | 27 +++------- .../FinishAggregatingInOrderAlgorithm.h | 6 +-- .../GraphiteRollupSortedAlgorithm.cpp | 18 ++++--- .../IMergingAlgorithmWithDelayedChunk.cpp | 13 ++--- .../IMergingAlgorithmWithDelayedChunk.h | 6 +-- .../IMergingAlgorithmWithSharedChunks.cpp | 12 ++--- .../IMergingAlgorithmWithSharedChunks.h | 6 +-- .../Algorithms/MergingSortedAlgorithm.cpp | 20 +++---- .../Algorithms/MergingSortedAlgorithm.h | 4 +- .../Algorithms/ReplacingSortedAlgorithm.cpp | 18 ++++--- .../Algorithms/SummingSortedAlgorithm.cpp | 17 +++--- .../VersionedCollapsingAlgorithm.cpp | 13 ++--- src/Processors/QueryPlan/FillingStep.cpp | 4 +- .../QueryPlan/ReadFromMergeTree.cpp | 11 +--- src/Processors/QueryPlan/SortingStep.cpp | 12 ++--- src/Processors/QueryPlan/WindowStep.cpp | 2 +- .../AggregatingInOrderTransform.cpp | 12 +---- .../Transforms/AggregatingInOrderTransform.h | 2 +- .../Transforms/CheckSortedTransform.cpp | 30 ++--------- .../Transforms/CheckSortedTransform.h | 11 +--- .../Transforms/DistinctSortedTransform.cpp | 11 ++-- .../Transforms/DistinctSortedTransform.h | 6 ++- .../Transforms/FinishSortingTransform.cpp | 14 +++-- .../Transforms/FinishSortingTransform.h | 9 ++-- .../Transforms/MergeSortingTransform.cpp | 21 +++++--- .../Transforms/MergeSortingTransform.h | 17 +++--- .../Transforms/PartialSortingTransform.cpp | 4 +- .../Transforms/SortingTransform.cpp | 19 ++----- src/Processors/Transforms/SortingTransform.h | 8 +-- src/Storages/MergeTree/MergeTask.cpp | 2 +- .../MergeTree/MergeTreeDataWriter.cpp | 4 +- 42 files changed, 218 insertions(+), 305 deletions(-) diff --git a/src/Core/Block.cpp b/src/Core/Block.cpp index a7142ef7f2e..60d2eba4f08 100644 --- a/src/Core/Block.cpp +++ b/src/Core/Block.cpp @@ -46,7 +46,8 @@ static ReturnType checkColumnStructure(const ColumnWithTypeAndName & actual, con return onError("Block structure mismatch in " + std::string(context_description) + " stream: different names of columns:\n" + actual.dumpStructure() + "\n" + expected.dumpStructure(), code); - if (!actual.type->equals(*expected.type)) + if ((actual.type && !expected.type) || (!actual.type && expected.type) + || (actual.type && expected.type && !actual.type->equals(*expected.type))) return onError("Block structure mismatch in " + std::string(context_description) + " stream: different types:\n" + actual.dumpStructure() + "\n" + expected.dumpStructure(), code); diff --git a/src/Core/SortCursor.h b/src/Core/SortCursor.h index a5daba9fbee..a0f60fbccf8 100644 --- a/src/Core/SortCursor.h +++ b/src/Core/SortCursor.h @@ -15,10 +15,6 @@ namespace DB { -namespace ErrorCodes -{ - extern const int LOGICAL_ERROR; -} 
/** Cursor allows to compare rows in different blocks (and parts). * Cursor moves inside single block. @@ -61,25 +57,21 @@ struct SortCursorImpl reset(block, perm); } - SortCursorImpl(const Columns & columns, const SortDescription & desc_, size_t order_ = 0, IColumn::Permutation * perm = nullptr) + SortCursorImpl( + const Block & header, + const Columns & columns, + const SortDescription & desc_, + size_t order_ = 0, + IColumn::Permutation * perm = nullptr) : desc(desc_), sort_columns_size(desc.size()), order(order_), need_collation(desc.size()) { - for (auto & column_desc : desc) - { - if (!column_desc.column_name.empty()) - throw Exception("SortDescription should contain column position if SortCursor was used without header.", - ErrorCodes::LOGICAL_ERROR); - } - reset(columns, {}, perm); + reset(columns, header, perm); } bool empty() const { return rows == 0; } /// Set the cursor to the beginning of the new block. - void reset(const Block & block, IColumn::Permutation * perm = nullptr) - { - reset(block.getColumns(), block, perm); - } + void reset(const Block & block, IColumn::Permutation * perm = nullptr) { reset(block.getColumns(), block, perm); } /// Set the cursor to the beginning of the new block. void reset(const Columns & columns, const Block & block, IColumn::Permutation * perm = nullptr) @@ -95,9 +87,7 @@ struct SortCursorImpl for (size_t j = 0, size = desc.size(); j < size; ++j) { auto & column_desc = desc[j]; - size_t column_number = !column_desc.column_name.empty() - ? block.getPositionByName(column_desc.column_name) - : column_desc.column_number; + size_t column_number = block.getPositionByName(column_desc.column_name); sort_columns.push_back(columns[column_number].get()); need_collation[j] = desc[j].collator != nullptr && sort_columns.back()->isCollationSupported(); @@ -367,12 +357,12 @@ private: }; template -bool less(const TLeftColumns & lhs, const TRightColumns & rhs, size_t i, size_t j, const SortDescription & descr) +bool less(const TLeftColumns & lhs, const TRightColumns & rhs, size_t i, size_t j, const SortDescriptionWithPositions & descr) { for (const auto & elem : descr) { size_t ind = elem.column_number; - int res = elem.direction * lhs[ind]->compareAt(i, j, *rhs[ind], elem.nulls_direction); + int res = elem.base.direction * lhs[ind]->compareAt(i, j, *rhs[ind], elem.base.nulls_direction); if (res < 0) return true; else if (res > 0) diff --git a/src/Core/SortDescription.cpp b/src/Core/SortDescription.cpp index 314b6624623..7994ada7b85 100644 --- a/src/Core/SortDescription.cpp +++ b/src/Core/SortDescription.cpp @@ -1,12 +1,12 @@ -#include #include +#include #include #include namespace DB { -void dumpSortDescription(const SortDescription & description, const Block & header, WriteBuffer & out) +void dumpSortDescription(const SortDescription & description, WriteBuffer & out) { bool first = true; @@ -16,17 +16,7 @@ void dumpSortDescription(const SortDescription & description, const Block & head out << ", "; first = false; - if (!desc.column_name.empty()) - out << desc.column_name; - else - { - if (desc.column_number < header.columns()) - out << header.getByPosition(desc.column_number).name; - else - out << "?"; - - out << " (pos " << desc.column_number << ")"; - } + out << desc.column_name; if (desc.direction > 0) out << " ASC"; @@ -38,18 +28,9 @@ void dumpSortDescription(const SortDescription & description, const Block & head } } -void SortColumnDescription::explain(JSONBuilder::JSONMap & map, const Block & header) const +void 
SortColumnDescription::explain(JSONBuilder::JSONMap & map) const { - if (!column_name.empty()) - map.add("Column", column_name); - else - { - if (column_number < header.columns()) - map.add("Column", header.getByPosition(column_number).name); - - map.add("Position", column_number); - } - + map.add("Column", column_name); map.add("Ascending", direction > 0); map.add("With Fill", with_fill); } @@ -57,17 +38,17 @@ void SortColumnDescription::explain(JSONBuilder::JSONMap & map, const Block & he std::string dumpSortDescription(const SortDescription & description) { WriteBufferFromOwnString wb; - dumpSortDescription(description, Block{}, wb); + dumpSortDescription(description, wb); return wb.str(); } -JSONBuilder::ItemPtr explainSortDescription(const SortDescription & description, const Block & header) +JSONBuilder::ItemPtr explainSortDescription(const SortDescription & description) { auto json_array = std::make_unique(); for (const auto & descr : description) { auto json_map = std::make_unique(); - descr.explain(*json_map, header); + descr.explain(*json_map); json_array->add(std::move(json_map)); } diff --git a/src/Core/SortDescription.h b/src/Core/SortDescription.h index db15f3a54db..66f2ca24c69 100644 --- a/src/Core/SortDescription.h +++ b/src/Core/SortDescription.h @@ -39,7 +39,6 @@ struct FillColumnDescription struct SortColumnDescription { std::string column_name; /// The name of the column. - size_t column_number; /// Column number (used if no name is given). int direction; /// 1 - ascending, -1 - descending. int nulls_direction; /// 1 - NULLs and NaNs are greater, -1 - less. /// To achieve NULLS LAST, set it equal to direction, to achieve NULLS FIRST, set it opposite. @@ -48,23 +47,24 @@ struct SortColumnDescription FillColumnDescription fill_description; explicit SortColumnDescription( - size_t column_number_, int direction_ = 1, int nulls_direction_ = 1, - const std::shared_ptr & collator_ = nullptr, - bool with_fill_ = false, const FillColumnDescription & fill_description_ = {}) - : column_number(column_number_), direction(direction_), nulls_direction(nulls_direction_), collator(collator_) - , with_fill(with_fill_), fill_description(fill_description_) {} - - explicit SortColumnDescription( - const std::string & column_name_, int direction_ = 1, int nulls_direction_ = 1, - const std::shared_ptr & collator_ = nullptr, - bool with_fill_ = false, const FillColumnDescription & fill_description_ = {}) - : column_name(column_name_), column_number(0), direction(direction_), nulls_direction(nulls_direction_) - , collator(collator_), with_fill(with_fill_), fill_description(fill_description_) {} + const std::string & column_name_, + int direction_ = 1, + int nulls_direction_ = 1, + const std::shared_ptr & collator_ = nullptr, + bool with_fill_ = false, + const FillColumnDescription & fill_description_ = {}) + : column_name(column_name_) + , direction(direction_) + , nulls_direction(nulls_direction_) + , collator(collator_) + , with_fill(with_fill_) + , fill_description(fill_description_) + { + } bool operator == (const SortColumnDescription & other) const { - return column_name == other.column_name && column_number == other.column_number - && direction == other.direction && nulls_direction == other.nulls_direction; + return column_name == other.column_name && direction == other.direction && nulls_direction == other.nulls_direction; } bool operator != (const SortColumnDescription & other) const @@ -72,22 +72,30 @@ struct SortColumnDescription return !(*this == other); } - std::string dump() const 
- { - return fmt::format("{}:{}:dir {}nulls ", column_name, column_number, direction, nulls_direction); - } + std::string dump() const { return fmt::format("{}:dir {}nulls {}", column_name, direction, nulls_direction); } - void explain(JSONBuilder::JSONMap & map, const Block & header) const; + void explain(JSONBuilder::JSONMap & map) const; +}; + +struct SortColumnDescriptionWithColumnIndex +{ + SortColumnDescription base; + size_t column_number; + + SortColumnDescriptionWithColumnIndex(SortColumnDescription description_, size_t column_number_) + : base(std::move(description_)), column_number(column_number_) + { + } }; /// Description of the sorting rule for several columns. using SortDescription = std::vector; +using SortDescriptionWithPositions = std::vector; /// Outputs user-readable description into `out`. -void dumpSortDescription(const SortDescription & description, const Block & header, WriteBuffer & out); +void dumpSortDescription(const SortDescription & description, WriteBuffer & out); std::string dumpSortDescription(const SortDescription & description); -JSONBuilder::ItemPtr explainSortDescription(const SortDescription & description, const Block & header); - +JSONBuilder::ItemPtr explainSortDescription(const SortDescription & description); } diff --git a/src/Interpreters/InterpreterSelectQuery.cpp b/src/Interpreters/InterpreterSelectQuery.cpp index dda2e3f2142..5091debbe72 100644 --- a/src/Interpreters/InterpreterSelectQuery.cpp +++ b/src/Interpreters/InterpreterSelectQuery.cpp @@ -2249,10 +2249,6 @@ static bool windowDescriptionComparator(const WindowDescription * _left, const W return true; else if (left[i].column_name > right[i].column_name) return false; - else if (left[i].column_number < right[i].column_number) - return true; - else if (left[i].column_number > right[i].column_number) - return false; else if (left[i].direction < right[i].direction) return true; else if (left[i].direction > right[i].direction) diff --git a/src/Interpreters/MutationsInterpreter.cpp b/src/Interpreters/MutationsInterpreter.cpp index 5e795c5760a..f46333dc00a 100644 --- a/src/Interpreters/MutationsInterpreter.cpp +++ b/src/Interpreters/MutationsInterpreter.cpp @@ -1025,7 +1025,7 @@ std::optional MutationsInterpreter::getStorageSortDescriptionIf for (size_t i = 0; i < sort_columns_size; ++i) { if (header.has(sort_columns[i])) - sort_description.emplace_back(header.getPositionByName(sort_columns[i]), 1, 1); + sort_description.emplace_back(sort_columns[i], 1, 1); else return {}; } diff --git a/src/Interpreters/Set.cpp b/src/Interpreters/Set.cpp index 224b13d2c45..28bbea54110 100644 --- a/src/Interpreters/Set.cpp +++ b/src/Interpreters/Set.cpp @@ -430,8 +430,8 @@ MergeTreeSetIndex::MergeTreeSetIndex(const Columns & set_elements, std::vectorgetName()}); + sort_description.emplace_back(ordered_set[i]->getName(), 1, 1); } sortBlock(block_to_sort, sort_description); diff --git a/src/Interpreters/sortBlock.cpp b/src/Interpreters/sortBlock.cpp index 3281445022e..4343e8c7fc6 100644 --- a/src/Interpreters/sortBlock.cpp +++ b/src/Interpreters/sortBlock.cpp @@ -98,9 +98,7 @@ ColumnsWithSortDescriptions getColumnsWithSortDescription(const Block & block, c { const auto & sort_column_description = description[i]; - const IColumn * column = !sort_column_description.column_name.empty() - ? 
block.getByName(sort_column_description.column_name).column.get() - : block.safeGetByPosition(sort_column_description.column_number).column.get(); + const IColumn * column = block.getByName(sort_column_description.column_name).column.get(); if (isCollationRequired(sort_column_description)) { diff --git a/src/Processors/LimitTransform.cpp b/src/Processors/LimitTransform.cpp index 36c58e1454e..48f29680da2 100644 --- a/src/Processors/LimitTransform.cpp +++ b/src/Processors/LimitTransform.cpp @@ -38,12 +38,7 @@ LimitTransform::LimitTransform( } for (const auto & desc : description) - { - if (!desc.column_name.empty()) - sort_column_positions.push_back(header_.getPositionByName(desc.column_name)); - else - sort_column_positions.push_back(desc.column_number); - } + sort_column_positions.push_back(header_.getPositionByName(desc.column_name)); } Chunk LimitTransform::makeChunkWithPreviousRow(const Chunk & chunk, UInt64 row) const diff --git a/src/Processors/Merges/Algorithms/AggregatingSortedAlgorithm.cpp b/src/Processors/Merges/Algorithms/AggregatingSortedAlgorithm.cpp index af31ef01fcd..ebc1b37074b 100644 --- a/src/Processors/Merges/Algorithms/AggregatingSortedAlgorithm.cpp +++ b/src/Processors/Merges/Algorithms/AggregatingSortedAlgorithm.cpp @@ -104,7 +104,7 @@ static AggregatingSortedAlgorithm::ColumnsDefinition defineColumns( /// Included into PK? auto it = description.begin(); for (; it != description.end(); ++it) - if (it->column_name == column.name || (it->column_name.empty() && it->column_number == i)) + if (it->column_name == column.name) break; if (it != description.end()) @@ -290,11 +290,10 @@ void AggregatingSortedAlgorithm::AggregatingMergedData::initAggregateDescription AggregatingSortedAlgorithm::AggregatingSortedAlgorithm( - const Block & header, size_t num_inputs, - SortDescription description_, size_t max_block_size) - : IMergingAlgorithmWithDelayedChunk(num_inputs, description_) - , columns_definition(defineColumns(header, description_)) - , merged_data(getMergedColumns(header, columns_definition), max_block_size, columns_definition) + const Block & header_, size_t num_inputs, SortDescription description_, size_t max_block_size) + : IMergingAlgorithmWithDelayedChunk(header_, num_inputs, description_) + , columns_definition(defineColumns(header_, description_)) + , merged_data(getMergedColumns(header_, columns_definition), max_block_size, columns_definition) { } diff --git a/src/Processors/Merges/Algorithms/CollapsingSortedAlgorithm.cpp b/src/Processors/Merges/Algorithms/CollapsingSortedAlgorithm.cpp index 592562c47b9..5dfec31c009 100644 --- a/src/Processors/Merges/Algorithms/CollapsingSortedAlgorithm.cpp +++ b/src/Processors/Merges/Algorithms/CollapsingSortedAlgorithm.cpp @@ -21,7 +21,7 @@ namespace ErrorCodes } CollapsingSortedAlgorithm::CollapsingSortedAlgorithm( - const Block & header, + const Block & header_, size_t num_inputs, SortDescription description_, const String & sign_column, @@ -30,9 +30,9 @@ CollapsingSortedAlgorithm::CollapsingSortedAlgorithm( Poco::Logger * log_, WriteBuffer * out_row_sources_buf_, bool use_average_block_sizes) - : IMergingAlgorithmWithSharedChunks(num_inputs, std::move(description_), out_row_sources_buf_, max_row_refs) - , merged_data(header.cloneEmptyColumns(), use_average_block_sizes, max_block_size) - , sign_column_number(header.getPositionByName(sign_column)) + : IMergingAlgorithmWithSharedChunks(header_, num_inputs, std::move(description_), out_row_sources_buf_, max_row_refs) + , merged_data(header_.cloneEmptyColumns(), 
use_average_block_sizes, max_block_size) + , sign_column_number(header_.getPositionByName(sign_column)) , only_positive_sign(only_positive_sign_) , log(log_) { diff --git a/src/Processors/Merges/Algorithms/FinishAggregatingInOrderAlgorithm.cpp b/src/Processors/Merges/Algorithms/FinishAggregatingInOrderAlgorithm.cpp index fdea3c23dc2..5d8a593c682 100644 --- a/src/Processors/Merges/Algorithms/FinishAggregatingInOrderAlgorithm.cpp +++ b/src/Processors/Merges/Algorithms/FinishAggregatingInOrderAlgorithm.cpp @@ -14,11 +14,8 @@ namespace ErrorCodes extern const int LOGICAL_ERROR; } -FinishAggregatingInOrderAlgorithm::State::State( - const Chunk & chunk, const SortDescription & desc, Int64 total_bytes_) - : all_columns(chunk.getColumns()) - , num_rows(chunk.getNumRows()) - , total_bytes(total_bytes_) +FinishAggregatingInOrderAlgorithm::State::State(const Chunk & chunk, const SortDescriptionWithPositions & desc, Int64 total_bytes_) + : all_columns(chunk.getColumns()), num_rows(chunk.getNumRows()), total_bytes(total_bytes_) { if (!chunk) return; @@ -32,25 +29,13 @@ FinishAggregatingInOrderAlgorithm::FinishAggregatingInOrderAlgorithm( const Block & header_, size_t num_inputs_, AggregatingTransformParamsPtr params_, - SortDescription description_, + const SortDescription & description_, size_t max_block_size_, size_t max_block_bytes_) - : header(header_) - , num_inputs(num_inputs_) - , params(params_) - , description(std::move(description_)) - , max_block_size(max_block_size_) - , max_block_bytes(max_block_bytes_) + : header(header_), num_inputs(num_inputs_), params(params_), max_block_size(max_block_size_), max_block_bytes(max_block_bytes_) { - /// Replace column names in description to positions. - for (auto & column_description : description) - { - if (!column_description.column_name.empty()) - { - column_description.column_number = header_.getPositionByName(column_description.column_name); - column_description.column_name.clear(); - } - } + for (const auto & column_description : description_) + description.emplace_back(column_description, header_.getPositionByName(column_description.column_name)); } void FinishAggregatingInOrderAlgorithm::initialize(Inputs inputs) diff --git a/src/Processors/Merges/Algorithms/FinishAggregatingInOrderAlgorithm.h b/src/Processors/Merges/Algorithms/FinishAggregatingInOrderAlgorithm.h index f3a1bd40635..ff31886f438 100644 --- a/src/Processors/Merges/Algorithms/FinishAggregatingInOrderAlgorithm.h +++ b/src/Processors/Merges/Algorithms/FinishAggregatingInOrderAlgorithm.h @@ -41,7 +41,7 @@ public: const Block & header_, size_t num_inputs_, AggregatingTransformParamsPtr params_, - SortDescription description_, + const SortDescription & description_, size_t max_block_size_, size_t max_block_bytes_); @@ -69,7 +69,7 @@ private: /// Number of bytes in all columns + number of bytes in arena, related to current chunk. 
size_t total_bytes = 0; - State(const Chunk & chunk, const SortDescription & description, Int64 total_bytes_); + State(const Chunk & chunk, const SortDescriptionWithPositions & description, Int64 total_bytes_); State() = default; bool isValid() const { return current_row < num_rows; } @@ -78,7 +78,7 @@ private: Block header; size_t num_inputs; AggregatingTransformParamsPtr params; - SortDescription description; + SortDescriptionWithPositions description; size_t max_block_size; size_t max_block_bytes; diff --git a/src/Processors/Merges/Algorithms/GraphiteRollupSortedAlgorithm.cpp b/src/Processors/Merges/Algorithms/GraphiteRollupSortedAlgorithm.cpp index 6464f10ca58..eff62d73f50 100644 --- a/src/Processors/Merges/Algorithms/GraphiteRollupSortedAlgorithm.cpp +++ b/src/Processors/Merges/Algorithms/GraphiteRollupSortedAlgorithm.cpp @@ -30,12 +30,16 @@ static GraphiteRollupSortedAlgorithm::ColumnsDefinition defineColumns( } GraphiteRollupSortedAlgorithm::GraphiteRollupSortedAlgorithm( - const Block & header, size_t num_inputs, - SortDescription description_, size_t max_block_size, - Graphite::Params params_, time_t time_of_merge_) - : IMergingAlgorithmWithSharedChunks(num_inputs, std::move(description_), nullptr, max_row_refs) - , merged_data(header.cloneEmptyColumns(), false, max_block_size) - , params(std::move(params_)), time_of_merge(time_of_merge_) + const Block & header_, + size_t num_inputs, + SortDescription description_, + size_t max_block_size, + Graphite::Params params_, + time_t time_of_merge_) + : IMergingAlgorithmWithSharedChunks(header_, num_inputs, std::move(description_), nullptr, max_row_refs) + , merged_data(header_.cloneEmptyColumns(), false, max_block_size) + , params(std::move(params_)) + , time_of_merge(time_of_merge_) { size_t max_size_of_aggregate_state = 0; size_t max_alignment_of_aggregate_state = 1; @@ -50,7 +54,7 @@ GraphiteRollupSortedAlgorithm::GraphiteRollupSortedAlgorithm( } merged_data.allocMemForAggregates(max_size_of_aggregate_state, max_alignment_of_aggregate_state); - columns_definition = defineColumns(header, params); + columns_definition = defineColumns(header_, params); } UInt32 GraphiteRollupSortedAlgorithm::selectPrecision(const Graphite::Retentions & retentions, time_t time) const diff --git a/src/Processors/Merges/Algorithms/IMergingAlgorithmWithDelayedChunk.cpp b/src/Processors/Merges/Algorithms/IMergingAlgorithmWithDelayedChunk.cpp index e4c60d7609c..1d0be726c16 100644 --- a/src/Processors/Merges/Algorithms/IMergingAlgorithmWithDelayedChunk.cpp +++ b/src/Processors/Merges/Algorithms/IMergingAlgorithmWithDelayedChunk.cpp @@ -4,12 +4,8 @@ namespace DB { -IMergingAlgorithmWithDelayedChunk::IMergingAlgorithmWithDelayedChunk( - size_t num_inputs, - SortDescription description_) - : description(std::move(description_)) - , current_inputs(num_inputs) - , cursors(num_inputs) +IMergingAlgorithmWithDelayedChunk::IMergingAlgorithmWithDelayedChunk(Block header_, size_t num_inputs, SortDescription description_) + : description(std::move(description_)), header(std::move(header_)), current_inputs(num_inputs), cursors(num_inputs) { } @@ -22,7 +18,8 @@ void IMergingAlgorithmWithDelayedChunk::initializeQueue(Inputs inputs) if (!current_inputs[source_num].chunk) continue; - cursors[source_num] = SortCursorImpl(current_inputs[source_num].chunk.getColumns(), description, source_num, current_inputs[source_num].permutation); + cursors[source_num] = SortCursorImpl( + header, current_inputs[source_num].chunk.getColumns(), description, source_num, 
current_inputs[source_num].permutation); } queue = SortingHeap(cursors); @@ -37,7 +34,7 @@ void IMergingAlgorithmWithDelayedChunk::updateCursor(Input & input, size_t sourc last_chunk_sort_columns = std::move(cursors[source_num].sort_columns); current_input.swap(input); - cursors[source_num].reset(current_input.chunk.getColumns(), {}, current_input.permutation); + cursors[source_num].reset(current_input.chunk.getColumns(), header, current_input.permutation); queue.push(cursors[source_num]); } diff --git a/src/Processors/Merges/Algorithms/IMergingAlgorithmWithDelayedChunk.h b/src/Processors/Merges/Algorithms/IMergingAlgorithmWithDelayedChunk.h index 69530a707c2..e9f735f4a71 100644 --- a/src/Processors/Merges/Algorithms/IMergingAlgorithmWithDelayedChunk.h +++ b/src/Processors/Merges/Algorithms/IMergingAlgorithmWithDelayedChunk.h @@ -10,9 +10,7 @@ namespace DB class IMergingAlgorithmWithDelayedChunk : public IMergingAlgorithm { public: - IMergingAlgorithmWithDelayedChunk( - size_t num_inputs, - SortDescription description_); + IMergingAlgorithmWithDelayedChunk(Block header_, size_t num_inputs, SortDescription description_); protected: SortingHeap queue; @@ -28,6 +26,8 @@ protected: bool skipLastRowFor(size_t input_number) const { return current_inputs[input_number].skip_last_row; } private: + Block header; + /// Inputs currently being merged. Inputs current_inputs; SortCursorImpls cursors; diff --git a/src/Processors/Merges/Algorithms/IMergingAlgorithmWithSharedChunks.cpp b/src/Processors/Merges/Algorithms/IMergingAlgorithmWithSharedChunks.cpp index 97abffdc167..2e87de1ae29 100644 --- a/src/Processors/Merges/Algorithms/IMergingAlgorithmWithSharedChunks.cpp +++ b/src/Processors/Merges/Algorithms/IMergingAlgorithmWithSharedChunks.cpp @@ -4,11 +4,9 @@ namespace DB { IMergingAlgorithmWithSharedChunks::IMergingAlgorithmWithSharedChunks( - size_t num_inputs, - SortDescription description_, - WriteBuffer * out_row_sources_buf_, - size_t max_row_refs) - : description(std::move(description_)) + Block header_, size_t num_inputs, SortDescription description_, WriteBuffer * out_row_sources_buf_, size_t max_row_refs) + : header(std::move(header_)) + , description(std::move(description_)) , chunk_allocator(num_inputs + max_row_refs) , cursors(num_inputs) , sources(num_inputs) @@ -39,7 +37,7 @@ void IMergingAlgorithmWithSharedChunks::initialize(Inputs inputs) source.skip_last_row = inputs[source_num].skip_last_row; source.chunk = chunk_allocator.alloc(inputs[source_num].chunk); - cursors[source_num] = SortCursorImpl(source.chunk->getColumns(), description, source_num, inputs[source_num].permutation); + cursors[source_num] = SortCursorImpl(header, source.chunk->getColumns(), description, source_num, inputs[source_num].permutation); source.chunk->all_columns = cursors[source_num].all_columns; source.chunk->sort_columns = cursors[source_num].sort_columns; @@ -55,7 +53,7 @@ void IMergingAlgorithmWithSharedChunks::consume(Input & input, size_t source_num auto & source = sources[source_num]; source.skip_last_row = input.skip_last_row; source.chunk = chunk_allocator.alloc(input.chunk); - cursors[source_num].reset(source.chunk->getColumns(), {}, input.permutation); + cursors[source_num].reset(source.chunk->getColumns(), header, input.permutation); source.chunk->all_columns = cursors[source_num].all_columns; source.chunk->sort_columns = cursors[source_num].sort_columns; diff --git a/src/Processors/Merges/Algorithms/IMergingAlgorithmWithSharedChunks.h 
b/src/Processors/Merges/Algorithms/IMergingAlgorithmWithSharedChunks.h index 65c456ea44c..32ef23ab6e5 100644 --- a/src/Processors/Merges/Algorithms/IMergingAlgorithmWithSharedChunks.h +++ b/src/Processors/Merges/Algorithms/IMergingAlgorithmWithSharedChunks.h @@ -10,15 +10,13 @@ class IMergingAlgorithmWithSharedChunks : public IMergingAlgorithm { public: IMergingAlgorithmWithSharedChunks( - size_t num_inputs, - SortDescription description_, - WriteBuffer * out_row_sources_buf_, - size_t max_row_refs); + Block header_, size_t num_inputs, SortDescription description_, WriteBuffer * out_row_sources_buf_, size_t max_row_refs); void initialize(Inputs inputs) override; void consume(Input & input, size_t source_num) override; private: + Block header; SortDescription description; /// Allocator must be destroyed after source_chunks. diff --git a/src/Processors/Merges/Algorithms/MergingSortedAlgorithm.cpp b/src/Processors/Merges/Algorithms/MergingSortedAlgorithm.cpp index 6b2f0f571a1..1765615f9d1 100644 --- a/src/Processors/Merges/Algorithms/MergingSortedAlgorithm.cpp +++ b/src/Processors/Merges/Algorithms/MergingSortedAlgorithm.cpp @@ -11,30 +11,22 @@ namespace ErrorCodes } MergingSortedAlgorithm::MergingSortedAlgorithm( - const Block & header, + Block header_, size_t num_inputs, SortDescription description_, size_t max_block_size, UInt64 limit_, WriteBuffer * out_row_sources_buf_, bool use_average_block_sizes) - : merged_data(header.cloneEmptyColumns(), use_average_block_sizes, max_block_size) + : header(std::move(header_)) + , merged_data(header.cloneEmptyColumns(), use_average_block_sizes, max_block_size) , description(std::move(description_)) , limit(limit_) + , has_collation(std::any_of(description.begin(), description.end(), [](const auto & descr) { return descr.collator != nullptr; })) , out_row_sources_buf(out_row_sources_buf_) , current_inputs(num_inputs) , cursors(num_inputs) { - /// Replace column names in description to positions. 
- for (auto & column_description : description) - { - has_collation |= column_description.collator != nullptr; - if (!column_description.column_name.empty()) - { - column_description.column_number = header.getPositionByName(column_description.column_name); - column_description.column_name.clear(); - } - } } void MergingSortedAlgorithm::addInput() @@ -65,7 +57,7 @@ void MergingSortedAlgorithm::initialize(Inputs inputs) continue; prepareChunk(chunk); - cursors[source_num] = SortCursorImpl(chunk.getColumns(), description, source_num); + cursors[source_num] = SortCursorImpl(header, chunk.getColumns(), description, source_num); } if (has_collation) @@ -78,7 +70,7 @@ void MergingSortedAlgorithm::consume(Input & input, size_t source_num) { prepareChunk(input.chunk); current_inputs[source_num].swap(input); - cursors[source_num].reset(current_inputs[source_num].chunk.getColumns(), {}); + cursors[source_num].reset(current_inputs[source_num].chunk.getColumns(), header); if (has_collation) queue_with_collation.push(cursors[source_num]); diff --git a/src/Processors/Merges/Algorithms/MergingSortedAlgorithm.h b/src/Processors/Merges/Algorithms/MergingSortedAlgorithm.h index 63dced26dd4..cf3ec44f5fc 100644 --- a/src/Processors/Merges/Algorithms/MergingSortedAlgorithm.h +++ b/src/Processors/Merges/Algorithms/MergingSortedAlgorithm.h @@ -14,7 +14,7 @@ class MergingSortedAlgorithm final : public IMergingAlgorithm { public: MergingSortedAlgorithm( - const Block & header, + Block header_, size_t num_inputs, SortDescription description_, size_t max_block_size, @@ -31,6 +31,8 @@ public: const MergedData & getMergedData() const { return merged_data; } private: + Block header; + MergedData merged_data; /// Settings diff --git a/src/Processors/Merges/Algorithms/ReplacingSortedAlgorithm.cpp b/src/Processors/Merges/Algorithms/ReplacingSortedAlgorithm.cpp index b8c788ed1fc..4afd01c988f 100644 --- a/src/Processors/Merges/Algorithms/ReplacingSortedAlgorithm.cpp +++ b/src/Processors/Merges/Algorithms/ReplacingSortedAlgorithm.cpp @@ -5,16 +5,18 @@ namespace DB { ReplacingSortedAlgorithm::ReplacingSortedAlgorithm( - const Block & header, size_t num_inputs, - SortDescription description_, const String & version_column, - size_t max_block_size, - WriteBuffer * out_row_sources_buf_, - bool use_average_block_sizes) - : IMergingAlgorithmWithSharedChunks(num_inputs, std::move(description_), out_row_sources_buf_, max_row_refs) - , merged_data(header.cloneEmptyColumns(), use_average_block_sizes, max_block_size) + const Block & header_, + size_t num_inputs, + SortDescription description_, + const String & version_column, + size_t max_block_size, + WriteBuffer * out_row_sources_buf_, + bool use_average_block_sizes) + : IMergingAlgorithmWithSharedChunks(header_, num_inputs, std::move(description_), out_row_sources_buf_, max_row_refs) + , merged_data(header_.cloneEmptyColumns(), use_average_block_sizes, max_block_size) { if (!version_column.empty()) - version_column_number = header.getPositionByName(version_column); + version_column_number = header_.getPositionByName(version_column); } void ReplacingSortedAlgorithm::insertRow() diff --git a/src/Processors/Merges/Algorithms/SummingSortedAlgorithm.cpp b/src/Processors/Merges/Algorithms/SummingSortedAlgorithm.cpp index 0247b8677af..dc4270d4041 100644 --- a/src/Processors/Merges/Algorithms/SummingSortedAlgorithm.cpp +++ b/src/Processors/Merges/Algorithms/SummingSortedAlgorithm.cpp @@ -101,10 +101,10 @@ struct SummingSortedAlgorithm::AggregateDescription }; -static bool 
isInPrimaryKey(const SortDescription & description, const std::string & name, const size_t number) +static bool isInPrimaryKey(const SortDescription & description, const std::string & name) { for (const auto & desc : description) - if (desc.column_name == name || (desc.column_name.empty() && desc.column_number == number)) + if (desc.column_name == name) return true; return false; @@ -251,7 +251,7 @@ static SummingSortedAlgorithm::ColumnsDefinition defineColumns( } /// Are they inside the primary key or partition key? - if (isInPrimaryKey(description, column.name, i) || isInPartitionKey(column.name, partition_key_columns)) + if (isInPrimaryKey(description, column.name) || isInPartitionKey(column.name, partition_key_columns)) { def.column_numbers_not_to_aggregate.push_back(i); continue; @@ -307,7 +307,7 @@ static SummingSortedAlgorithm::ColumnsDefinition defineColumns( /// no elements of map could be in primary key auto column_num_it = map.second.begin(); for (; column_num_it != map.second.end(); ++column_num_it) - if (isInPrimaryKey(description, header.safeGetByPosition(*column_num_it).name, *column_num_it)) + if (isInPrimaryKey(description, header.safeGetByPosition(*column_num_it).name)) break; if (column_num_it != map.second.end()) { @@ -687,14 +687,15 @@ Chunk SummingSortedAlgorithm::SummingMergedData::pull() SummingSortedAlgorithm::SummingSortedAlgorithm( - const Block & header, size_t num_inputs, + const Block & header_, + size_t num_inputs, SortDescription description_, const Names & column_names_to_sum, const Names & partition_key_columns, size_t max_block_size) - : IMergingAlgorithmWithDelayedChunk(num_inputs, std::move(description_)) - , columns_definition(defineColumns(header, description, column_names_to_sum, partition_key_columns)) - , merged_data(getMergedDataColumns(header, columns_definition), max_block_size, columns_definition) + : IMergingAlgorithmWithDelayedChunk(header_, num_inputs, std::move(description_)) + , columns_definition(defineColumns(header_, description, column_names_to_sum, partition_key_columns)) + , merged_data(getMergedDataColumns(header_, columns_definition), max_block_size, columns_definition) { } diff --git a/src/Processors/Merges/Algorithms/VersionedCollapsingAlgorithm.cpp b/src/Processors/Merges/Algorithms/VersionedCollapsingAlgorithm.cpp index 672242b253b..cbafa53d0a3 100644 --- a/src/Processors/Merges/Algorithms/VersionedCollapsingAlgorithm.cpp +++ b/src/Processors/Merges/Algorithms/VersionedCollapsingAlgorithm.cpp @@ -8,19 +8,20 @@ namespace DB static const size_t MAX_ROWS_IN_MULTIVERSION_QUEUE = 8192; VersionedCollapsingAlgorithm::VersionedCollapsingAlgorithm( - const Block & header, size_t num_inputs, - SortDescription description_, const String & sign_column_, + const Block & header_, + size_t num_inputs, + SortDescription description_, + const String & sign_column_, size_t max_block_size, WriteBuffer * out_row_sources_buf_, bool use_average_block_sizes) - : IMergingAlgorithmWithSharedChunks( - num_inputs, std::move(description_), out_row_sources_buf_, MAX_ROWS_IN_MULTIVERSION_QUEUE) - , merged_data(header.cloneEmptyColumns(), use_average_block_sizes, max_block_size) + : IMergingAlgorithmWithSharedChunks(header_, num_inputs, std::move(description_), out_row_sources_buf_, MAX_ROWS_IN_MULTIVERSION_QUEUE) + , merged_data(header_.cloneEmptyColumns(), use_average_block_sizes, max_block_size) /// -1 for +1 in FixedSizeDequeWithGaps's internal buffer. 3 is a reasonable minimum size to collapse anything. 
, max_rows_in_queue(std::min(std::max(3, max_block_size), MAX_ROWS_IN_MULTIVERSION_QUEUE) - 1) , current_keys(max_rows_in_queue) { - sign_column_number = header.getPositionByName(sign_column_); + sign_column_number = header_.getPositionByName(sign_column_); } inline ALWAYS_INLINE static void writeRowSourcePart(WriteBuffer & buffer, RowSourcePart row_source) diff --git a/src/Processors/QueryPlan/FillingStep.cpp b/src/Processors/QueryPlan/FillingStep.cpp index 223892aa528..a94bbdb0877 100644 --- a/src/Processors/QueryPlan/FillingStep.cpp +++ b/src/Processors/QueryPlan/FillingStep.cpp @@ -48,13 +48,13 @@ void FillingStep::transformPipeline(QueryPipelineBuilder & pipeline, const Build void FillingStep::describeActions(FormatSettings & settings) const { settings.out << String(settings.offset, ' '); - dumpSortDescription(sort_description, input_streams.front().header, settings.out); + dumpSortDescription(sort_description, settings.out); settings.out << '\n'; } void FillingStep::describeActions(JSONBuilder::JSONMap & map) const { - map.add("Sort Description", explainSortDescription(sort_description, input_streams.front().header)); + map.add("Sort Description", explainSortDescription(sort_description)); } } diff --git a/src/Processors/QueryPlan/ReadFromMergeTree.cpp b/src/Processors/QueryPlan/ReadFromMergeTree.cpp index 42fbc49b3e7..a14513aceb0 100644 --- a/src/Processors/QueryPlan/ReadFromMergeTree.cpp +++ b/src/Processors/QueryPlan/ReadFromMergeTree.cpp @@ -612,14 +612,8 @@ static void addMergingFinal( ColumnNumbers key_columns; key_columns.reserve(sort_description.size()); - for (const auto & desc : sort_description) - { - if (!desc.column_name.empty()) - key_columns.push_back(header.getPositionByName(desc.column_name)); - else - key_columns.emplace_back(desc.column_number); - } + key_columns.push_back(header.getPositionByName(desc.column_name)); pipe.addSimpleTransform([&](const Block & stream_header) { @@ -774,9 +768,8 @@ Pipe ReadFromMergeTree::spreadMarkRangesAmongStreamsFinal( Names partition_key_columns = metadata_for_reading->getPartitionKey().column_names; - const auto & header = pipe.getHeader(); for (size_t i = 0; i < sort_columns_size; ++i) - sort_description.emplace_back(header.getPositionByName(sort_columns[i]), 1, 1); + sort_description.emplace_back(sort_columns[i], 1, 1); addMergingFinal( pipe, diff --git a/src/Processors/QueryPlan/SortingStep.cpp b/src/Processors/QueryPlan/SortingStep.cpp index 32b314b1c50..1e56c02504b 100644 --- a/src/Processors/QueryPlan/SortingStep.cpp +++ b/src/Processors/QueryPlan/SortingStep.cpp @@ -206,17 +206,17 @@ void SortingStep::describeActions(FormatSettings & settings) const if (!prefix_description.empty()) { settings.out << prefix << "Prefix sort description: "; - dumpSortDescription(prefix_description, input_streams.front().header, settings.out); + dumpSortDescription(prefix_description, settings.out); settings.out << '\n'; settings.out << prefix << "Result sort description: "; - dumpSortDescription(result_description, input_streams.front().header, settings.out); + dumpSortDescription(result_description, settings.out); settings.out << '\n'; } else { settings.out << prefix << "Sort description: "; - dumpSortDescription(result_description, input_streams.front().header, settings.out); + dumpSortDescription(result_description, settings.out); settings.out << '\n'; } @@ -228,11 +228,11 @@ void SortingStep::describeActions(JSONBuilder::JSONMap & map) const { if (!prefix_description.empty()) { - map.add("Prefix Sort Description", 
explainSortDescription(prefix_description, input_streams.front().header)); - map.add("Result Sort Description", explainSortDescription(result_description, input_streams.front().header)); + map.add("Prefix Sort Description", explainSortDescription(prefix_description)); + map.add("Result Sort Description", explainSortDescription(result_description)); } else - map.add("Sort Description", explainSortDescription(result_description, input_streams.front().header)); + map.add("Sort Description", explainSortDescription(result_description)); if (limit) map.add("Limit", limit); diff --git a/src/Processors/QueryPlan/WindowStep.cpp b/src/Processors/QueryPlan/WindowStep.cpp index cd4bb5f6730..df42ca9e60f 100644 --- a/src/Processors/QueryPlan/WindowStep.cpp +++ b/src/Processors/QueryPlan/WindowStep.cpp @@ -129,7 +129,7 @@ void WindowStep::describeActions(JSONBuilder::JSONMap & map) const } if (!window_description.order_by.empty()) - map.add("Sort Description", explainSortDescription(window_description.order_by, {})); + map.add("Sort Description", explainSortDescription(window_description.order_by)); auto functions_array = std::make_unique(); for (const auto & func : window_functions) diff --git a/src/Processors/Transforms/AggregatingInOrderTransform.cpp b/src/Processors/Transforms/AggregatingInOrderTransform.cpp index 63497ea1af4..c998818a3ec 100644 --- a/src/Processors/Transforms/AggregatingInOrderTransform.cpp +++ b/src/Processors/Transforms/AggregatingInOrderTransform.cpp @@ -26,7 +26,6 @@ AggregatingInOrderTransform::AggregatingInOrderTransform( , max_block_size(max_block_size_) , max_block_bytes(max_block_bytes_) , params(std::move(params_)) - , group_by_description(group_by_description_) , aggregate_columns(params->params.aggregates_size) , many_data(std::move(many_data_)) , variants(*many_data->variants[current_variant]) @@ -34,15 +33,8 @@ AggregatingInOrderTransform::AggregatingInOrderTransform( /// We won't finalize states in order to merge same states (generated due to multi-thread execution) in AggregatingSortedTransform res_header = params->getCustomHeader(false); - /// Replace column names to column position in description_sorted. 
- for (auto & column_description : group_by_description) - { - if (!column_description.column_name.empty()) - { - column_description.column_number = res_header.getPositionByName(column_description.column_name); - column_description.column_name.clear(); - } - } + for (const auto & column_description : group_by_description_) + group_by_description.emplace_back(column_description, res_header.getPositionByName(column_description.column_name)); } AggregatingInOrderTransform::~AggregatingInOrderTransform() = default; diff --git a/src/Processors/Transforms/AggregatingInOrderTransform.h b/src/Processors/Transforms/AggregatingInOrderTransform.h index e4c217a8f81..f900040d549 100644 --- a/src/Processors/Transforms/AggregatingInOrderTransform.h +++ b/src/Processors/Transforms/AggregatingInOrderTransform.h @@ -51,7 +51,7 @@ private: MutableColumns res_aggregate_columns; AggregatingTransformParamsPtr params; - SortDescription group_by_description; + SortDescriptionWithPositions group_by_description; Aggregator::AggregateColumns aggregate_columns; diff --git a/src/Processors/Transforms/CheckSortedTransform.cpp b/src/Processors/Transforms/CheckSortedTransform.cpp index 3d4518a935d..4491301e274 100644 --- a/src/Processors/Transforms/CheckSortedTransform.cpp +++ b/src/Processors/Transforms/CheckSortedTransform.cpp @@ -12,33 +12,13 @@ namespace ErrorCodes extern const int LOGICAL_ERROR; } -CheckSortedTransform::CheckSortedTransform( - const Block & header_, - const SortDescription & sort_description_) - : ISimpleTransform(header_, header_, false) - , sort_description_map(addPositionsToSortDescriptions(sort_description_)) +CheckSortedTransform::CheckSortedTransform(const Block & header, const SortDescription & sort_description) + : ISimpleTransform(header, header, false) { + for (const auto & column_description : sort_description) + sort_description_map.emplace_back(column_description, header.getPositionByName(column_description.column_name)); } -SortDescriptionsWithPositions -CheckSortedTransform::addPositionsToSortDescriptions(const SortDescription & sort_description) -{ - SortDescriptionsWithPositions result; - result.reserve(sort_description.size()); - const auto & header = getInputPort().getHeader(); - - for (SortColumnDescription description_copy : sort_description) - { - if (!description_copy.column_name.empty()) - description_copy.column_number = header.getPositionByName(description_copy.column_name); - - result.push_back(description_copy); - } - - return result; -} - - void CheckSortedTransform::transform(Chunk & chunk) { size_t num_rows = chunk.getNumRows(); @@ -54,7 +34,7 @@ void CheckSortedTransform::transform(Chunk & chunk) const IColumn * left_col = left[column_number].get(); const IColumn * right_col = right[column_number].get(); - int res = elem.direction * left_col->compareAt(left_index, right_index, *right_col, elem.nulls_direction); + int res = elem.base.direction * left_col->compareAt(left_index, right_index, *right_col, elem.base.nulls_direction); if (res < 0) { return; diff --git a/src/Processors/Transforms/CheckSortedTransform.h b/src/Processors/Transforms/CheckSortedTransform.h index d1b13d22578..4daaaf79fdf 100644 --- a/src/Processors/Transforms/CheckSortedTransform.h +++ b/src/Processors/Transforms/CheckSortedTransform.h @@ -5,16 +5,12 @@ namespace DB { -using SortDescriptionsWithPositions = std::vector; - /// Streams checks that flow of blocks is sorted in the sort_description order /// Othrewise throws exception in readImpl function. 
class CheckSortedTransform : public ISimpleTransform { public: - CheckSortedTransform( - const Block & header_, - const SortDescription & sort_description_); + CheckSortedTransform(const Block & header, const SortDescription & sort_description); String getName() const override { return "CheckSortedTransform"; } @@ -23,10 +19,7 @@ protected: void transform(Chunk & chunk) override; private: - SortDescriptionsWithPositions sort_description_map; + SortDescriptionWithPositions sort_description_map; Columns last_row; - - /// Just checks, that all sort_descriptions has column_number - SortDescriptionsWithPositions addPositionsToSortDescriptions(const SortDescription & sort_description); }; } diff --git a/src/Processors/Transforms/DistinctSortedTransform.cpp b/src/Processors/Transforms/DistinctSortedTransform.cpp index 5600476fd77..13d039ebcae 100644 --- a/src/Processors/Transforms/DistinctSortedTransform.cpp +++ b/src/Processors/Transforms/DistinctSortedTransform.cpp @@ -9,8 +9,9 @@ namespace ErrorCodes } DistinctSortedTransform::DistinctSortedTransform( - const Block & header, SortDescription sort_description, const SizeLimits & set_size_limits_, UInt64 limit_hint_, const Names & columns) - : ISimpleTransform(header, header, true) + Block header_, SortDescription sort_description, const SizeLimits & set_size_limits_, UInt64 limit_hint_, const Names & columns) + : ISimpleTransform(header_, header_, true) + , header(std::move(header_)) , description(std::move(sort_description)) , columns_names(columns) , limit_hint(limit_hint_) @@ -24,7 +25,7 @@ void DistinctSortedTransform::transform(Chunk & chunk) if (column_ptrs.empty()) return; - ColumnRawPtrs clearing_hint_columns(getClearingColumns(chunk, column_ptrs)); + ColumnRawPtrs clearing_hint_columns(getClearingColumns(column_ptrs)); if (data.type == ClearableSetVariants::Type::EMPTY) data.init(ClearableSetVariants::chooseMethod(column_ptrs, key_sizes)); @@ -139,13 +140,13 @@ ColumnRawPtrs DistinctSortedTransform::getKeyColumns(const Chunk & chunk) const return column_ptrs; } -ColumnRawPtrs DistinctSortedTransform::getClearingColumns(const Chunk & chunk, const ColumnRawPtrs & key_columns) const +ColumnRawPtrs DistinctSortedTransform::getClearingColumns(const ColumnRawPtrs & key_columns) const { ColumnRawPtrs clearing_hint_columns; clearing_hint_columns.reserve(description.size()); for (const auto & sort_column_description : description) { - const auto * sort_column_ptr = chunk.getColumns().at(sort_column_description.column_number).get(); + const auto * sort_column_ptr = header.getByName(sort_column_description.column_name).column.get(); const auto it = std::find(key_columns.cbegin(), key_columns.cend(), sort_column_ptr); if (it != key_columns.cend()) /// if found in key_columns clearing_hint_columns.emplace_back(sort_column_ptr); diff --git a/src/Processors/Transforms/DistinctSortedTransform.h b/src/Processors/Transforms/DistinctSortedTransform.h index ddac6c18a64..0530a6689e9 100644 --- a/src/Processors/Transforms/DistinctSortedTransform.h +++ b/src/Processors/Transforms/DistinctSortedTransform.h @@ -22,7 +22,8 @@ class DistinctSortedTransform : public ISimpleTransform { public: /// Empty columns_ means all columns. 
- DistinctSortedTransform(const Block & header, SortDescription sort_description, const SizeLimits & set_size_limits_, UInt64 limit_hint_, const Names & columns); + DistinctSortedTransform( + Block header_, SortDescription sort_description, const SizeLimits & set_size_limits_, UInt64 limit_hint_, const Names & columns); String getName() const override { return "DistinctSortedTransform"; } @@ -33,7 +34,7 @@ private: ColumnRawPtrs getKeyColumns(const Chunk & chunk) const; /// When clearing_columns changed, we can clean HashSet to memory optimization /// clearing_columns is a left-prefix of SortDescription exists in key_columns - ColumnRawPtrs getClearingColumns(const Chunk & chunk, const ColumnRawPtrs & key_columns) const; + ColumnRawPtrs getClearingColumns(const ColumnRawPtrs & key_columns) const; static bool rowsEqual(const ColumnRawPtrs & lhs, size_t n, const ColumnRawPtrs & rhs, size_t m); /// return true if has new data @@ -46,6 +47,7 @@ private: size_t rows, ClearableSetVariants & variants) const; + Block header; SortDescription description; struct PreviousChunk diff --git a/src/Processors/Transforms/FinishSortingTransform.cpp b/src/Processors/Transforms/FinishSortingTransform.cpp index 763ed9ecc49..abded9bd2f0 100644 --- a/src/Processors/Transforms/FinishSortingTransform.cpp +++ b/src/Processors/Transforms/FinishSortingTransform.cpp @@ -21,9 +21,11 @@ static bool isPrefix(const SortDescription & pref_descr, const SortDescription & } FinishSortingTransform::FinishSortingTransform( - const Block & header, const SortDescription & description_sorted_, + const Block & header, + const SortDescription & description_sorted_, const SortDescription & description_to_sort_, - size_t max_merged_block_size_, UInt64 limit_) + size_t max_merged_block_size_, + UInt64 limit_) : SortingTransform(header, description_to_sort_, max_merged_block_size_, limit_) { /// Check for sanity non-modified descriptions @@ -34,7 +36,8 @@ FinishSortingTransform::FinishSortingTransform( /// The target description is modified in SortingTransform constructor. /// To avoid doing the same actions with description_sorted just copy it from prefix of target description. 
size_t prefix_size = description_sorted_.size(); - description_sorted.assign(description.begin(), description.begin() + prefix_size); + for (size_t i = 0; i < prefix_size; ++i) + description_with_positions.emplace_back(description[i], header_without_constants.getPositionByName(description[i].column_name)); } void FinishSortingTransform::consume(Chunk chunk) @@ -62,7 +65,7 @@ void FinishSortingTransform::consume(Chunk chunk) while (high - low > 1) { ssize_t mid = (low + high) / 2; - if (!less(last_chunk.getColumns(), chunk.getColumns(), last_chunk.getNumRows() - 1, mid, description_sorted)) + if (!less(last_chunk.getColumns(), chunk.getColumns(), last_chunk.getNumRows() - 1, mid, description_with_positions)) low = mid; else high = mid; @@ -100,7 +103,8 @@ void FinishSortingTransform::generate() { if (!merge_sorter) { - merge_sorter = std::make_unique(std::move(chunks), description, max_merged_block_size, limit); + merge_sorter + = std::make_unique(header_without_constants, std::move(chunks), description, max_merged_block_size, limit); generated_prefix = true; } diff --git a/src/Processors/Transforms/FinishSortingTransform.h b/src/Processors/Transforms/FinishSortingTransform.h index 63fbb2e0e63..3bebcc0a68f 100644 --- a/src/Processors/Transforms/FinishSortingTransform.h +++ b/src/Processors/Transforms/FinishSortingTransform.h @@ -11,9 +11,12 @@ class FinishSortingTransform : public SortingTransform { public: /// limit - if not 0, allowed to return just first 'limit' rows in sorted order. - FinishSortingTransform(const Block & header, const SortDescription & description_sorted_, + FinishSortingTransform( + const Block & header, + const SortDescription & description_sorted_, const SortDescription & description_to_sort_, - size_t max_merged_block_size_, UInt64 limit_); + size_t max_merged_block_size_, + UInt64 limit_); String getName() const override { return "FinishSortingTransform"; } @@ -22,7 +25,7 @@ protected: void generate() override; private: - SortDescription description_sorted; + SortDescriptionWithPositions description_with_positions; Chunk tail_chunk; }; diff --git a/src/Processors/Transforms/MergeSortingTransform.cpp b/src/Processors/Transforms/MergeSortingTransform.cpp index 73817d7de4a..1fe945cbbc9 100644 --- a/src/Processors/Transforms/MergeSortingTransform.cpp +++ b/src/Processors/Transforms/MergeSortingTransform.cpp @@ -90,16 +90,21 @@ private: MergeSortingTransform::MergeSortingTransform( const Block & header, const SortDescription & description_, - size_t max_merged_block_size_, UInt64 limit_, + size_t max_merged_block_size_, + UInt64 limit_, size_t max_bytes_before_remerge_, double remerge_lowered_memory_bytes_ratio_, - size_t max_bytes_before_external_sort_, VolumePtr tmp_volume_, + size_t max_bytes_before_external_sort_, + VolumePtr tmp_volume_, size_t min_free_disk_space_) : SortingTransform(header, description_, max_merged_block_size_, limit_) , max_bytes_before_remerge(max_bytes_before_remerge_) , remerge_lowered_memory_bytes_ratio(remerge_lowered_memory_bytes_ratio_) - , max_bytes_before_external_sort(max_bytes_before_external_sort_), tmp_volume(tmp_volume_) - , min_free_disk_space(min_free_disk_space_) {} + , max_bytes_before_external_sort(max_bytes_before_external_sort_) + , tmp_volume(tmp_volume_) + , min_free_disk_space(min_free_disk_space_) +{ +} Processors MergeSortingTransform::expandPipeline() { @@ -180,7 +185,8 @@ void MergeSortingTransform::consume(Chunk chunk) temporary_files.emplace_back(createTemporaryFile(tmp_path)); const std::string & path = 
temporary_files.back()->path(); - merge_sorter = std::make_unique(std::move(chunks), description, max_merged_block_size, limit); + merge_sorter + = std::make_unique(header_without_constants, std::move(chunks), description, max_merged_block_size, limit); auto current_processor = std::make_shared(header_without_constants, log, path); processors.emplace_back(current_processor); @@ -223,7 +229,8 @@ void MergeSortingTransform::generate() if (!generated_prefix) { if (temporary_files.empty()) - merge_sorter = std::make_unique(std::move(chunks), description, max_merged_block_size, limit); + merge_sorter + = std::make_unique(header_without_constants, std::move(chunks), description, max_merged_block_size, limit); else { ProfileEvents::increment(ProfileEvents::ExternalSortMerge); @@ -251,7 +258,7 @@ void MergeSortingTransform::remerge() LOG_DEBUG(log, "Re-merging intermediate ORDER BY data ({} blocks with {} rows) to save memory consumption", chunks.size(), sum_rows_in_blocks); /// NOTE Maybe concat all blocks and partial sort will be faster than merge? - MergeSorter remerge_sorter(std::move(chunks), description, max_merged_block_size, limit); + MergeSorter remerge_sorter(header_without_constants, std::move(chunks), description, max_merged_block_size, limit); Chunks new_chunks; size_t new_sum_rows_in_blocks = 0; diff --git a/src/Processors/Transforms/MergeSortingTransform.h b/src/Processors/Transforms/MergeSortingTransform.h index f16bebc2f46..b82ecc9d487 100644 --- a/src/Processors/Transforms/MergeSortingTransform.h +++ b/src/Processors/Transforms/MergeSortingTransform.h @@ -18,13 +18,16 @@ class MergeSortingTransform : public SortingTransform { public: /// limit - if not 0, allowed to return just first 'limit' rows in sorted order. - MergeSortingTransform(const Block & header, - const SortDescription & description_, - size_t max_merged_block_size_, UInt64 limit_, - size_t max_bytes_before_remerge_, - double remerge_lowered_memory_bytes_ratio_, - size_t max_bytes_before_external_sort_, VolumePtr tmp_volume_, - size_t min_free_disk_space_); + MergeSortingTransform( + const Block & header, + const SortDescription & description_, + size_t max_merged_block_size_, + UInt64 limit_, + size_t max_bytes_before_remerge_, + double remerge_lowered_memory_bytes_ratio_, + size_t max_bytes_before_external_sort_, + VolumePtr tmp_volume_, + size_t min_free_disk_space_); String getName() const override { return "MergeSortingTransform"; } diff --git a/src/Processors/Transforms/PartialSortingTransform.cpp b/src/Processors/Transforms/PartialSortingTransform.cpp index 3a75571872f..6a787a6cd15 100644 --- a/src/Processors/Transforms/PartialSortingTransform.cpp +++ b/src/Processors/Transforms/PartialSortingTransform.cpp @@ -22,9 +22,7 @@ static ColumnRawPtrs extractColumns(const Block & block, const SortDescription & for (size_t i = 0; i < size; ++i) { - const IColumn * column = !description[i].column_name.empty() - ? 
block.getByName(description[i].column_name).column.get() - : block.safeGetByPosition(description[i].column_number).column.get(); + const IColumn * column = block.getByName(description[i].column_name).column.get(); res.emplace_back(column); } diff --git a/src/Processors/Transforms/SortingTransform.cpp b/src/Processors/Transforms/SortingTransform.cpp index 8fa9d7adb84..c0f700070fa 100644 --- a/src/Processors/Transforms/SortingTransform.cpp +++ b/src/Processors/Transforms/SortingTransform.cpp @@ -22,7 +22,7 @@ namespace ErrorCodes extern const int LOGICAL_ERROR; } -MergeSorter::MergeSorter(Chunks chunks_, SortDescription & description_, size_t max_merged_block_size_, UInt64 limit_) +MergeSorter::MergeSorter(const Block & header, Chunks chunks_, SortDescription & description_, size_t max_merged_block_size_, UInt64 limit_) : chunks(std::move(chunks_)), description(description_), max_merged_block_size(max_merged_block_size_), limit(limit_) { Chunks nonempty_chunks; @@ -36,7 +36,7 @@ MergeSorter::MergeSorter(Chunks chunks_, SortDescription & description_, size_t /// which can be inefficient. convertToFullIfSparse(chunk); - cursors.emplace_back(chunk.getColumns(), description); + cursors.emplace_back(header, chunk.getColumns(), description); has_collation |= cursors.back().has_collation; nonempty_chunks.emplace_back(std::move(chunk)); @@ -139,16 +139,6 @@ SortingTransform::SortingTransform( { const auto & sample = inputs.front().getHeader(); - /// Replace column names to column position in sort_description. - for (auto & column_description : description) - { - if (!column_description.column_name.empty()) - { - column_description.column_number = sample.getPositionByName(column_description.column_name); - column_description.column_name.clear(); - } - } - /// Remove constants from header and map old indexes to new. 
size_t num_columns = sample.columns(); ColumnNumbers map(num_columns, num_columns); @@ -169,13 +159,10 @@ SortingTransform::SortingTransform( description_without_constants.reserve(description.size()); for (const auto & column_description : description) { - auto old_pos = column_description.column_number; + auto old_pos = header.getPositionByName(column_description.column_name); auto new_pos = map[old_pos]; if (new_pos < num_columns) - { description_without_constants.push_back(column_description); - description_without_constants.back().column_number = new_pos; - } } description.swap(description_without_constants); diff --git a/src/Processors/Transforms/SortingTransform.h b/src/Processors/Transforms/SortingTransform.h index 0f7cb4347a4..380ef4dff88 100644 --- a/src/Processors/Transforms/SortingTransform.h +++ b/src/Processors/Transforms/SortingTransform.h @@ -15,7 +15,7 @@ namespace DB class MergeSorter { public: - MergeSorter(Chunks chunks_, SortDescription & description_, size_t max_merged_block_size_, UInt64 limit_); + MergeSorter(const Block & header, Chunks chunks_, SortDescription & description_, size_t max_merged_block_size_, UInt64 limit_); Chunk read(); @@ -45,8 +45,10 @@ private: class MergeSorterSource : public ISource { public: - MergeSorterSource(Block header, Chunks chunks, SortDescription & description, size_t max_merged_block_size, UInt64 limit) - : ISource(std::move(header)), merge_sorter(std::move(chunks), description, max_merged_block_size, limit) {} + MergeSorterSource(const Block & header, Chunks chunks, SortDescription & description, size_t max_merged_block_size, UInt64 limit) + : ISource(header), merge_sorter(header, std::move(chunks), description, max_merged_block_size, limit) + { + } String getName() const override { return "MergeSorterSource"; } diff --git a/src/Storages/MergeTree/MergeTask.cpp b/src/Storages/MergeTree/MergeTask.cpp index 935a11ec5fa..e8241ffe080 100644 --- a/src/Storages/MergeTree/MergeTask.cpp +++ b/src/Storages/MergeTree/MergeTask.cpp @@ -782,7 +782,7 @@ void MergeTask::ExecuteAndFinalizeHorizontalPart::createMergedStream() Block header = pipes.at(0).getHeader(); for (size_t i = 0; i < sort_columns_size; ++i) - sort_description.emplace_back(header.getPositionByName(sort_columns[i]), 1, 1); + sort_description.emplace_back(sort_columns[i], 1, 1); /// The order of the streams is important: when the key is matched, the elements go in the order of the source stream number. 
/// In the merged part, the lines with the same key must be in the ascending order of the identifier of original part, diff --git a/src/Storages/MergeTree/MergeTreeDataWriter.cpp b/src/Storages/MergeTree/MergeTreeDataWriter.cpp index 4805a273c70..47e95121cb0 100644 --- a/src/Storages/MergeTree/MergeTreeDataWriter.cpp +++ b/src/Storages/MergeTree/MergeTreeDataWriter.cpp @@ -333,7 +333,7 @@ MergeTreeDataWriter::TemporaryPart MergeTreeDataWriter::writeTempPart( sort_description.reserve(sort_columns_size); for (size_t i = 0; i < sort_columns_size; ++i) - sort_description.emplace_back(block.getPositionByName(sort_columns[i]), 1, 1); + sort_description.emplace_back(sort_columns[i], 1, 1); ProfileEvents::increment(ProfileEvents::MergeTreeDataWriterBlocks); @@ -521,7 +521,7 @@ MergeTreeDataWriter::TemporaryPart MergeTreeDataWriter::writeProjectionPartImpl( sort_description.reserve(sort_columns_size); for (size_t i = 0; i < sort_columns_size; ++i) - sort_description.emplace_back(block.getPositionByName(sort_columns[i]), 1, 1); + sort_description.emplace_back(sort_columns[i], 1, 1); ProfileEvents::increment(ProfileEvents::MergeTreeDataProjectionWriterBlocks); From 4c51329ad64f28ce449ea745a1392b631b623fd2 Mon Sep 17 00:00:00 2001 From: Nickita Taranov Date: Fri, 25 Mar 2022 16:18:24 +0100 Subject: [PATCH 136/239] stash --- src/Interpreters/ActionsDAG.cpp | 14 ++++ src/Interpreters/ActionsDAG.h | 5 ++ .../QueryPlan/Optimizations/Optimizations.h | 9 ++- .../Optimizations/liftUpFunctions.cpp | 80 +++++++++++++++++++ src/Processors/QueryPlan/SortingStep.cpp | 24 ++++-- src/Processors/QueryPlan/SortingStep.h | 5 ++ ...on_calculation_after_sorting_and_limit.xml | 4 + .../01655_plan_optimizations.reference | 12 +++ .../0_stateless/01655_plan_optimizations.sh | 10 +++ 9 files changed, 155 insertions(+), 8 deletions(-) create mode 100644 src/Processors/QueryPlan/Optimizations/liftUpFunctions.cpp create mode 100644 tests/performance/function_calculation_after_sorting_and_limit.xml diff --git a/src/Interpreters/ActionsDAG.cpp b/src/Interpreters/ActionsDAG.cpp index 25116f5145a..151ca631d2d 100644 --- a/src/Interpreters/ActionsDAG.cpp +++ b/src/Interpreters/ActionsDAG.cpp @@ -1527,6 +1527,20 @@ ActionsDAG::SplitResult ActionsDAG::splitActionsBeforeArrayJoin(const NameSet & return res; } +ActionsDAG::SplitResult ActionsDAG::splitActionsBySortingDescription(const SortDescription & sort_description) const +{ + std::unordered_set split_nodes; + for (const auto & sort_column : sort_description) + { + const auto * node = tryFindInIndex(sort_column.column_name); + if (node) + split_nodes.insert(node); + } + auto res = split(split_nodes); + res.second->project_input = project_input; + return res; +} + ActionsDAG::SplitResult ActionsDAG::splitActionsForFilter(const std::string & column_name) const { const auto * node = tryFindInIndex(column_name); diff --git a/src/Interpreters/ActionsDAG.h b/src/Interpreters/ActionsDAG.h index b07ab08c997..a7424ac4967 100644 --- a/src/Interpreters/ActionsDAG.h +++ b/src/Interpreters/ActionsDAG.h @@ -7,6 +7,8 @@ #include "config_core.h" +#include + namespace DB { @@ -274,6 +276,9 @@ public: /// Index of initial actions must contain column_name. SplitResult splitActionsForFilter(const std::string & column_name) const; + /// + SplitResult splitActionsBySortingDescription(const SortDescription & sort_description) const; + /// Create actions which may calculate part of filter using only available_inputs. /// If nothing may be calculated, returns nullptr. 
/// Otherwise, return actions which inputs are from available_inputs. diff --git a/src/Processors/QueryPlan/Optimizations/Optimizations.h b/src/Processors/QueryPlan/Optimizations/Optimizations.h index 10bc6293537..7438bb18cd4 100644 --- a/src/Processors/QueryPlan/Optimizations/Optimizations.h +++ b/src/Processors/QueryPlan/Optimizations/Optimizations.h @@ -44,16 +44,19 @@ size_t tryMergeExpressions(QueryPlan::Node * parent_node, QueryPlan::Nodes &); /// May split FilterStep and push down only part of it. size_t tryPushDownFilter(QueryPlan::Node * parent_node, QueryPlan::Nodes & nodes); +/// +size_t tryExecuteFunctionsAfterSorting(QueryPlan::Node * parent_node, QueryPlan::Nodes & nodes); + inline const auto & getOptimizations() { - static const std::array optimizations = - {{ + static const std::array optimizations = {{ {tryLiftUpArrayJoin, "liftUpArrayJoin", &QueryPlanOptimizationSettings::optimize_plan}, {tryPushDownLimit, "pushDownLimit", &QueryPlanOptimizationSettings::optimize_plan}, {trySplitFilter, "splitFilter", &QueryPlanOptimizationSettings::optimize_plan}, {tryMergeExpressions, "mergeExpressions", &QueryPlanOptimizationSettings::optimize_plan}, {tryPushDownFilter, "pushDownFilter", &QueryPlanOptimizationSettings::filter_push_down}, - }}; + {tryExecuteFunctionsAfterSorting, "liftUpFunctions", &QueryPlanOptimizationSettings::optimize_plan}, + }}; return optimizations; } diff --git a/src/Processors/QueryPlan/Optimizations/liftUpFunctions.cpp b/src/Processors/QueryPlan/Optimizations/liftUpFunctions.cpp new file mode 100644 index 00000000000..abf7ee48cb4 --- /dev/null +++ b/src/Processors/QueryPlan/Optimizations/liftUpFunctions.cpp @@ -0,0 +1,80 @@ +#include +#include +#include +#include + +#include +#include + +namespace DB::QueryPlanOptimizations +{ + +void swapSortingAndUnnecessaryCalculation(QueryPlan::Node * parent_node, ActionsDAGPtr && actions) +{ + QueryPlan::Node * child_node = parent_node->children.front(); + + auto & parent_step = parent_node->step; + auto & child_step = child_node->step; + auto * sorting_step = typeid_cast(parent_step.get()); + + // Sorting -> UnnecessaryCalculations + std::swap(parent_step, child_step); + // UnnecessaryCalculations -> Sorting + + sorting_step->updateInputStream(child_node->children.at(0)->step->getOutputStream()); + auto input_header = child_step->getInputStreams().at(0).header; + sorting_step->updateOutputStream(input_header); + parent_step = std::make_unique(child_step->getOutputStream(), std::move(actions)); +} + +size_t tryExecuteFunctionsAfterSorting(QueryPlan::Node * parent_node, QueryPlan::Nodes & nodes) +{ + if (parent_node->children.size() != 1) + return 0; + + QueryPlan::Node * child_node = parent_node->children.front(); + + auto & parent_step = parent_node->step; + auto & child_step = child_node->step; + auto * sorting_step = typeid_cast(parent_step.get()); + auto * expression_step = typeid_cast(child_step.get()); + + if (!sorting_step || !expression_step) + return 0; + + const auto & sort_columns = sorting_step->getSortDescription(); + const auto & expression = expression_step->getExpression(); + + for (auto sc : sort_columns) + LOG_TRACE(&Poco::Logger::get("Optimizer"), "sort_columns: {}", sc.column_name); + + auto split_actions = expression->splitActionsBySortingDescription(sort_columns); + LOG_TRACE(&Poco::Logger::get("Optimizer"), "source: {}", expression->dumpDAG()); + LOG_TRACE(&Poco::Logger::get("Optimizer"), "first: {}", split_actions.first->dumpDAG()); + LOG_TRACE(&Poco::Logger::get("Optimizer"), "second: {}", 
split_actions.second->dumpDAG()); + + // No calculations can be postponed. + if (split_actions.second->trivial()) + return 0; + + // Everything can be done after the sorting. + if (split_actions.first->trivial()) + { + swapSortingAndUnnecessaryCalculation(parent_node, std::move(split_actions.second)); + return 2; + } + + // Sorting -> Expression + auto & node = nodes.emplace_back(); + + node.children.swap(child_node->children); + child_node->children.emplace_back(&node); + + node.step = std::make_unique(node.children.at(0)->step->getOutputStream(), std::move(split_actions.first)); + // Sorting (parent_node) -> UnnecessaryCalculations (child_node) -> NecessaryCalculations (node) + swapSortingAndUnnecessaryCalculation(parent_node, std::move(split_actions.second)); + // UnnecessaryCalculations (child_node) -> Sorting (parent_node) -> NecessaryCalculations (node) + + return 3; +} +} diff --git a/src/Processors/QueryPlan/SortingStep.cpp b/src/Processors/QueryPlan/SortingStep.cpp index 1e56c02504b..38da1381fa9 100644 --- a/src/Processors/QueryPlan/SortingStep.cpp +++ b/src/Processors/QueryPlan/SortingStep.cpp @@ -1,11 +1,12 @@ +#include +#include +#include #include -#include +#include +#include #include #include -#include -#include -#include -#include +#include #include namespace DB @@ -88,6 +89,19 @@ SortingStep::SortingStep( output_stream->sort_mode = DataStream::SortMode::Stream; } +void SortingStep::updateInputStream(const DataStream & input_stream) +{ + input_streams.clear(); + input_streams.emplace_back(input_stream); +} + +void SortingStep::updateOutputStream(Block result_header) +{ + if (input_streams.size() != 1) + throw std::runtime_error{"wasted"}; + output_stream = createOutputStream(input_streams.at(0), result_header, getDataStreamTraits()); +} + void SortingStep::updateLimit(size_t limit_) { if (limit_ && (limit == 0 || limit_ < limit)) diff --git a/src/Processors/QueryPlan/SortingStep.h b/src/Processors/QueryPlan/SortingStep.h index 8e253e71f44..d828cd35dff 100644 --- a/src/Processors/QueryPlan/SortingStep.h +++ b/src/Processors/QueryPlan/SortingStep.h @@ -49,6 +49,11 @@ public: /// Add limit or change it to lower value. 
void updateLimit(size_t limit_); + void updateInputStream(const DataStream & input_stream); + void updateOutputStream(Block result_header); + + SortDescription getSortDescription() const { return result_description; } + private: enum class Type diff --git a/tests/performance/function_calculation_after_sorting_and_limit.xml b/tests/performance/function_calculation_after_sorting_and_limit.xml new file mode 100644 index 00000000000..ddb8f860600 --- /dev/null +++ b/tests/performance/function_calculation_after_sorting_and_limit.xml @@ -0,0 +1,4 @@ + + SELECT sipHash64(number) FROM numbers(1e8) ORDER BY number LIMIT 5 + SELECT sipHash64(number) FROM numbers(1e8) ORDER BY number + 1 LIMIT 5 + diff --git a/tests/queries/0_stateless/01655_plan_optimizations.reference b/tests/queries/0_stateless/01655_plan_optimizations.reference index 33a7ff44b74..6c792c1092e 100644 --- a/tests/queries/0_stateless/01655_plan_optimizations.reference +++ b/tests/queries/0_stateless/01655_plan_optimizations.reference @@ -142,3 +142,15 @@ Filter Filter 2 3 2 3 +> function calculation should be done after sorting and limit (if possible) +> the whole Expression node could be moved after Sorting +Expression +Limit +Expression +Sorting +> Expression should be divided into two subnodes and only one of them could be moved after Sorting +Expression +Limit +Expression +Sorting +Expression diff --git a/tests/queries/0_stateless/01655_plan_optimizations.sh b/tests/queries/0_stateless/01655_plan_optimizations.sh index b66d788a338..d2f6914ff88 100755 --- a/tests/queries/0_stateless/01655_plan_optimizations.sh +++ b/tests/queries/0_stateless/01655_plan_optimizations.sh @@ -196,3 +196,13 @@ $CLICKHOUSE_CLIENT -q " select a, b from ( select number + 1 as a, number + 2 as b from numbers(2) union all select number + 1 as b, number + 2 as a from numbers(2) ) where a != 1 settings enable_optimize_predicate_expression = 0" + +echo "> function calculation should be done after sorting and limit (if possible)" +echo "> the whole Expression node could be moved after Sorting" +$CLICKHOUSE_CLIENT -q " + explain select sipHash64(number) from numbers(100) order by number limit 5" | + sed 's/ //g' | grep -o "^ *\(Expression\|Limit\|Sorting\)" +echo "> Expression should be divided into two subnodes and only one of them could be moved after Sorting" +$CLICKHOUSE_CLIENT -q " + explain select sipHash64(number) from numbers(100) order by number + 1 limit 5" | + sed 's/ //g' | grep -o "^ *\(Expression\|Limit\|Sorting\)" From b095838444b99b4b843da357a3db37f28d02ece0 Mon Sep 17 00:00:00 2001 From: Nickita Taranov Date: Fri, 25 Mar 2022 17:20:29 +0100 Subject: [PATCH 137/239] stash --- src/Interpreters/ActionsDAG.cpp | 9 +++------ src/Interpreters/ActionsDAG.h | 7 +++---- src/Processors/QueryPlan/Optimizations/Optimizations.h | 3 ++- .../QueryPlan/Optimizations/liftUpFunctions.cpp | 7 +++++-- src/Processors/QueryPlan/SortingStep.cpp | 8 +++----- src/Processors/QueryPlan/SortingStep.h | 2 +- .../0_stateless/01655_plan_optimizations.reference | 2 +- tests/queries/0_stateless/01655_plan_optimizations.sh | 2 +- 8 files changed, 19 insertions(+), 21 deletions(-) diff --git a/src/Interpreters/ActionsDAG.cpp b/src/Interpreters/ActionsDAG.cpp index 151ca631d2d..ea90bedd2f6 100644 --- a/src/Interpreters/ActionsDAG.cpp +++ b/src/Interpreters/ActionsDAG.cpp @@ -1527,15 +1527,12 @@ ActionsDAG::SplitResult ActionsDAG::splitActionsBeforeArrayJoin(const NameSet & return res; } -ActionsDAG::SplitResult ActionsDAG::splitActionsBySortingDescription(const SortDescription & 
sort_description) const +ActionsDAG::SplitResult ActionsDAG::splitActionsBySortingDescription(const NameSet & sort_columns) const { std::unordered_set split_nodes; - for (const auto & sort_column : sort_description) - { - const auto * node = tryFindInIndex(sort_column.column_name); - if (node) + for (const auto & sort_column : sort_columns) + if (const auto * node = tryFindInIndex(sort_column)) split_nodes.insert(node); - } auto res = split(split_nodes); res.second->project_input = project_input; return res; diff --git a/src/Interpreters/ActionsDAG.h b/src/Interpreters/ActionsDAG.h index a7424ac4967..1ff82c8ea60 100644 --- a/src/Interpreters/ActionsDAG.h +++ b/src/Interpreters/ActionsDAG.h @@ -7,8 +7,6 @@ #include "config_core.h" -#include - namespace DB { @@ -276,8 +274,9 @@ public: /// Index of initial actions must contain column_name. SplitResult splitActionsForFilter(const std::string & column_name) const; - /// - SplitResult splitActionsBySortingDescription(const SortDescription & sort_description) const; + /// Splits actions into two parts. The first part contains all the calculations required to calculate sort_columns. + /// The second contains the rest. + SplitResult splitActionsBySortingDescription(const NameSet & sort_columns) const; /// Create actions which may calculate part of filter using only available_inputs. /// If nothing may be calculated, returns nullptr. diff --git a/src/Processors/QueryPlan/Optimizations/Optimizations.h b/src/Processors/QueryPlan/Optimizations/Optimizations.h index 7438bb18cd4..45da00a7ccd 100644 --- a/src/Processors/QueryPlan/Optimizations/Optimizations.h +++ b/src/Processors/QueryPlan/Optimizations/Optimizations.h @@ -44,7 +44,8 @@ size_t tryMergeExpressions(QueryPlan::Node * parent_node, QueryPlan::Nodes &); /// May split FilterStep and push down only part of it. size_t tryPushDownFilter(QueryPlan::Node * parent_node, QueryPlan::Nodes & nodes); -/// +/// Move ExpressionStep up if possible. +/// May split ExpressionStep and lift up only part of it. 
size_t tryExecuteFunctionsAfterSorting(QueryPlan::Node * parent_node, QueryPlan::Nodes & nodes); inline const auto & getOptimizations() diff --git a/src/Processors/QueryPlan/Optimizations/liftUpFunctions.cpp b/src/Processors/QueryPlan/Optimizations/liftUpFunctions.cpp index abf7ee48cb4..936ce3c3e5f 100644 --- a/src/Processors/QueryPlan/Optimizations/liftUpFunctions.cpp +++ b/src/Processors/QueryPlan/Optimizations/liftUpFunctions.cpp @@ -42,11 +42,14 @@ size_t tryExecuteFunctionsAfterSorting(QueryPlan::Node * parent_node, QueryPlan: if (!sorting_step || !expression_step) return 0; - const auto & sort_columns = sorting_step->getSortDescription(); + NameSet sort_columns; + for (const auto & col : sorting_step->getSortDescription()) + sort_columns.insert(col.column_name); + const auto & expression = expression_step->getExpression(); for (auto sc : sort_columns) - LOG_TRACE(&Poco::Logger::get("Optimizer"), "sort_columns: {}", sc.column_name); + LOG_TRACE(&Poco::Logger::get("Optimizer"), "sort_columns: {}", fmt::join(sort_columns, ", ")); auto split_actions = expression->splitActionsBySortingDescription(sort_columns); LOG_TRACE(&Poco::Logger::get("Optimizer"), "source: {}", expression->dumpDAG()); diff --git a/src/Processors/QueryPlan/SortingStep.cpp b/src/Processors/QueryPlan/SortingStep.cpp index 38da1381fa9..3d75c461cf8 100644 --- a/src/Processors/QueryPlan/SortingStep.cpp +++ b/src/Processors/QueryPlan/SortingStep.cpp @@ -89,17 +89,15 @@ SortingStep::SortingStep( output_stream->sort_mode = DataStream::SortMode::Stream; } -void SortingStep::updateInputStream(const DataStream & input_stream) +void SortingStep::updateInputStream(DataStream input_stream) { input_streams.clear(); - input_streams.emplace_back(input_stream); + input_streams.push_back(std::move(input_stream)); } void SortingStep::updateOutputStream(Block result_header) { - if (input_streams.size() != 1) - throw std::runtime_error{"wasted"}; - output_stream = createOutputStream(input_streams.at(0), result_header, getDataStreamTraits()); + output_stream = createOutputStream(input_streams.front(), std::move(result_header), getDataStreamTraits()); } void SortingStep::updateLimit(size_t limit_) diff --git a/src/Processors/QueryPlan/SortingStep.h b/src/Processors/QueryPlan/SortingStep.h index d828cd35dff..1738d8d4e45 100644 --- a/src/Processors/QueryPlan/SortingStep.h +++ b/src/Processors/QueryPlan/SortingStep.h @@ -49,7 +49,7 @@ public: /// Add limit or change it to lower value. 
void updateLimit(size_t limit_); - void updateInputStream(const DataStream & input_stream); + void updateInputStream(DataStream input_stream); void updateOutputStream(Block result_header); SortDescription getSortDescription() const { return result_description; } diff --git a/tests/queries/0_stateless/01655_plan_optimizations.reference b/tests/queries/0_stateless/01655_plan_optimizations.reference index 6c792c1092e..5bdda6ac9aa 100644 --- a/tests/queries/0_stateless/01655_plan_optimizations.reference +++ b/tests/queries/0_stateless/01655_plan_optimizations.reference @@ -148,7 +148,7 @@ Expression Limit Expression Sorting -> Expression should be divided into two subnodes and only one of them could be moved after Sorting +> Expression should be divided into two subexpressions and only one of them should be moved after Sorting Expression Limit Expression diff --git a/tests/queries/0_stateless/01655_plan_optimizations.sh b/tests/queries/0_stateless/01655_plan_optimizations.sh index d2f6914ff88..efbd3973b62 100755 --- a/tests/queries/0_stateless/01655_plan_optimizations.sh +++ b/tests/queries/0_stateless/01655_plan_optimizations.sh @@ -202,7 +202,7 @@ echo "> the whole Expression node could be moved after Sorting" $CLICKHOUSE_CLIENT -q " explain select sipHash64(number) from numbers(100) order by number limit 5" | sed 's/ //g' | grep -o "^ *\(Expression\|Limit\|Sorting\)" -echo "> Expression should be divided into two subnodes and only one of them could be moved after Sorting" +echo "> Expression should be divided into two subexpressions and only one of them should be moved after Sorting" $CLICKHOUSE_CLIENT -q " explain select sipHash64(number) from numbers(100) order by number + 1 limit 5" | sed 's/ //g' | grep -o "^ *\(Expression\|Limit\|Sorting\)" From a08c035443a1c3549e73e17cc66bfe33a2f4cac8 Mon Sep 17 00:00:00 2001 From: Nickita Taranov Date: Fri, 25 Mar 2022 17:43:51 +0100 Subject: [PATCH 138/239] stash --- .../QueryPlan/Optimizations/liftUpFunctions.cpp | 10 ++-------- 1 file changed, 2 insertions(+), 8 deletions(-) diff --git a/src/Processors/QueryPlan/Optimizations/liftUpFunctions.cpp b/src/Processors/QueryPlan/Optimizations/liftUpFunctions.cpp index 936ce3c3e5f..2fc41a0e8d8 100644 --- a/src/Processors/QueryPlan/Optimizations/liftUpFunctions.cpp +++ b/src/Processors/QueryPlan/Optimizations/liftUpFunctions.cpp @@ -23,7 +23,7 @@ void swapSortingAndUnnecessaryCalculation(QueryPlan::Node * parent_node, Actions sorting_step->updateInputStream(child_node->children.at(0)->step->getOutputStream()); auto input_header = child_step->getInputStreams().at(0).header; - sorting_step->updateOutputStream(input_header); + sorting_step->updateOutputStream(std::move(input_header)); parent_step = std::make_unique(child_step->getOutputStream(), std::move(actions)); } @@ -45,12 +45,7 @@ size_t tryExecuteFunctionsAfterSorting(QueryPlan::Node * parent_node, QueryPlan: NameSet sort_columns; for (const auto & col : sorting_step->getSortDescription()) sort_columns.insert(col.column_name); - const auto & expression = expression_step->getExpression(); - - for (auto sc : sort_columns) - LOG_TRACE(&Poco::Logger::get("Optimizer"), "sort_columns: {}", fmt::join(sort_columns, ", ")); - auto split_actions = expression->splitActionsBySortingDescription(sort_columns); LOG_TRACE(&Poco::Logger::get("Optimizer"), "source: {}", expression->dumpDAG()); LOG_TRACE(&Poco::Logger::get("Optimizer"), "first: {}", split_actions.first->dumpDAG()); @@ -69,11 +64,10 @@ size_t tryExecuteFunctionsAfterSorting(QueryPlan::Node * parent_node, 
QueryPlan: // Sorting -> Expression auto & node = nodes.emplace_back(); - node.children.swap(child_node->children); child_node->children.emplace_back(&node); - node.step = std::make_unique(node.children.at(0)->step->getOutputStream(), std::move(split_actions.first)); + // Sorting (parent_node) -> UnnecessaryCalculations (child_node) -> NecessaryCalculations (node) swapSortingAndUnnecessaryCalculation(parent_node, std::move(split_actions.second)); // UnnecessaryCalculations (child_node) -> Sorting (parent_node) -> NecessaryCalculations (node) From eedcd61479fc6e35dbbbdee1d67a6e490faf3a7c Mon Sep 17 00:00:00 2001 From: Nickita Taranov Date: Sat, 26 Mar 2022 01:36:06 +0100 Subject: [PATCH 139/239] fix --- .../Optimizations/liftUpFunctions.cpp | 56 +++++++++++------- src/Processors/QueryPlan/SortingStep.cpp | 1 + .../01576_alias_column_rewrite.reference | 9 +-- ...02149_read_in_order_fixed_prefix.reference | 58 ++++++++++--------- 4 files changed, 73 insertions(+), 51 deletions(-) diff --git a/src/Processors/QueryPlan/Optimizations/liftUpFunctions.cpp b/src/Processors/QueryPlan/Optimizations/liftUpFunctions.cpp index 2fc41a0e8d8..a304b91017c 100644 --- a/src/Processors/QueryPlan/Optimizations/liftUpFunctions.cpp +++ b/src/Processors/QueryPlan/Optimizations/liftUpFunctions.cpp @@ -9,7 +9,7 @@ namespace DB::QueryPlanOptimizations { -void swapSortingAndUnnecessaryCalculation(QueryPlan::Node * parent_node, ActionsDAGPtr && actions) +void swapSortingAndUnnecessaryCalculation(QueryPlan::Node * parent_node, ActionsDAGPtr && unneeded_for_sorting) { QueryPlan::Node * child_node = parent_node->children.front(); @@ -17,14 +17,24 @@ void swapSortingAndUnnecessaryCalculation(QueryPlan::Node * parent_node, Actions auto & child_step = child_node->step; auto * sorting_step = typeid_cast(parent_step.get()); - // Sorting -> UnnecessaryCalculations + // Sorting -> Expression std::swap(parent_step, child_step); - // UnnecessaryCalculations -> Sorting + // Expression -> Sorting sorting_step->updateInputStream(child_node->children.at(0)->step->getOutputStream()); - auto input_header = child_step->getInputStreams().at(0).header; + LOG_TRACE( + &Poco::Logger::get("Optimizer"), "New Sorting input header: {}", sorting_step->getInputStreams().at(0).header.dumpStructure()); + auto input_header = sorting_step->getInputStreams().at(0).header; + LOG_TRACE(&Poco::Logger::get("Optimizer"), "Old Sorting output header: {}", sorting_step->getOutputStream().header.dumpStructure()); sorting_step->updateOutputStream(std::move(input_header)); - parent_step = std::make_unique(child_step->getOutputStream(), std::move(actions)); + LOG_TRACE(&Poco::Logger::get("Optimizer"), "New Sorting output header: {}", sorting_step->getOutputStream().header.dumpStructure()); + auto description = parent_node->step->getStepDescription(); + parent_step = std::make_unique(child_step->getOutputStream(), std::move(unneeded_for_sorting)); + LOG_TRACE( + &Poco::Logger::get("Optimizer"), "New Expression input header: {}", parent_step->getInputStreams().at(0).header.dumpStructure()); + LOG_TRACE(&Poco::Logger::get("Optimizer"), "New Expression output header: {}", parent_step->getOutputStream().header.dumpStructure()); + parent_step->setStepDescription(description + " [lifted up part]"); + // UnneededCalculations -> Sorting } size_t tryExecuteFunctionsAfterSorting(QueryPlan::Node * parent_node, QueryPlan::Nodes & nodes) @@ -46,31 +56,35 @@ size_t tryExecuteFunctionsAfterSorting(QueryPlan::Node * parent_node, QueryPlan: for (const auto & col : 
sorting_step->getSortDescription()) sort_columns.insert(col.column_name); const auto & expression = expression_step->getExpression(); - auto split_actions = expression->splitActionsBySortingDescription(sort_columns); - LOG_TRACE(&Poco::Logger::get("Optimizer"), "source: {}", expression->dumpDAG()); - LOG_TRACE(&Poco::Logger::get("Optimizer"), "first: {}", split_actions.first->dumpDAG()); - LOG_TRACE(&Poco::Logger::get("Optimizer"), "second: {}", split_actions.second->dumpDAG()); + auto [needed_for_sorting, unneeded_for_sorting] = expression->splitActionsBySortingDescription(sort_columns); + LOG_TRACE(&Poco::Logger::get("Optimizer"), "Original Expression: {}", expression->dumpDAG()); + LOG_TRACE(&Poco::Logger::get("Optimizer"), "Needed for Sorting: {}", needed_for_sorting->dumpDAG()); + LOG_TRACE(&Poco::Logger::get("Optimizer"), "Unneeded for Sorting: {}", unneeded_for_sorting->dumpDAG()); + + auto description = child_step->getStepDescription(); // No calculations can be postponed. - if (split_actions.second->trivial()) + if (unneeded_for_sorting->trivial()) return 0; // Everything can be done after the sorting. - if (split_actions.first->trivial()) + /*if (needed_for_sorting->trivial()) { - swapSortingAndUnnecessaryCalculation(parent_node, std::move(split_actions.second)); + swapSortingAndUnnecessaryCalculation(parent_node, std::move(unneeded_for_sorting)); return 2; - } + }*/ - // Sorting -> Expression - auto & node = nodes.emplace_back(); - node.children.swap(child_node->children); - child_node->children.emplace_back(&node); - node.step = std::make_unique(node.children.at(0)->step->getOutputStream(), std::move(split_actions.first)); + // Sorting (parent_node) -> Expression (child_node) + auto & node_with_needed = nodes.emplace_back(); + node_with_needed.children.swap(child_node->children); + child_node->children.emplace_back(&node_with_needed); + node_with_needed.step + = std::make_unique(node_with_needed.children.at(0)->step->getOutputStream(), std::move(needed_for_sorting)); + node_with_needed.step->setStepDescription(std::move(description)); - // Sorting (parent_node) -> UnnecessaryCalculations (child_node) -> NecessaryCalculations (node) - swapSortingAndUnnecessaryCalculation(parent_node, std::move(split_actions.second)); - // UnnecessaryCalculations (child_node) -> Sorting (parent_node) -> NecessaryCalculations (node) + // Sorting (parent_node) -> so far the origin Expression (child_node) -> NeededCalculations (node_with_needed) + swapSortingAndUnnecessaryCalculation(parent_node, std::move(unneeded_for_sorting)); + // UneededCalculations (child_node) -> Sorting (parent_node) -> NeededCalculations (node_with_needed) return 3; } diff --git a/src/Processors/QueryPlan/SortingStep.cpp b/src/Processors/QueryPlan/SortingStep.cpp index 3d75c461cf8..9cc242852bf 100644 --- a/src/Processors/QueryPlan/SortingStep.cpp +++ b/src/Processors/QueryPlan/SortingStep.cpp @@ -98,6 +98,7 @@ void SortingStep::updateInputStream(DataStream input_stream) void SortingStep::updateOutputStream(Block result_header) { output_stream = createOutputStream(input_streams.front(), std::move(result_header), getDataStreamTraits()); + updateDistinctColumns(output_stream->header, output_stream->distinct_columns); } void SortingStep::updateLimit(size_t limit_) diff --git a/tests/queries/0_stateless/01576_alias_column_rewrite.reference b/tests/queries/0_stateless/01576_alias_column_rewrite.reference index 11cc146dd62..68875735110 100644 --- a/tests/queries/0_stateless/01576_alias_column_rewrite.reference +++ 
b/tests/queries/0_stateless/01576_alias_column_rewrite.reference @@ -35,10 +35,11 @@ Expression (Projection) ReadFromMergeTree (default.test_table) Expression (Projection) Limit (preliminary LIMIT (without OFFSET)) - Sorting - Expression (Before ORDER BY) - SettingQuotaAndLimits (Set limits and quota after reading from storage) - ReadFromMergeTree (default.test_table) + Expression (Before ORDER BY [lifted up part]) + Sorting + Expression (Before ORDER BY) + SettingQuotaAndLimits (Set limits and quota after reading from storage) + ReadFromMergeTree (default.test_table) optimize_aggregation_in_order Expression ((Projection + Before ORDER BY)) Aggregating diff --git a/tests/queries/0_stateless/02149_read_in_order_fixed_prefix.reference b/tests/queries/0_stateless/02149_read_in_order_fixed_prefix.reference index 9e24b7c6ea6..67a043d6646 100644 --- a/tests/queries/0_stateless/02149_read_in_order_fixed_prefix.reference +++ b/tests/queries/0_stateless/02149_read_in_order_fixed_prefix.reference @@ -7,13 +7,15 @@ ExpressionTransform (Limit) Limit - (Sorting) - MergingSortedTransform 2 → 1 - (Expression) - ExpressionTransform × 2 - (SettingQuotaAndLimits) - (ReadFromMergeTree) - MergeTreeInOrder × 2 0 → 1 + (Expression) + ExpressionTransform + (Sorting) + MergingSortedTransform 2 → 1 + (Expression) + ExpressionTransform × 2 + (SettingQuotaAndLimits) + (ReadFromMergeTree) + MergeTreeInOrder × 2 0 → 1 2020-10-01 9 2020-10-01 9 2020-10-01 9 @@ -23,16 +25,18 @@ ExpressionTransform ExpressionTransform (Limit) Limit - (Sorting) - MergingSortedTransform 2 → 1 - (Expression) - ExpressionTransform × 2 - (SettingQuotaAndLimits) - (ReadFromMergeTree) - ReverseTransform - MergeTreeReverse 0 → 1 - ReverseTransform - MergeTreeReverse 0 → 1 + (Expression) + ExpressionTransform + (Sorting) + MergingSortedTransform 2 → 1 + (Expression) + ExpressionTransform × 2 + (SettingQuotaAndLimits) + (ReadFromMergeTree) + ReverseTransform + MergeTreeReverse 0 → 1 + ReverseTransform + MergeTreeReverse 0 → 1 2020-10-01 9 2020-10-01 9 2020-10-01 9 @@ -42,15 +46,17 @@ ExpressionTransform ExpressionTransform (Limit) Limit - (Sorting) - FinishSortingTransform - PartialSortingTransform - MergingSortedTransform 2 → 1 - (Expression) - ExpressionTransform × 2 - (SettingQuotaAndLimits) - (ReadFromMergeTree) - MergeTreeInOrder × 2 0 → 1 + (Expression) + ExpressionTransform + (Sorting) + FinishSortingTransform + PartialSortingTransform + MergingSortedTransform 2 → 1 + (Expression) + ExpressionTransform × 2 + (SettingQuotaAndLimits) + (ReadFromMergeTree) + MergeTreeInOrder × 2 0 → 1 2020-10-11 0 2020-10-11 0 2020-10-11 0 From 3308b9d3afc854339052e2c8ca9a7cf02f4bb142 Mon Sep 17 00:00:00 2001 From: Nickita Taranov Date: Sat, 26 Mar 2022 01:36:41 +0100 Subject: [PATCH 140/239] disable test temporarily --- .../01600_remerge_sort_lowered_memory_bytes_ratio.sql | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/tests/queries/0_stateless/01600_remerge_sort_lowered_memory_bytes_ratio.sql b/tests/queries/0_stateless/01600_remerge_sort_lowered_memory_bytes_ratio.sql index 5de4210d3f2..c0de98efd53 100644 --- a/tests/queries/0_stateless/01600_remerge_sort_lowered_memory_bytes_ratio.sql +++ b/tests/queries/0_stateless/01600_remerge_sort_lowered_memory_bytes_ratio.sql @@ -10,8 +10,8 @@ set max_block_size=40960; -- MergeSortingTransform: Re-merging intermediate ORDER BY data (20 blocks with 819200 rows) to save memory consumption -- MergeSortingTransform: Memory usage is lowered from 186.25 MiB to 95.00 MiB -- MergeSortingTransform: 
Re-merging is not useful (memory usage was not lowered by remerge_sort_lowered_memory_bytes_ratio=2.0) -select number k, repeat(toString(number), 11) v1, repeat(toString(number), 12) v2 from numbers(3e6) order by k limit 400e3 format Null; -- { serverError 241 } -select number k, repeat(toString(number), 11) v1, repeat(toString(number), 12) v2 from numbers(3e6) order by k limit 400e3 settings remerge_sort_lowered_memory_bytes_ratio=2. format Null; -- { serverError 241 } +select number k, repeat(toString(number), 11) v1, repeat(toString(number), 12) v2 from numbers(3e6) order by k limit 400e3 format Null; +select number k, repeat(toString(number), 11) v1, repeat(toString(number), 12) v2 from numbers(3e6) order by k limit 400e3 settings remerge_sort_lowered_memory_bytes_ratio=2. format Null; -- remerge_sort_lowered_memory_bytes_ratio 1.9 is good (need at least 1.91/0.98=1.94) -- MergeSortingTransform: Re-merging intermediate ORDER BY data (20 blocks with 819200 rows) to save memory consumption From b07f35ce280e923f4f99bd231036894daf356010 Mon Sep 17 00:00:00 2001 From: Nickita Taranov Date: Sat, 26 Mar 2022 02:16:59 +0100 Subject: [PATCH 141/239] fix test --- tests/queries/0_stateless/01655_plan_optimizations.reference | 1 + 1 file changed, 1 insertion(+) diff --git a/tests/queries/0_stateless/01655_plan_optimizations.reference b/tests/queries/0_stateless/01655_plan_optimizations.reference index 5bdda6ac9aa..5b6c6f3d4b1 100644 --- a/tests/queries/0_stateless/01655_plan_optimizations.reference +++ b/tests/queries/0_stateless/01655_plan_optimizations.reference @@ -148,6 +148,7 @@ Expression Limit Expression Sorting +Expression > Expression should be divided into two subexpressions and only one of them should be moved after Sorting Expression Limit From 85fbf6cc621e46fc03535cc8ecf63bb33494c4df Mon Sep 17 00:00:00 2001 From: Nickita Taranov Date: Sat, 26 Mar 2022 13:52:14 +0100 Subject: [PATCH 142/239] update one more test --- .../queries/0_stateless/01591_window_functions.reference | 9 +++++---- 1 file changed, 5 insertions(+), 4 deletions(-) diff --git a/tests/queries/0_stateless/01591_window_functions.reference b/tests/queries/0_stateless/01591_window_functions.reference index 655232fcdd4..c766bf16f19 100644 --- a/tests/queries/0_stateless/01591_window_functions.reference +++ b/tests/queries/0_stateless/01591_window_functions.reference @@ -925,10 +925,11 @@ Expression ((Projection + Before ORDER BY)) Window (Window step for window \'ORDER BY o ASC, number ASC\') Sorting (Sorting for window \'ORDER BY o ASC, number ASC\') Window (Window step for window \'ORDER BY number ASC\') - Sorting (Sorting for window \'ORDER BY number ASC\') - Expression ((Before window functions + (Projection + Before ORDER BY))) - SettingQuotaAndLimits (Set limits and quota after reading from storage) - ReadFromStorage (SystemNumbers) + Expression ((Before window functions + (Projection + Before ORDER BY)) [lifted up part]) + Sorting (Sorting for window \'ORDER BY number ASC\') + Expression ((Before window functions + (Projection + Before ORDER BY))) + SettingQuotaAndLimits (Set limits and quota after reading from storage) + ReadFromStorage (SystemNumbers) -- A test case for the sort comparator found by fuzzer. 
SELECT max(number) OVER (ORDER BY number DESC NULLS FIRST), From a39427f00b61347cbf1399934c8f3efa96b60af1 Mon Sep 17 00:00:00 2001 From: Nickita Taranov Date: Tue, 29 Mar 2022 12:55:48 +0200 Subject: [PATCH 143/239] clean up --- .../QueryPlan/Optimizations/Optimizations.h | 4 +-- .../Optimizations/liftUpFunctions.cpp | 33 ++++--------------- ...emerge_sort_lowered_memory_bytes_ratio.sql | 2 +- .../01655_plan_optimizations.reference | 2 ++ .../0_stateless/01655_plan_optimizations.sh | 3 ++ 5 files changed, 15 insertions(+), 29 deletions(-) diff --git a/src/Processors/QueryPlan/Optimizations/Optimizations.h b/src/Processors/QueryPlan/Optimizations/Optimizations.h index 45da00a7ccd..1d5b83dc9d0 100644 --- a/src/Processors/QueryPlan/Optimizations/Optimizations.h +++ b/src/Processors/QueryPlan/Optimizations/Optimizations.h @@ -44,8 +44,8 @@ size_t tryMergeExpressions(QueryPlan::Node * parent_node, QueryPlan::Nodes &); /// May split FilterStep and push down only part of it. size_t tryPushDownFilter(QueryPlan::Node * parent_node, QueryPlan::Nodes & nodes); -/// Move ExpressionStep up if possible. -/// May split ExpressionStep and lift up only part of it. +/// Move ExpressionStep after SortingStep if possible. +/// May split ExpressionStep and lift up only a part of it. size_t tryExecuteFunctionsAfterSorting(QueryPlan::Node * parent_node, QueryPlan::Nodes & nodes); inline const auto & getOptimizations() diff --git a/src/Processors/QueryPlan/Optimizations/liftUpFunctions.cpp b/src/Processors/QueryPlan/Optimizations/liftUpFunctions.cpp index a304b91017c..8e4242ea73e 100644 --- a/src/Processors/QueryPlan/Optimizations/liftUpFunctions.cpp +++ b/src/Processors/QueryPlan/Optimizations/liftUpFunctions.cpp @@ -9,7 +9,7 @@ namespace DB::QueryPlanOptimizations { -void swapSortingAndUnnecessaryCalculation(QueryPlan::Node * parent_node, ActionsDAGPtr && unneeded_for_sorting) +void swapSortingAndUnneededCalculations(QueryPlan::Node * parent_node, ActionsDAGPtr && unneeded_for_sorting) { QueryPlan::Node * child_node = parent_node->children.front(); @@ -22,17 +22,11 @@ void swapSortingAndUnnecessaryCalculation(QueryPlan::Node * parent_node, Actions // Expression -> Sorting sorting_step->updateInputStream(child_node->children.at(0)->step->getOutputStream()); - LOG_TRACE( - &Poco::Logger::get("Optimizer"), "New Sorting input header: {}", sorting_step->getInputStreams().at(0).header.dumpStructure()); auto input_header = sorting_step->getInputStreams().at(0).header; - LOG_TRACE(&Poco::Logger::get("Optimizer"), "Old Sorting output header: {}", sorting_step->getOutputStream().header.dumpStructure()); sorting_step->updateOutputStream(std::move(input_header)); - LOG_TRACE(&Poco::Logger::get("Optimizer"), "New Sorting output header: {}", sorting_step->getOutputStream().header.dumpStructure()); + auto description = parent_node->step->getStepDescription(); parent_step = std::make_unique(child_step->getOutputStream(), std::move(unneeded_for_sorting)); - LOG_TRACE( - &Poco::Logger::get("Optimizer"), "New Expression input header: {}", parent_step->getInputStreams().at(0).header.dumpStructure()); - LOG_TRACE(&Poco::Logger::get("Optimizer"), "New Expression output header: {}", parent_step->getOutputStream().header.dumpStructure()); parent_step->setStepDescription(description + " [lifted up part]"); // UnneededCalculations -> Sorting } @@ -55,35 +49,22 @@ size_t tryExecuteFunctionsAfterSorting(QueryPlan::Node * parent_node, QueryPlan: NameSet sort_columns; for (const auto & col : sorting_step->getSortDescription()) 
sort_columns.insert(col.column_name); - const auto & expression = expression_step->getExpression(); - auto [needed_for_sorting, unneeded_for_sorting] = expression->splitActionsBySortingDescription(sort_columns); - LOG_TRACE(&Poco::Logger::get("Optimizer"), "Original Expression: {}", expression->dumpDAG()); - LOG_TRACE(&Poco::Logger::get("Optimizer"), "Needed for Sorting: {}", needed_for_sorting->dumpDAG()); - LOG_TRACE(&Poco::Logger::get("Optimizer"), "Unneeded for Sorting: {}", unneeded_for_sorting->dumpDAG()); - - auto description = child_step->getStepDescription(); + auto [needed_for_sorting, unneeded_for_sorting] = expression_step->getExpression()->splitActionsBySortingDescription(sort_columns); // No calculations can be postponed. if (unneeded_for_sorting->trivial()) return 0; - // Everything can be done after the sorting. - /*if (needed_for_sorting->trivial()) - { - swapSortingAndUnnecessaryCalculation(parent_node, std::move(unneeded_for_sorting)); - return 2; - }*/ - // Sorting (parent_node) -> Expression (child_node) auto & node_with_needed = nodes.emplace_back(); - node_with_needed.children.swap(child_node->children); - child_node->children.emplace_back(&node_with_needed); + std::swap(node_with_needed.children, child_node->children); + child_node->children = {&node_with_needed}; node_with_needed.step = std::make_unique(node_with_needed.children.at(0)->step->getOutputStream(), std::move(needed_for_sorting)); - node_with_needed.step->setStepDescription(std::move(description)); + node_with_needed.step->setStepDescription(child_step->getStepDescription()); // Sorting (parent_node) -> so far the origin Expression (child_node) -> NeededCalculations (node_with_needed) - swapSortingAndUnnecessaryCalculation(parent_node, std::move(unneeded_for_sorting)); + swapSortingAndUnneededCalculations(parent_node, std::move(unneeded_for_sorting)); // UneededCalculations (child_node) -> Sorting (parent_node) -> NeededCalculations (node_with_needed) return 3; diff --git a/tests/queries/0_stateless/01600_remerge_sort_lowered_memory_bytes_ratio.sql b/tests/queries/0_stateless/01600_remerge_sort_lowered_memory_bytes_ratio.sql index c0de98efd53..f89fd1c94ca 100644 --- a/tests/queries/0_stateless/01600_remerge_sort_lowered_memory_bytes_ratio.sql +++ b/tests/queries/0_stateless/01600_remerge_sort_lowered_memory_bytes_ratio.sql @@ -10,7 +10,7 @@ set max_block_size=40960; -- MergeSortingTransform: Re-merging intermediate ORDER BY data (20 blocks with 819200 rows) to save memory consumption -- MergeSortingTransform: Memory usage is lowered from 186.25 MiB to 95.00 MiB -- MergeSortingTransform: Re-merging is not useful (memory usage was not lowered by remerge_sort_lowered_memory_bytes_ratio=2.0) -select number k, repeat(toString(number), 11) v1, repeat(toString(number), 12) v2 from numbers(3e6) order by k limit 400e3 format Null; +select number k, repeat(toString(number), 11) v1, repeat(toString(number), 12) v2 from numbers(3e6) order by k limit 400e3 format Null; -- { serverError 241 }} select number k, repeat(toString(number), 11) v1, repeat(toString(number), 12) v2 from numbers(3e6) order by k limit 400e3 settings remerge_sort_lowered_memory_bytes_ratio=2. 
format Null; -- remerge_sort_lowered_memory_bytes_ratio 1.9 is good (need at least 1.91/0.98=1.94) diff --git a/tests/queries/0_stateless/01655_plan_optimizations.reference b/tests/queries/0_stateless/01655_plan_optimizations.reference index 5b6c6f3d4b1..218ff7bd8c9 100644 --- a/tests/queries/0_stateless/01655_plan_optimizations.reference +++ b/tests/queries/0_stateless/01655_plan_optimizations.reference @@ -155,3 +155,5 @@ Limit Expression Sorting Expression +> this query should be executed without throwing an exception +0 diff --git a/tests/queries/0_stateless/01655_plan_optimizations.sh b/tests/queries/0_stateless/01655_plan_optimizations.sh index efbd3973b62..1f5d88bd8bf 100755 --- a/tests/queries/0_stateless/01655_plan_optimizations.sh +++ b/tests/queries/0_stateless/01655_plan_optimizations.sh @@ -206,3 +206,6 @@ echo "> Expression should be divided into two subexpressions and only one of the $CLICKHOUSE_CLIENT -q " explain select sipHash64(number) from numbers(100) order by number + 1 limit 5" | sed 's/ //g' | grep -o "^ *\(Expression\|Limit\|Sorting\)" +echo "> this query should be executed without throwing an exception" +$CLICKHOUSE_CLIENT -q " + select throwIf(number = 5) from (select * from numbers(10)) order by number limit 1" From 5590f78dfe981e5396da91fcedb437772610e32f Mon Sep 17 00:00:00 2001 From: Nickita Taranov Date: Tue, 29 Mar 2022 19:51:58 +0200 Subject: [PATCH 144/239] update remerge_sort_lowered_memory_bytes_ratio --- .../01600_remerge_sort_lowered_memory_bytes_ratio.sql | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/tests/queries/0_stateless/01600_remerge_sort_lowered_memory_bytes_ratio.sql b/tests/queries/0_stateless/01600_remerge_sort_lowered_memory_bytes_ratio.sql index f89fd1c94ca..8646b40563e 100644 --- a/tests/queries/0_stateless/01600_remerge_sort_lowered_memory_bytes_ratio.sql +++ b/tests/queries/0_stateless/01600_remerge_sort_lowered_memory_bytes_ratio.sql @@ -10,8 +10,8 @@ set max_block_size=40960; -- MergeSortingTransform: Re-merging intermediate ORDER BY data (20 blocks with 819200 rows) to save memory consumption -- MergeSortingTransform: Memory usage is lowered from 186.25 MiB to 95.00 MiB -- MergeSortingTransform: Re-merging is not useful (memory usage was not lowered by remerge_sort_lowered_memory_bytes_ratio=2.0) -select number k, repeat(toString(number), 11) v1, repeat(toString(number), 12) v2 from numbers(3e6) order by k limit 400e3 format Null; -- { serverError 241 }} -select number k, repeat(toString(number), 11) v1, repeat(toString(number), 12) v2 from numbers(3e6) order by k limit 400e3 settings remerge_sort_lowered_memory_bytes_ratio=2. format Null; +select repeat(toString(number), 11) v1, repeat(toString(number), 12) v2 from numbers(3e6) order by v1, v2 limit 400e3 format Null; -- { serverError 241 } +select repeat(toString(number), 11) v1, repeat(toString(number), 12) v2 from numbers(3e6) order by v1, v2 limit 400e3 settings remerge_sort_lowered_memory_bytes_ratio=2. 
format Null; -- { serverError 241 } -- remerge_sort_lowered_memory_bytes_ratio 1.9 is good (need at least 1.91/0.98=1.94) -- MergeSortingTransform: Re-merging intermediate ORDER BY data (20 blocks with 819200 rows) to save memory consumption From ce40d84eefb0629ff49f55f99caef7c15392aa8b Mon Sep 17 00:00:00 2001 From: Nickita Taranov Date: Tue, 29 Mar 2022 21:16:05 +0200 Subject: [PATCH 145/239] more fixes --- .../Optimizations/liftUpFunctions.cpp | 36 +++++++++++++------ src/Processors/QueryPlan/SortingStep.cpp | 2 +- ...emerge_sort_lowered_memory_bytes_ratio.sql | 4 +-- .../01655_plan_optimizations.reference | 14 +++----- .../0_stateless/01655_plan_optimizations.sh | 8 ++--- 5 files changed, 35 insertions(+), 29 deletions(-) diff --git a/src/Processors/QueryPlan/Optimizations/liftUpFunctions.cpp b/src/Processors/QueryPlan/Optimizations/liftUpFunctions.cpp index 8e4242ea73e..80b82d989dd 100644 --- a/src/Processors/QueryPlan/Optimizations/liftUpFunctions.cpp +++ b/src/Processors/QueryPlan/Optimizations/liftUpFunctions.cpp @@ -2,35 +2,48 @@ #include #include #include +#include -#include -#include +namespace DB +{ +namespace ErrorCodes +{ + extern const int LOGICAL_ERROR; +} +} -namespace DB::QueryPlanOptimizations +namespace { -void swapSortingAndUnneededCalculations(QueryPlan::Node * parent_node, ActionsDAGPtr && unneeded_for_sorting) +void swapSortingAndUnneededCalculations(DB::QueryPlan::Node * parent_node, DB::ActionsDAGPtr && unneeded_for_sorting) { - QueryPlan::Node * child_node = parent_node->children.front(); + DB::QueryPlan::Node * child_node = parent_node->children.front(); auto & parent_step = parent_node->step; auto & child_step = child_node->step; - auto * sorting_step = typeid_cast(parent_step.get()); + auto * sorting_step = typeid_cast(parent_step.get()); // Sorting -> Expression std::swap(parent_step, child_step); // Expression -> Sorting - sorting_step->updateInputStream(child_node->children.at(0)->step->getOutputStream()); - auto input_header = sorting_step->getInputStreams().at(0).header; + if (child_node->children.size() != 1) + throw DB::Exception(DB::ErrorCodes::LOGICAL_ERROR, "SortingStep is expected to have only one input stream."); + sorting_step->updateInputStream(child_node->children.front()->step->getOutputStream()); + auto input_header = sorting_step->getInputStreams().front().header; sorting_step->updateOutputStream(std::move(input_header)); auto description = parent_node->step->getStepDescription(); - parent_step = std::make_unique(child_step->getOutputStream(), std::move(unneeded_for_sorting)); + parent_step = std::make_unique(child_step->getOutputStream(), std::move(unneeded_for_sorting)); parent_step->setStepDescription(description + " [lifted up part]"); // UnneededCalculations -> Sorting } +} + +namespace DB::QueryPlanOptimizations +{ + size_t tryExecuteFunctionsAfterSorting(QueryPlan::Node * parent_node, QueryPlan::Nodes & nodes) { if (parent_node->children.size() != 1) @@ -55,12 +68,15 @@ size_t tryExecuteFunctionsAfterSorting(QueryPlan::Node * parent_node, QueryPlan: if (unneeded_for_sorting->trivial()) return 0; + if (child_node->children.size() != 1) + throw DB::Exception(DB::ErrorCodes::LOGICAL_ERROR, "ExpressionStep is expected to have only one input stream."); + // Sorting (parent_node) -> Expression (child_node) auto & node_with_needed = nodes.emplace_back(); std::swap(node_with_needed.children, child_node->children); child_node->children = {&node_with_needed}; node_with_needed.step - = 
std::make_unique(node_with_needed.children.at(0)->step->getOutputStream(), std::move(needed_for_sorting)); + = std::make_unique(node_with_needed.children.front()->step->getOutputStream(), std::move(needed_for_sorting)); node_with_needed.step->setStepDescription(child_step->getStepDescription()); // Sorting (parent_node) -> so far the origin Expression (child_node) -> NeededCalculations (node_with_needed) diff --git a/src/Processors/QueryPlan/SortingStep.cpp b/src/Processors/QueryPlan/SortingStep.cpp index 9cc242852bf..efefbad0ded 100644 --- a/src/Processors/QueryPlan/SortingStep.cpp +++ b/src/Processors/QueryPlan/SortingStep.cpp @@ -92,7 +92,7 @@ SortingStep::SortingStep( void SortingStep::updateInputStream(DataStream input_stream) { input_streams.clear(); - input_streams.push_back(std::move(input_stream)); + input_streams.emplace_back(std::move(input_stream)); } void SortingStep::updateOutputStream(Block result_header) diff --git a/tests/queries/0_stateless/01600_remerge_sort_lowered_memory_bytes_ratio.sql b/tests/queries/0_stateless/01600_remerge_sort_lowered_memory_bytes_ratio.sql index 8646b40563e..6e23ab9cdb9 100644 --- a/tests/queries/0_stateless/01600_remerge_sort_lowered_memory_bytes_ratio.sql +++ b/tests/queries/0_stateless/01600_remerge_sort_lowered_memory_bytes_ratio.sql @@ -10,8 +10,8 @@ set max_block_size=40960; -- MergeSortingTransform: Re-merging intermediate ORDER BY data (20 blocks with 819200 rows) to save memory consumption -- MergeSortingTransform: Memory usage is lowered from 186.25 MiB to 95.00 MiB -- MergeSortingTransform: Re-merging is not useful (memory usage was not lowered by remerge_sort_lowered_memory_bytes_ratio=2.0) -select repeat(toString(number), 11) v1, repeat(toString(number), 12) v2 from numbers(3e6) order by v1, v2 limit 400e3 format Null; -- { serverError 241 } -select repeat(toString(number), 11) v1, repeat(toString(number), 12) v2 from numbers(3e6) order by v1, v2 limit 400e3 settings remerge_sort_lowered_memory_bytes_ratio=2. format Null; -- { serverError 241 } +select number k, repeat(toString(number), 11) v1, repeat(toString(number), 12) v2 from numbers(3e6) order by v1, v2 limit 400e3 format Null; -- { serverError 241 } +select number k, repeat(toString(number), 11) v1, repeat(toString(number), 12) v2 from numbers(3e6) order by v1, v2 limit 400e3 settings remerge_sort_lowered_memory_bytes_ratio=2. 
format Null; -- { serverError 241 } -- remerge_sort_lowered_memory_bytes_ratio 1.9 is good (need at least 1.91/0.98=1.94) -- MergeSortingTransform: Re-merging intermediate ORDER BY data (20 blocks with 819200 rows) to save memory consumption diff --git a/tests/queries/0_stateless/01655_plan_optimizations.reference b/tests/queries/0_stateless/01655_plan_optimizations.reference index 218ff7bd8c9..bb9c614f728 100644 --- a/tests/queries/0_stateless/01655_plan_optimizations.reference +++ b/tests/queries/0_stateless/01655_plan_optimizations.reference @@ -143,17 +143,11 @@ Filter 2 3 2 3 > function calculation should be done after sorting and limit (if possible) -> the whole Expression node could be moved after Sorting -Expression -Limit -Expression -Sorting -Expression > Expression should be divided into two subexpressions and only one of them should be moved after Sorting -Expression -Limit -Expression +Expression (Before ORDER BY [lifted up part]) +FUNCTION sipHash64 Sorting -Expression +Expression (Before ORDER BY) +FUNCTION plus > this query should be executed without throwing an exception 0 diff --git a/tests/queries/0_stateless/01655_plan_optimizations.sh b/tests/queries/0_stateless/01655_plan_optimizations.sh index 1f5d88bd8bf..0b7f004a2ce 100755 --- a/tests/queries/0_stateless/01655_plan_optimizations.sh +++ b/tests/queries/0_stateless/01655_plan_optimizations.sh @@ -198,14 +198,10 @@ $CLICKHOUSE_CLIENT -q " ) where a != 1 settings enable_optimize_predicate_expression = 0" echo "> function calculation should be done after sorting and limit (if possible)" -echo "> the whole Expression node could be moved after Sorting" -$CLICKHOUSE_CLIENT -q " - explain select sipHash64(number) from numbers(100) order by number limit 5" | - sed 's/ //g' | grep -o "^ *\(Expression\|Limit\|Sorting\)" echo "> Expression should be divided into two subexpressions and only one of them should be moved after Sorting" $CLICKHOUSE_CLIENT -q " - explain select sipHash64(number) from numbers(100) order by number + 1 limit 5" | - sed 's/ //g' | grep -o "^ *\(Expression\|Limit\|Sorting\)" + explain actions = 1 select number as n, sipHash64(n) from numbers(100) order by number + 1 limit 5" | + sed 's/^ *//g' | grep -o "^ *\(Expression (Before ORDER BY.*)\|Sorting\|FUNCTION \w\+\)" echo "> this query should be executed without throwing an exception" $CLICKHOUSE_CLIENT -q " select throwIf(number = 5) from (select * from numbers(10)) order by number limit 1" From 440e57769a9f19cf5223e0eb14e66a8808a6cc13 Mon Sep 17 00:00:00 2001 From: Nickita Taranov Date: Wed, 30 Mar 2022 00:29:20 +0200 Subject: [PATCH 146/239] more fizes --- .../Optimizations/liftUpFunctions.cpp | 48 ++++++++----------- src/Processors/QueryPlan/SortingStep.cpp | 2 +- 2 files changed, 20 insertions(+), 30 deletions(-) diff --git a/src/Processors/QueryPlan/Optimizations/liftUpFunctions.cpp b/src/Processors/QueryPlan/Optimizations/liftUpFunctions.cpp index 80b82d989dd..32918f3e5a2 100644 --- a/src/Processors/QueryPlan/Optimizations/liftUpFunctions.cpp +++ b/src/Processors/QueryPlan/Optimizations/liftUpFunctions.cpp @@ -15,28 +15,12 @@ namespace ErrorCodes namespace { -void swapSortingAndUnneededCalculations(DB::QueryPlan::Node * parent_node, DB::ActionsDAGPtr && unneeded_for_sorting) +const DB::DataStream & getChildOutputStream(DB::QueryPlan::Node & node) { - DB::QueryPlan::Node * child_node = parent_node->children.front(); - - auto & parent_step = parent_node->step; - auto & child_step = child_node->step; - auto * sorting_step = 
typeid_cast(parent_step.get()); - - // Sorting -> Expression - std::swap(parent_step, child_step); - // Expression -> Sorting - - if (child_node->children.size() != 1) - throw DB::Exception(DB::ErrorCodes::LOGICAL_ERROR, "SortingStep is expected to have only one input stream."); - sorting_step->updateInputStream(child_node->children.front()->step->getOutputStream()); - auto input_header = sorting_step->getInputStreams().front().header; - sorting_step->updateOutputStream(std::move(input_header)); - - auto description = parent_node->step->getStepDescription(); - parent_step = std::make_unique(child_step->getOutputStream(), std::move(unneeded_for_sorting)); - parent_step->setStepDescription(description + " [lifted up part]"); - // UnneededCalculations -> Sorting + if (node.children.size() != 1) + throw DB::Exception( + DB::ErrorCodes::LOGICAL_ERROR, "Node \"{}\" is expected to have only one child.", node.step->getStepDescription()); + return node.children.front()->step->getOutputStream(); } } @@ -68,20 +52,26 @@ size_t tryExecuteFunctionsAfterSorting(QueryPlan::Node * parent_node, QueryPlan: if (unneeded_for_sorting->trivial()) return 0; - if (child_node->children.size() != 1) - throw DB::Exception(DB::ErrorCodes::LOGICAL_ERROR, "ExpressionStep is expected to have only one input stream."); - // Sorting (parent_node) -> Expression (child_node) auto & node_with_needed = nodes.emplace_back(); std::swap(node_with_needed.children, child_node->children); child_node->children = {&node_with_needed}; - node_with_needed.step - = std::make_unique(node_with_needed.children.front()->step->getOutputStream(), std::move(needed_for_sorting)); - node_with_needed.step->setStepDescription(child_step->getStepDescription()); + node_with_needed.step = std::make_unique(getChildOutputStream(node_with_needed), std::move(needed_for_sorting)); + node_with_needed.step->setStepDescription(child_step->getStepDescription()); // Sorting (parent_node) -> so far the origin Expression (child_node) -> NeededCalculations (node_with_needed) - swapSortingAndUnneededCalculations(parent_node, std::move(unneeded_for_sorting)); - // UneededCalculations (child_node) -> Sorting (parent_node) -> NeededCalculations (node_with_needed) + + std::swap(parent_step, child_step); + // so far the origin Expression (parent_node) -> Sorting (child_node) -> NeededCalculations (node_with_needed) + + sorting_step->updateInputStream(getChildOutputStream(*child_node)); + auto input_header = sorting_step->getInputStreams().at(0).header; + sorting_step->updateOutputStream(std::move(input_header)); + + auto description = parent_step->getStepDescription(); + parent_step = std::make_unique(child_step->getOutputStream(), std::move(unneeded_for_sorting)); + parent_step->setStepDescription(description + " [lifted up part]"); + // UneededCalculations (parent_node) -> Sorting (child_node) -> NeededCalculations (node_with_needed) return 3; } diff --git a/src/Processors/QueryPlan/SortingStep.cpp b/src/Processors/QueryPlan/SortingStep.cpp index efefbad0ded..859c9fd9e19 100644 --- a/src/Processors/QueryPlan/SortingStep.cpp +++ b/src/Processors/QueryPlan/SortingStep.cpp @@ -97,7 +97,7 @@ void SortingStep::updateInputStream(DataStream input_stream) void SortingStep::updateOutputStream(Block result_header) { - output_stream = createOutputStream(input_streams.front(), std::move(result_header), getDataStreamTraits()); + output_stream = createOutputStream(input_streams.at(0), std::move(result_header), getDataStreamTraits()); updateDistinctColumns(output_stream->header, 
output_stream->distinct_columns); } From 698a984c074390127b6989705f9b31ae5a89df7a Mon Sep 17 00:00:00 2001 From: Nickita Taranov Date: Thu, 31 Mar 2022 13:39:05 +0200 Subject: [PATCH 147/239] throw if sorting column not found --- src/Interpreters/ActionsDAG.cpp | 4 ++++ 1 file changed, 4 insertions(+) diff --git a/src/Interpreters/ActionsDAG.cpp b/src/Interpreters/ActionsDAG.cpp index ea90bedd2f6..f06ac229e94 100644 --- a/src/Interpreters/ActionsDAG.cpp +++ b/src/Interpreters/ActionsDAG.cpp @@ -1533,6 +1533,10 @@ ActionsDAG::SplitResult ActionsDAG::splitActionsBySortingDescription(const NameS for (const auto & sort_column : sort_columns) if (const auto * node = tryFindInIndex(sort_column)) split_nodes.insert(node); + else + throw Exception( + ErrorCodes::LOGICAL_ERROR, "Sorting column {} wasn't found in the ActionsDAG's index. DAG:\n{}", sort_column, dumpDAG()); + auto res = split(split_nodes); res.second->project_input = project_input; return res; From 0f94a58f3a7bd224662feb7bc8e4e9a954eb167a Mon Sep 17 00:00:00 2001 From: Nickita Taranov Date: Mon, 4 Apr 2022 14:59:38 +0200 Subject: [PATCH 148/239] use getName() --- src/Processors/QueryPlan/Optimizations/liftUpFunctions.cpp | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/src/Processors/QueryPlan/Optimizations/liftUpFunctions.cpp b/src/Processors/QueryPlan/Optimizations/liftUpFunctions.cpp index 32918f3e5a2..2a415f8c5af 100644 --- a/src/Processors/QueryPlan/Optimizations/liftUpFunctions.cpp +++ b/src/Processors/QueryPlan/Optimizations/liftUpFunctions.cpp @@ -18,8 +18,7 @@ namespace const DB::DataStream & getChildOutputStream(DB::QueryPlan::Node & node) { if (node.children.size() != 1) - throw DB::Exception( - DB::ErrorCodes::LOGICAL_ERROR, "Node \"{}\" is expected to have only one child.", node.step->getStepDescription()); + throw DB::Exception(DB::ErrorCodes::LOGICAL_ERROR, "Node \"{}\" is expected to have only one child.", node.step->getName()); return node.children.front()->step->getOutputStream(); } From 2a8e47927789d3d8b3d87794bfce0e22bb94aae9 Mon Sep 17 00:00:00 2001 From: Maksim Kita Date: Mon, 4 Apr 2022 15:56:01 +0200 Subject: [PATCH 149/239] ExecutableUserDefinedFunction prevent function execution during query analysis --- src/Common/ProfileEvents.cpp | 2 ++ src/Common/ShellCommand.cpp | 6 ++++++ src/Interpreters/UserDefinedExecutableFunctionFactory.cpp | 4 ++++ 3 files changed, 12 insertions(+) diff --git a/src/Common/ProfileEvents.cpp b/src/Common/ProfileEvents.cpp index 074ec02394b..3f55970f3aa 100644 --- a/src/Common/ProfileEvents.cpp +++ b/src/Common/ProfileEvents.cpp @@ -112,6 +112,8 @@ M(CompileExpressionsMicroseconds, "Total time spent for compilation of expressions to LLVM code.") \ M(CompileExpressionsBytes, "Number of bytes used for expressions compilation.") \ \ + M(ExecuteShellCommand, "Number of shell command executions.") \ + \ M(ExternalSortWritePart, "") \ M(ExternalSortMerge, "") \ M(ExternalAggregationWritePart, "") \ diff --git a/src/Common/ShellCommand.cpp b/src/Common/ShellCommand.cpp index 0093d72e766..229807c868e 100644 --- a/src/Common/ShellCommand.cpp +++ b/src/Common/ShellCommand.cpp @@ -29,6 +29,11 @@ namespace }; } +namespace ProfileEvents +{ + extern const int ExecuteShellCommand; +} + namespace DB { @@ -158,6 +163,7 @@ std::unique_ptr ShellCommand::executeImpl( const Config & config) { logCommand(filename, argv); + ProfileEvents::increment(ProfileEvents::ExecuteShellCommand); #if !defined(USE_MUSL) /** Here it is written that with a normal call `vfork`, there is a chance of 
deadlock in multithreaded programs, diff --git a/src/Interpreters/UserDefinedExecutableFunctionFactory.cpp b/src/Interpreters/UserDefinedExecutableFunctionFactory.cpp index 6d7dee7a4c7..d3a38f42e21 100644 --- a/src/Interpreters/UserDefinedExecutableFunctionFactory.cpp +++ b/src/Interpreters/UserDefinedExecutableFunctionFactory.cpp @@ -57,6 +57,10 @@ public: ColumnPtr executeImpl(const ColumnsWithTypeAndName & arguments, const DataTypePtr & result_type, size_t input_rows_count) const override { + /// Do not start user defined script during query analysis. Because user script startup could be heavy. + if (input_rows_count == 0) + return result_type->createColumn(); + auto coordinator = executable_function->getCoordinator(); const auto & coordinator_configuration = coordinator->getConfiguration(); const auto & configuration = executable_function->getConfiguration(); From 482c8f667cee0bbc713ada47f9e129037988a72d Mon Sep 17 00:00:00 2001 From: Maksim Kita Date: Mon, 4 Apr 2022 16:10:19 +0200 Subject: [PATCH 150/239] Added tests --- ...table_user_defined_function_short_circuit.reference | 1 + ..._executable_user_defined_function_short_circuit.sql | 10 ++++++++++ 2 files changed, 11 insertions(+) create mode 100644 tests/queries/0_stateless/02252_executable_user_defined_function_short_circuit.reference create mode 100644 tests/queries/0_stateless/02252_executable_user_defined_function_short_circuit.sql diff --git a/tests/queries/0_stateless/02252_executable_user_defined_function_short_circuit.reference b/tests/queries/0_stateless/02252_executable_user_defined_function_short_circuit.reference new file mode 100644 index 00000000000..573541ac970 --- /dev/null +++ b/tests/queries/0_stateless/02252_executable_user_defined_function_short_circuit.reference @@ -0,0 +1 @@ +0 diff --git a/tests/queries/0_stateless/02252_executable_user_defined_function_short_circuit.sql b/tests/queries/0_stateless/02252_executable_user_defined_function_short_circuit.sql new file mode 100644 index 00000000000..a475ba33740 --- /dev/null +++ b/tests/queries/0_stateless/02252_executable_user_defined_function_short_circuit.sql @@ -0,0 +1,10 @@ +SELECT number FROM numbers(10) WHERE number > 15 and test_function(number, number) == 4; + +SYSTEM FLUSH LOGS; + +SELECT ProfileEvents['ExecuteShellCommand'] FROM system.query_log WHERE + current_database = currentDatabase() + AND type = 'QueryFinish' + AND query == 'SELECT number FROM numbers(10) WHERE number > 15 and test_function(number, number) == 4;' + AND event_date >= yesterday() AND event_time > now() - interval 10 minute + LIMIT 1; From a46495de5c6f0ec2d44ef68666944be62eb4712f Mon Sep 17 00:00:00 2001 From: Maksim Kita Date: Mon, 4 Apr 2022 16:22:16 +0200 Subject: [PATCH 151/239] JIT ProfileEvents added test --- .../02252_jit_profile_events.reference | 4 +++ .../0_stateless/02252_jit_profile_events.sql | 31 +++++++++++++++++++ 2 files changed, 35 insertions(+) create mode 100644 tests/queries/0_stateless/02252_jit_profile_events.reference create mode 100644 tests/queries/0_stateless/02252_jit_profile_events.sql diff --git a/tests/queries/0_stateless/02252_jit_profile_events.reference b/tests/queries/0_stateless/02252_jit_profile_events.reference new file mode 100644 index 00000000000..12d82114f75 --- /dev/null +++ b/tests/queries/0_stateless/02252_jit_profile_events.reference @@ -0,0 +1,4 @@ +0 +1 +0 1 2 +1 diff --git a/tests/queries/0_stateless/02252_jit_profile_events.sql b/tests/queries/0_stateless/02252_jit_profile_events.sql new file mode 100644 index 
00000000000..e4c9d9d8791 --- /dev/null +++ b/tests/queries/0_stateless/02252_jit_profile_events.sql @@ -0,0 +1,31 @@ +-- Tags: no-fasttest + +SET compile_expressions = 1; +SET min_count_to_compile_expression = 0; + +SYSTEM DROP COMPILED EXPRESSION CACHE; + +SELECT number + number + number FROM numbers(1); + +SYSTEM FLUSH LOGS; + +SELECT ProfileEvents['CompileFunction'] FROM system.query_log WHERE + current_database = currentDatabase() + AND type = 'QueryFinish' + AND query == 'SELECT number + number + number FROM numbers(1);' + AND event_date >= yesterday() AND event_time > now() - interval 10 minute + LIMIT 1; + +SET compile_aggregate_expressions = 1; +SET min_count_to_compile_aggregate_expression = 0; + +SELECT sum(number), sum(number + 1), sum(number + 2) FROM numbers(1) GROUP BY number; + +SYSTEM FLUSH LOGS; + +SELECT ProfileEvents['CompileFunction'] FROM system.query_log WHERE + current_database = currentDatabase() + AND type = 'QueryFinish' + AND query == 'SELECT sum(number), sum(number), sum(number) FROM numbers(1) GROUP BY number;' + AND event_date >= yesterday() AND event_time > now() - interval 10 minute + LIMIT 1; \ No newline at end of file From cf71b18472fbaf6a26b36f2062ae410e0f6bb01d Mon Sep 17 00:00:00 2001 From: Meena Renganathan Date: Mon, 4 Apr 2022 07:23:31 -0700 Subject: [PATCH 152/239] Modified the code to fix the getenv() call issue idenitified in the clang-tidy --- .clang-tidy | 2 ++ programs/client/Client.cpp | 2 +- 2 files changed, 3 insertions(+), 1 deletion(-) diff --git a/.clang-tidy b/.clang-tidy index 0400b500e5c..5e5fae57dba 100644 --- a/.clang-tidy +++ b/.clang-tidy @@ -164,6 +164,8 @@ Checks: '-*, clang-analyzer-unix.cstring.NullArg, boost-use-to-string, + + alpha.security.cert.env.InvalidPtr, ' WarningsAsErrors: '*' diff --git a/programs/client/Client.cpp b/programs/client/Client.cpp index c2094b3b00d..becbc471d08 100644 --- a/programs/client/Client.cpp +++ b/programs/client/Client.cpp @@ -164,9 +164,9 @@ void Client::initialize(Poco::Util::Application & self) configReadClient(config(), home_path); const char * env_user = getenv("CLICKHOUSE_USER"); - const char * env_password = getenv("CLICKHOUSE_PASSWORD"); if (env_user) config().setString("user", env_user); + const char * env_password = getenv("CLICKHOUSE_PASSWORD"); if (env_password) config().setString("password", env_password); From 6c6fb5c3e822dd4e70363aae4d6874543cdb95a6 Mon Sep 17 00:00:00 2001 From: kssenii Date: Mon, 4 Apr 2022 16:32:37 +0200 Subject: [PATCH 153/239] Fix race in cached buffer --- src/Disks/IO/CachedReadBufferFromRemoteFS.cpp | 18 +++++++++++------- src/IO/ReadBufferFromS3.cpp | 2 +- src/IO/ReadBufferFromS3.h | 6 ++++-- 3 files changed, 16 insertions(+), 10 deletions(-) diff --git a/src/Disks/IO/CachedReadBufferFromRemoteFS.cpp b/src/Disks/IO/CachedReadBufferFromRemoteFS.cpp index 4766b838fda..de671e58687 100644 --- a/src/Disks/IO/CachedReadBufferFromRemoteFS.cpp +++ b/src/Disks/IO/CachedReadBufferFromRemoteFS.cpp @@ -334,15 +334,17 @@ SeekableReadBufferPtr CachedReadBufferFromRemoteFS::getImplementationBuffer(File read_buffer_for_file_segment->seek(file_offset_of_buffer_end, SEEK_SET); } - auto impl_range = read_buffer_for_file_segment->getRemainingReadRange(); auto download_offset = file_segment->getDownloadOffset(); if (download_offset != static_cast(read_buffer_for_file_segment->getPosition())) + { + auto impl_range = read_buffer_for_file_segment->getRemainingReadRange(); throw Exception( ErrorCodes::LOGICAL_ERROR, "Buffer's offsets mismatch; cached buffer offset: {}, 
download_offset: {}, position: {}, implementation buffer offset: {}, " "implementation buffer reading until: {}, file segment info: {}", file_offset_of_buffer_end, download_offset, read_buffer_for_file_segment->getPosition(), impl_range.left, *impl_range.right, file_segment->getInfoForLog()); + } break; } @@ -802,12 +804,14 @@ std::optional CachedReadBufferFromRemoteFS::getLastNonDownloadedOffset() String CachedReadBufferFromRemoteFS::getInfoForLog() { - auto implementation_buffer_read_range_str = - implementation_buffer ? - std::to_string(implementation_buffer->getRemainingReadRange().left) - + '-' - + (implementation_buffer->getRemainingReadRange().right ? std::to_string(*implementation_buffer->getRemainingReadRange().right) : "None") - : "None"; + String implementation_buffer_read_range_str; + if (implementation_buffer) + { + auto read_range = implementation_buffer->getRemainingReadRange(); + implementation_buffer_read_range_str = std::to_string(read_range.left) + '-' + (read_range.right ? std::to_string(*read_range.right) : "None"); + } + else + implementation_buffer_read_range_str = "None"; auto current_file_segment_info = current_file_segment_it == file_segments_holder->file_segments.end() ? "None" : (*current_file_segment_it)->getInfoForLog(); diff --git a/src/IO/ReadBufferFromS3.cpp b/src/IO/ReadBufferFromS3.cpp index 6616d92b492..728893e912d 100644 --- a/src/IO/ReadBufferFromS3.cpp +++ b/src/IO/ReadBufferFromS3.cpp @@ -240,7 +240,7 @@ void ReadBufferFromS3::setReadUntilPosition(size_t position) SeekableReadBuffer::Range ReadBufferFromS3::getRemainingReadRange() const { - return Range{.left = static_cast(offset), .right = read_until_position ? std::optional{read_until_position - 1} : std::nullopt}; + return Range{ .left = static_cast(offset), .right = read_until_position ? std::optional{read_until_position - 1} : std::nullopt }; } std::unique_ptr ReadBufferFromS3::initialize() diff --git a/src/IO/ReadBufferFromS3.h b/src/IO/ReadBufferFromS3.h index 5c9d709d58e..0040ede6d6b 100644 --- a/src/IO/ReadBufferFromS3.h +++ b/src/IO/ReadBufferFromS3.h @@ -33,8 +33,10 @@ private: String key; UInt64 max_single_read_retries; - off_t offset = 0; - off_t read_until_position = 0; + /// These variables are atomic because they can be used for `logging only` + /// from separate thread other than the one which uses the buffer for s3 reading. + std::atomic offset = 0; + std::atomic read_until_position = 0; Aws::S3::Model::GetObjectResult read_result; std::unique_ptr impl; From d69757696721bb9486e524bc288300ad932174a7 Mon Sep 17 00:00:00 2001 From: Kseniia Sumarokova <54203879+kssenii@users.noreply.github.com> Date: Mon, 4 Apr 2022 17:53:01 +0200 Subject: [PATCH 154/239] Update ReadBufferFromS3.h --- src/IO/ReadBufferFromS3.h | 1 + 1 file changed, 1 insertion(+) diff --git a/src/IO/ReadBufferFromS3.h b/src/IO/ReadBufferFromS3.h index 0040ede6d6b..5282d9ad482 100644 --- a/src/IO/ReadBufferFromS3.h +++ b/src/IO/ReadBufferFromS3.h @@ -34,6 +34,7 @@ private: UInt64 max_single_read_retries; /// These variables are atomic because they can be used for `logging only` + /// (where it is not important to get consistent result) /// from separate thread other than the one which uses the buffer for s3 reading. 
std::atomic offset = 0; std::atomic read_until_position = 0; From 86f42e7a3a900649772d06a5444d9bff55dc4361 Mon Sep 17 00:00:00 2001 From: alesapin Date: Mon, 4 Apr 2022 18:07:36 +0200 Subject: [PATCH 155/239] Better check for kafka_num_consumers --- src/Storages/Kafka/StorageKafka.cpp | 8 +++++--- 1 file changed, 5 insertions(+), 3 deletions(-) diff --git a/src/Storages/Kafka/StorageKafka.cpp b/src/Storages/Kafka/StorageKafka.cpp index 4c7465d587d..9c3506742fd 100644 --- a/src/Storages/Kafka/StorageKafka.cpp +++ b/src/Storages/Kafka/StorageKafka.cpp @@ -779,11 +779,13 @@ void registerStorageKafka(StorageFactory & factory) #undef CHECK_KAFKA_STORAGE_ARGUMENT auto num_consumers = kafka_settings->kafka_num_consumers.value; - auto physical_cpu_cores = getNumberOfPhysicalCPUCores(); + auto max_consumers = std::max(getNumberOfPhysicalCPUCores(), 16); - if (num_consumers > physical_cpu_cores) + if (num_consumers > max_consumers) { - throw Exception(ErrorCodes::BAD_ARGUMENTS, "Number of consumers can not be bigger than {}", physical_cpu_cores); + throw Exception(ErrorCodes::BAD_ARGUMENTS, "Number of consumers can not be bigger than {}, it just doesn't make sense. " + "Note that kafka_num_consumers is not number of consumers for Kafka partitions -- they are managed by Kafka client library. " + "kafka_num_consumers is internal amount of threads for ClickHouse and it shouldn't be big", max_consumers); } else if (num_consumers < 1) { From 72331856eb552b7c20208363f92f016f31abc8a5 Mon Sep 17 00:00:00 2001 From: alesapin Date: Mon, 4 Apr 2022 18:28:26 +0200 Subject: [PATCH 156/239] fix message --- src/Storages/Kafka/StorageKafka.cpp | 9 ++++++--- 1 file changed, 6 insertions(+), 3 deletions(-) diff --git a/src/Storages/Kafka/StorageKafka.cpp b/src/Storages/Kafka/StorageKafka.cpp index 9c3506742fd..c9f6bcabcc1 100644 --- a/src/Storages/Kafka/StorageKafka.cpp +++ b/src/Storages/Kafka/StorageKafka.cpp @@ -783,9 +783,12 @@ void registerStorageKafka(StorageFactory & factory) if (num_consumers > max_consumers) { - throw Exception(ErrorCodes::BAD_ARGUMENTS, "Number of consumers can not be bigger than {}, it just doesn't make sense. " - "Note that kafka_num_consumers is not number of consumers for Kafka partitions -- they are managed by Kafka client library. " - "kafka_num_consumers is internal amount of threads for ClickHouse and it shouldn't be big", max_consumers); + throw Exception(ErrorCodes::BAD_ARGUMENTS, "The number of consumers can not be bigger than {}. " + "A single consumer can read any number of partitions. Extra consumers are relatively expensive, " + "and using a lot of them can lead to high memory and CPU usage. To achieve better performance " + "of getting data from Kafka, consider using a setting kafka_thread_per_consumer=1, " + "and ensure you have enough threads in MessageBrokerSchedulePool (background_message_broker_schedule_pool_size). " + "See also https://clickhouse.com/docs/integrations/kafka/kafka-table-engine#tuning-performance", max_consumers); } else if (num_consumers < 1) { From d475ce5d169cf3d85f9c768531dc211d247bd27d Mon Sep 17 00:00:00 2001 From: "Mikhail f. 
Shiryaev" Date: Mon, 4 Apr 2022 18:50:50 +0200 Subject: [PATCH 157/239] Fix building ubuntu image from deb-repo --- docker/server/Dockerfile.ubuntu | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/docker/server/Dockerfile.ubuntu b/docker/server/Dockerfile.ubuntu index cc198772251..6e93bd97036 100644 --- a/docker/server/Dockerfile.ubuntu +++ b/docker/server/Dockerfile.ubuntu @@ -94,8 +94,9 @@ RUN arch=${TARGETARCH:-amd64} \ && apt-get update \ && apt-get --yes -o "Dpkg::Options::=--force-confdef" -o "Dpkg::Options::=--force-confold" upgrade \ && for package in ${PACKAGES}; do \ - apt-get install --allow-unauthenticated --yes --no-install-recommends "${package}=${VERSION}" || exit 1 \ + packages="${packages} ${package}=${VERSION}" \ ; done \ + && apt-get install --allow-unauthenticated --yes --no-install-recommends ${packages} || exit 1 \ ; fi \ && clickhouse-local -q 'SELECT * FROM system.build_options' \ && rm -rf \ From 3a6bee309b79987a3a21a22cb3b3aab69d56d3b1 Mon Sep 17 00:00:00 2001 From: Anton Popov Date: Mon, 4 Apr 2022 19:10:56 +0200 Subject: [PATCH 158/239] Skip test with ordinary database --- tests/queries/0_stateless/02262_column_ttl.sh | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/tests/queries/0_stateless/02262_column_ttl.sh b/tests/queries/0_stateless/02262_column_ttl.sh index affb0c802ff..b5e29c9b2a1 100755 --- a/tests/queries/0_stateless/02262_column_ttl.sh +++ b/tests/queries/0_stateless/02262_column_ttl.sh @@ -1,5 +1,5 @@ #!/usr/bin/env bash -# Tags: no-parallel +# Tags: no-parallel, no-ordinary-database # ^^^^^^^^^^^ # Since the underlying view may disappears while flushing log, and leads to: # From 0477e74f42b9d2cc9056574b01f02e82016a0a52 Mon Sep 17 00:00:00 2001 From: alesapin Date: Mon, 4 Apr 2022 19:41:54 +0200 Subject: [PATCH 159/239] Get rid of caps --- src/Storages/AlterCommands.cpp | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/src/Storages/AlterCommands.cpp b/src/Storages/AlterCommands.cpp index 16e1f044fd9..15095335a51 100644 --- a/src/Storages/AlterCommands.cpp +++ b/src/Storages/AlterCommands.cpp @@ -713,7 +713,7 @@ bool isMetadataOnlyConversion(const IDataType * from, const IDataType * to) return false; }; - static const std::unordered_multimap ALLOWED_CONVERSIONS = + static const std::unordered_multimap allowed_conversions = { { typeid(DataTypeEnum8), typeid(DataTypeInt8) }, { typeid(DataTypeEnum16), typeid(DataTypeInt16) }, @@ -735,7 +735,7 @@ bool isMetadataOnlyConversion(const IDataType * from, const IDataType * to) return true; /// Types changed, but representation on disk didn't - auto it_range = ALLOWED_CONVERSIONS.equal_range(typeid(*from)); + auto it_range = allowed_conversions.equal_range(typeid(*from)); for (auto it = it_range.first; it != it_range.second; ++it) { if (it->second == typeid(*to)) From d04c48e67a96493ddae35e6fe7fc15c7fc03d363 Mon Sep 17 00:00:00 2001 From: alesapin Date: Mon, 4 Apr 2022 20:14:09 +0200 Subject: [PATCH 160/239] Fix build --- src/Storages/MergeTree/MergeTreeData.cpp | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/src/Storages/MergeTree/MergeTreeData.cpp b/src/Storages/MergeTree/MergeTreeData.cpp index 8419f07ae73..42b26db72ce 100644 --- a/src/Storages/MergeTree/MergeTreeData.cpp +++ b/src/Storages/MergeTree/MergeTreeData.cpp @@ -2967,11 +2967,11 @@ void MergeTreeData::tryRemovePartImmediately(DataPartPtr && part) if (!((*it)->getState() == DataPartState::Outdated && it->unique())) { if ((*it)->getState() != DataPartState::Outdated) - 
LOG_WARNING("Cannot immediately remove part {} because it's not in Outdated state " + LOG_WARNING(log, "Cannot immediately remove part {} because it's not in Outdated state " "usage counter {}", part_name_with_state, it->use_count()); if (!it->unique()) - LOG_WARNING("Cannot immediately remove part {} because someone using it right now " + LOG_WARNING(log, "Cannot immediately remove part {} because someone using it right now " "usage counter {}", part_name_with_state, it->use_count()); return; } From 53c7376e37d206ebc954f3aeca7f98c565e4c53a Mon Sep 17 00:00:00 2001 From: Mikhail Filimonov Date: Mon, 4 Apr 2022 16:05:31 +0200 Subject: [PATCH 161/239] Add some metrics to engine Kafka --- src/Common/CurrentMetrics.cpp | 8 ++++ src/Common/ProfileEvents.cpp | 19 +++++++++ src/Storages/Kafka/KafkaSource.cpp | 30 ++++++++++++- .../Kafka/ReadBufferFromKafkaConsumer.cpp | 42 +++++++++++++++++++ .../Kafka/ReadBufferFromKafkaConsumer.h | 7 ++++ src/Storages/Kafka/StorageKafka.cpp | 31 ++++++++++++++ .../Kafka/WriteBufferToKafkaProducer.cpp | 17 ++++++++ .../Kafka/WriteBufferToKafkaProducer.h | 9 ++++ 8 files changed, 162 insertions(+), 1 deletion(-) diff --git a/src/Common/CurrentMetrics.cpp b/src/Common/CurrentMetrics.cpp index a741f1f1bfc..d49fc02084f 100644 --- a/src/Common/CurrentMetrics.cpp +++ b/src/Common/CurrentMetrics.cpp @@ -81,6 +81,14 @@ M(ActiveSyncDrainedConnections, "Number of active connections drained synchronously.") \ M(AsynchronousReadWait, "Number of threads waiting for asynchronous read.") \ M(PendingAsyncInsert, "Number of asynchronous inserts that are waiting for flush.") \ + M(KafkaConsumers, "Number of active Kafka consumers") \ + M(KafkaConsumersWithAssignment, "Number of active Kafka consumers which have some partitions assigned.") \ + M(KafkaProducers, "Number of active Kafka producer created") \ + M(KafkaLibrdkafkaThreads, "Number of active librdkafka threads") \ + M(KafkaBackgroundReads, "Number of background reads currently working (populating materialized views from Kafka)") \ + M(KafkaDirectReads, "Number of direct selects from Kafka currently executing") \ + M(KafkaWrites, "Number of currently running inserts to Kafka") \ + M(KafkaAssignedPartitions, "Number of partitions Kafka tables currently assigned to") \ namespace CurrentMetrics { diff --git a/src/Common/ProfileEvents.cpp b/src/Common/ProfileEvents.cpp index 074ec02394b..a963c024ab1 100644 --- a/src/Common/ProfileEvents.cpp +++ b/src/Common/ProfileEvents.cpp @@ -295,6 +295,25 @@ M(MergeTreeMetadataCacheHit, "Number of times the read of meta file was done from MergeTree metadata cache") \ M(MergeTreeMetadataCacheMiss, "Number of times the read of meta file was not done from MergeTree metadata cache") \ \ + M(KafkaRebalanceRevocations, "Number of partition revocations (the first stage of consumer group rebalance)") \ + M(KafkaRebalanceAssignments, "Number of partition assignments (the final stage of consumer group rebalance)") \ + M(KafkaRebalanceErrors, "Number of failed consumer group rebalances") \ + M(KafkaMessagesPolled, "Number of Kafka messages polled from librdkafka to ClickHouse") \ + M(KafkaMessagesRead, "Number of Kafka messages already processed by ClickHouse") \ + M(KafkaMessagesFailed, "Number of Kafka messages ClickHouse failed to parse") \ + M(KafkaRowsRead, "Number of rows parsed from Kafka messages") \ + M(KafkaRowsRejected, "Number of parsed rows which were later rejected (due to rebalances / errors or similar reasons). 
Those rows will be consumed again after the rebalance.") \ + M(KafkaDirectReads, "Number of direct selects from Kafka tables since server start") \ + M(KafkaBackgroundReads, "Number of background reads populating materialized views from Kafka since server start") \ + M(KafkaCommits, "Number of successful commits of consumed offsets to Kafka (normally should be the same as KafkaBackgroundReads)") \ + M(KafkaCommitFailures, "Number of failed commits of consumed offsets to Kafka (usually is a sign of some data duplication)") \ + M(KafkaConsumerErrors, "Number of errors reported by librdkafka during polls") \ + M(KafkaWrites, "Number of writes (inserts) to Kafka tables ") \ + M(KafkaRowsWritten, "Number of rows inserted into Kafka tables") \ + M(KafkaProducerFlushes, "Number of explicit flushes to Kafka producer") \ + M(KafkaMessagesProduced, "Number of messages produced to Kafka") \ + M(KafkaProducerErrors, "Number of errors during producing the messages to Kafka") \ + \ M(ScalarSubqueriesGlobalCacheHit, "Number of times a read from a scalar subquery was done using the global cache") \ M(ScalarSubqueriesLocalCacheHit, "Number of times a read from a scalar subquery was done using the local cache") \ M(ScalarSubqueriesCacheMiss, "Number of times a read from a scalar subquery was not cached and had to be calculated completely") diff --git a/src/Storages/Kafka/KafkaSource.cpp b/src/Storages/Kafka/KafkaSource.cpp index 99130f615f5..60047af8774 100644 --- a/src/Storages/Kafka/KafkaSource.cpp +++ b/src/Storages/Kafka/KafkaSource.cpp @@ -6,6 +6,16 @@ #include #include +#include + +namespace ProfileEvents +{ + extern const Event KafkaMessagesRead; + extern const Event KafkaMessagesFailed; + extern const Event KafkaRowsRead; + extern const Event KafkaRowsRejected; +} + namespace DB { namespace ErrorCodes @@ -85,6 +95,8 @@ Chunk KafkaSource::generateImpl() auto on_error = [&](const MutableColumns & result_columns, Exception & e) { + ProfileEvents::increment(ProfileEvents::KafkaMessagesFailed); + if (put_error_to_stream) { exception_message = e.message(); @@ -117,7 +129,11 @@ Chunk KafkaSource::generateImpl() size_t new_rows = 0; exception_message.reset(); if (buffer->poll()) + { + // poll provide one message at a time to the input_format + ProfileEvents::increment(ProfileEvents::KafkaMessagesRead); new_rows = executor.execute(); + } if (new_rows) { @@ -128,6 +144,8 @@ Chunk KafkaSource::generateImpl() if (buffer->isStalled()) throw Exception("Polled messages became unusable", ErrorCodes::LOGICAL_ERROR); + ProfileEvents::increment(ProfileEvents::KafkaRowsRead, new_rows); + buffer->storeLastReadMessageOffset(); auto topic = buffer->currentTopic(); @@ -212,8 +230,18 @@ Chunk KafkaSource::generateImpl() } } - if (buffer->polledDataUnusable() || total_rows == 0) + if (total_rows == 0) + { return {}; + } + else if (buffer->polledDataUnusable()) + { + // the rows were counted already before by KafkaRowsRead, + // so let's count the rows we ignore separately + // (they will be retried after the rebalance) + ProfileEvents::increment(ProfileEvents::KafkaRowsRejected, total_rows); + return {}; + } /// MATERIALIZED columns can be added here, but I think // they are not needed here: diff --git a/src/Storages/Kafka/ReadBufferFromKafkaConsumer.cpp b/src/Storages/Kafka/ReadBufferFromKafkaConsumer.cpp index ebfeaed8346..5ff90164064 100644 --- a/src/Storages/Kafka/ReadBufferFromKafkaConsumer.cpp +++ b/src/Storages/Kafka/ReadBufferFromKafkaConsumer.cpp @@ -10,6 +10,26 @@ #include #include +#include +#include + +namespace 
CurrentMetrics +{ + extern const Metric KafkaAssignedPartitions; + extern const Metric KafkaConsumersWithAssignment; +} + +namespace ProfileEvents +{ + extern const Event KafkaRebalanceRevocations; + extern const Event KafkaRebalanceAssignments; + extern const Event KafkaRebalanceErrors; + extern const Event KafkaMessagesPolled; + extern const Event KafkaCommitFailures; + extern const Event KafkaCommits; + extern const Event KafkaConsumerErrors; +} + namespace DB { @@ -45,6 +65,9 @@ ReadBufferFromKafkaConsumer::ReadBufferFromKafkaConsumer( // called (synchronously, during poll) when we enter the consumer group consumer->set_assignment_callback([this](const cppkafka::TopicPartitionList & topic_partitions) { + CurrentMetrics::add(CurrentMetrics::KafkaAssignedPartitions, topic_partitions.size()); + ProfileEvents::increment(ProfileEvents::KafkaRebalanceAssignments); + if (topic_partitions.empty()) { LOG_INFO(log, "Got empty assignment: Not enough partitions in the topic for all consumers?"); @@ -52,6 +75,7 @@ ReadBufferFromKafkaConsumer::ReadBufferFromKafkaConsumer( else { LOG_TRACE(log, "Topics/partitions assigned: {}", topic_partitions); + CurrentMetrics::add(CurrentMetrics::KafkaConsumersWithAssignment, 1); } assignment = topic_partitions; @@ -60,10 +84,18 @@ ReadBufferFromKafkaConsumer::ReadBufferFromKafkaConsumer( // called (synchronously, during poll) when we leave the consumer group consumer->set_revocation_callback([this](const cppkafka::TopicPartitionList & topic_partitions) { + CurrentMetrics::sub(CurrentMetrics::KafkaAssignedPartitions, topic_partitions.size()); + ProfileEvents::increment(ProfileEvents::KafkaRebalanceRevocations); + // Rebalance is happening now, and now we have a chance to finish the work // with topics/partitions we were working with before rebalance LOG_TRACE(log, "Rebalance initiated. Revoking partitions: {}", topic_partitions); + if (!topic_partitions.empty()) + { + CurrentMetrics::sub(CurrentMetrics::KafkaConsumersWithAssignment, 1); + } + // we can not flush data to target from that point (it is pulled, not pushed) // so the best we can now it to // 1) repeat last commit in sync mode (async could be still in queue, we need to be sure is is properly committed before rebalance) @@ -91,6 +123,7 @@ ReadBufferFromKafkaConsumer::ReadBufferFromKafkaConsumer( consumer->set_rebalance_error_callback([this](cppkafka::Error err) { LOG_ERROR(log, "Rebalance error: {}", err); + ProfileEvents::increment(ProfileEvents::KafkaRebalanceErrors); }); } @@ -229,8 +262,14 @@ void ReadBufferFromKafkaConsumer::commit() if (!committed) { // TODO: insert atomicity / transactions is needed here (possibility to rollback, on 2 phase commits) + ProfileEvents::increment(ProfileEvents::KafkaCommitFailures); throw Exception("All commit attempts failed. 
Last block was already written to target table(s), but was not committed to Kafka.", ErrorCodes::CANNOT_COMMIT_OFFSET); } + else + { + ProfileEvents::increment(ProfileEvents::KafkaCommits); + } + } else { @@ -423,6 +462,8 @@ bool ReadBufferFromKafkaConsumer::poll() return false; } + ProfileEvents::increment(ProfileEvents::KafkaMessagesPolled, messages.size()); + stalled_status = NOT_STALLED; allowed = true; return true; @@ -436,6 +477,7 @@ size_t ReadBufferFromKafkaConsumer::filterMessageErrors() { if (auto error = message.get_error()) { + ProfileEvents::increment(ProfileEvents::KafkaConsumerErrors); LOG_ERROR(log, "Consumer error: {}", error); return true; } diff --git a/src/Storages/Kafka/ReadBufferFromKafkaConsumer.h b/src/Storages/Kafka/ReadBufferFromKafkaConsumer.h index 4e9bf2e55c2..f390d1c1330 100644 --- a/src/Storages/Kafka/ReadBufferFromKafkaConsumer.h +++ b/src/Storages/Kafka/ReadBufferFromKafkaConsumer.h @@ -5,6 +5,12 @@ #include #include +#include + +namespace CurrentMetrics +{ + extern const Metric KafkaConsumers; +} namespace Poco { @@ -67,6 +73,7 @@ public: private: using Messages = std::vector; + CurrentMetrics::Increment metric_increment{CurrentMetrics::KafkaConsumers}; enum StalledStatus { diff --git a/src/Storages/Kafka/StorageKafka.cpp b/src/Storages/Kafka/StorageKafka.cpp index 4c7465d587d..722c55e6c93 100644 --- a/src/Storages/Kafka/StorageKafka.cpp +++ b/src/Storages/Kafka/StorageKafka.cpp @@ -41,6 +41,26 @@ #include +#include +#include + + +namespace CurrentMetrics +{ + extern const Metric KafkaLibrdkafkaThreads; + extern const Metric KafkaBackgroundReads; + extern const Metric KafkaDirectReads; + extern const Metric KafkaWrites; +} + +namespace ProfileEvents +{ + extern const Event KafkaDirectReads; + extern const Event KafkaBackgroundReads; + extern const Event KafkaWrites; +} + + namespace DB { @@ -58,6 +78,7 @@ struct StorageKafkaInterceptors static rd_kafka_resp_err_t rdKafkaOnThreadStart(rd_kafka_t *, rd_kafka_thread_type_t thread_type, const char *, void * ctx) { StorageKafka * self = reinterpret_cast(ctx); + CurrentMetrics::add(CurrentMetrics::KafkaLibrdkafkaThreads, 1); const auto & storage_id = self->getStorageID(); const auto & table = storage_id.getTableName(); @@ -89,6 +110,7 @@ struct StorageKafkaInterceptors static rd_kafka_resp_err_t rdKafkaOnThreadExit(rd_kafka_t *, rd_kafka_thread_type_t, const char *, void * ctx) { StorageKafka * self = reinterpret_cast(ctx); + CurrentMetrics::sub(CurrentMetrics::KafkaLibrdkafkaThreads, 1); std::lock_guard lock(self->thread_statuses_mutex); const auto it = std::find_if(self->thread_statuses.begin(), self->thread_statuses.end(), [](const auto & thread_status_ptr) @@ -279,6 +301,9 @@ Pipe StorageKafka::read( if (mv_attached) throw Exception(ErrorCodes::QUERY_NOT_ALLOWED, "Cannot read from StorageKafka with attached materialized views"); + CurrentMetrics::Increment metric_increment{CurrentMetrics::KafkaDirectReads}; + ProfileEvents::increment(ProfileEvents::KafkaDirectReads); + /// Always use all consumers at once, otherwise SELECT may not read messages from all partitions. 
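    // Note on the two Kafka read counters added above: CurrentMetrics::KafkaDirectReads is a gauge of
    // direct selects currently executing, and the RAII CurrentMetrics::Increment is expected to decrement
    // it again in its destructor, while ProfileEvents::KafkaDirectReads is a monotonic counter of direct
    // selects since server start, matching the descriptions registered in the CurrentMetrics.cpp and
    // ProfileEvents.cpp hunks earlier in this patch.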
Pipes pipes; pipes.reserve(num_created_consumers); @@ -304,6 +329,9 @@ SinkToStoragePtr StorageKafka::write(const ASTPtr &, const StorageMetadataPtr & auto modified_context = Context::createCopy(local_context); modified_context->applySettingsChanges(settings_adjustments); + CurrentMetrics::Increment metric_increment{CurrentMetrics::KafkaWrites}; + ProfileEvents::increment(ProfileEvents::KafkaWrites); + if (topics.size() > 1) throw Exception("Can't write to Kafka table with multiple topics!", ErrorCodes::NOT_IMPLEMENTED); return std::make_shared(*this, metadata_snapshot, modified_context); @@ -615,6 +643,9 @@ bool StorageKafka::streamToViews() if (!table) throw Exception("Engine table " + table_id.getNameForLogs() + " doesn't exist.", ErrorCodes::LOGICAL_ERROR); + CurrentMetrics::Increment metric_increment{CurrentMetrics::KafkaBackgroundReads}; + ProfileEvents::increment(ProfileEvents::KafkaBackgroundReads); + auto storage_snapshot = getStorageSnapshot(getInMemoryMetadataPtr()); // Create an INSERT query for streaming data diff --git a/src/Storages/Kafka/WriteBufferToKafkaProducer.cpp b/src/Storages/Kafka/WriteBufferToKafkaProducer.cpp index 748ea02ac6d..28877864e16 100644 --- a/src/Storages/Kafka/WriteBufferToKafkaProducer.cpp +++ b/src/Storages/Kafka/WriteBufferToKafkaProducer.cpp @@ -3,6 +3,16 @@ #include "Columns/ColumnString.h" #include "Columns/ColumnsNumber.h" +#include + +namespace ProfileEvents +{ + extern const Event KafkaRowsWritten; + extern const Event KafkaProducerFlushes; + extern const Event KafkaMessagesProduced; + extern const Event KafkaProducerErrors; +} + namespace DB { WriteBufferToKafkaProducer::WriteBufferToKafkaProducer( @@ -53,6 +63,8 @@ WriteBufferToKafkaProducer::~WriteBufferToKafkaProducer() void WriteBufferToKafkaProducer::countRow(const Columns & columns, size_t current_row) { + ProfileEvents::increment(ProfileEvents::KafkaRowsWritten); + if (++rows % max_rows == 0) { const std::string & last_chunk = chunks.back(); @@ -103,8 +115,10 @@ void WriteBufferToKafkaProducer::countRow(const Columns & columns, size_t curren producer->poll(timeout); continue; } + ProfileEvents::increment(ProfileEvents::KafkaProducerErrors); throw; } + ProfileEvents::increment(ProfileEvents::KafkaMessagesProduced); break; } @@ -126,9 +140,12 @@ void WriteBufferToKafkaProducer::flush() { if (e.get_error() == RD_KAFKA_RESP_ERR__TIMED_OUT) continue; + + ProfileEvents::increment(ProfileEvents::KafkaProducerErrors); throw; } + ProfileEvents::increment(ProfileEvents::KafkaProducerFlushes); break; } } diff --git a/src/Storages/Kafka/WriteBufferToKafkaProducer.h b/src/Storages/Kafka/WriteBufferToKafkaProducer.h index 15881b7a8e5..64b06571f0a 100644 --- a/src/Storages/Kafka/WriteBufferToKafkaProducer.h +++ b/src/Storages/Kafka/WriteBufferToKafkaProducer.h @@ -7,6 +7,14 @@ #include +#include + +namespace CurrentMetrics +{ + extern const Metric KafkaProducers; +} + + namespace DB { class Block; @@ -32,6 +40,7 @@ private: void nextImpl() override; void addChunk(); void reinitializeChunks(); + CurrentMetrics::Increment metric_increment{CurrentMetrics::KafkaProducers}; ProducerPtr producer; const std::string topic; From 43f697d7bac7bad36acd95dd6e555be9d96e77f7 Mon Sep 17 00:00:00 2001 From: Nir Peled Date: Mon, 4 Apr 2022 14:29:37 -0400 Subject: [PATCH 162/239] Fixed GA not reporting events. 
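(The removed snippet wrapped the script injection in (function (d, w, c) { (w[c] = w[c] || []).push(...) })(document, window, "")
with an empty queue name; nothing appears to drain window[""], so the gtag.js tags were never appended
to the page head. Appending them directly for non-single-page builds restores event reporting.)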
--- website/js/base.js | 25 ++++++++++--------------- 1 file changed, 10 insertions(+), 15 deletions(-) diff --git a/website/js/base.js b/website/js/base.js index 9389028f1ef..1ab8f841dbe 100644 --- a/website/js/base.js +++ b/website/js/base.js @@ -67,22 +67,17 @@ }); } - (function (d, w, c) { - (w[c] = w[c] || []).push(function() { - var is_single_page = $('html').attr('data-single-page') === 'true'; - - if (!is_single_page) { - $('head').each(function(_, element) { - $(element).append( - '' - ); - $(element).append( - '' - ); - }); - } + var is_single_page = $('html').attr('data-single-page') === 'true'; + if (!is_single_page) { + $('head').each(function (_, element) { + $(element).append( + '' + ); + $(element).append( + '' + ); }); - })(document, window, ""); + } var beforePrint = function() { var details = document.getElementsByTagName("details"); From 0340932d57c538a1c56c5e5761872805ea78e8ac Mon Sep 17 00:00:00 2001 From: LAL2211 Date: Mon, 4 Apr 2022 14:35:21 -0400 Subject: [PATCH 163/239] updated hard coded/ default credentials --- tests/integration/helpers/config_cluster.py | 0 1 file changed, 0 insertions(+), 0 deletions(-) create mode 100644 tests/integration/helpers/config_cluster.py diff --git a/tests/integration/helpers/config_cluster.py b/tests/integration/helpers/config_cluster.py new file mode 100644 index 00000000000..e69de29bb2d From 3ecdad9d4baa0fc4cf3934fc3f170b34e04b5fb4 Mon Sep 17 00:00:00 2001 From: LAL2211 Date: Mon, 4 Apr 2022 14:49:30 -0400 Subject: [PATCH 164/239] updated --- tests/integration/helpers/cluster.py | 56 +++++++++++---------- tests/integration/helpers/config_cluster.py | 35 +++++++++++++ 2 files changed, 64 insertions(+), 27 deletions(-) diff --git a/tests/integration/helpers/cluster.py b/tests/integration/helpers/cluster.py index d0b5e892f5b..3a9107d821e 100644 --- a/tests/integration/helpers/cluster.py +++ b/tests/integration/helpers/cluster.py @@ -41,6 +41,8 @@ import docker from .client import Client from .hdfs_api import HDFSApi +from .config_cluster import * + HELPERS_DIR = p.dirname(__file__) CLICKHOUSE_ROOT_DIR = p.join(p.dirname(__file__), "../../..") LOCAL_DOCKER_COMPOSE_DIR = p.join( @@ -1657,8 +1659,8 @@ class ClickHouseCluster: while time.time() - start < timeout: try: conn = pymysql.connect( - user="root", - password="clickhouse", + user=mysql_user, + password=mysql_pass, host=self.mysql_ip, port=self.mysql_port, ) @@ -1679,8 +1681,8 @@ class ClickHouseCluster: while time.time() - start < timeout: try: conn = pymysql.connect( - user="root", - password="clickhouse", + user=mysql8_user, + password=mysql8_pass, host=self.mysql8_ip, port=self.mysql8_port, ) @@ -1704,8 +1706,8 @@ class ClickHouseCluster: try: for ip in [self.mysql2_ip, self.mysql3_ip, self.mysql4_ip]: conn = pymysql.connect( - user="root", - password="clickhouse", + user=mysql_user, + password=mysql_pass, host=ip, port=self.mysql_port, ) @@ -1728,9 +1730,9 @@ class ClickHouseCluster: self.postgres_conn = psycopg2.connect( host=self.postgres_ip, port=self.postgres_port, - database="postgres", - user="postgres", - password="mysecretpassword", + database=pg_db, + user=pg_user, + password=pg_pass, ) self.postgres_conn.set_isolation_level(ISOLATION_LEVEL_AUTOCOMMIT) self.postgres_conn.autocommit = True @@ -1752,9 +1754,9 @@ class ClickHouseCluster: self.postgres2_conn = psycopg2.connect( host=self.postgres2_ip, port=self.postgres_port, - database="postgres", - user="postgres", - password="mysecretpassword", + database=pg_db, + user=pg_user, + password=pg_pass, ) 
self.postgres2_conn.set_isolation_level(ISOLATION_LEVEL_AUTOCOMMIT) self.postgres2_conn.autocommit = True @@ -1768,9 +1770,9 @@ class ClickHouseCluster: self.postgres3_conn = psycopg2.connect( host=self.postgres3_ip, port=self.postgres_port, - database="postgres", - user="postgres", - password="mysecretpassword", + database=pg_db, + user=pg_user, + password=pg_pass, ) self.postgres3_conn.set_isolation_level(ISOLATION_LEVEL_AUTOCOMMIT) self.postgres3_conn.autocommit = True @@ -1784,9 +1786,9 @@ class ClickHouseCluster: self.postgres4_conn = psycopg2.connect( host=self.postgres4_ip, port=self.postgres_port, - database="postgres", - user="postgres", - password="mysecretpassword", + database=pg_db, + user=pg_user, + password=pg_pass, ) self.postgres4_conn.set_isolation_level(ISOLATION_LEVEL_AUTOCOMMIT) self.postgres4_conn.autocommit = True @@ -1938,7 +1940,7 @@ class ClickHouseCluster: def wait_mongo_to_start(self, timeout=30, secure=False): connection_str = "mongodb://{user}:{password}@{host}:{port}".format( - host="localhost", port=self.mongo_port, user="root", password="clickhouse" + host="localhost", port=self.mongo_port, user=mongo_user, password=mongo_pass ) if secure: connection_str += "/?tls=true&tlsAllowInvalidCertificates=true" @@ -1962,8 +1964,8 @@ class ClickHouseCluster: ) minio_client = Minio( f"{self.minio_ip}:{self.minio_port}", - access_key="minio", - secret_key="minio123", + access_key=minio_access_key, + secret_key=minio_secret_key, secure=secure, http_client=urllib3.PoolManager(cert_reqs="CERT_NONE"), ) # disable SSL check as we test ClickHouse and not Python library @@ -3481,16 +3483,16 @@ class ClickHouseInstance: "MySQL": { "DSN": "mysql_odbc", "Driver": "/usr/lib/x86_64-linux-gnu/odbc/libmyodbc.so", - "Database": "clickhouse", - "Uid": "root", - "Pwd": "clickhouse", + "Database": odbc_mysql_db, + "Uid": odbc_mysql_uid, + "Pwd": odbc_mysql_pass, "Server": self.cluster.mysql_host, }, "PostgreSQL": { "DSN": "postgresql_odbc", - "Database": "postgres", - "UserName": "postgres", - "Password": "mysecretpassword", + "Database": odbc_psql_db, + "UserName": odbc_psql_user, + "Password": odbc_psql_pass, "Port": str(self.cluster.postgres_port), "Servername": self.cluster.postgres_host, "Protocol": "9.3", diff --git a/tests/integration/helpers/config_cluster.py b/tests/integration/helpers/config_cluster.py index e69de29bb2d..cb4bc6286ff 100644 --- a/tests/integration/helpers/config_cluster.py +++ b/tests/integration/helpers/config_cluster.py @@ -0,0 +1,35 @@ + +# MYSQL CREDENTIALS +mysql_user = 'root' +mysql_pass = 'clickhouse' + + +# MYSQL8 CREDENTIALS +mysql8_user = 'root' +mysql8_pass = 'clickhouse' + +# POSTGRES CREDENTIALS +pg_user = 'postgres' +pg_pass = 'mysecretpassword' +pg_db = 'postgres' + + +# MINIO CREDENTIALS +minio_access_key="minio" +minio_secret_key="minio123" + +# MONGODB CREDENTIALS +mongo_user = 'root' +mongo_pass = 'clickhouse' + +# ODBC CREDENTIALS +odbc_mysql_uid = 'root' +odbc_mysql_pass = 'clickhouse' +odbc_mysql_db = 'clickhouse' + +odbc_psql_db = 'postgres' +odbc_psql_user = 'postgres' +odbc_psql_pass = 'mysecretpassword' + + + From 4d6c030d235f6480a2c978bf7dcc16867d6b2cce Mon Sep 17 00:00:00 2001 From: Alexey Milovidov Date: Mon, 4 Apr 2022 23:41:42 +0300 Subject: [PATCH 165/239] Revert "clang-tidy report issues with Medium priority" --- src/IO/ReadBufferFromFileDescriptor.h | 7 ------- src/Interpreters/SystemLog.cpp | 2 +- src/Storages/ColumnsDescription.cpp | 2 +- src/Storages/StorageDistributed.h | 2 +- 4 files changed, 3 insertions(+), 10 
deletions(-) diff --git a/src/IO/ReadBufferFromFileDescriptor.h b/src/IO/ReadBufferFromFileDescriptor.h index 000f4d371eb..ba1502fb9aa 100644 --- a/src/IO/ReadBufferFromFileDescriptor.h +++ b/src/IO/ReadBufferFromFileDescriptor.h @@ -39,10 +39,6 @@ public: { } - virtual ~ReadBufferFromFileDescriptor() override - { - } - int getFD() const { return fd; @@ -84,9 +80,6 @@ public: { use_pread = true; } - virtual ~ReadBufferFromFileDescriptorPRead() override - { - } }; } diff --git a/src/Interpreters/SystemLog.cpp b/src/Interpreters/SystemLog.cpp index 59533e5a586..3b4d665e41b 100644 --- a/src/Interpreters/SystemLog.cpp +++ b/src/Interpreters/SystemLog.cpp @@ -379,7 +379,7 @@ void SystemLog::flushImpl(const std::vector & to_flush, for (const auto & name_and_type : log_element_names_and_types) log_element_columns.emplace_back(name_and_type.type, name_and_type.name); - Block block(log_element_columns); + Block block(std::move(log_element_columns)); MutableColumns columns = block.mutateColumns(); for (const auto & elem : to_flush) diff --git a/src/Storages/ColumnsDescription.cpp b/src/Storages/ColumnsDescription.cpp index f3a939614c1..1264da77b04 100644 --- a/src/Storages/ColumnsDescription.cpp +++ b/src/Storages/ColumnsDescription.cpp @@ -122,7 +122,7 @@ void ColumnDescription::readText(ReadBuffer & buf) if (col_ast->default_expression) { default_desc.kind = columnDefaultKindFromString(col_ast->default_specifier); - default_desc.expression = col_ast->default_expression; + default_desc.expression = std::move(col_ast->default_expression); } if (col_ast->comment) diff --git a/src/Storages/StorageDistributed.h b/src/Storages/StorageDistributed.h index b6d738fb61e..317463783ee 100644 --- a/src/Storages/StorageDistributed.h +++ b/src/Storages/StorageDistributed.h @@ -44,7 +44,7 @@ class StorageDistributed final : public shared_ptr_helper, p friend class StorageSystemDistributionQueue; public: - virtual ~StorageDistributed() override; + ~StorageDistributed() override; std::string getName() const override { return "Distributed"; } From 808d9afd0f8110faba5ae027051bf0a64e506da3 Mon Sep 17 00:00:00 2001 From: larryluogit Date: Mon, 4 Apr 2022 16:47:14 -0400 Subject: [PATCH 166/239] Fix optin.cplusplus.UninitializedObject issue (#35626) * Fix optin.cplusplus.UninitializedObject issue * Enable optin.cplusplus.UninitializedObject --- .clang-tidy | 1 + src/Common/ColumnsHashingImpl.h | 2 +- src/Common/CompactArray.h | 3 +++ src/Common/JSONBuilder.h | 6 +++--- 4 files changed, 8 insertions(+), 4 deletions(-) diff --git a/.clang-tidy b/.clang-tidy index 687b3741b1c..2a9cba30a85 100644 --- a/.clang-tidy +++ b/.clang-tidy @@ -144,6 +144,7 @@ Checks: '-*, clang-analyzer-cplusplus.SelfAssignment, clang-analyzer-deadcode.DeadStores, clang-analyzer-cplusplus.Move, + clang-analyzer-optin.cplusplus.UninitializedObject, clang-analyzer-optin.cplusplus.VirtualCall, clang-analyzer-security.insecureAPI.UncheckedReturn, clang-analyzer-security.insecureAPI.bcmp, diff --git a/src/Common/ColumnsHashingImpl.h b/src/Common/ColumnsHashingImpl.h index f5a732b275f..7b0650487f5 100644 --- a/src/Common/ColumnsHashingImpl.h +++ b/src/Common/ColumnsHashingImpl.h @@ -125,7 +125,7 @@ class FindResultImpl : public FindResultImplBase, public FindResultImplOffsetBas public: FindResultImpl() - : FindResultImplBase(false), FindResultImplOffsetBase(0) + : FindResultImplBase(false), FindResultImplOffsetBase(0) // NOLINT(clang-analyzer-optin.cplusplus.UninitializedObject) intentionally allow uninitialized value here {} FindResultImpl(Mapped * 
value_, bool found_, size_t off) diff --git a/src/Common/CompactArray.h b/src/Common/CompactArray.h index 629fa08aaaa..cf97206edb8 100644 --- a/src/Common/CompactArray.h +++ b/src/Common/CompactArray.h @@ -214,6 +214,9 @@ private: /// offset in bits to the next to the rightmost bit at that byte; or zero if the rightmost bit is the rightmost bit in that byte. offset_r = (l + content_width) % 8; + + content_l = nullptr; + content_r = nullptr; } UInt8 ALWAYS_INLINE read(UInt8 value_l) const diff --git a/src/Common/JSONBuilder.h b/src/Common/JSONBuilder.h index 9a218fcf08b..38d19da011d 100644 --- a/src/Common/JSONBuilder.h +++ b/src/Common/JSONBuilder.h @@ -61,7 +61,7 @@ private: class JSONBool : public IItem { public: - explicit JSONBool(bool value_) : value(std::move(value_)) {} + explicit JSONBool(bool value_) : value(value_) {} void format(const FormatSettings & settings, FormatContext & context) override; private: @@ -74,7 +74,7 @@ public: void add(ItemPtr value) { values.push_back(std::move(value)); } void add(std::string value) { add(std::make_unique(std::move(value))); } void add(const char * value) { add(std::make_unique(value)); } - void add(bool value) { add(std::make_unique(std::move(value))); } + void add(bool value) { add(std::make_unique(value)); } template requires std::is_arithmetic_v @@ -99,7 +99,7 @@ public: void add(std::string key, std::string value) { add(std::move(key), std::make_unique(std::move(value))); } void add(std::string key, const char * value) { add(std::move(key), std::make_unique(value)); } void add(std::string key, std::string_view value) { add(std::move(key), std::make_unique(value)); } - void add(std::string key, bool value) { add(std::move(key), std::make_unique(std::move(value))); } + void add(std::string key, bool value) { add(std::move(key), std::make_unique(value)); } template requires std::is_arithmetic_v From e78ff3ea7b0759ba80f9c509204a2e93462076f3 Mon Sep 17 00:00:00 2001 From: snyk-bot Date: Mon, 4 Apr 2022 21:20:18 +0000 Subject: [PATCH 167/239] fix: docker/test/integration/mysql_js_client/Dockerfile to reduce vulnerabilities The following vulnerabilities are fixed with an upgrade: - https://snyk.io/vuln/SNYK-DEBIAN9-PYTHON27-341379 - https://snyk.io/vuln/SNYK-DEBIAN9-PYTHON35-1063181 - https://snyk.io/vuln/SNYK-DEBIAN9-PYTHON35-340072 - https://snyk.io/vuln/SNYK-DEBIAN9-PYTHON35-453739 - https://snyk.io/vuln/SNYK-DEBIAN9-PYTHON35-584435 --- docker/test/integration/mysql_js_client/Dockerfile | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/docker/test/integration/mysql_js_client/Dockerfile b/docker/test/integration/mysql_js_client/Dockerfile index b1397b40d38..14e0a20e87f 100644 --- a/docker/test/integration/mysql_js_client/Dockerfile +++ b/docker/test/integration/mysql_js_client/Dockerfile @@ -1,7 +1,7 @@ # docker build -t clickhouse/mysql-js-client . # MySQL JavaScript client docker container -FROM node:8 +FROM node:16.14.2 RUN npm install mysql From 6c7b8a0fd4d6a587cbed3a3d520a02133f54ddb2 Mon Sep 17 00:00:00 2001 From: "Mikhail f. 
Shiryaev" Date: Mon, 4 Apr 2022 23:34:33 +0200 Subject: [PATCH 168/239] Update unixodbc to mitigate CVE-2018-7485 --- contrib/unixodbc | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/contrib/unixodbc b/contrib/unixodbc index b0ad30f7f62..a2cd5395e8c 160000 --- a/contrib/unixodbc +++ b/contrib/unixodbc @@ -1 +1 @@ -Subproject commit b0ad30f7f6289c12b76f04bfb9d466374bb32168 +Subproject commit a2cd5395e8c7f7390025ec93af5bfebef3fb5fcd From 4d7618585b7a5235e820663556932e004ee601ba Mon Sep 17 00:00:00 2001 From: "Mikhail f. Shiryaev" Date: Tue, 5 Apr 2022 00:10:33 +0200 Subject: [PATCH 169/239] Fix WORKDIR issue after upgrading npm --- docker/test/integration/mysql_js_client/Dockerfile | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/docker/test/integration/mysql_js_client/Dockerfile b/docker/test/integration/mysql_js_client/Dockerfile index 14e0a20e87f..4c9df10ace1 100644 --- a/docker/test/integration/mysql_js_client/Dockerfile +++ b/docker/test/integration/mysql_js_client/Dockerfile @@ -3,6 +3,8 @@ FROM node:16.14.2 +WORKDIR /usr/app + RUN npm install mysql -COPY ./test.js test.js +COPY ./test.js ./test.js From 61183ac07b619044c5821a5794bcd903e75f0e60 Mon Sep 17 00:00:00 2001 From: Nikita Mikhaylov Date: Mon, 4 Apr 2022 22:24:39 +0000 Subject: [PATCH 170/239] Done --- docker/keeper/Dockerfile | 2 ++ 1 file changed, 2 insertions(+) diff --git a/docker/keeper/Dockerfile b/docker/keeper/Dockerfile index 207dddce1bb..068377e8f8c 100644 --- a/docker/keeper/Dockerfile +++ b/docker/keeper/Dockerfile @@ -20,6 +20,8 @@ ENV LANG=en_US.UTF-8 \ COPY --from=glibc-donor /lib/linux-gnu/libc.so.6 /lib/linux-gnu/libdl.so.2 /lib/linux-gnu/libm.so.6 /lib/linux-gnu/libpthread.so.0 /lib/linux-gnu/librt.so.1 /lib/linux-gnu/libnss_dns.so.2 /lib/linux-gnu/libnss_files.so.2 /lib/linux-gnu/libresolv.so.2 /lib/linux-gnu/ld-2.31.so /lib/ COPY --from=glibc-donor /etc/nsswitch.conf /etc/ COPY entrypoint.sh /entrypoint.sh + +ARG TARGETARCH RUN arch=${TARGETARCH:-amd64} \ && case $arch in \ amd64) mkdir -p /lib64 && ln -sf /lib/ld-2.31.so /lib64/ld-linux-x86-64.so.2 ;; \ From f408c86a13812f00c5ebf121879ea9ce09a23872 Mon Sep 17 00:00:00 2001 From: kssenii Date: Tue, 5 Apr 2022 00:42:29 +0200 Subject: [PATCH 171/239] Use FATAL logs level as default for clickhouse-test --- tests/clickhouse-test | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/tests/clickhouse-test b/tests/clickhouse-test index de36fc3da27..f925fddcd1a 100755 --- a/tests/clickhouse-test +++ b/tests/clickhouse-test @@ -1140,7 +1140,7 @@ def run_tests_array(all_tests_with_params): sys.stdout.flush() -server_logs_level = "warning" +server_logs_level = "fatal" def check_server_started(args): From 4e9ec5dc2f7c26ee0a22655acfb195e94caa3c33 Mon Sep 17 00:00:00 2001 From: Alexander Tokmakov Date: Tue, 5 Apr 2022 00:51:48 +0200 Subject: [PATCH 172/239] make some replicated DDL faster --- src/Common/ZooKeeper/ZooKeeper.cpp | 51 +++++- src/Common/ZooKeeper/ZooKeeper.h | 5 +- src/Databases/DatabaseReplicated.cpp | 1 + src/Databases/DatabaseReplicatedWorker.cpp | 27 +++- src/Databases/DatabaseReplicatedWorker.h | 2 + src/Interpreters/DDLWorker.cpp | 2 +- src/Storages/StorageReplicatedMergeTree.cpp | 166 +++++++++++++++----- 7 files changed, 200 insertions(+), 54 deletions(-) diff --git a/src/Common/ZooKeeper/ZooKeeper.cpp b/src/Common/ZooKeeper/ZooKeeper.cpp index 118789c0ffc..aae3b6d4191 100644 --- a/src/Common/ZooKeeper/ZooKeeper.cpp +++ b/src/Common/ZooKeeper/ZooKeeper.cpp @@ -701,24 +701,34 @@ void 
ZooKeeper::removeChildrenRecursive(const std::string & path, const String & } } -void ZooKeeper::tryRemoveChildrenRecursive(const std::string & path, const String & keep_child_node) +bool ZooKeeper::tryRemoveChildrenRecursive(const std::string & path, bool probably_flat, const String & keep_child_node) { Strings children; if (tryGetChildren(path, children) != Coordination::Error::ZOK) - return; + return false; + + bool removed_as_expected = true; while (!children.empty()) { Coordination::Requests ops; Strings batch; + ops.reserve(MULTI_BATCH_SIZE); + batch.reserve(MULTI_BATCH_SIZE); for (size_t i = 0; i < MULTI_BATCH_SIZE && !children.empty(); ++i) { String child_path = fs::path(path) / children.back(); - tryRemoveChildrenRecursive(child_path); + + /// Will try to avoid recursive getChildren calls if child_path probably has no children. + /// It may be extremely slow when path contain a lot of leaf children. + if (!probably_flat) + tryRemoveChildrenRecursive(child_path); + if (likely(keep_child_node.empty() || keep_child_node != children.back())) { batch.push_back(child_path); ops.emplace_back(zkutil::makeRemoveRequest(child_path, -1)); } + children.pop_back(); } @@ -726,10 +736,39 @@ void ZooKeeper::tryRemoveChildrenRecursive(const std::string & path, const Strin /// this means someone is concurrently removing these children and we will have /// to remove them one by one. Coordination::Responses responses; - if (tryMulti(ops, responses) != Coordination::Error::ZOK) - for (const std::string & child : batch) - tryRemove(child); + if (tryMulti(ops, responses) == Coordination::Error::ZOK) + continue; + + removed_as_expected = false; + + std::vector futures; + futures.reserve(batch.size()); + for (const std::string & child : batch) + futures.push_back(asyncTryRemoveNoThrow(child, -1)); + + for (size_t i = 0; i < batch.size(); ++i) + { + auto res = futures[i].get(); + if (res.error == Coordination::Error::ZOK) + continue; + if (res.error == Coordination::Error::ZNONODE) + continue; + + if (res.error == Coordination::Error::ZNOTEMPTY) + { + if (probably_flat) + { + /// It actually has children, let's remove them + tryRemoveChildrenRecursive(batch[i]); + tryRemove(batch[i]); + } + continue; + } + + throw KeeperException(res.error, batch[i]); + } } + return removed_as_expected; } void ZooKeeper::removeRecursive(const std::string & path) diff --git a/src/Common/ZooKeeper/ZooKeeper.h b/src/Common/ZooKeeper/ZooKeeper.h index f901a79591f..0f7eccd2547 100644 --- a/src/Common/ZooKeeper/ZooKeeper.h +++ b/src/Common/ZooKeeper/ZooKeeper.h @@ -225,7 +225,10 @@ public: /// If keep_child_node is not empty, this method will not remove path/keep_child_node (but will remove its subtree). /// It can be useful to keep some child node as a flag which indicates that path is currently removing. void removeChildrenRecursive(const std::string & path, const String & keep_child_node = {}); - void tryRemoveChildrenRecursive(const std::string & path, const String & keep_child_node = {}); + /// If probably_flat is true, this method will optimistically try to remove children non-recursive + /// and will fall back to recursive removal if it gets ZNOTEMPTY for some child. + /// Returns true if no kind of fallback happened. + bool tryRemoveChildrenRecursive(const std::string & path, bool probably_flat = false, const String & keep_child_node = {}); /// Remove all children nodes (non recursive). 
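    /// (In contrast to tryRemoveChildrenRecursive above, this is expected to fail with ZNOTEMPTY
    /// if some child has children of its own, rather than recurse or fall back.)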
void removeChildren(const std::string & path); diff --git a/src/Databases/DatabaseReplicated.cpp b/src/Databases/DatabaseReplicated.cpp index 0c3cc56c061..2337a063a5e 100644 --- a/src/Databases/DatabaseReplicated.cpp +++ b/src/Databases/DatabaseReplicated.cpp @@ -675,6 +675,7 @@ void DatabaseReplicated::recoverLostReplica(const ZooKeeperPtr & current_zookeep } } current_zookeeper->set(replica_path + "/log_ptr", toString(max_log_ptr)); + ddl_worker->updateLogPointer(DDLTaskBase::getLogEntryName(max_log_ptr)); } std::map DatabaseReplicated::tryGetConsistentMetadataSnapshot(const ZooKeeperPtr & zookeeper, UInt32 & max_log_ptr) diff --git a/src/Databases/DatabaseReplicatedWorker.cpp b/src/Databases/DatabaseReplicatedWorker.cpp index b45cfb16362..e9475b56377 100644 --- a/src/Databases/DatabaseReplicatedWorker.cpp +++ b/src/Databases/DatabaseReplicatedWorker.cpp @@ -66,9 +66,14 @@ void DatabaseReplicatedDDLWorker::initializeReplication() UInt32 max_log_ptr = parse(zookeeper->get(database->zookeeper_path + "/max_log_ptr")); logs_to_keep = parse(zookeeper->get(database->zookeeper_path + "/logs_to_keep")); if (our_log_ptr == 0 || our_log_ptr + logs_to_keep < max_log_ptr) + { database->recoverLostReplica(zookeeper, our_log_ptr, max_log_ptr); + } else + { last_skipped_entry_name.emplace(DDLTaskBase::getLogEntryName(our_log_ptr)); + updateLogPointer(DDLTaskBase::getLogEntryName(our_log_ptr)); + } } String DatabaseReplicatedDDLWorker::enqueueQuery(DDLLogEntry & entry) @@ -140,10 +145,10 @@ String DatabaseReplicatedDDLWorker::tryEnqueueAndExecuteEntry(DDLLogEntry & entr /// but it requires more complex logic around /try node. auto zookeeper = getAndSetZooKeeper(); - UInt32 our_log_ptr = parse(zookeeper->get(database->replica_path + "/log_ptr")); + UInt32 our_log_ptr = getLogPointer(); UInt32 max_log_ptr = parse(zookeeper->get(database->zookeeper_path + "/max_log_ptr")); - assert(our_log_ptr <= max_log_ptr); - if (database->db_settings.max_replication_lag_to_enqueue < max_log_ptr - our_log_ptr) + + if (our_log_ptr + database->db_settings.max_replication_lag_to_enqueue < max_log_ptr) throw Exception(ErrorCodes::NOT_A_LEADER, "Cannot enqueue query on this replica, " "because it has replication lag of {} queries. 
Try other replica.", max_log_ptr - our_log_ptr); @@ -203,7 +208,7 @@ DDLTaskPtr DatabaseReplicatedDDLWorker::initAndCheckTask(const String & entry_na } } - UInt32 our_log_ptr = parse(zookeeper->get(fs::path(database->replica_path) / "log_ptr")); + UInt32 our_log_ptr = getLogPointer(); UInt32 entry_num = DatabaseReplicatedTask::getLogEntryNumber(entry_name); if (entry_num <= our_log_ptr) @@ -308,4 +313,18 @@ bool DatabaseReplicatedDDLWorker::canRemoveQueueEntry(const String & entry_name, return entry_number + logs_to_keep < max_log_ptr; } +void DatabaseReplicatedDDLWorker::updateLogPointer(const String & processed_entry_name) +{ + updateMaxDDLEntryID(processed_entry_name); + assert(max_id.load() == parse(getAndSetZooKeeper()->get(fs::path(database->replica_path) / "log_ptr"))); +} + +UInt32 DatabaseReplicatedDDLWorker::getLogPointer() const +{ + /// NOTE it main not be equal to the log_ptr in zk: + /// - max_id can be equal to log_ptr - 1 due to race condition (when it's updated in zk, but not updated in memory yet) + /// - max_id can be greater than log_ptr, because log_ptr is not updated for failed and dummy entries + return max_id.load(); +} + } diff --git a/src/Databases/DatabaseReplicatedWorker.h b/src/Databases/DatabaseReplicatedWorker.h index 6b957e567ff..e23be472c54 100644 --- a/src/Databases/DatabaseReplicatedWorker.h +++ b/src/Databases/DatabaseReplicatedWorker.h @@ -32,6 +32,8 @@ public: static String enqueueQueryImpl(const ZooKeeperPtr & zookeeper, DDLLogEntry & entry, DatabaseReplicated * const database, bool committed = false); /// NOLINT + void updateLogPointer(const String & processed_entry_name); + UInt32 getLogPointer() const; private: bool initializeMainThread() override; void initializeReplication(); diff --git a/src/Interpreters/DDLWorker.cpp b/src/Interpreters/DDLWorker.cpp index 4d2cdf7dd2c..ba5de0c6668 100644 --- a/src/Interpreters/DDLWorker.cpp +++ b/src/Interpreters/DDLWorker.cpp @@ -876,7 +876,7 @@ void DDLWorker::cleanupQueue(Int64, const ZooKeeperPtr & zookeeper) /// We recursively delete all nodes except node_path/finished to prevent staled hosts from /// creating node_path/active node (see createStatusDirs(...)) - zookeeper->tryRemoveChildrenRecursive(node_path, "finished"); + zookeeper->tryRemoveChildrenRecursive(node_path, /* probably_flat */ false, "finished"); /// And then we remove node_path and node_path/finished in a single transaction Coordination::Requests ops; diff --git a/src/Storages/StorageReplicatedMergeTree.cpp b/src/Storages/StorageReplicatedMergeTree.cpp index d9f72cf7feb..7f32d85c4f5 100644 --- a/src/Storages/StorageReplicatedMergeTree.cpp +++ b/src/Storages/StorageReplicatedMergeTree.cpp @@ -318,19 +318,22 @@ StorageReplicatedMergeTree::StorageReplicatedMergeTree( } bool skip_sanity_checks = false; - - if (current_zookeeper && current_zookeeper->exists(replica_path + "/flags/force_restore_data")) + /// It does not make sense for CREATE query + if (attach) { - skip_sanity_checks = true; - current_zookeeper->remove(replica_path + "/flags/force_restore_data"); + if (current_zookeeper && current_zookeeper->exists(replica_path + "/flags/force_restore_data")) + { + skip_sanity_checks = true; + current_zookeeper->remove(replica_path + "/flags/force_restore_data"); - LOG_WARNING(log, "Skipping the limits on severity of changes to data parts and columns (flag {}/flags/force_restore_data).", replica_path); - } - else if (has_force_restore_data_flag) - { - skip_sanity_checks = true; + LOG_WARNING(log, "Skipping the limits on severity of changes to data 
parts and columns (flag {}/flags/force_restore_data).", replica_path); + } + else if (has_force_restore_data_flag) + { + skip_sanity_checks = true; - LOG_WARNING(log, "Skipping the limits on severity of changes to data parts and columns (flag force_restore_data)."); + LOG_WARNING(log, "Skipping the limits on severity of changes to data parts and columns (flag force_restore_data)."); + } } loadDataParts(skip_sanity_checks); @@ -568,35 +571,31 @@ void StorageReplicatedMergeTree::createNewZooKeeperNodes() { auto zookeeper = getZooKeeper(); - /// Working with quorum. - zookeeper->createIfNotExists(zookeeper_path + "/quorum", String()); - zookeeper->createIfNotExists(zookeeper_path + "/quorum/parallel", String()); - zookeeper->createIfNotExists(zookeeper_path + "/quorum/last_part", String()); - zookeeper->createIfNotExists(zookeeper_path + "/quorum/failed_parts", String()); - - /// Tracking lag of replicas. - zookeeper->createIfNotExists(replica_path + "/min_unprocessed_insert_time", String()); - zookeeper->createIfNotExists(replica_path + "/max_processed_insert_time", String()); - - /// Mutations - zookeeper->createIfNotExists(zookeeper_path + "/mutations", String()); - zookeeper->createIfNotExists(replica_path + "/mutation_pointer", String()); + std::vector futures; + futures.push_back(zookeeper->asyncTryCreateNoThrow(zookeeper_path + "/quorum/parallel", String(), zkutil::CreateMode::Persistent)); /// Nodes for remote fs zero-copy replication const auto settings = getSettings(); if (settings->allow_remote_fs_zero_copy_replication) { - zookeeper->createIfNotExists(zookeeper_path + "/zero_copy_s3", String()); - zookeeper->createIfNotExists(zookeeper_path + "/zero_copy_s3/shared", String()); - zookeeper->createIfNotExists(zookeeper_path + "/zero_copy_hdfs", String()); - zookeeper->createIfNotExists(zookeeper_path + "/zero_copy_hdfs/shared", String()); + futures.push_back(zookeeper->asyncTryCreateNoThrow(zookeeper_path + "/zero_copy_s3", String(), zkutil::CreateMode::Persistent)); + futures.push_back(zookeeper->asyncTryCreateNoThrow(zookeeper_path + "/zero_copy_s3/shared", String(), zkutil::CreateMode::Persistent)); + futures.push_back(zookeeper->asyncTryCreateNoThrow(zookeeper_path + "/zero_copy_hdfs", String(), zkutil::CreateMode::Persistent)); + futures.push_back(zookeeper->asyncTryCreateNoThrow(zookeeper_path + "/zero_copy_hdfs/shared", String(), zkutil::CreateMode::Persistent)); } /// Part movement. 
- zookeeper->createIfNotExists(zookeeper_path + "/part_moves_shard", String()); - zookeeper->createIfNotExists(zookeeper_path + "/pinned_part_uuids", getPinnedPartUUIDs()->toString()); + futures.push_back(zookeeper->asyncTryCreateNoThrow(zookeeper_path + "/part_moves_shard", String(), zkutil::CreateMode::Persistent)); + futures.push_back(zookeeper->asyncTryCreateNoThrow(zookeeper_path + "/pinned_part_uuids", getPinnedPartUUIDs()->toString(), zkutil::CreateMode::Persistent)); /// For ALTER PARTITION with multi-leaders - zookeeper->createIfNotExists(zookeeper_path + "/alter_partition_version", String()); + futures.push_back(zookeeper->asyncTryCreateNoThrow(zookeeper_path + "/alter_partition_version", String(), zkutil::CreateMode::Persistent)); + + for (auto & future : futures) + { + auto res = future.get(); + if (res.error != Coordination::Error::ZOK && res.error != Coordination::Error::ZNODEEXISTS) + throw Coordination::Exception(fmt::format("Failed to create new nodes at {}", zookeeper_path), res.error); + } } @@ -671,6 +670,16 @@ bool StorageReplicatedMergeTree::createTableIfNotExists(const StorageMetadataPtr ops.emplace_back(zkutil::makeCreateRequest(zookeeper_path + "/replicas", "last added replica: " + replica_name, zkutil::CreateMode::Persistent)); + /// The following 4 nodes were added in version 1.1.xxx, so we create them here, not in createNewZooKeeperNodes() + ops.emplace_back(zkutil::makeCreateRequest(zookeeper_path + "/quorum", "", + zkutil::CreateMode::Persistent)); + ops.emplace_back(zkutil::makeCreateRequest(zookeeper_path + "/quorum/last_part", "", + zkutil::CreateMode::Persistent)); + ops.emplace_back(zkutil::makeCreateRequest(zookeeper_path + "/quorum/failed_parts", "", + zkutil::CreateMode::Persistent)); + ops.emplace_back(zkutil::makeCreateRequest(zookeeper_path + "/mutations", "", + zkutil::CreateMode::Persistent)); + /// And create first replica atomically. See also "createReplica" method that is used to create not the first replicas. 
ops.emplace_back(zkutil::makeCreateRequest(replica_path, "", @@ -694,6 +703,14 @@ bool StorageReplicatedMergeTree::createTableIfNotExists(const StorageMetadataPtr ops.emplace_back(zkutil::makeCreateRequest(replica_path + "/metadata_version", std::to_string(metadata_version), zkutil::CreateMode::Persistent)); + /// The following 3 nodes were added in version 1.1.xxx, so we create them here, not in createNewZooKeeperNodes() + ops.emplace_back(zkutil::makeCreateRequest(replica_path + "/min_unprocessed_insert_time", "", + zkutil::CreateMode::Persistent)); + ops.emplace_back(zkutil::makeCreateRequest(replica_path + "/max_processed_insert_time", "", + zkutil::CreateMode::Persistent)); + ops.emplace_back(zkutil::makeCreateRequest(replica_path + "/mutation_pointer", "", + zkutil::CreateMode::Persistent)); + Coordination::Responses responses; auto code = zookeeper->tryMulti(ops, responses); if (code == Coordination::Error::ZNODEEXISTS) @@ -759,6 +776,14 @@ void StorageReplicatedMergeTree::createReplica(const StorageMetadataPtr & metada ops.emplace_back(zkutil::makeCreateRequest(replica_path + "/metadata_version", std::to_string(metadata_version), zkutil::CreateMode::Persistent)); + /// The following 3 nodes were added in version 1.1.xxx, so we create them here, not in createNewZooKeeperNodes() + ops.emplace_back(zkutil::makeCreateRequest(replica_path + "/min_unprocessed_insert_time", "", + zkutil::CreateMode::Persistent)); + ops.emplace_back(zkutil::makeCreateRequest(replica_path + "/max_processed_insert_time", "", + zkutil::CreateMode::Persistent)); + ops.emplace_back(zkutil::makeCreateRequest(replica_path + "/mutation_pointer", "", + zkutil::CreateMode::Persistent)); + /// Check version of /replicas to see if there are any replicas created at the same moment of time. ops.emplace_back(zkutil::makeSetRequest(zookeeper_path + "/replicas", "last added replica: " + replica_name, replicas_stat.version)); @@ -840,18 +865,42 @@ void StorageReplicatedMergeTree::dropReplica(zkutil::ZooKeeperPtr zookeeper, con /// "The local set of parts of table X doesn't look like the set of parts in ZooKeeper" /// { - Strings children = zookeeper->getChildren(remote_replica_path); + /// Remove metadata first + [[maybe_unused]] auto code = zookeeper->tryRemove(fs::path(remote_replica_path) / "metadata"); + assert(code == Coordination::Error::ZOK || code == Coordination::Error::ZNONODE); - if (std::find(children.begin(), children.end(), "metadata") != children.end()) - zookeeper->remove(fs::path(remote_replica_path) / "metadata"); - - for (const auto & child : children) + /// Then try to remove paths that are known to be flat (all children are leafs) + Strings flat_nodes = {"flags", "parts", "queue"}; + for (const auto & node : flat_nodes) { - if (child != "metadata") - zookeeper->removeRecursive(fs::path(remote_replica_path) / child); + bool removed_quickly = zookeeper->tryRemoveChildrenRecursive(fs::path(zookeeper_path) / node, /* probably flat */ true); + if (!removed_quickly) + LOG_WARNING(logger, "Cannot quickly remove node {} and its children (replica: {}). 
Will remove recursively.", + node, remote_replica_path); } - zookeeper->remove(remote_replica_path); + /// Then try to remove nodes that are known to have no children (and should always exist) + Coordination::Requests ops; + for (const auto & node : flat_nodes) + ops.emplace_back(zkutil::makeRemoveRequest(remote_replica_path + "/" + node, -1)); + + ops.emplace_back(zkutil::makeRemoveRequest(remote_replica_path + "/columns", -1)); + ops.emplace_back(zkutil::makeRemoveRequest(remote_replica_path + "/host", -1)); + ops.emplace_back(zkutil::makeRemoveRequest(remote_replica_path + "/is_lost", -1)); + ops.emplace_back(zkutil::makeRemoveRequest(remote_replica_path + "/log_pointer", -1)); + ops.emplace_back(zkutil::makeRemoveRequest(remote_replica_path + "/max_processed_insert_time", -1)); + ops.emplace_back(zkutil::makeRemoveRequest(remote_replica_path + "/min_unprocessed_insert_time", -1)); + ops.emplace_back(zkutil::makeRemoveRequest(remote_replica_path + "/metadata_version", -1)); + ops.emplace_back(zkutil::makeRemoveRequest(remote_replica_path + "/mutation_pointer", -1)); + Coordination::Responses res; + code = zookeeper->tryMulti(ops, res); + if (code != Coordination::Error::ZOK) + LOG_WARNING(logger, "Cannot quickly remove nodes without children: {} (replica: {}). Will remove recursively.", + Coordination::errorMessage(code), remote_replica_path); + + + /// And finally remove everything else recursively + zookeeper->tryRemoveRecursive(remote_replica_path); } /// It may left some garbage if replica_path subtree are concurrently modified @@ -911,17 +960,50 @@ bool StorageReplicatedMergeTree::removeTableNodesFromZooKeeper(zkutil::ZooKeeper const String & zookeeper_path, const zkutil::EphemeralNodeHolder::Ptr & metadata_drop_lock, Poco::Logger * logger) { bool completely_removed = false; + + Strings flat_nodes = {"block_numbers", "blocks", "leader_election", "log", "mutations", "pinned_part_uuids", "log"}; + + /// First try to remove paths that are known to be flat + for (const auto & node : flat_nodes) + { + bool removed_quickly = zookeeper->tryRemoveChildrenRecursive(fs::path(zookeeper_path) / node, /* probably flat */ true); + if (!removed_quickly) + LOG_WARNING(logger, "Cannot quickly remove node {} and its children (table: {}). 
Will remove recursively.", + node, zookeeper_path); + } + + /// Then try to remove nodes that are known to have no children (and should always exist) + Coordination::Requests ops; + for (const auto & node : flat_nodes) + ops.emplace_back(zkutil::makeRemoveRequest(zookeeper_path + "/" + node, -1)); + + ops.emplace_back(zkutil::makeRemoveRequest(zookeeper_path + "/alter_partition_version", -1)); + ops.emplace_back(zkutil::makeRemoveRequest(zookeeper_path + "/columns", -1)); + ops.emplace_back(zkutil::makeRemoveRequest(zookeeper_path + "/metadata", -1)); + ops.emplace_back(zkutil::makeRemoveRequest(zookeeper_path + "/table_shared_id", -1)); + ops.emplace_back(zkutil::makeRemoveRequest(zookeeper_path + "/max_processed_insert_time", -1)); + ops.emplace_back(zkutil::makeRemoveRequest(zookeeper_path + "/min_unprocessed_insert_time", -1)); + ops.emplace_back(zkutil::makeRemoveRequest(zookeeper_path + "/metadata_version", -1)); + ops.emplace_back(zkutil::makeRemoveRequest(zookeeper_path + "/mutation_pointer", -1)); + ops.emplace_back(zkutil::makeRemoveRequest(zookeeper_path + "/table_shared_id", -1)); + Coordination::Responses res; + auto code = zookeeper->tryMulti(ops, res); + if (code != Coordination::Error::ZOK) + LOG_WARNING(logger, "Cannot quickly remove nodes without children: {} (table: {}). Will remove recursively.", + Coordination::errorMessage(code), zookeeper_path); + Strings children; - Coordination::Error code = zookeeper->tryGetChildren(zookeeper_path, children); + code = zookeeper->tryGetChildren(zookeeper_path, children); if (code == Coordination::Error::ZNONODE) throw Exception(ErrorCodes::LOGICAL_ERROR, "There is a race condition between creation and removal of replicated table. It's a bug"); - for (const auto & child : children) + { if (child != "dropped") zookeeper->tryRemoveRecursive(fs::path(zookeeper_path) / child); + } - Coordination::Requests ops; + ops.clear(); Coordination::Responses responses; ops.emplace_back(zkutil::makeRemoveRequest(metadata_drop_lock->getPath(), -1)); ops.emplace_back(zkutil::makeRemoveRequest(fs::path(zookeeper_path) / "dropped", -1)); @@ -4327,7 +4409,7 @@ std::optional StorageReplicatedMergeTree::totalBytes(const Settings & se void StorageReplicatedMergeTree::assertNotReadonly() const { if (is_readonly) - throw Exception(ErrorCodes::TABLE_IS_READ_ONLY, "Table is in readonly mode (zookeeper path: {})", zookeeper_path); + throw Exception(ErrorCodes::TABLE_IS_READ_ONLY, "Table is in readonly mode (replica path: {})", replica_path); } From 1d60824d6af7415755992b3c54d4949424278938 Mon Sep 17 00:00:00 2001 From: "Mikhail f. Shiryaev" Date: Tue, 5 Apr 2022 00:52:37 +0200 Subject: [PATCH 173/239] Highlight headers in PR template --- .github/PULL_REQUEST_TEMPLATE.md | 4 ++-- tests/ci/run_check.py | 4 ++-- 2 files changed, 4 insertions(+), 4 deletions(-) diff --git a/.github/PULL_REQUEST_TEMPLATE.md b/.github/PULL_REQUEST_TEMPLATE.md index 6540b60476f..1b7498c3091 100644 --- a/.github/PULL_REQUEST_TEMPLATE.md +++ b/.github/PULL_REQUEST_TEMPLATE.md @@ -1,4 +1,4 @@ -Changelog category (leave one): +## Changelog category (leave one): - New Feature - Improvement - Bug Fix (user-visible misbehaviour in official stable or prestable release) @@ -9,7 +9,7 @@ Changelog category (leave one): - Not for changelog (changelog entry is not required) -Changelog entry (a user-readable short description of the changes that goes to CHANGELOG.md): +## Changelog entry (a user-readable short description of the changes that goes to CHANGELOG.md): ... 
diff --git a/tests/ci/run_check.py b/tests/ci/run_check.py index 93dc77124c2..bc818ffb6bf 100644 --- a/tests/ci/run_check.py +++ b/tests/ci/run_check.py @@ -162,7 +162,7 @@ def check_pr_description(pr_info): i = 0 while i < len(lines): - if re.match(r"(?i)^[>*_ ]*change\s*log\s*category", lines[i]): + if re.match(r"(?i)^[#>*_ ]*change\s*log\s*category", lines[i]): i += 1 if i >= len(lines): break @@ -191,7 +191,7 @@ def check_pr_description(pr_info): return result_status[:140], category elif re.match( - r"(?i)^[>*_ ]*(short\s*description|change\s*log\s*entry)", lines[i] + r"(?i)^[#>*_ ]*(short\s*description|change\s*log\s*entry)", lines[i] ): i += 1 # Can have one empty line between header and the entry itself. From a665861f5f06f81c337a7a7648847f23c336cb43 Mon Sep 17 00:00:00 2001 From: "Mikhail f. Shiryaev" Date: Tue, 5 Apr 2022 01:06:46 +0200 Subject: [PATCH 174/239] Improve descriptrion check logging --- tests/ci/run_check.py | 9 +++++++-- 1 file changed, 7 insertions(+), 2 deletions(-) diff --git a/tests/ci/run_check.py b/tests/ci/run_check.py index bc818ffb6bf..6f00232be77 100644 --- a/tests/ci/run_check.py +++ b/tests/ci/run_check.py @@ -262,9 +262,14 @@ if __name__ == "__main__": remove_labels(gh, pr_info, pr_labels_to_remove) if description_report: - print("::notice ::Cannot run, description does not match the template") + print( + "::error ::Cannot run, PR description does not match the template: " + f"{description_report}" + ) logging.info( - "PR body doesn't match the template: (start)\n%s\n(end)", pr_info.body + "PR body doesn't match the template: (start)\n%s\n(end)\n" "Reason: %s", + pr_info.body, + description_report, ) url = ( f"{GITHUB_SERVER_URL}/{GITHUB_REPOSITORY}/" From 163664fad776eb2fd5613dda2fffd390d585b458 Mon Sep 17 00:00:00 2001 From: Amos Bird Date: Thu, 31 Mar 2022 15:55:56 +0800 Subject: [PATCH 175/239] Improve minmax_count_projection --- src/Storages/MergeTree/MergeTreeData.cpp | 260 +++++++++++------- src/Storages/MergeTree/MergeTreeData.h | 1 + .../01710_minmax_count_projection.reference | 2 + .../01710_minmax_count_projection.sql | 12 + 4 files changed, 181 insertions(+), 94 deletions(-) diff --git a/src/Storages/MergeTree/MergeTreeData.cpp b/src/Storages/MergeTree/MergeTreeData.cpp index 533f44ac9cf..a902895fcf0 100644 --- a/src/Storages/MergeTree/MergeTreeData.cpp +++ b/src/Storages/MergeTree/MergeTreeData.cpp @@ -4641,6 +4641,7 @@ static void selectBestProjection( Block MergeTreeData::getMinMaxCountProjectionBlock( const StorageMetadataPtr & metadata_snapshot, const Names & required_columns, + bool has_filter, const SelectQueryInfo & query_info, const DataPartsVector & parts, DataPartsVector & normal_parts, @@ -4655,13 +4656,12 @@ Block MergeTreeData::getMinMaxCountProjectionBlock( auto block = metadata_snapshot->minmax_count_projection->sample_block.cloneEmpty(); bool need_primary_key_max_column = false; const auto & primary_key_max_column_name = metadata_snapshot->minmax_count_projection->primary_key_max_column_name; + NameSet required_columns_set(required_columns.begin(), required_columns.end()); if (!primary_key_max_column_name.empty()) - { - need_primary_key_max_column = std::any_of( - required_columns.begin(), required_columns.end(), [&](const auto & name) { return primary_key_max_column_name == name; }); - } + need_primary_key_max_column = required_columns_set.contains(primary_key_max_column_name); auto partition_minmax_count_columns = block.mutateColumns(); + auto partition_minmax_count_column_names = block.getNames(); auto insert = 
[](ColumnAggregateFunction & column, const Field & value) { auto func = column.getAggregateFunction(); @@ -4670,51 +4670,77 @@ Block MergeTreeData::getMinMaxCountProjectionBlock( size_t align_of_state = func->alignOfData(); auto * place = arena.alignedAlloc(size_of_state, align_of_state); func->create(place); - auto value_column = func->getReturnType()->createColumnConst(1, value)->convertToFullColumnIfConst(); - const auto * value_column_ptr = value_column.get(); - func->add(place, &value_column_ptr, 0, &arena); + if (const AggregateFunctionCount * agg_count = typeid_cast(func.get())) + agg_count->set(place, value.get()); + else + { + auto value_column = func->getReturnType()->createColumnConst(1, value)->convertToFullColumnIfConst(); + const auto * value_column_ptr = value_column.get(); + func->add(place, &value_column_ptr, 0, &arena); + } column.insertFrom(place); }; - ASTPtr expression_ast; - Block virtual_columns_block = getBlockWithVirtualPartColumns(parts, false /* one_part */, true /* ignore_empty */); - if (virtual_columns_block.rows() == 0) - return {}; - std::optional partition_pruner; std::optional minmax_idx_condition; DataTypes minmax_columns_types; - if (metadata_snapshot->hasPartitionKey()) - { - const auto & partition_key = metadata_snapshot->getPartitionKey(); - auto minmax_columns_names = getMinMaxColumnsNames(partition_key); - minmax_columns_types = getMinMaxColumnsTypes(partition_key); + size_t rows = parts.size(); + ColumnPtr part_name_column; - minmax_idx_condition.emplace( - query_info, - query_context, - minmax_columns_names, - getMinMaxExpr(partition_key, ExpressionActionsSettings::fromContext(query_context))); - partition_pruner.emplace(metadata_snapshot, query_info, query_context, false /* strict */); + Block virtual_columns_block; + auto virtual_block = getSampleBlockWithVirtualColumns(); + bool has_virtual_column = std::any_of(required_columns.begin(), required_columns.end(), [&](const auto & name) { return virtual_block.has(name); }); + if (has_virtual_column || has_filter) + { + virtual_columns_block = getBlockWithVirtualPartColumns(parts, false /* one_part */, true /* ignore_empty */); + if (virtual_columns_block.rows() == 0) + return {}; } - // Generate valid expressions for filtering - VirtualColumnUtils::prepareFilterBlockWithQuery(query_info.query, query_context, virtual_columns_block, expression_ast); - if (expression_ast) - VirtualColumnUtils::filterBlockWithQuery(query_info.query, virtual_columns_block, query_context, expression_ast); + if (has_filter) + { + if (metadata_snapshot->hasPartitionKey()) + { + const auto & partition_key = metadata_snapshot->getPartitionKey(); + auto minmax_columns_names = getMinMaxColumnsNames(partition_key); + minmax_columns_types = getMinMaxColumnsTypes(partition_key); + + minmax_idx_condition.emplace( + query_info, + query_context, + minmax_columns_names, + getMinMaxExpr(partition_key, ExpressionActionsSettings::fromContext(query_context))); + partition_pruner.emplace(metadata_snapshot, query_info, query_context, false /* strict */); + } + + // Generate valid expressions for filtering + ASTPtr expression_ast; + VirtualColumnUtils::prepareFilterBlockWithQuery(query_info.query, query_context, virtual_columns_block, expression_ast); + if (expression_ast) + VirtualColumnUtils::filterBlockWithQuery(query_info.query, virtual_columns_block, query_context, expression_ast); + + rows = virtual_columns_block.rows(); + part_name_column = virtual_columns_block.getByName("_part").column; + } - size_t rows = 
virtual_columns_block.rows(); - const ColumnString & part_name_column = typeid_cast(*virtual_columns_block.getByName("_part").column); - size_t part_idx = 0; auto filter_column = ColumnUInt8::create(); auto & filter_column_data = filter_column->getData(); - for (size_t row = 0; row < rows; ++row) + + DataPartsVector real_parts; + real_parts.reserve(rows); + for (size_t row = 0, part_idx = 0; row < rows; ++row, ++part_idx) { - while (parts[part_idx]->name != part_name_column.getDataAt(row)) - ++part_idx; + if (has_filter) + { + while (parts[part_idx]->name != part_name_column->getDataAt(row)) + ++part_idx; + } const auto & part = parts[part_idx]; + if (part->isEmpty()) + continue; + if (!part->minmax_idx->initialized) throw Exception("Found a non-empty part with uninitialized minmax_idx. It's a bug", ErrorCodes::LOGICAL_ERROR); @@ -4743,49 +4769,14 @@ Block MergeTreeData::getMinMaxCountProjectionBlock( continue; } + real_parts.push_back(part); filter_column_data.back() = 1; - size_t pos = 0; - for (size_t i : metadata_snapshot->minmax_count_projection->partition_value_indices) - { - if (i >= part->partition.value.size()) - throw Exception("Partition value index is out of boundary. It's a bug", ErrorCodes::LOGICAL_ERROR); - partition_minmax_count_columns[pos++]->insert(part->partition.value[i]); - } - - size_t minmax_idx_size = part->minmax_idx->hyperrectangle.size(); - for (size_t i = 0; i < minmax_idx_size; ++i) - { - auto & min_column = assert_cast(*partition_minmax_count_columns[pos++]); - auto & max_column = assert_cast(*partition_minmax_count_columns[pos++]); - const auto & range = part->minmax_idx->hyperrectangle[i]; - insert(min_column, range.left); - insert(max_column, range.right); - } - - if (!primary_key_max_column_name.empty()) - { - const auto & primary_key_column = *part->index[0]; - auto & min_column = assert_cast(*partition_minmax_count_columns[pos++]); - auto & max_column = assert_cast(*partition_minmax_count_columns[pos++]); - insert(min_column, primary_key_column[0]); - insert(max_column, primary_key_column[primary_key_column.size() - 1]); - } - - { - auto & column = assert_cast(*partition_minmax_count_columns.back()); - auto func = column.getAggregateFunction(); - Arena & arena = column.createOrGetArena(); - size_t size_of_state = func->sizeOfData(); - size_t align_of_state = func->alignOfData(); - auto * place = arena.alignedAlloc(size_of_state, align_of_state); - func->create(place); - const AggregateFunctionCount & agg_count = assert_cast(*func); - agg_count.set(place, part->rows_count); - column.insertFrom(place); - } } - block.setColumns(std::move(partition_minmax_count_columns)); + if (real_parts.empty()) + return {}; + + size_t minmax_idx_size = real_parts.front()->minmax_idx->hyperrectangle.size(); FilterDescription filter(*filter_column); for (size_t i = 0; i < virtual_columns_block.columns(); ++i) { @@ -4793,8 +4784,77 @@ Block MergeTreeData::getMinMaxCountProjectionBlock( column = column->filter(*filter.data, -1); } - if (block.rows() == 0) - return {}; + size_t pos = 0; + for (size_t i : metadata_snapshot->minmax_count_projection->partition_value_indices) + { + if (required_columns_set.contains(partition_minmax_count_column_names[pos])) + for (const auto & part : real_parts) + partition_minmax_count_columns[pos]->insert(part->partition.value[i]); + ++pos; + } + + for (size_t i = 0; i < minmax_idx_size; ++i) + { + if (required_columns_set.contains(partition_minmax_count_column_names[pos])) + { + for (const auto & part : real_parts) + { + const auto & range = 
part->minmax_idx->hyperrectangle[i]; + auto & min_column = assert_cast(*partition_minmax_count_columns[pos]); + insert(min_column, range.left); + } + } + ++pos; + + if (required_columns_set.contains(partition_minmax_count_column_names[pos])) + { + for (const auto & part : real_parts) + { + const auto & range = part->minmax_idx->hyperrectangle[i]; + auto & max_column = assert_cast(*partition_minmax_count_columns[pos]); + insert(max_column, range.right); + } + } + ++pos; + } + + if (!primary_key_max_column_name.empty()) + { + if (required_columns_set.contains(partition_minmax_count_column_names[pos])) + { + for (const auto & part : real_parts) + { + const auto & primary_key_column = *part->index[0]; + auto & min_column = assert_cast(*partition_minmax_count_columns[pos]); + insert(min_column, primary_key_column[0]); + } + } + ++pos; + + if (required_columns_set.contains(partition_minmax_count_column_names[pos])) + { + for (const auto & part : real_parts) + { + const auto & primary_key_column = *part->index[0]; + auto & max_column = assert_cast(*partition_minmax_count_columns[pos]); + insert(max_column, primary_key_column[primary_key_column.size() - 1]); + } + } + ++pos; + } + + bool has_count + = std::any_of(required_columns.begin(), required_columns.end(), [&](const auto & name) { return startsWith(name, "count"); }); + if (has_count) + { + for (const auto & part : real_parts) + { + auto & column = assert_cast(*partition_minmax_count_columns.back()); + insert(column, part->rows_count); + } + } + + block.setColumns(std::move(partition_minmax_count_columns)); Block res; for (const auto & name : required_columns) @@ -4803,6 +4863,11 @@ Block MergeTreeData::getMinMaxCountProjectionBlock( res.insert(virtual_columns_block.getByName(name)); else if (block.has(name)) res.insert(block.getByName(name)); + else if (startsWith(name, "count")) // special case to match count(...) variants + { + const auto & column = block.getByName("count()"); + res.insert({column.column, column.type, name}); + } else throw Exception( ErrorCodes::LOGICAL_ERROR, @@ -4974,7 +5039,7 @@ std::optional MergeTreeData::getQueryProcessingStageWithAgg }; auto virtual_block = getSampleBlockWithVirtualColumns(); - auto add_projection_candidate = [&](const ProjectionDescription & projection) + auto add_projection_candidate = [&](const ProjectionDescription & projection, bool normalize_count_not_null = false) { ProjectionCandidate candidate{}; candidate.desc = &projection; @@ -5001,22 +5066,28 @@ std::optional MergeTreeData::getQueryProcessingStageWithAgg if (projection.type == ProjectionDescription::Type::Aggregate && analysis_result.need_aggregate && can_use_aggregate_projection) { - bool match = true; Block aggregates; // Let's first check if all aggregates are provided by current projection for (const auto & aggregate : select.getQueryAnalyzer()->aggregates()) { - const auto * column = sample_block.findByName(aggregate.column_name); - if (column) - aggregates.insert(*column); - else + if (const auto * column = sample_block.findByName(aggregate.column_name)) { - match = false; - break; + aggregates.insert(*column); + continue; } - } - if (!match) + + if (normalize_count_not_null && dynamic_cast(aggregate.function.get())) + { + const auto * count_column = sample_block.findByName("count()"); + if (!count_column) + throw Exception(ErrorCodes::LOGICAL_ERROR, "count_column is missing when normalize_count_not_null == true. 
It is a bug"); + aggregates.insert({count_column->column, count_column->type, aggregate.column_name}); + continue; + } + + // No match return; + } // Check if all aggregation keys can be either provided by some action, or by current // projection directly. Reshape the `before_aggregation` action DAG so that it only @@ -5069,11 +5140,11 @@ std::optional MergeTreeData::getQueryProcessingStageWithAgg ProjectionCandidate * selected_candidate = nullptr; size_t min_sum_marks = std::numeric_limits::max(); if (metadata_snapshot->minmax_count_projection) - add_projection_candidate(*metadata_snapshot->minmax_count_projection); - std::optional minmax_conut_projection_candidate; + add_projection_candidate(*metadata_snapshot->minmax_count_projection, true); + std::optional minmax_count_projection_candidate; if (!candidates.empty()) { - minmax_conut_projection_candidate.emplace(std::move(candidates.front())); + minmax_count_projection_candidate.emplace(std::move(candidates.front())); candidates.clear(); } MergeTreeDataSelectExecutor reader(*this); @@ -5086,21 +5157,22 @@ std::optional MergeTreeData::getQueryProcessingStageWithAgg auto parts = getDataPartsVector(); // If minmax_count_projection is a valid candidate, check its completeness. - if (minmax_conut_projection_candidate) + if (minmax_count_projection_candidate) { DataPartsVector normal_parts; query_info.minmax_count_projection_block = getMinMaxCountProjectionBlock( metadata_snapshot, - minmax_conut_projection_candidate->required_columns, + minmax_count_projection_candidate->required_columns, + analysis_result.prewhere_info || analysis_result.before_where, query_info, parts, normal_parts, max_added_blocks.get(), query_context); - if (query_info.minmax_count_projection_block && minmax_conut_projection_candidate->prewhere_info) + if (query_info.minmax_count_projection_block && minmax_count_projection_candidate->prewhere_info) { - const auto & prewhere_info = minmax_conut_projection_candidate->prewhere_info; + const auto & prewhere_info = minmax_count_projection_candidate->prewhere_info; if (prewhere_info->alias_actions) ExpressionActions(prewhere_info->alias_actions, actions_settings).execute(query_info.minmax_count_projection_block); @@ -5119,7 +5191,7 @@ std::optional MergeTreeData::getQueryProcessingStageWithAgg if (normal_parts.empty()) { - selected_candidate = &*minmax_conut_projection_candidate; + selected_candidate = &*minmax_count_projection_candidate; selected_candidate->complete = true; min_sum_marks = query_info.minmax_count_projection_block.rows(); } @@ -5143,7 +5215,7 @@ std::optional MergeTreeData::getQueryProcessingStageWithAgg if (!normal_result_ptr->error()) { - selected_candidate = &*minmax_conut_projection_candidate; + selected_candidate = &*minmax_count_projection_candidate; selected_candidate->merge_tree_normal_select_result_ptr = normal_result_ptr; min_sum_marks = query_info.minmax_count_projection_block.rows() + normal_result_ptr->marks(); } diff --git a/src/Storages/MergeTree/MergeTreeData.h b/src/Storages/MergeTree/MergeTreeData.h index 1cbcd4282d0..2fec580f876 100644 --- a/src/Storages/MergeTree/MergeTreeData.h +++ b/src/Storages/MergeTree/MergeTreeData.h @@ -393,6 +393,7 @@ public: Block getMinMaxCountProjectionBlock( const StorageMetadataPtr & metadata_snapshot, const Names & required_columns, + bool has_filter, const SelectQueryInfo & query_info, const DataPartsVector & parts, DataPartsVector & normal_parts, diff --git a/tests/queries/0_stateless/01710_minmax_count_projection.reference 
b/tests/queries/0_stateless/01710_minmax_count_projection.reference index b13738a66de..bb35f3cfbd3 100644 --- a/tests/queries/0_stateless/01710_minmax_count_projection.reference +++ b/tests/queries/0_stateless/01710_minmax_count_projection.reference @@ -17,3 +17,5 @@ 0 2021-10-24 10:00:00 0 +1000 +1000 diff --git a/tests/queries/0_stateless/01710_minmax_count_projection.sql b/tests/queries/0_stateless/01710_minmax_count_projection.sql index 0792fe331bb..dd360b65016 100644 --- a/tests/queries/0_stateless/01710_minmax_count_projection.sql +++ b/tests/queries/0_stateless/01710_minmax_count_projection.sql @@ -59,3 +59,15 @@ SELECT min(dt) FROM d PREWHERE ((0.9998999834060669 AND 1023) AND 255) <= ceil(j SELECT count('') AND NULL FROM d PREWHERE ceil(j) <= NULL; drop table d; + +-- count variant optimization + +drop table if exists test; +create table test (id Int64, d Int64, projection dummy(select * order by id)) engine MergeTree order by id; +insert into test select number, number from numbers(1e3); + +select count(if(d=4, d, 1)) from test settings force_optimize_projection = 1; +select count(d/3) from test settings force_optimize_projection = 1; +select count(if(d=4, Null, 1)) from test settings force_optimize_projection = 1; -- { serverError 584 } + +drop table test; From f87b25f2d705b5df4503b73178a7d9f6bb1dc4ac Mon Sep 17 00:00:00 2001 From: Amos Bird Date: Thu, 31 Mar 2022 20:46:46 +0800 Subject: [PATCH 176/239] Fix tests --- ...2_last_granula_adjust_LOGICAL_ERROR.reference | 16 ++++++++-------- ...2052_last_granula_adjust_LOGICAL_ERROR.sql.j2 | 4 ++-- 2 files changed, 10 insertions(+), 10 deletions(-) diff --git a/tests/queries/0_stateless/02052_last_granula_adjust_LOGICAL_ERROR.reference b/tests/queries/0_stateless/02052_last_granula_adjust_LOGICAL_ERROR.reference index d7d3ee8f362..72d9eb2928a 100644 --- a/tests/queries/0_stateless/02052_last_granula_adjust_LOGICAL_ERROR.reference +++ b/tests/queries/0_stateless/02052_last_granula_adjust_LOGICAL_ERROR.reference @@ -1,8 +1,8 @@ -1 -1 -10 -10 -100 -100 -10000 -10000 +0 00000 +0 00000 +9 99999 +9 99999 +99 9999999999 +99 9999999999 +9999 99999999999999999999 +9999 99999999999999999999 diff --git a/tests/queries/0_stateless/02052_last_granula_adjust_LOGICAL_ERROR.sql.j2 b/tests/queries/0_stateless/02052_last_granula_adjust_LOGICAL_ERROR.sql.j2 index 465aa22beb3..53d970496b2 100644 --- a/tests/queries/0_stateless/02052_last_granula_adjust_LOGICAL_ERROR.sql.j2 +++ b/tests/queries/0_stateless/02052_last_granula_adjust_LOGICAL_ERROR.sql.j2 @@ -11,8 +11,8 @@ settings as select number, repeat(toString(number), 5) from numbers({{ rows_in_table }}); -- avoid any optimizations with ignore(*) -select count(ignore(*)) from data_02052_{{ rows_in_table }}_wide{{ wide }} settings max_read_buffer_size=1, max_threads=1; -select count(ignore(*)) from data_02052_{{ rows_in_table }}_wide{{ wide }} settings max_read_buffer_size=0, max_threads=1; -- { serverError CANNOT_READ_ALL_DATA } +select * apply max from data_02052_{{ rows_in_table }}_wide{{ wide }} settings max_read_buffer_size=1, max_threads=1; +select * apply max from data_02052_{{ rows_in_table }}_wide{{ wide }} settings max_read_buffer_size=0, max_threads=1; -- { serverError CANNOT_READ_ALL_DATA } drop table data_02052_{{ rows_in_table }}_wide{{ wide }}; {% endfor %} From 35a8bb2a9bb90c5706537361c19d29ee113e7611 Mon Sep 17 00:00:00 2001 From: Amos Bird Date: Fri, 1 Apr 2022 17:18:55 +0800 Subject: [PATCH 177/239] add comment --- src/Storages/MergeTree/MergeTreeData.h | 2 ++ 1 file changed, 2 
insertions(+) diff --git a/src/Storages/MergeTree/MergeTreeData.h b/src/Storages/MergeTree/MergeTreeData.h index 2fec580f876..44736fe2cc5 100644 --- a/src/Storages/MergeTree/MergeTreeData.h +++ b/src/Storages/MergeTree/MergeTreeData.h @@ -383,6 +383,8 @@ public: /// Build a block of minmax and count values of a MergeTree table. These values are extracted /// from minmax_indices, the first expression of primary key, and part rows. /// + /// has_filter - if query has no filter, bypass partition pruning completely + /// /// query_info - used to filter unneeded parts /// /// parts - part set to filter From 5bc09550d877f5086741ddecfa99928a8c72f879 Mon Sep 17 00:00:00 2001 From: Amos Bird Date: Tue, 5 Apr 2022 15:55:17 +0800 Subject: [PATCH 178/239] Fix tests --- src/Storages/MergeTree/MergeTreeData.cpp | 36 ++++++++++++------- .../01710_minmax_count_projection.reference | 1 + .../01710_minmax_count_projection.sql | 2 ++ 3 files changed, 27 insertions(+), 12 deletions(-) diff --git a/src/Storages/MergeTree/MergeTreeData.cpp b/src/Storages/MergeTree/MergeTreeData.cpp index a902895fcf0..a2ae344994f 100644 --- a/src/Storages/MergeTree/MergeTreeData.cpp +++ b/src/Storages/MergeTree/MergeTreeData.cpp @@ -4681,12 +4681,6 @@ Block MergeTreeData::getMinMaxCountProjectionBlock( column.insertFrom(place); }; - std::optional partition_pruner; - std::optional minmax_idx_condition; - DataTypes minmax_columns_types; - size_t rows = parts.size(); - ColumnPtr part_name_column; - Block virtual_columns_block; auto virtual_block = getSampleBlockWithVirtualColumns(); bool has_virtual_column = std::any_of(required_columns.begin(), required_columns.end(), [&](const auto & name) { return virtual_block.has(name); }); @@ -4697,6 +4691,11 @@ Block MergeTreeData::getMinMaxCountProjectionBlock( return {}; } + size_t rows = parts.size(); + ColumnPtr part_name_column; + std::optional partition_pruner; + std::optional minmax_idx_condition; + DataTypes minmax_columns_types; if (has_filter) { if (metadata_snapshot->hasPartitionKey()) @@ -4730,7 +4729,7 @@ Block MergeTreeData::getMinMaxCountProjectionBlock( real_parts.reserve(rows); for (size_t row = 0, part_idx = 0; row < rows; ++row, ++part_idx) { - if (has_filter) + if (part_name_column) { while (parts[part_idx]->name != part_name_column->getDataAt(row)) ++part_idx; @@ -4776,7 +4775,6 @@ Block MergeTreeData::getMinMaxCountProjectionBlock( if (real_parts.empty()) return {}; - size_t minmax_idx_size = real_parts.front()->minmax_idx->hyperrectangle.size(); FilterDescription filter(*filter_column); for (size_t i = 0; i < virtual_columns_block.columns(); ++i) { @@ -4793,6 +4791,7 @@ Block MergeTreeData::getMinMaxCountProjectionBlock( ++pos; } + size_t minmax_idx_size = real_parts.front()->minmax_idx->hyperrectangle.size(); for (size_t i = 0; i < minmax_idx_size; ++i) { if (required_columns_set.contains(partition_minmax_count_column_names[pos])) @@ -5039,7 +5038,7 @@ std::optional MergeTreeData::getQueryProcessingStageWithAgg }; auto virtual_block = getSampleBlockWithVirtualColumns(); - auto add_projection_candidate = [&](const ProjectionDescription & projection, bool normalize_count_not_null = false) + auto add_projection_candidate = [&](const ProjectionDescription & projection, bool minmax_count_projection = false) { ProjectionCandidate candidate{}; candidate.desc = &projection; @@ -5076,11 +5075,13 @@ std::optional MergeTreeData::getQueryProcessingStageWithAgg continue; } - if (normalize_count_not_null && dynamic_cast(aggregate.function.get())) + // We can treat every 
count_not_null_column as count() when selecting minmax_count_projection + if (minmax_count_projection && dynamic_cast(aggregate.function.get())) { const auto * count_column = sample_block.findByName("count()"); if (!count_column) - throw Exception(ErrorCodes::LOGICAL_ERROR, "count_column is missing when normalize_count_not_null == true. It is a bug"); + throw Exception( + ErrorCodes::LOGICAL_ERROR, "`count()` column is missing when minmax_count_projection == true. It is a bug"); aggregates.insert({count_column->column, count_column->type, aggregate.column_name}); continue; } @@ -5107,9 +5108,20 @@ std::optional MergeTreeData::getQueryProcessingStageWithAgg candidate.before_aggregation->reorderAggregationKeysForProjection(key_name_pos_map); candidate.before_aggregation->addAggregatesViaProjection(aggregates); + // minmax_count_projections only have aggregation actions + if (minmax_count_projection) + candidate.required_columns = {required_columns.begin(), required_columns.end()}; + if (rewrite_before_where(candidate, projection, required_columns, sample_block_for_keys, aggregates)) { - candidate.required_columns = {required_columns.begin(), required_columns.end()}; + if (minmax_count_projection) + { + candidate.before_where = nullptr; + candidate.prewhere_info = nullptr; + } + else + candidate.required_columns = {required_columns.begin(), required_columns.end()}; + for (const auto & aggregate : aggregates) candidate.required_columns.push_back(aggregate.name); candidates.push_back(std::move(candidate)); diff --git a/tests/queries/0_stateless/01710_minmax_count_projection.reference b/tests/queries/0_stateless/01710_minmax_count_projection.reference index bb35f3cfbd3..259d320a38a 100644 --- a/tests/queries/0_stateless/01710_minmax_count_projection.reference +++ b/tests/queries/0_stateless/01710_minmax_count_projection.reference @@ -9,6 +9,7 @@ 1 9999 3 2021-10-25 10:00:00 2021-10-27 10:00:00 3 +2021-10-25 10:00:00 2021-10-27 10:00:00 3 1 1 1 diff --git a/tests/queries/0_stateless/01710_minmax_count_projection.sql b/tests/queries/0_stateless/01710_minmax_count_projection.sql index dd360b65016..a6c04725583 100644 --- a/tests/queries/0_stateless/01710_minmax_count_projection.sql +++ b/tests/queries/0_stateless/01710_minmax_count_projection.sql @@ -50,6 +50,8 @@ drop table if exists d; create table d (dt DateTime, j int) engine MergeTree partition by (toDate(dt), ceiling(j), toDate(dt), CEILING(j)) order by tuple(); insert into d values ('2021-10-24 10:00:00', 10), ('2021-10-25 10:00:00', 10), ('2021-10-26 10:00:00', 10), ('2021-10-27 10:00:00', 10); select min(dt), max(dt), count() from d where toDate(dt) >= '2021-10-25'; +-- fuzz crash +select min(dt), max(dt), count(toDate(dt) >= '2021-10-25') from d where toDate(dt) >= '2021-10-25'; select count() from d group by toDate(dt); -- fuzz crash From 588a168e091e834c21d5a4dcb70c2a98f62f11bb Mon Sep 17 00:00:00 2001 From: "Mikhail f. 
Shiryaev" Date: Tue, 5 Apr 2022 10:05:54 +0200 Subject: [PATCH 179/239] Decrease headers size Co-authored-by: Azat Khuzhin --- .github/PULL_REQUEST_TEMPLATE.md | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/.github/PULL_REQUEST_TEMPLATE.md b/.github/PULL_REQUEST_TEMPLATE.md index 1b7498c3091..2d8540b57ea 100644 --- a/.github/PULL_REQUEST_TEMPLATE.md +++ b/.github/PULL_REQUEST_TEMPLATE.md @@ -1,4 +1,4 @@ -## Changelog category (leave one): +### Changelog category (leave one): - New Feature - Improvement - Bug Fix (user-visible misbehaviour in official stable or prestable release) @@ -9,7 +9,7 @@ - Not for changelog (changelog entry is not required) -## Changelog entry (a user-readable short description of the changes that goes to CHANGELOG.md): +### Changelog entry (a user-readable short description of the changes that goes to CHANGELOG.md): ... From 6e1d8374394b993811c56cfe962ae29f13efad8e Mon Sep 17 00:00:00 2001 From: Maksim Kita Date: Tue, 5 Apr 2022 11:24:33 +0200 Subject: [PATCH 180/239] Fixed style check --- src/Common/ShellCommand.cpp | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/Common/ShellCommand.cpp b/src/Common/ShellCommand.cpp index 229807c868e..f24add7acf0 100644 --- a/src/Common/ShellCommand.cpp +++ b/src/Common/ShellCommand.cpp @@ -31,7 +31,7 @@ namespace namespace ProfileEvents { - extern const int ExecuteShellCommand; + extern const Event ExecuteShellCommand; } namespace DB From fd1c8103a0e675c8828fef70909426a9242e4ba7 Mon Sep 17 00:00:00 2001 From: Maksim Kita Date: Tue, 5 Apr 2022 11:47:45 +0200 Subject: [PATCH 181/239] Fixed tests --- tests/queries/0_stateless/02252_jit_profile_events.sql | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/tests/queries/0_stateless/02252_jit_profile_events.sql b/tests/queries/0_stateless/02252_jit_profile_events.sql index e4c9d9d8791..561e25505bc 100644 --- a/tests/queries/0_stateless/02252_jit_profile_events.sql +++ b/tests/queries/0_stateless/02252_jit_profile_events.sql @@ -26,6 +26,6 @@ SYSTEM FLUSH LOGS; SELECT ProfileEvents['CompileFunction'] FROM system.query_log WHERE current_database = currentDatabase() AND type = 'QueryFinish' - AND query == 'SELECT sum(number), sum(number), sum(number) FROM numbers(1) GROUP BY number;' + AND query == 'SELECT sum(number), sum(number + 1), sum(number + 2) FROM numbers(1) GROUP BY number;' AND event_date >= yesterday() AND event_time > now() - interval 10 minute - LIMIT 1; \ No newline at end of file + LIMIT 1; From 4bfac4ec9990de8cea1218fa875f8cb946ad986d Mon Sep 17 00:00:00 2001 From: kssenii Date: Tue, 5 Apr 2022 12:17:34 +0200 Subject: [PATCH 182/239] Remove more testmode mentions --- tests/integration/helpers/client.py | 2 +- tests/queries/0_stateless/01691_parser_data_type_exponential.sh | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/tests/integration/helpers/client.py b/tests/integration/helpers/client.py index af49408abee..41c5608081d 100644 --- a/tests/integration/helpers/client.py +++ b/tests/integration/helpers/client.py @@ -55,7 +55,7 @@ class Client: command = self.command[:] if stdin is None: - command += ["--multiquery", "--testmode"] + command += ["--multiquery"] stdin = sql else: command += ["--query", sql] diff --git a/tests/queries/0_stateless/01691_parser_data_type_exponential.sh b/tests/queries/0_stateless/01691_parser_data_type_exponential.sh index 2b1d34982a2..f8004f9350d 100755 --- a/tests/queries/0_stateless/01691_parser_data_type_exponential.sh +++ 
b/tests/queries/0_stateless/01691_parser_data_type_exponential.sh @@ -5,4 +5,4 @@ CURDIR=$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd) . "$CURDIR"/../shell_config.sh # Check that DataType parser does not have exponential complexity in the case found by fuzzer. -for _ in {1..10}; do ${CLICKHOUSE_CLIENT} -n --testmode --query "SELECT CAST(1 AS A2222222222222222222222222222222222222222222222222teFuncpion(groupBitmap, 222222222222222ggregateFuncpion(groupBitmap2222222222222222222222222222222222222222222222 222222222222222222ggregateF222222222222222222222222222222222222222222222222222222222teFuncpion(groupBitmap, 222222222222222ggregateFuncpion(groupBitmap2222222222222222222222222222222222222222222222222222 222222222222222222ggregateFuncpion(groupBitmapp, 222222222222222ggregateFuncpion(groupBitmap2222222222222222222222222222222222222222222222222222 222222222222222222ggregateF222222222222222222222222222222222222222222222222222222222teFuncpion(groupBitmap, 222222222222223ggregateFuncpion(groupBitmap2222222222222222222222222222222222222222222222222222 222222222222222222ggregateFuncpion(groupBitmap, 00000000000000000000000000000000000000000000000000000000000000000000000000000001841416382, 222222222222222ggregateFuncpion(groupBitmap22222222222222222222222222222222222222222222222222220000000000000000000000000000000000000000000000000000000000000000000000000000002260637443813394204 222222222222222222ggregateFuncpion(groupBitmapp, 222222222222222ggregateFuncpion(groupBitmap2222222222222222222222222222222222222222222222222222 222222222222222222ggregateF222222222222222222222222222222222222222222222222222222222teFuncpion(groupBitmap, 222222222222223ggregateFuncpio22222222ggregateFuncpion(groupBitmap2222222222222222222222222222222222222222222222222222 222222222222222222ggregateF222222222222222222222222222222222222222222222222222222222teFuncpion(groupBitmap, 222222222222222ggregateFuncpion(groupBitmap2222222222222222222222222222222222222222222222 222222222222222222ggregateF222222222222222222222222222222222222222222222222222222222teFuncpion(groupBitmap, 222222222222222ggregateFuncpion(groupBitmap2222222222222ggregateFuncpion(groupBitmapp, 222222222222222ggregateFuncpion(groupBitmap2222222222222222222222222222222222222222222222222222 222222222222222222ggregateF222222222222222222222222222222222222222222222222222222222teFuncpion(groupBitmap, 222222222222223ggregateFuncpion(groupBitmap2222222222222222222222222222222222222222222222222222 222222222222222222ggre222222222222222222222222222222222222222222222222222teFuncpion(groupBitmap, 22222222222222222ggregateFuncpion(groupBitmap2222222222222222222222222222222222222222222222222222 222222222222222222ggregateF222222222222222222222222222222222222222222222222222222222teFuncpion(groupBitmap, 2222222222222eFuncpion(groupBitmap, 00000000000000000000000000000000000000000000000000000000000000000000000000000001841416382, 222222222222222ggregateFuncpion(groupBitmap22222222222222222222222222222222222222222222222222222222222222222222222200000000000000000000178859639454016722222222222222222222222222222222222222222 222222222222222222ggregateFuncpion(groupBitmapp, 222222222222222ggregateFuncpion(groupBitmap2222222222222222222222222222222222222222222222222222 222222222222222222ggregateF222222222222222222222222222222222222222222222222222222222teFuncpion(groupBitmap, 222222222222223ggregateFuncpio22222222ggregateFuncpion(groupBitmap2222222222222222222222222222222222222222222222222222 
222222222222222222ggregateF222222222222222222222222222222222222222222222222222222222teFuncpion(groupBitmap, 222222222222222ggregateFuncpion(groupBitmap2222222222222222222222222222222222222222222222 222222222222222222ggregateF222222222222222222222222222222222222222222222222222222222teFuncpion(groupBitmap, 222222222222222ggregateFuncpion(groupBitmap2222222222222222222222222222222222222222222222222222 222222222222222222ggregateFuncpion(groupBitmapp, 222222222222222ggregateFuncpion(groupBitmap2222222222222222222222222222222222222222222222222222 222222222222222222ggregateF222222222222222222222222222222222222222222222222222222222teFuncpion(groupBitmap, 222222222222223ggregateFuncpion(groupBitmap2222222222222222222222222222222222222222222222222222 222222222222222222ggregateFuncpion(groupBitmap, 00000000000000000000000000000000000000000000000000000000000000000000000000000001841416382, 222222222222222ggregateFuncpion(groupBitmap22222222222222222222222222222222222222222222222222222222222222222222222200000000000000000000178859639454016722222222222222222222222222222222222222222 222222222222222222ggregateFuncpion(groupBitmapp, 222222222222222ggregateFuncpion(groupBitmap2222222222222222222222222222222222222222222222222222 222222222222222222ggregateF222222222222222222222222222222222222222222222222222222222teFuncpion(groupBitmap, 222222222222223ggregateFuncpion(groupBitmap2222222222222222222222222222222222222222222222222222 222222222222222222ggregateFuncpion(groupBitmap, 22222222222222222222222222222222222222222222222222222222222222222222222222222222222222222, 222222222222222ggregateFuncpion(groupBitmap222222222222222222222222222222222222222222222222222222222222222222222222000000000000000000001788596394540167623 222222222222222222ggregateFu22222222222222222222222222 222222222, UInt33)); -- { clientError 62 }"; done +for _ in {1..10}; do ${CLICKHOUSE_CLIENT} -n --query "SELECT CAST(1 AS A2222222222222222222222222222222222222222222222222teFuncpion(groupBitmap, 222222222222222ggregateFuncpion(groupBitmap2222222222222222222222222222222222222222222222 222222222222222222ggregateF222222222222222222222222222222222222222222222222222222222teFuncpion(groupBitmap, 222222222222222ggregateFuncpion(groupBitmap2222222222222222222222222222222222222222222222222222 222222222222222222ggregateFuncpion(groupBitmapp, 222222222222222ggregateFuncpion(groupBitmap2222222222222222222222222222222222222222222222222222 222222222222222222ggregateF222222222222222222222222222222222222222222222222222222222teFuncpion(groupBitmap, 222222222222223ggregateFuncpion(groupBitmap2222222222222222222222222222222222222222222222222222 222222222222222222ggregateFuncpion(groupBitmap, 00000000000000000000000000000000000000000000000000000000000000000000000000000001841416382, 222222222222222ggregateFuncpion(groupBitmap22222222222222222222222222222222222222222222222222220000000000000000000000000000000000000000000000000000000000000000000000000000002260637443813394204 222222222222222222ggregateFuncpion(groupBitmapp, 222222222222222ggregateFuncpion(groupBitmap2222222222222222222222222222222222222222222222222222 222222222222222222ggregateF222222222222222222222222222222222222222222222222222222222teFuncpion(groupBitmap, 222222222222223ggregateFuncpio22222222ggregateFuncpion(groupBitmap2222222222222222222222222222222222222222222222222222 222222222222222222ggregateF222222222222222222222222222222222222222222222222222222222teFuncpion(groupBitmap, 222222222222222ggregateFuncpion(groupBitmap2222222222222222222222222222222222222222222222 
222222222222222222ggregateF222222222222222222222222222222222222222222222222222222222teFuncpion(groupBitmap, 222222222222222ggregateFuncpion(groupBitmap2222222222222ggregateFuncpion(groupBitmapp, 222222222222222ggregateFuncpion(groupBitmap2222222222222222222222222222222222222222222222222222 222222222222222222ggregateF222222222222222222222222222222222222222222222222222222222teFuncpion(groupBitmap, 222222222222223ggregateFuncpion(groupBitmap2222222222222222222222222222222222222222222222222222 222222222222222222ggre222222222222222222222222222222222222222222222222222teFuncpion(groupBitmap, 22222222222222222ggregateFuncpion(groupBitmap2222222222222222222222222222222222222222222222222222 222222222222222222ggregateF222222222222222222222222222222222222222222222222222222222teFuncpion(groupBitmap, 2222222222222eFuncpion(groupBitmap, 00000000000000000000000000000000000000000000000000000000000000000000000000000001841416382, 222222222222222ggregateFuncpion(groupBitmap22222222222222222222222222222222222222222222222222222222222222222222222200000000000000000000178859639454016722222222222222222222222222222222222222222 222222222222222222ggregateFuncpion(groupBitmapp, 222222222222222ggregateFuncpion(groupBitmap2222222222222222222222222222222222222222222222222222 222222222222222222ggregateF222222222222222222222222222222222222222222222222222222222teFuncpion(groupBitmap, 222222222222223ggregateFuncpio22222222ggregateFuncpion(groupBitmap2222222222222222222222222222222222222222222222222222 222222222222222222ggregateF222222222222222222222222222222222222222222222222222222222teFuncpion(groupBitmap, 222222222222222ggregateFuncpion(groupBitmap2222222222222222222222222222222222222222222222 222222222222222222ggregateF222222222222222222222222222222222222222222222222222222222teFuncpion(groupBitmap, 222222222222222ggregateFuncpion(groupBitmap2222222222222222222222222222222222222222222222222222 222222222222222222ggregateFuncpion(groupBitmapp, 222222222222222ggregateFuncpion(groupBitmap2222222222222222222222222222222222222222222222222222 222222222222222222ggregateF222222222222222222222222222222222222222222222222222222222teFuncpion(groupBitmap, 222222222222223ggregateFuncpion(groupBitmap2222222222222222222222222222222222222222222222222222 222222222222222222ggregateFuncpion(groupBitmap, 00000000000000000000000000000000000000000000000000000000000000000000000000000001841416382, 222222222222222ggregateFuncpion(groupBitmap22222222222222222222222222222222222222222222222222222222222222222222222200000000000000000000178859639454016722222222222222222222222222222222222222222 222222222222222222ggregateFuncpion(groupBitmapp, 222222222222222ggregateFuncpion(groupBitmap2222222222222222222222222222222222222222222222222222 222222222222222222ggregateF222222222222222222222222222222222222222222222222222222222teFuncpion(groupBitmap, 222222222222223ggregateFuncpion(groupBitmap2222222222222222222222222222222222222222222222222222 222222222222222222ggregateFuncpion(groupBitmap, 22222222222222222222222222222222222222222222222222222222222222222222222222222222222222222, 222222222222222ggregateFuncpion(groupBitmap222222222222222222222222222222222222222222222222222222222222222222222222000000000000000000001788596394540167623 222222222222222222ggregateFu22222222222222222222222222 222222222, UInt33)); -- { clientError 62 }"; done From 8a05cf392703ad233a1b23c330220fbb13ececd6 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Ra=C3=BAl=20Mar=C3=ADn?= Date: Tue, 5 Apr 2022 13:00:14 +0200 Subject: [PATCH 183/239] DNS cache: Add option to drop elements after several 
consecutive failures --- .../settings.md | 12 +++ programs/server/Server.cpp | 3 +- src/Common/DNSResolver.cpp | 75 ++++++++++++++----- src/Common/DNSResolver.h | 14 +++- src/Interpreters/DNSCacheUpdater.cpp | 5 +- src/Interpreters/DNSCacheUpdater.h | 3 +- tests/integration/parallel_skip.json | 11 +-- .../__init__.py | 0 .../configs/dns_update_long.xml | 0 .../configs/dns_update_short.xml | 1 + .../configs/listen_host.xml | 0 .../configs/remote_servers.xml | 0 .../configs/users_with_hostname.xml | 0 .../test.py | 11 +++ 14 files changed, 103 insertions(+), 32 deletions(-) rename tests/integration/{test_host_ip_change => test_dns_cache}/__init__.py (100%) rename tests/integration/{test_host_ip_change => test_dns_cache}/configs/dns_update_long.xml (100%) rename tests/integration/{test_host_ip_change => test_dns_cache}/configs/dns_update_short.xml (55%) rename tests/integration/{test_host_ip_change => test_dns_cache}/configs/listen_host.xml (100%) rename tests/integration/{test_host_ip_change => test_dns_cache}/configs/remote_servers.xml (100%) rename tests/integration/{test_host_ip_change => test_dns_cache}/configs/users_with_hostname.xml (100%) rename tests/integration/{test_host_ip_change => test_dns_cache}/test.py (92%) diff --git a/docs/en/operations/server-configuration-parameters/settings.md b/docs/en/operations/server-configuration-parameters/settings.md index 88c43c9c3c2..301b348925f 100644 --- a/docs/en/operations/server-configuration-parameters/settings.md +++ b/docs/en/operations/server-configuration-parameters/settings.md @@ -1467,6 +1467,18 @@ The update is performed asynchronously, in a separate system thread. - [background_schedule_pool_size](../../operations/settings/settings.md#background_schedule_pool_size) + +## dns_max_consecutive_failures {#server-settings-dns-max-consecutive-failures} + +The number of consecutive failures accepted when updating a DNS cache entry before it is dropped. +Use `0` to disable cache dropping (entries will only be cleaned by `SYSTEM DROP DNS CACHE`) + +**Default value**: 5. + +**See also** + +- [`SYSTEM DROP DNS CACHE`](../../sql-reference/statements/system.md#query_language-system-drop-dns-cache) + ## distributed_ddl {#server-settings-distributed_ddl} Manage executing [distributed ddl queries](../../sql-reference/distributed-ddl.md) (CREATE, DROP, ALTER, RENAME) on cluster. 
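For reference, the two settings involved in this patch live at the top level of the server configuration. A minimal sketch of an override follows; the setting names and default values (update period 15 seconds, 5 consecutive failures) are taken from this patch, while the <clickhouse> root tag and the idea of placing it in a config.d override file are assumptions for illustration only, not part of the change:

    <clickhouse>
        <!-- How often cached DNS entries are re-resolved, in seconds (existing setting, default 15). -->
        <dns_cache_update_period>15</dns_cache_update_period>
        <!-- Drop a cached entry after this many consecutive resolution failures (default 5).
             0 disables dropping; entries are then only cleared by SYSTEM DROP DNS CACHE. -->
        <dns_max_consecutive_failures>5</dns_max_consecutive_failures>
    </clickhouse>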
diff --git a/programs/server/Server.cpp b/programs/server/Server.cpp index c12abda9594..0b5a7724fe5 100644 --- a/programs/server/Server.cpp +++ b/programs/server/Server.cpp @@ -1503,7 +1503,8 @@ int Server::main(const std::vector & /*args*/) else { /// Initialize a watcher periodically updating DNS cache - dns_cache_updater = std::make_unique(global_context, config().getInt("dns_cache_update_period", 15)); + dns_cache_updater = std::make_unique( + global_context, config().getInt("dns_cache_update_period", 15), config().getUInt("dns_max_consecutive_failures", 5)); } #if defined(OS_LINUX) diff --git a/src/Common/DNSResolver.cpp b/src/Common/DNSResolver.cpp index 13da3efd57a..fcf175fc9bf 100644 --- a/src/Common/DNSResolver.cpp +++ b/src/Common/DNSResolver.cpp @@ -118,12 +118,12 @@ static DNSResolver::IPAddresses resolveIPAddressImpl(const std::string & host) } catch (const Poco::Net::DNSException & e) { - LOG_ERROR(&Poco::Logger::get("DNSResolver"), "Cannot resolve host ({}), error {}: {}.", host, e.code(), e.message()); + LOG_ERROR(&Poco::Logger::get("DNSResolver"), "Cannot resolve host ({}), error {}: {}.", host, e.code(), e.name()); addresses.clear(); } if (addresses.empty()) - throw Exception("Not found address of host: " + host, ErrorCodes::DNS_ERROR); + throw Poco::Net::DNSException("Not found address of host: " + host); return addresses; } @@ -142,6 +142,9 @@ static String reverseResolveImpl(const Poco::Net::IPAddress & address) struct DNSResolver::Impl { + using HostWithConsecutiveFailures = std::unordered_map; + using AddressWithConsecutiveFailures = std::unordered_map; + CachedFn<&resolveIPAddressImpl> cache_host; CachedFn<&reverseResolveImpl> cache_address; @@ -152,12 +155,12 @@ struct DNSResolver::Impl std::optional host_name; /// Store hosts, which was asked to resolve from last update of DNS cache. 
- NameSet new_hosts; - std::unordered_set new_addresses; + HostWithConsecutiveFailures new_hosts; + AddressWithConsecutiveFailures new_addresses; /// Store all hosts, which was whenever asked to resolve - NameSet known_hosts; - std::unordered_set known_addresses; + HostWithConsecutiveFailures known_hosts; + AddressWithConsecutiveFailures known_addresses; /// If disabled, will not make cache lookups, will resolve addresses manually on each call std::atomic disable_cache{false}; @@ -246,38 +249,64 @@ String DNSResolver::getHostName() static const String & cacheElemToString(const String & str) { return str; } static String cacheElemToString(const Poco::Net::IPAddress & addr) { return addr.toString(); } -template -bool DNSResolver::updateCacheImpl(UpdateF && update_func, ElemsT && elems, const String & log_msg) +template +bool DNSResolver::updateCacheImpl( + UpdateF && update_func, + ElemsT && elems, + UInt32 max_consecutive_failures, + const String & notfound_log_msg, + const String & dropped_log_msg) { bool updated = false; String lost_elems; - for (const auto & elem : elems) + using iterators = typename std::remove_reference_t::iterator; + std::vector elements_to_drop; + for (auto it = elems.begin(); it != elems.end(); it++) { try { - updated |= (this->*update_func)(elem); + updated |= (this->*update_func)(it->first); + it->second = 0; } catch (const Poco::Net::NetException &) { ProfileEvents::increment(ProfileEvents::DNSError); - if (!lost_elems.empty()) lost_elems += ", "; - lost_elems += cacheElemToString(elem); + lost_elems += cacheElemToString(it->first); + if (max_consecutive_failures) + { + it->second++; + if (it->second >= max_consecutive_failures) + elements_to_drop.emplace_back(it); + } } catch (...) { - tryLogCurrentException(__PRETTY_FUNCTION__); + tryLogCurrentException(log, __PRETTY_FUNCTION__); } } if (!lost_elems.empty()) - LOG_INFO(log, fmt::runtime(log_msg), lost_elems); + LOG_INFO(log, fmt::runtime(notfound_log_msg), lost_elems); + if (elements_to_drop.size()) + { + updated = true; + String deleted_elements; + for (auto it : elements_to_drop) + { + if (!deleted_elements.empty()) + deleted_elements += ", "; + deleted_elements += cacheElemToString(it->first); + elems.erase(it); + } + LOG_INFO(log, fmt::runtime(dropped_log_msg), deleted_elements); + } return updated; } -bool DNSResolver::updateCache() +bool DNSResolver::updateCache(UInt32 max_consecutive_failures) { LOG_DEBUG(log, "Updating DNS cache"); @@ -301,8 +330,14 @@ bool DNSResolver::updateCache() /// DROP DNS CACHE will wait on update_mutex (possibly while holding drop_mutex) std::lock_guard lock(impl->update_mutex); - bool hosts_updated = updateCacheImpl(&DNSResolver::updateHost, impl->known_hosts, "Cached hosts not found: {}"); - updateCacheImpl(&DNSResolver::updateAddress, impl->known_addresses, "Cached addresses not found: {}"); + bool hosts_updated = updateCacheImpl( + &DNSResolver::updateHost, impl->known_hosts, max_consecutive_failures, "Cached hosts not found: {}", "Cached hosts dropped: {}"); + updateCacheImpl( + &DNSResolver::updateAddress, + impl->known_addresses, + max_consecutive_failures, + "Cached addresses not found: {}", + "Cached addresses dropped: {}"); LOG_DEBUG(log, "Updated DNS cache"); return hosts_updated; @@ -326,13 +361,15 @@ bool DNSResolver::updateAddress(const Poco::Net::IPAddress & address) void DNSResolver::addToNewHosts(const String & host) { std::lock_guard lock(impl->drop_mutex); - impl->new_hosts.insert(host); + UInt8 consecutive_failures = 0; + impl->new_hosts.insert({host, 
consecutive_failures}); } void DNSResolver::addToNewAddresses(const Poco::Net::IPAddress & address) { std::lock_guard lock(impl->drop_mutex); - impl->new_addresses.insert(address); + UInt8 consecutive_failures = 0; + impl->new_addresses.insert({address, consecutive_failures}); } DNSResolver::~DNSResolver() = default; diff --git a/src/Common/DNSResolver.h b/src/Common/DNSResolver.h index e26f46f44c7..c0d375ced71 100644 --- a/src/Common/DNSResolver.h +++ b/src/Common/DNSResolver.h @@ -47,14 +47,20 @@ public: void dropCache(); /// Updates all known hosts in cache. - /// Returns true if IP of any host has been changed. - bool updateCache(); + /// Returns true if IP of any host has been changed or an element was dropped (too many failures) + bool updateCache(UInt32 max_consecutive_failures); ~DNSResolver(); private: - template - bool updateCacheImpl(UpdateF && update_func, ElemsT && elems, const String & log_msg); + template + + bool updateCacheImpl( + UpdateF && update_func, + ElemsT && elems, + UInt32 max_consecutive_failures, + const String & notfound_log_msg, + const String & dropped_log_msg); DNSResolver(); diff --git a/src/Interpreters/DNSCacheUpdater.cpp b/src/Interpreters/DNSCacheUpdater.cpp index 27d0c3abd92..e7918451a2f 100644 --- a/src/Interpreters/DNSCacheUpdater.cpp +++ b/src/Interpreters/DNSCacheUpdater.cpp @@ -7,9 +7,10 @@ namespace DB { -DNSCacheUpdater::DNSCacheUpdater(ContextPtr context_, Int32 update_period_seconds_) +DNSCacheUpdater::DNSCacheUpdater(ContextPtr context_, Int32 update_period_seconds_, UInt32 max_consecutive_failures_) : WithContext(context_) , update_period_seconds(update_period_seconds_) + , max_consecutive_failures(max_consecutive_failures_) , pool(getContext()->getSchedulePool()) { task_handle = pool.createTask("DNSCacheUpdater", [this]{ run(); }); @@ -20,7 +21,7 @@ void DNSCacheUpdater::run() auto & resolver = DNSResolver::instance(); /// Reload cluster config if IP of any host has been changed since last update. - if (resolver.updateCache()) + if (resolver.updateCache(max_consecutive_failures)) { LOG_INFO(&Poco::Logger::get("DNSCacheUpdater"), "IPs of some hosts have been changed. 
Will reload cluster config."); try diff --git a/src/Interpreters/DNSCacheUpdater.h b/src/Interpreters/DNSCacheUpdater.h index 5d5486bd012..54bf0b4cecd 100644 --- a/src/Interpreters/DNSCacheUpdater.h +++ b/src/Interpreters/DNSCacheUpdater.h @@ -10,7 +10,7 @@ namespace DB class DNSCacheUpdater : WithContext { public: - DNSCacheUpdater(ContextPtr context, Int32 update_period_seconds_); + DNSCacheUpdater(ContextPtr context, Int32 update_period_seconds_, UInt32 max_consecutive_failures); ~DNSCacheUpdater(); void start(); @@ -18,6 +18,7 @@ private: void run(); Int32 update_period_seconds; + UInt32 max_consecutive_failures; BackgroundSchedulePool & pool; BackgroundSchedulePoolTaskHolder task_handle; diff --git a/tests/integration/parallel_skip.json b/tests/integration/parallel_skip.json index b56264fb570..b220e56dbd9 100644 --- a/tests/integration/parallel_skip.json +++ b/tests/integration/parallel_skip.json @@ -1,7 +1,8 @@ [ - "test_host_ip_change/test.py::test_dns_cache_update", - "test_host_ip_change/test.py::test_ip_change_drop_dns_cache", - "test_host_ip_change/test.py::test_ip_change_update_dns_cache", - "test_host_ip_change/test.py::test_user_access_ip_change[node0]", - "test_host_ip_change/test.py::test_user_access_ip_change[node1]" + "test_dns_cache/test.py::test_dns_cache_update", + "test_dns_cache/test.py::test_ip_change_drop_dns_cache", + "test_dns_cache/test.py::test_ip_change_update_dns_cache", + "test_dns_cache/test.py::test_user_access_ip_change[node0]", + "test_dns_cache/test.py::test_user_access_ip_change[node1]", + "test_dns_cache/test.py::test_host_is_drop_from_cache_after_consecutive_failures" ] diff --git a/tests/integration/test_host_ip_change/__init__.py b/tests/integration/test_dns_cache/__init__.py similarity index 100% rename from tests/integration/test_host_ip_change/__init__.py rename to tests/integration/test_dns_cache/__init__.py diff --git a/tests/integration/test_host_ip_change/configs/dns_update_long.xml b/tests/integration/test_dns_cache/configs/dns_update_long.xml similarity index 100% rename from tests/integration/test_host_ip_change/configs/dns_update_long.xml rename to tests/integration/test_dns_cache/configs/dns_update_long.xml diff --git a/tests/integration/test_host_ip_change/configs/dns_update_short.xml b/tests/integration/test_dns_cache/configs/dns_update_short.xml similarity index 55% rename from tests/integration/test_host_ip_change/configs/dns_update_short.xml rename to tests/integration/test_dns_cache/configs/dns_update_short.xml index e0b68e27be0..86e1310b335 100644 --- a/tests/integration/test_host_ip_change/configs/dns_update_short.xml +++ b/tests/integration/test_dns_cache/configs/dns_update_short.xml @@ -1,3 +1,4 @@ 1 + 6 diff --git a/tests/integration/test_host_ip_change/configs/listen_host.xml b/tests/integration/test_dns_cache/configs/listen_host.xml similarity index 100% rename from tests/integration/test_host_ip_change/configs/listen_host.xml rename to tests/integration/test_dns_cache/configs/listen_host.xml diff --git a/tests/integration/test_host_ip_change/configs/remote_servers.xml b/tests/integration/test_dns_cache/configs/remote_servers.xml similarity index 100% rename from tests/integration/test_host_ip_change/configs/remote_servers.xml rename to tests/integration/test_dns_cache/configs/remote_servers.xml diff --git a/tests/integration/test_host_ip_change/configs/users_with_hostname.xml b/tests/integration/test_dns_cache/configs/users_with_hostname.xml similarity index 100% rename from 
tests/integration/test_host_ip_change/configs/users_with_hostname.xml rename to tests/integration/test_dns_cache/configs/users_with_hostname.xml diff --git a/tests/integration/test_host_ip_change/test.py b/tests/integration/test_dns_cache/test.py similarity index 92% rename from tests/integration/test_host_ip_change/test.py rename to tests/integration/test_dns_cache/test.py index 604f2e5dc76..5ac0a393505 100644 --- a/tests/integration/test_host_ip_change/test.py +++ b/tests/integration/test_dns_cache/test.py @@ -285,3 +285,14 @@ def test_user_access_ip_change(cluster_with_dns_cache_update, node): retry_count=retry_count, sleep_time=1, ) + + +def test_host_is_drop_from_cache_after_consecutive_failures(cluster_with_dns_cache_update): + with pytest.raises(QueryRuntimeException): + node4.query("SELECT * FROM remote('InvalidHostThatDoesNotExist', 'system', 'one')") + + # Note that the list of hosts in variable since lost_host will be there too (and it's dropped and added back) + # dns_update_short -> dns_max_consecutive_failures set to 6 + assert node4.wait_for_log_line("Cannot resolve host \\(InvalidHostThatDoesNotExist\\), error 0: Host not found.") + assert node4.wait_for_log_line("Cached hosts not found:.*InvalidHostThatDoesNotExist**", repetitions=6) + assert node4.wait_for_log_line("Cached hosts dropped:.*InvalidHostThatDoesNotExist.*") From 83488b2d139ba8d176c294f1aa25a0b8ff7a32e7 Mon Sep 17 00:00:00 2001 From: kssenii Date: Tue, 5 Apr 2022 14:46:18 +0200 Subject: [PATCH 184/239] Better --- base/loggers/Loggers.cpp | 1 - programs/local/LocalServer.cpp | 17 +++++++++++------ programs/local/LocalServer.h | 2 ++ src/Client/ClientBase.cpp | 7 +++++++ src/Client/ClientBase.h | 1 + 5 files changed, 21 insertions(+), 7 deletions(-) diff --git a/base/loggers/Loggers.cpp b/base/loggers/Loggers.cpp index 7c627ad2272..512e44f79c7 100644 --- a/base/loggers/Loggers.cpp +++ b/base/loggers/Loggers.cpp @@ -197,7 +197,6 @@ void Loggers::buildLoggers(Poco::Util::AbstractConfiguration & config, Poco::Log Poco::AutoPtr pf = new OwnPatternFormatter(color_enabled); Poco::AutoPtr log = new DB::OwnFormattingChannel(pf, new Poco::ConsoleChannel); - logger.warning("Logging " + console_log_level_string + " to console"); log->setLevel(console_log_level); split->addChannel(log, "console"); } diff --git a/programs/local/LocalServer.cpp b/programs/local/LocalServer.cpp index 613c86002f3..1dfb5c1d636 100644 --- a/programs/local/LocalServer.cpp +++ b/programs/local/LocalServer.cpp @@ -434,6 +434,11 @@ catch (...) 
return getCurrentExceptionCode(); } +void LocalServer::setLogger(const String & logs_level) +{ + config().setString("logger.level", logs_level); + updateLevels(config(), logger()); +} void LocalServer::processConfig() { @@ -463,19 +468,19 @@ void LocalServer::processConfig() || config().has("send_logs_level") || config().has("logger.log")); - auto level = Poco::Logger::parseLevel(config().getString("log-level", config().getString("send_logs_level", "trace"))); + auto level = config().getString("log-level", "trace"); if (config().has("server_logs_file")) { - Poco::Logger::root().setLevel(level); + auto poco_logs_level = Poco::Logger::parseLevel(level); + Poco::Logger::root().setLevel(poco_logs_level); Poco::Logger::root().setChannel(Poco::AutoPtr(new Poco::SimpleFileChannel(server_logs_file))); } - else if (logging) + else if (logging || is_interactive) { - // force enable logging config().setString("logger", "logger"); - Poco::Logger::root().setLevel(level); - // sensitive data rules are not used here + auto log_level_default = is_interactive && !logging ? "none" : level; + config().setString("logger.level", config().getString("log-level", config().getString("send_logs_level", log_level_default))); buildLoggers(config(), logger(), "clickhouse-local"); } else diff --git a/programs/local/LocalServer.h b/programs/local/LocalServer.h index 969af7f1b77..3ee6d80136e 100644 --- a/programs/local/LocalServer.h +++ b/programs/local/LocalServer.h @@ -46,6 +46,8 @@ protected: void processConfig() override; + void setLogger(const String & logs_level) override; + private: /** Composes CREATE subquery based on passed arguments (--structure --file --table and --input-format) * This query will be executed first, before queries passed through --query argument diff --git a/src/Client/ClientBase.cpp b/src/Client/ClientBase.cpp index 115d047e6e8..7f5b5b82082 100644 --- a/src/Client/ClientBase.cpp +++ b/src/Client/ClientBase.cpp @@ -1298,6 +1298,13 @@ void ClientBase::processParsedSingleQuery(const String & full_query, const Strin } } + if (const auto * set_query = parsed_query->as()) + { + const auto * logs_level_field = set_query->changes.tryGet(std::string_view{"send_logs_level"}); + if (logs_level_field) + setLogger(logs_level_field->safeGet()); + } + processed_rows = 0; written_first_block = false; progress_indication.resetProgress(); diff --git a/src/Client/ClientBase.h b/src/Client/ClientBase.h index ae6d090eaee..1ce8be94d39 100644 --- a/src/Client/ClientBase.h +++ b/src/Client/ClientBase.h @@ -95,6 +95,7 @@ protected: std::optional hosts_and_ports_description; }; + virtual void setLogger(const String &) { throw Exception(ErrorCodes::NOT_IMPLEMENTED, "Method `initializeLogger()` is not implemented for `{}`", getName()); } virtual void printHelpMessage(const OptionsDescription & options_description) = 0; virtual void addOptions(OptionsDescription & options_description) = 0; virtual void processOptions(const OptionsDescription & options_description, From d296eeee2da3e53b1663daef0d1cf93fc29e8b58 Mon Sep 17 00:00:00 2001 From: Antonio Andelic Date: Tue, 5 Apr 2022 06:27:03 +0000 Subject: [PATCH 185/239] Small changes for Keeper --- src/Common/ZooKeeper/ZooKeeperImpl.cpp | 2 +- src/Common/ZooKeeper/ZooKeeperImpl.h | 2 +- src/Coordination/Changelog.cpp | 14 +-- src/Coordination/KeeperDispatcher.cpp | 2 +- src/Coordination/KeeperSnapshotManager.cpp | 8 +- src/Coordination/KeeperStateMachine.cpp | 48 +++------- src/Coordination/KeeperStateManager.h | 2 +- src/Coordination/KeeperStorage.cpp | 94 
+++++++++++-------- src/Coordination/KeeperStorage.h | 34 ++++--- src/Coordination/SnapshotableHashTable.h | 5 +- src/Coordination/ZooKeeperDataReader.cpp | 8 +- src/Coordination/tests/gtest_coordination.cpp | 4 + 12 files changed, 117 insertions(+), 106 deletions(-) diff --git a/src/Common/ZooKeeper/ZooKeeperImpl.cpp b/src/Common/ZooKeeper/ZooKeeperImpl.cpp index d3c993344b6..935df255843 100644 --- a/src/Common/ZooKeeper/ZooKeeperImpl.cpp +++ b/src/Common/ZooKeeper/ZooKeeperImpl.cpp @@ -846,7 +846,7 @@ void ZooKeeper::receiveEvent() void ZooKeeper::finalize(bool error_send, bool error_receive, const String & reason) { /// If some thread (send/receive) already finalizing session don't try to do it - bool already_started = finalization_started.exchange(true); + bool already_started = finalization_started.test_and_set(); LOG_TEST(log, "Finalizing session {}: finalization_started={}, queue_finished={}, reason={}", session_id, already_started, requests_queue.isFinished(), reason); diff --git a/src/Common/ZooKeeper/ZooKeeperImpl.h b/src/Common/ZooKeeper/ZooKeeperImpl.h index b87469bd339..58c5947e8ea 100644 --- a/src/Common/ZooKeeper/ZooKeeperImpl.h +++ b/src/Common/ZooKeeper/ZooKeeperImpl.h @@ -209,7 +209,7 @@ private: std::atomic next_xid {1}; /// Mark session finalization start. Used to avoid simultaneous /// finalization from different threads. One-shot flag. - std::atomic finalization_started {false}; + std::atomic_flag finalization_started; using clock = std::chrono::steady_clock; diff --git a/src/Coordination/Changelog.cpp b/src/Coordination/Changelog.cpp index eb8a724ade9..92a83deae1e 100644 --- a/src/Coordination/Changelog.cpp +++ b/src/Coordination/Changelog.cpp @@ -222,8 +222,8 @@ public: } /// Check for duplicated changelog ids - if (logs.count(record.header.index) != 0) - std::erase_if(logs, [record] (const auto & item) { return item.first >= record.header.index; }); + if (logs.contains(record.header.index)) + std::erase_if(logs, [&record] (const auto & item) { return item.first >= record.header.index; }); result.total_entries_read_from_log += 1; @@ -659,6 +659,7 @@ LogEntryPtr Changelog::getLatestConfigChange() const nuraft::ptr Changelog::serializeEntriesToBuffer(uint64_t index, int32_t count) { std::vector> returned_logs; + returned_logs.reserve(count); uint64_t size_total = 0; for (uint64_t i = index; i < index + count; ++i) @@ -669,7 +670,7 @@ nuraft::ptr Changelog::serializeEntriesToBuffer(uint64_t index, nuraft::ptr buf = entry->second->serialize(); size_total += buf->size(); - returned_logs.push_back(buf); + returned_logs.push_back(std::move(buf)); } nuraft::ptr buf_out = nuraft::buffer::alloc(sizeof(int32_t) + count * sizeof(int32_t) + size_total); @@ -678,9 +679,8 @@ nuraft::ptr Changelog::serializeEntriesToBuffer(uint64_t index, for (auto & entry : returned_logs) { - nuraft::ptr & bb = entry; - buf_out->put(static_cast(bb->size())); - buf_out->put(*bb); + buf_out->put(static_cast(entry->size())); + buf_out->put(*entry); } return buf_out; } @@ -699,7 +699,7 @@ void Changelog::applyEntriesFromBuffer(uint64_t index, nuraft::buffer & buffer) buffer.get(buf_local); LogEntryPtr log_entry = nuraft::log_entry::deserialize(*buf_local); - if (i == 0 && logs.count(cur_index)) + if (i == 0 && logs.contains(cur_index)) writeAt(cur_index, log_entry); else appendEntry(cur_index, log_entry); diff --git a/src/Coordination/KeeperDispatcher.cpp b/src/Coordination/KeeperDispatcher.cpp index a4dcb0acc52..4d71c11221e 100644 --- a/src/Coordination/KeeperDispatcher.cpp +++ 
b/src/Coordination/KeeperDispatcher.cpp @@ -121,7 +121,7 @@ void KeeperDispatcher::requestThread() current_batch.clear(); } - prev_batch = current_batch; + prev_batch = std::move(current_batch); prev_result = result; } diff --git a/src/Coordination/KeeperSnapshotManager.cpp b/src/Coordination/KeeperSnapshotManager.cpp index 429a76eec5e..43fc8b1ec0d 100644 --- a/src/Coordination/KeeperSnapshotManager.cpp +++ b/src/Coordination/KeeperSnapshotManager.cpp @@ -43,7 +43,7 @@ namespace void writeNode(const KeeperStorage::Node & node, SnapshotVersion version, WriteBuffer & out) { - writeBinary(node.data, out); + writeBinary(node.getData(), out); /// Serialize ACL writeBinary(node.acl_id, out); @@ -71,7 +71,9 @@ namespace void readNode(KeeperStorage::Node & node, ReadBuffer & in, SnapshotVersion version, ACLMap & acl_map) { - readBinary(node.data, in); + String new_data; + readBinary(new_data, in); + node.setData(std::move(new_data)); if (version >= SnapshotVersion::V1) { @@ -281,7 +283,7 @@ void KeeperStorageSnapshot::deserialize(SnapshotDeserializationResult & deserial if (itr.key != "/") { auto parent_path = parentPath(itr.key); - storage.container.updateValue(parent_path, [path = itr.key] (KeeperStorage::Node & value) { value.children.insert(getBaseName(path)); }); + storage.container.updateValue(parent_path, [path = itr.key] (KeeperStorage::Node & value) { value.addChild(getBaseName(path)); }); } } diff --git a/src/Coordination/KeeperStateMachine.cpp b/src/Coordination/KeeperStateMachine.cpp index 35e56ba1e30..315882ee988 100644 --- a/src/Coordination/KeeperStateMachine.cpp +++ b/src/Coordination/KeeperStateMachine.cpp @@ -259,22 +259,9 @@ void KeeperStateMachine::save_logical_snp_obj( { LOG_DEBUG(log, "Saving snapshot {} obj_id {}", s.get_last_log_idx(), obj_id); - nuraft::ptr cloned_buffer; - nuraft::ptr cloned_meta; - if (obj_id == 0) /// Fake snapshot required by NuRaft at startup - { - std::lock_guard lock(storage_and_responses_lock); - KeeperStorageSnapshot snapshot(storage.get(), s.get_last_log_idx(), getClusterConfig()); - cloned_buffer = snapshot_manager.serializeSnapshotToBuffer(snapshot); - } - else - { - /// copy snapshot into memory - } - /// copy snapshot meta into memory nuraft::ptr snp_buf = s.serialize(); - cloned_meta = nuraft::snapshot::deserialize(*snp_buf); + nuraft::ptr cloned_meta = nuraft::snapshot::deserialize(*snp_buf); try { @@ -332,31 +319,22 @@ int KeeperStateMachine::read_logical_snp_obj( { LOG_DEBUG(log, "Reading snapshot {} obj_id {}", s.get_last_log_idx(), obj_id); - if (obj_id == 0) /// Fake snapshot required by NuRaft at startup + + std::lock_guard lock(snapshots_lock); + /// Our snapshot is not equal to required. Maybe we still creating it in the background. + /// Let's wait and NuRaft will retry this call. + if (s.get_last_log_idx() != latest_snapshot_meta->get_last_log_idx()) { - data_out = nuraft::buffer::alloc(sizeof(int32_t)); - nuraft::buffer_serializer bs(data_out); - bs.put_i32(0); - is_last_obj = false; + LOG_WARNING(log, "Required to apply snapshot with last log index {}, but our last log index is {}. Will ignore this one and retry", + s.get_last_log_idx(), latest_snapshot_meta->get_last_log_idx()); + return -1; } - else + if (bufferFromFile(log, latest_snapshot_path, data_out)) { - std::lock_guard lock(snapshots_lock); - /// Our snapshot is not equal to required. Maybe we still creating it in the background. - /// Let's wait and NuRaft will retry this call. 
- if (s.get_last_log_idx() != latest_snapshot_meta->get_last_log_idx()) - { - LOG_WARNING(log, "Required to apply snapshot with last log index {}, but our last log index is {}. Will ignore this one and retry", - s.get_last_log_idx(), latest_snapshot_meta->get_last_log_idx()); - return -1; - } - if (bufferFromFile(log, latest_snapshot_path, data_out)) - { - LOG_WARNING(log, "Error reading snapshot {} from {}", s.get_last_log_idx(), latest_snapshot_path); - return -1; - } - is_last_obj = true; + LOG_WARNING(log, "Error reading snapshot {} from {}", s.get_last_log_idx(), latest_snapshot_path); + return -1; } + is_last_obj = true; return 1; } diff --git a/src/Coordination/KeeperStateManager.h b/src/Coordination/KeeperStateManager.h index a56bafb6bae..66c6cc03b87 100644 --- a/src/Coordination/KeeperStateManager.h +++ b/src/Coordination/KeeperStateManager.h @@ -84,7 +84,7 @@ public: bool shouldStartAsFollower() const { std::lock_guard lock(configuration_wrapper_mutex); - return configuration_wrapper.servers_start_as_followers.count(my_server_id); + return configuration_wrapper.servers_start_as_followers.contains(my_server_id); } bool isSecure() const diff --git a/src/Coordination/KeeperStorage.cpp b/src/Coordination/KeeperStorage.cpp index e13b43d056a..a066f7257ae 100644 --- a/src/Coordination/KeeperStorage.cpp +++ b/src/Coordination/KeeperStorage.cpp @@ -24,7 +24,10 @@ namespace ErrorCodes extern const int BAD_ARGUMENTS; } -static String base64Encode(const String & decoded) +namespace +{ + +String base64Encode(const String & decoded) { std::ostringstream ostr; // STYLE_CHECK_ALLOW_STD_STRING_STREAM ostr.exceptions(std::ios::failbit); @@ -35,7 +38,7 @@ static String base64Encode(const String & decoded) return ostr.str(); } -static String getSHA1(const String & userdata) +String getSHA1(const String & userdata) { Poco::SHA1Engine engine; engine.update(userdata); @@ -43,14 +46,14 @@ static String getSHA1(const String & userdata) return String{digest_id.begin(), digest_id.end()}; } -static String generateDigest(const String & userdata) +String generateDigest(const String & userdata) { std::vector user_password; boost::split(user_password, userdata, [](char c) { return c == ':'; }); return user_password[0] + ":" + base64Encode(getSHA1(userdata)); } -static bool checkACL(int32_t permission, const Coordination::ACLs & node_acls, const std::vector & session_auths) +bool checkACL(int32_t permission, const Coordination::ACLs & node_acls, const std::vector & session_auths) { if (node_acls.empty()) return true; @@ -77,7 +80,7 @@ static bool checkACL(int32_t permission, const Coordination::ACLs & node_acls, c return false; } -static bool fixupACL( +bool fixupACL( const std::vector & request_acls, const std::vector & current_ids, std::vector & result_acls) @@ -119,7 +122,7 @@ static bool fixupACL( return valid_found; } -static KeeperStorage::ResponsesForSessions processWatchesImpl(const String & path, KeeperStorage::Watches & watches, KeeperStorage::Watches & list_watches, Coordination::Event event_type) +KeeperStorage::ResponsesForSessions processWatchesImpl(const String & path, KeeperStorage::Watches & watches, KeeperStorage::Watches & list_watches, Coordination::Event event_type) { KeeperStorage::ResponsesForSessions result; auto it = watches.find(path); @@ -174,6 +177,25 @@ static KeeperStorage::ResponsesForSessions processWatchesImpl(const String & pat } return result; } +} + +void KeeperStorage::Node::setData(String new_data) +{ + size_bytes = size_bytes - data.size() + new_data.size(); + data = 
std::move(new_data); +} + +void KeeperStorage::Node::addChild(StringRef child_path) +{ + size_bytes += sizeof child_path; + children.insert(child_path); +} + +void KeeperStorage::Node::removeChild(StringRef child_path) +{ + size_bytes -= sizeof child_path; + children.erase(child_path); +} KeeperStorage::KeeperStorage(int64_t tick_time_ms, const String & superdigest_) : session_expiry_queue(tick_time_ms) @@ -314,8 +336,8 @@ struct KeeperStorageCreateRequestProcessor final : public KeeperStorageRequestPr created_node.stat.numChildren = 0; created_node.stat.dataLength = request.data.length(); created_node.stat.ephemeralOwner = request.is_ephemeral ? session_id : 0; - created_node.data = request.data; created_node.is_sequental = request.is_sequential; + created_node.setData(std::move(request.data)); auto [map_key, _] = container.insert(path_created, created_node); /// Take child path from key owned by map. @@ -327,8 +349,7 @@ struct KeeperStorageCreateRequestProcessor final : public KeeperStorageRequestPr container.updateValue(parent_path, [child_path, zxid, &prev_parent_zxid, parent_cversion, &prev_parent_cversion] (KeeperStorage::Node & parent) { - parent.children.insert(child_path); - parent.size_bytes += child_path.size; + parent.addChild(child_path); prev_parent_cversion = parent.stat.cversion; prev_parent_zxid = parent.stat.pzxid; @@ -363,8 +384,7 @@ struct KeeperStorageCreateRequestProcessor final : public KeeperStorageRequestPr --undo_parent.seq_num; undo_parent.stat.cversion = prev_parent_cversion; undo_parent.stat.pzxid = prev_parent_zxid; - undo_parent.children.erase(child_path); - undo_parent.size_bytes -= child_path.size; + undo_parent.removeChild(child_path); }); storage.container.erase(path_created); @@ -409,7 +429,7 @@ struct KeeperStorageGetRequestProcessor final : public KeeperStorageRequestProce else { response.stat = it->value.stat; - response.data = it->value.data; + response.data = it->value.getData(); response.error = Coordination::Error::ZOK; } @@ -498,8 +518,7 @@ struct KeeperStorageRemoveRequestProcessor final : public KeeperStorageRequestPr { --parent.stat.numChildren; ++parent.stat.cversion; - parent.children.erase(child_basename); - parent.size_bytes -= child_basename.size; + parent.removeChild(child_basename); }); response.error = Coordination::Error::ZOK; @@ -520,8 +539,7 @@ struct KeeperStorageRemoveRequestProcessor final : public KeeperStorageRequestPr { ++parent.stat.numChildren; --parent.stat.cversion; - parent.children.insert(child_name); - parent.size_bytes += child_name.size; + parent.addChild(child_name); }); }; } @@ -598,14 +616,13 @@ struct KeeperStorageSetRequestProcessor final : public KeeperStorageRequestProce auto prev_node = it->value; - auto itr = container.updateValue(request.path, [zxid, request, time] (KeeperStorage::Node & value) + auto itr = container.updateValue(request.path, [zxid, request, time] (KeeperStorage::Node & value) mutable { value.stat.version++; value.stat.mzxid = zxid; value.stat.mtime = time; value.stat.dataLength = request.data.length(); - value.size_bytes = value.size_bytes + request.data.size() - value.data.size(); - value.data = request.data; + value.setData(std::move(request.data)); }); container.updateValue(parentPath(request.path), [] (KeeperStorage::Node & parent) @@ -675,9 +692,10 @@ struct KeeperStorageListRequestProcessor final : public KeeperStorageRequestProc if (path_prefix.empty()) throw DB::Exception("Logical error: path cannot be empty", ErrorCodes::LOGICAL_ERROR); - 
response.names.reserve(it->value.children.size()); + const auto & children = it->value.getChildren(); + response.names.reserve(children.size()); - for (const auto child : it->value.children) + for (const auto child : children) response.names.push_back(child.toString()); response.stat = it->value.stat; @@ -856,24 +874,23 @@ struct KeeperStorageMultiRequestProcessor final : public KeeperStorageRequestPro for (const auto & sub_request : request.requests) { auto sub_zk_request = std::dynamic_pointer_cast(sub_request); - if (sub_zk_request->getOpNum() == Coordination::OpNum::Create) + switch (sub_zk_request->getOpNum()) { - concrete_requests.push_back(std::make_shared(sub_zk_request)); + case Coordination::OpNum::Create: + concrete_requests.push_back(std::make_shared(sub_zk_request)); + break; + case Coordination::OpNum::Remove: + concrete_requests.push_back(std::make_shared(sub_zk_request)); + break; + case Coordination::OpNum::Set: + concrete_requests.push_back(std::make_shared(sub_zk_request)); + break; + case Coordination::OpNum::Check: + concrete_requests.push_back(std::make_shared(sub_zk_request)); + break; + default: + throw DB::Exception(ErrorCodes::BAD_ARGUMENTS, "Illegal command as part of multi ZooKeeper request {}", sub_zk_request->getOpNum()); } - else if (sub_zk_request->getOpNum() == Coordination::OpNum::Remove) - { - concrete_requests.push_back(std::make_shared(sub_zk_request)); - } - else if (sub_zk_request->getOpNum() == Coordination::OpNum::Set) - { - concrete_requests.push_back(std::make_shared(sub_zk_request)); - } - else if (sub_zk_request->getOpNum() == Coordination::OpNum::Check) - { - concrete_requests.push_back(std::make_shared(sub_zk_request)); - } - else - throw DB::Exception(ErrorCodes::BAD_ARGUMENTS, "Illegal command as part of multi ZooKeeper request {}", sub_zk_request->getOpNum()); } } @@ -1092,8 +1109,7 @@ KeeperStorage::ResponsesForSessions KeeperStorage::processRequest(const Coordina --parent.stat.numChildren; ++parent.stat.cversion; auto base_name = getBaseName(ephemeral_path); - parent.children.erase(base_name); - parent.size_bytes -= base_name.size; + parent.removeChild(base_name); }); container.erase(ephemeral_path); diff --git a/src/Coordination/KeeperStorage.h b/src/Coordination/KeeperStorage.h index 89a42078bc9..ccbddcf6e19 100644 --- a/src/Coordination/KeeperStorage.h +++ b/src/Coordination/KeeperStorage.h @@ -32,28 +32,38 @@ public: struct Node { - String data; uint64_t acl_id = 0; /// 0 -- no ACL by default bool is_sequental = false; Coordination::Stat stat{}; int32_t seq_num = 0; - ChildrenSet children{}; uint64_t size_bytes; // save size to avoid calculate every time - Node() - { - size_bytes = sizeof(size_bytes); - size_bytes += data.size(); - size_bytes += sizeof(acl_id); - size_bytes += sizeof(is_sequental); - size_bytes += sizeof(stat); - size_bytes += sizeof(seq_num); - } + Node() : size_bytes(sizeof(Node)) { } + /// Object memory size uint64_t sizeInBytes() const { return size_bytes; } + + void setData(String new_data); + + const auto & getData() const noexcept + { + return data; + } + + void addChild(StringRef child_path); + + void removeChild(StringRef child_path); + + const auto & getChildren() const noexcept + { + return children; + } + private: + String data; + ChildrenSet children{}; }; struct ResponseForSession @@ -104,7 +114,7 @@ public: /// Mapping session_id -> set of ephemeral nodes paths Ephemerals ephemerals; - /// Mapping sessuib_id -> set of watched nodes paths + /// Mapping session_id -> set of watched nodes paths 
SessionAndWatcher sessions_and_watchers; /// Expiration queue for session, allows to get dead sessions at some point of time SessionExpiryQueue session_expiry_queue; diff --git a/src/Coordination/SnapshotableHashTable.h b/src/Coordination/SnapshotableHashTable.h index a02e090cb60..27572ab86c7 100644 --- a/src/Coordination/SnapshotableHashTable.h +++ b/src/Coordination/SnapshotableHashTable.h @@ -80,7 +80,7 @@ private: approximate_data_size += value_size; if (!snapshot_mode) { - approximate_data_size += key_size; + approximate_data_size -= key_size; approximate_data_size -= old_value_size; } } @@ -132,7 +132,6 @@ public: if (!it) { - ListElem elem{copyStringInArena(arena, key), value, current_version}; auto itr = list.insert(list.end(), std::move(elem)); bool inserted; @@ -228,7 +227,7 @@ public: /// We in snapshot mode but updating some node which is already more /// fresh than snapshot distance. So it will not participate in /// snapshot and we don't need to copy it. - if (snapshot_mode && list_itr->version <= snapshot_up_to_version) + if (list_itr->version <= snapshot_up_to_version) { auto elem_copy = *(list_itr); list_itr->active_in_map = false; diff --git a/src/Coordination/ZooKeeperDataReader.cpp b/src/Coordination/ZooKeeperDataReader.cpp index 8a3e177c507..e59c67329ff 100644 --- a/src/Coordination/ZooKeeperDataReader.cpp +++ b/src/Coordination/ZooKeeperDataReader.cpp @@ -98,7 +98,9 @@ int64_t deserializeStorageData(KeeperStorage & storage, ReadBuffer & in, Poco::L while (path != "/") { KeeperStorage::Node node{}; - Coordination::read(node.data, in); + String data; + Coordination::read(data, in); + node.setData(std::move(data)); Coordination::read(node.acl_id, in); /// Deserialize stat @@ -117,7 +119,7 @@ int64_t deserializeStorageData(KeeperStorage & storage, ReadBuffer & in, Poco::L Coordination::read(node.stat.pzxid, in); if (!path.empty()) { - node.stat.dataLength = node.data.length(); + node.stat.dataLength = node.getData().length(); node.seq_num = node.stat.cversion; storage.container.insertOrReplace(path, node); @@ -137,7 +139,7 @@ int64_t deserializeStorageData(KeeperStorage & storage, ReadBuffer & in, Poco::L if (itr.key != "/") { auto parent_path = parentPath(itr.key); - storage.container.updateValue(parent_path, [path = itr.key] (KeeperStorage::Node & value) { value.children.insert(getBaseName(path)); value.stat.numChildren++; }); + storage.container.updateValue(parent_path, [path = itr.key] (KeeperStorage::Node & value) { value.addChild(getBaseName(path)); value.stat.numChildren++; }); } } diff --git a/src/Coordination/tests/gtest_coordination.cpp b/src/Coordination/tests/gtest_coordination.cpp index 0fc00cbd75d..07544dfbb89 100644 --- a/src/Coordination/tests/gtest_coordination.cpp +++ b/src/Coordination/tests/gtest_coordination.cpp @@ -946,6 +946,8 @@ TEST_P(CoordinationTest, SnapshotableHashMapDataSize) EXPECT_EQ(hello.getApproximateDataSize(), 9); hello.updateValue("hello", [](IntNode & value) { value = 2; }); EXPECT_EQ(hello.getApproximateDataSize(), 9); + hello.insertOrReplace("hello", 3); + EXPECT_EQ(hello.getApproximateDataSize(), 9); hello.erase("hello"); EXPECT_EQ(hello.getApproximateDataSize(), 0); @@ -958,6 +960,8 @@ TEST_P(CoordinationTest, SnapshotableHashMapDataSize) EXPECT_EQ(hello.getApproximateDataSize(), 9); hello.updateValue("hello", [](IntNode & value) { value = 2; }); EXPECT_EQ(hello.getApproximateDataSize(), 18); + hello.insertOrReplace("hello", 1); + EXPECT_EQ(hello.getApproximateDataSize(), 27); hello.clearOutdatedNodes(); 
EXPECT_EQ(hello.getApproximateDataSize(), 9); From 3a91b17044e58bd5bdd126c5fa983b628886874f Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Fran=C3=A7ois=20Violette?= Date: Tue, 5 Apr 2022 16:10:06 +0200 Subject: [PATCH 186/239] Update Contentsquare company case --- docs/en/introduction/adopters.md | 2 +- docs/ja/introduction/adopters.md | 2 +- .../clickhouse-community-meetup-in-paris-on-october-2-2018.md | 2 +- 3 files changed, 3 insertions(+), 3 deletions(-) diff --git a/docs/en/introduction/adopters.md b/docs/en/introduction/adopters.md index ad199ce452e..20d6b20feb6 100644 --- a/docs/en/introduction/adopters.md +++ b/docs/en/introduction/adopters.md @@ -43,7 +43,7 @@ toc_title: Adopters | Citymobil | Taxi | Analytics | — | — | [Blog Post in Russian, March 2020](https://habr.com/en/company/citymobil/blog/490660/) | | Cloudflare | CDN | Traffic analysis | 36 servers | — | [Blog post, May 2017](https://blog.cloudflare.com/how-cloudflare-analyzes-1m-dns-queries-per-second/), [Blog post, March 2018](https://blog.cloudflare.com/http-analytics-for-6m-requests-per-second-using-clickhouse/) | | Comcast | Media | CDN Traffic Analysis | — | — | [ApacheCon 2019 Talk](https://www.youtube.com/watch?v=e9TZ6gFDjNg) | -| ContentSquare | Web analytics | Main product | — | — | [Blog post in French, November 2018](http://souslecapot.net/2018/11/21/patrick-chatain-vp-engineering-chez-contentsquare-penser-davantage-amelioration-continue-que-revolution-constante/) | +| Contentsquare | Web analytics | Main product | — | — | [Blog post in French, November 2018](http://souslecapot.net/2018/11/21/patrick-chatain-vp-engineering-chez-contentsquare-penser-davantage-amelioration-continue-que-revolution-constante/) | | Corunet | Analytics | Main product | — | — | [Slides in English, April 2019](https://github.com/ClickHouse/clickhouse-presentations/blob/master/meetup21/predictive_models.pdf) | | CraiditX 氪信 | Finance AI | Analysis | — | — | [Slides in English, November 2019](https://github.com/ClickHouse/clickhouse-presentations/blob/master/meetup33/udf.pptx) | | Crazypanda | Games | | — | — | Live session on ClickHouse meetup | diff --git a/docs/ja/introduction/adopters.md b/docs/ja/introduction/adopters.md index 6f878bf1dfe..3372bb74f12 100644 --- a/docs/ja/introduction/adopters.md +++ b/docs/ja/introduction/adopters.md @@ -27,7 +27,7 @@ toc_title: "\u30A2\u30C0\u30D7\u30BF\u30FC" | Cisco | ネットワーク | トラフィック分析 | — | — | [ライトニングトーク2019](https://youtu.be/-hI1vDR2oPY?t=5057) | | Citadel Securities | 金融 | — | — | — | [2019年の貢献](https://github.com/ClickHouse/ClickHouse/pull/4774) | | シティモービル | タクシー | 分析 | — | — | [ロシア語でのブログ投稿,月2020](https://habr.com/en/company/citymobil/blog/490660/) | -| ContentSquare | ウェブ分析 | 主な製品 | — | — | [フランス語でのブログ投稿,November2018](http://souslecapot.net/2018/11/21/patrick-chatain-vp-engineering-chez-contentsquare-penser-davantage-amelioration-continue-que-revolution-constante/) | +| Contentsquare | ウェブ分析 | 主な製品 | — | — | [フランス語でのブログ投稿,November2018](http://souslecapot.net/2018/11/21/patrick-chatain-vp-engineering-chez-contentsquare-penser-davantage-amelioration-continue-que-revolution-constante/) | | Cloudflare | CDN | トラフィック分析 | 36台のサーバー | — | [ブログ投稿,月2017](https://blog.cloudflare.com/how-cloudflare-analyzes-1m-dns-queries-per-second/), [ブログ投稿,月2018](https://blog.cloudflare.com/http-analytics-for-6m-requests-per-second-using-clickhouse/) | | コルネット | 分析 | 主な製品 | — | — | [2019年英語スライド](https://github.com/ClickHouse/clickhouse-presentations/blob/master/meetup21/predictive_models.pdf) | | CraiditX 氪信 | 
ファイナンスAI | 分析 | — | — | [2019年のスライド](https://github.com/ClickHouse/clickhouse-presentations/blob/master/meetup33/udf.pptx) | diff --git a/website/blog/en/2018/clickhouse-community-meetup-in-paris-on-october-2-2018.md b/website/blog/en/2018/clickhouse-community-meetup-in-paris-on-october-2-2018.md index a8c5c2a92dd..f94d2de411c 100644 --- a/website/blog/en/2018/clickhouse-community-meetup-in-paris-on-october-2-2018.md +++ b/website/blog/en/2018/clickhouse-community-meetup-in-paris-on-october-2-2018.md @@ -7,7 +7,7 @@ tags: ['meetup', 'Paris', 'France', 'events'] Agenda of Paris ClickHouse Meetup was full of use cases, mostly from France-based companies which are actively using ClickHouse. Slides for all talks are [available on the GitHub](https://github.com/clickhouse/clickhouse-presentations/tree/master/meetup18). -Christophe Kalenzaga and Vianney Foucault, engineers from ContentSquare, company that provided the meetup venue: +Christophe Kalenzaga and Vianney Foucault, engineers from Contentsquare, company that provided the meetup venue: ![Christophe Kalenzaga and Vianney Foucault](https://blog-images.clickhouse.com/en/2018/clickhouse-community-meetup-in-paris-on-october-2-2018/1.jpg) Matthieu Jacquet from Storetail (Criteo): From 39cc3baf5f358e5e44fd7d1dc5b6ca9121eca24f Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Ra=C3=BAl=20Mar=C3=ADn?= Date: Tue, 5 Apr 2022 16:37:53 +0200 Subject: [PATCH 187/239] Keep the previous exception code --- src/Common/DNSResolver.cpp | 9 +++++++-- 1 file changed, 7 insertions(+), 2 deletions(-) diff --git a/src/Common/DNSResolver.cpp b/src/Common/DNSResolver.cpp index fcf175fc9bf..66f67caf456 100644 --- a/src/Common/DNSResolver.cpp +++ b/src/Common/DNSResolver.cpp @@ -123,7 +123,7 @@ static DNSResolver::IPAddresses resolveIPAddressImpl(const std::string & host) } if (addresses.empty()) - throw Poco::Net::DNSException("Not found address of host: " + host); + throw Exception("Not found address of host: " + host, ErrorCodes::DNS_ERROR); return addresses; } @@ -268,8 +268,13 @@ bool DNSResolver::updateCacheImpl( updated |= (this->*update_func)(it->first); it->second = 0; } - catch (const Poco::Net::NetException &) + catch (const DB::Exception & e) { + if (e.code() != ErrorCodes::DNS_ERROR) + { + tryLogCurrentException(log, __PRETTY_FUNCTION__); + continue; + } ProfileEvents::increment(ProfileEvents::DNSError); if (!lost_elems.empty()) lost_elems += ", "; From dce0c9f059d95c63a4108bb19b4aabc12b91c97e Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Ra=C3=BAl=20Mar=C3=ADn?= Date: Tue, 5 Apr 2022 17:19:56 +0200 Subject: [PATCH 188/239] Python format --- tests/integration/test_dns_cache/test.py | 20 +++++++++++++++----- 1 file changed, 15 insertions(+), 5 deletions(-) diff --git a/tests/integration/test_dns_cache/test.py b/tests/integration/test_dns_cache/test.py index 5ac0a393505..820ff221f55 100644 --- a/tests/integration/test_dns_cache/test.py +++ b/tests/integration/test_dns_cache/test.py @@ -287,12 +287,22 @@ def test_user_access_ip_change(cluster_with_dns_cache_update, node): ) -def test_host_is_drop_from_cache_after_consecutive_failures(cluster_with_dns_cache_update): +def test_host_is_drop_from_cache_after_consecutive_failures( + cluster_with_dns_cache_update, +): with pytest.raises(QueryRuntimeException): - node4.query("SELECT * FROM remote('InvalidHostThatDoesNotExist', 'system', 'one')") + node4.query( + "SELECT * FROM remote('InvalidHostThatDoesNotExist', 'system', 'one')" + ) # Note that the list of hosts in variable since lost_host will be there too (and it's 
dropped and added back) # dns_update_short -> dns_max_consecutive_failures set to 6 - assert node4.wait_for_log_line("Cannot resolve host \\(InvalidHostThatDoesNotExist\\), error 0: Host not found.") - assert node4.wait_for_log_line("Cached hosts not found:.*InvalidHostThatDoesNotExist**", repetitions=6) - assert node4.wait_for_log_line("Cached hosts dropped:.*InvalidHostThatDoesNotExist.*") + assert node4.wait_for_log_line( + "Cannot resolve host \\(InvalidHostThatDoesNotExist\\), error 0: Host not found." + ) + assert node4.wait_for_log_line( + "Cached hosts not found:.*InvalidHostThatDoesNotExist**", repetitions=6 + ) + assert node4.wait_for_log_line( + "Cached hosts dropped:.*InvalidHostThatDoesNotExist.*" + ) From 6eff1d2b02983738fbb30786c8fb5de7c9df43be Mon Sep 17 00:00:00 2001 From: Maksim Kita Date: Tue, 5 Apr 2022 17:30:03 +0200 Subject: [PATCH 189/239] Fixed tests --- tests/queries/0_stateless/02252_jit_profile_events.sql | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/tests/queries/0_stateless/02252_jit_profile_events.sql b/tests/queries/0_stateless/02252_jit_profile_events.sql index 561e25505bc..ddb95d4fa37 100644 --- a/tests/queries/0_stateless/02252_jit_profile_events.sql +++ b/tests/queries/0_stateless/02252_jit_profile_events.sql @@ -1,4 +1,4 @@ --- Tags: no-fasttest +-- Tags: no-fasttest, no-ubsan, no-cpu-aarch64 SET compile_expressions = 1; SET min_count_to_compile_expression = 0; From 37a06eec1a062e5ac8ed9a179051491ea98e285c Mon Sep 17 00:00:00 2001 From: Alexander Tokmakov Date: Tue, 5 Apr 2022 17:36:53 +0200 Subject: [PATCH 190/239] fixes --- src/Databases/DatabaseReplicated.cpp | 2 -- src/Databases/DatabaseReplicatedWorker.cpp | 11 +++++---- src/Databases/DatabaseReplicatedWorker.h | 2 +- src/Interpreters/DDLWorker.cpp | 3 +-- src/Storages/StorageReplicatedMergeTree.cpp | 25 ++++++++++----------- src/Storages/StorageReplicatedMergeTree.h | 3 ++- 6 files changed, 23 insertions(+), 23 deletions(-) diff --git a/src/Databases/DatabaseReplicated.cpp b/src/Databases/DatabaseReplicated.cpp index 2337a063a5e..d94eceb7dec 100644 --- a/src/Databases/DatabaseReplicated.cpp +++ b/src/Databases/DatabaseReplicated.cpp @@ -674,8 +674,6 @@ void DatabaseReplicated::recoverLostReplica(const ZooKeeperPtr & current_zookeep LOG_INFO(log, "Marked recovered {} as finished", entry_name); } } - current_zookeeper->set(replica_path + "/log_ptr", toString(max_log_ptr)); - ddl_worker->updateLogPointer(DDLTaskBase::getLogEntryName(max_log_ptr)); } std::map DatabaseReplicated::tryGetConsistentMetadataSnapshot(const ZooKeeperPtr & zookeeper, UInt32 & max_log_ptr) diff --git a/src/Databases/DatabaseReplicatedWorker.cpp b/src/Databases/DatabaseReplicatedWorker.cpp index e9475b56377..84c3f857a81 100644 --- a/src/Databases/DatabaseReplicatedWorker.cpp +++ b/src/Databases/DatabaseReplicatedWorker.cpp @@ -68,11 +68,14 @@ void DatabaseReplicatedDDLWorker::initializeReplication() if (our_log_ptr == 0 || our_log_ptr + logs_to_keep < max_log_ptr) { database->recoverLostReplica(zookeeper, our_log_ptr, max_log_ptr); + zookeeper->set(database->replica_path + "/log_ptr", toString(max_log_ptr)); + initializeLogPointer(DDLTaskBase::getLogEntryName(max_log_ptr)); } else { - last_skipped_entry_name.emplace(DDLTaskBase::getLogEntryName(our_log_ptr)); - updateLogPointer(DDLTaskBase::getLogEntryName(our_log_ptr)); + String log_entry_name = DDLTaskBase::getLogEntryName(our_log_ptr); + last_skipped_entry_name.emplace(log_entry_name); + initializeLogPointer(log_entry_name); } } @@ -313,7 +316,7 @@ bool 
DatabaseReplicatedDDLWorker::canRemoveQueueEntry(const String & entry_name, return entry_number + logs_to_keep < max_log_ptr; } -void DatabaseReplicatedDDLWorker::updateLogPointer(const String & processed_entry_name) +void DatabaseReplicatedDDLWorker::initializeLogPointer(const String & processed_entry_name) { updateMaxDDLEntryID(processed_entry_name); assert(max_id.load() == parse(getAndSetZooKeeper()->get(fs::path(database->replica_path) / "log_ptr"))); @@ -321,7 +324,7 @@ void DatabaseReplicatedDDLWorker::updateLogPointer(const String & processed_entr UInt32 DatabaseReplicatedDDLWorker::getLogPointer() const { - /// NOTE it main not be equal to the log_ptr in zk: + /// NOTE it may not be equal to the log_ptr in zk: /// - max_id can be equal to log_ptr - 1 due to race condition (when it's updated in zk, but not updated in memory yet) /// - max_id can be greater than log_ptr, because log_ptr is not updated for failed and dummy entries return max_id.load(); diff --git a/src/Databases/DatabaseReplicatedWorker.h b/src/Databases/DatabaseReplicatedWorker.h index e23be472c54..3c53d288841 100644 --- a/src/Databases/DatabaseReplicatedWorker.h +++ b/src/Databases/DatabaseReplicatedWorker.h @@ -32,11 +32,11 @@ public: static String enqueueQueryImpl(const ZooKeeperPtr & zookeeper, DDLLogEntry & entry, DatabaseReplicated * const database, bool committed = false); /// NOLINT - void updateLogPointer(const String & processed_entry_name); UInt32 getLogPointer() const; private: bool initializeMainThread() override; void initializeReplication(); + void initializeLogPointer(const String & processed_entry_name); DDLTaskPtr initAndCheckTask(const String & entry_name, String & out_reason, const ZooKeeperPtr & zookeeper) override; bool canRemoveQueueEntry(const String & entry_name, const Coordination::Stat & stat) override; diff --git a/src/Interpreters/DDLWorker.cpp b/src/Interpreters/DDLWorker.cpp index ba5de0c6668..9af6b61a0c1 100644 --- a/src/Interpreters/DDLWorker.cpp +++ b/src/Interpreters/DDLWorker.cpp @@ -632,8 +632,6 @@ void DDLWorker::processTask(DDLTaskBase & task, const ZooKeeperPtr & zookeeper) task.was_executed = true; } - updateMaxDDLEntryID(task.entry_name); - /// Step 3: Create node in finished/ status dir and write execution status. /// FIXME: if server fails right here, the task will be executed twice. We need WAL here. /// NOTE: If ZooKeeper connection is lost here, we will try again to write query status. 
@@ -650,6 +648,7 @@ void DDLWorker::processTask(DDLTaskBase & task, const ZooKeeperPtr & zookeeper) active_node->setAlreadyRemoved(); task.completely_processed = true; + updateMaxDDLEntryID(task.entry_name); } diff --git a/src/Storages/StorageReplicatedMergeTree.cpp b/src/Storages/StorageReplicatedMergeTree.cpp index ff37b98bbb6..1127337adff 100644 --- a/src/Storages/StorageReplicatedMergeTree.cpp +++ b/src/Storages/StorageReplicatedMergeTree.cpp @@ -828,13 +828,14 @@ void StorageReplicatedMergeTree::drop() throw Exception("Can't drop readonly replicated table (need to drop data in ZooKeeper as well)", ErrorCodes::TABLE_IS_READ_ONLY); shutdown(); - dropReplica(zookeeper, zookeeper_path, replica_name, log); + dropReplica(zookeeper, zookeeper_path, replica_name, log, getSettings()); } dropAllData(); } -void StorageReplicatedMergeTree::dropReplica(zkutil::ZooKeeperPtr zookeeper, const String & zookeeper_path, const String & replica, Poco::Logger * logger) +void StorageReplicatedMergeTree::dropReplica(zkutil::ZooKeeperPtr zookeeper, const String & zookeeper_path, const String & replica, + Poco::Logger * logger, MergeTreeSettingsPtr table_settings) { if (zookeeper->expired()) throw Exception("Table was not dropped because ZooKeeper session has expired.", ErrorCodes::TABLE_WAS_NOT_DROPPED); @@ -871,12 +872,14 @@ void StorageReplicatedMergeTree::dropReplica(zkutil::ZooKeeperPtr zookeeper, con assert(code == Coordination::Error::ZOK || code == Coordination::Error::ZNONODE); /// Then try to remove paths that are known to be flat (all children are leafs) - Strings flat_nodes = {"flags", "parts", "queue"}; + Strings flat_nodes = {"flags", "queue"}; + if (table_settings && table_settings->use_minimalistic_part_header_in_zookeeper) + flat_nodes.emplace_back("parts"); for (const auto & node : flat_nodes) { - bool removed_quickly = zookeeper->tryRemoveChildrenRecursive(fs::path(zookeeper_path) / node, /* probably flat */ true); + bool removed_quickly = zookeeper->tryRemoveChildrenRecursive(fs::path(remote_replica_path) / node, /* probably flat */ true); if (!removed_quickly) - LOG_WARNING(logger, "Cannot quickly remove node {} and its children (replica: {}). Will remove recursively.", + LOG_WARNING(logger, "Failed to quickly remove node '{}' and its children, fell back to recursive removal (replica: {})", node, remote_replica_path); } @@ -899,7 +902,6 @@ void StorageReplicatedMergeTree::dropReplica(zkutil::ZooKeeperPtr zookeeper, con LOG_WARNING(logger, "Cannot quickly remove nodes without children: {} (replica: {}). Will remove recursively.", Coordination::errorMessage(code), remote_replica_path); - /// And finally remove everything else recursively zookeeper->tryRemoveRecursive(remote_replica_path); } @@ -962,14 +964,16 @@ bool StorageReplicatedMergeTree::removeTableNodesFromZooKeeper(zkutil::ZooKeeper { bool completely_removed = false; - Strings flat_nodes = {"block_numbers", "blocks", "leader_election", "log", "mutations", "pinned_part_uuids", "log"}; + /// NOTE /block_numbers/ actually is not flat, because /block_numbers// may have ephemeral children, + /// but we assume that all ephemeral block locks are already removed when table is being dropped. 
+ static constexpr std::array flat_nodes = {"block_numbers", "blocks", "leader_election", "log", "mutations", "pinned_part_uuids"}; /// First try to remove paths that are known to be flat for (const auto & node : flat_nodes) { bool removed_quickly = zookeeper->tryRemoveChildrenRecursive(fs::path(zookeeper_path) / node, /* probably flat */ true); if (!removed_quickly) - LOG_WARNING(logger, "Cannot quickly remove node {} and its children (table: {}). Will remove recursively.", + LOG_WARNING(logger, "Failed to quickly remove node '{}' and its children, fell back to recursive removal (table: {})", node, zookeeper_path); } @@ -982,11 +986,6 @@ bool StorageReplicatedMergeTree::removeTableNodesFromZooKeeper(zkutil::ZooKeeper ops.emplace_back(zkutil::makeRemoveRequest(zookeeper_path + "/columns", -1)); ops.emplace_back(zkutil::makeRemoveRequest(zookeeper_path + "/metadata", -1)); ops.emplace_back(zkutil::makeRemoveRequest(zookeeper_path + "/table_shared_id", -1)); - ops.emplace_back(zkutil::makeRemoveRequest(zookeeper_path + "/max_processed_insert_time", -1)); - ops.emplace_back(zkutil::makeRemoveRequest(zookeeper_path + "/min_unprocessed_insert_time", -1)); - ops.emplace_back(zkutil::makeRemoveRequest(zookeeper_path + "/metadata_version", -1)); - ops.emplace_back(zkutil::makeRemoveRequest(zookeeper_path + "/mutation_pointer", -1)); - ops.emplace_back(zkutil::makeRemoveRequest(zookeeper_path + "/table_shared_id", -1)); Coordination::Responses res; auto code = zookeeper->tryMulti(ops, res); if (code != Coordination::Error::ZOK) diff --git a/src/Storages/StorageReplicatedMergeTree.h b/src/Storages/StorageReplicatedMergeTree.h index c567447e9f2..317544c8bb8 100644 --- a/src/Storages/StorageReplicatedMergeTree.h +++ b/src/Storages/StorageReplicatedMergeTree.h @@ -214,7 +214,8 @@ public: /** Remove a specific replica from zookeeper. */ - static void dropReplica(zkutil::ZooKeeperPtr zookeeper, const String & zookeeper_path, const String & replica, Poco::Logger * logger); + static void dropReplica(zkutil::ZooKeeperPtr zookeeper, const String & zookeeper_path, const String & replica, + Poco::Logger * logger, MergeTreeSettingsPtr table_settings = nullptr); /// Removes table from ZooKeeper after the last replica was dropped static bool removeTableNodesFromZooKeeper(zkutil::ZooKeeperPtr zookeeper, const String & zookeeper_path, From 3c0c1a11761f8d491ed5342c75ba24cc563db91b Mon Sep 17 00:00:00 2001 From: Alexey Milovidov Date: Tue, 5 Apr 2022 18:35:23 +0200 Subject: [PATCH 191/239] Add a comment #35919 --- programs/client/Client.cpp | 14 ++++++++++++++ 1 file changed, 14 insertions(+) diff --git a/programs/client/Client.cpp b/programs/client/Client.cpp index b632849484c..cae74df97da 100644 --- a/programs/client/Client.cpp +++ b/programs/client/Client.cpp @@ -163,9 +163,23 @@ void Client::initialize(Poco::Util::Application & self) configReadClient(config(), home_path); + /** getenv is thread-safe in Linux glibc and in all sane libc implementations. + * But the standard does not guarantee that subsequent calls will not rewrite the value by returned pointer. + * + * man getenv: + * + * As typically implemented, getenv() returns a pointer to a string within the environment list. + * The caller must take care not to modify this string, since that would change the environment of + * the process. + * + * The implementation of getenv() is not required to be reentrant. 
The string pointed to by the return value of getenv() + * may be statically allocated, and can be modified by a subsequent call to getenv(), putenv(3), setenv(3), or unsetenv(3). + */ + const char * env_user = getenv("CLICKHOUSE_USER"); if (env_user) config().setString("user", env_user); + const char * env_password = getenv("CLICKHOUSE_PASSWORD"); if (env_password) config().setString("password", env_password); From acaeaf28422d2e9a21148c99d3c7d8421b7ab37a Mon Sep 17 00:00:00 2001 From: LAL2211 Date: Tue, 5 Apr 2022 12:46:54 -0400 Subject: [PATCH 192/239] black check formatted --- tests/integration/helpers/config_cluster.py | 38 +++++++++------------ 1 file changed, 17 insertions(+), 21 deletions(-) diff --git a/tests/integration/helpers/config_cluster.py b/tests/integration/helpers/config_cluster.py index cb4bc6286ff..1912d9be06b 100644 --- a/tests/integration/helpers/config_cluster.py +++ b/tests/integration/helpers/config_cluster.py @@ -1,35 +1,31 @@ - # MYSQL CREDENTIALS -mysql_user = 'root' -mysql_pass = 'clickhouse' +mysql_user = "root" +mysql_pass = "clickhouse" # MYSQL8 CREDENTIALS -mysql8_user = 'root' -mysql8_pass = 'clickhouse' +mysql8_user = "root" +mysql8_pass = "clickhouse" # POSTGRES CREDENTIALS -pg_user = 'postgres' -pg_pass = 'mysecretpassword' -pg_db = 'postgres' +pg_user = "postgres" +pg_pass = "mysecretpassword" +pg_db = "postgres" # MINIO CREDENTIALS -minio_access_key="minio" -minio_secret_key="minio123" +minio_access_key = "minio" +minio_secret_key = "minio123" # MONGODB CREDENTIALS -mongo_user = 'root' -mongo_pass = 'clickhouse' +mongo_user = "root" +mongo_pass = "clickhouse" # ODBC CREDENTIALS -odbc_mysql_uid = 'root' -odbc_mysql_pass = 'clickhouse' -odbc_mysql_db = 'clickhouse' - -odbc_psql_db = 'postgres' -odbc_psql_user = 'postgres' -odbc_psql_pass = 'mysecretpassword' - - +odbc_mysql_uid = "root" +odbc_mysql_pass = "clickhouse" +odbc_mysql_db = "clickhouse" +odbc_psql_db = "postgres" +odbc_psql_user = "postgres" +odbc_psql_pass = "mysecretpassword" From 2298b80ed8174be49c59c0ae7ed9739ba38e5a37 Mon Sep 17 00:00:00 2001 From: Boris Kuschel Date: Mon, 14 Mar 2022 09:51:50 -0400 Subject: [PATCH 193/239] Add coverity scan Signed-off-by: Boris Kuschel --- .github/workflows/nightly.yml | 49 +++++++++++++++++++++++++++++++++ docker/packager/binary/build.sh | 18 ++++++++++-- docker/packager/packager | 11 ++++++-- tests/ci/build_check.py | 7 +++++ tests/ci/ci_config.py | 11 ++++++++ 5 files changed, 91 insertions(+), 5 deletions(-) diff --git a/.github/workflows/nightly.yml b/.github/workflows/nightly.yml index 5b47f94a324..b9541589a64 100644 --- a/.github/workflows/nightly.yml +++ b/.github/workflows/nightly.yml @@ -72,3 +72,52 @@ jobs: with: name: changed_images path: ${{ runner.temp }}/changed_images.json + BuilderCoverity: + needs: DockerHubPush + runs-on: [self-hosted, builder] + steps: + - name: Set envs + run: | + cat >> "$GITHUB_ENV" << 'EOF' + TEMP_PATH=${{runner.temp}}/build_check + IMAGES_PATH=${{runner.temp}}/images_path + REPO_COPY=${{runner.temp}}/build_check/ClickHouse + CACHES_PATH=${{runner.temp}}/../ccaches + CHECK_NAME=ClickHouse build check (actions) + BUILD_NAME=coverity + EOF + - name: Download changed images + uses: actions/download-artifact@v2 + with: + name: changed_images + path: ${{ env.IMAGES_PATH }} + - name: Clear repository + run: | + sudo rm -fr "$GITHUB_WORKSPACE" && mkdir "$GITHUB_WORKSPACE" + - name: Check out repository code + id: coverity-checkout + uses: actions/checkout@v2 + with: + submodules: 'true' + fetch-depth: 0 # otherwise we 
will have no info about contributors + - name: Build + run: | + sudo rm -fr "$TEMP_PATH" + mkdir -p "$TEMP_PATH" + cp -r "$GITHUB_WORKSPACE" "$TEMP_PATH" + cd "$REPO_COPY/tests/ci" && python3 build_check.py "$CHECK_NAME" "$BUILD_NAME" "$${{ secrets.COV_TOKEN }}" + - name: Upload Coverity Analysis + if: ${{ success() || failure() }} + run: | + curl --form token=$${{ secrets.COV_TOKEN }} \ + --form email=${{ secrets.ROBOT_CLICKHOUSE_EMAIL }} \ + --form file=@$TEMP_PATH/$BUILD_NAME/clickhouse-scan.tgz \ + --form version="${GITHUB_REF#refs/heads/}-${GITHUB_SHA::6}" \ + --form description="Nighly Scan: $(date +'%Y-%m-%dT%H:%M:%S')" \ + https://scan.coverity.com/builds?project=ClickHouse%2FClickHouse + - name: Cleanup + if: always() + run: | + docker kill "$(docker ps -q)" ||: + docker rm -f "$(docker ps -a -q)" ||: + sudo rm -fr "$TEMP_PATH" "$CACHES_PATH" diff --git a/docker/packager/binary/build.sh b/docker/packager/binary/build.sh index 31416e1a0ee..269d3eb52c6 100755 --- a/docker/packager/binary/build.sh +++ b/docker/packager/binary/build.sh @@ -25,13 +25,21 @@ read -ra CMAKE_FLAGS <<< "${CMAKE_FLAGS:-}" env cmake --debug-trycompile --verbose=1 -DCMAKE_VERBOSE_MAKEFILE=1 -LA "-DCMAKE_BUILD_TYPE=$BUILD_TYPE" "-DSANITIZE=$SANITIZER" -DENABLE_CHECK_HEAVY_BUILDS=1 "${CMAKE_FLAGS[@]}" .. +if [ "coverity" == "$COMBINED_OUTPUT" ] +then + wget --post-data "token=$COV_TOKEN&project=ClickHouse%2FClickHouse" -qO- https://scan.coverity.com/download/linux64 | tar xz -C /opt/cov-analysis --strip-components 1 + export PATH=$PATH:/opt/cov-analysis/bin + cov-configure --config ./coverity.config --template --comptype clangcc --compiler "$CC" + SCAN_WRAPPER="cov-build --config ./coverity.config --dir cov-int" +fi + cache_status # clear cache stats ccache --zero-stats ||: # No quotes because I want it to expand to nothing if empty. -# shellcheck disable=SC2086 -ninja $NINJA_FLAGS clickhouse-bundle +# shellcheck disable=SC2086 # No quotes because I want it to expand to nothing if empty. +$SCAN_WRAPPER ninja $NINJA_FLAGS clickhouse-bundle cache_status @@ -91,6 +99,12 @@ then mv "$COMBINED_OUTPUT.tgz" /output fi +if [ "coverity" == "$COMBINED_OUTPUT" ] +then + tar -cv -I pigz -f "coverity-scan.tgz" cov-int + mv "coverity-scan.tgz" /output +fi + # Also build fuzzers if any sanitizer specified # if [ -n "$SANITIZER" ] # then diff --git a/docker/packager/packager b/docker/packager/packager index f82d402d613..1a79b497fa2 100755 --- a/docker/packager/packager +++ b/docker/packager/packager @@ -86,6 +86,7 @@ def parse_env_variables( additional_pkgs, with_coverage, with_binaries, + coverity_scan, ): DARWIN_SUFFIX = "-darwin" DARWIN_ARM_SUFFIX = "-darwin-aarch64" @@ -176,6 +177,9 @@ def parse_env_variables( if package_type == "performance": result.append("COMBINED_OUTPUT=performance") cmake_flags.append("-DENABLE_TESTS=0") + elif package_type == "coverity": + result.append("COMBINED_OUTPUT=coverity") + result.append("COV_TOKEN={}".format(cov_token)) elif split_binary: result.append("COMBINED_OUTPUT=shared_build") @@ -262,9 +266,8 @@ if __name__ == "__main__": # and configs to be used for performance test. 
parser.add_argument( "--package-type", - choices=("deb", "binary", "performance"), + choices=["deb", "binary", "performance", "coverity"], required=True, - help="a build type", ) parser.add_argument( "--clickhouse-repo-path", @@ -325,12 +328,13 @@ if __name__ == "__main__": parser.add_argument( "--docker-image-version", default="latest", help="docker image tag to use" ) + parser.add_argument("--cov_token", default="") args = parser.parse_args() if not os.path.isabs(args.output_dir): args.output_dir = os.path.abspath(os.path.join(os.getcwd(), args.output_dir)) - image_type = "binary" if args.package_type == "performance" else args.package_type + image_type = "binary" if args.package_type in ("performance", "coverity") else args.package_type image_name = "clickhouse/binary-builder" if not os.path.isabs(args.clickhouse_repo_path): @@ -372,6 +376,7 @@ if __name__ == "__main__": args.additional_pkgs, args.with_coverage, args.with_binaries, + args.cov_token, ) run_docker_image_with_env( diff --git a/tests/ci/build_check.py b/tests/ci/build_check.py index 8ef723454d5..1dcfb9d01d3 100644 --- a/tests/ci/build_check.py +++ b/tests/ci/build_check.py @@ -55,6 +55,7 @@ def get_packager_cmd( image_version: str, ccache_path: str, official: bool, + cov_token: str, ) -> str: package_type = build_config["package_type"] comp = build_config["compiler"] @@ -87,6 +88,8 @@ def get_packager_cmd( if official: cmd += " --official" + if cov_token: + cmd += " --cov-token={}".format(cov_token) return cmd @@ -203,6 +206,9 @@ def main(): build_check_name = sys.argv[1] build_name = sys.argv[2] + cov_token = "" + if len(sys.argv) > 3: + cov_token = sys.argv[3] build_config = get_build_config(build_check_name, build_name) @@ -297,6 +303,7 @@ def main(): image_version, ccache_path, official=official_flag, + cov_token, ) logging.info("Going to run packager with %s", packager_cmd) diff --git a/tests/ci/ci_config.py b/tests/ci/ci_config.py index 7a183fd0bd6..b4f13817896 100644 --- a/tests/ci/ci_config.py +++ b/tests/ci/ci_config.py @@ -28,6 +28,16 @@ CI_CONFIG = { "tidy": "disable", "with_coverage": False, }, + "coverity": { + "compiler": "clang-13", + "build_type": "", + "sanitizer": "", + "package_type": "coverity", + "bundled": "bundled", + "splitted": "unsplitted", + "tidy": "disable", + "with_coverage": False, + }, "binary_gcc": { "compiler": "gcc-11", "build_type": "", @@ -190,6 +200,7 @@ CI_CONFIG = { "ClickHouse build check (actions)": [ "package_release", "performance", + "coverity", "package_aarch64", "package_asan", "package_ubsan", From 3412be9d4d6f13001734d4e34dabedbf1692d9ba Mon Sep 17 00:00:00 2001 From: Mikhail Filimonov Date: Tue, 5 Apr 2022 20:25:05 +0200 Subject: [PATCH 194/239] Change KafkaDirectReads to KafkaConsumersInUse --- src/Common/CurrentMetrics.cpp | 2 +- src/Storages/Kafka/StorageKafka.cpp | 5 +++-- 2 files changed, 4 insertions(+), 3 deletions(-) diff --git a/src/Common/CurrentMetrics.cpp b/src/Common/CurrentMetrics.cpp index d49fc02084f..4df1c4eaec8 100644 --- a/src/Common/CurrentMetrics.cpp +++ b/src/Common/CurrentMetrics.cpp @@ -86,7 +86,7 @@ M(KafkaProducers, "Number of active Kafka producer created") \ M(KafkaLibrdkafkaThreads, "Number of active librdkafka threads") \ M(KafkaBackgroundReads, "Number of background reads currently working (populating materialized views from Kafka)") \ - M(KafkaDirectReads, "Number of direct selects from Kafka currently executing") \ + M(KafkaConsumersInUse, "Number of consumers which are currently used by direct or background reads") \ M(KafkaWrites, "Number 
of currently running inserts to Kafka") \ M(KafkaAssignedPartitions, "Number of partitions Kafka tables currently assigned to") \ diff --git a/src/Storages/Kafka/StorageKafka.cpp b/src/Storages/Kafka/StorageKafka.cpp index 722c55e6c93..32c6fd1a655 100644 --- a/src/Storages/Kafka/StorageKafka.cpp +++ b/src/Storages/Kafka/StorageKafka.cpp @@ -49,7 +49,7 @@ namespace CurrentMetrics { extern const Metric KafkaLibrdkafkaThreads; extern const Metric KafkaBackgroundReads; - extern const Metric KafkaDirectReads; + extern const Metric KafkaConsumersInUse; extern const Metric KafkaWrites; } @@ -301,7 +301,6 @@ Pipe StorageKafka::read( if (mv_attached) throw Exception(ErrorCodes::QUERY_NOT_ALLOWED, "Cannot read from StorageKafka with attached materialized views"); - CurrentMetrics::Increment metric_increment{CurrentMetrics::KafkaDirectReads}; ProfileEvents::increment(ProfileEvents::KafkaDirectReads); /// Always use all consumers at once, otherwise SELECT may not read messages from all partitions. @@ -386,6 +385,7 @@ void StorageKafka::pushReadBuffer(ConsumerBufferPtr buffer) std::lock_guard lock(mutex); buffers.push_back(buffer); semaphore.set(); + CurrentMetrics::sub(CurrentMetrics::KafkaConsumersInUse, 1); } @@ -410,6 +410,7 @@ ConsumerBufferPtr StorageKafka::popReadBuffer(std::chrono::milliseconds timeout) std::lock_guard lock(mutex); auto buffer = buffers.back(); buffers.pop_back(); + CurrentMetrics::add(CurrentMetrics::KafkaConsumersInUse, 1); return buffer; } From acd48ebe2d51d20d0c2a5bc545d46d27f9bf7904 Mon Sep 17 00:00:00 2001 From: Azat Khuzhin Date: Tue, 5 Apr 2022 23:22:49 +0300 Subject: [PATCH 195/239] Disable processors_profile_log for test_input_format_parallel_parsing_memory_tracking This should fix memory usage, like in [1]. [1]: https://s3.amazonaws.com/clickhouse-test-reports/34355/e5a837e574fac93e01d985f55d069fe522abc182/integration_tests__thread__actions__[3/4].html Signed-off-by: Azat Khuzhin --- .../configs/conf.xml | 1 + 1 file changed, 1 insertion(+) diff --git a/tests/integration/test_input_format_parallel_parsing_memory_tracking/configs/conf.xml b/tests/integration/test_input_format_parallel_parsing_memory_tracking/configs/conf.xml index 3adba1d402a..2c40f0fab4a 100644 --- a/tests/integration/test_input_format_parallel_parsing_memory_tracking/configs/conf.xml +++ b/tests/integration/test_input_format_parallel_parsing_memory_tracking/configs/conf.xml @@ -18,6 +18,7 @@ + From ea9ce3ea18cbe80ce63c2b02867dc11e39ac0bd8 Mon Sep 17 00:00:00 2001 From: Yakov Olkhovskiy Date: Tue, 5 Apr 2022 19:50:16 -0400 Subject: [PATCH 196/239] 'T' is added as delimiter, tests added --- src/IO/ReadHelpers.h | 5 +++-- .../0_stateless/02249_parse_date_time_basic.reference | 8 +++++--- tests/queries/0_stateless/02249_parse_date_time_basic.sql | 6 +++++- 3 files changed, 13 insertions(+), 6 deletions(-) diff --git a/src/IO/ReadHelpers.h b/src/IO/ReadHelpers.h index 48c291d8fcc..e68da3a1c7d 100644 --- a/src/IO/ReadHelpers.h +++ b/src/IO/ReadHelpers.h @@ -867,7 +867,8 @@ inline ReturnType readDateTimeTextImpl(time_t & datetime, ReadBuffer & buf, cons UInt8 minute = 0; UInt8 second = 0; ///simply determine whether it is YYYY-MM-DD hh:mm:ss or YYYY-MM-DD by the content of the tenth character in an optimistic scenario - if (s[10] == ' ') + bool dt_long = (s[10] == ' ' || s[10] == 'T'); + if (dt_long) { hour = (s[11] - '0') * 10 + (s[12] - '0'); minute = (s[14] - '0') * 10 + (s[15] - '0'); @@ -879,7 +880,7 @@ inline ReturnType readDateTimeTextImpl(time_t & datetime, ReadBuffer & buf, cons else datetime = 
date_lut.makeDateTime(year, month, day, hour, minute, second); - if (s[10] == ' ') + if (dt_long) buf.position() += DateTimeStringInputSize; else buf.position() += DateStringInputSize; diff --git a/tests/queries/0_stateless/02249_parse_date_time_basic.reference b/tests/queries/0_stateless/02249_parse_date_time_basic.reference index d67e0ae15e0..027c72d802f 100644 --- a/tests/queries/0_stateless/02249_parse_date_time_basic.reference +++ b/tests/queries/0_stateless/02249_parse_date_time_basic.reference @@ -1,3 +1,5 @@ -2022-03-31 00:00:00 1 -2022-04-01 17:10:24 2 -2022-03-31 10:18:56 3 +2022-03-31T04:00:00Z 1 +2022-04-01T09:10:24Z 2 +2022-03-31T14:18:56Z 3 +2022-03-31T14:18:56Z 4 +2022-04-01T09:10:24Z 5 diff --git a/tests/queries/0_stateless/02249_parse_date_time_basic.sql b/tests/queries/0_stateless/02249_parse_date_time_basic.sql index 2cea41874d5..cb443bbdd8e 100644 --- a/tests/queries/0_stateless/02249_parse_date_time_basic.sql +++ b/tests/queries/0_stateless/02249_parse_date_time_basic.sql @@ -1,6 +1,10 @@ +SET date_time_output_format='iso'; drop table if exists t; CREATE TABLE t (a DateTime, b String, c String, d String, e Int32) ENGINE = Memory; INSERT INTO t(a, b, c, d ,e) VALUES ('2022-03-31','','','',1); INSERT INTO t(a, b, c, d ,e) VALUES (1648804224,'','','',2); INSERT INTO t(a, b, c, d ,e) VALUES ('2022-03-31 10:18:56','','','',3); -select a, e from t; +INSERT INTO t(a, b, c, d ,e) VALUES ('2022-03-31T10:18:56','','','',4); +INSERT INTO t(a, b, c, d ,e) VALUES ('1648804224','','','',5); +select a, e from t order by e; +drop table if exists t; From d59d4eda4f3ffc4c45ea543fd5acaab64243bd72 Mon Sep 17 00:00:00 2001 From: kssenii Date: Tue, 5 Apr 2022 15:35:59 +0200 Subject: [PATCH 197/239] Fix tests --- docker/test/fuzzer/run-fuzzer.sh | 1 - src/Client/ClientBase.cpp | 7 +++++++ tests/clickhouse-test | 2 +- .../0_stateless/00921_datetime64_compatibility_long.sh | 2 +- tests/queries/0_stateless/02221_parallel_replicas_bug.sh | 2 +- 5 files changed, 10 insertions(+), 4 deletions(-) diff --git a/docker/test/fuzzer/run-fuzzer.sh b/docker/test/fuzzer/run-fuzzer.sh index 74711f476f8..32799a669eb 100755 --- a/docker/test/fuzzer/run-fuzzer.sh +++ b/docker/test/fuzzer/run-fuzzer.sh @@ -226,7 +226,6 @@ quit --receive_data_timeout_ms=10000 \ --stacktrace \ --query-fuzzer-runs=1000 \ - --testmode \ --queries-file $(ls -1 ch/tests/queries/0_stateless/*.sql | sort -R) \ $NEW_TESTS_OPT \ > >(tail -n 100000 > fuzzer.log) \ diff --git a/src/Client/ClientBase.cpp b/src/Client/ClientBase.cpp index 24dba19a72c..e1d2b673571 100644 --- a/src/Client/ClientBase.cpp +++ b/src/Client/ClientBase.cpp @@ -1499,6 +1499,13 @@ bool ClientBase::executeMultiQuery(const String & all_queries_text) /// Test tags are started with "--" so they are interpreted as comments anyway. /// But if the echo is enabled we have to remove the test tags from `all_queries_text` /// because we don't want test tags to be echoed. + { + /// disable logs if expects errors + TestHint test_hint(all_queries_text); + if (test_hint.clientError() || test_hint.serverError()) + processTextAsSingleQuery("SET send_logs_level = 'fatal'"); + } + size_t test_tags_length = getTestTagsLength(all_queries_text); /// Several queries separated by ';'. 
diff --git a/tests/clickhouse-test b/tests/clickhouse-test index f925fddcd1a..de36fc3da27 100755 --- a/tests/clickhouse-test +++ b/tests/clickhouse-test @@ -1140,7 +1140,7 @@ def run_tests_array(all_tests_with_params): sys.stdout.flush() -server_logs_level = "fatal" +server_logs_level = "warning" def check_server_started(args): diff --git a/tests/queries/0_stateless/00921_datetime64_compatibility_long.sh b/tests/queries/0_stateless/00921_datetime64_compatibility_long.sh index 6d2cd0a176b..d310a2c3612 100755 --- a/tests/queries/0_stateless/00921_datetime64_compatibility_long.sh +++ b/tests/queries/0_stateless/00921_datetime64_compatibility_long.sh @@ -13,5 +13,5 @@ CURDIR=$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd) # ${CURDIR}/00921_datetime64_compatibility.python python3 "${CURDIR}"/00921_datetime64_compatibility_long.python \ - | ${CLICKHOUSE_CLIENT} --ignore-error -T -nm --calculate_text_stack_trace 0 --log-level 'error' 2>&1 \ + | ${CLICKHOUSE_CLIENT} --ignore-error -nm --calculate_text_stack_trace 0 --log-level 'error' 2>&1 \ | grep -v -e 'Received exception .*$' -e '^(query: ' | sed 's/^\(Code: [0-9]\+\).*$/\1/g' diff --git a/tests/queries/0_stateless/02221_parallel_replicas_bug.sh b/tests/queries/0_stateless/02221_parallel_replicas_bug.sh index b4ac6817a54..cce32bf8272 100755 --- a/tests/queries/0_stateless/02221_parallel_replicas_bug.sh +++ b/tests/queries/0_stateless/02221_parallel_replicas_bug.sh @@ -4,4 +4,4 @@ CURDIR=$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd) . "$CURDIR"/../shell_config.sh -${CLICKHOUSE_CLIENT} --allow_experimental_parallel_reading_from_replicas=1 -nmT < "$CURDIR"/01099_parallel_distributed_insert_select.sql > /dev/null +${CLICKHOUSE_CLIENT} --allow_experimental_parallel_reading_from_replicas=1 -nm < "$CURDIR"/01099_parallel_distributed_insert_select.sql > /dev/null From c7d72b92dac1d3650c61bdfd3d6a4286dbc97819 Mon Sep 17 00:00:00 2001 From: Yakov Olkhovskiy Date: Tue, 5 Apr 2022 20:32:52 -0400 Subject: [PATCH 198/239] explicit timezone added to test --- .../0_stateless/02249_parse_date_time_basic.reference | 6 +++--- tests/queries/0_stateless/02249_parse_date_time_basic.sql | 2 +- 2 files changed, 4 insertions(+), 4 deletions(-) diff --git a/tests/queries/0_stateless/02249_parse_date_time_basic.reference b/tests/queries/0_stateless/02249_parse_date_time_basic.reference index 027c72d802f..eb030a8fd3d 100644 --- a/tests/queries/0_stateless/02249_parse_date_time_basic.reference +++ b/tests/queries/0_stateless/02249_parse_date_time_basic.reference @@ -1,5 +1,5 @@ -2022-03-31T04:00:00Z 1 +2022-03-31T00:00:00Z 1 2022-04-01T09:10:24Z 2 -2022-03-31T14:18:56Z 3 -2022-03-31T14:18:56Z 4 +2022-03-31T10:18:56Z 3 +2022-03-31T10:18:56Z 4 2022-04-01T09:10:24Z 5 diff --git a/tests/queries/0_stateless/02249_parse_date_time_basic.sql b/tests/queries/0_stateless/02249_parse_date_time_basic.sql index cb443bbdd8e..7146462fb74 100644 --- a/tests/queries/0_stateless/02249_parse_date_time_basic.sql +++ b/tests/queries/0_stateless/02249_parse_date_time_basic.sql @@ -1,6 +1,6 @@ SET date_time_output_format='iso'; drop table if exists t; -CREATE TABLE t (a DateTime, b String, c String, d String, e Int32) ENGINE = Memory; +CREATE TABLE t (a DateTime('UTC'), b String, c String, d String, e Int32) ENGINE = Memory; INSERT INTO t(a, b, c, d ,e) VALUES ('2022-03-31','','','',1); INSERT INTO t(a, b, c, d ,e) VALUES (1648804224,'','','',2); INSERT INTO t(a, b, c, d ,e) VALUES ('2022-03-31 10:18:56','','','',3); From 43e8af697aa57ce3b2bb593f4cb147ae54e338c4 Mon Sep 17 00:00:00 2001 From: 
taiyang-li <654010905@qq.com> Date: Wed, 6 Apr 2022 11:41:16 +0800 Subject: [PATCH 199/239] fix code style --- src/Storages/Hive/HiveFile.cpp | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/src/Storages/Hive/HiveFile.cpp b/src/Storages/Hive/HiveFile.cpp index 407d9602b61..867b5650ea2 100644 --- a/src/Storages/Hive/HiveFile.cpp +++ b/src/Storages/Hive/HiveFile.cpp @@ -139,7 +139,7 @@ void HiveOrcFile::prepareColumnMapping() for (size_t pos = 0; pos < count; pos++) { /// Column names in hive is case-insensitive. - String column{type.getFieldName(pos)}; + String columnn{type.getFieldName(pos)}; boost::to_lower(column); orc_column_positions[column] = pos; } @@ -267,9 +267,9 @@ void HiveParquetFile::loadSubMinMaxIndex() auto it = index_names_and_types.begin(); for (; it != index_names_and_types.end(); ++j, ++it) { - String name{it->name}; - boost::to_lower(name); - auto mit = parquet_column_positions.find(name); + String column{it->name}; + boost::to_lower(column); + auto mit = parquet_column_positions.find(column); if (mit == parquet_column_positions.end()) continue; From a2ce366c3420a2806eb6974e3cb45020115deb39 Mon Sep 17 00:00:00 2001 From: fenglv Date: Wed, 6 Apr 2022 04:49:43 +0000 Subject: [PATCH 200/239] parallel reading files for FileLog Engine --- src/Storages/FileLog/StorageFileLog.cpp | 1 + 1 file changed, 1 insertion(+) diff --git a/src/Storages/FileLog/StorageFileLog.cpp b/src/Storages/FileLog/StorageFileLog.cpp index 32ca936f039..4fb19b12bab 100644 --- a/src/Storages/FileLog/StorageFileLog.cpp +++ b/src/Storages/FileLog/StorageFileLog.cpp @@ -723,6 +723,7 @@ bool StorageFileLog::streamToViews() size_t rows = 0; { block_io.pipeline.complete(std::move(input)); + block_io.pipeline.setNumThreads(max_streams_number); block_io.pipeline.setProgressCallback([&](const Progress & progress) { rows += progress.read_rows.load(); }); CompletedPipelineExecutor executor(block_io.pipeline); executor.execute(); From 84eef61d17ee430f2703f24e17c0ab8bb05715b9 Mon Sep 17 00:00:00 2001 From: Antonio Andelic Date: Wed, 6 Apr 2022 06:39:56 +0000 Subject: [PATCH 201/239] Pull under reader mutex --- src/Storages/StorageURL.cpp | 8 +++----- 1 file changed, 3 insertions(+), 5 deletions(-) diff --git a/src/Storages/StorageURL.cpp b/src/Storages/StorageURL.cpp index a435ab1a654..7bdb070bfdf 100644 --- a/src/Storages/StorageURL.cpp +++ b/src/Storages/StorageURL.cpp @@ -224,14 +224,12 @@ namespace } Chunk chunk; + std::lock_guard lock(reader_mutex); if (reader->pull(chunk)) return chunk; - { - std::lock_guard lock(reader_mutex); - pipeline->reset(); - reader.reset(); - } + pipeline->reset(); + reader.reset(); } } From bf1f34ddb372ad45ccfd0297e6cd328abfc06f6b Mon Sep 17 00:00:00 2001 From: Antonio Andelic Date: Wed, 6 Apr 2022 07:43:48 +0000 Subject: [PATCH 202/239] Fix unit tests --- src/Coordination/tests/gtest_coordination.cpp | 56 +++++++++---------- utils/keeper-data-dumper/main.cpp | 4 +- 2 files changed, 30 insertions(+), 30 deletions(-) diff --git a/src/Coordination/tests/gtest_coordination.cpp b/src/Coordination/tests/gtest_coordination.cpp index 07544dfbb89..7dfd451e111 100644 --- a/src/Coordination/tests/gtest_coordination.cpp +++ b/src/Coordination/tests/gtest_coordination.cpp @@ -976,31 +976,31 @@ TEST_P(CoordinationTest, SnapshotableHashMapDataSize) using Node = DB::KeeperStorage::Node; DB::SnapshotableHashTable world; Node n1; - n1.data = "1234"; + n1.setData("1234"); Node n2; - n2.data = "123456"; - n2.children.insert(""); + n2.setData("123456"); + n2.addChild(""); 
world.disableSnapshotMode(); world.insert("world", n1); - EXPECT_EQ(world.getApproximateDataSize(), 98); + EXPECT_EQ(world.getApproximateDataSize(), 177); world.updateValue("world", [&](Node & value) { value = n2; }); - EXPECT_EQ(world.getApproximateDataSize(), 98); + EXPECT_EQ(world.getApproximateDataSize(), 195); world.erase("world"); EXPECT_EQ(world.getApproximateDataSize(), 0); world.enableSnapshotMode(100000); world.insert("world", n1); - EXPECT_EQ(world.getApproximateDataSize(), 98); + EXPECT_EQ(world.getApproximateDataSize(), 177); world.updateValue("world", [&](Node & value) { value = n2; }); - EXPECT_EQ(world.getApproximateDataSize(), 196); + EXPECT_EQ(world.getApproximateDataSize(), 372); world.clearOutdatedNodes(); - EXPECT_EQ(world.getApproximateDataSize(), 98); + EXPECT_EQ(world.getApproximateDataSize(), 195); world.erase("world"); - EXPECT_EQ(world.getApproximateDataSize(), 98); + EXPECT_EQ(world.getApproximateDataSize(), 195); world.clear(); EXPECT_EQ(world.getApproximateDataSize(), 0); @@ -1010,7 +1010,7 @@ void addNode(DB::KeeperStorage & storage, const std::string & path, const std::s { using Node = DB::KeeperStorage::Node; Node node{}; - node.data = data; + node.setData(data); node.stat.ephemeralOwner = ephemeral_owner; storage.container.insertOrReplace(path, node); } @@ -1048,13 +1048,13 @@ TEST_P(CoordinationTest, TestStorageSnapshotSimple) auto [restored_storage, snapshot_meta, _] = manager.deserializeSnapshotFromBuffer(debuf); EXPECT_EQ(restored_storage->container.size(), 3); - EXPECT_EQ(restored_storage->container.getValue("/").children.size(), 1); - EXPECT_EQ(restored_storage->container.getValue("/hello").children.size(), 1); - EXPECT_EQ(restored_storage->container.getValue("/hello/somepath").children.size(), 0); + EXPECT_EQ(restored_storage->container.getValue("/").getChildren().size(), 1); + EXPECT_EQ(restored_storage->container.getValue("/hello").getChildren().size(), 1); + EXPECT_EQ(restored_storage->container.getValue("/hello/somepath").getChildren().size(), 0); - EXPECT_EQ(restored_storage->container.getValue("/").data, ""); - EXPECT_EQ(restored_storage->container.getValue("/hello").data, "world"); - EXPECT_EQ(restored_storage->container.getValue("/hello/somepath").data, "somedata"); + EXPECT_EQ(restored_storage->container.getValue("/").getData(), ""); + EXPECT_EQ(restored_storage->container.getValue("/hello").getData(), "world"); + EXPECT_EQ(restored_storage->container.getValue("/hello/somepath").getData(), "somedata"); EXPECT_EQ(restored_storage->session_id_counter, 7); EXPECT_EQ(restored_storage->zxid, 2); EXPECT_EQ(restored_storage->ephemerals.size(), 2); @@ -1099,7 +1099,7 @@ TEST_P(CoordinationTest, TestStorageSnapshotMoreWrites) EXPECT_EQ(restored_storage->container.size(), 51); for (size_t i = 0; i < 50; ++i) { - EXPECT_EQ(restored_storage->container.getValue("/hello_" + std::to_string(i)).data, "world_" + std::to_string(i)); + EXPECT_EQ(restored_storage->container.getValue("/hello_" + std::to_string(i)).getData(), "world_" + std::to_string(i)); } } @@ -1139,7 +1139,7 @@ TEST_P(CoordinationTest, TestStorageSnapshotManySnapshots) for (size_t i = 0; i < 250; ++i) { - EXPECT_EQ(restored_storage->container.getValue("/hello_" + std::to_string(i)).data, "world_" + std::to_string(i)); + EXPECT_EQ(restored_storage->container.getValue("/hello_" + std::to_string(i)).getData(), "world_" + std::to_string(i)); } } @@ -1162,7 +1162,7 @@ TEST_P(CoordinationTest, TestStorageSnapshotMode) } for (size_t i = 0; i < 50; ++i) { - 
EXPECT_EQ(storage.container.getValue("/hello_" + std::to_string(i)).data, "wlrd_" + std::to_string(i)); + EXPECT_EQ(storage.container.getValue("/hello_" + std::to_string(i)).getData(), "wlrd_" + std::to_string(i)); } for (size_t i = 0; i < 50; ++i) { @@ -1182,7 +1182,7 @@ TEST_P(CoordinationTest, TestStorageSnapshotMode) for (size_t i = 0; i < 50; ++i) { if (i % 2 != 0) - EXPECT_EQ(storage.container.getValue("/hello_" + std::to_string(i)).data, "wlrd_" + std::to_string(i)); + EXPECT_EQ(storage.container.getValue("/hello_" + std::to_string(i)).getData(), "wlrd_" + std::to_string(i)); else EXPECT_FALSE(storage.container.contains("/hello_" + std::to_string(i))); } @@ -1191,7 +1191,7 @@ TEST_P(CoordinationTest, TestStorageSnapshotMode) for (size_t i = 0; i < 50; ++i) { - EXPECT_EQ(restored_storage->container.getValue("/hello_" + std::to_string(i)).data, "world_" + std::to_string(i)); + EXPECT_EQ(restored_storage->container.getValue("/hello_" + std::to_string(i)).getData(), "world_" + std::to_string(i)); } } @@ -1314,7 +1314,7 @@ void testLogAndStateMachine(Coordination::CoordinationSettingsPtr settings, uint for (size_t i = 1; i < total_logs + 1; ++i) { auto path = "/hello_" + std::to_string(i); - EXPECT_EQ(source_storage.container.getValue(path).data, restored_storage.container.getValue(path).data); + EXPECT_EQ(source_storage.container.getValue(path).getData(), restored_storage.container.getValue(path).getData()); } } @@ -1589,13 +1589,13 @@ TEST_P(CoordinationTest, TestStorageSnapshotDifferentCompressions) auto [restored_storage, snapshot_meta, _] = new_manager.deserializeSnapshotFromBuffer(debuf); EXPECT_EQ(restored_storage->container.size(), 3); - EXPECT_EQ(restored_storage->container.getValue("/").children.size(), 1); - EXPECT_EQ(restored_storage->container.getValue("/hello").children.size(), 1); - EXPECT_EQ(restored_storage->container.getValue("/hello/somepath").children.size(), 0); + EXPECT_EQ(restored_storage->container.getValue("/").getChildren().size(), 1); + EXPECT_EQ(restored_storage->container.getValue("/hello").getChildren().size(), 1); + EXPECT_EQ(restored_storage->container.getValue("/hello/somepath").getChildren().size(), 0); - EXPECT_EQ(restored_storage->container.getValue("/").data, ""); - EXPECT_EQ(restored_storage->container.getValue("/hello").data, "world"); - EXPECT_EQ(restored_storage->container.getValue("/hello/somepath").data, "somedata"); + EXPECT_EQ(restored_storage->container.getValue("/").getData(), ""); + EXPECT_EQ(restored_storage->container.getValue("/hello").getData(), "world"); + EXPECT_EQ(restored_storage->container.getValue("/hello/somepath").getData(), "somedata"); EXPECT_EQ(restored_storage->session_id_counter, 7); EXPECT_EQ(restored_storage->zxid, 2); EXPECT_EQ(restored_storage->ephemerals.size(), 2); diff --git a/utils/keeper-data-dumper/main.cpp b/utils/keeper-data-dumper/main.cpp index 0f86d34d334..df6083e4bd7 100644 --- a/utils/keeper-data-dumper/main.cpp +++ b/utils/keeper-data-dumper/main.cpp @@ -32,9 +32,9 @@ void dumpMachine(std::shared_ptr machine) ", numChildren: " << value.stat.numChildren << ", dataLength: " << value.stat.dataLength << "}" << std::endl; - std::cout << "\tData: " << storage.container.getValue(key).data << std::endl; + std::cout << "\tData: " << storage.container.getValue(key).getData() << std::endl; - for (const auto & child : value.children) + for (const auto & child : value.getChildren()) { if (key == "/") keys.push(key + child.toString()); From e4408d42dc12dc0fe855579981376db0c85eaa4b Mon Sep 17 00:00:00 2001 From: "Mikhail 
f. Shiryaev" Date: Wed, 6 Apr 2022 10:13:09 +0200 Subject: [PATCH 203/239] Fix context expanding, quote secrets to avoid word split --- .github/workflows/nightly.yml | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/.github/workflows/nightly.yml b/.github/workflows/nightly.yml index b9541589a64..1e70213adf5 100644 --- a/.github/workflows/nightly.yml +++ b/.github/workflows/nightly.yml @@ -105,13 +105,13 @@ jobs: sudo rm -fr "$TEMP_PATH" mkdir -p "$TEMP_PATH" cp -r "$GITHUB_WORKSPACE" "$TEMP_PATH" - cd "$REPO_COPY/tests/ci" && python3 build_check.py "$CHECK_NAME" "$BUILD_NAME" "$${{ secrets.COV_TOKEN }}" + cd "$REPO_COPY/tests/ci" && python3 build_check.py "$CHECK_NAME" "$BUILD_NAME" "${{ secrets.COV_TOKEN }}" - name: Upload Coverity Analysis if: ${{ success() || failure() }} run: | - curl --form token=$${{ secrets.COV_TOKEN }} \ - --form email=${{ secrets.ROBOT_CLICKHOUSE_EMAIL }} \ - --form file=@$TEMP_PATH/$BUILD_NAME/clickhouse-scan.tgz \ + curl --form token='${{ secrets.COV_TOKEN }}' \ + --form email='${{ secrets.ROBOT_CLICKHOUSE_EMAIL }}' \ + --form file="@$TEMP_PATH/$BUILD_NAME/clickhouse-scan.tgz" \ --form version="${GITHUB_REF#refs/heads/}-${GITHUB_SHA::6}" \ --form description="Nighly Scan: $(date +'%Y-%m-%dT%H:%M:%S')" \ https://scan.coverity.com/builds?project=ClickHouse%2FClickHouse From 62fa528e7d7baa4e48c84d7e02dff179faacffd0 Mon Sep 17 00:00:00 2001 From: "Mikhail f. Shiryaev" Date: Wed, 6 Apr 2022 10:15:36 +0200 Subject: [PATCH 204/239] Fix syntax error in build_check.py --- tests/ci/build_check.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/tests/ci/build_check.py b/tests/ci/build_check.py index 1dcfb9d01d3..74ebebf1e6f 100644 --- a/tests/ci/build_check.py +++ b/tests/ci/build_check.py @@ -302,7 +302,7 @@ def main(): version.string, image_version, ccache_path, - official=official_flag, + official_flag, cov_token, ) From acb9f1632eeda8489cc2eecdb2c9fcb5541ebc16 Mon Sep 17 00:00:00 2001 From: taiyang-li <654010905@qq.com> Date: Wed, 6 Apr 2022 16:40:22 +0800 Subject: [PATCH 205/239] suppoort skip splits in orc and parquet --- src/Formats/FormatSettings.h | 2 + .../Formats/Impl/ORCBlockInputFormat.cpp | 42 ++++++++++++------- .../Formats/Impl/ORCBlockInputFormat.h | 5 ++- .../Formats/Impl/ParquetBlockInputFormat.cpp | 5 ++- .../Formats/Impl/ParquetBlockInputFormat.h | 3 +- src/Storages/Hive/HiveFile.cpp | 2 +- src/Storages/Hive/HiveFile.h | 6 +-- src/Storages/Hive/StorageHive.cpp | 22 +++++++--- 8 files changed, 58 insertions(+), 29 deletions(-) diff --git a/src/Formats/FormatSettings.h b/src/Formats/FormatSettings.h index bd0a84d9ded..695c63d6379 100644 --- a/src/Formats/FormatSettings.h +++ b/src/Formats/FormatSettings.h @@ -138,6 +138,7 @@ struct FormatSettings bool import_nested = false; bool allow_missing_columns = false; bool case_insensitive_column_matching = false; + std::unordered_set skip_row_groups = {}; } parquet; struct Pretty @@ -219,6 +220,7 @@ struct FormatSettings bool allow_missing_columns = false; int64_t row_batch_size = 100'000; bool case_insensitive_column_matching = false; + std::unordered_set skip_stripes = {}; } orc; /// For capnProto format we should determine how to diff --git a/src/Processors/Formats/Impl/ORCBlockInputFormat.cpp b/src/Processors/Formats/Impl/ORCBlockInputFormat.cpp index c68b59833db..8b8426132de 100644 --- a/src/Processors/Formats/Impl/ORCBlockInputFormat.cpp +++ b/src/Processors/Formats/Impl/ORCBlockInputFormat.cpp @@ -20,13 +20,12 @@ namespace ErrorCodes } 
ORCBlockInputFormat::ORCBlockInputFormat(ReadBuffer & in_, Block header_, const FormatSettings & format_settings_) - : IInputFormat(std::move(header_), in_), format_settings(format_settings_) + : IInputFormat(std::move(header_), in_), format_settings(format_settings_), skip_stripes(format_settings.orc.skip_stripes) { } Chunk ORCBlockInputFormat::generate() { - Chunk res; block_missing_values.clear(); if (!file_reader) @@ -35,24 +34,32 @@ Chunk ORCBlockInputFormat::generate() if (is_stopped) return {}; - std::shared_ptr batch_reader; - auto result = file_reader->NextStripeReader(format_settings.orc.row_batch_size, include_indices); - if (!result.ok()) - throw ParsingException(ErrorCodes::CANNOT_READ_ALL_DATA, "Failed to create batch reader: {}", result.status().ToString()); - batch_reader = std::move(result).ValueOrDie(); - if (!batch_reader) - { - return res; - } + for (; stripe_current < stripe_total && skip_stripes.contains(stripe_current); ++stripe_current) + ; - std::shared_ptr table; - arrow::Status table_status = batch_reader->ReadAll(&table); - if (!table_status.ok()) - throw ParsingException(ErrorCodes::CANNOT_READ_ALL_DATA, "Error while reading batch of ORC data: {}", table_status.ToString()); + if (stripe_current >= stripe_total) + return {}; + auto batch_result = file_reader->ReadStripe(stripe_current, include_indices); + if (!batch_result.ok()) + throw ParsingException(ErrorCodes::CANNOT_READ_ALL_DATA, "Failed to create batch reader: {}", batch_result.status().ToString()); + + auto batch = batch_result.ValueOrDie(); + if (!batch) + return {}; + + auto table_result = arrow::Table::FromRecordBatches({batch}); + if (!table_result.ok()) + throw ParsingException( + ErrorCodes::CANNOT_READ_ALL_DATA, "Error while reading batch of ORC data: {}", table_result.status().ToString()); + + auto table = table_result.ValueOrDie(); if (!table || !table->num_rows()) - return res; + return {}; + ++stripe_current; + + Chunk res; arrow_column_to_ch_column->arrowTableToCHChunk(res, table); /// If defaults_for_omitted_fields is true, calculate the default values from default expression for omitted fields. /// Otherwise fill the missing columns with zero values of its type. 
@@ -130,6 +137,9 @@ void ORCBlockInputFormat::prepareReader() if (is_stopped) return; + stripe_total = file_reader->NumberOfStripes(); + stripe_current = 0; + arrow_column_to_ch_column = std::make_unique( getPort().getHeader(), "ORC", diff --git a/src/Processors/Formats/Impl/ORCBlockInputFormat.h b/src/Processors/Formats/Impl/ORCBlockInputFormat.h index b7a771730ea..3c363699a0b 100644 --- a/src/Processors/Formats/Impl/ORCBlockInputFormat.h +++ b/src/Processors/Formats/Impl/ORCBlockInputFormat.h @@ -38,6 +38,7 @@ protected: } private: + void prepareReader(); // TODO: check that this class implements every part of its parent @@ -52,8 +53,10 @@ private: BlockMissingValues block_missing_values; const FormatSettings format_settings; + const std::unordered_set & skip_stripes; - void prepareReader(); + int stripe_total = 0; + int stripe_current = 0; std::atomic is_stopped{0}; }; diff --git a/src/Processors/Formats/Impl/ParquetBlockInputFormat.cpp b/src/Processors/Formats/Impl/ParquetBlockInputFormat.cpp index 13582ce5019..69e51e0dad2 100644 --- a/src/Processors/Formats/Impl/ParquetBlockInputFormat.cpp +++ b/src/Processors/Formats/Impl/ParquetBlockInputFormat.cpp @@ -32,7 +32,7 @@ namespace ErrorCodes } while (false) ParquetBlockInputFormat::ParquetBlockInputFormat(ReadBuffer & in_, Block header_, const FormatSettings & format_settings_) - : IInputFormat(std::move(header_), in_), format_settings(format_settings_) + : IInputFormat(std::move(header_), in_), format_settings(format_settings_), skip_row_groups(format_settings.parquet.skip_row_groups) { } @@ -47,6 +47,9 @@ Chunk ParquetBlockInputFormat::generate() if (is_stopped) return {}; + for (; row_group_current < row_group_total && skip_row_groups.contains(row_group_current); ++row_group_current) + ; + if (row_group_current >= row_group_total) return res; diff --git a/src/Processors/Formats/Impl/ParquetBlockInputFormat.h b/src/Processors/Formats/Impl/ParquetBlockInputFormat.h index 1faadaa3d21..76803bb5b89 100644 --- a/src/Processors/Formats/Impl/ParquetBlockInputFormat.h +++ b/src/Processors/Formats/Impl/ParquetBlockInputFormat.h @@ -38,13 +38,14 @@ private: std::unique_ptr file_reader; int row_group_total = 0; + int row_group_current = 0; // indices of columns to read from Parquet file std::vector column_indices; std::unique_ptr arrow_column_to_ch_column; - int row_group_current = 0; std::vector missing_columns; BlockMissingValues block_missing_values; const FormatSettings format_settings; + const std::unordered_set & skip_row_groups; std::atomic is_stopped{0}; }; diff --git a/src/Storages/Hive/HiveFile.cpp b/src/Storages/Hive/HiveFile.cpp index 867b5650ea2..3f4260d9f9e 100644 --- a/src/Storages/Hive/HiveFile.cpp +++ b/src/Storages/Hive/HiveFile.cpp @@ -139,7 +139,7 @@ void HiveOrcFile::prepareColumnMapping() for (size_t pos = 0; pos < count; pos++) { /// Column names in hive is case-insensitive. 
- String columnn{type.getFieldName(pos)}; + String column{type.getFieldName(pos)}; boost::to_lower(column); orc_column_positions[column] = pos; } diff --git a/src/Storages/Hive/HiveFile.h b/src/Storages/Hive/HiveFile.h index aef9d72755a..74f893a073f 100644 --- a/src/Storages/Hive/HiveFile.h +++ b/src/Storages/Hive/HiveFile.h @@ -125,9 +125,9 @@ public: virtual const std::vector & getSubMinMaxIndexes() const { return sub_minmax_idxes; } - virtual void setSkipSplits(const std::set & splits) { skip_splits = splits; } + virtual void setSkipSplits(const std::unordered_set & skip_splits_) { skip_splits = skip_splits_; } - virtual const std::set & getSkipSplits() const { return skip_splits; } + virtual const std::unordered_set & getSkipSplits() const { return skip_splits; } inline std::string describeMinMaxIndex(const MinMaxIndexPtr & idx) const { @@ -157,7 +157,7 @@ protected: MinMaxIndexPtr minmax_idx; std::vector sub_minmax_idxes; /// Skip splits for this file after applying minmax index (if any) - std::set skip_splits; + std::unordered_set skip_splits; std::shared_ptr storage_settings; }; diff --git a/src/Storages/Hive/StorageHive.cpp b/src/Storages/Hive/StorageHive.cpp index 4296df3d7b1..a7537f5a1f6 100644 --- a/src/Storages/Hive/StorageHive.cpp +++ b/src/Storages/Hive/StorageHive.cpp @@ -111,9 +111,9 @@ public: : SourceWithProgress(getHeader(sample_block_, source_info_)) , WithContext(context_) , source_info(std::move(source_info_)) - , hdfs_namenode_url(hdfs_namenode_url_) + , hdfs_namenode_url(std::move(hdfs_namenode_url_)) , format(std::move(format_)) - , compression_method(compression_method_) + , compression_method(std::move(compression_method_)) , max_block_size(max_block_size_) , sample_block(std::move(sample_block_)) , columns_description(getColumnsDescription(sample_block, source_info)) @@ -121,15 +121,25 @@ public: , format_settings(getFormatSettings(getContext())) { to_read_block = sample_block; + /// Initialize to_read_block, which is used to read data from HDFS. 
for (const auto & name_type : source_info->partition_name_types) { if (to_read_block.has(name_type.name)) to_read_block.erase(name_type.name); } + } - /// Initialize format settings - format_settings.hive_text.input_field_names = text_input_field_names; + FormatSettings updateFormatSettings(const HiveFilePtr & hive_file) + { + auto updated = format_settings; + if (format == "HiveText") + updated.hive_text.input_field_names = text_input_field_names; + else if (format == "ORC") + updated.orc.skip_stripes = hive_file->getSkipSplits(); + else if (format == "Parquet") + updated.parquet.skip_row_groups = hive_file->getSkipSplits(); + return updated; } String getName() const override { return "Hive"; } @@ -188,7 +198,7 @@ public: read_buf = std::move(remote_read_buf); auto input_format = FormatFactory::instance().getInputFormat( - format, *read_buf, to_read_block, getContext(), max_block_size, format_settings); + format, *read_buf, to_read_block, getContext(), max_block_size, updateFormatSettings(curr_file)); QueryPipelineBuilder builder; builder.init(Pipe(input_format)); @@ -545,7 +555,7 @@ HiveFilePtr StorageHive::createHiveFileIfNeeded( /// Load sub-file level minmax index and apply if (hive_file->hasSubMinMaxIndex()) { - std::set skip_splits; + std::unordered_set skip_splits; hive_file->loadSubMinMaxIndex(); const auto & sub_minmax_idxes = hive_file->getSubMinMaxIndexes(); for (size_t i = 0; i < sub_minmax_idxes.size(); ++i) From 060f5118f00fef5ee3ce20fb8d8f8f7b50c5c9eb Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Ra=C3=BAl=20Mar=C3=ADn?= Date: Wed, 6 Apr 2022 10:48:12 +0200 Subject: [PATCH 206/239] Improvements based on PR review --- src/Common/DNSResolver.cpp | 8 +++++--- 1 file changed, 5 insertions(+), 3 deletions(-) diff --git a/src/Common/DNSResolver.cpp b/src/Common/DNSResolver.cpp index 66f67caf456..1a3eaf91f68 100644 --- a/src/Common/DNSResolver.cpp +++ b/src/Common/DNSResolver.cpp @@ -123,7 +123,10 @@ static DNSResolver::IPAddresses resolveIPAddressImpl(const std::string & host) } if (addresses.empty()) + { + ProfileEvents::increment(ProfileEvents::DNSError); throw Exception("Not found address of host: " + host, ErrorCodes::DNS_ERROR); + } return addresses; } @@ -142,8 +145,8 @@ static String reverseResolveImpl(const Poco::Net::IPAddress & address) struct DNSResolver::Impl { - using HostWithConsecutiveFailures = std::unordered_map; - using AddressWithConsecutiveFailures = std::unordered_map; + using HostWithConsecutiveFailures = std::unordered_map; + using AddressWithConsecutiveFailures = std::unordered_map; CachedFn<&resolveIPAddressImpl> cache_host; CachedFn<&reverseResolveImpl> cache_address; @@ -275,7 +278,6 @@ bool DNSResolver::updateCacheImpl( tryLogCurrentException(log, __PRETTY_FUNCTION__); continue; } - ProfileEvents::increment(ProfileEvents::DNSError); if (!lost_elems.empty()) lost_elems += ", "; lost_elems += cacheElemToString(it->first); From 9e7ffcce59fb7b013149f1036eedcfa0a47c46d9 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Ra=C3=BAl=20Mar=C3=ADn?= Date: Wed, 6 Apr 2022 11:09:08 +0200 Subject: [PATCH 207/239] Add test to check DNS profile events --- .../02265_test_dns_profile_events.reference | 2 ++ .../02265_test_dns_profile_events.sh | 17 +++++++++++++++++ 2 files changed, 19 insertions(+) create mode 100644 tests/queries/0_stateless/02265_test_dns_profile_events.reference create mode 100755 tests/queries/0_stateless/02265_test_dns_profile_events.sh diff --git a/tests/queries/0_stateless/02265_test_dns_profile_events.reference 
b/tests/queries/0_stateless/02265_test_dns_profile_events.reference new file mode 100644 index 00000000000..97ca33b311f --- /dev/null +++ b/tests/queries/0_stateless/02265_test_dns_profile_events.reference @@ -0,0 +1,2 @@ +first_check 1 +second_check 1 diff --git a/tests/queries/0_stateless/02265_test_dns_profile_events.sh b/tests/queries/0_stateless/02265_test_dns_profile_events.sh new file mode 100755 index 00000000000..756a761a0ae --- /dev/null +++ b/tests/queries/0_stateless/02265_test_dns_profile_events.sh @@ -0,0 +1,17 @@ +#!/usr/bin/env bash +# Tags: no-parallel + +CUR_DIR=$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd) +# shellcheck source=../shell_config.sh +. "$CUR_DIR"/../shell_config.sh + + +current_dns_errors=$($CLICKHOUSE_CLIENT --query "SELECT sum(value) FROM system.events where event = 'DNSError';") +${CLICKHOUSE_CLIENT} --query "SELECT * FROM remote('ThisHostNameDoesNotExistSoItShouldFail', system, one)" 2>/dev/null +${CLICKHOUSE_CLIENT} --query "SELECT 'first_check', sum(value) > ${current_dns_errors} FROM system.events where event = 'DNSError';" + +current_dns_errors=$($CLICKHOUSE_CLIENT --query "SELECT sum(value) FROM system.events where event = 'DNSError';") +${CLICKHOUSE_CLIENT} --query "SELECT * FROM remote('ThisHostNameDoesNotExistSoItShouldFail2', system, one)" 2>/dev/null +${CLICKHOUSE_CLIENT} --query "SELECT 'second_check', sum(value) > ${current_dns_errors} FROM system.events where event = 'DNSError';" + +${CLICKHOUSE_CLIENT} --query "SYSTEM DROP DNS CACHE" From df06f9f974102cc6c5b45e622fc3a5a5f4fa1ec5 Mon Sep 17 00:00:00 2001 From: Amos Bird Date: Wed, 6 Apr 2022 14:53:10 +0800 Subject: [PATCH 208/239] Fix performance regression of scalar query --- src/Functions/getScalar.cpp | 2 +- src/Interpreters/Context.cpp | 10 +++++----- src/Interpreters/Context.h | 7 ++++--- .../ExecuteScalarSubqueriesVisitor.cpp | 8 ++++++-- .../IInterpreterUnionOrSelectQuery.h | 4 ++-- tests/performance/scalar2.xml | 17 +++++++++++++++++ 6 files changed, 35 insertions(+), 13 deletions(-) create mode 100644 tests/performance/scalar2.xml diff --git a/src/Functions/getScalar.cpp b/src/Functions/getScalar.cpp index b06fb360366..c165ef26ffa 100644 --- a/src/Functions/getScalar.cpp +++ b/src/Functions/getScalar.cpp @@ -78,7 +78,7 @@ public: static ColumnWithTypeAndName createScalar(ContextPtr context_) { - if (const auto * block = context_->tryGetLocalScalar(Scalar::scalar_name)) + if (const auto * block = context_->tryGetSpecialScalar(Scalar::scalar_name)) return block->getByPosition(0); else if (context_->hasQueryContext()) { diff --git a/src/Interpreters/Context.cpp b/src/Interpreters/Context.cpp index e837ce5dae1..493e250ea85 100644 --- a/src/Interpreters/Context.cpp +++ b/src/Interpreters/Context.cpp @@ -930,10 +930,10 @@ const Block & Context::getScalar(const String & name) const return it->second; } -const Block * Context::tryGetLocalScalar(const String & name) const +const Block * Context::tryGetSpecialScalar(const String & name) const { - auto it = local_scalars.find(name); - if (local_scalars.end() == it) + auto it = special_scalars.find(name); + if (special_scalars.end() == it) return nullptr; return &it->second; } @@ -1004,12 +1004,12 @@ void Context::addScalar(const String & name, const Block & block) } -void Context::addLocalScalar(const String & name, const Block & block) +void Context::addSpecialScalar(const String & name, const Block & block) { if (isGlobalContext()) throw Exception(ErrorCodes::LOGICAL_ERROR, "Global context cannot have local scalars"); - local_scalars[name] = 
block; + special_scalars[name] = block; } diff --git a/src/Interpreters/Context.h b/src/Interpreters/Context.h index ad68f2a2245..31d853318d4 100644 --- a/src/Interpreters/Context.h +++ b/src/Interpreters/Context.h @@ -227,7 +227,8 @@ private: /// Thus, used in HTTP interface. If not specified - then some globally default format is used. TemporaryTablesMapping external_tables_mapping; Scalars scalars; - Scalars local_scalars; + /// Used to store constant values which are different on each instance during distributed plan, such as _shard_num. + Scalars special_scalars; /// Used in s3Cluster table function. With this callback, a worker node could ask an initiator /// about next file to read from s3. @@ -487,8 +488,8 @@ public: void addScalar(const String & name, const Block & block); bool hasScalar(const String & name) const; - const Block * tryGetLocalScalar(const String & name) const; - void addLocalScalar(const String & name, const Block & block); + const Block * tryGetSpecialScalar(const String & name) const; + void addSpecialScalar(const String & name, const Block & block); const QueryAccessInfo & getQueryAccessInfo() const { return query_access_info; } void addQueryAccessInfo( diff --git a/src/Interpreters/ExecuteScalarSubqueriesVisitor.cpp b/src/Interpreters/ExecuteScalarSubqueriesVisitor.cpp index ac8a27484d9..55fdaffdfe9 100644 --- a/src/Interpreters/ExecuteScalarSubqueriesVisitor.cpp +++ b/src/Interpreters/ExecuteScalarSubqueriesVisitor.cpp @@ -113,14 +113,18 @@ void ExecuteScalarSubqueriesMatcher::visit(const ASTSubquery & subquery, ASTPtr bool is_local = false; Block scalar; - if (data.local_scalars.count(scalar_query_hash_str)) + if (data.only_analyze) + { + /// Don't use scalar cache during query analysis + } + else if (data.local_scalars.contains(scalar_query_hash_str)) { hit = true; scalar = data.local_scalars[scalar_query_hash_str]; is_local = true; ProfileEvents::increment(ProfileEvents::ScalarSubqueriesLocalCacheHit); } - else if (data.scalars.count(scalar_query_hash_str)) + else if (data.scalars.contains(scalar_query_hash_str)) { hit = true; scalar = data.scalars[scalar_query_hash_str]; diff --git a/src/Interpreters/IInterpreterUnionOrSelectQuery.h b/src/Interpreters/IInterpreterUnionOrSelectQuery.h index 7906ab189fc..a9262fbfa1e 100644 --- a/src/Interpreters/IInterpreterUnionOrSelectQuery.h +++ b/src/Interpreters/IInterpreterUnionOrSelectQuery.h @@ -18,11 +18,11 @@ public: , max_streams(context->getSettingsRef().max_threads) { if (options.shard_num) - context->addLocalScalar( + context->addSpecialScalar( "_shard_num", Block{{DataTypeUInt32().createColumnConst(1, *options.shard_num), std::make_shared(), "_shard_num"}}); if (options.shard_count) - context->addLocalScalar( + context->addSpecialScalar( "_shard_count", Block{{DataTypeUInt32().createColumnConst(1, *options.shard_count), std::make_shared(), "_shard_count"}}); } diff --git a/tests/performance/scalar2.xml b/tests/performance/scalar2.xml new file mode 100644 index 00000000000..eb427536646 --- /dev/null +++ b/tests/performance/scalar2.xml @@ -0,0 +1,17 @@ + + CREATE TABLE tbl0 (`ds` Date, `x1` String, `x2` UInt32, `x3` UInt32, `x4` UInt32, `bm` AggregateFunction(groupBitmap, UInt32)) ENGINE = MergeTree PARTITION BY (ds, x1) ORDER BY (x2, x3, x4) SETTINGS index_granularity = 1 + + CREATE TABLE tbl (`ds` Date, `y1` UInt32, `x4` UInt32, `y2` UInt32, `y3` UInt32, `bm` AggregateFunction(groupBitmap, UInt32), `y4` UInt32 DEFAULT 0) ENGINE = MergeTree PARTITION BY (ds) ORDER BY (x4, y2, y3) SETTINGS index_granularity = 
8192, max_parts_in_total = 10000000 + + insert into tbl0 with murmurHash3_32(toUInt32(rand())) as uid select toDate('2022-03-01')+rand()%7 as ds, concat('xx',toString(rand()%10+1)) as x1, 1 as x2, 2 as x3, bitShiftRight(uid, 22) as x4, groupBitmapState(uid) as bm from numbers(100000000) where x4%40=0 group by ds, x1, x2, x3, x4 + + insert into tbl with murmurHash3_32(toUInt32(rand())) as uid select toDate('2022-03-01')+rand()%7 as ds, rand()%1000+5000 as y1, bitShiftRight(uid, 22) as x4, rand()%100 as y2, rand()%2000 as y3, groupBitmapState(uid) as bm, rand()%1 as y4 from numbers(100000000) where x4%40=0 group by ds, y1, x4, y2, y3, y4 + + CREATE TABLE tmp_acc_hit engine Memory AS SELECT x1, x2, x3, arrayReduceInRanges('groupBitmapMergeState', [(1, 1), (1, 2), (1, 3), (1, 4), (1, 5), (1, 6), (1, 7)], bs) AS bs FROM (SELECT x1, x2, x3, groupArrayInsertAt(b, multiIf(ds = '2022-03-01', 0, ds = '2022-03-02', 1, ds = '2022-03-03', 2, ds = '2022-03-04', 3, ds = '2022-03-05', 4, ds = '2022-03-06', 5, ds = '2022-03-07', 6, 7)) AS bs FROM (SELECT x1, x2, x3, ds, groupBitmapOrState(bm) AS b FROM tbl0 WHERE ((ds >= '2022-03-01') AND (ds <= '2022-03-07')) AND (((x1 = 'xx1') AND (x2 = 1) AND (x3 = 2)) OR ((x1 = 'xx2') AND (x2 = 1) AND (x3 = 2)) OR ((x1 = 'xx3') AND (x2 = 1) AND (x3 = 2)) OR ((x1 = 'xx4') AND (x2 = 1) AND (x3 = 2)) OR ((x1 = 'xx5') AND (x2 = 1) AND (x3 = 2)) OR ((x1 = 'xx6') AND (x2 = 1) AND (x3 = 2)) OR ((x1 = 'xx7') AND (x2 = 1) AND (x3 = 2)) OR ((x1 = 'xx8') AND (x2 = 1) AND (x3 = 2)) OR ((x1 = 'xx9') AND (x2 = 1) AND (x3 = 2)) OR ((x1 = 'xx10') AND (x2 = 1) AND (x3 = 2))) AND (x4 IN (0, 40, 80, 120, 160, 200, 240, 280, 320, 360, 400, 440, 480, 520, 560, 600, 640, 680, 720, 760, 800, 840, 880, 920, 960, 1000)) GROUP BY x1, x2, x3, ds) AS t_hit GROUP BY x1, x2, x3) + + WITH (SELECT groupArrayInsertAt(b, multiIf((x1 = 'xx1') AND (x2 = 1) AND (x3 = 2), 0, (x1 = 'xx2') AND (x2 = 1) AND (x3 = 2), 1, (x1 = 'xx3') AND (x2 = 1) AND (x3 = 2), 2, (x1 = 'xx4') AND (x2 = 1) AND (x3 = 2), 3, (x1 = 'xx5') AND (x2 = 1) AND (x3 = 2), 4, (x1 = 'xx6') AND (x2 = 1) AND (x3 = 2), 5, (x1 = 'xx7') AND (x2 = 1) AND (x3 = 2), 6, (x1 = 'xx8') AND (x2 = 1) AND (x3 = 2), 7, (x1 = 'xx9') AND (x2 = 1) AND (x3 = 2), 8, (x1 = 'xx10') AND (x2 = 1) AND (x3 = 2), 9, 10)) FROM (SELECT x1, x2, x3, bs AS b FROM tmp_acc_hit)) AS bs SELECT y1, x4, toString(flat_arr) AS flat_arr, toString([bitmapAndCardinality(bmor1, (bs[1])[1]), bitmapAndCardinality(bmor2, (bs[1])[1]), bitmapAndCardinality(bmor3, (bs[1])[1]), bitmapAndCardinality(bmor1, (bs[2])[1]), bitmapAndCardinality(bmor2, (bs[2])[1]), bitmapAndCardinality(bmor3, (bs[2])[1]), bitmapAndCardinality(bmor1, (bs[3])[1]), bitmapAndCardinality(bmor2, (bs[3])[1]), bitmapAndCardinality(bmor3, (bs[3])[1]), bitmapAndCardinality(bmor1, (bs[4])[1]), bitmapAndCardinality(bmor2, (bs[4])[1]), bitmapAndCardinality(bmor3, (bs[4])[1]), bitmapAndCardinality(bmor1, (bs[5])[1]), bitmapAndCardinality(bmor2, (bs[5])[1]), bitmapAndCardinality(bmor3, (bs[5])[1]), bitmapAndCardinality(bmor1, (bs[6])[1]), bitmapAndCardinality(bmor2, (bs[6])[1]), bitmapAndCardinality(bmor3, (bs[6])[1]), bitmapAndCardinality(bmor1, (bs[7])[1]), bitmapAndCardinality(bmor2, (bs[7])[1]), bitmapAndCardinality(bmor3, (bs[7])[1]), bitmapAndCardinality(bmor1, (bs[8])[1]), bitmapAndCardinality(bmor2, (bs[8])[1]), bitmapAndCardinality(bmor3, (bs[8])[1]), bitmapAndCardinality(bmor1, (bs[9])[1]), bitmapAndCardinality(bmor2, (bs[9])[1]), bitmapAndCardinality(bmor3, (bs[9])[1]), bitmapAndCardinality(bmor1, (bs[10])[1]), 
bitmapAndCardinality(bmor2, (bs[10])[1]), bitmapAndCardinality(bmor3, (bs[10])[1])]) AS flat_arr_2 from (SELECT toString(y1) AS y1, toString(x4) AS x4, arrayFlatten(groupArrayInsertAt(flat_arr, multiIf(date_ = '2022-03-01', 0, 1))) AS flat_arr, groupBitmapOrState(bmor1) AS bmor1, groupBitmapOrState(bmor2) AS bmor2, groupBitmapOrState(bmor3) AS bmor3 FROM (WITH '2022-03-01' AS start_ds SELECT y1, x4, groupBitmapOrState(bm) AS bmor1, groupBitmapOrStateIf(bm, y2 > 0) AS bmor2, groupBitmapOrStateIf(bm, y4 = 1) AS bmor3, [sum(y2 * bitmapAndCardinality(bm, (bs[1])[1])), sum(y2 * bitmapAndCardinality(bm, (bs[2])[1])), sum(y2 * bitmapAndCardinality(bm, (bs[3])[1])), sum(y2 * bitmapAndCardinality(bm, (bs[4])[1])), sum(y2 * bitmapAndCardinality(bm, (bs[5])[1])), sum(y2 * bitmapAndCardinality(bm, (bs[6])[1])), sum(y2 * bitmapAndCardinality(bm, (bs[7])[1])), sum(y2 * bitmapAndCardinality(bm, (bs[8])[1])), sum(y2 * bitmapAndCardinality(bm, (bs[9])[1])), sum(y2 * bitmapAndCardinality(bm, (bs[10])[1]))] AS flat_arr, start_ds AS date_ FROM tbl WHERE (ds = start_ds) AND (y1 IN (7063, 5010, 5006, 6788, 6176, 6203, 6769, 6555, 7062, 5119, 5007, 5212, 6814, 6177, 6789, 5095, 4942, 6243, 7061, 6744, 6201, 7196, 6181, 7195, 6178, 5004, 6790, 5008, 6877, 7281, 6791, 6179, 5214, 5005, 7146, 6980, 6322, 5222, 5217, 5137, 6561, 5133, 6937, 5142, 5130, 6885, 7250, 5103, 6867, 7066, 5096, 6868, 6199, 7269, 5131, 6414, 6884, 6560, 5136, 6883, 5158, 6869, 5097, 5132, 5102, 7251, 5219, 4695, 5220, 5202, 4203, 4204, 5098, 6870, 7064, 5101, 5105, 5140, 5135, 5139, 6880, 6194, 5218, 4202, 6655, 5104, 5183, 7245, 5100, 7065, 5099, 6938, 5138, 6881, 5134, 6886, 5141, 5129)) AND (x4 IN (0, 40, 80, 120, 160, 200, 240, 280, 320, 360, 400, 440, 480, 520, 560, 600, 640, 680, 720, 760, 800, 840, 880, 920, 960, 1000)) AND (y4 IN (0, 1)) GROUP BY y1, x4) GROUP BY y1, x4) LIMIT 1 + + DROP TABLE IF EXISTS tbl + DROP TABLE IF EXISTS tbl0 + DROP TABLE IF EXISTS tmp_acc_hit + From d3763c4a62590c19c4fa9525504edac591b49b2c Mon Sep 17 00:00:00 2001 From: Kseniia Sumarokova <54203879+kssenii@users.noreply.github.com> Date: Tue, 5 Apr 2022 15:38:44 +0200 Subject: [PATCH 209/239] Update ClientBase.h --- programs/client/Client.cpp | 1 + programs/local/LocalServer.cpp | 8 +++++++- programs/local/LocalServer.h | 2 +- src/Client/ClientBase.cpp | 2 +- src/Client/ClientBase.h | 4 +++- 5 files changed, 13 insertions(+), 4 deletions(-) diff --git a/programs/client/Client.cpp b/programs/client/Client.cpp index c2094b3b00d..b1baf978d87 100644 --- a/programs/client/Client.cpp +++ b/programs/client/Client.cpp @@ -1005,6 +1005,7 @@ void Client::processConfig() global_context->setCurrentQueryId(query_id); } print_stack_trace = config().getBool("stacktrace", false); + logging_initialized = true; if (config().has("multiquery")) is_multiquery = true; diff --git a/programs/local/LocalServer.cpp b/programs/local/LocalServer.cpp index 1dfb5c1d636..18b62e65765 100644 --- a/programs/local/LocalServer.cpp +++ b/programs/local/LocalServer.cpp @@ -434,8 +434,11 @@ catch (...) 
return getCurrentExceptionCode(); } -void LocalServer::setLogger(const String & logs_level) +void LocalServer::updateLoggerLevel(const String & logs_level) { + if (!logging_initialized) + return; + config().setString("logger.level", logs_level); updateLevels(config(), logger()); } @@ -475,6 +478,7 @@ void LocalServer::processConfig() auto poco_logs_level = Poco::Logger::parseLevel(level); Poco::Logger::root().setLevel(poco_logs_level); Poco::Logger::root().setChannel(Poco::AutoPtr(new Poco::SimpleFileChannel(server_logs_file))); + logging_initialized = true; } else if (logging || is_interactive) { @@ -482,11 +486,13 @@ void LocalServer::processConfig() auto log_level_default = is_interactive && !logging ? "none" : level; config().setString("logger.level", config().getString("log-level", config().getString("send_logs_level", log_level_default))); buildLoggers(config(), logger(), "clickhouse-local"); + logging_initialized = true; } else { Poco::Logger::root().setLevel("none"); Poco::Logger::root().setChannel(Poco::AutoPtr(new Poco::NullChannel())); + logging_initialized = false; } shared_context = Context::createShared(); diff --git a/programs/local/LocalServer.h b/programs/local/LocalServer.h index 3ee6d80136e..e96fb211554 100644 --- a/programs/local/LocalServer.h +++ b/programs/local/LocalServer.h @@ -46,7 +46,7 @@ protected: void processConfig() override; - void setLogger(const String & logs_level) override; + void updateLoggerLevel(const String & logs_level) override; private: /** Composes CREATE subquery based on passed arguments (--structure --file --table and --input-format) diff --git a/src/Client/ClientBase.cpp b/src/Client/ClientBase.cpp index 7f5b5b82082..568205c6dfe 100644 --- a/src/Client/ClientBase.cpp +++ b/src/Client/ClientBase.cpp @@ -1302,7 +1302,7 @@ void ClientBase::processParsedSingleQuery(const String & full_query, const Strin { const auto * logs_level_field = set_query->changes.tryGet(std::string_view{"send_logs_level"}); if (logs_level_field) - setLogger(logs_level_field->safeGet()); + updateLoggerLevel(logs_level_field->safeGet()); } processed_rows = 0; diff --git a/src/Client/ClientBase.h b/src/Client/ClientBase.h index 1ce8be94d39..6846fa247e8 100644 --- a/src/Client/ClientBase.h +++ b/src/Client/ClientBase.h @@ -95,7 +95,7 @@ protected: std::optional hosts_and_ports_description; }; - virtual void setLogger(const String &) { throw Exception(ErrorCodes::NOT_IMPLEMENTED, "Method `initializeLogger()` is not implemented for `{}`", getName()); } + virtual void updateLoggerLevel(const String &) {} virtual void printHelpMessage(const OptionsDescription & options_description) = 0; virtual void addOptions(OptionsDescription & options_description) = 0; virtual void processOptions(const OptionsDescription & options_description, @@ -266,6 +266,8 @@ protected: bool allow_repeated_settings = false; bool cancelled = false; + + bool logging_initialized = false; }; } From 0b0c8ef09e1409bceae44c134c2fbc8eee6044c8 Mon Sep 17 00:00:00 2001 From: taiyang-li <654010905@qq.com> Date: Wed, 6 Apr 2022 18:47:34 +0800 Subject: [PATCH 210/239] add integration tests --- src/Storages/Hive/StorageHive.cpp | 13 ++++++++++--- 1 file changed, 10 insertions(+), 3 deletions(-) diff --git a/src/Storages/Hive/StorageHive.cpp b/src/Storages/Hive/StorageHive.cpp index a7537f5a1f6..6c31a959269 100644 --- a/src/Storages/Hive/StorageHive.cpp +++ b/src/Storages/Hive/StorageHive.cpp @@ -176,7 +176,8 @@ public: /// Use local cache for remote storage if enabled. 
std::unique_ptr remote_read_buf; - if (ExternalDataSourceCache::instance().isInitialized() && getContext()->getSettingsRef().use_local_cache_for_remote_storage) + if (ExternalDataSourceCache::instance().isInitialized() + && getContext()->getSettingsRef().use_local_cache_for_remote_storage) { size_t buff_size = raw_read_buf->internalBuffer().size(); if (buff_size == 0) @@ -547,7 +548,8 @@ HiveFilePtr StorageHive::createHiveFileIfNeeded( if (!hivefile_key_condition.checkInHyperrectangle(hive_file->getMinMaxIndex()->hyperrectangle, hivefile_name_types.getTypes()) .can_be_true) { - LOG_TRACE(log, "Skip hive file {} by index {}", hive_file->getPath(), hive_file->describeMinMaxIndex(hive_file->getMinMaxIndex())); + LOG_TRACE( + log, "Skip hive file {} by index {}", hive_file->getPath(), hive_file->describeMinMaxIndex(hive_file->getMinMaxIndex())); return {}; } } @@ -563,7 +565,12 @@ HiveFilePtr StorageHive::createHiveFileIfNeeded( if (!hivefile_key_condition.checkInHyperrectangle(sub_minmax_idxes[i]->hyperrectangle, hivefile_name_types.getTypes()) .can_be_true) { - LOG_TRACE(log, "Skip split {} of hive file {}", i, hive_file->getPath()); + LOG_TRACE( + log, + "Skip split {} of hive file {} by index {}", + i, + hive_file->getPath(), + hive_file->describeMinMaxIndex(sub_minmax_idxes[i])); skip_splits.insert(i); } } From 5d8a1a4465e766d814630af3f2a2a87d32a6fdc4 Mon Sep 17 00:00:00 2001 From: taiyang-li <654010905@qq.com> Date: Wed, 6 Apr 2022 18:50:39 +0800 Subject: [PATCH 211/239] add integration tests --- tests/integration/test_hive_query/test.py | 87 +++++++++++++++++++++++ 1 file changed, 87 insertions(+) diff --git a/tests/integration/test_hive_query/test.py b/tests/integration/test_hive_query/test.py index 9e9a20fa6d1..6a8e152a4bd 100644 --- a/tests/integration/test_hive_query/test.py +++ b/tests/integration/test_hive_query/test.py @@ -149,6 +149,93 @@ def test_orc_groupby(started_cluster): assert result == expected_result +@pytest.mark.parametrize( + "table,use_local_cache_for_remote_storage,enable_orc_file_minmax_index,enable_orc_stripe_minmax_index", + [ + pytest.param( + "demo_orc_no_cache_no_index", + "false", + "false", + "false", + id="demo_orc_no_cache_no_index", + ), + pytest.param( + "demo_orc_with_cache_no_index", + "true", + "false", + "false", + id="demo_orc_with_cache_no_index", + ), + pytest.param( + "demo_orc_no_cache_file_index", + "false", + "true", + "false", + id="demo_orc_no_cache_file_index", + ), + pytest.param( + "demo_orc_with_cache_file_index", + "true", + "true", + "false", + id="demo_orc_with_cache_file_index", + ), + pytest.param( + "demo_orc_no_cache_stripe_index", + "false", + "true", + "true", + id="demo_orc_no_cache_stripe_index", + ), + pytest.param( + "demo_orc_with_cache_stripe_index", + "true", + "true", + "true", + id="demo_orc_with_cache_stripe_index", + ), + ], +) +def test_orc_minmax_index( + started_cluster, + table, + use_local_cache_for_remote_storage, + enable_orc_file_minmax_index, + enable_orc_stripe_minmax_index, +): + node = started_cluster.instances["h0_0_0"] + result = node.query( + """ + DROP TABLE IF EXISTS default.{table}; + CREATE TABLE default.{table} (`id` Nullable(String), `score` Nullable(Int32), `day` Nullable(String)) ENGINE = Hive('thrift://hivetest:9083', 'test', 'demo_orc') PARTITION BY(day) + SETTINGS enable_orc_file_minmax_index = {enable_orc_file_minmax_index}, enable_orc_stripe_minmax_index = {enable_orc_stripe_minmax_index}; + """.format( + table=table, + enable_orc_file_minmax_index=enable_orc_file_minmax_index, + 
enable_orc_stripe_minmax_index=enable_orc_stripe_minmax_index, + ) + ) + assert result.strip() == "" + + for i in range(2): + result = node.query( + """ + SELECT day, id, score FROM default.{table} where day >= '2021-11-05' and day <= '2021-11-16' and score >= 15 and score <= 30 order by day, id + SETTINGS use_local_cache_for_remote_storage = {use_local_cache_for_remote_storage} + """.format( + table=table, + use_local_cache_for_remote_storage=use_local_cache_for_remote_storage, + ) + ) + + assert ( + result.strip() + == """2021-11-05 abd 15 +2021-11-16 aaa 22 +""" + ) + + def test_hive_columns_prunning(started_cluster): logging.info("Start testing groupby ...") node = started_cluster.instances["h0_0_0"] From acbd11ccbcae8900a5cec43bc0a0d4147e5a34e4 Mon Sep 17 00:00:00 2001 From: taiyang-li <654010905@qq.com> Date: Wed, 6 Apr 2022 18:59:17 +0800 Subject: [PATCH 212/239] add integration tests --- tests/integration/test_hive_query/test.py | 67 +++++++++++++++++++++++ 1 file changed, 67 insertions(+) diff --git a/tests/integration/test_hive_query/test.py b/tests/integration/test_hive_query/test.py index 6a8e152a4bd..e1aa811c1a2 100644 --- a/tests/integration/test_hive_query/test.py +++ b/tests/integration/test_hive_query/test.py @@ -236,6 +236,73 @@ def test_orc_minmax_index( ) +@pytest.mark.parametrize( + "table,use_local_cache_for_remote_storage,enable_parquet_rowgroup_minmax_index", + [ + pytest.param( + "demo_parquet_no_cache_no_index", + "false", + "false", + id="demo_parquet_no_cache_no_index", + ), + pytest.param( + "demo_parquet_with_cache_no_index", + "true", + "false", + id="demo_parquet_with_cache_no_index", + ), + pytest.param( + "demo_parquet_no_cache_rowgroup_index", + "false", + "true", + id="demo_parquet_no_cache_rowgroup_index", + ), + pytest.param( + "demo_parquet_with_cache_rowgroup_index", + "true", + "true", + id="demo_parquet_with_cache_rowgroup_index", + ), + ], +) +def test_parquet_minmax_index( + started_cluster, + table, + use_local_cache_for_remote_storage, + enable_parquet_rowgroup_minmax_index, +): + node = started_cluster.instances["h0_0_0"] + result = node.query( + """ + DROP TABLE IF EXISTS default.{table}; + CREATE TABLE default.{table} (`id` Nullable(String), `score` Nullable(Int32), `day` Nullable(String)) ENGINE = Hive('thrift://hivetest:9083', 'test', 'demo') PARTITION BY(day) + SETTINGS enable_parquet_rowgroup_minmax_index = {enable_parquet_rowgroup_minmax_index} + """.format( + table=table, + enable_parquet_rowgroup_minmax_index=enable_parquet_rowgroup_minmax_index, + ) + ) + assert result.strip() == "" + + for i in range(2): + result = node.query( + """ + SELECT day, id, score FROM default.{table} where day >= '2021-11-05' and day <= '2021-11-16' and score >= 15 and score <= 30 order by day, id + SETTINGS use_local_cache_for_remote_storage = {use_local_cache_for_remote_storage} + """.format( + table=table, + use_local_cache_for_remote_storage=use_local_cache_for_remote_storage, + ) + ) + + assert ( + result.strip() + == """2021-11-05 abd 15 +2021-11-16 aaa 22 +""" + ) + + def test_hive_columns_prunning(started_cluster): logging.info("Start testing groupby ...") node = started_cluster.instances["h0_0_0"] From 058cde820fd06f76a0f0fcca7782540fa17d745f Mon Sep 17 00:00:00 2001 From: Azat Khuzhin Date: Wed, 6 Apr 2022 09:48:18 +0300 Subject: [PATCH 213/239] test/stress: improve error checks for backward compatiblity check Before the log was analyzed only after the server had been restarted after stressing, and hence it cannot find anything in case of error, 
like here [1]. [1]: https://s3.amazonaws.com/clickhouse-test-reports/34355/acd48ebe2d51d20d0c2a5bc545d46d27f9bf7904/stress_test__thread__actions_.html Signed-off-by: Azat Khuzhin --- docker/test/stress/run.sh | 21 +++++++++++---------- 1 file changed, 11 insertions(+), 10 deletions(-) diff --git a/docker/test/stress/run.sh b/docker/test/stress/run.sh index e56afcbfd7a..1ca49b5efaf 100755 --- a/docker/test/stress/run.sh +++ b/docker/test/stress/run.sh @@ -289,7 +289,7 @@ then install_packages package_folder mkdir tmp_stress_output - + ./stress --backward-compatibility-check --output-folder tmp_stress_output --global-time-limit=1200 \ && echo -e 'Backward compatibility check: Test script exit code\tOK' >> /test_output/test_results.tsv \ || echo -e 'Backward compatibility check: Test script failed\tFAIL' >> /test_output/test_results.tsv @@ -297,8 +297,9 @@ then clickhouse-client --query="SELECT 'Tables count:', count() FROM system.tables" - stop - + stop + mv /var/log/clickhouse-server/clickhouse-server.log /var/log/clickhouse-server/clickhouse-server.backward.stress.log + # Start new server configure start 500 @@ -310,8 +311,9 @@ then # Let the server run for a while before checking log. sleep 60 - + stop + mv /var/log/clickhouse-server/clickhouse-server.log /var/log/clickhouse-server/clickhouse-server.backward.clean.log # Error messages (we should ignore some errors) echo "Check for Error messages in server log:" @@ -332,7 +334,7 @@ then -e "Code: 1000, e.code() = 111, Connection refused" \ -e "UNFINISHED" \ -e "Renaming unexpected part" \ - /var/log/clickhouse-server/clickhouse-server.log | zgrep -Fa "" > /test_output/bc_check_error_messages.txt \ + /var/log/clickhouse-server/clickhouse-server.backward.*.log | zgrep -Fa "" > /test_output/bc_check_error_messages.txt \ && echo -e 'Backward compatibility check: Error message in clickhouse-server.log (see bc_check_error_messages.txt)\tFAIL' >> /test_output/test_results.tsv \ || echo -e 'Backward compatibility check: No Error messages in clickhouse-server.log\tOK' >> /test_output/test_results.tsv @@ -348,13 +350,13 @@ then rm -f /test_output/tmp # OOM - zgrep -Fa " Application: Child process was terminated by signal 9" /var/log/clickhouse-server/clickhouse-server.log* > /dev/null \ + zgrep -Fa " Application: Child process was terminated by signal 9" /var/log/clickhouse-server/clickhouse-server.backward.*.log > /dev/null \ && echo -e 'Backward compatibility check: OOM killer (or signal 9) in clickhouse-server.log\tFAIL' >> /test_output/test_results.tsv \ || echo -e 'Backward compatibility check: No OOM messages in clickhouse-server.log\tOK' >> /test_output/test_results.tsv # Logical errors echo "Check for Logical errors in server log:" - zgrep -Fa -A20 "Code: 49, e.displayText() = DB::Exception:" /var/log/clickhouse-server/clickhouse-server.log* > /test_output/bc_check_logical_errors.txt \ + zgrep -Fa -A20 "Code: 49, e.displayText() = DB::Exception:" /var/log/clickhouse-server/clickhouse-server.backward.*.log > /test_output/bc_check_logical_errors.txt \ && echo -e 'Backward compatibility check: Logical error thrown (see clickhouse-server.log or bc_check_logical_errors.txt)\tFAIL' >> /test_output/test_results.tsv \ || echo -e 'Backward compatibility check: No logical errors\tOK' >> /test_output/test_results.tsv @@ -362,19 +364,18 @@ then [ -s /test_output/bc_check_logical_errors.txt ] || rm /test_output/bc_check_logical_errors.txt # Crash - zgrep -Fa "########################################" /var/log/clickhouse-server/clickhouse-server.log* > 
/dev/null \ + zgrep -Fa "########################################" /var/log/clickhouse-server/clickhouse-server.backward.*.log > /dev/null \ && echo -e 'Backward compatibility check: Killed by signal (in clickhouse-server.log)\tFAIL' >> /test_output/test_results.tsv \ || echo -e 'Backward compatibility check: Not crashed\tOK' >> /test_output/test_results.tsv # It also checks for crash without stacktrace (printed by watchdog) echo "Check for Fatal message in server log:" - zgrep -Fa " " /var/log/clickhouse-server/clickhouse-server.log* > /test_output/bc_check_fatal_messages.txt \ + zgrep -Fa " " /var/log/clickhouse-server/clickhouse-server.backward.*.log > /test_output/bc_check_fatal_messages.txt \ && echo -e 'Backward compatibility check: Fatal message in clickhouse-server.log (see bc_check_fatal_messages.txt)\tFAIL' >> /test_output/test_results.tsv \ || echo -e 'Backward compatibility check: No fatal messages in clickhouse-server.log\tOK' >> /test_output/test_results.tsv # Remove file bc_check_fatal_messages.txt if it's empty [ -s /test_output/bc_check_fatal_messages.txt ] || rm /test_output/bc_check_fatal_messages.txt - else echo -e "Backward compatibility check: Failed to download previous release packets\tFAIL" >> /test_output/test_results.tsv fi From 2fb6d6acb4ff12488ce18fbb9c6687802edfa3e4 Mon Sep 17 00:00:00 2001 From: Azat Khuzhin Date: Wed, 6 Apr 2022 09:52:45 +0300 Subject: [PATCH 214/239] stress: use more meaningful names for clickhouse-server.log But note, that stderr.log/stdout.log/clickhouse-server.err.log still not separated. Signed-off-by: Azat Khuzhin --- docker/test/stress/run.sh | 19 +++++++------------ 1 file changed, 7 insertions(+), 12 deletions(-) diff --git a/docker/test/stress/run.sh b/docker/test/stress/run.sh index 1ca49b5efaf..ba85999caa5 100755 --- a/docker/test/stress/run.sh +++ b/docker/test/stress/run.sh @@ -106,17 +106,6 @@ function stop() function start() { - # Rename existing log file - it will be more convenient to read separate files for separate server runs. 
- if [ -f '/var/log/clickhouse-server/clickhouse-server.log' ] - then - log_file_counter=1 - while [ -f "/var/log/clickhouse-server/clickhouse-server.log.${log_file_counter}" ] - do - log_file_counter=$((log_file_counter + 1)) - done - mv '/var/log/clickhouse-server/clickhouse-server.log' "/var/log/clickhouse-server/clickhouse-server.log.${log_file_counter}" - fi - counter=0 until clickhouse-client --query "SELECT 1" do @@ -190,6 +179,8 @@ clickhouse-client --query "ATTACH DATABASE IF NOT EXISTS datasets ENGINE = Ordin clickhouse-client --query "CREATE DATABASE IF NOT EXISTS test" stop +mv /var/log/clickhouse-server/clickhouse-server.log /var/log/clickhouse-server/clickhouse-server.initial.log + start clickhouse-client --query "SHOW TABLES FROM datasets" @@ -205,6 +196,8 @@ clickhouse-client --query "SHOW TABLES FROM test" || echo -e 'Test script failed\tFAIL' >> /test_output/test_results.tsv stop +mv /var/log/clickhouse-server/clickhouse-server.log /var/log/clickhouse-server/clickhouse-server.stress.log + start clickhouse-client --query "SELECT 'Server successfully started', 'OK'" >> /test_output/test_results.tsv \ @@ -263,10 +256,12 @@ mkdir previous_release_package_folder clickhouse-client --query="SELECT version()" | ./download_previous_release && echo -e 'Download script exit code\tOK' >> /test_output/test_results.tsv \ || echo -e 'Download script failed\tFAIL' >> /test_output/test_results.tsv +stop +mv /var/log/clickhouse-server/clickhouse-server.log /var/log/clickhouse-server/clickhouse-server.clean.log + if [ "$(ls -A previous_release_package_folder/clickhouse-common-static_*.deb && ls -A previous_release_package_folder/clickhouse-server_*.deb)" ] then echo -e "Successfully downloaded previous release packets\tOK" >> /test_output/test_results.tsv - stop # Uninstall current packages dpkg --remove clickhouse-client From 4cb7b7e49baadbaabb23f46fd0f45bce2d12b0a9 Mon Sep 17 00:00:00 2001 From: vdimir Date: Wed, 6 Apr 2022 11:15:37 +0000 Subject: [PATCH 215/239] Create parent directories in DiskLocal::replaceFile --- src/Disks/DiskLocal.cpp | 1 + ...265_rename_join_ordinary_to_atomic.reference | 1 + .../02265_rename_join_ordinary_to_atomic.sql | 17 +++++++++++++++++ 3 files changed, 19 insertions(+) create mode 100644 tests/queries/0_stateless/02265_rename_join_ordinary_to_atomic.reference create mode 100644 tests/queries/0_stateless/02265_rename_join_ordinary_to_atomic.sql diff --git a/src/Disks/DiskLocal.cpp b/src/Disks/DiskLocal.cpp index 44fdbb77323..a91db508295 100644 --- a/src/Disks/DiskLocal.cpp +++ b/src/Disks/DiskLocal.cpp @@ -333,6 +333,7 @@ void DiskLocal::replaceFile(const String & from_path, const String & to_path) { fs::path from_file = fs::path(disk_path) / from_path; fs::path to_file = fs::path(disk_path) / to_path; + fs::create_directories(to_file.parent_path()); fs::rename(from_file, to_file); } diff --git a/tests/queries/0_stateless/02265_rename_join_ordinary_to_atomic.reference b/tests/queries/0_stateless/02265_rename_join_ordinary_to_atomic.reference new file mode 100644 index 00000000000..58c9bdf9d01 --- /dev/null +++ b/tests/queries/0_stateless/02265_rename_join_ordinary_to_atomic.reference @@ -0,0 +1 @@ +111 diff --git a/tests/queries/0_stateless/02265_rename_join_ordinary_to_atomic.sql b/tests/queries/0_stateless/02265_rename_join_ordinary_to_atomic.sql new file mode 100644 index 00000000000..3ec995a6a24 --- /dev/null +++ b/tests/queries/0_stateless/02265_rename_join_ordinary_to_atomic.sql @@ -0,0 +1,17 @@ +-- Tags: no-parallel + +DROP DATABASE IF EXISTS 
02265_atomic_db; +DROP DATABASE IF EXISTS 02265_ordinary_db; + +CREATE DATABASE 02265_atomic_db ENGINE = Atomic; +CREATE DATABASE 02265_ordinary_db ENGINE = Ordinary; + +CREATE TABLE 02265_ordinary_db.join_table ( `a` Int64 ) ENGINE = Join(`ALL`, LEFT, a); +INSERT INTO 02265_ordinary_db.join_table VALUES (111); + +RENAME TABLE 02265_ordinary_db.join_table TO 02265_atomic_db.join_table; + +SELECT * FROM 02265_atomic_db.join_table; + +DROP DATABASE IF EXISTS 02265_atomic_db; +DROP DATABASE IF EXISTS 02265_ordinary_db; From 1309e781b624fdfa5c5886eb25f5f2ee52c5fe89 Mon Sep 17 00:00:00 2001 From: Alexander Tokmakov Date: Wed, 6 Apr 2022 13:56:26 +0200 Subject: [PATCH 216/239] apply suggestion --- src/Storages/StorageReplicatedMergeTree.cpp | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/src/Storages/StorageReplicatedMergeTree.cpp b/src/Storages/StorageReplicatedMergeTree.cpp index 1127337adff..39840f91325 100644 --- a/src/Storages/StorageReplicatedMergeTree.cpp +++ b/src/Storages/StorageReplicatedMergeTree.cpp @@ -969,7 +969,7 @@ bool StorageReplicatedMergeTree::removeTableNodesFromZooKeeper(zkutil::ZooKeeper static constexpr std::array flat_nodes = {"block_numbers", "blocks", "leader_election", "log", "mutations", "pinned_part_uuids"}; /// First try to remove paths that are known to be flat - for (const auto & node : flat_nodes) + for (const auto * node : flat_nodes) { bool removed_quickly = zookeeper->tryRemoveChildrenRecursive(fs::path(zookeeper_path) / node, /* probably flat */ true); if (!removed_quickly) @@ -979,7 +979,7 @@ bool StorageReplicatedMergeTree::removeTableNodesFromZooKeeper(zkutil::ZooKeeper /// Then try to remove nodes that are known to have no children (and should always exist) Coordination::Requests ops; - for (const auto & node : flat_nodes) + for (const auto * node : flat_nodes) ops.emplace_back(zkutil::makeRemoveRequest(zookeeper_path + "/" + node, -1)); ops.emplace_back(zkutil::makeRemoveRequest(zookeeper_path + "/alter_partition_version", -1)); From 53628092e2ae20d6f4054344438cda041aa1dfde Mon Sep 17 00:00:00 2001 From: Amos Bird Date: Wed, 6 Apr 2022 20:04:13 +0800 Subject: [PATCH 217/239] Fix test --- .../queries/0_stateless/02174_cte_scalar_cache_mv.reference | 2 +- tests/queries/0_stateless/02174_cte_scalar_cache_mv.sql | 6 +++--- 2 files changed, 4 insertions(+), 4 deletions(-) diff --git a/tests/queries/0_stateless/02174_cte_scalar_cache_mv.reference b/tests/queries/0_stateless/02174_cte_scalar_cache_mv.reference index 246706164df..055c88160ad 100644 --- a/tests/queries/0_stateless/02174_cte_scalar_cache_mv.reference +++ b/tests/queries/0_stateless/02174_cte_scalar_cache_mv.reference @@ -18,7 +18,7 @@ 89 89 89 89 5 94 94 94 94 5 99 99 99 99 5 -02177_MV 7 80 22 +02177_MV 3 80 26 10 40 70 diff --git a/tests/queries/0_stateless/02174_cte_scalar_cache_mv.sql b/tests/queries/0_stateless/02174_cte_scalar_cache_mv.sql index 4d4447c7f31..742d72fe2b2 100644 --- a/tests/queries/0_stateless/02174_cte_scalar_cache_mv.sql +++ b/tests/queries/0_stateless/02174_cte_scalar_cache_mv.sql @@ -39,13 +39,13 @@ SYSTEM FLUSH LOGS; -- The main query should have a cache miss and 3 global hits -- The MV is executed 20 times (100 / 5) and each run does 1 miss and 4 hits to the LOCAL cache -- In addition to this, to prepare the MV, there is an extra preparation to get the list of columns via --- InterpreterSelectQuery, which adds 1 miss and 4 global hits (since it uses the global cache) +-- InterpreterSelectQuery, which adds 5 miss (since we don't use cache for 
preparation) -- So in total we have: -- Main query: 1 miss, 3 global --- Preparation: 1 miss, 4 global +-- Preparation: 5 miss -- Blocks (20): 20 miss, 0 global, 80 local hits --- TOTAL: 22 miss, 7 global, 80 local +-- TOTAL: 26 miss, 3 global, 80 local SELECT '02177_MV', ProfileEvents['ScalarSubqueriesGlobalCacheHit'] as scalar_cache_global_hit, From e2044b65049aa343268dee72f5cfbfe386e05edb Mon Sep 17 00:00:00 2001 From: Denis Glazachev Date: Wed, 6 Apr 2022 16:17:22 +0400 Subject: [PATCH 218/239] Set ENABLE_BUILD_PATH_MAPPING to OFF by default, if CMAKE_BUILD_TYPE is set to Debug --- CMakeLists.txt | 13 +++++++++---- 1 file changed, 9 insertions(+), 4 deletions(-) diff --git a/CMakeLists.txt b/CMakeLists.txt index a9ce64b87ba..d893ba773cc 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -294,14 +294,19 @@ include(cmake/cpu_features.cmake) # Enable it explicitly. set (COMPILER_FLAGS "${COMPILER_FLAGS} -fasynchronous-unwind-tables") -# Reproducible builds -# If turned `ON`, remap file source paths in debug info, predefined preprocessor macros and __builtin_FILE(). -option(ENABLE_BUILD_PATH_MAPPING "Enable remap file source paths in debug info, predefined preprocessor macros and __builtin_FILE(). It's to generate reproducible builds. See https://reproducible-builds.org/docs/build-path" ON) +# Reproducible builds. +if (CMAKE_BUILD_TYPE_UC STREQUAL "DEBUG") + set (ENABLE_BUILD_PATH_MAPPING_DEFAULT OFF) +else () + set (ENABLE_BUILD_PATH_MAPPING_DEFAULT ON) +endif () + +option (ENABLE_BUILD_PATH_MAPPING "Enable remapping of file source paths in debug info, predefined preprocessor macros, and __builtin_FILE(). It's used to generate reproducible builds. See https://reproducible-builds.org/docs/build-path" ${ENABLE_BUILD_PATH_MAPPING_DEFAULT}) if (ENABLE_BUILD_PATH_MAPPING) set (COMPILER_FLAGS "${COMPILER_FLAGS} -ffile-prefix-map=${CMAKE_SOURCE_DIR}=.") set (CMAKE_ASM_FLAGS "${CMAKE_ASM_FLAGS} -ffile-prefix-map=${CMAKE_SOURCE_DIR}=.") -endif() +endif () if (${CMAKE_VERSION} VERSION_LESS "3.12.4") # CMake < 3.12 doesn't support setting 20 as a C++ standard version. From 59e47472f182f9541af5048b7da29f52d61c7e0e Mon Sep 17 00:00:00 2001 From: taiyang-li <654010905@qq.com> Date: Wed, 6 Apr 2022 20:51:31 +0800 Subject: [PATCH 219/239] fix integration tests --- tests/integration/test_hive_query/test.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/tests/integration/test_hive_query/test.py b/tests/integration/test_hive_query/test.py index e1aa811c1a2..374a86d51e8 100644 --- a/tests/integration/test_hive_query/test.py +++ b/tests/integration/test_hive_query/test.py @@ -229,7 +229,7 @@ def test_orc_minmax_index( ) assert ( - result.strip() + result == """2021-11-05 abd 15 2021-11-16 aaa 22 """ @@ -296,7 +296,7 @@ def test_parquet_minmax_index( ) assert ( - result.strip() + result == """2021-11-05 abd 15 2021-11-16 aaa 22 """ From 19e0e6e7652101dbf3a14137c25254adc0147c69 Mon Sep 17 00:00:00 2001 From: "Mikhail f. 
Shiryaev" Date: Wed, 6 Apr 2022 14:27:29 +0200 Subject: [PATCH 220/239] Fix failing BuilderReport --- .github/workflows/backport_branches.yml | 11 ++++++++--- .github/workflows/master.yml | 12 +++++++++--- .github/workflows/pull_request.yml | 12 +++++++++--- .github/workflows/release_branches.yml | 12 +++++++++--- tests/ci/build_report_check.py | 4 +++- 5 files changed, 38 insertions(+), 13 deletions(-) diff --git a/.github/workflows/backport_branches.yml b/.github/workflows/backport_branches.yml index 75f8a63368d..417284f14d5 100644 --- a/.github/workflows/backport_branches.yml +++ b/.github/workflows/backport_branches.yml @@ -341,10 +341,15 @@ jobs: steps: - name: Set envs run: | + DEPENDENCIES=$(cat << 'EOF' | jq '. | length' + ${{ toJSON(needs) }} + EOF + ) + echo "DEPENDENCIES=$DEPENDENCIES" >> "$GITHUB_ENV" cat >> "$GITHUB_ENV" << 'EOF' - TEMP_PATH=${{runner.temp}}/report_check - REPORTS_PATH=${{runner.temp}}/reports_dir CHECK_NAME=ClickHouse build check (actions) + REPORTS_PATH=${{runner.temp}}/reports_dir + TEMP_PATH=${{runner.temp}}/report_check EOF - name: Download json reports uses: actions/download-artifact@v2 @@ -360,7 +365,7 @@ jobs: sudo rm -fr "$TEMP_PATH" mkdir -p "$TEMP_PATH" cd "$GITHUB_WORKSPACE/tests/ci" - python3 build_report_check.py "$CHECK_NAME" + python3 build_report_check.py "$CHECK_NAME" "$DEPENDENCIES" - name: Cleanup if: always() run: | diff --git a/.github/workflows/master.yml b/.github/workflows/master.yml index 081fa165c68..eab7ce36eb7 100644 --- a/.github/workflows/master.yml +++ b/.github/workflows/master.yml @@ -992,10 +992,16 @@ jobs: steps: - name: Set envs run: | + DEPENDENCIES=$(cat << 'EOF' | jq '. | length' + ${{ toJSON(needs) }} + EOF + ) + echo "DEPENDENCIES=$DEPENDENCIES" >> "$GITHUB_ENV" cat >> "$GITHUB_ENV" << 'EOF' - TEMP_PATH=${{runner.temp}}/report_check - REPORTS_PATH=${{runner.temp}}/reports_dir CHECK_NAME=ClickHouse build check (actions) + REPORTS_PATH=${{runner.temp}}/reports_dir + REPORTS_PATH=${{runner.temp}}/reports_dir + TEMP_PATH=${{runner.temp}}/report_check EOF - name: Download json reports uses: actions/download-artifact@v2 @@ -1011,7 +1017,7 @@ jobs: sudo rm -fr "$TEMP_PATH" mkdir -p "$TEMP_PATH" cd "$GITHUB_WORKSPACE/tests/ci" - python3 build_report_check.py "$CHECK_NAME" + python3 build_report_check.py "$CHECK_NAME" "$DEPENDENCIES" - name: Cleanup if: always() run: | diff --git a/.github/workflows/pull_request.yml b/.github/workflows/pull_request.yml index c01d1821d0f..8942cca391e 100644 --- a/.github/workflows/pull_request.yml +++ b/.github/workflows/pull_request.yml @@ -1044,10 +1044,16 @@ jobs: steps: - name: Set envs run: | + DEPENDENCIES=$(cat << 'EOF' | jq '. 
| length' + ${{ toJSON(needs) }} + EOF + ) + echo "DEPENDENCIES=$DEPENDENCIES" >> "$GITHUB_ENV" cat >> "$GITHUB_ENV" << 'EOF' - TEMP_PATH=${{runner.temp}}/report_check - REPORTS_PATH=${{runner.temp}}/reports_dir CHECK_NAME=ClickHouse build check (actions) + REPORTS_PATH=${{runner.temp}}/reports_dir + REPORTS_PATH=${{runner.temp}}/reports_dir + TEMP_PATH=${{runner.temp}}/report_check EOF - name: Download json reports uses: actions/download-artifact@v2 @@ -1063,7 +1069,7 @@ jobs: sudo rm -fr "$TEMP_PATH" mkdir -p "$TEMP_PATH" cd "$GITHUB_WORKSPACE/tests/ci" - python3 build_report_check.py "$CHECK_NAME" + python3 build_report_check.py "$CHECK_NAME" "$DEPENDENCIES" - name: Cleanup if: always() run: | diff --git a/.github/workflows/release_branches.yml b/.github/workflows/release_branches.yml index d916699acc2..b2af465142b 100644 --- a/.github/workflows/release_branches.yml +++ b/.github/workflows/release_branches.yml @@ -436,10 +436,16 @@ jobs: steps: - name: Set envs run: | + DEPENDENCIES=$(cat << 'EOF' | jq '. | length' + ${{ toJSON(needs) }} + EOF + ) + echo "DEPENDENCIES=$DEPENDENCIES" >> "$GITHUB_ENV" cat >> "$GITHUB_ENV" << 'EOF' - TEMP_PATH=${{runner.temp}}/report_check - REPORTS_PATH=${{runner.temp}}/reports_dir CHECK_NAME=ClickHouse build check (actions) + REPORTS_PATH=${{runner.temp}}/reports_dir + REPORTS_PATH=${{runner.temp}}/reports_dir + TEMP_PATH=${{runner.temp}}/report_check EOF - name: Download json reports uses: actions/download-artifact@v2 @@ -455,7 +461,7 @@ jobs: sudo rm -fr "$TEMP_PATH" mkdir -p "$TEMP_PATH" cd "$GITHUB_WORKSPACE/tests/ci" - python3 build_report_check.py "$CHECK_NAME" + python3 build_report_check.py "$CHECK_NAME" "$DEPENDENCIES" - name: Cleanup if: always() run: | diff --git a/tests/ci/build_report_check.py b/tests/ci/build_report_check.py index 4e0859ef865..dc3126d32c3 100644 --- a/tests/ci/build_report_check.py +++ b/tests/ci/build_report_check.py @@ -120,6 +120,7 @@ if __name__ == "__main__": os.makedirs(temp_path) build_check_name = sys.argv[1] + reports_length = int(sys.argv[2]) if len(sys.argv) > 2 else 0 gh = Github(get_best_robot_token()) pr_info = PRInfo() @@ -148,7 +149,8 @@ if __name__ == "__main__": build_name, ) - some_builds_are_missing = len(build_reports_map) < len(reports_order) + reports_length = reports_length or len(reports_order) + some_builds_are_missing = len(build_reports_map) < reports_length if some_builds_are_missing: logging.info( From 1323c3617f70b1d79e4f92415b181effb3eb812a Mon Sep 17 00:00:00 2001 From: "Mikhail f. 
Shiryaev" Date: Wed, 6 Apr 2022 14:34:20 +0200 Subject: [PATCH 221/239] Fix linter points --- tests/ci/build_report_check.py | 10 +++++----- 1 file changed, 5 insertions(+), 5 deletions(-) diff --git a/tests/ci/build_report_check.py b/tests/ci/build_report_check.py index dc3126d32c3..f29c6ced122 100644 --- a/tests/ci/build_report_check.py +++ b/tests/ci/build_report_check.py @@ -139,7 +139,7 @@ if __name__ == "__main__": logging.info("Found build report json %s", f) build_name = get_build_name_from_file_name(f) if build_name in reports_order: - with open(os.path.join(root, f), "r") as file_handler: + with open(os.path.join(root, f), "rb") as file_handler: build_report = json.load(file_handler) build_reports_map[build_name] = build_report else: @@ -190,7 +190,7 @@ if __name__ == "__main__": branch_url = f"{GITHUB_SERVER_URL}/{GITHUB_REPOSITORY}/commits/master" branch_name = "master" if pr_info.number != 0: - branch_name = "PR #{}".format(pr_info.number) + branch_name = f"PR #{pr_info.number}" branch_url = f"{GITHUB_SERVER_URL}/{GITHUB_REPOSITORY}/pull/{pr_info.number}" commit_url = f"{GITHUB_SERVER_URL}/{GITHUB_REPOSITORY}/commit/{pr_info.sha}" task_url = GITHUB_RUN_URL @@ -206,8 +206,8 @@ if __name__ == "__main__": ) report_path = os.path.join(temp_path, "report.html") - with open(report_path, "w") as f: - f.write(report) + with open(report_path, "w", encoding="utf-8") as fd: + fd.write(report) logging.info("Going to upload prepared report") context_name_for_path = build_check_name.lower().replace(" ", "_") @@ -241,7 +241,7 @@ if __name__ == "__main__": description = f"{ok_builds}/{total_builds} builds are OK {addition}" - print("::notice ::Report url: {}".format(url)) + print(f"::notice ::Report url: {url}") commit = get_commit(gh, pr_info.sha) commit.create_status( From 12442fd0a5bd915c9bbe9fd602c0fe699c76cf43 Mon Sep 17 00:00:00 2001 From: "Mikhail f. 
Shiryaev" Date: Wed, 6 Apr 2022 14:39:23 +0200 Subject: [PATCH 222/239] Add description for missing builds --- tests/ci/build_report_check.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/tests/ci/build_report_check.py b/tests/ci/build_report_check.py index f29c6ced122..7ca958bd745 100644 --- a/tests/ci/build_report_check.py +++ b/tests/ci/build_report_check.py @@ -237,7 +237,7 @@ if __name__ == "__main__": addition = "" if some_builds_are_missing: - addition = "(some builds are missing)" + addition = f"({len(build_reports_map)} < {reports_length})" description = f"{ok_builds}/{total_builds} builds are OK {addition}" From 57ddb18f861c683949b174ac5cb37f4396ac7bc7 Mon Sep 17 00:00:00 2001 From: Kruglov Pavel <48961922+Avogar@users.noreply.github.com> Date: Wed, 6 Apr 2022 16:04:55 +0200 Subject: [PATCH 223/239] Make test 00159_parallel_formatting_tsv_and_friends.sh more stable --- .../1_stateful/00159_parallel_formatting_tsv_and_friends.sh | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/tests/queries/1_stateful/00159_parallel_formatting_tsv_and_friends.sh b/tests/queries/1_stateful/00159_parallel_formatting_tsv_and_friends.sh index 9d48774dd2d..02441190b91 100755 --- a/tests/queries/1_stateful/00159_parallel_formatting_tsv_and_friends.sh +++ b/tests/queries/1_stateful/00159_parallel_formatting_tsv_and_friends.sh @@ -10,10 +10,10 @@ FORMATS=('TSV' 'TSVWithNames' 'TSKV') for format in "${FORMATS[@]}" do echo "$format, false"; - $CLICKHOUSE_CLIENT --output_format_parallel_formatting=false -q \ + $CLICKHOUSE_CLIENT --max_threads=0 --output_format_parallel_formatting=false -q \ "SELECT ClientEventTime::DateTime('Asia/Dubai') as a, MobilePhoneModel as b, ClientIP6 as c FROM test.hits ORDER BY a, b, c Format $format" | md5sum echo "$format, true"; - $CLICKHOUSE_CLIENT --output_format_parallel_formatting=true -q \ + $CLICKHOUSE_CLIENT --max_threads=0 --output_format_parallel_formatting=true -q \ "SELECT ClientEventTime::DateTime('Asia/Dubai') as a, MobilePhoneModel as b, ClientIP6 as c FROM test.hits ORDER BY a, b, c Format $format" | md5sum done From 1cbacbc3e61872fce5c14a64f6b80c3404bda2d6 Mon Sep 17 00:00:00 2001 From: robot-clickhouse Date: Wed, 6 Apr 2022 14:28:30 +0000 Subject: [PATCH 224/239] Update version_date.tsv after v22.3.3.44-lts --- utils/list-versions/version_date.tsv | 1 + 1 file changed, 1 insertion(+) diff --git a/utils/list-versions/version_date.tsv b/utils/list-versions/version_date.tsv index e87c4ea2b46..6366aef19ce 100644 --- a/utils/list-versions/version_date.tsv +++ b/utils/list-versions/version_date.tsv @@ -1,3 +1,4 @@ +v22.3.3.44-lts 2022-04-06 v22.3.2.2-lts 2022-03-17 v22.2.3.5-stable 2022-02-25 v22.2.2.1-stable 2022-02-17 From 00137eaa342cb31296701a70c0e1e8ebd7dd181c Mon Sep 17 00:00:00 2001 From: tavplubix Date: Wed, 6 Apr 2022 19:13:56 +0300 Subject: [PATCH 225/239] Update test.py --- tests/integration/test_system_merges/test.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/tests/integration/test_system_merges/test.py b/tests/integration/test_system_merges/test.py index 9239cb11065..775706f4df6 100644 --- a/tests/integration/test_system_merges/test.py +++ b/tests/integration/test_system_merges/test.py @@ -124,7 +124,7 @@ def test_merge_simple(started_cluster, replicated): assert ( node_check.query( - "SELECT * FROM system.merges WHERE table = '{name}'".format( + "SELECT * FROM system.merges WHERE table = '{name}' and progress < 1".format( name=table_name ) ) From 5af0537f2cc5da4ee27d659b6c9896810757e22c Mon 
Sep 17 00:00:00 2001 From: tavplubix Date: Wed, 6 Apr 2022 19:23:33 +0300 Subject: [PATCH 226/239] Update clickhouse-test --- tests/clickhouse-test | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/tests/clickhouse-test b/tests/clickhouse-test index de36fc3da27..9ca820652d9 100755 --- a/tests/clickhouse-test +++ b/tests/clickhouse-test @@ -371,7 +371,7 @@ class SettingsRandomizer: "group_by_two_level_threshold_bytes": lambda: 1 if random.random() < 0.1 else 2 ** 60 if random.random() < 0.11 else 50000000, "distributed_aggregation_memory_efficient": lambda: random.randint(0, 1), "fsync_metadata": lambda: random.randint(0, 1), - "priority": lambda: int(abs(random.gauss(0, 2))), + "priority": lambda: min(3, int(abs(random.gauss(0, 2)))), "output_format_parallel_formatting": lambda: random.randint(0, 1), "input_format_parallel_parsing": lambda: random.randint(0, 1), "min_chunk_bytes_for_parallel_parsing": lambda: max(1024, int(random.gauss(10 * 1024 * 1024, 5 * 1000 * 1000))), From 14b860f9bbf9408e821a70cd37e44d295e08b3d3 Mon Sep 17 00:00:00 2001 From: LAL2211 Date: Wed, 6 Apr 2022 12:39:52 -0400 Subject: [PATCH 227/239] disallow the use of known weak implementations in requirements --- .../requirements/requirements.md | 33 ++-------- .../requirements/requirements.py | 65 +++---------------- 2 files changed, 12 insertions(+), 86 deletions(-) diff --git a/tests/testflows/aes_encryption/requirements/requirements.md b/tests/testflows/aes_encryption/requirements/requirements.md index 80cb614268c..23906f797d0 100644 --- a/tests/testflows/aes_encryption/requirements/requirements.md +++ b/tests/testflows/aes_encryption/requirements/requirements.md @@ -311,7 +311,7 @@ version: 1.0 of the `encrypt` function where the `key_length` SHALL specifies the length of the key and SHALL accept `128`, `192`, or `256` as the values and the `mode` SHALL specify the block encryption -mode and SHALL accept [ECB], [CBC], [CFB128], or [OFB] as well as +mode and SHALL accept [CBC], [CFB128], or [OFB] as well as [CTR] and [GCM] as the values. For example, `aes-256-ofb`. 
#### RQ.SRS008.AES.Encrypt.Function.Parameters.Mode.Value.Invalid @@ -327,9 +327,6 @@ version: 1.0 [ClickHouse] SHALL support the following [AES] block encryption modes as the value for the `mode` parameter of the `encrypt` function: -* `aes-128-ecb` that SHALL use [ECB] block mode encryption with 128 bit key -* `aes-192-ecb` that SHALL use [ECB] block mode encryption with 192 bit key -* `aes-256-ecb` that SHALL use [ECB] block mode encryption with 256 bit key * `aes-128-cbc` that SHALL use [CBC] block mode encryption with 128 bit key * `aes-192-cbc` that SHALL use [CBC] block mode encryption with 192 bit key * `aes-192-cbc` that SHALL use [CBC] block mode encryption with 256 bit key @@ -403,9 +400,6 @@ version: 1.0 [ClickHouse] SHALL return an error when the `encrypt` function is called with the following parameter values when using non-GCM modes -* `aes-128-ecb` mode and `key` is not 16 bytes or `iv` or `aad` is specified -* `aes-192-ecb` mode and `key` is not 24 bytes or `iv` or `aad` is specified -* `aes-256-ecb` mode and `key` is not 32 bytes or `iv` or `aad` is specified * `aes-128-cbc` mode and `key` is not 16 bytes or if specified `iv` is not 16 bytes or `aad` is specified * `aes-192-cbc` mode and `key` is not 24 bytes or if specified `iv` is not 16 bytes or `aad` is specified * `aes-256-cbc` mode and `key` is not 32 bytes or if specified `iv` is not 16 bytes or `aad` is specified @@ -476,7 +470,7 @@ version: 1.0 of the `decrypt` function where the `key_length` SHALL specifies the length of the key and SHALL accept `128`, `192`, or `256` as the values and the `mode` SHALL specify the block encryption -mode and SHALL accept [ECB], [CBC], [CFB128], or [OFB] as well as +mode and SHALL accept [CBC], [CFB128], or [OFB] as well as [CTR] and [GCM] as the values. For example, `aes-256-ofb`. #### RQ.SRS008.AES.Decrypt.Function.Parameters.Mode.Value.Invalid @@ -492,9 +486,6 @@ version: 1.0 [ClickHouse] SHALL support the following [AES] block encryption modes as the value for the `mode` parameter of the `decrypt` function: -* `aes-128-ecb` that SHALL use [ECB] block mode encryption with 128 bit key -* `aes-192-ecb` that SHALL use [ECB] block mode encryption with 192 bit key -* `aes-256-ecb` that SHALL use [ECB] block mode encryption with 256 bit key * `aes-128-cbc` that SHALL use [CBC] block mode encryption with 128 bit key * `aes-192-cbc` that SHALL use [CBC] block mode encryption with 192 bit key * `aes-192-cbc` that SHALL use [CBC] block mode encryption with 256 bit key @@ -570,9 +561,6 @@ version: 1.0 [ClickHouse] SHALL return an error when the `decrypt` function is called with the following parameter values when using non-GCM modes -* `aes-128-ecb` mode and `key` is not 16 bytes or `iv` or `aad` is specified -* `aes-192-ecb` mode and `key` is not 24 bytes or `iv` or `aad` is specified -* `aes-256-ecb` mode and `key` is not 32 bytes or `iv` or `aad` is specified * `aes-128-cbc` mode and `key` is not 16 bytes or if specified `iv` is not 16 bytes or `aad` is specified * `aes-192-cbc` mode and `key` is not 24 bytes or if specified `iv` is not 16 bytes or `aad` is specified * `aes-256-cbc` mode and `key` is not 32 bytes or if specified `iv` is not 16 bytes or `aad` is specified @@ -644,7 +632,7 @@ version: 1.0 of the `aes_encrypt_mysql` function where the `key_length` SHALL specifies the length of the key and SHALL accept `128`, `192`, or `256` as the values and the `mode` SHALL specify the block encryption -mode and SHALL accept [ECB], [CBC], [CFB128], or [OFB]. For example, `aes-256-ofb`. 
+mode and SHALL accept [CBC], [CFB128], or [OFB]. For example, `aes-256-ofb`. #### RQ.SRS008.AES.MySQL.Encrypt.Function.Parameters.Mode.Value.Invalid version: 1.0 @@ -659,9 +647,6 @@ version: 1.0 [ClickHouse] SHALL support the following [AES] block encryption modes as the value for the `mode` parameter of the `aes_encrypt_mysql` function: -* `aes-128-ecb` that SHALL use [ECB] block mode encryption with 128 bit key -* `aes-192-ecb` that SHALL use [ECB] block mode encryption with 192 bit key -* `aes-256-ecb` that SHALL use [ECB] block mode encryption with 256 bit key * `aes-128-cbc` that SHALL use [CBC] block mode encryption with 128 bit key * `aes-192-cbc` that SHALL use [CBC] block mode encryption with 192 bit key * `aes-192-cbc` that SHALL use [CBC] block mode encryption with 256 bit key @@ -750,9 +735,6 @@ version: 1.0 [ClickHouse] SHALL return an error when the `aes_encrypt_mysql` function is called with the following parameter values -* `aes-128-ecb` mode and `key` is less than 16 bytes or `iv` is specified -* `aes-192-ecb` mode and `key` is less than 24 bytes or `iv` is specified -* `aes-256-ecb` mode and `key` is less than 32 bytes or `iv` is specified * `aes-128-cbc` mode and `key` is less than 16 bytes or if specified `iv` is less than 16 bytes * `aes-192-cbc` mode and `key` is less than 24 bytes or if specified `iv` is less than 16 bytes * `aes-256-cbc` mode and `key` is less than 32 bytes or if specified `iv` is less than 16 bytes @@ -810,7 +792,7 @@ version: 1.0 of the `aes_decrypt_mysql` function where the `key_length` SHALL specifies the length of the key and SHALL accept `128`, `192`, or `256` as the values and the `mode` SHALL specify the block encryption -mode and SHALL accept [ECB], [CBC], [CFB128], or [OFB]. For example, `aes-256-ofb`. +mode and SHALL accept [CBC], [CFB128], or [OFB]. For example, `aes-256-ofb`. 
#### RQ.SRS008.AES.MySQL.Decrypt.Function.Parameters.Mode.Value.Invalid version: 1.0 @@ -825,9 +807,6 @@ version: 1.0 [ClickHouse] SHALL support the following [AES] block encryption modes as the value for the `mode` parameter of the `aes_decrypt_mysql` function: -* `aes-128-ecb` that SHALL use [ECB] block mode encryption with 128 bit key -* `aes-192-ecb` that SHALL use [ECB] block mode encryption with 192 bit key -* `aes-256-ecb` that SHALL use [ECB] block mode encryption with 256 bit key * `aes-128-cbc` that SHALL use [CBC] block mode encryption with 128 bit key * `aes-192-cbc` that SHALL use [CBC] block mode encryption with 192 bit key * `aes-192-cbc` that SHALL use [CBC] block mode encryption with 256 bit key @@ -916,9 +895,6 @@ version: 1.0 [ClickHouse] SHALL return an error when the `aes_decrypt_mysql` function is called with the following parameter values -* `aes-128-ecb` mode and `key` is less than 16 bytes or `iv` is specified -* `aes-192-ecb` mode and `key` is less than 24 bytes or `iv` is specified -* `aes-256-ecb` mode and `key` is less than 32 bytes or `iv` is specified * `aes-128-cbc` mode and `key` is less than 16 bytes or if specified `iv` is less than 16 bytes * `aes-192-cbc` mode and `key` is less than 24 bytes or if specified `iv` is less than 16 bytes * `aes-256-cbc` mode and `key` is less than 32 bytes or if specified `iv` is less than 16 bytes @@ -954,7 +930,6 @@ version: 1.0 [GCM]: https://en.wikipedia.org/wiki/Galois/Counter_Mode [CTR]: https://en.wikipedia.org/wiki/Block_cipher_mode_of_operation#Counter_(CTR) [CBC]: https://en.wikipedia.org/wiki/Block_cipher_mode_of_operation#Cipher_block_chaining_(CBC) -[ECB]: https://en.wikipedia.org/wiki/Block_cipher_mode_of_operation#Electronic_codebook_(ECB) [CFB]: https://en.wikipedia.org/wiki/Block_cipher_mode_of_operation#Cipher_feedback_(CFB) [CFB128]: https://en.wikipedia.org/wiki/Block_cipher_mode_of_operation#Cipher_feedback_(CFB) [OFB]: https://en.wikipedia.org/wiki/Block_cipher_mode_of_operation#Output_feedback_(OFB) diff --git a/tests/testflows/aes_encryption/requirements/requirements.py b/tests/testflows/aes_encryption/requirements/requirements.py index 0fbbea7e85a..4523f2d820f 100644 --- a/tests/testflows/aes_encryption/requirements/requirements.py +++ b/tests/testflows/aes_encryption/requirements/requirements.py @@ -429,7 +429,7 @@ RQ_SRS008_AES_Encrypt_Function_Parameters_Mode_ValuesFormat = Requirement( "of the `encrypt` function where\n" "the `key_length` SHALL specifies the length of the key and SHALL accept\n" "`128`, `192`, or `256` as the values and the `mode` SHALL specify the block encryption\n" - "mode and SHALL accept [ECB], [CBC], [CFB128], or [OFB] as well as\n" + "mode and SHALL accept [CBC], [CFB128], or [OFB] as well as\n" "[CTR] and [GCM] as the values. 
For example, `aes-256-ofb`.\n" "\n" ), @@ -467,9 +467,6 @@ RQ_SRS008_AES_Encrypt_Function_Parameters_Mode_Values = Requirement( "[ClickHouse] SHALL support the following [AES] block encryption modes as the value for the `mode` parameter\n" "of the `encrypt` function:\n" "\n" - "* `aes-128-ecb` that SHALL use [ECB] block mode encryption with 128 bit key\n" - "* `aes-192-ecb` that SHALL use [ECB] block mode encryption with 192 bit key\n" - "* `aes-256-ecb` that SHALL use [ECB] block mode encryption with 256 bit key\n" "* `aes-128-cbc` that SHALL use [CBC] block mode encryption with 128 bit key\n" "* `aes-192-cbc` that SHALL use [CBC] block mode encryption with 192 bit key\n" "* `aes-192-cbc` that SHALL use [CBC] block mode encryption with 256 bit key\n" @@ -642,9 +639,6 @@ RQ_SRS008_AES_Encrypt_Function_NonGCMMode_KeyAndInitializationVector_Length = Re "[ClickHouse] SHALL return an error when the `encrypt` function is called with the following parameter values\n" "when using non-GCM modes\n" "\n" - "* `aes-128-ecb` mode and `key` is not 16 bytes or `iv` or `aad` is specified\n" - "* `aes-192-ecb` mode and `key` is not 24 bytes or `iv` or `aad` is specified\n" - "* `aes-256-ecb` mode and `key` is not 32 bytes or `iv` or `aad` is specified\n" "* `aes-128-cbc` mode and `key` is not 16 bytes or if specified `iv` is not 16 bytes or `aad` is specified\n" "* `aes-192-cbc` mode and `key` is not 24 bytes or if specified `iv` is not 16 bytes or `aad` is specified\n" "* `aes-256-cbc` mode and `key` is not 32 bytes or if specified `iv` is not 16 bytes or `aad` is specified\n" @@ -790,7 +784,7 @@ RQ_SRS008_AES_Decrypt_Function_Parameters_Mode_ValuesFormat = Requirement( "of the `decrypt` function where\n" "the `key_length` SHALL specifies the length of the key and SHALL accept\n" "`128`, `192`, or `256` as the values and the `mode` SHALL specify the block encryption\n" - "mode and SHALL accept [ECB], [CBC], [CFB128], or [OFB] as well as\n" + "mode and SHALL accept [CBC], [CFB128], or [OFB] as well as\n" "[CTR] and [GCM] as the values. 
For example, `aes-256-ofb`.\n" "\n" ), @@ -828,9 +822,6 @@ RQ_SRS008_AES_Decrypt_Function_Parameters_Mode_Values = Requirement( "[ClickHouse] SHALL support the following [AES] block encryption modes as the value for the `mode` parameter\n" "of the `decrypt` function:\n" "\n" - "* `aes-128-ecb` that SHALL use [ECB] block mode encryption with 128 bit key\n" - "* `aes-192-ecb` that SHALL use [ECB] block mode encryption with 192 bit key\n" - "* `aes-256-ecb` that SHALL use [ECB] block mode encryption with 256 bit key\n" "* `aes-128-cbc` that SHALL use [CBC] block mode encryption with 128 bit key\n" "* `aes-192-cbc` that SHALL use [CBC] block mode encryption with 192 bit key\n" "* `aes-192-cbc` that SHALL use [CBC] block mode encryption with 256 bit key\n" @@ -1005,9 +996,6 @@ RQ_SRS008_AES_Decrypt_Function_NonGCMMode_KeyAndInitializationVector_Length = Re "[ClickHouse] SHALL return an error when the `decrypt` function is called with the following parameter values\n" "when using non-GCM modes\n" "\n" - "* `aes-128-ecb` mode and `key` is not 16 bytes or `iv` or `aad` is specified\n" - "* `aes-192-ecb` mode and `key` is not 24 bytes or `iv` or `aad` is specified\n" - "* `aes-256-ecb` mode and `key` is not 32 bytes or `iv` or `aad` is specified\n" "* `aes-128-cbc` mode and `key` is not 16 bytes or if specified `iv` is not 16 bytes or `aad` is specified\n" "* `aes-192-cbc` mode and `key` is not 24 bytes or if specified `iv` is not 16 bytes or `aad` is specified\n" "* `aes-256-cbc` mode and `key` is not 32 bytes or if specified `iv` is not 16 bytes or `aad` is specified\n" @@ -1154,7 +1142,7 @@ RQ_SRS008_AES_MySQL_Encrypt_Function_Parameters_Mode_ValuesFormat = Requirement( "of the `aes_encrypt_mysql` function where\n" "the `key_length` SHALL specifies the length of the key and SHALL accept\n" "`128`, `192`, or `256` as the values and the `mode` SHALL specify the block encryption\n" - "mode and SHALL accept [ECB], [CBC], [CFB128], or [OFB]. For example, `aes-256-ofb`.\n" + "mode and SHALL accept [CBC], [CFB128], or [OFB]. 
For example, `aes-256-ofb`.\n" "\n" ), link=None, @@ -1191,9 +1179,6 @@ RQ_SRS008_AES_MySQL_Encrypt_Function_Parameters_Mode_Values = Requirement( "[ClickHouse] SHALL support the following [AES] block encryption modes as the value for the `mode` parameter\n" "of the `aes_encrypt_mysql` function:\n" "\n" - "* `aes-128-ecb` that SHALL use [ECB] block mode encryption with 128 bit key\n" - "* `aes-192-ecb` that SHALL use [ECB] block mode encryption with 192 bit key\n" - "* `aes-256-ecb` that SHALL use [ECB] block mode encryption with 256 bit key\n" "* `aes-128-cbc` that SHALL use [CBC] block mode encryption with 128 bit key\n" "* `aes-192-cbc` that SHALL use [CBC] block mode encryption with 192 bit key\n" "* `aes-192-cbc` that SHALL use [CBC] block mode encryption with 256 bit key\n" @@ -1392,9 +1377,6 @@ RQ_SRS008_AES_MySQL_Encrypt_Function_Mode_KeyAndInitializationVector_Length = Re description=( "[ClickHouse] SHALL return an error when the `aes_encrypt_mysql` function is called with the following parameter values\n" "\n" - "* `aes-128-ecb` mode and `key` is less than 16 bytes or `iv` is specified\n" - "* `aes-192-ecb` mode and `key` is less than 24 bytes or `iv` is specified\n" - "* `aes-256-ecb` mode and `key` is less than 32 bytes or `iv` is specified\n" "* `aes-128-cbc` mode and `key` is less than 16 bytes or if specified `iv` is less than 16 bytes\n" "* `aes-192-cbc` mode and `key` is less than 24 bytes or if specified `iv` is less than 16 bytes\n" "* `aes-256-cbc` mode and `key` is less than 32 bytes or if specified `iv` is less than 16 bytes\n" @@ -1516,7 +1498,7 @@ RQ_SRS008_AES_MySQL_Decrypt_Function_Parameters_Mode_ValuesFormat = Requirement( "of the `aes_decrypt_mysql` function where\n" "the `key_length` SHALL specifies the length of the key and SHALL accept\n" "`128`, `192`, or `256` as the values and the `mode` SHALL specify the block encryption\n" - "mode and SHALL accept [ECB], [CBC], [CFB128], or [OFB]. For example, `aes-256-ofb`.\n" + "mode and SHALL accept [CBC], [CFB128], or [OFB]. 
For example, `aes-256-ofb`.\n" "\n" ), link=None, @@ -1553,9 +1535,6 @@ RQ_SRS008_AES_MySQL_Decrypt_Function_Parameters_Mode_Values = Requirement( "[ClickHouse] SHALL support the following [AES] block encryption modes as the value for the `mode` parameter\n" "of the `aes_decrypt_mysql` function:\n" "\n" - "* `aes-128-ecb` that SHALL use [ECB] block mode encryption with 128 bit key\n" - "* `aes-192-ecb` that SHALL use [ECB] block mode encryption with 192 bit key\n" - "* `aes-256-ecb` that SHALL use [ECB] block mode encryption with 256 bit key\n" "* `aes-128-cbc` that SHALL use [CBC] block mode encryption with 128 bit key\n" "* `aes-192-cbc` that SHALL use [CBC] block mode encryption with 192 bit key\n" "* `aes-192-cbc` that SHALL use [CBC] block mode encryption with 256 bit key\n" @@ -1754,9 +1733,6 @@ RQ_SRS008_AES_MySQL_Decrypt_Function_Mode_KeyAndInitializationVector_Length = Re description=( "[ClickHouse] SHALL return an error when the `aes_decrypt_mysql` function is called with the following parameter values\n" "\n" - "* `aes-128-ecb` mode and `key` is less than 16 bytes or `iv` is specified\n" - "* `aes-192-ecb` mode and `key` is less than 24 bytes or `iv` is specified\n" - "* `aes-256-ecb` mode and `key` is less than 32 bytes or `iv` is specified\n" "* `aes-128-cbc` mode and `key` is less than 16 bytes or if specified `iv` is less than 16 bytes\n" "* `aes-192-cbc` mode and `key` is less than 24 bytes or if specified `iv` is less than 16 bytes\n" "* `aes-256-cbc` mode and `key` is less than 32 bytes or if specified `iv` is less than 16 bytes\n" @@ -2606,7 +2582,7 @@ version: 1.0 of the `encrypt` function where the `key_length` SHALL specifies the length of the key and SHALL accept `128`, `192`, or `256` as the values and the `mode` SHALL specify the block encryption -mode and SHALL accept [ECB], [CBC], [CFB128], or [OFB] as well as +mode and SHALL accept [CBC], [CFB128], or [OFB] as well as [CTR] and [GCM] as the values. For example, `aes-256-ofb`. 
#### RQ.SRS008.AES.Encrypt.Function.Parameters.Mode.Value.Invalid @@ -2622,9 +2598,6 @@ version: 1.0 [ClickHouse] SHALL support the following [AES] block encryption modes as the value for the `mode` parameter of the `encrypt` function: -* `aes-128-ecb` that SHALL use [ECB] block mode encryption with 128 bit key -* `aes-192-ecb` that SHALL use [ECB] block mode encryption with 192 bit key -* `aes-256-ecb` that SHALL use [ECB] block mode encryption with 256 bit key * `aes-128-cbc` that SHALL use [CBC] block mode encryption with 128 bit key * `aes-192-cbc` that SHALL use [CBC] block mode encryption with 192 bit key * `aes-192-cbc` that SHALL use [CBC] block mode encryption with 256 bit key @@ -2698,9 +2671,6 @@ version: 1.0 [ClickHouse] SHALL return an error when the `encrypt` function is called with the following parameter values when using non-GCM modes -* `aes-128-ecb` mode and `key` is not 16 bytes or `iv` or `aad` is specified -* `aes-192-ecb` mode and `key` is not 24 bytes or `iv` or `aad` is specified -* `aes-256-ecb` mode and `key` is not 32 bytes or `iv` or `aad` is specified * `aes-128-cbc` mode and `key` is not 16 bytes or if specified `iv` is not 16 bytes or `aad` is specified * `aes-192-cbc` mode and `key` is not 24 bytes or if specified `iv` is not 16 bytes or `aad` is specified * `aes-256-cbc` mode and `key` is not 32 bytes or if specified `iv` is not 16 bytes or `aad` is specified @@ -2771,7 +2741,7 @@ version: 1.0 of the `decrypt` function where the `key_length` SHALL specifies the length of the key and SHALL accept `128`, `192`, or `256` as the values and the `mode` SHALL specify the block encryption -mode and SHALL accept [ECB], [CBC], [CFB128], or [OFB] as well as +mode and SHALL accept [CBC], [CFB128], or [OFB] as well as [CTR] and [GCM] as the values. For example, `aes-256-ofb`. #### RQ.SRS008.AES.Decrypt.Function.Parameters.Mode.Value.Invalid @@ -2787,9 +2757,6 @@ version: 1.0 [ClickHouse] SHALL support the following [AES] block encryption modes as the value for the `mode` parameter of the `decrypt` function: -* `aes-128-ecb` that SHALL use [ECB] block mode encryption with 128 bit key -* `aes-192-ecb` that SHALL use [ECB] block mode encryption with 192 bit key -* `aes-256-ecb` that SHALL use [ECB] block mode encryption with 256 bit key * `aes-128-cbc` that SHALL use [CBC] block mode encryption with 128 bit key * `aes-192-cbc` that SHALL use [CBC] block mode encryption with 192 bit key * `aes-192-cbc` that SHALL use [CBC] block mode encryption with 256 bit key @@ -2865,9 +2832,6 @@ version: 1.0 [ClickHouse] SHALL return an error when the `decrypt` function is called with the following parameter values when using non-GCM modes -* `aes-128-ecb` mode and `key` is not 16 bytes or `iv` or `aad` is specified -* `aes-192-ecb` mode and `key` is not 24 bytes or `iv` or `aad` is specified -* `aes-256-ecb` mode and `key` is not 32 bytes or `iv` or `aad` is specified * `aes-128-cbc` mode and `key` is not 16 bytes or if specified `iv` is not 16 bytes or `aad` is specified * `aes-192-cbc` mode and `key` is not 24 bytes or if specified `iv` is not 16 bytes or `aad` is specified * `aes-256-cbc` mode and `key` is not 32 bytes or if specified `iv` is not 16 bytes or `aad` is specified @@ -2939,7 +2903,7 @@ version: 1.0 of the `aes_encrypt_mysql` function where the `key_length` SHALL specifies the length of the key and SHALL accept `128`, `192`, or `256` as the values and the `mode` SHALL specify the block encryption -mode and SHALL accept [ECB], [CBC], [CFB128], or [OFB]. 
For example, `aes-256-ofb`. +mode and SHALL accept [CBC], [CFB128], or [OFB]. For example, `aes-256-ofb`. #### RQ.SRS008.AES.MySQL.Encrypt.Function.Parameters.Mode.Value.Invalid version: 1.0 @@ -2954,9 +2918,6 @@ version: 1.0 [ClickHouse] SHALL support the following [AES] block encryption modes as the value for the `mode` parameter of the `aes_encrypt_mysql` function: -* `aes-128-ecb` that SHALL use [ECB] block mode encryption with 128 bit key -* `aes-192-ecb` that SHALL use [ECB] block mode encryption with 192 bit key -* `aes-256-ecb` that SHALL use [ECB] block mode encryption with 256 bit key * `aes-128-cbc` that SHALL use [CBC] block mode encryption with 128 bit key * `aes-192-cbc` that SHALL use [CBC] block mode encryption with 192 bit key * `aes-192-cbc` that SHALL use [CBC] block mode encryption with 256 bit key @@ -3045,9 +3006,6 @@ version: 1.0 [ClickHouse] SHALL return an error when the `aes_encrypt_mysql` function is called with the following parameter values -* `aes-128-ecb` mode and `key` is less than 16 bytes or `iv` is specified -* `aes-192-ecb` mode and `key` is less than 24 bytes or `iv` is specified -* `aes-256-ecb` mode and `key` is less than 32 bytes or `iv` is specified * `aes-128-cbc` mode and `key` is less than 16 bytes or if specified `iv` is less than 16 bytes * `aes-192-cbc` mode and `key` is less than 24 bytes or if specified `iv` is less than 16 bytes * `aes-256-cbc` mode and `key` is less than 32 bytes or if specified `iv` is less than 16 bytes @@ -3105,7 +3063,7 @@ version: 1.0 of the `aes_decrypt_mysql` function where the `key_length` SHALL specifies the length of the key and SHALL accept `128`, `192`, or `256` as the values and the `mode` SHALL specify the block encryption -mode and SHALL accept [ECB], [CBC], [CFB128], or [OFB]. For example, `aes-256-ofb`. +mode and SHALL accept [CBC], [CFB128], or [OFB]. For example, `aes-256-ofb`. 
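As a rough illustration of the `aes-[key_length]-[mode]` naming scheme used throughout these requirements, and of the key sizes they imply (16/24/32 bytes for the 128/192/256-bit variants), here is a small Python sketch. The helper name and the mode sets are assumptions made for illustration only; they are not part of the SRS or of ClickHouse.

```python
# Illustrative sketch only: validate a mode string of the form "aes-<key_length>-<mode>"
# against the requirements above. Per the SRS, encrypt/decrypt accept CBC, CFB128, OFB,
# CTR and GCM, while the aes_*_mysql functions accept only CBC, CFB128 and OFB; after
# this change ECB is no longer listed for any of them.

SUPPORTED_KEY_BYTES = {"128": 16, "192": 24, "256": 32}   # key size in bytes
GENERAL_MODES = {"cbc", "cfb128", "ofb", "ctr", "gcm"}    # encrypt / decrypt
MYSQL_MODES = {"cbc", "cfb128", "ofb"}                    # aes_encrypt_mysql / aes_decrypt_mysql

def required_key_bytes(mode_string: str, mysql: bool = False) -> int:
    """Return the key size in bytes implied by the mode string, or raise ValueError.

    For encrypt/decrypt the key must be exactly this long; for the *_mysql
    functions the requirements only demand at least this many bytes.
    """
    prefix, key_length, mode = mode_string.split("-", 2)
    allowed = MYSQL_MODES if mysql else GENERAL_MODES
    if prefix != "aes" or key_length not in SUPPORTED_KEY_BYTES or mode not in allowed:
        raise ValueError(f"unsupported mode: {mode_string}")
    return SUPPORTED_KEY_BYTES[key_length]

assert required_key_bytes("aes-256-ofb") == 32
assert required_key_bytes("aes-128-cbc", mysql=True) == 16
```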
#### RQ.SRS008.AES.MySQL.Decrypt.Function.Parameters.Mode.Value.Invalid version: 1.0 @@ -3120,9 +3078,6 @@ version: 1.0 [ClickHouse] SHALL support the following [AES] block encryption modes as the value for the `mode` parameter of the `aes_decrypt_mysql` function: -* `aes-128-ecb` that SHALL use [ECB] block mode encryption with 128 bit key -* `aes-192-ecb` that SHALL use [ECB] block mode encryption with 192 bit key -* `aes-256-ecb` that SHALL use [ECB] block mode encryption with 256 bit key * `aes-128-cbc` that SHALL use [CBC] block mode encryption with 128 bit key * `aes-192-cbc` that SHALL use [CBC] block mode encryption with 192 bit key * `aes-192-cbc` that SHALL use [CBC] block mode encryption with 256 bit key @@ -3211,9 +3166,6 @@ version: 1.0 [ClickHouse] SHALL return an error when the `aes_decrypt_mysql` function is called with the following parameter values -* `aes-128-ecb` mode and `key` is less than 16 bytes or `iv` is specified -* `aes-192-ecb` mode and `key` is less than 24 bytes or `iv` is specified -* `aes-256-ecb` mode and `key` is less than 32 bytes or `iv` is specified * `aes-128-cbc` mode and `key` is less than 16 bytes or if specified `iv` is less than 16 bytes * `aes-192-cbc` mode and `key` is less than 24 bytes or if specified `iv` is less than 16 bytes * `aes-256-cbc` mode and `key` is less than 32 bytes or if specified `iv` is less than 16 bytes @@ -3249,7 +3201,6 @@ version: 1.0 [GCM]: https://en.wikipedia.org/wiki/Galois/Counter_Mode [CTR]: https://en.wikipedia.org/wiki/Block_cipher_mode_of_operation#Counter_(CTR) [CBC]: https://en.wikipedia.org/wiki/Block_cipher_mode_of_operation#Cipher_block_chaining_(CBC) -[ECB]: https://en.wikipedia.org/wiki/Block_cipher_mode_of_operation#Electronic_codebook_(ECB) [CFB]: https://en.wikipedia.org/wiki/Block_cipher_mode_of_operation#Cipher_feedback_(CFB) [CFB128]: https://en.wikipedia.org/wiki/Block_cipher_mode_of_operation#Cipher_feedback_(CFB) [OFB]: https://en.wikipedia.org/wiki/Block_cipher_mode_of_operation#Output_feedback_(OFB) From 33ea119013271b9cb8b81f86d3c5c198d5d2dc2e Mon Sep 17 00:00:00 2001 From: Amos Bird Date: Thu, 7 Apr 2022 00:56:10 +0800 Subject: [PATCH 228/239] Bump replxx to fix wrong assertion --- contrib/replxx | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/contrib/replxx b/contrib/replxx index 6f0b6f151ae..3fd0e3c9364 160000 --- a/contrib/replxx +++ b/contrib/replxx @@ -1 +1 @@ -Subproject commit 6f0b6f151ae2a044625ae93acd19ca365fcea64d +Subproject commit 3fd0e3c9364a589447453d9906d854ebd8d385c5 From 45b284d09820a21885e67cc7e242da40518913ed Mon Sep 17 00:00:00 2001 From: tavplubix Date: Wed, 6 Apr 2022 20:22:24 +0300 Subject: [PATCH 229/239] Update 02248_nullable_custom_types_to_string.sql --- .../0_stateless/02248_nullable_custom_types_to_string.sql | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/tests/queries/0_stateless/02248_nullable_custom_types_to_string.sql b/tests/queries/0_stateless/02248_nullable_custom_types_to_string.sql index b6032f7741b..605500ee840 100644 --- a/tests/queries/0_stateless/02248_nullable_custom_types_to_string.sql +++ b/tests/queries/0_stateless/02248_nullable_custom_types_to_string.sql @@ -1,4 +1,4 @@ --- Tags: no-backward-compatibility-check:22.3.2.2 +-- Tags: no-backward-compatibility-check:22.3.4.44 select toString(toNullable(true)); select toString(CAST(NULL, 'Nullable(Bool)')); select toString(toNullable(toIPv4('0.0.0.0'))); From b3f59537ca3e201b5dc47cc4625f74aff46aa83f Mon Sep 17 00:00:00 2001 From: LAL2211 Date: Wed, 6 Apr 2022 
13:34:20 -0400 Subject: [PATCH 230/239] added additional check --- .../external_user_directory/tests/common.py | 25 +++++++++++++------ 1 file changed, 18 insertions(+), 7 deletions(-) diff --git a/tests/testflows/ldap/external_user_directory/tests/common.py b/tests/testflows/ldap/external_user_directory/tests/common.py index 871be815a35..7400459d266 100644 --- a/tests/testflows/ldap/external_user_directory/tests/common.py +++ b/tests/testflows/ldap/external_user_directory/tests/common.py @@ -84,14 +84,25 @@ def rbac_roles(*roles, node=None): node.query(f"DROP ROLE IF EXISTS {role}") -def verify_ldap_user_exists(server, username, password): +# def verify_ldap_user_exists(server, username, password): +# """Check that LDAP user is defined on the LDAP server.""" +# with By("searching LDAP database"): +# ldap_node = current().context.cluster.node(server) +# r = ldap_node.command( +# f"ldapwhoami -H ldap://localhost -D 'cn={user_name},ou=users,dc=company,dc=com' -w {password}" +# ) +# assert r.exitcode == 0, error() + + +def verify_ldap_user_exists(server, username, password, check=False): """Check that LDAP user is defined on the LDAP server.""" - with By("searching LDAP database"): - ldap_node = current().context.cluster.node(server) - r = ldap_node.command( - f"ldapwhoami -H ldap://localhost -D 'cn={user_name},ou=users,dc=company,dc=com' -w {password}" - ) - assert r.exitcode == 0, error() + if check: + with By("searching LDAP database"): + ldap_node = current().context.cluster.node(server) + r = ldap_node.command( + f"ldapwhoami -H ldap://localhost -D 'cn={user_name},ou=users,dc=company,dc=com' -w {password}" + ) + assert r.exitcode == 0, error() def create_ldap_external_user_directory_config_content( From 87e0656c3b320e7f8200fb56490c161acbd9b134 Mon Sep 17 00:00:00 2001 From: Kruglov Pavel <48961922+Avogar@users.noreply.github.com> Date: Wed, 6 Apr 2022 19:44:37 +0200 Subject: [PATCH 231/239] Fix flaky test 00155_long_merges --- tests/queries/0_stateless/00155_long_merges.sh | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/tests/queries/0_stateless/00155_long_merges.sh b/tests/queries/0_stateless/00155_long_merges.sh index f2d9cd1dade..15ad0892a42 100755 --- a/tests/queries/0_stateless/00155_long_merges.sh +++ b/tests/queries/0_stateless/00155_long_merges.sh @@ -32,7 +32,7 @@ function test { SUM=$(( $1 + $2 )) MAX=$(( $1 > $2 ? 
$1 : $2 )) - SETTINGS="--min_insert_block_size_rows=0 --min_insert_block_size_bytes=0" + SETTINGS="--min_insert_block_size_rows=0 --min_insert_block_size_bytes=0 --max_block_size=65505" $CLICKHOUSE_CLIENT $SETTINGS --query="INSERT INTO summing_00155 (x) SELECT number AS x FROM system.numbers LIMIT $1" $CLICKHOUSE_CLIENT $SETTINGS --query="INSERT INTO summing_00155 (x) SELECT number AS x FROM system.numbers LIMIT $2" From 68bdcdca80cefe8af0479c2e9d6516ac289f6c00 Mon Sep 17 00:00:00 2001 From: LAL2211 Date: Wed, 6 Apr 2022 14:15:30 -0400 Subject: [PATCH 232/239] code fix for Ldap Injection --- .../external_user_directory/tests/common.py | 21 ------------------- 1 file changed, 21 deletions(-) diff --git a/tests/testflows/ldap/external_user_directory/tests/common.py b/tests/testflows/ldap/external_user_directory/tests/common.py index 7400459d266..c0b6e72cd8e 100644 --- a/tests/testflows/ldap/external_user_directory/tests/common.py +++ b/tests/testflows/ldap/external_user_directory/tests/common.py @@ -84,27 +84,6 @@ def rbac_roles(*roles, node=None): node.query(f"DROP ROLE IF EXISTS {role}") -# def verify_ldap_user_exists(server, username, password): -# """Check that LDAP user is defined on the LDAP server.""" -# with By("searching LDAP database"): -# ldap_node = current().context.cluster.node(server) -# r = ldap_node.command( -# f"ldapwhoami -H ldap://localhost -D 'cn={user_name},ou=users,dc=company,dc=com' -w {password}" -# ) -# assert r.exitcode == 0, error() - - -def verify_ldap_user_exists(server, username, password, check=False): - """Check that LDAP user is defined on the LDAP server.""" - if check: - with By("searching LDAP database"): - ldap_node = current().context.cluster.node(server) - r = ldap_node.command( - f"ldapwhoami -H ldap://localhost -D 'cn={user_name},ou=users,dc=company,dc=com' -w {password}" - ) - assert r.exitcode == 0, error() - - def create_ldap_external_user_directory_config_content( server=None, roles=None, **kwargs ): From 517c2ae8dfc67d9a280ecfadf917c28758e26344 Mon Sep 17 00:00:00 2001 From: fenglv Date: Thu, 7 Apr 2022 02:20:20 +0000 Subject: [PATCH 233/239] Fix data race in StorgeFileLog --- src/Storages/FileLog/StorageFileLog.cpp | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/Storages/FileLog/StorageFileLog.cpp b/src/Storages/FileLog/StorageFileLog.cpp index 4fb19b12bab..d7c732aee02 100644 --- a/src/Storages/FileLog/StorageFileLog.cpp +++ b/src/Storages/FileLog/StorageFileLog.cpp @@ -720,7 +720,7 @@ bool StorageFileLog::streamToViews() assertBlocksHaveEqualStructure(input.getHeader(), block_io.pipeline.getHeader(), "StorageFileLog streamToViews"); - size_t rows = 0; + std::atomic rows = 0; { block_io.pipeline.complete(std::move(input)); block_io.pipeline.setNumThreads(max_streams_number); From acc7046d54738054df39cd43842534265a723d6e Mon Sep 17 00:00:00 2001 From: taiyang-li <654010905@qq.com> Date: Thu, 7 Apr 2022 11:46:57 +0800 Subject: [PATCH 234/239] remove some useless virtual and rename some functions in HiveFile --- src/Storages/Hive/HiveFile.cpp | 6 +- src/Storages/Hive/HiveFile.h | 114 +++++++++++++----------------- src/Storages/Hive/StorageHive.cpp | 4 +- 3 files changed, 56 insertions(+), 68 deletions(-) diff --git a/src/Storages/Hive/HiveFile.cpp b/src/Storages/Hive/HiveFile.cpp index 3f4260d9f9e..cc2687415ff 100644 --- a/src/Storages/Hive/HiveFile.cpp +++ b/src/Storages/Hive/HiveFile.cpp @@ -145,7 +145,7 @@ void HiveOrcFile::prepareColumnMapping() } } -bool HiveOrcFile::hasMinMaxIndex() const +bool 
HiveOrcFile::useFileMinMaxIndex() const { return storage_settings->enable_orc_file_minmax_index; } @@ -196,7 +196,7 @@ void HiveOrcFile::loadMinMaxIndex() minmax_idx = buildMinMaxIndex(statistics.get()); } -bool HiveOrcFile::hasSubMinMaxIndex() const +bool HiveOrcFile::useSplitMinMaxIndex() const { return storage_settings->enable_orc_stripe_minmax_index; } @@ -226,7 +226,7 @@ void HiveOrcFile::loadSubMinMaxIndex() } } -bool HiveParquetFile::hasSubMinMaxIndex() const +bool HiveParquetFile::useSplitMinMaxIndex() const { return storage_settings->enable_parquet_rowgroup_minmax_index; } diff --git a/src/Storages/Hive/HiveFile.h b/src/Storages/Hive/HiveFile.h index 74f893a073f..6a25fa3698b 100644 --- a/src/Storages/Hive/HiveFile.h +++ b/src/Storages/Hive/HiveFile.h @@ -76,7 +76,7 @@ public: } IHiveFile( - const FieldVector & values_, + const FieldVector & partition_values_, const String & namenode_url_, const String & path_, UInt64 last_modify_time_, @@ -85,7 +85,7 @@ public: const std::shared_ptr & storage_settings_, ContextPtr context_) : WithContext(context_) - , partition_values(values_) + , partition_values(partition_values_) , namenode_url(namenode_url_) , path(path_) , last_modify_time(last_modify_time_) @@ -96,56 +96,47 @@ public: } virtual ~IHiveFile() = default; - virtual FileFormat getFormat() const = 0; + String getFormatName() const { return String(magic_enum::enum_name(getFormat())); } + const String & getPath() const { return path; } + UInt64 getLastModTs() const { return last_modify_time; } + size_t getSize() const { return size; } + const FieldVector & getPartitionValues() const { return partition_values; } + const String & getNamenodeUrl() { return namenode_url; } + MinMaxIndexPtr getMinMaxIndex() const { return minmax_idx; } + const std::vector & getSubMinMaxIndexes() const { return sub_minmax_idxes; } - virtual String getName() const = 0; + const std::unordered_set & getSkipSplits() const { return skip_splits; } + void setSkipSplits(const std::unordered_set & skip_splits_) { skip_splits = skip_splits_; } - virtual String getPath() const { return path; } - - virtual FieldVector getPartitionValues() const { return partition_values; } - - virtual String getNamenodeUrl() { return namenode_url; } - - virtual bool hasMinMaxIndex() const { return false; } - - virtual void loadMinMaxIndex() - { - throw Exception("Method loadMinMaxIndex is not supported by hive file:" + getName(), ErrorCodes::NOT_IMPLEMENTED); - } - - virtual MinMaxIndexPtr getMinMaxIndex() const { return minmax_idx; } - - // Do hive file contains sub-file level minmax index? 
- virtual bool hasSubMinMaxIndex() const { return false; } - - virtual void loadSubMinMaxIndex() - { - throw Exception("Method loadSubMinMaxIndex is not supported by hive file:" + getName(), ErrorCodes::NOT_IMPLEMENTED); - } - - virtual const std::vector & getSubMinMaxIndexes() const { return sub_minmax_idxes; } - - virtual void setSkipSplits(const std::unordered_set & skip_splits_) { skip_splits = skip_splits_; } - - virtual const std::unordered_set & getSkipSplits() const { return skip_splits; } - - inline std::string describeMinMaxIndex(const MinMaxIndexPtr & idx) const + String describeMinMaxIndex(const MinMaxIndexPtr & idx) const { if (!idx) return ""; - std::vector strs; + std::vector strs; strs.reserve(index_names_and_types.size()); size_t i = 0; for (const auto & name_type : index_names_and_types) - { strs.push_back(name_type.name + ":" + name_type.type->getName() + idx->hyperrectangle[i++].toString()); - } return boost::algorithm::join(strs, "|"); } - inline UInt64 getLastModTs() const { return last_modify_time; } - inline size_t getSize() const { return size; } + virtual FileFormat getFormat() const = 0; + + virtual bool useFileMinMaxIndex() const { return false; } + + virtual void loadMinMaxIndex() + { + throw Exception("Method loadMinMaxIndex is not supported by hive file:" + getFormatName(), ErrorCodes::NOT_IMPLEMENTED); + } + + /// If hive query could use contains sub-file level minmax index? + virtual bool useSplitMinMaxIndex() const { return false; } + + virtual void loadSubMinMaxIndex() + { + throw Exception("Method loadSubMinMaxIndex is not supported by hive file:" + getFormatName(), ErrorCodes::NOT_IMPLEMENTED); + } protected: FieldVector partition_values; @@ -168,7 +159,7 @@ class HiveTextFile : public IHiveFile { public: HiveTextFile( - const FieldVector & values_, + const FieldVector & partition_values_, const String & namenode_url_, const String & path_, UInt64 last_modify_time_, @@ -176,19 +167,18 @@ public: const NamesAndTypesList & index_names_and_types_, const std::shared_ptr & hive_settings_, ContextPtr context_) - : IHiveFile(values_, namenode_url_, path_, last_modify_time_, size_, index_names_and_types_, hive_settings_, context_) + : IHiveFile(partition_values_, namenode_url_, path_, last_modify_time_, size_, index_names_and_types_, hive_settings_, context_) { } virtual FileFormat getFormat() const override { return FileFormat::TEXT; } - virtual String getName() const override { return "TEXT"; } }; class HiveOrcFile : public IHiveFile { public: HiveOrcFile( - const FieldVector & values_, + const FieldVector & partition_values_, const String & namenode_url_, const String & path_, UInt64 last_modify_time_, @@ -196,23 +186,22 @@ public: const NamesAndTypesList & index_names_and_types_, const std::shared_ptr & hive_settings_, ContextPtr context_) - : IHiveFile(values_, namenode_url_, path_, last_modify_time_, size_, index_names_and_types_, hive_settings_, context_) + : IHiveFile(partition_values_, namenode_url_, path_, last_modify_time_, size_, index_names_and_types_, hive_settings_, context_) { } - virtual FileFormat getFormat() const override { return FileFormat::ORC; } - virtual String getName() const override { return "ORC"; } - virtual bool hasMinMaxIndex() const override; - virtual void loadMinMaxIndex() override; + FileFormat getFormat() const override { return FileFormat::ORC; } + bool useFileMinMaxIndex() const override; + void loadMinMaxIndex() override; - virtual bool hasSubMinMaxIndex() const override; - virtual void loadSubMinMaxIndex() override; + 
bool useSplitMinMaxIndex() const override; + void loadSubMinMaxIndex() override; -protected: - virtual std::unique_ptr buildMinMaxIndex(const orc::Statistics * statistics); - virtual Range buildRange(const orc::ColumnStatistics * col_stats); - virtual void prepareReader(); - virtual void prepareColumnMapping(); +private: + std::unique_ptr buildMinMaxIndex(const orc::Statistics * statistics); + Range buildRange(const orc::ColumnStatistics * col_stats); + void prepareReader(); + void prepareColumnMapping(); std::unique_ptr in; std::unique_ptr reader; @@ -223,7 +212,7 @@ class HiveParquetFile : public IHiveFile { public: HiveParquetFile( - const FieldVector & values_, + const FieldVector & partition_values_, const String & namenode_url_, const String & path_, UInt64 last_modify_time_, @@ -231,18 +220,17 @@ public: const NamesAndTypesList & index_names_and_types_, const std::shared_ptr & hive_settings_, ContextPtr context_) - : IHiveFile(values_, namenode_url_, path_, last_modify_time_, size_, index_names_and_types_, hive_settings_, context_) + : IHiveFile(partition_values_, namenode_url_, path_, last_modify_time_, size_, index_names_and_types_, hive_settings_, context_) { } - virtual FileFormat getFormat() const override { return FileFormat::PARQUET; } - virtual String getName() const override { return "PARQUET"; } + FileFormat getFormat() const override { return FileFormat::PARQUET; } - virtual bool hasSubMinMaxIndex() const override; - virtual void loadSubMinMaxIndex() override; + bool useSplitMinMaxIndex() const override; + void loadSubMinMaxIndex() override; -protected: - virtual void prepareReader(); +private: + void prepareReader(); std::unique_ptr in; std::unique_ptr reader; diff --git a/src/Storages/Hive/StorageHive.cpp b/src/Storages/Hive/StorageHive.cpp index b4b0862f5d4..c11db119ab7 100644 --- a/src/Storages/Hive/StorageHive.cpp +++ b/src/Storages/Hive/StorageHive.cpp @@ -543,7 +543,7 @@ HiveFilePtr StorageHive::createHiveFileIfNeeded( /// Load file level minmax index and apply const KeyCondition hivefile_key_condition(query_info, getContext(), hivefile_name_types.getNames(), hivefile_minmax_idx_expr); - if (hive_file->hasMinMaxIndex()) + if (hive_file->useFileMinMaxIndex()) { hive_file->loadMinMaxIndex(); if (!hivefile_key_condition.checkInHyperrectangle(hive_file->getMinMaxIndex()->hyperrectangle, hivefile_name_types.getTypes()) @@ -556,7 +556,7 @@ HiveFilePtr StorageHive::createHiveFileIfNeeded( } /// Load sub-file level minmax index and apply - if (hive_file->hasSubMinMaxIndex()) + if (hive_file->useSplitMinMaxIndex()) { std::unordered_set skip_splits; hive_file->loadSubMinMaxIndex(); From f02d76934346453492f90715cde8ecb0c9b660cc Mon Sep 17 00:00:00 2001 From: taiyang-li <654010905@qq.com> Date: Thu, 7 Apr 2022 14:29:35 +0800 Subject: [PATCH 235/239] fix build error --- src/Storages/Hive/HiveFile.h | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/src/Storages/Hive/HiveFile.h b/src/Storages/Hive/HiveFile.h index 6a25fa3698b..bd4f12cf6b6 100644 --- a/src/Storages/Hive/HiveFile.h +++ b/src/Storages/Hive/HiveFile.h @@ -198,8 +198,9 @@ public: void loadSubMinMaxIndex() override; private: + static Range buildRange(const orc::ColumnStatistics * col_stats); + std::unique_ptr buildMinMaxIndex(const orc::Statistics * statistics); - Range buildRange(const orc::ColumnStatistics * col_stats); void prepareReader(); void prepareColumnMapping(); From 046a2ba51c61424aa7a9c00cf1fc651dd6526c51 Mon Sep 17 00:00:00 2001 From: taiyang-li <654010905@qq.com> Date: Thu, 7 Apr 2022 
15:35:08 +0800 Subject: [PATCH 236/239] rename some symboles --- src/Storages/Hive/HiveFile.cpp | 32 +++++++++++++++---------------- src/Storages/Hive/HiveFile.h | 22 ++++++++++----------- src/Storages/Hive/StorageHive.cpp | 4 ++-- 3 files changed, 29 insertions(+), 29 deletions(-) diff --git a/src/Storages/Hive/HiveFile.cpp b/src/Storages/Hive/HiveFile.cpp index cc2687415ff..166336df78d 100644 --- a/src/Storages/Hive/HiveFile.cpp +++ b/src/Storages/Hive/HiveFile.cpp @@ -184,7 +184,7 @@ std::unique_ptr HiveOrcFile::buildMinMaxIndex(c } -void HiveOrcFile::loadMinMaxIndex() +void HiveOrcFile::loadFileMinMaxIndex() { if (!reader) { @@ -193,7 +193,7 @@ void HiveOrcFile::loadMinMaxIndex() } auto statistics = reader->GetRawORCReader()->getStatistics(); - minmax_idx = buildMinMaxIndex(statistics.get()); + file_minmax_idx = buildMinMaxIndex(statistics.get()); } bool HiveOrcFile::useSplitMinMaxIndex() const @@ -202,7 +202,7 @@ bool HiveOrcFile::useSplitMinMaxIndex() const } -void HiveOrcFile::loadSubMinMaxIndex() +void HiveOrcFile::loadSplitMinMaxIndex() { if (!reader) { @@ -218,11 +218,11 @@ void HiveOrcFile::loadSubMinMaxIndex() fmt::format("orc file:{} has different strip num {} and strip statistics num {}", path, stripe_num, stripe_stats_num), ErrorCodes::BAD_ARGUMENTS); - sub_minmax_idxes.resize(stripe_num); + split_minmax_idxes.resize(stripe_num); for (size_t i = 0; i < stripe_num; ++i) { auto stripe_stats = raw_reader->getStripeStatistics(i); - sub_minmax_idxes[i] = buildMinMaxIndex(stripe_stats.get()); + split_minmax_idxes[i] = buildMinMaxIndex(stripe_stats.get()); } } @@ -239,7 +239,7 @@ void HiveParquetFile::prepareReader() THROW_ARROW_NOT_OK(parquet::arrow::OpenFile(asArrowFile(*in, format_settings, is_stopped), arrow::default_memory_pool(), &reader)); } -void HiveParquetFile::loadSubMinMaxIndex() +void HiveParquetFile::loadSplitMinMaxIndex() { if (!reader) prepareReader(); @@ -256,12 +256,12 @@ void HiveParquetFile::loadSubMinMaxIndex() } - sub_minmax_idxes.resize(num_row_groups); + split_minmax_idxes.resize(num_row_groups); for (size_t i = 0; i < num_row_groups; ++i) { auto row_group_meta = meta->RowGroup(i); - sub_minmax_idxes[i] = std::make_shared(); - sub_minmax_idxes[i]->hyperrectangle.resize(num_cols); + split_minmax_idxes[i] = std::make_shared(); + split_minmax_idxes[i]->hyperrectangle.resize(num_cols); size_t j = 0; auto it = index_names_and_types.begin(); @@ -284,31 +284,31 @@ void HiveParquetFile::loadSubMinMaxIndex() if (auto bool_stats = std::dynamic_pointer_cast(stats)) { - sub_minmax_idxes[i]->hyperrectangle[j] = createRangeFromParquetStatistics(bool_stats); + split_minmax_idxes[i]->hyperrectangle[j] = createRangeFromParquetStatistics(bool_stats); } else if (auto int32_stats = std::dynamic_pointer_cast(stats)) { - sub_minmax_idxes[i]->hyperrectangle[j] = createRangeFromParquetStatistics(int32_stats); + split_minmax_idxes[i]->hyperrectangle[j] = createRangeFromParquetStatistics(int32_stats); } else if (auto int64_stats = std::dynamic_pointer_cast(stats)) { - sub_minmax_idxes[i]->hyperrectangle[j] = createRangeFromParquetStatistics(int64_stats); + split_minmax_idxes[i]->hyperrectangle[j] = createRangeFromParquetStatistics(int64_stats); } else if (auto float_stats = std::dynamic_pointer_cast(stats)) { - sub_minmax_idxes[i]->hyperrectangle[j] = createRangeFromParquetStatistics(float_stats); + split_minmax_idxes[i]->hyperrectangle[j] = createRangeFromParquetStatistics(float_stats); } else if (auto double_stats = std::dynamic_pointer_cast(stats)) { - 
sub_minmax_idxes[i]->hyperrectangle[j] = createRangeFromParquetStatistics(double_stats); + split_minmax_idxes[i]->hyperrectangle[j] = createRangeFromParquetStatistics(double_stats); } else if (auto string_stats = std::dynamic_pointer_cast(stats)) { - sub_minmax_idxes[i]->hyperrectangle[j] = createRangeFromParquetStatistics(string_stats); + split_minmax_idxes[i]->hyperrectangle[j] = createRangeFromParquetStatistics(string_stats); } /// Other types are not supported for minmax index, skip } - sub_minmax_idxes[i]->initialized = true; + split_minmax_idxes[i]->initialized = true; } } diff --git a/src/Storages/Hive/HiveFile.h b/src/Storages/Hive/HiveFile.h index bd4f12cf6b6..1404f97eff0 100644 --- a/src/Storages/Hive/HiveFile.h +++ b/src/Storages/Hive/HiveFile.h @@ -102,8 +102,8 @@ public: size_t getSize() const { return size; } const FieldVector & getPartitionValues() const { return partition_values; } const String & getNamenodeUrl() { return namenode_url; } - MinMaxIndexPtr getMinMaxIndex() const { return minmax_idx; } - const std::vector & getSubMinMaxIndexes() const { return sub_minmax_idxes; } + MinMaxIndexPtr getMinMaxIndex() const { return file_minmax_idx; } + const std::vector & getSubMinMaxIndexes() const { return split_minmax_idxes; } const std::unordered_set & getSkipSplits() const { return skip_splits; } void setSkipSplits(const std::unordered_set & skip_splits_) { skip_splits = skip_splits_; } @@ -125,17 +125,17 @@ public: virtual bool useFileMinMaxIndex() const { return false; } - virtual void loadMinMaxIndex() + virtual void loadFileMinMaxIndex() { - throw Exception("Method loadMinMaxIndex is not supported by hive file:" + getFormatName(), ErrorCodes::NOT_IMPLEMENTED); + throw Exception("Method loadFileMinMaxIndex is not supported by hive file:" + getFormatName(), ErrorCodes::NOT_IMPLEMENTED); } /// If hive query could use contains sub-file level minmax index? 
virtual bool useSplitMinMaxIndex() const { return false; } - virtual void loadSubMinMaxIndex() + virtual void loadSplitMinMaxIndex() { - throw Exception("Method loadSubMinMaxIndex is not supported by hive file:" + getFormatName(), ErrorCodes::NOT_IMPLEMENTED); + throw Exception("Method loadSplitMinMaxIndex is not supported by hive file:" + getFormatName(), ErrorCodes::NOT_IMPLEMENTED); } protected: @@ -145,8 +145,8 @@ protected: UInt64 last_modify_time; size_t size; NamesAndTypesList index_names_and_types; - MinMaxIndexPtr minmax_idx; - std::vector sub_minmax_idxes; + MinMaxIndexPtr file_minmax_idx; + std::vector split_minmax_idxes; /// Skip splits for this file after applying minmax index (if any) std::unordered_set skip_splits; std::shared_ptr storage_settings; @@ -192,10 +192,10 @@ public: FileFormat getFormat() const override { return FileFormat::ORC; } bool useFileMinMaxIndex() const override; - void loadMinMaxIndex() override; + void loadFileMinMaxIndex() override; bool useSplitMinMaxIndex() const override; - void loadSubMinMaxIndex() override; + void loadSplitMinMaxIndex() override; private: static Range buildRange(const orc::ColumnStatistics * col_stats); @@ -228,7 +228,7 @@ public: FileFormat getFormat() const override { return FileFormat::PARQUET; } bool useSplitMinMaxIndex() const override; - void loadSubMinMaxIndex() override; + void loadSplitMinMaxIndex() override; private: void prepareReader(); diff --git a/src/Storages/Hive/StorageHive.cpp b/src/Storages/Hive/StorageHive.cpp index c11db119ab7..6745a0f6ad0 100644 --- a/src/Storages/Hive/StorageHive.cpp +++ b/src/Storages/Hive/StorageHive.cpp @@ -545,7 +545,7 @@ HiveFilePtr StorageHive::createHiveFileIfNeeded( const KeyCondition hivefile_key_condition(query_info, getContext(), hivefile_name_types.getNames(), hivefile_minmax_idx_expr); if (hive_file->useFileMinMaxIndex()) { - hive_file->loadMinMaxIndex(); + hive_file->loadFileMinMaxIndex(); if (!hivefile_key_condition.checkInHyperrectangle(hive_file->getMinMaxIndex()->hyperrectangle, hivefile_name_types.getTypes()) .can_be_true) { @@ -559,7 +559,7 @@ HiveFilePtr StorageHive::createHiveFileIfNeeded( if (hive_file->useSplitMinMaxIndex()) { std::unordered_set skip_splits; - hive_file->loadSubMinMaxIndex(); + hive_file->loadSplitMinMaxIndex(); const auto & sub_minmax_idxes = hive_file->getSubMinMaxIndexes(); for (size_t i = 0; i < sub_minmax_idxes.size(); ++i) { From 2dc420c66bff45ac68f3160be04b3bca791aaf86 Mon Sep 17 00:00:00 2001 From: taiyang-li <654010905@qq.com> Date: Thu, 7 Apr 2022 15:48:42 +0800 Subject: [PATCH 237/239] rename some symbols in hivefile --- src/Storages/Hive/HiveFile.cpp | 16 ++++++++-------- src/Storages/Hive/HiveFile.h | 4 ++-- src/Storages/Hive/StorageHive.cpp | 2 +- 3 files changed, 11 insertions(+), 11 deletions(-) diff --git a/src/Storages/Hive/HiveFile.cpp b/src/Storages/Hive/HiveFile.cpp index 166336df78d..9251f2805cb 100644 --- a/src/Storages/Hive/HiveFile.cpp +++ b/src/Storages/Hive/HiveFile.cpp @@ -77,7 +77,7 @@ Range createRangeFromParquetStatistics(std::shared_ptrhasNull()) return {}; @@ -122,7 +122,7 @@ Range HiveOrcFile::buildRange(const orc::ColumnStatistics * col_stats) return {}; } -void HiveOrcFile::prepareReader() +void HiveORCFile::prepareReader() { in = std::make_unique(namenode_url, path, getContext()->getGlobalContext()->getConfigRef()); auto format_settings = getFormatSettings(getContext()); @@ -132,7 +132,7 @@ void HiveOrcFile::prepareReader() reader = std::move(result).ValueOrDie(); } -void HiveOrcFile::prepareColumnMapping() +void 
HiveORCFile::prepareColumnMapping() { const orc::Type & type = reader->GetRawORCReader()->getType(); size_t count = type.getSubtypeCount(); @@ -145,13 +145,13 @@ void HiveOrcFile::prepareColumnMapping() } } -bool HiveOrcFile::useFileMinMaxIndex() const +bool HiveORCFile::useFileMinMaxIndex() const { return storage_settings->enable_orc_file_minmax_index; } -std::unique_ptr HiveOrcFile::buildMinMaxIndex(const orc::Statistics * statistics) +std::unique_ptr HiveORCFile::buildMinMaxIndex(const orc::Statistics * statistics) { if (!statistics) return nullptr; @@ -184,7 +184,7 @@ std::unique_ptr HiveOrcFile::buildMinMaxIndex(c } -void HiveOrcFile::loadFileMinMaxIndex() +void HiveORCFile::loadFileMinMaxIndex() { if (!reader) { @@ -196,13 +196,13 @@ void HiveOrcFile::loadFileMinMaxIndex() file_minmax_idx = buildMinMaxIndex(statistics.get()); } -bool HiveOrcFile::useSplitMinMaxIndex() const +bool HiveORCFile::useSplitMinMaxIndex() const { return storage_settings->enable_orc_stripe_minmax_index; } -void HiveOrcFile::loadSplitMinMaxIndex() +void HiveORCFile::loadSplitMinMaxIndex() { if (!reader) { diff --git a/src/Storages/Hive/HiveFile.h b/src/Storages/Hive/HiveFile.h index 1404f97eff0..90cbb881c48 100644 --- a/src/Storages/Hive/HiveFile.h +++ b/src/Storages/Hive/HiveFile.h @@ -174,10 +174,10 @@ public: virtual FileFormat getFormat() const override { return FileFormat::TEXT; } }; -class HiveOrcFile : public IHiveFile +class HiveORCFile : public IHiveFile { public: - HiveOrcFile( + HiveORCFile( const FieldVector & partition_values_, const String & namenode_url_, const String & path_, diff --git a/src/Storages/Hive/StorageHive.cpp b/src/Storages/Hive/StorageHive.cpp index 6745a0f6ad0..40dce955f01 100644 --- a/src/Storages/Hive/StorageHive.cpp +++ b/src/Storages/Hive/StorageHive.cpp @@ -430,7 +430,7 @@ HiveFilePtr createHiveFile( } else if (format_name == "ORC") { - hive_file = std::make_shared(fields, namenode_url, path, ts, size, index_names_and_types, hive_settings, context); + hive_file = std::make_shared(fields, namenode_url, path, ts, size, index_names_and_types, hive_settings, context); } else if (format_name == "Parquet") { From 82583b93491a48e0c5a7600e80540bafca3b1240 Mon Sep 17 00:00:00 2001 From: Nikolai Kochetov Date: Thu, 7 Apr 2022 13:07:18 +0200 Subject: [PATCH 238/239] Update 02245_format_string_stack_overflow --- .../queries/0_stateless/02245_format_string_stack_overflow.sql | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/tests/queries/0_stateless/02245_format_string_stack_overflow.sql b/tests/queries/0_stateless/02245_format_string_stack_overflow.sql index 9376b12aa1e..40053fd0d9b 100644 --- a/tests/queries/0_stateless/02245_format_string_stack_overflow.sql +++ b/tests/queries/0_stateless/02245_format_string_stack_overflow.sql @@ -1,2 +1,2 @@ --- Tags: no-backward-compatibility-check:22.3.2.2 +-- Tags: no-backward-compatibility-check:22.3 select 
format('{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{
0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}{0}', toString(number)) str from numbers(1); From 80503b29836621b82b8f0b690a2131b9c3fcd00a Mon Sep 17 00:00:00 2001 From: Kruglov Pavel <48961922+Avogar@users.noreply.github.com> Date: Thu, 7 Apr 2022 13:16:19 +0200 Subject: [PATCH 239/239] Update clickhouse-test --- tests/clickhouse-test | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/tests/clickhouse-test b/tests/clickhouse-test index 9ca820652d9..3efb37cc27d 100755 --- a/tests/clickhouse-test +++ b/tests/clickhouse-test @@ -535,7 +535,9 @@ class TestCase: server_version = str(clickhouse_execute(args, "SELECT version()").decode()) # If server version is less or equal from the version specified in tag, we should skip this test. - if list(map(int, server_version.split('.'))) <= list(map(int, version_from_tag.split('.'))): + version_from_tag_split = list(map(int, version_from_tag.split('.'))) + server_version_split = list(map(int, server_version.split('.'))) + if server_version_split[:len(version_from_tag_split)] <= version_from_tag_split: return True return False
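Taken together, the tag change in patch 229 (`no-backward-compatibility-check:22.3.2.2` → `22.3.4.44`), the relaxation to `22.3` in patch 238, and this final `clickhouse-test` fix mean the tag is now treated as a version prefix: only as many leading components of the server version as the tag specifies take part in the comparison, so a tag such as `22.3` skips the test on any `22.3.x` (or older) server, whereas the old full-list comparison treated `22.3.4.44` as newer than `22.3` and ran the test anyway. A stand-alone sketch of that comparison, with simplified names (not the actual test-runner code):

```python
# Sketch of the prefix-aware version comparison; names are simplified for illustration.

def should_skip(server_version: str, version_from_tag: str) -> bool:
    tag = list(map(int, version_from_tag.split(".")))
    server = list(map(int, server_version.split(".")))
    # Compare only as many leading components as the tag specifies.
    return server[: len(tag)] <= tag

assert should_skip("22.3.4.44", "22.3")        # prefix [22, 3] <= [22, 3] -> skip
assert not should_skip("22.4.1.1", "22.3")     # [22, 4] > [22, 3] -> run the test
assert not ([22, 3, 4, 44] <= [22, 3])         # the old full-list comparison would not have skipped
```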