diff --git a/docs/en/sql-reference/functions/array-functions.md b/docs/en/sql-reference/functions/array-functions.md index 5957b45a881..1b7e2d56455 100644 --- a/docs/en/sql-reference/functions/array-functions.md +++ b/docs/en/sql-reference/functions/array-functions.md @@ -786,6 +786,24 @@ SELECT indexOf([1, 3, NULL, NULL], NULL) Elements set to `NULL` are handled as normal values. +## indexOfAssumeSorted(arr, x) + +Returns the index of the first ‘x’ element (starting from 1) if it is in the array, or 0 if it is not. +The function should only be used for arrays sorted in non-descending order, because it relies on binary search; for an unsorted array the result is undefined. +If the array's element type is Nullable, the ‘indexOf’ function will be used instead. + +Example: + +``` sql +SELECT indexOfAssumeSorted([1, 3, 3, 3, 4, 4, 5], 4) +``` + +``` text +┌─indexOfAssumeSorted([1, 3, 3, 3, 4, 4, 5], 4)─┐ +│                                             5 │ +└───────────────────────────────────────────────┘ +``` + ## arrayCount(\[func,\] arr1, ...) Returns the number of elements for which `func(arr1[i], ..., arrN[i])` returns something other than 0. If `func` is not specified, it returns the number of non-zero elements in the array. diff --git a/docs/ja/sql-reference/functions/array-functions.md b/docs/ja/sql-reference/functions/array-functions.md index bc4c9bef05c..4a900c5a8dc 100644 --- a/docs/ja/sql-reference/functions/array-functions.md +++ b/docs/ja/sql-reference/functions/array-functions.md @@ -785,6 +785,24 @@ SELECT indexOf([1, 3, NULL, NULL], NULL) `NULL` に設定された要素は通常の値として扱われます。 +## indexOfAssumeSorted(arr, x) + +配列内にある場合は最初の'x'要素(1から始まる)のインデックスを返し、そうでない場合は0を返します。 +この関数は二分探索を使用するため、昇順(非降順)にソートされた配列に対してのみ使用してください。 +配列の要素型がNullableの場合は、‘indexOf’関数が使用されます。 + +例: + +``` sql +SELECT indexOfAssumeSorted([1, 3, 3, 3, 4, 4, 5], 4) +``` + +``` text +┌─indexOfAssumeSorted([1, 3, 3, 3, 4, 4, 5], 4)─┐ +│                                             5 │ +└───────────────────────────────────────────────┘ +``` + ## arrayCount(\[func,\] arr1, ...) 
`func(arr1[i], ..., arrN[i])`が0以外の値を返す要素の数を返します。`func` が指定されていない場合、配列内の0以外の要素の数を返します。 diff --git a/docs/ru/sql-reference/functions/array-functions.md b/docs/ru/sql-reference/functions/array-functions.md index 825e3f06be2..63d2595dcc8 100644 --- a/docs/ru/sql-reference/functions/array-functions.md +++ b/docs/ru/sql-reference/functions/array-functions.md @@ -306,6 +306,24 @@ SELECT indexOf([1, 3, NULL, NULL], NULL) └───────────────────────────────────┘ ``` +## indexOfAssumeSorted(arr, x) + +Возвращает индекс первого элемента x (начиная с 1), если он есть в массиве, или 0, если его нет. +Функция должна использоваться только для массивов, отсортированных по неубыванию, так как используется бинарный поиск. +Если тип элементов массива — Nullable, будет использована функция ‘indexOf’. + +Пример: + +``` sql +SELECT indexOfAssumeSorted([1, 3, 3, 3, 4, 4, 5], 4) +``` + +``` text +┌─indexOfAssumeSorted([1, 3, 3, 3, 4, 4, 5], 4)─┐ +│                                             5 │ +└───────────────────────────────────────────────┘ +``` + Элементы, равные `NULL`, обрабатываются как обычные значения. ## arrayCount(\[func,\] arr1, ...) {#array-count} diff --git a/docs/zh/sql-reference/functions/array-functions.md b/docs/zh/sql-reference/functions/array-functions.md index 69db34e4a36..83f93f8490a 100644 --- a/docs/zh/sql-reference/functions/array-functions.md +++ b/docs/zh/sql-reference/functions/array-functions.md @@ -337,6 +337,24 @@ SELECT indexOf([1, 3, NULL, NULL], NULL) 设置为«NULL»的元素将作为普通的元素值处理。 +## indexOfAssumeSorted(arr, x) + +返回数组中第一个’x’元素的索引(从1开始),如果’x’元素不在数组中,则返回0。 +该函数应仅用于按非降序(升序)排序的数组,因为它使用二分查找。 +如果数组的元素类型为 Nullable,则将使用’indexOf’函数。 + +示例: + +``` sql +SELECT indexOfAssumeSorted([1, 3, 3, 3, 4, 4, 5], 4) +``` + +``` text +┌─indexOfAssumeSorted([1, 3, 3, 3, 4, 4, 5], 4)─┐ +│                                             5 │ +└───────────────────────────────────────────────┘ +``` + ## arrayCount(\[func,\] arr1, ...) 
{#array-count} `func`将arr数组作为参数,其返回结果为非零值的数量。如果未指定“func”,则返回数组中非零元素的数量。 diff --git a/src/Functions/array/arrayIndex.h b/src/Functions/array/arrayIndex.h index e6ea9f4c4ee..f7addad2add 100644 --- a/src/Functions/array/arrayIndex.h +++ b/src/Functions/array/arrayIndex.h @@ -1,5 +1,6 @@ #pragma once #include +#include #include #include #include @@ -14,6 +15,7 @@ #include #include #include +#include "Common/FieldVisitors.h" #include "Common/Logger.h" #include "Common/logger_useful.h" #include @@ -53,6 +55,10 @@ struct IndexOfAction static constexpr void apply(ResultType& current, size_t j) noexcept { current = j + 1; } }; +struct IndexOfAssumeSorted : public IndexOfAction +{ +}; + struct CountEqualAction { using ResultType = UInt64; @@ -111,13 +117,138 @@ private: return 0 == left.compareAt(i, RightArgIsConstant ? 0 : j, right, 1); } + static bool compare(const Array & arr, const Field& rhs, size_t pos, size_t) + { + return applyVisitor(FieldVisitorAccurateEquals(), arr[pos], rhs); + } + + static constexpr bool lessOrEqual(const PaddedPODArray & left, const Result & right, size_t i, size_t) noexcept + { + return left[i] >= right; + } + + static constexpr bool lessOrEqual(const IColumn & left, const Result & right, size_t i, size_t) noexcept { return left[i] >= right; } + + static constexpr bool lessOrEqual(const Array& arr, const Field& rhs, size_t pos, size_t) noexcept { + return applyVisitor(FieldVisitorAccurateLessOrEqual(), rhs, arr[pos]); + } + #pragma clang diagnostic pop +public: + /** Assuming that the array is sorted, use a binary search */ + template + static constexpr ResultType lowerBound(const Data & data, const Target & target, size_t array_size, ArrOffset current_offset) + { + ResultType current = 0; + size_t low = 0, high = array_size; + while (high - low > 0) + { + auto middle = low + ((high - low) >> 1); + auto compare_result = lessOrEqual(data, target, current_offset + middle, 0); + /// avoid conditional branching + high = compare_result ? 
middle : high; + low = compare_result ? low : middle + 1; + } + if (low < array_size && compare(data, target, current_offset + low, 0)) + { + ConcreteAction::apply(current, low); + } + return current; + } + + template + static constexpr ResultType linearSearch( + const Data & data, + const Target & target, + size_t array_size, + const NullMap * const null_map_data, + const NullMap * const null_map_item, + size_t row_index, + ArrOffset current_offset) + { + ResultType current = 0; + for (size_t j = 0; j < array_size; ++j) + { + if constexpr (Case == 2) /// Right arg is Nullable + if (hasNull(null_map_item, row_index)) + continue; + + if constexpr (Case == 3) /// Left arg is an array of Nullables + if (hasNull(null_map_data, current_offset + j)) + continue; + + if constexpr (Case == 4) /// Both args are nullable + { + const bool right_is_null = hasNull(null_map_data, current_offset + j); + const bool left_is_null = hasNull(null_map_item, row_index); + + if (right_is_null != left_is_null) + continue; + + if (!right_is_null && !compare(data, target, current_offset + j, row_index)) + continue; + } + else if (!compare(data, target, current_offset + j, row_index)) + continue; + + ConcreteAction::apply(current, j); + + if constexpr (!ConcreteAction::resume_execution) + break; + } + return current; + } + + static ResultType linearSearchConst(const Array & arr, const Field & value) + { + ResultType current = 0; + for (size_t i = 0, size = arr.size(); i < size; ++i) + { + if (!applyVisitor(FieldVisitorAccurateEquals(), arr[i], value)) + continue; + + ConcreteAction::apply(current, i); + + if constexpr (!ConcreteAction::resume_execution) + break; + } + return current; + } + +private: + /** Looking for the target element index in the data (array) */ + template + static constexpr ResultType getIndex( + const Data & data, + const Target & target, + size_t array_size, + const NullMap * const null_map_data, + const NullMap * const null_map_item, + size_t row_index, + ArrOffset 
current_offset) + { + /** Use binary search if the following conditions are met. + * 1. The array type is not nullable. (Case = 1) + * 2. Target is not a column or an array. + */ + if constexpr ( + std::is_same_v && !std::is_same_v> + && !std::is_same_v && Case == 1) + { + return lowerBound(data, target, array_size, current_offset); + } + return linearSearch(data, target, array_size, null_map_data, null_map_item, row_index, current_offset); + } + static constexpr bool hasNull(const NullMap * const null_map, size_t i) noexcept { return (*null_map)[i]; } template static void process( - const Data & data, const ArrOffsets & offsets, const Target & target, ResultArr & result, + const Data & data, + const ArrOffsets & offsets, + const Target & target, + ResultArr & result, [[maybe_unused]] const NullMap * const null_map_data, [[maybe_unused]] const NullMap * const null_map_item) { @@ -129,7 +260,6 @@ private: } const size_t size = offsets.size(); - result.resize(size); ArrOffset current_offset = 0; @@ -137,39 +267,7 @@ private: for (size_t i = 0; i < size; ++i) { const size_t array_size = offsets[i] - current_offset; - ResultType current = 0; - - for (size_t j = 0; j < array_size; ++j) - { - if constexpr (Case == 2) /// Right arg is Nullable - if (hasNull(null_map_item, i)) - continue; - - if constexpr (Case == 3) /// Left arg is an array of Nullables - if (hasNull(null_map_data, current_offset + j)) - continue; - - if constexpr (Case == 4) /// Both args are nullable - { - const bool right_is_null = hasNull(null_map_data, current_offset + j); - const bool left_is_null = hasNull(null_map_item, i); - - if (right_is_null != left_is_null) - continue; - - if (!right_is_null && !compare(data, target, current_offset + j, i)) - continue; - } - else if (!compare(data, target, current_offset + j, i)) - continue; - - ConcreteAction::apply(current, j); - - if constexpr (!ConcreteAction::resume_execution) - break; - } - - result[i] = current; + result[i] = getIndex(data, target, 
array_size, null_map_data, null_map_item, i, current_offset); current_offset = offsets[i]; } } @@ -854,16 +952,13 @@ private: { ResultType current = 0; const auto & value = (*item_arg)[0]; - - for (size_t i = 0, size = arr.size(); i < size; ++i) + if constexpr (std::is_same_v) { - if (!applyVisitor(FieldVisitorAccurateEquals(), arr[i], value)) - continue; - - ConcreteAction::apply(current, i); - - if constexpr (!ConcreteAction::resume_execution) - break; + current = Impl::Main::lowerBound(arr, value, arr.size(), 0); + } + else + { + current = Impl::Main::linearSearchConst(arr, value); } return result_type->createColumnConst(item_arg->size(), current); diff --git a/src/Functions/array/indexOfAssumeSorted.cpp b/src/Functions/array/indexOfAssumeSorted.cpp new file mode 100644 index 00000000000..8e36a157721 --- /dev/null +++ b/src/Functions/array/indexOfAssumeSorted.cpp @@ -0,0 +1,28 @@ +#include "arrayIndex.h" +#include +#include +#include "Common/FunctionDocumentation.h" + +namespace DB +{ +struct NameIndexOfAssumeSorted { static constexpr auto name = "indexOfAssumeSorted"; }; + +/// indexOfAssumeSorted(arr, x) - returns the index of the element x (starting with 1), if it exists in the array, or 0 if it does not. +/// Should be used only when the array is sorted (binary search is applied to the array). +using FunctionIndexOfAssumeSorted = FunctionArrayIndex; + +REGISTER_FUNCTION(IndexOfAssumeSorted) +{ + factory.registerFunction(FunctionDocumentation{ + .description = R"( +The function finds the position of the first occurrence of element X in the array. +Indexing from one. +The function can be used when the internal array type is not Nullable and the array is sorted in non-decreasing order. +If the array type is Nullable, the 'indexOf' function will be used. +The binary search algorithm is used for the search. +For more details, see [https://en.wikipedia.org/wiki/Binary_search] +For an unsorted array, the behavior is undefined. 
+)", + .examples = {{.name = "", .query = "SELECT indexOfAssumeSorted([1, 2, 2, 2, 3, 3, 3, 4], 3) FROM test_table;", .result=""}}}); +} +} diff --git a/tests/queries/0_stateless/03276_index_of_assume_sorted.reference b/tests/queries/0_stateless/03276_index_of_assume_sorted.reference new file mode 100644 index 00000000000..400d55b21ea --- /dev/null +++ b/tests/queries/0_stateless/03276_index_of_assume_sorted.reference @@ -0,0 +1,8 @@ +7 +5 +0 +0 +0 +8 +0 +0 diff --git a/tests/queries/0_stateless/03276_index_of_assume_sorted.sql b/tests/queries/0_stateless/03276_index_of_assume_sorted.sql new file mode 100644 index 00000000000..b0784664619 --- /dev/null +++ b/tests/queries/0_stateless/03276_index_of_assume_sorted.sql @@ -0,0 +1,26 @@ +DROP TABLE IF EXISTS test; + +CREATE TABLE test( + id UInt64, + numbers Array(Int64) +) +ENGINE = MergeTree() +ORDER BY id; + +INSERT INTO test VALUES(1, [1, 2, 2, 3, 3, 3, 4, 4, 4, 5, 6, 7]); +INSERT INTO test VALUES (2, [1, 2, 3, 4, 5, 6, 7, 8]); +INSERT INTO test VALUES(3, [1, 3, 7, 10]); +INSERT INTO test VALUES(4, [0, 0, 0]); +INSERT INTO test VALUES(5, [10, 10, 10]); + +SELECT indexOfAssumeSorted(numbers, 4) FROM test WHERE id = 1; +SELECT indexOfAssumeSorted(numbers, 5) FROM test WHERE id = 2; +SELECT indexOfAssumeSorted(numbers, 5) FROM test WHERE id = 3; +SELECT indexOfAssumeSorted(numbers, 1) FROM test WHERE id = 4; +SELECT indexOfAssumeSorted(numbers, 1) FROM test WHERE id = 5; + +SELECT indexOfAssumeSorted([1, 2, 2, 2, 3, 3, 3, 4, 4], 4); +SELECT indexOfAssumeSorted([10, 10, 10], 1); +SELECT indexOfAssumeSorted([1, 1, 1], 10); + +DROP TABLE IF EXISTS test; diff --git a/utils/check-style/aspell-ignore/en/aspell-dict.txt b/utils/check-style/aspell-ignore/en/aspell-dict.txt index 0383642385d..c3f9489b922 100644 --- a/utils/check-style/aspell-ignore/en/aspell-dict.txt +++ b/utils/check-style/aspell-ignore/en/aspell-dict.txt @@ -1954,6 +1954,7 @@ ilike incrementing indexHint indexOf +indexOfAssumeSorted inequal infi inflight