Merge pull request #72517 from erickurbanov/bsearchfunction

Add indexOfAssumeSorted Function. request #17795
2024-12-14 10:22:10 +00:00 · 2024-12-06 08:55:40 +00:00 · 2024-12-06 08:55:40 +00:00 · 68e77f6f33
commit 68e77f6f33
parent 95a36b9312 4b406e6dd7
9 changed files with 274 additions and 44 deletions
--- a/docs/en/sql-reference/functions/array-functions.md
+++ b/docs/en/sql-reference/functions/array-functions.md
@ -786,6 +786,24 @@ SELECT indexOf([1, 3, NULL, NULL], NULL)
 Elements set to `NULL` are handled as normal values.
 ## indexOfAssumeSorted(arr, x)
 Returns the index of the first ‘x’ element (starting from 1) if it is in the array, or 0 if it is not.
 The function should only be used for arrays sorted in non-descending (ascending) order, because it uses binary search.
 If the internal array type is Nullable, the `indexOf` function will be used instead.
 Example:
 ``` sql
 SELECT indexOfAssumeSorted([1, 3, 3, 3, 4, 4, 5], 4)
 ```
 ``` text
 ┌─indexOfAssumeSorted([1, 3, 3, 3, 4, 4, 5], 4)─┐
 │                                             5 │
 └───────────────────────────────────────────────┘
 ```
 ## arrayCount(\[func,\] arr1, ...)
 Returns the number of elements for which `func(arr1[i], ..., arrN[i])` returns something other than 0. If `func` is not specified, it returns the number of non-zero elements in the array.
--- a/docs/ja/sql-reference/functions/array-functions.md
+++ b/docs/ja/sql-reference/functions/array-functions.md
@ -785,6 +785,24 @@ SELECT indexOf([1, 3, NULL, NULL], NULL)
 `NULL` に設定された要素は通常の値として扱われます。
 ## indexOfAssumeSorted(arr, x)
 配列内にある場合は最初の'x'要素(1から始まる)のインデックスを返し、そうでない場合は0を返します。
 この関数は二分探索を使用するため、昇順（降順ではない順序）にソートされた配列に対してのみ使用する必要があります。
 内部配列型がNullable（Null許容）の場合は、`indexOf`関数が使用されます。
 例:
 ``` sql
 SELECT indexOfAssumeSorted([1, 3, 3, 3, 4, 4, 5], 4)
 ```
 ``` text
 ┌─indexOfAssumeSorted([1, 3, 3, 3, 4, 4, 5], 4)─┐
 │                                             5 │
 └───────────────────────────────────────────────┘
 ```
 ## arrayCount(\[func,\] arr1, ...)
 `func(arr1[i], ..., arrN[i])`が0以外の値を返す要素の数を返します。`func` が指定されていない場合、配列内の0以外の要素の数を返します。
--- a/docs/ru/sql-reference/functions/array-functions.md
+++ b/docs/ru/sql-reference/functions/array-functions.md
@ -306,6 +306,24 @@ SELECT indexOf([1, 3, NULL, NULL], NULL)
 └───────────────────────────────────┘
 ```
 ## indexOfAssumeSorted(arr, x)
 Возвращает индекс первого элемента x (начиная с 1), если он есть в массиве, или 0, если его нет.
 Функция должна использоваться, если массив отсортирован в неубывающем порядке, так как используется бинарный поиск.
 Если внутренний тип массива — Nullable, то будет использована функция `indexOf`.
 Пример:
 ``` sql
 SELECT indexOfAssumeSorted([1, 3, 3, 3, 4, 4, 5], 4)
 ```
 ``` text
 ┌─indexOfAssumeSorted([1, 3, 3, 3, 4, 4, 5], 4)─┐
 │                                             5 │
 └───────────────────────────────────────────────┘
 ```
 Элементы, равные `NULL`, обрабатываются как обычные значения.
 ## arrayCount(\[func,\] arr1, ...) {#array-count}
--- a/docs/zh/sql-reference/functions/array-functions.md
+++ b/docs/zh/sql-reference/functions/array-functions.md
@ -337,6 +337,24 @@ SELECT indexOf([1, 3, NULL, NULL], NULL)
 设置为«NULL»的元素将作为普通的元素值处理。
 ## indexOfAssumeSorted(arr, x)
 返回数组中第一个’x’元素的索引（从1开始），如果’x’元素不存在在数组中，则返回0.
 该函数应仅用于按非降序（升序）排序的数组，因为它使用二分查找进行搜索。
 如果数组的内部类型为 Nullable，则将使用 `indexOf` 函数。
 示例:
 ``` sql
 SELECT indexOfAssumeSorted([1, 3, 3, 3, 4, 4, 5], 4)
 ```
 ``` text
 ┌─indexOfAssumeSorted([1, 3, 3, 3, 4, 4, 5], 4)─┐
 │                                             5 │
 └───────────────────────────────────────────────┘
 ```
 ## arrayCount(\[func,\] arr1, ...) {#array-count}
 `func`将arr数组作为参数，其返回结果为非零值的数量。如果未指定“func”，则返回数组中非零元素的数量。
--- a/src/Functions/array/arrayIndex.h
+++ b/src/Functions/array/arrayIndex.h
@ -1,5 +1,6 @@
 #pragma once
 #include <cstddef>
 #include <type_traits>
 #include <Functions/IFunction.h>
 #include <Functions/FunctionFactory.h>
 #include <Functions/FunctionHelpers.h>
@ -14,6 +15,7 @@
 #include <Columns/ColumnFixedString.h>
 #include <Columns/ColumnsNumber.h>
 #include <Columns/ColumnNullable.h>
 #include "Common/FieldVisitors.h"
 #include "Common/Logger.h"
 #include "Common/logger_useful.h"
 #include <Common/FieldVisitorsAccurateComparison.h>
@ -53,6 +55,10 @@ struct IndexOfAction
    static constexpr void apply(ResultType& current, size_t j) noexcept { current = j + 1; }
 };
 /// Action tag for `indexOfAssumeSorted`: inherits everything from IndexOfAction and adds nothing of its own.
 /// getIndex() tests for this exact type to choose binary search over the linear scan.
 struct IndexOfAssumeSorted : IndexOfAction {};
 struct CountEqualAction
 {
    using ResultType = UInt64;
@ -111,13 +117,138 @@ private:
        return 0 == left.compareAt(i, RightArgIsConstant ? 0 : j, right, 1);
    }
    /// Equality check for the constant-array path: compares the Field stored at `arr[pos]` with `rhs`
    /// through FieldVisitorAccurateEquals. The trailing (unnamed) index parameter is ignored.
    static bool compare(const Array & arr, const Field& rhs, size_t pos, size_t)
    {
        const auto & element = arr[pos];
        return applyVisitor(FieldVisitorAccurateEquals(), element, rhs);
    }
    /// Ordering predicate used by the binary search in lowerBound().
    /// Despite the argument order, it answers "is `right` less than or equal to the element `left[i]`?",
    /// i.e. it returns true when `left[i] >= right`. The trailing (unnamed) index parameter is ignored.
    static constexpr bool lessOrEqual(const PaddedPODArray<Initial> & left, const Result & right, size_t i, size_t) noexcept
    {
        return left[i] >= right;
    }
    /// Same predicate for a generic IColumn: true when `left[i] >= right`. The trailing index parameter is ignored.
    /// NOTE(review): getIndex() excludes IColumn targets from the binary-search path — confirm whether this overload is ever instantiated.
    static constexpr bool lessOrEqual(const IColumn & left, const Result & right, size_t i, size_t) noexcept { return left[i] >= right; }
    /// Ordering predicate for the constant-array path of the binary search:
    /// true when `rhs` is less than or equal to the Field stored at `arr[pos]`.
    /// The trailing (unnamed) index parameter is ignored.
    ///
    /// Deliberately NOT `noexcept`: the comparison goes through applyVisitor, exactly like the
    /// (non-noexcept) `compare(const Array &, const Field &, ...)` overload. If the visitor throws
    /// (presumably possible for incomparable Field types — verify against FieldVisitorsAccurateComparison.h),
    /// a `noexcept` specifier would turn that exception into std::terminate.
    static constexpr bool lessOrEqual(const Array & arr, const Field & rhs, size_t pos, size_t)
    {
        return applyVisitor(FieldVisitorAccurateLessOrEqual(), rhs, arr[pos]);
    }
 #pragma clang diagnostic pop
 public:
    /** Binary search over one array, which is assumed to be sorted.
      *
      * Looks at the `array_size` elements of `data` that start at `current_offset` and narrows down to the
      * first position for which lessOrEqual() holds (target <= element). If the element at that position
      * equals `target`, the position is passed to ConcreteAction::apply and the resulting value is returned;
      * otherwise 0 is returned.
      */
    template <typename Data, typename Target>
    static constexpr ResultType lowerBound(const Data & data, const Target & target, size_t array_size, ArrOffset current_offset)
    {
        size_t first = 0;
        size_t last = array_size;

        while (last - first > 0)
        {
            const auto mid = first + ((last - first) >> 1);
            const auto target_not_greater = lessOrEqual(data, target, current_offset + mid, 0);

            /// Both bounds are updated with selects instead of an if/else to avoid conditional branching.
            last = target_not_greater ? mid : last;
            first = target_not_greater ? first : mid + 1;
        }

        ResultType found = 0;
        /// `first` is only a candidate: it may be past the end or point at a strictly greater element.
        if (first < array_size && compare(data, target, current_offset + first, 0))
            ConcreteAction::apply(found, first);

        return found;
    }
    /** Linear scan over one array: the `array_size` elements of `data` starting at `current_offset`.
      *
      * `Case` selects the null handling at compile time:
      *   2 - the right argument is Nullable: nothing matches when the item of this row is NULL;
      *   3 - the left argument is an array of Nullables: NULL elements are skipped;
      *   4 - both are nullable: NULL matches NULL, non-NULL values are compared normally;
      *   any other value - plain comparison.
      * Every match is reported through ConcreteAction::apply; the scan stops after the first match
      * unless ConcreteAction::resume_execution is set. Returns 0 if apply was never called.
      */
    template <size_t Case, typename Data, typename Target>
    static constexpr ResultType linearSearch(
        const Data & data,
        const Target & target,
        size_t array_size,
        const NullMap * const null_map_data,
        const NullMap * const null_map_item,
        size_t row_index,
        ArrOffset current_offset)
    {
        ResultType result = 0;

        for (size_t pos = 0; pos < array_size; ++pos)
        {
            if constexpr (Case == 2) /// Right arg is Nullable
            {
                if (hasNull(null_map_item, row_index))
                    continue;
            }

            if constexpr (Case == 3) /// Left arg is an array of Nullables
            {
                if (hasNull(null_map_data, current_offset + pos))
                    continue;
            }

            if constexpr (Case == 4) /// Both args are nullable
            {
                const bool element_is_null = hasNull(null_map_data, current_offset + pos);
                const bool item_is_null = hasNull(null_map_item, row_index);

                /// Exactly one side is NULL: not a match.
                if (element_is_null != item_is_null)
                    continue;

                /// Both non-NULL: fall back to the value comparison. Both NULL counts as a match.
                if (!element_is_null && !compare(data, target, current_offset + pos, row_index))
                    continue;
            }
            else
            {
                if (!compare(data, target, current_offset + pos, row_index))
                    continue;
            }

            ConcreteAction::apply(result, pos);

            if constexpr (!ConcreteAction::resume_execution)
                break;
        }

        return result;
    }
    /// Linear scan over a constant Array of Fields: every element equal to `value`
    /// (per FieldVisitorAccurateEquals) is reported through ConcreteAction::apply.
    /// Stops after the first match unless ConcreteAction::resume_execution is set. Returns 0 if nothing matched.
    static ResultType linearSearchConst(const Array & arr, const Field & value)
    {
        ResultType found = 0;
        const size_t count = arr.size();

        for (size_t pos = 0; pos < count; ++pos)
        {
            if (applyVisitor(FieldVisitorAccurateEquals(), arr[pos], value))
            {
                ConcreteAction::apply(found, pos);

                if constexpr (!ConcreteAction::resume_execution)
                    break;
            }
        }

        return found;
    }
 private:
    /** Computes the result for a single array: the `array_size` elements of `data` starting at `current_offset`.
      *
      * Binary search (lowerBound) is selected at compile time when all of these hold:
      *   1. The action is IndexOfAssumeSorted.
      *   2. The array type is not nullable (Case == 1).
      *   3. Target is neither a PaddedPODArray<Result> nor an IColumn, i.e. not a column or an array.
      * In every other configuration the linear scan is used.
      */
    template <size_t Case, typename Data, typename Target>
    static constexpr ResultType getIndex(
        const Data & data,
        const Target & target,
        size_t array_size,
        const NullMap * const null_map_data,
        const NullMap * const null_map_item,
        size_t row_index,
        ArrOffset current_offset)
    {
        constexpr bool target_is_column_or_array
            = std::is_same_v<Target, PaddedPODArray<Result>> || std::is_same_v<Target, IColumn>;
        constexpr bool use_binary_search
            = std::is_same_v<ConcreteAction, IndexOfAssumeSorted> && !target_is_column_or_array && Case == 1;

        if constexpr (use_binary_search)
        {
            return lowerBound(data, target, array_size, current_offset);
        }

        return linearSearch<Case>(data, target, array_size, null_map_data, null_map_item, row_index, current_offset);
    }
    /// Reads entry `i` of the given null map and returns it as a bool. The pointer is dereferenced unconditionally.
    static constexpr bool hasNull(const NullMap * const null_map, size_t i) noexcept
    {
        return (*null_map)[i];
    }
    template <size_t Case, typename Data, typename Target>
    static void process(
-        const Data & data, const ArrOffsets & offsets, const Target & target, ResultArr & result,
+        const Data & data,
        const ArrOffsets & offsets,
        const Target & target,
        ResultArr & result,
        [[maybe_unused]] const NullMap * const null_map_data,
        [[maybe_unused]] const NullMap * const null_map_item)
    {
@ -129,7 +260,6 @@ private:
        }
        const size_t size = offsets.size();
        result.resize(size);
        ArrOffset current_offset = 0;
@ -137,39 +267,7 @@ private:
        for (size_t i = 0; i < size; ++i)
        {
            const size_t array_size = offsets[i] - current_offset;
-            ResultType current = 0;
+            result[i] = getIndex<Case>(data, target, array_size, null_map_data, null_map_item, i, current_offset);
            for (size_t j = 0; j < array_size; ++j)
            {
                if constexpr (Case == 2) /// Right arg is Nullable
                     if (hasNull(null_map_item, i))
                        continue;
                if constexpr (Case == 3) /// Left arg is an array of Nullables
                    if (hasNull(null_map_data, current_offset + j))
                        continue;
                if constexpr (Case == 4) /// Both args are nullable
                {
                    const bool right_is_null = hasNull(null_map_data, current_offset + j);
                    const bool left_is_null = hasNull(null_map_item, i);
                    if (right_is_null != left_is_null)
                        continue;
                    if (!right_is_null && !compare(data, target, current_offset + j, i))
                        continue;
                }
                else if (!compare(data, target, current_offset + j, i))
                    continue;
                ConcreteAction::apply(current, j);
                if constexpr (!ConcreteAction::resume_execution)
                    break;
            }
            result[i] = current;
            current_offset = offsets[i];
        }
    }
@ -854,16 +952,13 @@ private:
        {
            ResultType current = 0;
            const auto & value = (*item_arg)[0];
-
+            if constexpr (std::is_same_v<ConcreteAction, IndexOfAssumeSorted>)
            for (size_t i = 0, size = arr.size(); i < size; ++i)
            {
-                if (!applyVisitor(FieldVisitorAccurateEquals(), arr[i], value))
+                current = Impl::Main<ConcreteAction, true>::lowerBound(arr, value, arr.size(), 0);
-                    continue;
+            }
-
+            else
-                ConcreteAction::apply(current, i);
+            {
-
+                current = Impl::Main<ConcreteAction, true>::linearSearchConst(arr, value);
                if constexpr (!ConcreteAction::resume_execution)
                    break;
            }
            return result_type->createColumnConst(item_arg->size(), current);
--- a/src/Functions/array/indexOfAssumeSorted.cpp
+++ b/src/Functions/array/indexOfAssumeSorted.cpp
@ -0,0 +1,28 @@
 #include "arrayIndex.h"
 #include <Functions/FunctionFactory.h>
 #include <Functions/IFunctionAdaptors.h>
 #include "Common/FunctionDocumentation.h"
 namespace DB
 {
 /// Carries the SQL-visible name of the function as a compile-time constant.
 struct NameIndexOfAssumeSorted
 {
     static constexpr auto name = "indexOfAssumeSorted";
 };
 /// indexOfAssumeSorted(arr, x) - returns the index of the first element equal to x (starting with 1) if it exists in the array, or 0 if it does not.
 /// Should be used only when the array is sorted (a binary search is applied to the array).
 using FunctionIndexOfAssumeSorted = FunctionArrayIndex<IndexOfAssumeSorted, NameIndexOfAssumeSorted>;
 /// Registers `indexOfAssumeSorted` in the function factory together with its user-facing documentation.
 /// The example is self-contained (no table needed) and states its expected result:
 /// the first `3` in [1, 2, 2, 2, 3, 3, 3, 4] is at 1-based position 5.
 REGISTER_FUNCTION(IndexOfAssumeSorted)
 {
    factory.registerFunction<FunctionIndexOfAssumeSorted>(FunctionDocumentation{
        .description = R"(
 The function finds the position of the first occurrence of element X in the array.
 Indexing from one. If the element is not found, 0 is returned.
 The function can be used when the internal array type is not Nullable and the array is sorted in non-decreasing order.
 If the array type is Nullable, the 'indexOf' function will be used.
 The binary search algorithm is used for the search.
 For more details, see [binary search](https://en.wikipedia.org/wiki/Binary_search).
 For an unsorted array, the behavior is undefined.
 )",
        .examples = {{.name = "", .query = "SELECT indexOfAssumeSorted([1, 2, 2, 2, 3, 3, 3, 4], 3);", .result="5"}}});
 }
 }
--- a/tests/queries/0_stateless/03276_index_of_assume_sorted.reference
+++ b/tests/queries/0_stateless/03276_index_of_assume_sorted.reference
@ -0,0 +1,8 @@
 7
 5
 0
 0
 0
 8
 0
 0
--- a/tests/queries/0_stateless/03276_index_of_assume_sorted.sql
+++ b/tests/queries/0_stateless/03276_index_of_assume_sorted.sql
@ -0,0 +1,26 @@
 -- Test for indexOfAssumeSorted: lookups in array columns first, then in constant array literals.
 -- Every SELECT below produces exactly one output line.
 DROP TABLE IF EXISTS test;
 CREATE TABLE test(
    id UInt64,
    numbers Array(Int64)
 )
 ENGINE = MergeTree()
 ORDER BY id;
 -- One array per row, each sorted in non-descending order (with and without duplicates).
 INSERT INTO test VALUES(1, [1, 2, 2, 3, 3, 3, 4, 4, 4, 5, 6, 7]);
 INSERT INTO test VALUES (2, [1, 2, 3, 4, 5, 6, 7, 8]);
 INSERT INTO test VALUES(3, [1, 3, 7, 10]);
 INSERT INTO test VALUES(4, [0, 0, 0]);
 INSERT INTO test VALUES(5, [10, 10, 10]);
 -- Column arrays: duplicates (first match wins), unique values, missing value inside the range,
 -- value above all elements, value below all elements.
 SELECT indexOfAssumeSorted(numbers, 4) FROM test WHERE id = 1;
 SELECT indexOfAssumeSorted(numbers, 5) FROM test WHERE id = 2;
 SELECT indexOfAssumeSorted(numbers, 5) FROM test WHERE id = 3;
 SELECT indexOfAssumeSorted(numbers, 1) FROM test WHERE id = 4;
 SELECT indexOfAssumeSorted(numbers, 1) FROM test WHERE id = 5;
 -- Constant arrays: match among trailing duplicates, value below all elements, value above all elements.
 SELECT indexOfAssumeSorted([1, 2, 2, 2, 3, 3, 3, 4, 4], 4);
 SELECT indexOfAssumeSorted([10, 10, 10], 1);
 SELECT indexOfAssumeSorted([1, 1, 1], 10);
 DROP TABLE IF EXISTS test;
--- a/utils/check-style/aspell-ignore/en/aspell-dict.txt
+++ b/utils/check-style/aspell-ignore/en/aspell-dict.txt
@ -1954,6 +1954,7 @@ ilike
 incrementing
 indexHint
 indexOf
 indexOfAssumeSorted
 inequal
 infi
 inflight