Merge pull request #72517 from erickurbanov/bsearchfunction

Add indexOfAssumeSorted Function. request #17795
2024-12-14 10:22:10 +00:00 · 2024-12-06 08:55:40 +00:00 · 2024-12-06 08:55:40 +00:00 · 68e77f6f33
commit 68e77f6f33
parent 95a36b9312 4b406e6dd7
9 changed files with 274 additions and 44 deletions
--- a/docs/en/sql-reference/functions/array-functions.md
+++ b/docs/en/sql-reference/functions/array-functions.md
@ -786,6 +786,24 @@ SELECT indexOf([1, 3, NULL, NULL], NULL)

 Elements set to `NULL` are handled as normal values.

+## indexOfAssumeSorted(arr, x)
+
+Returns the index of the first ‘x’ element (starting from 1) if it is in the array, or 0 if it is not.
+The function should only be used for arrays sorted in non-descending (ascending) order, because binary search is used for the lookup.
+If the internal array type is Nullable, the `indexOf` function is used instead.
+
+Example:
+
+``` sql
+SELECT indexOfAssumeSorted([1, 3, 3, 3, 4, 4, 5], 4)
+```
+
+``` text
+┌─indexOfAssumeSorted([1, 3, 3, 3, 4, 4, 5], 4)─┐
+│                                             5 │
+└───────────────────────────────────────────────┘
+```
+
 ## arrayCount(\[func,\] arr1, ...)

 Returns the number of elements for which `func(arr1[i], ..., arrN[i])` returns something other than 0. If `func` is not specified, it returns the number of non-zero elements in the array.
--- a/docs/ja/sql-reference/functions/array-functions.md
+++ b/docs/ja/sql-reference/functions/array-functions.md
@ -785,6 +785,24 @@ SELECT indexOf([1, 3, NULL, NULL], NULL)

 `NULL` に設定された要素は通常の値として扱われます。

+## indexOfAssumeSorted(arr, x)
+
+配列内にある場合は最初の'x'要素(1から始まる)のインデックスを返し、そうでない場合は0を返します。
+この関数は検索に二分探索を使用するため、昇順（非降順）にソートされた配列に対して使用する必要があります。
+内部配列型がNullableの場合は、`indexOf`関数が使用されます。
+
+例:
+
+``` sql
+SELECT indexOfAssumeSorted([1, 3, 3, 3, 4, 4, 5], 4)
+```
+
+``` text
+┌─indexOfAssumeSorted([1, 3, 3, 3, 4, 4, 5], 4)─┐
+│                                             5 │
+└───────────────────────────────────────────────┘
+```
+
 ## arrayCount(\[func,\] arr1, ...)

 `func(arr1[i], ..., arrN[i])`が0以外の値を返す要素の数を返します。`func` が指定されていない場合、配列内の0以外の要素の数を返します。
--- a/docs/ru/sql-reference/functions/array-functions.md
+++ b/docs/ru/sql-reference/functions/array-functions.md
@ -306,6 +306,24 @@ SELECT indexOf([1, 3, NULL, NULL], NULL)
 └───────────────────────────────────┘
 ```

+## indexOfAssumeSorted(arr, x)
+
+Возвращает индекс первого элемента x (начиная с 1), если он есть в массиве, или 0, если его нет.
+Функция должна использоваться, если массив отсортирован в неубывающем порядке, так как используется бинарный поиск.
+Если внутренний тип массива — Nullable, то будет использована функция `indexOf`.
+
+Пример:
+
+``` sql
+SELECT indexOfAssumeSorted([1, 3, 3, 3, 4, 4, 5], 4)
+```
+
+``` text
+┌─indexOfAssumeSorted([1, 3, 3, 3, 4, 4, 5], 4)─┐
+│                                             5 │
+└───────────────────────────────────────────────┘
+```
+
 Элементы, равные `NULL`, обрабатываются как обычные значения.

 ## arrayCount(\[func,\] arr1, ...) {#array-count}
--- a/docs/zh/sql-reference/functions/array-functions.md
+++ b/docs/zh/sql-reference/functions/array-functions.md
@ -337,6 +337,24 @@ SELECT indexOf([1, 3, NULL, NULL], NULL)

 设置为«NULL»的元素将作为普通的元素值处理。

+## indexOfAssumeSorted(arr, x)
+
+返回数组中第一个’x’元素的索引（从1开始），如果’x’元素不存在在数组中，则返回0。
+该函数应用于按非降序（升序）排序的数组，因为查找时使用二分查找。
+如果数组的内部类型为 Nullable，则将使用 `indexOf` 函数。
+
+示例:
+
+``` sql
+SELECT indexOfAssumeSorted([1, 3, 3, 3, 4, 4, 5], 4)
+```
+
+``` text
+┌─indexOfAssumeSorted([1, 3, 3, 3, 4, 4, 5], 4)─┐
+│                                             5 │
+└───────────────────────────────────────────────┘
+```
+
 ## arrayCount(\[func,\] arr1, ...) {#array-count}

 `func`将arr数组作为参数，其返回结果为非零值的数量。如果未指定“func”，则返回数组中非零元素的数量。
--- a/src/Functions/array/arrayIndex.h
+++ b/src/Functions/array/arrayIndex.h
@ -1,5 +1,6 @@
 #pragma once
 #include <cstddef>
+#include <type_traits>
 #include <Functions/IFunction.h>
 #include <Functions/FunctionFactory.h>
 #include <Functions/FunctionHelpers.h>
@ -14,6 +15,7 @@
 #include <Columns/ColumnFixedString.h>
 #include <Columns/ColumnsNumber.h>
 #include <Columns/ColumnNullable.h>
+#include "Common/FieldVisitors.h"
 #include "Common/Logger.h"
 #include "Common/logger_useful.h"
 #include <Common/FieldVisitorsAccurateComparison.h>
@ -53,6 +55,10 @@ struct IndexOfAction
    static constexpr void apply(ResultType& current, size_t j) noexcept { current = j + 1; }
 };

+/// Action tag for `indexOfAssumeSorted`. It adds nothing to IndexOfAction; it exists as a distinct type
+/// so that the search code can recognise it at compile time (std::is_same_v) and switch to binary search.
+struct IndexOfAssumeSorted : IndexOfAction {};
+
 struct CountEqualAction
 {
    using ResultType = UInt64;
@ -111,13 +117,138 @@ private:
        return 0 == left.compareAt(i, RightArgIsConstant ? 0 : j, right, 1);
    }

+    /// Equality test for an `Array` of `Field`s: true when arr[pos] equals rhs under FieldVisitorAccurateEquals.
+    /// The trailing unnamed size_t is ignored.
+    static bool compare(const Array & arr, const Field & rhs, size_t pos, size_t)
+    {
+        const auto & element = arr[pos];
+        return applyVisitor(FieldVisitorAccurateEquals(), element, rhs);
+    }
+
+    /// NOTE: despite the name, every `lessOrEqual` overload answers "is target <= element?":
+    /// it returns true when the array element at the given position is greater than or equal to the target.
+    /// `lowerBound` uses this predicate to decide which half of the range to keep.
+    /// The trailing unnamed size_t is ignored by all overloads.
+    static constexpr bool lessOrEqual(const PaddedPODArray<Initial> & left, const Result & right, size_t i, size_t) noexcept
+    {
+        return left[i] >= right;
+    }
+
+    static constexpr bool lessOrEqual(const IColumn & left, const Result & right, size_t i, size_t) noexcept { return left[i] >= right; }
+
+    /// Deliberately NOT `noexcept`, matching the `compare(const Array &, const Field &, ...)` overload:
+    /// the comparison goes through applyVisitor on a Field visitor, which is not guaranteed to be non-throwing
+    /// (NOTE(review): presumably it throws on incomparable Field types - confirm in FieldVisitorsAccurateComparison.h).
+    /// Marking this wrapper `noexcept` would turn any such exception into std::terminate.
+    static constexpr bool lessOrEqual(const Array & arr, const Field & rhs, size_t pos, size_t)
+    {
+        return applyVisitor(FieldVisitorAccurateLessOrEqual(), rhs, arr[pos]);
+    }
+
 #pragma clang diagnostic pop

+public:
+    /** Binary search for the first occurrence of `target` in one array, assuming the array is sorted.
+      * The array occupies `array_size` elements of `data` starting at `current_offset`.
+      * Returns 0 when no element equal to `target` is found; otherwise the value produced by
+      * ConcreteAction::apply for the zero-based position of the first element that is >= target.
+      */
+    template <typename Data, typename Target>
+    static constexpr ResultType lowerBound(const Data & data, const Target & target, size_t array_size, ArrOffset current_offset)
+    {
+        size_t first = 0;
+        size_t last = array_size;
+        while (first < last)
+        {
+            const auto mid = first + (last - first) / 2;
+            /// True when the element at `mid` is greater than or equal to the target.
+            const auto at_or_after_target = lessOrEqual(data, target, current_offset + mid, 0);
+            /// Both bounds are updated with selects instead of an if/else to avoid conditional branching.
+            last = at_or_after_target ? mid : last;
+            first = at_or_after_target ? first : mid + 1;
+        }
+
+        ResultType found = 0;
+        /// `first` is now the lower bound; it is a hit only if it is inside the array and really equals the target.
+        if (first < array_size && compare(data, target, current_offset + first, 0))
+            ConcreteAction::apply(found, first);
+        return found;
+    }
+
+    /** Linear scan of one array (`array_size` elements of `data` starting at `current_offset`) for `target`.
+      * `Case` selects the NULL handling:
+      *   2 - the right argument (searched item) is Nullable;
+      *   3 - the left argument is an array of Nullables;
+      *   4 - both are Nullable (a NULL item matches a NULL element).
+      * Every match is reported through ConcreteAction::apply with the zero-based position inside the array;
+      * the scan stops after the first match unless ConcreteAction::resume_execution is set.
+      */
+    template <size_t Case, typename Data, typename Target>
+    static constexpr ResultType linearSearch(
+        const Data & data,
+        const Target & target,
+        size_t array_size,
+        const NullMap * const null_map_data,
+        const NullMap * const null_map_item,
+        size_t row_index,
+        ArrOffset current_offset)
+    {
+        ResultType result = 0;
+        for (size_t pos = 0; pos != array_size; ++pos)
+        {
+            bool matched = false;
+
+            if constexpr (Case == 4) /// Both args are nullable
+            {
+                const bool element_is_null = hasNull(null_map_data, current_offset + pos);
+                const bool item_is_null = hasNull(null_map_item, row_index);
+
+                /// Two NULLs match each other; otherwise both sides must be non-NULL and compare equal.
+                if (element_is_null == item_is_null)
+                    matched = element_is_null || compare(data, target, current_offset + pos, row_index);
+            }
+            else
+            {
+                bool skip = false;
+
+                if constexpr (Case == 2) /// Right arg is Nullable
+                    skip = hasNull(null_map_item, row_index);
+
+                if constexpr (Case == 3) /// Left arg is an array of Nullables
+                    skip = hasNull(null_map_data, current_offset + pos);
+
+                matched = !skip && compare(data, target, current_offset + pos, row_index);
+            }
+
+            if (!matched)
+                continue;
+
+            ConcreteAction::apply(result, pos);
+
+            if constexpr (!ConcreteAction::resume_execution)
+                break;
+        }
+        return result;
+    }
+
+    /// Linear scan of an `Array` of `Field`s: reports every element equal to `value`
+    /// (under FieldVisitorAccurateEquals) through ConcreteAction::apply, stopping after the first match
+    /// unless ConcreteAction::resume_execution is set. Returns 0 when nothing matches.
+    static ResultType linearSearchConst(const Array & arr, const Field & value)
+    {
+        ResultType result = 0;
+        const size_t count = arr.size();
+        for (size_t pos = 0; pos < count; ++pos)
+        {
+            if (applyVisitor(FieldVisitorAccurateEquals(), arr[pos], value))
+            {
+                ConcreteAction::apply(result, pos);
+
+                if constexpr (!ConcreteAction::resume_execution)
+                    break;
+            }
+        }
+        return result;
+    }
+
+private:
+    /** Looks up the target element in one array and returns the action result for that row.
+      * Binary search (lowerBound) is selected at compile time when all of the following hold:
+      *   1. the action is IndexOfAssumeSorted;
+      *   2. the array type is not nullable (Case == 1);
+      *   3. the target is neither a PaddedPODArray<Result> nor an IColumn, i.e. not a column or an array.
+      * In every other situation the ordinary linear scan is used.
+      */
+    template <size_t Case, typename Data, typename Target>
+    static constexpr ResultType getIndex(
+        const Data & data,
+        const Target & target,
+        size_t array_size,
+        const NullMap * const null_map_data,
+        const NullMap * const null_map_item,
+        size_t row_index,
+        ArrOffset current_offset)
+    {
+        constexpr bool sorted_lookup_requested = std::is_same_v<ConcreteAction, IndexOfAssumeSorted>;
+        constexpr bool target_is_single_value
+            = !(std::is_same_v<Target, PaddedPODArray<Result>> || std::is_same_v<Target, IColumn>);
+
+        if constexpr (sorted_lookup_requested && target_is_single_value && Case == 1)
+            return lowerBound(data, target, array_size, current_offset);
+
+        return linearSearch<Case>(data, target, array_size, null_map_data, null_map_item, row_index, current_offset);
+    }
+
    static constexpr bool hasNull(const NullMap * const null_map, size_t i) noexcept { return (*null_map)[i]; }

    template <size_t Case, typename Data, typename Target>
    static void process(
-        const Data & data, const ArrOffsets & offsets, const Target & target, ResultArr & result,
+        const Data & data,
+        const ArrOffsets & offsets,
+        const Target & target,
+        ResultArr & result,
        [[maybe_unused]] const NullMap * const null_map_data,
        [[maybe_unused]] const NullMap * const null_map_item)
    {
@ -129,7 +260,6 @@ private:
        }

        const size_t size = offsets.size();
-
        result.resize(size);

        ArrOffset current_offset = 0;
@ -137,39 +267,7 @@ private:
        for (size_t i = 0; i < size; ++i)
        {
            const size_t array_size = offsets[i] - current_offset;
-            ResultType current = 0;
-
-            for (size_t j = 0; j < array_size; ++j)
-            {
-                if constexpr (Case == 2) /// Right arg is Nullable
-                     if (hasNull(null_map_item, i))
-                        continue;
-
-                if constexpr (Case == 3) /// Left arg is an array of Nullables
-                    if (hasNull(null_map_data, current_offset + j))
-                        continue;
-
-                if constexpr (Case == 4) /// Both args are nullable
-                {
-                    const bool right_is_null = hasNull(null_map_data, current_offset + j);
-                    const bool left_is_null = hasNull(null_map_item, i);
-
-                    if (right_is_null != left_is_null)
-                        continue;
-
-                    if (!right_is_null && !compare(data, target, current_offset + j, i))
-                        continue;
-                }
-                else if (!compare(data, target, current_offset + j, i))
-                    continue;
-
-                ConcreteAction::apply(current, j);
-
-                if constexpr (!ConcreteAction::resume_execution)
-                    break;
-            }
-
-            result[i] = current;
+            result[i] = getIndex<Case>(data, target, array_size, null_map_data, null_map_item, i, current_offset);
            current_offset = offsets[i];
        }
    }
@ -854,16 +952,13 @@ private:
        {
            ResultType current = 0;
            const auto & value = (*item_arg)[0];
-
-            for (size_t i = 0, size = arr.size(); i < size; ++i)
+            if constexpr (std::is_same_v<ConcreteAction, IndexOfAssumeSorted>)
            {
-                if (!applyVisitor(FieldVisitorAccurateEquals(), arr[i], value))
-                    continue;
-
-                ConcreteAction::apply(current, i);
-
-                if constexpr (!ConcreteAction::resume_execution)
-                    break;
+                current = Impl::Main<ConcreteAction, true>::lowerBound(arr, value, arr.size(), 0);
+            }
+            else
+            {
+                current = Impl::Main<ConcreteAction, true>::linearSearchConst(arr, value);
            }

            return result_type->createColumnConst(item_arg->size(), current);
--- a/src/Functions/array/indexOfAssumeSorted.cpp
+++ b/src/Functions/array/indexOfAssumeSorted.cpp
@ -0,0 +1,28 @@
+#include "arrayIndex.h"
+#include <Functions/FunctionFactory.h>
+#include <Functions/IFunctionAdaptors.h>
+#include "Common/FunctionDocumentation.h"
+
+namespace DB
+{
+struct NameIndexOfAssumeSorted { static constexpr auto name = "indexOfAssumeSorted"; };
+
+/// indexOfAssumeSorted(arr, x) - returns the index of the first element equal to x (starting with 1)
+/// if it exists in the array, or 0 if it does not.
+/// Should be used only when the array is sorted in non-decreasing order (binary search is applied to the array).
+using FunctionIndexOfAssumeSorted = FunctionArrayIndex<IndexOfAssumeSorted, NameIndexOfAssumeSorted>;
+
+REGISTER_FUNCTION(IndexOfAssumeSorted)
+{
+    /// The example is self-contained (constant array, no table needed) and states its expected result:
+    /// the first 3 in [1, 2, 2, 2, 3, 3, 3, 4] is at position 5.
+    factory.registerFunction<FunctionIndexOfAssumeSorted>(FunctionDocumentation{
+        .description = R"(
+The function finds the position of the first occurrence of element X in the array.
+Indexing from one.
+The function can be used when the internal array type is not Nullable and the array is sorted in non-decreasing order.
+If the array type is Nullable, the 'indexOf' function will be used.
+The binary search algorithm is used for the search.
+For more details, see [https://en.wikipedia.org/wiki/Binary_search]
+For an unsorted array, the behavior is undefined.
+)",
+        .examples = {{.name = "", .query = "SELECT indexOfAssumeSorted([1, 2, 2, 2, 3, 3, 3, 4], 3);", .result = "5"}}});
+}
+}
--- a/tests/queries/0_stateless/03276_index_of_assume_sorted.reference
+++ b/tests/queries/0_stateless/03276_index_of_assume_sorted.reference
@ -0,0 +1,8 @@
+7
+5
+0
+0
+0
+8
+0
+0
--- a/tests/queries/0_stateless/03276_index_of_assume_sorted.sql
+++ b/tests/queries/0_stateless/03276_index_of_assume_sorted.sql
@ -0,0 +1,26 @@
+DROP TABLE IF EXISTS test;
+
+CREATE TABLE test(
+    id UInt64,
+    numbers Array(Int64)
+)
+ENGINE = MergeTree()
+ORDER BY id;
+
+INSERT INTO test VALUES(1, [1, 2, 2, 3, 3, 3, 4, 4, 4, 5, 6, 7]);
+INSERT INTO test VALUES (2, [1, 2, 3, 4, 5, 6, 7, 8]);
+INSERT INTO test VALUES(3, [1, 3, 7, 10]);
+INSERT INTO test VALUES(4, [0, 0, 0]);
+INSERT INTO test VALUES(5, [10, 10, 10]);
+
+SELECT indexOfAssumeSorted(numbers, 4) FROM test WHERE id = 1;
+SELECT indexOfAssumeSorted(numbers, 5) FROM test WHERE id = 2;
+SELECT indexOfAssumeSorted(numbers, 5) FROM test WHERE id = 3;
+SELECT indexOfAssumeSorted(numbers, 1) FROM test WHERE id = 4;
+SELECT indexOfAssumeSorted(numbers, 1) FROM test WHERE id = 5;
+
+SELECT indexOfAssumeSorted([1, 2, 2, 2, 3, 3, 3, 4, 4], 4);
+SELECT indexOfAssumeSorted([10, 10, 10], 1);
+SELECT indexOfAssumeSorted([1, 1, 1], 10);
+
+DROP TABLE IF EXISTS test;
--- a/utils/check-style/aspell-ignore/en/aspell-dict.txt
+++ b/utils/check-style/aspell-ignore/en/aspell-dict.txt
@ -1954,6 +1954,7 @@ ilike
 incrementing
 indexHint
 indexOf
+indexOfAssumeSorted
 inequal
 infi
 inflight