Merge pull request #72517 from erickurbanov/bsearchfunction

Add indexOfAssumeSorted Function. request #17795
This commit is contained in:
Vladimir Cherkasov 2024-12-06 08:55:40 +00:00 committed by GitHub
commit 68e77f6f33
No known key found for this signature in database
GPG Key ID: B5690EEEBB952194
9 changed files with 274 additions and 44 deletions

View File

@ -786,6 +786,24 @@ SELECT indexOf([1, 3, NULL, NULL], NULL)
Elements set to `NULL` are handled as normal values. Elements set to `NULL` are handled as normal values.
## indexOfAssumeSorted(arr, x)
Returns the index of the first x element (starting from 1) if it is in the array, or 0 if it is not.
The function should only be used for arrays sorted in non-decreasing (ascending) order, because it uses binary search. For an unsorted array the result is undefined.
If the array's element type is Nullable, binary search is not used and the function behaves like `indexOf`.
Example:
``` sql
SELECT indexOfAssumeSorted([1, 3, 3, 3, 4, 4, 5], 4)
```
``` text
┌─indexOfAssumeSorted([1, 3, 3, 3, 4, 4, 5], 4)─┐
│                                             5 │
└───────────────────────────────────────────────┘
```
## arrayCount(\[func,\] arr1, ...) ## arrayCount(\[func,\] arr1, ...)
Returns the number of elements for which `func(arr1[i], ..., arrN[i])` returns something other than 0. If `func` is not specified, it returns the number of non-zero elements in the array. Returns the number of elements for which `func(arr1[i], ..., arrN[i])` returns something other than 0. If `func` is not specified, it returns the number of non-zero elements in the array.

View File

@ -785,6 +785,24 @@ SELECT indexOf([1, 3, NULL, NULL], NULL)
`NULL` に設定された要素は通常の値として扱われます。 `NULL` に設定された要素は通常の値として扱われます。
## indexOfAssumeSorted(arr, x)
配列内にある場合は最初の'x'要素(1から始まる)のインデックスを返し、そうでない場合は0を返します。
この関数は検索に二分探索を使用するため、非降順(昇順)にソートされた配列に対してのみ使用してください。
配列の要素型がNullableの場合は、indexOf関数が使用されます。
例:
``` sql
SELECT indexOfAssumeSorted([1, 3, 3, 3, 4, 4, 5], 4)
```
``` text
┌─indexOfAssumeSorted([1, 3, 3, 3, 4, 4, 5], 4)─┐
│                                             5 │
└───────────────────────────────────────────────┘
```
## arrayCount(\[func,\] arr1, ...) ## arrayCount(\[func,\] arr1, ...)
`func(arr1[i], ..., arrN[i])`が0以外の値を返す要素の数を返します。`func` が指定されていない場合、配列内の0以外の要素の数を返します。 `func(arr1[i], ..., arrN[i])`が0以外の値を返す要素の数を返します。`func` が指定されていない場合、配列内の0以外の要素の数を返します。

View File

@ -306,6 +306,24 @@ SELECT indexOf([1, 3, NULL, NULL], NULL)
└───────────────────────────────────┘ └───────────────────────────────────┘
``` ```
## indexOfAssumeSorted(arr, x)
Возвращает индекс первого элемента x (начиная с 1), если он есть в массиве, или 0, если его нет.
Функция должна использоваться, если массив отсортирован в неубывающем порядке, так как используется бинарный поиск.
Если внутренний тип Nullable, то будет использована функция indexOf.
Пример:
``` sql
SELECT indexOfAssumeSorted([1, 3, 3, 3, 4, 4, 5], 4)
```
``` text
┌─indexOfAssumeSorted([1, 3, 3, 3, 4, 4, 5], 4)─┐
│                                             5 │
└───────────────────────────────────────────────┘
```
Элементы, равные `NULL`, обрабатываются как обычные значения. Элементы, равные `NULL`, обрабатываются как обычные значения.
## arrayCount(\[func,\] arr1, ...) {#array-count} ## arrayCount(\[func,\] arr1, ...) {#array-count}

View File

@ -337,6 +337,24 @@ SELECT indexOf([1, 3, NULL, NULL], NULL)
设置为«NULL»的元素将作为普通的元素值处理。 设置为«NULL»的元素将作为普通的元素值处理。
## indexOfAssumeSorted(arr, x)
返回数组中第一个 x 元素的索引(从 1 开始);如果数组中不存在 x 元素,则返回 0。
该函数使用二分查找,因此只应用于按非降序(升序)排序的数组。
如果数组的元素类型为 Nullable,则将使用 indexOf 函数。
示例:
``` sql
SELECT indexOfAssumeSorted([1, 3, 3, 3, 4, 4, 5], 4)
```
``` text
┌─indexOfAssumeSorted([1, 3, 3, 3, 4, 4, 5], 4)─┐
│                                             5 │
└───────────────────────────────────────────────┘
```
## arrayCount(\[func,\] arr1, ...) {#array-count} ## arrayCount(\[func,\] arr1, ...) {#array-count}
`func`将arr数组作为参数其返回结果为非零值的数量。如果未指定“func”则返回数组中非零元素的数量。 `func`将arr数组作为参数其返回结果为非零值的数量。如果未指定“func”则返回数组中非零元素的数量。

View File

@ -1,5 +1,6 @@
#pragma once #pragma once
#include <cstddef> #include <cstddef>
#include <type_traits>
#include <Functions/IFunction.h> #include <Functions/IFunction.h>
#include <Functions/FunctionFactory.h> #include <Functions/FunctionFactory.h>
#include <Functions/FunctionHelpers.h> #include <Functions/FunctionHelpers.h>
@ -14,6 +15,7 @@
#include <Columns/ColumnFixedString.h> #include <Columns/ColumnFixedString.h>
#include <Columns/ColumnsNumber.h> #include <Columns/ColumnsNumber.h>
#include <Columns/ColumnNullable.h> #include <Columns/ColumnNullable.h>
#include "Common/FieldVisitors.h"
#include "Common/Logger.h" #include "Common/Logger.h"
#include "Common/logger_useful.h" #include "Common/logger_useful.h"
#include <Common/FieldVisitorsAccurateComparison.h> #include <Common/FieldVisitorsAccurateComparison.h>
@ -53,6 +55,10 @@ struct IndexOfAction
static constexpr void apply(ResultType& current, size_t j) noexcept { current = j + 1; } static constexpr void apply(ResultType& current, size_t j) noexcept { current = j + 1; }
}; };
/// Action tag for `indexOfAssumeSorted`.
/// It inherits everything from IndexOfAction unchanged; the distinct type exists only so that
/// the implementation can recognise at compile time (via std::is_same_v) that the caller has
/// promised a sorted array, which allows a binary search instead of a linear scan.
struct IndexOfAssumeSorted : IndexOfAction
{
};
struct CountEqualAction struct CountEqualAction
{ {
using ResultType = UInt64; using ResultType = UInt64;
@ -111,13 +117,138 @@ private:
return 0 == left.compareAt(i, RightArgIsConstant ? 0 : j, right, 1); return 0 == left.compareAt(i, RightArgIsConstant ? 0 : j, right, 1);
} }
/// Equality check used when the array is a constant `Array` of `Field`s:
/// returns true when the element at position `pos` equals `rhs` according to
/// FieldVisitorAccurateEquals. The last (unnamed) argument is ignored; it only keeps the
/// signature parallel to the other `compare` overloads.
static bool compare(const Array & arr, const Field & rhs, size_t pos, size_t)
{
    const auto & element = arr[pos];
    return applyVisitor(FieldVisitorAccurateEquals(), element, rhs);
}
/** Ordering predicates used by the binary search in lowerBound.
  *
  * Note the direction: despite the name, every overload answers
  * "is the searched value less than or equal to the array element at the given position?",
  * i.e. `target <= element`. lowerBound relies on exactly this orientation.
  *
  * The trailing unnamed size_t is ignored; it only keeps the signatures parallel to `compare`.
  */

/// Numeric data: compares the element `left[i]` with the searched value `right`.
static constexpr bool lessOrEqual(const PaddedPODArray<Initial> & left, const Result & right, size_t i, size_t) noexcept
{
    return left[i] >= right;
}

/// Generic column data.
/// NOTE(review): getIndex never selects the binary search when the target is an IColumn, so this
/// overload looks unused; `noexcept` is left as is here, but confirm that `left[i] >= right`
/// really cannot throw before relying on it.
static constexpr bool lessOrEqual(const IColumn & left, const Result & right, size_t i, size_t) noexcept { return left[i] >= right; }

/// Constant `Array` of `Field`s: true when `rhs <= arr[pos]` according to FieldVisitorAccurateLessOrEqual.
/// Deliberately NOT noexcept: the comparison is delegated to a Field visitor, and if that visitor
/// throws, a `noexcept` here would turn the exception into std::terminate. This also matches the
/// sibling `compare(const Array &, const Field &, ...)` overload, which is not noexcept either.
static constexpr bool lessOrEqual(const Array & arr, const Field & rhs, size_t pos, size_t)
{
    return applyVisitor(FieldVisitorAccurateLessOrEqual(), rhs, arr[pos]);
}
#pragma clang diagnostic pop #pragma clang diagnostic pop
public:
/** Binary search inside a single array that the caller guarantees to be sorted in
  * non-decreasing order.
  *
  * The array occupies positions [current_offset, current_offset + array_size) of `data`.
  * The loop narrows [first, last) down to the first position whose element is not less
  * than `target`; that position is then checked for equality with `compare`.
  *
  * Returns the value produced by ConcreteAction::apply for the found position,
  * or 0 when the target is absent (or the array is empty).
  */
template <typename Data, typename Target>
static constexpr ResultType lowerBound(const Data & data, const Target & target, size_t array_size, ArrOffset current_offset)
{
    size_t first = 0;
    size_t last = array_size;

    while (first < last)
    {
        const size_t mid = first + (last - first) / 2;

        /// True when target <= data[current_offset + mid] (see lessOrEqual).
        const bool target_not_greater = lessOrEqual(data, target, current_offset + mid, 0);

        /// Both bounds are updated with selects instead of an if/else to avoid conditional branching.
        last = target_not_greater ? mid : last;
        first = target_not_greater ? first : mid + 1;
    }

    ResultType found = 0;

    /// `first` is only a candidate: it may be past the end, or point at a greater element.
    if (first < array_size && compare(data, target, current_offset + first, 0))
        ConcreteAction::apply(found, first);

    return found;
}
/** Sequential scan of a single array.
  *
  * The array occupies positions [current_offset, current_offset + array_size) of `data`;
  * `row_index` is the row whose target value is being looked up.
  *
  * `Case` selects the NULL handling at compile time:
  *   2 - the right argument (searched item) is Nullable: a NULL item matches nothing;
  *   3 - the left argument is an array of Nullables: NULL elements are skipped;
  *   4 - both are Nullable: NULL matches NULL, otherwise the values are compared;
  *   any other value - plain comparison without null maps.
  *
  * Every matching position is passed to ConcreteAction::apply; the scan stops after the
  * first match unless ConcreteAction::resume_execution is set.
  * Returns the accumulated result (0 if nothing matched).
  */
template <size_t Case, typename Data, typename Target>
static constexpr ResultType linearSearch(
    const Data & data,
    const Target & target,
    size_t array_size,
    const NullMap * const null_map_data,
    const NullMap * const null_map_item,
    size_t row_index,
    ArrOffset current_offset)
{
    ResultType current = 0;

    for (size_t j = 0; j < array_size; ++j)
    {
        /// Absolute position of the j-th element of this array inside `data`.
        const size_t element_pos = current_offset + j;

        if constexpr (Case == 2) /// Right arg is Nullable
        {
            if (hasNull(null_map_item, row_index))
                continue;
        }

        if constexpr (Case == 3) /// Left arg is an array of Nullables
        {
            if (hasNull(null_map_data, element_pos))
                continue;
        }

        if constexpr (Case == 4) /// Both args are nullable
        {
            const bool element_is_null = hasNull(null_map_data, element_pos);
            const bool item_is_null = hasNull(null_map_item, row_index);

            /// Exactly one side is NULL: cannot be equal.
            if (element_is_null != item_is_null)
                continue;

            /// Both non-NULL: fall back to comparing the values. Both NULL counts as a match.
            if (!element_is_null && !compare(data, target, element_pos, row_index))
                continue;
        }
        else
        {
            if (!compare(data, target, element_pos, row_index))
                continue;
        }

        ConcreteAction::apply(current, j);

        if constexpr (!ConcreteAction::resume_execution)
            break;
    }

    return current;
}
/// Sequential scan of a constant `Array` for `value`.
/// Elements are compared with FieldVisitorAccurateEquals; each match is passed to
/// ConcreteAction::apply, and the scan stops after the first match unless
/// ConcreteAction::resume_execution is set. Returns 0 when nothing matched.
static ResultType linearSearchConst(const Array & arr, const Field & value)
{
    ResultType current = 0;
    const size_t size = arr.size();

    for (size_t i = 0; i < size; ++i)
    {
        if (applyVisitor(FieldVisitorAccurateEquals(), arr[i], value))
        {
            ConcreteAction::apply(current, i);

            if constexpr (!ConcreteAction::resume_execution)
                break;
        }
    }

    return current;
}
private:
/** Computes the result for one array (one row): dispatches between binary and linear search.
  *
  * Binary search (lowerBound) is chosen at compile time only when all of the following hold:
  *   1. the action is IndexOfAssumeSorted;
  *   2. neither argument is Nullable (Case == 1);
  *   3. the target is a single value, i.e. not a PaddedPODArray<Result> and not an IColumn.
  * In every other configuration the regular linear scan is used.
  */
template <size_t Case, typename Data, typename Target>
static constexpr ResultType getIndex(
    const Data & data,
    const Target & target,
    size_t array_size,
    const NullMap * const null_map_data,
    const NullMap * const null_map_item,
    size_t row_index,
    ArrOffset current_offset)
{
    constexpr bool assume_sorted = std::is_same_v<ConcreteAction, IndexOfAssumeSorted>;
    constexpr bool target_is_column = std::is_same_v<Target, PaddedPODArray<Result>> || std::is_same_v<Target, IColumn>;

    if constexpr (assume_sorted && !target_is_column && Case == 1)
    {
        return lowerBound(data, target, array_size, current_offset);
    }

    return linearSearch<Case>(data, target, array_size, null_map_data, null_map_item, row_index, current_offset);
}
static constexpr bool hasNull(const NullMap * const null_map, size_t i) noexcept { return (*null_map)[i]; } static constexpr bool hasNull(const NullMap * const null_map, size_t i) noexcept { return (*null_map)[i]; }
template <size_t Case, typename Data, typename Target> template <size_t Case, typename Data, typename Target>
static void process( static void process(
const Data & data, const ArrOffsets & offsets, const Target & target, ResultArr & result, const Data & data,
const ArrOffsets & offsets,
const Target & target,
ResultArr & result,
[[maybe_unused]] const NullMap * const null_map_data, [[maybe_unused]] const NullMap * const null_map_data,
[[maybe_unused]] const NullMap * const null_map_item) [[maybe_unused]] const NullMap * const null_map_item)
{ {
@ -129,7 +260,6 @@ private:
} }
const size_t size = offsets.size(); const size_t size = offsets.size();
result.resize(size); result.resize(size);
ArrOffset current_offset = 0; ArrOffset current_offset = 0;
@ -137,39 +267,7 @@ private:
for (size_t i = 0; i < size; ++i) for (size_t i = 0; i < size; ++i)
{ {
const size_t array_size = offsets[i] - current_offset; const size_t array_size = offsets[i] - current_offset;
ResultType current = 0; result[i] = getIndex<Case>(data, target, array_size, null_map_data, null_map_item, i, current_offset);
for (size_t j = 0; j < array_size; ++j)
{
if constexpr (Case == 2) /// Right arg is Nullable
if (hasNull(null_map_item, i))
continue;
if constexpr (Case == 3) /// Left arg is an array of Nullables
if (hasNull(null_map_data, current_offset + j))
continue;
if constexpr (Case == 4) /// Both args are nullable
{
const bool right_is_null = hasNull(null_map_data, current_offset + j);
const bool left_is_null = hasNull(null_map_item, i);
if (right_is_null != left_is_null)
continue;
if (!right_is_null && !compare(data, target, current_offset + j, i))
continue;
}
else if (!compare(data, target, current_offset + j, i))
continue;
ConcreteAction::apply(current, j);
if constexpr (!ConcreteAction::resume_execution)
break;
}
result[i] = current;
current_offset = offsets[i]; current_offset = offsets[i];
} }
} }
@ -854,16 +952,13 @@ private:
{ {
ResultType current = 0; ResultType current = 0;
const auto & value = (*item_arg)[0]; const auto & value = (*item_arg)[0];
if constexpr (std::is_same_v<ConcreteAction, IndexOfAssumeSorted>)
for (size_t i = 0, size = arr.size(); i < size; ++i)
{ {
if (!applyVisitor(FieldVisitorAccurateEquals(), arr[i], value)) current = Impl::Main<ConcreteAction, true>::lowerBound(arr, value, arr.size(), 0);
continue; }
else
ConcreteAction::apply(current, i); {
current = Impl::Main<ConcreteAction, true>::linearSearchConst(arr, value);
if constexpr (!ConcreteAction::resume_execution)
break;
} }
return result_type->createColumnConst(item_arg->size(), current); return result_type->createColumnConst(item_arg->size(), current);

View File

@ -0,0 +1,28 @@
#include "arrayIndex.h"
#include <Functions/FunctionFactory.h>
#include <Functions/IFunctionAdaptors.h>
#include "Common/FunctionDocumentation.h"
namespace DB
{
/// Carries the SQL-visible name of the function as a compile-time constant.
struct NameIndexOfAssumeSorted
{
    static constexpr auto name = "indexOfAssumeSorted";
};
/// indexOfAssumeSorted(arr, x) - returns the index of the first element equal to x (starting with 1)
/// if it exists in the array, or 0 if it does not.
/// Should be used only when the array is sorted (a binary search is applied to the array).
using FunctionIndexOfAssumeSorted = FunctionArrayIndex<IndexOfAssumeSorted, NameIndexOfAssumeSorted>;
/// Registers `indexOfAssumeSorted` in the function factory together with its built-in documentation.
/// The example is a self-contained constant query (no table needed) with its actual result:
/// the first `3` in [1, 2, 2, 2, 3, 3, 3, 4] is at position 5 (1-based).
REGISTER_FUNCTION(IndexOfAssumeSorted)
{
    factory.registerFunction<FunctionIndexOfAssumeSorted>(FunctionDocumentation{
        .description = R"(
The function finds the position of the first occurrence of element X in the array.
Indexing from one.
The function can be used when the internal array type is not Nullable and the array is sorted in non-decreasing order.
If the array type is Nullable, the 'indexOf' function will be used.
The binary search algorithm is used for the search.
For more details, see [Binary search](https://en.wikipedia.org/wiki/Binary_search).
For an unsorted array, the behavior is undefined.
)",
        .examples = {{.name = "", .query = "SELECT indexOfAssumeSorted([1, 2, 2, 2, 3, 3, 3, 4], 3);", .result = "5"}}});
}
}

View File

@ -0,0 +1,8 @@
7
5
0
0
0
8
0
0

View File

@ -0,0 +1,26 @@
-- Functional test for indexOfAssumeSorted: column (non-constant) arrays first, then constant arrays.
DROP TABLE IF EXISTS test;
CREATE TABLE test(
id UInt64,
numbers Array(Int64)
)
ENGINE = MergeTree()
ORDER BY id;
-- All arrays are sorted in non-decreasing order, as the function requires.
INSERT INTO test VALUES(1, [1, 2, 2, 3, 3, 3, 4, 4, 4, 5, 6, 7]);
INSERT INTO test VALUES (2, [1, 2, 3, 4, 5, 6, 7, 8]);
INSERT INTO test VALUES(3, [1, 3, 7, 10]);
INSERT INTO test VALUES(4, [0, 0, 0]);
INSERT INTO test VALUES(5, [10, 10, 10]);
-- Duplicates present: the first occurrence of 4 is expected (position 7).
SELECT indexOfAssumeSorted(numbers, 4) FROM test WHERE id = 1;
-- No duplicates: 5 is at position 5.
SELECT indexOfAssumeSorted(numbers, 5) FROM test WHERE id = 2;
-- Value falls between existing elements: expected 0.
SELECT indexOfAssumeSorted(numbers, 5) FROM test WHERE id = 3;
-- Value greater than every element: expected 0.
SELECT indexOfAssumeSorted(numbers, 1) FROM test WHERE id = 4;
-- Value smaller than every element: expected 0.
SELECT indexOfAssumeSorted(numbers, 1) FROM test WHERE id = 5;
-- Constant arrays: first occurrence of 4 is at position 8; the other two values are absent (0).
SELECT indexOfAssumeSorted([1, 2, 2, 2, 3, 3, 3, 4, 4], 4);
SELECT indexOfAssumeSorted([10, 10, 10], 1);
SELECT indexOfAssumeSorted([1, 1, 1], 10);
DROP TABLE IF EXISTS test;

View File

@ -1954,6 +1954,7 @@ ilike
incrementing incrementing
indexHint indexHint
indexOf indexOf
indexOfAssumeSorted
inequal inequal
infi infi
inflight inflight