mirror of
https://github.com/ClickHouse/ClickHouse.git
synced 2024-12-14 10:22:10 +00:00
Merge pull request #72517 from erickurbanov/bsearchfunction
Add indexOfAssumeSorted Function. request #17795
This commit is contained in:
commit
68e77f6f33
@ -786,6 +786,24 @@ SELECT indexOf([1, 3, NULL, NULL], NULL)
|
||||
|
||||
Elements set to `NULL` are handled as normal values.
|
||||
|
||||
## indexOfAssumeSorted(arr, x)
|
||||
|
||||
Returns the index of the first ‘x’ element (starting from 1) if it is in the array, or 0 if it is not.
|
||||
Use this function only on arrays sorted in non-descending (ascending) order, because it performs a binary search.
|
||||
If the array's element type is Nullable, the `indexOf` function is used instead.
|
||||
|
||||
Example:
|
||||
|
||||
``` sql
|
||||
SELECT indexOfAssumeSorted([1, 3, 3, 3, 4, 4, 5], 4)
|
||||
```
|
||||
|
||||
``` text
|
||||
┌─indexOfAssumeSorted([1, 3, 3, 3, 4, 4, 5], 4)─┐
|
||||
│                                             5 │
|
||||
└───────────────────────────────────────────────┘
|
||||
```
|
||||
|
||||
## arrayCount(\[func,\] arr1, ...)
|
||||
|
||||
Returns the number of elements for which `func(arr1[i], ..., arrN[i])` returns something other than 0. If `func` is not specified, it returns the number of non-zero elements in the array.
|
||||
|
@ -785,6 +785,24 @@ SELECT indexOf([1, 3, NULL, NULL], NULL)
|
||||
|
||||
`NULL` に設定された要素は通常の値として扱われます。
|
||||
|
||||
## indexOfAssumeSorted(arr, x)
|
||||
|
||||
配列内にある場合は最初の'x'要素(1から始まる)のインデックスを返し、そうでない場合は0を返します。
|
||||
この関数は二分探索を使用するため、昇順(非降順)にソートされた配列に対してのみ使用してください。
|
||||
配列の要素型がNullableの場合は、代わりに`indexOf`関数が使用されます。
|
||||
|
||||
例:
|
||||
|
||||
``` sql
|
||||
SELECT indexOfAssumeSorted([1, 3, 3, 3, 4, 4, 5], 4)
|
||||
```
|
||||
|
||||
``` text
|
||||
┌─indexOfAssumeSorted([1, 3, 3, 3, 4, 4, 5], 4)─┐
|
||||
│                                             5 │
|
||||
└───────────────────────────────────────────────┘
|
||||
```
|
||||
|
||||
## arrayCount(\[func,\] arr1, ...)
|
||||
|
||||
`func(arr1[i], ..., arrN[i])`が0以外の値を返す要素の数を返します。`func` が指定されていない場合、配列内の0以外の要素の数を返します。
|
||||
|
@ -306,6 +306,24 @@ SELECT indexOf([1, 3, NULL, NULL], NULL)
|
||||
└───────────────────────────────────┘
|
||||
```
|
||||
|
||||
## indexOfAssumeSorted(arr, x)
|
||||
|
||||
Возвращает индекс первого элемента x (начиная с 1), если он есть в массиве, или 0, если его нет.
|
||||
Функция должна использоваться, если массив отсортирован в неубывающем порядке, так как используется бинарный поиск.
|
||||
Если тип элементов массива — Nullable, то будет использована функция `indexOf`.
|
||||
|
||||
Пример:
|
||||
|
||||
``` sql
|
||||
SELECT indexOfAssumeSorted([1, 3, 3, 3, 4, 4, 5], 4)
|
||||
```
|
||||
|
||||
``` text
|
||||
┌─indexOfAssumeSorted([1, 3, 3, 3, 4, 4, 5], 4)─┐
|
||||
│                                             5 │
|
||||
└───────────────────────────────────────────────┘
|
||||
```
|
||||
|
||||
Элементы, равные `NULL`, обрабатываются как обычные значения.
|
||||
|
||||
## arrayCount(\[func,\] arr1, ...) {#array-count}
|
||||
|
@ -337,6 +337,24 @@ SELECT indexOf([1, 3, NULL, NULL], NULL)
|
||||
|
||||
设置为«NULL»的元素将作为普通的元素值处理。
|
||||
|
||||
## indexOfAssumeSorted(arr, x)
|
||||
|
||||
返回数组中第一个`x`元素的索引(从1开始),如果数组中不存在`x`元素,则返回0。
|
||||
该函数应仅用于按非降序(升序)排序的数组,因为它使用二分查找进行搜索。
|
||||
如果数组的元素类型为 Nullable,则将使用 `indexOf` 函数。
|
||||
|
||||
示例:
|
||||
|
||||
``` sql
|
||||
SELECT indexOfAssumeSorted([1, 3, 3, 3, 4, 4, 5], 4)
|
||||
```
|
||||
|
||||
``` text
|
||||
┌─indexOfAssumeSorted([1, 3, 3, 3, 4, 4, 5], 4)─┐
|
||||
│                                             5 │
|
||||
└───────────────────────────────────────────────┘
|
||||
```
|
||||
|
||||
## arrayCount(\[func,\] arr1, ...) {#array-count}
|
||||
|
||||
`func`将arr数组作为参数,其返回结果为非零值的数量。如果未指定“func”,则返回数组中非零元素的数量。
|
||||
|
@ -1,5 +1,6 @@
|
||||
#pragma once
|
||||
#include <cstddef>
|
||||
#include <type_traits>
|
||||
#include <Functions/IFunction.h>
|
||||
#include <Functions/FunctionFactory.h>
|
||||
#include <Functions/FunctionHelpers.h>
|
||||
@ -14,6 +15,7 @@
|
||||
#include <Columns/ColumnFixedString.h>
|
||||
#include <Columns/ColumnsNumber.h>
|
||||
#include <Columns/ColumnNullable.h>
|
||||
#include "Common/FieldVisitors.h"
|
||||
#include "Common/Logger.h"
|
||||
#include "Common/logger_useful.h"
|
||||
#include <Common/FieldVisitorsAccurateComparison.h>
|
||||
@ -53,6 +55,10 @@ struct IndexOfAction
|
||||
static constexpr void apply(ResultType& current, size_t j) noexcept { current = j + 1; }
|
||||
};
|
||||
|
||||
struct IndexOfAssumeSorted : public IndexOfAction
|
||||
{
|
||||
};
|
||||
|
||||
struct CountEqualAction
|
||||
{
|
||||
using ResultType = UInt64;
|
||||
@ -111,13 +117,138 @@ private:
|
||||
return 0 == left.compareAt(i, RightArgIsConstant ? 0 : j, right, 1);
|
||||
}
|
||||
|
||||
static bool compare(const Array & arr, const Field& rhs, size_t pos, size_t)
|
||||
{
|
||||
return applyVisitor(FieldVisitorAccurateEquals(), arr[pos], rhs);
|
||||
}
|
||||
|
||||
static constexpr bool lessOrEqual(const PaddedPODArray<Initial> & left, const Result & right, size_t i, size_t) noexcept
|
||||
{
|
||||
return left[i] >= right;
|
||||
}
|
||||
|
||||
/// Same predicate for a generic column: true when the value at `i` is not less than `right`.
/// The trailing index argument is ignored.
static constexpr bool lessOrEqual(const IColumn & left, const Result & right, size_t i, size_t) noexcept
{
    return left[i] >= right;
}
|
||||
|
||||
static constexpr bool lessOrEqual(const Array& arr, const Field& rhs, size_t pos, size_t) noexcept {
|
||||
return applyVisitor(FieldVisitorAccurateLessOrEqual(), rhs, arr[pos]);
|
||||
}
|
||||
|
||||
#pragma clang diagnostic pop
|
||||
|
||||
public:
|
||||
/** Assuming that the array is sorted, use a binary search */
|
||||
template <typename Data, typename Target>
|
||||
static constexpr ResultType lowerBound(const Data & data, const Target & target, size_t array_size, ArrOffset current_offset)
|
||||
{
|
||||
ResultType current = 0;
|
||||
size_t low = 0, high = array_size;
|
||||
while (high - low > 0)
|
||||
{
|
||||
auto middle = low + ((high - low) >> 1);
|
||||
auto compare_result = lessOrEqual(data, target, current_offset + middle, 0);
|
||||
/// avoid conditional branching
|
||||
high = compare_result ? middle : high;
|
||||
low = compare_result ? low : middle + 1;
|
||||
}
|
||||
if (low < array_size && compare(data, target, current_offset + low, 0))
|
||||
{
|
||||
ConcreteAction::apply(current, low);
|
||||
}
|
||||
return current;
|
||||
}
|
||||
|
||||
template <size_t Case, typename Data, typename Target>
|
||||
static constexpr ResultType linearSearch(
|
||||
const Data & data,
|
||||
const Target & target,
|
||||
size_t array_size,
|
||||
const NullMap * const null_map_data,
|
||||
const NullMap * const null_map_item,
|
||||
size_t row_index,
|
||||
ArrOffset current_offset)
|
||||
{
|
||||
ResultType current = 0;
|
||||
for (size_t j = 0; j < array_size; ++j)
|
||||
{
|
||||
if constexpr (Case == 2) /// Right arg is Nullable
|
||||
if (hasNull(null_map_item, row_index))
|
||||
continue;
|
||||
|
||||
if constexpr (Case == 3) /// Left arg is an array of Nullables
|
||||
if (hasNull(null_map_data, current_offset + j))
|
||||
continue;
|
||||
|
||||
if constexpr (Case == 4) /// Both args are nullable
|
||||
{
|
||||
const bool right_is_null = hasNull(null_map_data, current_offset + j);
|
||||
const bool left_is_null = hasNull(null_map_item, row_index);
|
||||
|
||||
if (right_is_null != left_is_null)
|
||||
continue;
|
||||
|
||||
if (!right_is_null && !compare(data, target, current_offset + j, row_index))
|
||||
continue;
|
||||
}
|
||||
else if (!compare(data, target, current_offset + j, row_index))
|
||||
continue;
|
||||
|
||||
ConcreteAction::apply(current, j);
|
||||
|
||||
if constexpr (!ConcreteAction::resume_execution)
|
||||
break;
|
||||
}
|
||||
return current;
|
||||
}
|
||||
|
||||
/// Linear scan of a constant array for elements equal to `value` (FieldVisitorAccurateEquals).
/// Calls ConcreteAction::apply for each match and stops after the first one unless
/// ConcreteAction::resume_execution is set.
static ResultType linearSearchConst(const Array & arr, const Field & value)
{
    ResultType result = 0;
    const size_t count = arr.size();

    for (size_t idx = 0; idx < count; ++idx)
    {
        if (applyVisitor(FieldVisitorAccurateEquals(), arr[idx], value))
        {
            ConcreteAction::apply(result, idx);

            if constexpr (!ConcreteAction::resume_execution)
                break;
        }
    }

    return result;
}
|
||||
|
||||
private:
|
||||
/** Looking for the target element index in the data (array) */
|
||||
template <size_t Case, typename Data, typename Target>
|
||||
static constexpr ResultType getIndex(
|
||||
const Data & data,
|
||||
const Target & target,
|
||||
size_t array_size,
|
||||
const NullMap * const null_map_data,
|
||||
const NullMap * const null_map_item,
|
||||
size_t row_index,
|
||||
ArrOffset current_offset)
|
||||
{
|
||||
/** Use binary search if the following conditions are met.
|
||||
* 1. The array type is not nullable. (Case = 1)
|
||||
* 2. Target is not a column or an array.
|
||||
*/
|
||||
if constexpr (
|
||||
std::is_same_v<ConcreteAction, IndexOfAssumeSorted> && !std::is_same_v<Target, PaddedPODArray<Result>>
|
||||
&& !std::is_same_v<Target, IColumn> && Case == 1)
|
||||
{
|
||||
return lowerBound(data, target, array_size, current_offset);
|
||||
}
|
||||
return linearSearch<Case>(data, target, array_size, null_map_data, null_map_item, row_index, current_offset);
|
||||
}
|
||||
|
||||
static constexpr bool hasNull(const NullMap * const null_map, size_t i) noexcept { return (*null_map)[i]; }
|
||||
|
||||
template <size_t Case, typename Data, typename Target>
|
||||
static void process(
|
||||
const Data & data, const ArrOffsets & offsets, const Target & target, ResultArr & result,
|
||||
const Data & data,
|
||||
const ArrOffsets & offsets,
|
||||
const Target & target,
|
||||
ResultArr & result,
|
||||
[[maybe_unused]] const NullMap * const null_map_data,
|
||||
[[maybe_unused]] const NullMap * const null_map_item)
|
||||
{
|
||||
@ -129,7 +260,6 @@ private:
|
||||
}
|
||||
|
||||
const size_t size = offsets.size();
|
||||
|
||||
result.resize(size);
|
||||
|
||||
ArrOffset current_offset = 0;
|
||||
@ -137,39 +267,7 @@ private:
|
||||
for (size_t i = 0; i < size; ++i)
|
||||
{
|
||||
const size_t array_size = offsets[i] - current_offset;
|
||||
ResultType current = 0;
|
||||
|
||||
for (size_t j = 0; j < array_size; ++j)
|
||||
{
|
||||
if constexpr (Case == 2) /// Right arg is Nullable
|
||||
if (hasNull(null_map_item, i))
|
||||
continue;
|
||||
|
||||
if constexpr (Case == 3) /// Left arg is an array of Nullables
|
||||
if (hasNull(null_map_data, current_offset + j))
|
||||
continue;
|
||||
|
||||
if constexpr (Case == 4) /// Both args are nullable
|
||||
{
|
||||
const bool right_is_null = hasNull(null_map_data, current_offset + j);
|
||||
const bool left_is_null = hasNull(null_map_item, i);
|
||||
|
||||
if (right_is_null != left_is_null)
|
||||
continue;
|
||||
|
||||
if (!right_is_null && !compare(data, target, current_offset + j, i))
|
||||
continue;
|
||||
}
|
||||
else if (!compare(data, target, current_offset + j, i))
|
||||
continue;
|
||||
|
||||
ConcreteAction::apply(current, j);
|
||||
|
||||
if constexpr (!ConcreteAction::resume_execution)
|
||||
break;
|
||||
}
|
||||
|
||||
result[i] = current;
|
||||
result[i] = getIndex<Case>(data, target, array_size, null_map_data, null_map_item, i, current_offset);
|
||||
current_offset = offsets[i];
|
||||
}
|
||||
}
|
||||
@ -854,16 +952,13 @@ private:
|
||||
{
|
||||
ResultType current = 0;
|
||||
const auto & value = (*item_arg)[0];
|
||||
|
||||
for (size_t i = 0, size = arr.size(); i < size; ++i)
|
||||
if constexpr (std::is_same_v<ConcreteAction, IndexOfAssumeSorted>)
|
||||
{
|
||||
if (!applyVisitor(FieldVisitorAccurateEquals(), arr[i], value))
|
||||
continue;
|
||||
|
||||
ConcreteAction::apply(current, i);
|
||||
|
||||
if constexpr (!ConcreteAction::resume_execution)
|
||||
break;
|
||||
current = Impl::Main<ConcreteAction, true>::lowerBound(arr, value, arr.size(), 0);
|
||||
}
|
||||
else
|
||||
{
|
||||
current = Impl::Main<ConcreteAction, true>::linearSearchConst(arr, value);
|
||||
}
|
||||
|
||||
return result_type->createColumnConst(item_arg->size(), current);
|
||||
|
28
src/Functions/array/indexOfAssumeSorted.cpp
Normal file
28
src/Functions/array/indexOfAssumeSorted.cpp
Normal file
@ -0,0 +1,28 @@
|
||||
#include "arrayIndex.h"
|
||||
#include <Functions/FunctionFactory.h>
|
||||
#include <Functions/IFunctionAdaptors.h>
|
||||
#include "Common/FunctionDocumentation.h"
|
||||
|
||||
namespace DB
|
||||
{
|
||||
struct NameIndexOfAssumeSorted { static constexpr auto name = "indexOfAssumeSorted"; };
|
||||
|
||||
/// indexOfAssumeSorted(arr, x) - returns the index of the element x (starting with 1), if it exists in the array, or 0 if it
|
||||
/// should be used when the array is sorted (applies binary search to array)
|
||||
using FunctionIndexOfAssumeSorted = FunctionArrayIndex<IndexOfAssumeSorted, NameIndexOfAssumeSorted>;
|
||||
|
||||
REGISTER_FUNCTION(IndexOfAssumeSorted)
|
||||
{
|
||||
factory.registerFunction<FunctionIndexOfAssumeSorted>(FunctionDocumentation{
|
||||
.description = R"(
|
||||
The function finds the position of the first occurrence of element X in the array.
|
||||
Indexing from one.
|
||||
The function can be used when the internal array type is not Nullable and the array is sorted in non-decreasing order.
|
||||
If the array type is Nullable, the 'indexOf' function will be used.
|
||||
The binary search algorithm is used for the search.
|
||||
For more details, see [https://en.wikipedia.org/wiki/Binary_search]
|
||||
For an unsorted array, the behavior is undefined.
|
||||
)",
|
||||
.examples = {{.name = "", .query = "SELECT indexOfAssumeSorted([1, 2, 2, 2, 3, 3, 3, 4], 3) FROM test_table;", .result=""}}});
|
||||
}
|
||||
}
|
@ -0,0 +1,8 @@
|
||||
7
|
||||
5
|
||||
0
|
||||
0
|
||||
0
|
||||
8
|
||||
0
|
||||
0
|
26
tests/queries/0_stateless/03276_index_of_assume_sorted.sql
Normal file
26
tests/queries/0_stateless/03276_index_of_assume_sorted.sql
Normal file
@ -0,0 +1,26 @@
|
||||
-- indexOfAssumeSorted on table columns and on constant arrays.
DROP TABLE IF EXISTS test;

CREATE TABLE test (id UInt64, numbers Array(Int64)) ENGINE = MergeTree() ORDER BY id;

INSERT INTO test VALUES (1, [1, 2, 2, 3, 3, 3, 4, 4, 4, 5, 6, 7]);
INSERT INTO test VALUES (2, [1, 2, 3, 4, 5, 6, 7, 8]);
INSERT INTO test VALUES (3, [1, 3, 7, 10]);
INSERT INTO test VALUES (4, [0, 0, 0]);
INSERT INTO test VALUES (5, [10, 10, 10]);

-- Column argument: first occurrence among duplicates, unique element, and missing elements.
SELECT indexOfAssumeSorted(numbers, 4) FROM test WHERE id = 1; -- 7
SELECT indexOfAssumeSorted(numbers, 5) FROM test WHERE id = 2; -- 5
SELECT indexOfAssumeSorted(numbers, 5) FROM test WHERE id = 3; -- 0
SELECT indexOfAssumeSorted(numbers, 1) FROM test WHERE id = 4; -- 0
SELECT indexOfAssumeSorted(numbers, 1) FROM test WHERE id = 5; -- 0

-- Constant array argument.
SELECT indexOfAssumeSorted([1, 2, 2, 2, 3, 3, 3, 4, 4], 4); -- 8
SELECT indexOfAssumeSorted([10, 10, 10], 1); -- 0
SELECT indexOfAssumeSorted([1, 1, 1], 10); -- 0

DROP TABLE IF EXISTS test;
|
@ -1954,6 +1954,7 @@ ilike
|
||||
incrementing
|
||||
indexHint
|
||||
indexOf
|
||||
indexOfAssumeSorted
|
||||
inequal
|
||||
infi
|
||||
inflight
|
||||
|
Loading…
Reference in New Issue
Block a user