mirror of
https://github.com/ClickHouse/ClickHouse.git
synced 2024-12-14 10:22:10 +00:00
Merge pull request #72517 from erickurbanov/bsearchfunction
Add indexOfAssumeSorted Function. request #17795
This commit is contained in:
commit
68e77f6f33
@ -786,6 +786,24 @@ SELECT indexOf([1, 3, NULL, NULL], NULL)
|
|||||||
|
|
||||||
Elements set to `NULL` are handled as normal values.
|
Elements set to `NULL` are handled as normal values.
|
||||||
|
|
||||||
|
## indexOfAssumeSorted(arr, x)
|
||||||
|
|
||||||
|
Returns the index of the first element equal to `x` (starting from 1) if it is in the array, or 0 if it is not.
|
||||||
|
The function should only be used for arrays sorted in non-decreasing (ascending) order, because it uses binary search. For an unsorted array, the behavior is undefined.
|
||||||
|
If the array element type is Nullable, the `indexOf` function will be used instead.
|
||||||
|
|
||||||
|
Example:
|
||||||
|
|
||||||
|
``` sql
|
||||||
|
SELECT indexOfAssumeSorted([1, 3, 3, 3, 4, 4, 5], 4)
|
||||||
|
```
|
||||||
|
|
||||||
|
``` text
|
||||||
|
┌─indexOfAssumeSorted([1, 3, 3, 3, 4, 4, 5], 4)─┐
|
||||||
|
│                                             5 │
|
||||||
|
└───────────────────────────────────────────────┘
|
||||||
|
```
|
||||||
|
|
||||||
## arrayCount(\[func,\] arr1, ...)
|
## arrayCount(\[func,\] arr1, ...)
|
||||||
|
|
||||||
Returns the number of elements for which `func(arr1[i], ..., arrN[i])` returns something other than 0. If `func` is not specified, it returns the number of non-zero elements in the array.
|
Returns the number of elements for which `func(arr1[i], ..., arrN[i])` returns something other than 0. If `func` is not specified, it returns the number of non-zero elements in the array.
|
||||||
|
@ -785,6 +785,24 @@ SELECT indexOf([1, 3, NULL, NULL], NULL)
|
|||||||
|
|
||||||
`NULL` に設定された要素は通常の値として扱われます。
|
`NULL` に設定された要素は通常の値として扱われます。
|
||||||
|
|
||||||
|
## indexOfAssumeSorted(arr, x)
|
||||||
|
|
||||||
|
配列内にある場合は最初の'x'要素(1から始まる)のインデックスを返し、そうでない場合は0を返します。
|
||||||
|
この関数は二分探索を使用するため、昇順(非降順)にソートされた配列に対してのみ使用してください。
|
||||||
|
配列の要素型がNullableの場合は、`indexOf`関数が使用されます。
|
||||||
|
|
||||||
|
例:
|
||||||
|
|
||||||
|
``` sql
|
||||||
|
SELECT indexOfAssumeSorted([1, 3, 3, 3, 4, 4, 5], 4)
|
||||||
|
```
|
||||||
|
|
||||||
|
``` text
|
||||||
|
┌─indexOfAssumeSorted([1, 3, 3, 3, 4, 4, 5], 4)─┐
|
||||||
|
│                                             5 │
|
||||||
|
└───────────────────────────────────────────────┘
|
||||||
|
```
|
||||||
|
|
||||||
## arrayCount(\[func,\] arr1, ...)
|
## arrayCount(\[func,\] arr1, ...)
|
||||||
|
|
||||||
`func(arr1[i], ..., arrN[i])`が0以外の値を返す要素の数を返します。`func` が指定されていない場合、配列内の0以外の要素の数を返します。
|
`func(arr1[i], ..., arrN[i])`が0以外の値を返す要素の数を返します。`func` が指定されていない場合、配列内の0以外の要素の数を返します。
|
||||||
|
@ -306,6 +306,24 @@ SELECT indexOf([1, 3, NULL, NULL], NULL)
|
|||||||
└───────────────────────────────────┘
|
└───────────────────────────────────┘
|
||||||
```
|
```
|
||||||
|
|
||||||
|
## indexOfAssumeSorted(arr, x)
|
||||||
|
|
||||||
|
Возвращает индекс первого элемента x (начиная с 1), если он есть в массиве, или 0, если его нет.
|
||||||
|
Функция должна использоваться, если массив отсортирован в неубывающем порядке, так как используется бинарный поиск.
|
||||||
|
Если тип элементов массива — Nullable, то будет использована функция `indexOf`.
|
||||||
|
|
||||||
|
Пример:
|
||||||
|
|
||||||
|
``` sql
|
||||||
|
SELECT indexOfAssumeSorted([1, 3, 3, 3, 4, 4, 5], 4)
|
||||||
|
```
|
||||||
|
|
||||||
|
``` text
|
||||||
|
┌─indexOfAssumeSorted([1, 3, 3, 3, 4, 4, 5], 4)─┐
|
||||||
|
│                                             5 │
|
||||||
|
└───────────────────────────────────────────────┘
|
||||||
|
```
|
||||||
|
|
||||||
Элементы, равные `NULL`, обрабатываются как обычные значения.
|
Элементы, равные `NULL`, обрабатываются как обычные значения.
|
||||||
|
|
||||||
## arrayCount(\[func,\] arr1, ...) {#array-count}
|
## arrayCount(\[func,\] arr1, ...) {#array-count}
|
||||||
|
@ -337,6 +337,24 @@ SELECT indexOf([1, 3, NULL, NULL], NULL)
|
|||||||
|
|
||||||
设置为«NULL»的元素将作为普通的元素值处理。
|
设置为«NULL»的元素将作为普通的元素值处理。
|
||||||
|
|
||||||
|
## indexOfAssumeSorted(arr, x)
|
||||||
|
|
||||||
|
返回数组中第一个等于 `x` 的元素的索引(从1开始),如果数组中不存在该元素,则返回0。
|
||||||
|
该函数应仅用于按非降序(升序)排序的数组,因为它使用二分查找进行搜索。
|
||||||
|
如果数组元素类型为 Nullable,则将使用 `indexOf` 函数。
|
||||||
|
|
||||||
|
示例:
|
||||||
|
|
||||||
|
``` sql
|
||||||
|
SELECT indexOfAssumeSorted([1, 3, 3, 3, 4, 4, 5], 4)
|
||||||
|
```
|
||||||
|
|
||||||
|
``` text
|
||||||
|
┌─indexOfAssumeSorted([1, 3, 3, 3, 4, 4, 5], 4)─┐
|
||||||
|
│                                             5 │
|
||||||
|
└───────────────────────────────────────────────┘
|
||||||
|
```
|
||||||
|
|
||||||
## arrayCount(\[func,\] arr1, ...) {#array-count}
|
## arrayCount(\[func,\] arr1, ...) {#array-count}
|
||||||
|
|
||||||
`func`将arr数组作为参数,其返回结果为非零值的数量。如果未指定“func”,则返回数组中非零元素的数量。
|
`func`将arr数组作为参数,其返回结果为非零值的数量。如果未指定“func”,则返回数组中非零元素的数量。
|
||||||
|
@ -1,5 +1,6 @@
|
|||||||
#pragma once
|
#pragma once
|
||||||
#include <cstddef>
|
#include <cstddef>
|
||||||
|
#include <type_traits>
|
||||||
#include <Functions/IFunction.h>
|
#include <Functions/IFunction.h>
|
||||||
#include <Functions/FunctionFactory.h>
|
#include <Functions/FunctionFactory.h>
|
||||||
#include <Functions/FunctionHelpers.h>
|
#include <Functions/FunctionHelpers.h>
|
||||||
@ -14,6 +15,7 @@
|
|||||||
#include <Columns/ColumnFixedString.h>
|
#include <Columns/ColumnFixedString.h>
|
||||||
#include <Columns/ColumnsNumber.h>
|
#include <Columns/ColumnsNumber.h>
|
||||||
#include <Columns/ColumnNullable.h>
|
#include <Columns/ColumnNullable.h>
|
||||||
|
#include "Common/FieldVisitors.h"
|
||||||
#include "Common/Logger.h"
|
#include "Common/Logger.h"
|
||||||
#include "Common/logger_useful.h"
|
#include "Common/logger_useful.h"
|
||||||
#include <Common/FieldVisitorsAccurateComparison.h>
|
#include <Common/FieldVisitorsAccurateComparison.h>
|
||||||
@ -53,6 +55,10 @@ struct IndexOfAction
|
|||||||
static constexpr void apply(ResultType& current, size_t j) noexcept { current = j + 1; }
|
static constexpr void apply(ResultType& current, size_t j) noexcept { current = j + 1; }
|
||||||
};
|
};
|
||||||
|
|
||||||
|
struct IndexOfAssumeSorted : public IndexOfAction
|
||||||
|
{
|
||||||
|
};
|
||||||
|
|
||||||
struct CountEqualAction
|
struct CountEqualAction
|
||||||
{
|
{
|
||||||
using ResultType = UInt64;
|
using ResultType = UInt64;
|
||||||
@ -111,13 +117,138 @@ private:
|
|||||||
return 0 == left.compareAt(i, RightArgIsConstant ? 0 : j, right, 1);
|
return 0 == left.compareAt(i, RightArgIsConstant ? 0 : j, right, 1);
|
||||||
}
|
}
|
||||||
|
|
||||||
|
static bool compare(const Array & arr, const Field& rhs, size_t pos, size_t)
|
||||||
|
{
|
||||||
|
return applyVisitor(FieldVisitorAccurateEquals(), arr[pos], rhs);
|
||||||
|
}
|
||||||
|
|
||||||
|
static constexpr bool lessOrEqual(const PaddedPODArray<Initial> & left, const Result & right, size_t i, size_t) noexcept
|
||||||
|
{
|
||||||
|
return left[i] >= right;
|
||||||
|
}
|
||||||
|
|
||||||
|
/// Same predicate for a generic column: true when the element at `i` is not less than `right`.
/// The last (row) argument is unused.
static constexpr bool lessOrEqual(const IColumn & left, const Result & right, size_t i, size_t) noexcept
{
    return left[i] >= right;
}
|
||||||
|
|
||||||
|
static constexpr bool lessOrEqual(const Array& arr, const Field& rhs, size_t pos, size_t) noexcept {
|
||||||
|
return applyVisitor(FieldVisitorAccurateLessOrEqual(), rhs, arr[pos]);
|
||||||
|
}
|
||||||
|
|
||||||
#pragma clang diagnostic pop
|
#pragma clang diagnostic pop
|
||||||
|
|
||||||
|
public:
|
||||||
|
/** Assuming that the array is sorted, use a binary search */
|
||||||
|
template <typename Data, typename Target>
|
||||||
|
static constexpr ResultType lowerBound(const Data & data, const Target & target, size_t array_size, ArrOffset current_offset)
|
||||||
|
{
|
||||||
|
ResultType current = 0;
|
||||||
|
size_t low = 0, high = array_size;
|
||||||
|
while (high - low > 0)
|
||||||
|
{
|
||||||
|
auto middle = low + ((high - low) >> 1);
|
||||||
|
auto compare_result = lessOrEqual(data, target, current_offset + middle, 0);
|
||||||
|
/// avoid conditional branching
|
||||||
|
high = compare_result ? middle : high;
|
||||||
|
low = compare_result ? low : middle + 1;
|
||||||
|
}
|
||||||
|
if (low < array_size && compare(data, target, current_offset + low, 0))
|
||||||
|
{
|
||||||
|
ConcreteAction::apply(current, low);
|
||||||
|
}
|
||||||
|
return current;
|
||||||
|
}
|
||||||
|
|
||||||
|
template <size_t Case, typename Data, typename Target>
|
||||||
|
static constexpr ResultType linearSearch(
|
||||||
|
const Data & data,
|
||||||
|
const Target & target,
|
||||||
|
size_t array_size,
|
||||||
|
const NullMap * const null_map_data,
|
||||||
|
const NullMap * const null_map_item,
|
||||||
|
size_t row_index,
|
||||||
|
ArrOffset current_offset)
|
||||||
|
{
|
||||||
|
ResultType current = 0;
|
||||||
|
for (size_t j = 0; j < array_size; ++j)
|
||||||
|
{
|
||||||
|
if constexpr (Case == 2) /// Right arg is Nullable
|
||||||
|
if (hasNull(null_map_item, row_index))
|
||||||
|
continue;
|
||||||
|
|
||||||
|
if constexpr (Case == 3) /// Left arg is an array of Nullables
|
||||||
|
if (hasNull(null_map_data, current_offset + j))
|
||||||
|
continue;
|
||||||
|
|
||||||
|
if constexpr (Case == 4) /// Both args are nullable
|
||||||
|
{
|
||||||
|
const bool right_is_null = hasNull(null_map_data, current_offset + j);
|
||||||
|
const bool left_is_null = hasNull(null_map_item, row_index);
|
||||||
|
|
||||||
|
if (right_is_null != left_is_null)
|
||||||
|
continue;
|
||||||
|
|
||||||
|
if (!right_is_null && !compare(data, target, current_offset + j, row_index))
|
||||||
|
continue;
|
||||||
|
}
|
||||||
|
else if (!compare(data, target, current_offset + j, row_index))
|
||||||
|
continue;
|
||||||
|
|
||||||
|
ConcreteAction::apply(current, j);
|
||||||
|
|
||||||
|
if constexpr (!ConcreteAction::resume_execution)
|
||||||
|
break;
|
||||||
|
}
|
||||||
|
return current;
|
||||||
|
}
|
||||||
|
|
||||||
|
/// Sequential scan of a constant array: every element equal to `value`
/// (per FieldVisitorAccurateEquals) is passed to ConcreteAction::apply.
/// The scan stops after the first match unless ConcreteAction::resume_execution is set.
static ResultType linearSearchConst(const Array & arr, const Field & value)
{
    ResultType result = 0;
    const size_t count = arr.size();

    for (size_t pos = 0; pos < count; ++pos)
    {
        if (applyVisitor(FieldVisitorAccurateEquals(), arr[pos], value))
        {
            ConcreteAction::apply(result, pos);

            if constexpr (!ConcreteAction::resume_execution)
                break;
        }
    }

    return result;
}
|
||||||
|
|
||||||
|
private:
|
||||||
|
/** Looking for the target element index in the data (array) */
|
||||||
|
template <size_t Case, typename Data, typename Target>
|
||||||
|
static constexpr ResultType getIndex(
|
||||||
|
const Data & data,
|
||||||
|
const Target & target,
|
||||||
|
size_t array_size,
|
||||||
|
const NullMap * const null_map_data,
|
||||||
|
const NullMap * const null_map_item,
|
||||||
|
size_t row_index,
|
||||||
|
ArrOffset current_offset)
|
||||||
|
{
|
||||||
|
/** Use binary search if the following conditions are met.
|
||||||
|
* 1. The array type is not nullable. (Case = 1)
|
||||||
|
* 2. Target is not a column or an array.
|
||||||
|
*/
|
||||||
|
if constexpr (
|
||||||
|
std::is_same_v<ConcreteAction, IndexOfAssumeSorted> && !std::is_same_v<Target, PaddedPODArray<Result>>
|
||||||
|
&& !std::is_same_v<Target, IColumn> && Case == 1)
|
||||||
|
{
|
||||||
|
return lowerBound(data, target, array_size, current_offset);
|
||||||
|
}
|
||||||
|
return linearSearch<Case>(data, target, array_size, null_map_data, null_map_item, row_index, current_offset);
|
||||||
|
}
|
||||||
|
|
||||||
static constexpr bool hasNull(const NullMap * const null_map, size_t i) noexcept { return (*null_map)[i]; }
|
static constexpr bool hasNull(const NullMap * const null_map, size_t i) noexcept { return (*null_map)[i]; }
|
||||||
|
|
||||||
template <size_t Case, typename Data, typename Target>
|
template <size_t Case, typename Data, typename Target>
|
||||||
static void process(
|
static void process(
|
||||||
const Data & data, const ArrOffsets & offsets, const Target & target, ResultArr & result,
|
const Data & data,
|
||||||
|
const ArrOffsets & offsets,
|
||||||
|
const Target & target,
|
||||||
|
ResultArr & result,
|
||||||
[[maybe_unused]] const NullMap * const null_map_data,
|
[[maybe_unused]] const NullMap * const null_map_data,
|
||||||
[[maybe_unused]] const NullMap * const null_map_item)
|
[[maybe_unused]] const NullMap * const null_map_item)
|
||||||
{
|
{
|
||||||
@ -129,7 +260,6 @@ private:
|
|||||||
}
|
}
|
||||||
|
|
||||||
const size_t size = offsets.size();
|
const size_t size = offsets.size();
|
||||||
|
|
||||||
result.resize(size);
|
result.resize(size);
|
||||||
|
|
||||||
ArrOffset current_offset = 0;
|
ArrOffset current_offset = 0;
|
||||||
@ -137,39 +267,7 @@ private:
|
|||||||
for (size_t i = 0; i < size; ++i)
|
for (size_t i = 0; i < size; ++i)
|
||||||
{
|
{
|
||||||
const size_t array_size = offsets[i] - current_offset;
|
const size_t array_size = offsets[i] - current_offset;
|
||||||
ResultType current = 0;
|
result[i] = getIndex<Case>(data, target, array_size, null_map_data, null_map_item, i, current_offset);
|
||||||
|
|
||||||
for (size_t j = 0; j < array_size; ++j)
|
|
||||||
{
|
|
||||||
if constexpr (Case == 2) /// Right arg is Nullable
|
|
||||||
if (hasNull(null_map_item, i))
|
|
||||||
continue;
|
|
||||||
|
|
||||||
if constexpr (Case == 3) /// Left arg is an array of Nullables
|
|
||||||
if (hasNull(null_map_data, current_offset + j))
|
|
||||||
continue;
|
|
||||||
|
|
||||||
if constexpr (Case == 4) /// Both args are nullable
|
|
||||||
{
|
|
||||||
const bool right_is_null = hasNull(null_map_data, current_offset + j);
|
|
||||||
const bool left_is_null = hasNull(null_map_item, i);
|
|
||||||
|
|
||||||
if (right_is_null != left_is_null)
|
|
||||||
continue;
|
|
||||||
|
|
||||||
if (!right_is_null && !compare(data, target, current_offset + j, i))
|
|
||||||
continue;
|
|
||||||
}
|
|
||||||
else if (!compare(data, target, current_offset + j, i))
|
|
||||||
continue;
|
|
||||||
|
|
||||||
ConcreteAction::apply(current, j);
|
|
||||||
|
|
||||||
if constexpr (!ConcreteAction::resume_execution)
|
|
||||||
break;
|
|
||||||
}
|
|
||||||
|
|
||||||
result[i] = current;
|
|
||||||
current_offset = offsets[i];
|
current_offset = offsets[i];
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
@ -854,16 +952,13 @@ private:
|
|||||||
{
|
{
|
||||||
ResultType current = 0;
|
ResultType current = 0;
|
||||||
const auto & value = (*item_arg)[0];
|
const auto & value = (*item_arg)[0];
|
||||||
|
if constexpr (std::is_same_v<ConcreteAction, IndexOfAssumeSorted>)
|
||||||
for (size_t i = 0, size = arr.size(); i < size; ++i)
|
|
||||||
{
|
{
|
||||||
if (!applyVisitor(FieldVisitorAccurateEquals(), arr[i], value))
|
current = Impl::Main<ConcreteAction, true>::lowerBound(arr, value, arr.size(), 0);
|
||||||
continue;
|
}
|
||||||
|
else
|
||||||
ConcreteAction::apply(current, i);
|
{
|
||||||
|
current = Impl::Main<ConcreteAction, true>::linearSearchConst(arr, value);
|
||||||
if constexpr (!ConcreteAction::resume_execution)
|
|
||||||
break;
|
|
||||||
}
|
}
|
||||||
|
|
||||||
return result_type->createColumnConst(item_arg->size(), current);
|
return result_type->createColumnConst(item_arg->size(), current);
|
||||||
|
28
src/Functions/array/indexOfAssumeSorted.cpp
Normal file
28
src/Functions/array/indexOfAssumeSorted.cpp
Normal file
@ -0,0 +1,28 @@
|
|||||||
|
#include "arrayIndex.h"
|
||||||
|
#include <Functions/FunctionFactory.h>
|
||||||
|
#include <Functions/IFunctionAdaptors.h>
|
||||||
|
#include "Common/FunctionDocumentation.h"
|
||||||
|
|
||||||
|
namespace DB
|
||||||
|
{
|
||||||
|
struct NameIndexOfAssumeSorted { static constexpr auto name = "indexOfAssumeSorted"; };
|
||||||
|
|
||||||
|
/// indexOfAssumeSorted(arr, x) - returns the index of the element x (starting with 1), if it exists in the array, or 0 if it
|
||||||
|
/// should be used when the array is sorted (applies binary search to array)
|
||||||
|
using FunctionIndexOfAssumeSorted = FunctionArrayIndex<IndexOfAssumeSorted, NameIndexOfAssumeSorted>;
|
||||||
|
|
||||||
|
REGISTER_FUNCTION(IndexOfAssumeSorted)
|
||||||
|
{
|
||||||
|
factory.registerFunction<FunctionIndexOfAssumeSorted>(FunctionDocumentation{
|
||||||
|
.description = R"(
|
||||||
|
The function finds the position of the first occurrence of element X in the array.
|
||||||
|
Indexing from one.
|
||||||
|
The function can be used when the internal array type is not Nullable and the array is sorted in non-decreasing order.
|
||||||
|
If the array type is Nullable, the 'indexOf' function will be used.
|
||||||
|
The binary search algorithm is used for the search.
|
||||||
|
For more details, see [https://en.wikipedia.org/wiki/Binary_search]
|
||||||
|
For an unsorted array, the behavior is undefined.
|
||||||
|
)",
|
||||||
|
.examples = {{.name = "", .query = "SELECT indexOfAssumeSorted([1, 2, 2, 2, 3, 3, 3, 4], 3) FROM test_table;", .result=""}}});
|
||||||
|
}
|
||||||
|
}
|
@ -0,0 +1,8 @@
|
|||||||
|
7
|
||||||
|
5
|
||||||
|
0
|
||||||
|
0
|
||||||
|
0
|
||||||
|
8
|
||||||
|
0
|
||||||
|
0
|
26
tests/queries/0_stateless/03276_index_of_assume_sorted.sql
Normal file
26
tests/queries/0_stateless/03276_index_of_assume_sorted.sql
Normal file
@ -0,0 +1,26 @@
|
|||||||
|
-- Checks indexOfAssumeSorted on table columns and on constant arrays.
-- Each SELECT prints one line; the expected values are noted next to the queries.
DROP TABLE IF EXISTS test;

CREATE TABLE test
(
    id UInt64,
    numbers Array(Int64)
)
ENGINE = MergeTree()
ORDER BY id;

INSERT INTO test (id, numbers) VALUES (1, [1, 2, 2, 3, 3, 3, 4, 4, 4, 5, 6, 7]);
INSERT INTO test (id, numbers) VALUES (2, [1, 2, 3, 4, 5, 6, 7, 8]);
INSERT INTO test (id, numbers) VALUES (3, [1, 3, 7, 10]);
INSERT INTO test (id, numbers) VALUES (4, [0, 0, 0]);
INSERT INTO test (id, numbers) VALUES (5, [10, 10, 10]);

-- Array taken from a column, constant needle.
SELECT indexOfAssumeSorted(numbers, 4) FROM test WHERE id = 1; -- 7: first of several equal elements
SELECT indexOfAssumeSorted(numbers, 5) FROM test WHERE id = 2; -- 5
SELECT indexOfAssumeSorted(numbers, 5) FROM test WHERE id = 3; -- 0: value falls between elements
SELECT indexOfAssumeSorted(numbers, 1) FROM test WHERE id = 4; -- 0: value greater than every element
SELECT indexOfAssumeSorted(numbers, 1) FROM test WHERE id = 5; -- 0: value less than every element

-- Constant array, constant needle.
SELECT indexOfAssumeSorted([1, 2, 2, 2, 3, 3, 3, 4, 4], 4); -- 8
SELECT indexOfAssumeSorted([10, 10, 10], 1); -- 0
SELECT indexOfAssumeSorted([1, 1, 1], 10); -- 0

DROP TABLE IF EXISTS test;
|
@ -1954,6 +1954,7 @@ ilike
|
|||||||
incrementing
|
incrementing
|
||||||
indexHint
|
indexHint
|
||||||
indexOf
|
indexOf
|
||||||
|
indexOfAssumeSorted
|
||||||
inequal
|
inequal
|
||||||
infi
|
infi
|
||||||
inflight
|
inflight
|
||||||
|
Loading…
Reference in New Issue
Block a user