From aa092eeffb8f09d9d65294bfa2a62cacc258e562 Mon Sep 17 00:00:00 2001 From: Anton Popov Date: Mon, 27 Dec 2021 16:42:06 +0300 Subject: [PATCH 001/104] proper handle of 'max_rows_to_read' in case of reading in order of sorting key and limit --- .../MergeTree/MergeTreeDataSelectExecutor.cpp | 14 ++++++++++-- .../MergeTree/MergeTreeSelectProcessor.cpp | 5 ----- ...5_read_in_order_max_rows_to_read.reference | 6 +++++ .../02155_read_in_order_max_rows_to_read.sql | 22 +++++++++++++++++++ 4 files changed, 40 insertions(+), 7 deletions(-) create mode 100644 tests/queries/0_stateless/02155_read_in_order_max_rows_to_read.reference create mode 100644 tests/queries/0_stateless/02155_read_in_order_max_rows_to_read.sql diff --git a/src/Storages/MergeTree/MergeTreeDataSelectExecutor.cpp b/src/Storages/MergeTree/MergeTreeDataSelectExecutor.cpp index cdedd37e14a..07ac6f5764b 100644 --- a/src/Storages/MergeTree/MergeTreeDataSelectExecutor.cpp +++ b/src/Storages/MergeTree/MergeTreeDataSelectExecutor.cpp @@ -875,12 +875,22 @@ RangesInDataParts MergeTreeDataSelectExecutor::filterPartsByPrimaryKeyAndSkipInd { std::atomic total_rows{0}; + /// Do not check number of read rows if we have reading + /// in order of sorting key with limit. + /// In general case, when there exists WHERE clause + /// it's impossible to estimate number of rows precisely, + /// because we can stop reading at any time. 
+ SizeLimits limits; - if (settings.read_overflow_mode == OverflowMode::THROW && settings.max_rows_to_read) + if (settings.read_overflow_mode == OverflowMode::THROW + && settings.max_rows_to_read + && !query_info.input_order_info) limits = SizeLimits(settings.max_rows_to_read, 0, settings.read_overflow_mode); SizeLimits leaf_limits; - if (settings.read_overflow_mode_leaf == OverflowMode::THROW && settings.max_rows_to_read_leaf) + if (settings.read_overflow_mode_leaf == OverflowMode::THROW + && settings.max_rows_to_read_leaf + && !query_info.input_order_info) leaf_limits = SizeLimits(settings.max_rows_to_read_leaf, 0, settings.read_overflow_mode_leaf); auto mark_cache = context->getIndexMarkCache(); diff --git a/src/Storages/MergeTree/MergeTreeSelectProcessor.cpp b/src/Storages/MergeTree/MergeTreeSelectProcessor.cpp index 2d4d3617cee..332eb27094a 100644 --- a/src/Storages/MergeTree/MergeTreeSelectProcessor.cpp +++ b/src/Storages/MergeTree/MergeTreeSelectProcessor.cpp @@ -37,11 +37,6 @@ MergeTreeSelectProcessor::MergeTreeSelectProcessor( has_limit_below_one_block(has_limit_below_one_block_), total_rows(data_part->index_granularity.getRowsCountInRanges(all_mark_ranges)) { - /// Actually it means that parallel reading from replicas enabled - /// and we have to collaborate with initiator. 
- /// In this case we won't set approximate rows, because it will be accounted multiple times - if (!extension_.has_value()) - addTotalRowsApprox(total_rows); ordered_names = header_without_virtual_columns.getNames(); } diff --git a/tests/queries/0_stateless/02155_read_in_order_max_rows_to_read.reference b/tests/queries/0_stateless/02155_read_in_order_max_rows_to_read.reference new file mode 100644 index 00000000000..b73ab43cabb --- /dev/null +++ b/tests/queries/0_stateless/02155_read_in_order_max_rows_to_read.reference @@ -0,0 +1,6 @@ +10 +0 +1 +2 +3 +4 diff --git a/tests/queries/0_stateless/02155_read_in_order_max_rows_to_read.sql b/tests/queries/0_stateless/02155_read_in_order_max_rows_to_read.sql new file mode 100644 index 00000000000..e82c78b5e42 --- /dev/null +++ b/tests/queries/0_stateless/02155_read_in_order_max_rows_to_read.sql @@ -0,0 +1,22 @@ +DROP TABLE IF EXISTS t_max_rows_to_read; + +CREATE TABLE t_max_rows_to_read (a UInt64) +ENGINE = MergeTree ORDER BY a +SETTINGS index_granularity = 4; + +INSERT INTO t_max_rows_to_read SELECT number FROM numbers(100); + +SET max_threads = 1; + +SELECT a FROM t_max_rows_to_read WHERE a = 10 SETTINGS max_rows_to_read = 4; + +SELECT a FROM t_max_rows_to_read ORDER BY a LIMIT 5 SETTINGS max_rows_to_read = 12; + +-- This should work, but actually it doesn't. Need to investigate. 
+-- SELECT a FROM t_max_rows_to_read WHERE a > 10 ORDER BY a LIMIT 5 SETTINGS max_rows_to_read = 20; + +SELECT a FROM t_max_rows_to_read ORDER BY a LIMIT 20 FORMAT Null SETTINGS max_rows_to_read = 12; -- { serverError 158 } +SELECT a FROM t_max_rows_to_read WHERE a > 10 ORDER BY a LIMIT 5 FORMAT Null SETTINGS max_rows_to_read = 12; -- { serverError 158 } +SELECT a FROM t_max_rows_to_read WHERE a = 10 OR a = 20 FORMAT Null SETTINGS max_rows_to_read = 4; -- { serverError 158 } + +DROP TABLE t_max_rows_to_read; From e813f6413f12887d0a7a4fec1b5455379073bbe4 Mon Sep 17 00:00:00 2001 From: Pablo Alegre Date: Wed, 29 Dec 2021 16:56:58 +0100 Subject: [PATCH 002/104] Add groupSortedArray() function --- .../reference/groupsortedarray.md | 48 ++++ .../aggregate-functions/reference/index.md | 1 + .../AggregateFunctionGroupSortedArray.cpp | 163 +++++++++++++ .../AggregateFunctionGroupSortedArray.h | 229 ++++++++++++++++++ .../AggregateFunctionGroupSortedArrayData.h | 162 +++++++++++++ .../registerAggregateFunctions.cpp | 2 + tests/performance/group_sorted_array.xml | 25 ++ .../02158_groupsortedarray.reference | 14 ++ .../0_stateless/02158_groupsortedarray.sql | 38 +++ 9 files changed, 682 insertions(+) create mode 100644 docs/en/sql-reference/aggregate-functions/reference/groupsortedarray.md create mode 100644 src/AggregateFunctions/AggregateFunctionGroupSortedArray.cpp create mode 100644 src/AggregateFunctions/AggregateFunctionGroupSortedArray.h create mode 100644 src/AggregateFunctions/AggregateFunctionGroupSortedArrayData.h create mode 100644 tests/performance/group_sorted_array.xml create mode 100644 tests/queries/0_stateless/02158_groupsortedarray.reference create mode 100644 tests/queries/0_stateless/02158_groupsortedarray.sql diff --git a/docs/en/sql-reference/aggregate-functions/reference/groupsortedarray.md b/docs/en/sql-reference/aggregate-functions/reference/groupsortedarray.md new file mode 100644 index 00000000000..06c004173b8 --- /dev/null +++ 
b/docs/en/sql-reference/aggregate-functions/reference/groupsortedarray.md @@ -0,0 +1,48 @@ +--- +toc_priority: 108 +--- + +# groupSortedArray {#groupSortedArray} + +Returns an array with the first N items in ascending order. + +``` sql +groupSortedArray(N)(x[, expr]) +``` + +**Parameters** + +- `N` – The number of elements to return. + +If the parameter is omitted, default value 10 is used. + +**Arguments** + +- `x` – The value. +- `expr` — Optional. The field or expression to sort by. If not set, values are sorted by themselves. [Integer](../../../sql-reference/data-types/int-uint.md). + +**Example** + +Gets the first 10 numbers: + +``` sql +SELECT groupSortedArray(10)(number) FROM numbers(100) +``` + +``` text +┌─groupSortedArray(10)(number)─┐ +│ [0,1,2,3,4,5,6,7,8,9] │ +└──────────────────────────────┘ +``` + +Or the last 10: + +``` sql +SELECT groupSortedArray(10)(number, -number) FROM numbers(100) +``` + +``` text +┌─groupSortedArray(10)(number, negate(number))─┐ +│ [99,98,97,96,95,94,93,92,91,90] │ +└──────────────────────────────────────────────┘ +``` \ No newline at end of file diff --git a/docs/en/sql-reference/aggregate-functions/reference/index.md b/docs/en/sql-reference/aggregate-functions/reference/index.md index 59befed8785..14a8ecc9dcf 100644 --- a/docs/en/sql-reference/aggregate-functions/reference/index.md +++ b/docs/en/sql-reference/aggregate-functions/reference/index.md @@ -42,6 +42,7 @@ ClickHouse-specific aggregate functions: - [groupBitmapAnd](../../../sql-reference/aggregate-functions/reference/groupbitmapand.md) - [groupBitmapOr](../../../sql-reference/aggregate-functions/reference/groupbitmapor.md) - [groupBitmapXor](../../../sql-reference/aggregate-functions/reference/groupbitmapxor.md) +- [groupSortedArray](../../../sql-reference/aggregate-functions/reference/groupsortedarray.md) - [sumWithOverflow](../../../sql-reference/aggregate-functions/reference/sumwithoverflow.md) - [sumMap](../../../sql-reference/aggregate-functions/reference/summap.md) 
- [minMap](../../../sql-reference/aggregate-functions/reference/minmap.md) diff --git a/src/AggregateFunctions/AggregateFunctionGroupSortedArray.cpp b/src/AggregateFunctions/AggregateFunctionGroupSortedArray.cpp new file mode 100644 index 00000000000..e52091fc597 --- /dev/null +++ b/src/AggregateFunctions/AggregateFunctionGroupSortedArray.cpp @@ -0,0 +1,163 @@ +#include +#include +#include +#include +#include +#include +#include +#include + + +static inline constexpr UInt64 GROUP_SORTED_ARRAY_MAX_SIZE = 0xFFFFFF; +static inline constexpr UInt64 GROUP_SORTED_ARRAY_DEFAULT_THRESHOLD = 10; + + +namespace DB +{ +struct Settings; + +namespace ErrorCodes +{ + extern const int ARGUMENT_OUT_OF_BOUND; + extern const int ILLEGAL_TYPE_OF_ARGUMENT; + extern const int LOGICAL_ERROR; + extern const int NUMBER_OF_ARGUMENTS_DOESNT_MATCH; +} + + +namespace +{ + template + class AggregateFunctionGroupSortedArrayNumeric : public AggregateFunctionGroupSortedArray + { + using AggregateFunctionGroupSortedArray::AggregateFunctionGroupSortedArray; + }; + + template + class AggregateFunctionGroupSortedArrayFieldType + : public AggregateFunctionGroupSortedArray + { + using AggregateFunctionGroupSortedArray::AggregateFunctionGroupSortedArray; + DataTypePtr getReturnType() const override { return std::make_shared(std::make_shared()); } + }; + + template + static IAggregateFunction * createWithExtraTypes(const DataTypes & argument_types, UInt64 threshold, const Array & params) + { + if (argument_types.empty()) + throw DB::Exception(ErrorCodes::LOGICAL_ERROR, "Got empty arguments list"); + + WhichDataType which(argument_types[0]); + if (which.idx == TypeIndex::Date) + return new AggregateFunctionGroupSortedArrayFieldType(threshold, argument_types, params); + if (which.idx == TypeIndex::DateTime) + return new AggregateFunctionGroupSortedArrayFieldType(threshold, argument_types, params); + + if (argument_types[0]->isValueUnambiguouslyRepresentedInContiguousMemoryRegion()) + { + return new 
AggregateFunctionGroupSortedArray(threshold, argument_types, params); + } + else + { + return new AggregateFunctionGroupSortedArray(threshold, argument_types, params); + } + } + + template