From 4adb6288eb0b5ae1b33705624489dbb5436e8475 Mon Sep 17 00:00:00 2001 From: Joanna Hulboj Date: Wed, 11 Jan 2023 22:06:02 +0000 Subject: [PATCH 01/21] Add arrayShuffle function --- .../functions/array-functions.md | 25 ++ src/Functions/array/arrayShuffle.cpp | 310 ++++++++++++++++++ .../0_stateless/02523_array_shuffle.reference | 18 + .../0_stateless/02523_array_shuffle.sql | 22 ++ 4 files changed, 375 insertions(+) create mode 100644 src/Functions/array/arrayShuffle.cpp create mode 100644 tests/queries/0_stateless/02523_array_shuffle.reference create mode 100644 tests/queries/0_stateless/02523_array_shuffle.sql diff --git a/docs/en/sql-reference/functions/array-functions.md b/docs/en/sql-reference/functions/array-functions.md index 9d2f89c1837..dd4b7abfbeb 100644 --- a/docs/en/sql-reference/functions/array-functions.md +++ b/docs/en/sql-reference/functions/array-functions.md @@ -1102,6 +1102,31 @@ SELECT arrayReverse([1, 2, 3]) Synonym for [“arrayReverse”](#arrayreverse) +## arrayShuffle(arr [, random_seed]) + +Returns an array of the same size as the original array containing the elements in shuffled order. Elements are being reordered in such a way that each possible permutation of those elements has equal probability of appearance. + +**Arguments** + +- `[arr]` — Input array. [Array](../data-types/array.md). +- 'random_seed` — Random seed manual override to produce stable results. Optional. [64 bit integer](../data-types/int-uint.md) + +**Example** + +Query: + +``` sql +SELECT arrayShuffle([1, 2, 3, 4], 41) +``` + +Result: + +``` text +┌─arrayShuffle([1, 2, 3, 4], 41)─┐ +│ [3,2,1,4] │ +└────────────────────────────────┘ +``` + ## arrayFlatten Converts an array of arrays to a flat array. 
diff --git a/src/Functions/array/arrayShuffle.cpp b/src/Functions/array/arrayShuffle.cpp new file mode 100644 index 00000000000..00f8727db8d --- /dev/null +++ b/src/Functions/array/arrayShuffle.cpp @@ -0,0 +1,310 @@ +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#include + +namespace DB +{ + +namespace ErrorCodes +{ + extern const int ILLEGAL_COLUMN; + extern const int ILLEGAL_TYPE_OF_ARGUMENT; + extern const int NUMBER_OF_ARGUMENTS_DOESNT_MATCH; +} + +/** Shuffle array elements + * arrayShuffle(arr) + * arrayShuffle(arr, seed) + */ +class FunctionArrayShuffle : public IFunction +{ +public: + static constexpr auto name = "arrayShuffle"; + + static FunctionPtr create(ContextPtr) { return std::make_shared(); } + + String getName() const override { return name; } + bool isVariadic() const override { return true; } + size_t getNumberOfArguments() const override { return 0; } + + DataTypePtr getReturnTypeImpl(const DataTypes & arguments) const override + { + if (arguments.size() > 2 || arguments.empty()) + { + throw Exception( + ErrorCodes::NUMBER_OF_ARGUMENTS_DOESNT_MATCH, "Function {} needs 1..2 arguments; passed {}.", getName(), arguments.size()); + } + + const DataTypeArray * array_type = checkAndGetDataType(arguments[0].get()); + if (!array_type) + throw Exception("Argument for function " + getName() + " must be array.", ErrorCodes::ILLEGAL_TYPE_OF_ARGUMENT); + + if (arguments.size() == 2) + { + WhichDataType which(arguments[1]); + if (!which.isUInt() && !which.isInt()) + throw Exception{ + "Illegal type " + arguments[1]->getName() + " of argument of function " + getName() + " (must be UInt or Int)", + ErrorCodes::ILLEGAL_TYPE_OF_ARGUMENT}; + } + + return arguments[0]; + } + + bool useDefaultImplementationForConstants() const override { return true; } + bool isSuitableForShortCircuitArgumentsExecution(const DataTypesWithConstInfo & /*arguments*/) const override { return true; } + + 
ColumnPtr executeImpl(const ColumnsWithTypeAndName & arguments, const DataTypePtr & result_type, size_t) const override; + +private: + template + static bool executeNumber(const IColumn & src_data, const ColumnArray::Offsets & src_offsets, IColumn & res_data, pcg64_fast &); + static bool executeFixedString(const IColumn & src_data, const ColumnArray::Offsets & src_offsets, IColumn & res_data, pcg64_fast & rng); + static bool executeString(const IColumn & src_data, const ColumnArray::Offsets & src_array_offsets, IColumn & res_data, pcg64_fast & rng); + static bool executeGeneric(const IColumn & src_data, const ColumnArray::Offsets & src_offsets, IColumn & res_data, pcg64_fast &); +}; + +ColumnPtr FunctionArrayShuffle::executeImpl(const ColumnsWithTypeAndName & arguments, const DataTypePtr &, size_t) const +{ + const ColumnArray * array = checkAndGetColumn(arguments[0].column.get()); + if (!array) + throw Exception( + "Illegal column " + arguments[0].column->getName() + " of first argument of function " + getName(), ErrorCodes::ILLEGAL_COLUMN); + + auto res_ptr = array->cloneEmpty(); + ColumnArray & res = assert_cast(*res_ptr); + res.getOffsetsPtr() = array->getOffsetsPtr(); + + const IColumn & src_data = array->getData(); + const ColumnArray::Offsets & offsets = array->getOffsets(); + + IColumn & res_data = res.getData(); + + const ColumnNullable * src_nullable_col = typeid_cast(&src_data); + ColumnNullable * res_nullable_col = typeid_cast(&res_data); + + const IColumn * src_inner_col = src_nullable_col ? &src_nullable_col->getNestedColumn() : &src_data; + IColumn * res_inner_col = res_nullable_col ? 
&res_nullable_col->getNestedColumn() : &res_data; + + const auto seed = [&]() -> uint64_t + { + if (arguments.size() == 1) + return randomSeed(); + const auto * val = arguments[1].column.get(); + return val->getUInt(0); + }(); + pcg64_fast rng(seed); + + false // NOLINT + || executeNumber(*src_inner_col, offsets, *res_inner_col, rng) + || executeNumber(*src_inner_col, offsets, *res_inner_col, rng) + || executeNumber(*src_inner_col, offsets, *res_inner_col, rng) + || executeNumber(*src_inner_col, offsets, *res_inner_col, rng) + || executeNumber(*src_inner_col, offsets, *res_inner_col, rng) + || executeNumber(*src_inner_col, offsets, *res_inner_col, rng) + || executeNumber(*src_inner_col, offsets, *res_inner_col, rng) + || executeNumber(*src_inner_col, offsets, *res_inner_col, rng) + || executeNumber(*src_inner_col, offsets, *res_inner_col, rng) + || executeNumber(*src_inner_col, offsets, *res_inner_col, rng) + || executeString(*src_inner_col, offsets, *res_inner_col, rng) + || executeFixedString(*src_inner_col, offsets, *res_inner_col, rng) + || executeGeneric(*src_inner_col, offsets, *res_inner_col, rng); + + if (src_nullable_col) + { + rng.seed(seed); + if (!executeNumber(src_nullable_col->getNullMapColumn(), offsets, res_nullable_col->getNullMapColumn(), rng)) + throw Exception( + "Illegal column " + src_nullable_col->getNullMapColumn().getName() + " of null map of the first argument of function " + + getName(), + ErrorCodes::ILLEGAL_COLUMN); + } + + return res_ptr; +} + +bool FunctionArrayShuffle::executeGeneric(const IColumn & src_data, const ColumnArray::Offsets & src_offsets, IColumn & res_data, pcg64_fast & rng) +{ + size_t size = src_offsets.size(); + res_data.reserve(size); + + IColumn::Permutation permutation; + ColumnArray::Offset prev_off = 0; + for (size_t i = 0; i < src_offsets.size(); ++i) + { + ColumnArray::Offset off = src_offsets[i]; + size_t count = off - prev_off; + + permutation.resize(count); + for (size_t idx = 0; idx < count; ++idx) + 
permutation[idx] = idx; + + std::shuffle(std::begin(permutation), std::end(permutation), rng); + + for (size_t unshuffled_idx = 0; unshuffled_idx != count; ++unshuffled_idx) + { + auto shuffled_idx = permutation[unshuffled_idx]; + res_data.insertFrom(src_data, shuffled_idx); + } + + prev_off = src_offsets[i]; + } + + return true; +} + +template +bool FunctionArrayShuffle::executeNumber(const IColumn & src_data, const ColumnArray::Offsets & src_offsets, IColumn & res_data, pcg64_fast & rng) +{ + if (const ColumnVector * src_data_concrete = checkAndGetColumn>(&src_data)) + { + const PaddedPODArray & src_vec = src_data_concrete->getData(); + PaddedPODArray & res_vec = typeid_cast &>(res_data).getData(); + res_vec.resize(src_data.size()); + + ColumnArray::Offset prev_off = 0; + for (size_t i = 0; i < src_offsets.size(); ++i) + { + ColumnArray::Offset off = src_offsets[i]; + + // [prev_off, off) + const auto * src = &src_vec[prev_off]; + const auto * src_end = &src_vec[off]; + + if (src == src_end) + continue; + + auto * dst = &res_vec[prev_off]; + + size_t count = off - prev_off; + + memcpy(dst, src, count * sizeof(T)); + std::shuffle(dst, dst + count, rng); + + prev_off = off; + } + + return true; + } + else + return false; +} + +bool FunctionArrayShuffle::executeFixedString(const IColumn & src_data, const ColumnArray::Offsets & src_offsets, IColumn & res_data, pcg64_fast & rng) +{ + if (const ColumnFixedString * src_data_concrete = checkAndGetColumn(&src_data)) + { + const size_t n = src_data_concrete->getN(); + const ColumnFixedString::Chars & src_data_chars = src_data_concrete->getChars(); + ColumnFixedString::Chars & res_chars = typeid_cast(res_data).getChars(); + res_chars.resize(src_data_chars.size()); + + IColumn::Permutation permutation; + + ColumnArray::Offset prev_off = 0; + for (size_t i = 0; i < src_offsets.size(); ++i) + { + ColumnArray::Offset off = src_offsets[i]; + + const UInt8 * src = &src_data_chars[prev_off * n]; + size_t count = off - prev_off; + 
+ if (count == 0) + continue; + + UInt8 * dst = &res_chars[prev_off * n]; + + + permutation.resize(count); + for (size_t idx = 0; idx < count; ++idx) + permutation[idx] = idx; + + std::shuffle(std::begin(permutation), std::end(permutation), rng); + + for (size_t unshuffled_idx = 0; unshuffled_idx != count; ++unshuffled_idx) + { + auto shuffled_idx = permutation[unshuffled_idx]; + memcpy(dst + unshuffled_idx * n, src + shuffled_idx * n, n); + } + + prev_off = off; + } + return true; + } + else + return false; +} + +bool FunctionArrayShuffle::executeString(const IColumn & src_data, const ColumnArray::Offsets & src_array_offsets, IColumn & res_data, pcg64_fast & rng) +{ + if (const ColumnString * src_data_concrete = checkAndGetColumn(&src_data)) + { + const ColumnString::Offsets & src_string_offsets = src_data_concrete->getOffsets(); + ColumnString::Offsets & res_string_offsets = typeid_cast(res_data).getOffsets(); + + const ColumnString::Chars & src_data_chars = src_data_concrete->getChars(); + ColumnString::Chars & res_chars = typeid_cast(res_data).getChars(); + + res_string_offsets.resize(src_string_offsets.size()); + res_chars.resize(src_data_chars.size()); + + IColumn::Permutation permutation; + + ColumnArray::Offset arr_prev_off = 0; + ColumnString::Offset string_prev_off = 0; + + for (size_t i = 0; i < src_array_offsets.size(); ++i) + { + ColumnArray::Offset arr_off = src_array_offsets[i]; + + if (arr_off != arr_prev_off) + { + size_t string_count = arr_off - arr_prev_off; + + permutation.resize(string_count); + for (size_t idx = 0; idx < string_count; ++idx) + permutation[idx] = idx; + + std::shuffle(std::begin(permutation), std::end(permutation), rng); + + for (size_t unshuffled_idx = 0; unshuffled_idx < string_count; ++unshuffled_idx) + { + auto shuffled_idx = permutation[unshuffled_idx]; + + auto src_pos = src_string_offsets[arr_prev_off + shuffled_idx - 1]; + + size_t string_size = src_string_offsets[arr_prev_off + shuffled_idx] - src_pos; + + 
memcpySmallAllowReadWriteOverflow15(&res_chars[string_prev_off], &src_data_chars[src_pos], string_size); + + string_prev_off += string_size; + res_string_offsets[arr_prev_off + unshuffled_idx] = string_prev_off; + } + } + + arr_prev_off = arr_off; + } + + return true; + } + else + return false; +} + +REGISTER_FUNCTION(ArrayShuffle) +{ + factory.registerFunction(); +} + +} diff --git a/tests/queries/0_stateless/02523_array_shuffle.reference b/tests/queries/0_stateless/02523_array_shuffle.reference new file mode 100644 index 00000000000..a84be39b50a --- /dev/null +++ b/tests/queries/0_stateless/02523_array_shuffle.reference @@ -0,0 +1,18 @@ +[] +[] +[9223372036854775808] +[9223372036854775808] +[10,9,4,2,5,6,7,1,8,3] +[10.1,9,4,2,5,6,7,1,8,3] +[9223372036854775808,9,4,2,5,6,7,1,8,3] +[NULL,9,4,2,5,6,7,1,8,3] +['789','123','ABC','000','456'] +['789','123','ABC',NULL,'456'] +['imposter','storage','sensation','uniform','tiger','terminal'] +[NULL,'storage','sensation','uniform','tiger','terminal'] +[NULL] +[NULL,NULL] +[[10,20,30,40],[1,2,3,4],[100,200,300,400,500,600,700,800,900],[2,4,8,16,32,64],[-1,-2,-3,-4]] +[[10,20,30,40],[1,2,3,4],[100,200,300,400,500,600,700,800,900],[2,4,8,16,32,64],[NULL,-2,-3,-4]] +[10,72,11,18,73,76,46,71,44,35,9,0,97,53,13,32,51,30,3,68,5,48,67,90,20,27,38,19,54,21,83,84,1,22,56,81,91,77,36,63,33,39,24,40,4,99,14,23,94,29,26,96,2,28,31,57,42,88,12,47,58,8,37,82,92,34,6,60,25,43,50,74,70,52,55,62,17,79,65,93,86,7,16,41,59,75,80,45,69,89,85,87,95,64,61,98,49,78,66,15] +[10,72,11,18,73,76,46,71,44,35,9,0,97,53,13,32,51,30,3,68,5,48,67,90,20,27,38,19,54,21,83,84,1,22,56,81,91,77,36,63,33,39,24,40,4,99,14,23,94,29,26,96,2,28,31,57,42,88,12,47,58,8,37,82,92,34,6,60,25,43,50,74,70,52,55,62,17,79,65,93,86,7,16,41,59,75,80,45,69,89,85,87,95,64,61,98,49,78,66,15] diff --git a/tests/queries/0_stateless/02523_array_shuffle.sql b/tests/queries/0_stateless/02523_array_shuffle.sql new file mode 100644 index 00000000000..46bb95fdcec --- /dev/null +++ 
b/tests/queries/0_stateless/02523_array_shuffle.sql @@ -0,0 +1,22 @@ +SELECT arrayShuffle([]); +SELECT arrayShuffle([], 0xbad_cafe); +SELECT arrayShuffle([9223372036854775808]); +SELECT arrayShuffle([9223372036854775808], 0xbad_cafe); +SELECT arrayShuffle([1,2,3,4,5,6,7,8,9,10], 0xbad_cafe); +SELECT arrayShuffle([1,2,3,4,5,6,7,8,9,10.1], 0xbad_cafe); +SELECT arrayShuffle([1,2,3,4,5,6,7,8,9,9223372036854775808], 0xbad_cafe); +SELECT arrayShuffle([1,2,3,4,5,6,7,8,9,NULL], 0xbad_cafe); +SELECT arrayShuffle([toFixedString('123', 3), toFixedString('456', 3), toFixedString('789', 3), toFixedString('ABC', 3), toFixedString('000', 3)], 0xbad_cafe); +SELECT arrayShuffle([toFixedString('123', 3), toFixedString('456', 3), toFixedString('789', 3), toFixedString('ABC', 3), NULL], 0xbad_cafe); +SELECT arrayShuffle(['storage','tiger','imposter','terminal','uniform','sensation'], 0xbad_cafe); +SELECT arrayShuffle(['storage','tiger',NULL,'terminal','uniform','sensation'], 0xbad_cafe); +SELECT arrayShuffle([NULL]); +SELECT arrayShuffle([NULL,NULL]); +SELECT arrayShuffle([[1,2,3,4],[-1,-2,-3,-4],[10,20,30,40],[100,200,300,400,500,600,700,800,900],[2,4,8,16,32,64]], 0xbad_cafe); +SELECT arrayShuffle([[1,2,3,4],[NULL,-2,-3,-4],[10,20,30,40],[100,200,300,400,500,600,700,800,900],[2,4,8,16,32,64]], 0xbad_cafe); +SELECT arrayShuffle(groupArray(x),0xbad_cafe) FROM (SELECT number as x from system.numbers LIMIT 100); +SELECT arrayShuffle(groupArray(toUInt64(x)),0xbad_cafe) FROM (SELECT number as x from system.numbers LIMIT 100); +SELECT arrayShuffle(1); -- { serverError ILLEGAL_TYPE_OF_ARGUMENT } +SELECT arrayShuffle([1], 'a'); -- { serverError ILLEGAL_TYPE_OF_ARGUMENT } +SELECT arrayShuffle([1], 1.1); -- { serverError ILLEGAL_TYPE_OF_ARGUMENT } +SELECT arrayShuffle([1], 0xcafe, 1); -- { serverError NUMBER_OF_ARGUMENTS_DOESNT_MATCH } \ No newline at end of file From 2355780737e6fa9a753bba2f5235d182c9ddc5cf Mon Sep 17 00:00:00 2001 From: Joanna Hulboj Date: Thu, 12 Jan 2023 20:47:53 +0000 
Subject: [PATCH 02/21] Minor formatting --- .../functions/array-functions.md | 6 ++--- src/Functions/array/arrayShuffle.cpp | 23 +------------------ 2 files changed, 4 insertions(+), 25 deletions(-) diff --git a/docs/en/sql-reference/functions/array-functions.md b/docs/en/sql-reference/functions/array-functions.md index dd4b7abfbeb..8888a2f9256 100644 --- a/docs/en/sql-reference/functions/array-functions.md +++ b/docs/en/sql-reference/functions/array-functions.md @@ -1102,14 +1102,14 @@ SELECT arrayReverse([1, 2, 3]) Synonym for [“arrayReverse”](#arrayreverse) -## arrayShuffle(arr [, random_seed]) +## arrayShuffle(arr[, random_seed]) -Returns an array of the same size as the original array containing the elements in shuffled order. Elements are being reordered in such a way that each possible permutation of those elements has equal probability of appearance. +Returns an array of the same size as the original array containing the elements in shuffled order. Elements are being reordered in such a way that each possible permutation of those elements has equal probability of appearance. **Arguments** - `[arr]` — Input array. [Array](../data-types/array.md). -- 'random_seed` — Random seed manual override to produce stable results. Optional. [64 bit integer](../data-types/int-uint.md) +- 'random_seed` — Random seed manual override to produce stable results. Optional. [64 bit integer](../data-types/int-uint.md). 
**Example** diff --git a/src/Functions/array/arrayShuffle.cpp b/src/Functions/array/arrayShuffle.cpp index 00f8727db8d..9d2a1c416a2 100644 --- a/src/Functions/array/arrayShuffle.cpp +++ b/src/Functions/array/arrayShuffle.cpp @@ -149,15 +149,12 @@ bool FunctionArrayShuffle::executeGeneric(const IColumn & src_data, const Column permutation.resize(count); for (size_t idx = 0; idx < count; ++idx) permutation[idx] = idx; - std::shuffle(std::begin(permutation), std::end(permutation), rng); - for (size_t unshuffled_idx = 0; unshuffled_idx != count; ++unshuffled_idx) { auto shuffled_idx = permutation[unshuffled_idx]; res_data.insertFrom(src_data, shuffled_idx); } - prev_off = src_offsets[i]; } @@ -178,23 +175,18 @@ bool FunctionArrayShuffle::executeNumber(const IColumn & src_data, const ColumnA { ColumnArray::Offset off = src_offsets[i]; - // [prev_off, off) const auto * src = &src_vec[prev_off]; const auto * src_end = &src_vec[off]; - if (src == src_end) continue; - auto * dst = &res_vec[prev_off]; - size_t count = off - prev_off; - memcpy(dst, src, count * sizeof(T)); + std::shuffle(dst, dst + count, rng); prev_off = off; } - return true; } else @@ -211,7 +203,6 @@ bool FunctionArrayShuffle::executeFixedString(const IColumn & src_data, const Co res_chars.resize(src_data_chars.size()); IColumn::Permutation permutation; - ColumnArray::Offset prev_off = 0; for (size_t i = 0; i < src_offsets.size(); ++i) { @@ -225,11 +216,9 @@ bool FunctionArrayShuffle::executeFixedString(const IColumn & src_data, const Co UInt8 * dst = &res_chars[prev_off * n]; - permutation.resize(count); for (size_t idx = 0; idx < count; ++idx) permutation[idx] = idx; - std::shuffle(std::begin(permutation), std::end(permutation), rng); for (size_t unshuffled_idx = 0; unshuffled_idx != count; ++unshuffled_idx) @@ -237,7 +226,6 @@ bool FunctionArrayShuffle::executeFixedString(const IColumn & src_data, const Co auto shuffled_idx = permutation[unshuffled_idx]; memcpy(dst + unshuffled_idx * n, src + 
shuffled_idx * n, n); } - prev_off = off; } return true; @@ -260,14 +248,11 @@ bool FunctionArrayShuffle::executeString(const IColumn & src_data, const ColumnA res_chars.resize(src_data_chars.size()); IColumn::Permutation permutation; - ColumnArray::Offset arr_prev_off = 0; ColumnString::Offset string_prev_off = 0; - for (size_t i = 0; i < src_array_offsets.size(); ++i) { ColumnArray::Offset arr_off = src_array_offsets[i]; - if (arr_off != arr_prev_off) { size_t string_count = arr_off - arr_prev_off; @@ -275,27 +260,21 @@ bool FunctionArrayShuffle::executeString(const IColumn & src_data, const ColumnA permutation.resize(string_count); for (size_t idx = 0; idx < string_count; ++idx) permutation[idx] = idx; - std::shuffle(std::begin(permutation), std::end(permutation), rng); for (size_t unshuffled_idx = 0; unshuffled_idx < string_count; ++unshuffled_idx) { auto shuffled_idx = permutation[unshuffled_idx]; - auto src_pos = src_string_offsets[arr_prev_off + shuffled_idx - 1]; - size_t string_size = src_string_offsets[arr_prev_off + shuffled_idx] - src_pos; - memcpySmallAllowReadWriteOverflow15(&res_chars[string_prev_off], &src_data_chars[src_pos], string_size); string_prev_off += string_size; res_string_offsets[arr_prev_off + unshuffled_idx] = string_prev_off; } } - arr_prev_off = arr_off; } - return true; } else From bc97dcb763a09e714ec4e8be5c7bb1371c03289c Mon Sep 17 00:00:00 2001 From: Joanna Hulboj Date: Sat, 14 Jan 2023 11:21:12 +0000 Subject: [PATCH 03/21] Fix typo --- docs/en/sql-reference/functions/array-functions.md | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/docs/en/sql-reference/functions/array-functions.md b/docs/en/sql-reference/functions/array-functions.md index 8888a2f9256..2c9fc601f06 100644 --- a/docs/en/sql-reference/functions/array-functions.md +++ b/docs/en/sql-reference/functions/array-functions.md @@ -1109,7 +1109,7 @@ Returns an array of the same size as the original array containing the elements **Arguments** - `[arr]` — 
Input array. [Array](../data-types/array.md). -- 'random_seed` — Random seed manual override to produce stable results. Optional. [64 bit integer](../data-types/int-uint.md). +- `random_seed` — Random seed manual override to produce stable results. Optional. [64 bit integer](../data-types/int-uint.md). **Example** From 3c360fe96395421972700cb935322da14b587d3a Mon Sep 17 00:00:00 2001 From: Joanna Hulboj Date: Sat, 14 Jan 2023 21:01:40 +0000 Subject: [PATCH 04/21] FIXUP - function is documented test --- .../02415_all_new_functions_must_be_documented.reference | 1 + 1 file changed, 1 insertion(+) diff --git a/tests/queries/0_stateless/02415_all_new_functions_must_be_documented.reference b/tests/queries/0_stateless/02415_all_new_functions_must_be_documented.reference index d225cf5f332..3a7f3006d62 100644 --- a/tests/queries/0_stateless/02415_all_new_functions_must_be_documented.reference +++ b/tests/queries/0_stateless/02415_all_new_functions_must_be_documented.reference @@ -130,6 +130,7 @@ arrayReverse arrayReverseFill arrayReverseSort arrayReverseSplit +arrayShuffle arraySlice arraySort arraySplit From 01624e2f23a41ab52ea9f4a74e4fea1c01ac956d Mon Sep 17 00:00:00 2001 From: Joanna Hulboj Date: Sat, 14 Jan 2023 21:30:20 +0000 Subject: [PATCH 05/21] FIXUP: style --- src/Functions/array/arrayShuffle.cpp | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/src/Functions/array/arrayShuffle.cpp b/src/Functions/array/arrayShuffle.cpp index 9d2a1c416a2..79ffb0f41c5 100644 --- a/src/Functions/array/arrayShuffle.cpp +++ b/src/Functions/array/arrayShuffle.cpp @@ -54,9 +54,9 @@ public: { WhichDataType which(arguments[1]); if (!which.isUInt() && !which.isInt()) - throw Exception{ + throw Exception( "Illegal type " + arguments[1]->getName() + " of argument of function " + getName() + " (must be UInt or Int)", - ErrorCodes::ILLEGAL_TYPE_OF_ARGUMENT}; + ErrorCodes::ILLEGAL_TYPE_OF_ARGUMENT); } return arguments[0]; From 8d8d1bb8878122538e77e8329fbdf33163f9d034 Mon 
Sep 17 00:00:00 2001 From: Joanna Hulboj Date: Sun, 15 Jan 2023 10:38:44 +0000 Subject: [PATCH 06/21] FIXUP: make clang-tidy happier --- src/Functions/array/arrayShuffle.cpp | 16 +++++----------- 1 file changed, 5 insertions(+), 11 deletions(-) diff --git a/src/Functions/array/arrayShuffle.cpp b/src/Functions/array/arrayShuffle.cpp index 79ffb0f41c5..9d0bf8d0706 100644 --- a/src/Functions/array/arrayShuffle.cpp +++ b/src/Functions/array/arrayShuffle.cpp @@ -141,9 +141,8 @@ bool FunctionArrayShuffle::executeGeneric(const IColumn & src_data, const Column IColumn::Permutation permutation; ColumnArray::Offset prev_off = 0; - for (size_t i = 0; i < src_offsets.size(); ++i) + for (auto off: src_offsets) { - ColumnArray::Offset off = src_offsets[i]; size_t count = off - prev_off; permutation.resize(count); @@ -155,7 +154,7 @@ bool FunctionArrayShuffle::executeGeneric(const IColumn & src_data, const Column auto shuffled_idx = permutation[unshuffled_idx]; res_data.insertFrom(src_data, shuffled_idx); } - prev_off = src_offsets[i]; + prev_off = off; } return true; @@ -171,10 +170,8 @@ bool FunctionArrayShuffle::executeNumber(const IColumn & src_data, const ColumnA res_vec.resize(src_data.size()); ColumnArray::Offset prev_off = 0; - for (size_t i = 0; i < src_offsets.size(); ++i) + for (auto off: src_offsets) { - ColumnArray::Offset off = src_offsets[i]; - const auto * src = &src_vec[prev_off]; const auto * src_end = &src_vec[off]; if (src == src_end) @@ -204,10 +201,8 @@ bool FunctionArrayShuffle::executeFixedString(const IColumn & src_data, const Co IColumn::Permutation permutation; ColumnArray::Offset prev_off = 0; - for (size_t i = 0; i < src_offsets.size(); ++i) + for (auto off: src_offsets) { - ColumnArray::Offset off = src_offsets[i]; - const UInt8 * src = &src_data_chars[prev_off * n]; size_t count = off - prev_off; @@ -250,9 +245,8 @@ bool FunctionArrayShuffle::executeString(const IColumn & src_data, const ColumnA IColumn::Permutation permutation; ColumnArray::Offset 
arr_prev_off = 0; ColumnString::Offset string_prev_off = 0; - for (size_t i = 0; i < src_array_offsets.size(); ++i) + for (auto arr_off: src_array_offsets) { - ColumnArray::Offset arr_off = src_array_offsets[i]; if (arr_off != arr_prev_off) { size_t string_count = arr_off - arr_prev_off; From 000c19f05b97df45636a7623ef5ca4d646d5efd8 Mon Sep 17 00:00:00 2001 From: Joanna Hulboj Date: Mon, 16 Jan 2023 18:46:02 +0000 Subject: [PATCH 07/21] FIXUP: more tests - array of tuples --- tests/queries/0_stateless/02523_array_shuffle.reference | 2 ++ tests/queries/0_stateless/02523_array_shuffle.sql | 2 ++ 2 files changed, 4 insertions(+) diff --git a/tests/queries/0_stateless/02523_array_shuffle.reference b/tests/queries/0_stateless/02523_array_shuffle.reference index a84be39b50a..a92ad2a05c6 100644 --- a/tests/queries/0_stateless/02523_array_shuffle.reference +++ b/tests/queries/0_stateless/02523_array_shuffle.reference @@ -16,3 +16,5 @@ [[10,20,30,40],[1,2,3,4],[100,200,300,400,500,600,700,800,900],[2,4,8,16,32,64],[NULL,-2,-3,-4]] [10,72,11,18,73,76,46,71,44,35,9,0,97,53,13,32,51,30,3,68,5,48,67,90,20,27,38,19,54,21,83,84,1,22,56,81,91,77,36,63,33,39,24,40,4,99,14,23,94,29,26,96,2,28,31,57,42,88,12,47,58,8,37,82,92,34,6,60,25,43,50,74,70,52,55,62,17,79,65,93,86,7,16,41,59,75,80,45,69,89,85,87,95,64,61,98,49,78,66,15] [10,72,11,18,73,76,46,71,44,35,9,0,97,53,13,32,51,30,3,68,5,48,67,90,20,27,38,19,54,21,83,84,1,22,56,81,91,77,36,63,33,39,24,40,4,99,14,23,94,29,26,96,2,28,31,57,42,88,12,47,58,8,37,82,92,34,6,60,25,43,50,74,70,52,55,62,17,79,65,93,86,7,16,41,59,75,80,45,69,89,85,87,95,64,61,98,49,78,66,15] +[(3,-3),(1,-1),(99999999,-99999999)] +[(3,'A'),(1,NULL),(2,'a')] diff --git a/tests/queries/0_stateless/02523_array_shuffle.sql b/tests/queries/0_stateless/02523_array_shuffle.sql index 46bb95fdcec..ecbc9e649d4 100644 --- a/tests/queries/0_stateless/02523_array_shuffle.sql +++ b/tests/queries/0_stateless/02523_array_shuffle.sql @@ -16,6 +16,8 @@ SELECT 
arrayShuffle([[1,2,3,4],[-1,-2,-3,-4],[10,20,30,40],[100,200,300,400,500, SELECT arrayShuffle([[1,2,3,4],[NULL,-2,-3,-4],[10,20,30,40],[100,200,300,400,500,600,700,800,900],[2,4,8,16,32,64]], 0xbad_cafe); SELECT arrayShuffle(groupArray(x),0xbad_cafe) FROM (SELECT number as x from system.numbers LIMIT 100); SELECT arrayShuffle(groupArray(toUInt64(x)),0xbad_cafe) FROM (SELECT number as x from system.numbers LIMIT 100); +SELECT arrayShuffle([tuple(1, -1), tuple(99999999, -99999999), tuple(3, -3)], 0xbad_cafe); +SELECT arrayShuffle([tuple(1, NULL), tuple(2, 'a'), tuple(3, 'A')], 0xbad_cafe); SELECT arrayShuffle(1); -- { serverError ILLEGAL_TYPE_OF_ARGUMENT } SELECT arrayShuffle([1], 'a'); -- { serverError ILLEGAL_TYPE_OF_ARGUMENT } SELECT arrayShuffle([1], 1.1); -- { serverError ILLEGAL_TYPE_OF_ARGUMENT } From c09a4dd132f1120f85e19ebc692ca02531bfb57b Mon Sep 17 00:00:00 2001 From: Joanna Hulboj Date: Mon, 16 Jan 2023 18:56:47 +0000 Subject: [PATCH 08/21] FIXUP: Docs into code --- .../functions/array-functions.md | 25 ------------------- src/Functions/array/arrayShuffle.cpp | 19 +++++++++++++- 2 files changed, 18 insertions(+), 26 deletions(-) diff --git a/docs/en/sql-reference/functions/array-functions.md b/docs/en/sql-reference/functions/array-functions.md index 2c9fc601f06..9d2f89c1837 100644 --- a/docs/en/sql-reference/functions/array-functions.md +++ b/docs/en/sql-reference/functions/array-functions.md @@ -1102,31 +1102,6 @@ SELECT arrayReverse([1, 2, 3]) Synonym for [“arrayReverse”](#arrayreverse) -## arrayShuffle(arr[, random_seed]) - -Returns an array of the same size as the original array containing the elements in shuffled order. Elements are being reordered in such a way that each possible permutation of those elements has equal probability of appearance. - -**Arguments** - -- `[arr]` — Input array. [Array](../data-types/array.md). -- `random_seed` — Random seed manual override to produce stable results. Optional. [64 bit integer](../data-types/int-uint.md). 
- -**Example** - -Query: - -``` sql -SELECT arrayShuffle([1, 2, 3, 4], 41) -``` - -Result: - -``` text -┌─arrayShuffle([1, 2, 3, 4], 41)─┐ -│ [3,2,1,4] │ -└────────────────────────────────┘ -``` - ## arrayFlatten Converts an array of arrays to a flat array. diff --git a/src/Functions/array/arrayShuffle.cpp b/src/Functions/array/arrayShuffle.cpp index 9d0bf8d0706..a0daa9c08ee 100644 --- a/src/Functions/array/arrayShuffle.cpp +++ b/src/Functions/array/arrayShuffle.cpp @@ -277,7 +277,24 @@ bool FunctionArrayShuffle::executeString(const IColumn & src_data, const ColumnA REGISTER_FUNCTION(ArrayShuffle) { - factory.registerFunction(); + factory.registerFunction( + { + R"( +Returns an array of the same size as the original array containing the elements in shuffled order. +Elements are being reordered in such a way that each possible permutation of those elements has equal probability of appearance. + +If no seed is provided a random one will be used: +[example:random_seed] + +It is possible to override the seed to produce stable results: +[example:explicit_seed] +)", + Documentation::Examples{ + {"random_seed", "SELECT arrayShuffle([1, 2, 3, 4])"}, + {"explicit_seed", "SELECT arrayShuffle([1, 2, 3, 4], 41)"}}, + Documentation::Categories{"Array"} + }, + FunctionFactory::CaseInsensitive); } } From a65b2cf8615d64de82aa2b3d84522e23f77a8956 Mon Sep 17 00:00:00 2001 From: Joanna Hulboj Date: Mon, 16 Jan 2023 19:16:32 +0000 Subject: [PATCH 09/21] FIXUP: Simplify logic by using permute function --- src/Functions/array/arrayShuffle.cpp | 197 +++------------------------ 1 file changed, 16 insertions(+), 181 deletions(-) diff --git a/src/Functions/array/arrayShuffle.cpp b/src/Functions/array/arrayShuffle.cpp index a0daa9c08ee..8326ec1c196 100644 --- a/src/Functions/array/arrayShuffle.cpp +++ b/src/Functions/array/arrayShuffle.cpp @@ -12,6 +12,7 @@ #include #include +#include namespace DB { @@ -68,11 +69,7 @@ public: ColumnPtr executeImpl(const ColumnsWithTypeAndName & arguments, 
const DataTypePtr & result_type, size_t) const override; private: - template - static bool executeNumber(const IColumn & src_data, const ColumnArray::Offsets & src_offsets, IColumn & res_data, pcg64_fast &); - static bool executeFixedString(const IColumn & src_data, const ColumnArray::Offsets & src_offsets, IColumn & res_data, pcg64_fast & rng); - static bool executeString(const IColumn & src_data, const ColumnArray::Offsets & src_array_offsets, IColumn & res_data, pcg64_fast & rng); - static bool executeGeneric(const IColumn & src_data, const ColumnArray::Offsets & src_offsets, IColumn & res_data, pcg64_fast &); + static ColumnPtr executeGeneric(const ColumnArray & array, ColumnPtr mapped, pcg64_fast & rng); }; ColumnPtr FunctionArrayShuffle::executeImpl(const ColumnsWithTypeAndName & arguments, const DataTypePtr &, size_t) const @@ -82,21 +79,6 @@ ColumnPtr FunctionArrayShuffle::executeImpl(const ColumnsWithTypeAndName & argum throw Exception( "Illegal column " + arguments[0].column->getName() + " of first argument of function " + getName(), ErrorCodes::ILLEGAL_COLUMN); - auto res_ptr = array->cloneEmpty(); - ColumnArray & res = assert_cast(*res_ptr); - res.getOffsetsPtr() = array->getOffsetsPtr(); - - const IColumn & src_data = array->getData(); - const ColumnArray::Offsets & offsets = array->getOffsets(); - - IColumn & res_data = res.getData(); - - const ColumnNullable * src_nullable_col = typeid_cast(&src_data); - ColumnNullable * res_nullable_col = typeid_cast(&res_data); - - const IColumn * src_inner_col = src_nullable_col ? &src_nullable_col->getNestedColumn() : &src_data; - IColumn * res_inner_col = res_nullable_col ? 
&res_nullable_col->getNestedColumn() : &res_data; - const auto seed = [&]() -> uint64_t { if (arguments.size() == 1) @@ -106,173 +88,26 @@ ColumnPtr FunctionArrayShuffle::executeImpl(const ColumnsWithTypeAndName & argum }(); pcg64_fast rng(seed); - false // NOLINT - || executeNumber(*src_inner_col, offsets, *res_inner_col, rng) - || executeNumber(*src_inner_col, offsets, *res_inner_col, rng) - || executeNumber(*src_inner_col, offsets, *res_inner_col, rng) - || executeNumber(*src_inner_col, offsets, *res_inner_col, rng) - || executeNumber(*src_inner_col, offsets, *res_inner_col, rng) - || executeNumber(*src_inner_col, offsets, *res_inner_col, rng) - || executeNumber(*src_inner_col, offsets, *res_inner_col, rng) - || executeNumber(*src_inner_col, offsets, *res_inner_col, rng) - || executeNumber(*src_inner_col, offsets, *res_inner_col, rng) - || executeNumber(*src_inner_col, offsets, *res_inner_col, rng) - || executeString(*src_inner_col, offsets, *res_inner_col, rng) - || executeFixedString(*src_inner_col, offsets, *res_inner_col, rng) - || executeGeneric(*src_inner_col, offsets, *res_inner_col, rng); - - if (src_nullable_col) - { - rng.seed(seed); - if (!executeNumber(src_nullable_col->getNullMapColumn(), offsets, res_nullable_col->getNullMapColumn(), rng)) - throw Exception( - "Illegal column " + src_nullable_col->getNullMapColumn().getName() + " of null map of the first argument of function " - + getName(), - ErrorCodes::ILLEGAL_COLUMN); - } - - return res_ptr; + return executeGeneric(*array, array->getDataPtr(), rng); } -bool FunctionArrayShuffle::executeGeneric(const IColumn & src_data, const ColumnArray::Offsets & src_offsets, IColumn & res_data, pcg64_fast & rng) +ColumnPtr FunctionArrayShuffle::executeGeneric(const ColumnArray & array, ColumnPtr /*mapped*/, pcg64_fast & rng) { - size_t size = src_offsets.size(); - res_data.reserve(size); + const ColumnArray::Offsets & offsets = array.getOffsets(); - IColumn::Permutation permutation; - ColumnArray::Offset 
prev_off = 0; - for (auto off: src_offsets) + size_t size = offsets.size(); + size_t nested_size = array.getData().size(); + IColumn::Permutation permutation(nested_size); + std::iota(std::begin(permutation), std::end(permutation), 0); + + ColumnArray::Offset current_offset = 0; + for (size_t i = 0; i < size; ++i) { - size_t count = off - prev_off; - - permutation.resize(count); - for (size_t idx = 0; idx < count; ++idx) - permutation[idx] = idx; - std::shuffle(std::begin(permutation), std::end(permutation), rng); - for (size_t unshuffled_idx = 0; unshuffled_idx != count; ++unshuffled_idx) - { - auto shuffled_idx = permutation[unshuffled_idx]; - res_data.insertFrom(src_data, shuffled_idx); - } - prev_off = off; + auto next_offset = offsets[i]; + std::shuffle(&permutation[current_offset], &permutation[next_offset], rng); + current_offset = next_offset; } - - return true; -} - -template -bool FunctionArrayShuffle::executeNumber(const IColumn & src_data, const ColumnArray::Offsets & src_offsets, IColumn & res_data, pcg64_fast & rng) -{ - if (const ColumnVector * src_data_concrete = checkAndGetColumn>(&src_data)) - { - const PaddedPODArray & src_vec = src_data_concrete->getData(); - PaddedPODArray & res_vec = typeid_cast &>(res_data).getData(); - res_vec.resize(src_data.size()); - - ColumnArray::Offset prev_off = 0; - for (auto off: src_offsets) - { - const auto * src = &src_vec[prev_off]; - const auto * src_end = &src_vec[off]; - if (src == src_end) - continue; - auto * dst = &res_vec[prev_off]; - size_t count = off - prev_off; - memcpy(dst, src, count * sizeof(T)); - - std::shuffle(dst, dst + count, rng); - - prev_off = off; - } - return true; - } - else - return false; -} - -bool FunctionArrayShuffle::executeFixedString(const IColumn & src_data, const ColumnArray::Offsets & src_offsets, IColumn & res_data, pcg64_fast & rng) -{ - if (const ColumnFixedString * src_data_concrete = checkAndGetColumn(&src_data)) - { - const size_t n = src_data_concrete->getN(); - const 
ColumnFixedString::Chars & src_data_chars = src_data_concrete->getChars(); - ColumnFixedString::Chars & res_chars = typeid_cast(res_data).getChars(); - res_chars.resize(src_data_chars.size()); - - IColumn::Permutation permutation; - ColumnArray::Offset prev_off = 0; - for (auto off: src_offsets) - { - const UInt8 * src = &src_data_chars[prev_off * n]; - size_t count = off - prev_off; - - if (count == 0) - continue; - - UInt8 * dst = &res_chars[prev_off * n]; - - permutation.resize(count); - for (size_t idx = 0; idx < count; ++idx) - permutation[idx] = idx; - std::shuffle(std::begin(permutation), std::end(permutation), rng); - - for (size_t unshuffled_idx = 0; unshuffled_idx != count; ++unshuffled_idx) - { - auto shuffled_idx = permutation[unshuffled_idx]; - memcpy(dst + unshuffled_idx * n, src + shuffled_idx * n, n); - } - prev_off = off; - } - return true; - } - else - return false; -} - -bool FunctionArrayShuffle::executeString(const IColumn & src_data, const ColumnArray::Offsets & src_array_offsets, IColumn & res_data, pcg64_fast & rng) -{ - if (const ColumnString * src_data_concrete = checkAndGetColumn(&src_data)) - { - const ColumnString::Offsets & src_string_offsets = src_data_concrete->getOffsets(); - ColumnString::Offsets & res_string_offsets = typeid_cast(res_data).getOffsets(); - - const ColumnString::Chars & src_data_chars = src_data_concrete->getChars(); - ColumnString::Chars & res_chars = typeid_cast(res_data).getChars(); - - res_string_offsets.resize(src_string_offsets.size()); - res_chars.resize(src_data_chars.size()); - - IColumn::Permutation permutation; - ColumnArray::Offset arr_prev_off = 0; - ColumnString::Offset string_prev_off = 0; - for (auto arr_off: src_array_offsets) - { - if (arr_off != arr_prev_off) - { - size_t string_count = arr_off - arr_prev_off; - - permutation.resize(string_count); - for (size_t idx = 0; idx < string_count; ++idx) - permutation[idx] = idx; - std::shuffle(std::begin(permutation), std::end(permutation), rng); - - for 
(size_t unshuffled_idx = 0; unshuffled_idx < string_count; ++unshuffled_idx) - { - auto shuffled_idx = permutation[unshuffled_idx]; - auto src_pos = src_string_offsets[arr_prev_off + shuffled_idx - 1]; - size_t string_size = src_string_offsets[arr_prev_off + shuffled_idx] - src_pos; - memcpySmallAllowReadWriteOverflow15(&res_chars[string_prev_off], &src_data_chars[src_pos], string_size); - - string_prev_off += string_size; - res_string_offsets[arr_prev_off + unshuffled_idx] = string_prev_off; - } - } - arr_prev_off = arr_off; - } - return true; - } - else - return false; + return ColumnArray::create(array.getData().permute(permutation, 0), array.getOffsetsPtr()); } REGISTER_FUNCTION(ArrayShuffle) From b9bd0ed4f6985c7d5e941f854d4bafc2c6010249 Mon Sep 17 00:00:00 2001 From: Joanna Hulboj Date: Tue, 17 Jan 2023 19:51:29 +0000 Subject: [PATCH 10/21] FIXUP: after in-code documentation update --- .../02415_all_new_functions_must_be_documented.reference | 1 - 1 file changed, 1 deletion(-) diff --git a/tests/queries/0_stateless/02415_all_new_functions_must_be_documented.reference b/tests/queries/0_stateless/02415_all_new_functions_must_be_documented.reference index 3a7f3006d62..d225cf5f332 100644 --- a/tests/queries/0_stateless/02415_all_new_functions_must_be_documented.reference +++ b/tests/queries/0_stateless/02415_all_new_functions_must_be_documented.reference @@ -130,7 +130,6 @@ arrayReverse arrayReverseFill arrayReverseSort arrayReverseSplit -arrayShuffle arraySlice arraySort arraySplit From 09789b027f3ccaf09566a34f1780427bf0f5aa99 Mon Sep 17 00:00:00 2001 From: Joanna Hulboj Date: Tue, 17 Jan 2023 19:51:53 +0000 Subject: [PATCH 11/21] FIXUP: PR comments --- src/Functions/array/arrayShuffle.cpp | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) diff --git a/src/Functions/array/arrayShuffle.cpp b/src/Functions/array/arrayShuffle.cpp index 8326ec1c196..3941eb7271d 100644 --- a/src/Functions/array/arrayShuffle.cpp +++ b/src/Functions/array/arrayShuffle.cpp @@ 
-38,18 +38,19 @@ public: String getName() const override { return name; } bool isVariadic() const override { return true; } size_t getNumberOfArguments() const override { return 0; } + ColumnNumbers getArgumentsThatAreAlwaysConstant() const override { return {1}; } DataTypePtr getReturnTypeImpl(const DataTypes & arguments) const override { if (arguments.size() > 2 || arguments.empty()) { throw Exception( - ErrorCodes::NUMBER_OF_ARGUMENTS_DOESNT_MATCH, "Function {} needs 1..2 arguments; passed {}.", getName(), arguments.size()); + ErrorCodes::NUMBER_OF_ARGUMENTS_DOESNT_MATCH, "Function '{}' needs 1 or 2 arguments, passed {}.", getName(), arguments.size()); } const DataTypeArray * array_type = checkAndGetDataType(arguments[0].get()); if (!array_type) - throw Exception("Argument for function " + getName() + " must be array.", ErrorCodes::ILLEGAL_TYPE_OF_ARGUMENT); + throw Exception(ErrorCodes::ILLEGAL_TYPE_OF_ARGUMENT, "First argument of function '{}' must be array", getName()); if (arguments.size() == 2) { From 31eb936457902c91cdf1beb2273e667a9a1f7b4c Mon Sep 17 00:00:00 2001 From: Joanna Hulboj Date: Fri, 20 Jan 2023 18:40:00 +0000 Subject: [PATCH 12/21] Added Fisher-Yates shuffle and partial-shuffle --- src/Common/shuffle.h | 46 ++++++++++++++++++++++++++++++++++++++++++++ 1 file changed, 46 insertions(+) create mode 100644 src/Common/shuffle.h diff --git a/src/Common/shuffle.h b/src/Common/shuffle.h new file mode 100644 index 00000000000..f2477db0352 --- /dev/null +++ b/src/Common/shuffle.h @@ -0,0 +1,46 @@ +#pragma once + +#include +#include +#include + +/* Reorders the elements in the given range [first, last) such that each + * possible permutation of those elements has equal probability of appearance. 
+ */ +template +void shuffle(Iter first, Iter last, Rng && rng) +{ + using diff_t = typename std::iterator_traits::difference_type; + using distr_t = std::uniform_int_distribution; + using param_t = typename distr_t::param_type; + distr_t d; + diff_t n = last - first; + for (ssize_t i = 0; i < n - 1; ++i) + { + using std::swap; + auto j = d(rng, param_t(i, n - 1)); + swap(first[i], first[j]); + } +} + + +/* Partially shuffle elements in range [first, last) in such a way that + * [first, first + limit) is a random subset of the original range. + * [first + limit, last) shall contain the elements not in [first, first + limit) + * in undefined order. + */ +template +void partial_shuffle(Iter first, Iter last, size_t limit, Rng && rng) +{ + using diff_t = typename std::iterator_traits::difference_type; + using distr_t = std::uniform_int_distribution; + using param_t = typename distr_t::param_type; + distr_t d; + diff_t n = last - first; + for (size_t i = 0; i < limit; ++i) + { + using std::swap; + auto j = d(rng, param_t(i, n - 1)); + swap(first[i], first[j]); + } +} From a8b78abc543863ee404395c9b24145a28c6aee57 Mon Sep 17 00:00:00 2001 From: Joanna Hulboj Date: Fri, 20 Jan 2023 20:39:33 +0000 Subject: [PATCH 13/21] Added arrayPartialShuffle function --- src/Functions/array/arrayShuffle.cpp | 125 ++++++++++++++---- .../0_stateless/02523_array_shuffle.reference | 42 ++++++ .../0_stateless/02523_array_shuffle.sql | 42 ++++++ 3 files changed, 186 insertions(+), 23 deletions(-) diff --git a/src/Functions/array/arrayShuffle.cpp b/src/Functions/array/arrayShuffle.cpp index 3941eb7271d..47608a8524e 100644 --- a/src/Functions/array/arrayShuffle.cpp +++ b/src/Functions/array/arrayShuffle.cpp @@ -6,11 +6,13 @@ #include #include #include -#include #include #include +#include #include +#include + #include #include @@ -28,52 +30,83 @@ namespace ErrorCodes * arrayShuffle(arr) * arrayShuffle(arr, seed) */ -class FunctionArrayShuffle : public IFunction +struct 
FunctionArrayShuffleTraits +{ + static constexpr auto name = "arrayShuffle"; + static constexpr auto has_limit = false; // Permute the whole array + static ColumnNumbers getArgumentsThatAreAlwaysConstant() { return {1}; } + static constexpr auto max_num_params = 2; // array[, seed] + static constexpr auto seed_param_idx = 1; +}; + +/** Partial shuffle array elements + * arrayPartialShuffle(arr) + * arrayPartialShuffle(arr, limit) + * arrayPartialShuffle(arr, limit, seed) + */ +struct FunctionArrayPartialShuffleTraits +{ + static constexpr auto name = "arrayPartialShuffle"; + static constexpr auto has_limit = true; + static ColumnNumbers getArgumentsThatAreAlwaysConstant() { return {1, 2}; } + static constexpr auto max_num_params = 3; // array[, limit[, seed]] + static constexpr auto seed_param_idx = 2; +}; + +template +class FunctionArrayShuffleImpl : public IFunction { public: - static constexpr auto name = "arrayShuffle"; - - static FunctionPtr create(ContextPtr) { return std::make_shared(); } + static constexpr auto name = Traits::name; String getName() const override { return name; } bool isVariadic() const override { return true; } size_t getNumberOfArguments() const override { return 0; } - ColumnNumbers getArgumentsThatAreAlwaysConstant() const override { return {1}; } + ColumnNumbers getArgumentsThatAreAlwaysConstant() const override { return Traits::getArgumentsThatAreAlwaysConstant(); } + bool useDefaultImplementationForConstants() const override { return true; } + bool isSuitableForShortCircuitArgumentsExecution(const DataTypesWithConstInfo & /*arguments*/) const override { return true; } + + static FunctionPtr create(ContextPtr) { return std::make_shared>(); } DataTypePtr getReturnTypeImpl(const DataTypes & arguments) const override { - if (arguments.size() > 2 || arguments.empty()) + if (arguments.size() > Traits::max_num_params || arguments.empty()) { throw Exception( - ErrorCodes::NUMBER_OF_ARGUMENTS_DOESNT_MATCH, "Function '{}' needs 1 or 2 
arguments, passed {}.", getName(), arguments.size()); + ErrorCodes::NUMBER_OF_ARGUMENTS_DOESNT_MATCH, + "Function '{}' needs from 1 to {} arguments, passed {}.", + getName(), + Traits::max_num_params, + arguments.size()); } const DataTypeArray * array_type = checkAndGetDataType(arguments[0].get()); if (!array_type) throw Exception(ErrorCodes::ILLEGAL_TYPE_OF_ARGUMENT, "First argument of function '{}' must be array", getName()); - if (arguments.size() == 2) + auto check_is_integral = [&](auto param_idx) { - WhichDataType which(arguments[1]); + WhichDataType which(arguments[param_idx]); if (!which.isUInt() && !which.isInt()) throw Exception( - "Illegal type " + arguments[1]->getName() + " of argument of function " + getName() + " (must be UInt or Int)", + "Illegal type " + arguments[param_idx]->getName() + " of argument of function " + getName() + " (must be UInt or Int)", ErrorCodes::ILLEGAL_TYPE_OF_ARGUMENT); - } + }; + + for (size_t idx = 1; idx < arguments.size(); ++idx) + check_is_integral(idx); return arguments[0]; } - bool useDefaultImplementationForConstants() const override { return true; } - bool isSuitableForShortCircuitArgumentsExecution(const DataTypesWithConstInfo & /*arguments*/) const override { return true; } - ColumnPtr executeImpl(const ColumnsWithTypeAndName & arguments, const DataTypePtr & result_type, size_t) const override; private: - static ColumnPtr executeGeneric(const ColumnArray & array, ColumnPtr mapped, pcg64_fast & rng); + static ColumnPtr executeGeneric(const ColumnArray & array, pcg64_fast & rng, size_t limit); }; -ColumnPtr FunctionArrayShuffle::executeImpl(const ColumnsWithTypeAndName & arguments, const DataTypePtr &, size_t) const +template +ColumnPtr FunctionArrayShuffleImpl::executeImpl(const ColumnsWithTypeAndName & arguments, const DataTypePtr &, size_t) const { const ColumnArray * array = checkAndGetColumn(arguments[0].column.get()); if (!array) @@ -82,17 +115,32 @@ ColumnPtr FunctionArrayShuffle::executeImpl(const 
ColumnsWithTypeAndName & argum const auto seed = [&]() -> uint64_t { - if (arguments.size() == 1) + // If present, seed comes as the last argument + if (arguments.size() != Traits::max_num_params) return randomSeed(); - const auto * val = arguments[1].column.get(); + const auto * val = arguments[Traits::seed_param_idx].column.get(); return val->getUInt(0); }(); pcg64_fast rng(seed); - return executeGeneric(*array, array->getDataPtr(), rng); + size_t limit = [&] + { + if constexpr (Traits::has_limit) + { + if (arguments.size() > 1) + { + const auto * val = arguments[1].column.get(); + return val->getUInt(0); + } + } + return static_cast(0); + }(); + + return executeGeneric(*array, rng, limit); } -ColumnPtr FunctionArrayShuffle::executeGeneric(const ColumnArray & array, ColumnPtr /*mapped*/, pcg64_fast & rng) +template +ColumnPtr FunctionArrayShuffleImpl::executeGeneric(const ColumnArray & array, pcg64_fast & rng, size_t limit [[maybe_unused]]) { const ColumnArray::Offsets & offsets = array.getOffsets(); @@ -105,7 +153,15 @@ ColumnPtr FunctionArrayShuffle::executeGeneric(const ColumnArray & array, Column for (size_t i = 0; i < size; ++i) { auto next_offset = offsets[i]; - std::shuffle(&permutation[current_offset], &permutation[next_offset], rng); + if constexpr (Traits::has_limit) + { + if (limit && next_offset > limit) + { + partial_shuffle(&permutation[current_offset], &permutation[next_offset], limit, rng); + break; + } + } + shuffle(&permutation[current_offset], &permutation[next_offset], rng); current_offset = next_offset; } return ColumnArray::create(array.getData().permute(permutation, 0), array.getOffsetsPtr()); @@ -113,7 +169,7 @@ ColumnPtr FunctionArrayShuffle::executeGeneric(const ColumnArray & array, Column REGISTER_FUNCTION(ArrayShuffle) { - factory.registerFunction( + factory.registerFunction>( { R"( Returns an array of the same size as the original array containing the elements in shuffled order. 
@@ -131,6 +187,29 @@ It is possible to override the seed to produce stable results: Documentation::Categories{"Array"} }, FunctionFactory::CaseInsensitive); + factory.registerFunction>( + { + R"( +Returns an array of the same size as the original array where elements in range [0..limit) are a random +subset of the original array. Remaining [limit..n) shall contain the elements not in [0..limit) range in undefined order. +Value of limit shall be in range [0..n]. Values outside of that range are equivalent to performing full arrayShuffle: +[example:no_limit1] +[example:no_limit2] + +If no seed is provided a random one will be used: +[example:random_seed] + +It is possible to override the seed to produce stable results: +[example:explicit_seed] +)", + Documentation::Examples{ + {"no_limit1", "SELECT arrayPartialShuffle([1, 2, 3, 4], 0)"}, + {"no_limit2", "SELECT arrayPartialShuffle([1, 2, 3, 4])"}, + {"random_seed", "SELECT arrayPartialShuffle([1, 2, 3, 4], 2)"}, + {"explicit_seed", "SELECT arrayShuffle([1, 2, 3, 4], 2, 41)"}}, + Documentation::Categories{"Array"} + }, + FunctionFactory::CaseInsensitive); } } diff --git a/tests/queries/0_stateless/02523_array_shuffle.reference b/tests/queries/0_stateless/02523_array_shuffle.reference index a92ad2a05c6..2263f8dc92a 100644 --- a/tests/queries/0_stateless/02523_array_shuffle.reference +++ b/tests/queries/0_stateless/02523_array_shuffle.reference @@ -18,3 +18,45 @@ [10,72,11,18,73,76,46,71,44,35,9,0,97,53,13,32,51,30,3,68,5,48,67,90,20,27,38,19,54,21,83,84,1,22,56,81,91,77,36,63,33,39,24,40,4,99,14,23,94,29,26,96,2,28,31,57,42,88,12,47,58,8,37,82,92,34,6,60,25,43,50,74,70,52,55,62,17,79,65,93,86,7,16,41,59,75,80,45,69,89,85,87,95,64,61,98,49,78,66,15] [(3,-3),(1,-1),(99999999,-99999999)] [(3,'A'),(1,NULL),(2,'a')] +[] +[] +[] +[9223372036854775808] +[9223372036854775808] +[9223372036854775808] +[10,9,4,2,5,6,7,1,8,3] +[10.1,9,4,2,5,6,7,1,8,3] +[9223372036854775808,9,4,2,5,6,7,1,8,3] +[NULL,9,4,2,5,6,7,1,8,3] 
+['789','123','ABC','000','456'] +['789','123','ABC',NULL,'456'] +['imposter','storage','sensation','uniform','tiger','terminal'] +[NULL,'storage','sensation','uniform','tiger','terminal'] +[NULL] +[NULL,NULL] +[[10,20,30,40],[1,2,3,4],[100,200,300,400,500,600,700,800,900],[2,4,8,16,32,64],[-1,-2,-3,-4]] +[[10,20,30,40],[1,2,3,4],[100,200,300,400,500,600,700,800,900],[2,4,8,16,32,64],[NULL,-2,-3,-4]] +[10,72,11,18,73,76,46,71,44,35,9,0,97,53,13,32,51,30,3,68,5,48,67,90,20,27,38,19,54,21,83,84,1,22,56,81,91,77,36,63,33,39,24,40,4,99,14,23,94,29,26,96,2,28,31,57,42,88,12,47,58,8,37,82,92,34,6,60,25,43,50,74,70,52,55,62,17,79,65,93,86,7,16,41,59,75,80,45,69,89,85,87,95,64,61,98,49,78,66,15] +[10,72,11,18,73,76,46,71,44,35,9,0,97,53,13,32,51,30,3,68,5,48,67,90,20,27,38,19,54,21,83,84,1,22,56,81,91,77,36,63,33,39,24,40,4,99,14,23,94,29,26,96,2,28,31,57,42,88,12,47,58,8,37,82,92,34,6,60,25,43,50,74,70,52,55,62,17,79,65,93,86,7,16,41,59,75,80,45,69,89,85,87,95,64,61,98,49,78,66,15] +[(3,-3),(1,-1),(99999999,-99999999)] +[(3,'A'),(1,NULL),(2,'a')] +[NULL,NULL,NULL] +[10,2,3,4,5,6,7,8,9,1] +[10,9,3,4,5,6,7,8,2,1] +[10,9,4,2,5,6,7,8,3,1] +[10,9,4,2,5,6,7,1,3,8] +[10,9,4,2,5,6,7,1,8,3] +[10,9,4,2,5,6,7,1,8,3] +[10.1,9,4,2,5,6,7,8,3,1] +[9223372036854775808,9,4,2,5,6,7,8,3,1] +[NULL,9,4,2,5,6,7,8,3,1] +['789','123','ABC','456','000'] +['789','123','ABC','456',NULL] +['imposter','storage','sensation','terminal','uniform','tiger'] +[NULL,'storage','sensation','terminal','uniform','tiger'] +[[10,20,30,40],[1,2,3,4],[-1,-2,-3,-4],[100,200,300,400,500,600,700,800,900],[2,4,8,16,32,64]] +[[10,20,30,40],[1,2,3,4],[NULL,-2,-3,-4],[100,200,300,400,500,600,700,800,900],[2,4,8,16,32,64]] +[10,72,11,18,73,76,46,71,44,35,9,0,97,53,13,32,51,30,3,68,20,21,22,23,24,25,26,27,28,29,17,31,15,33,34,2,36,37,38,39,40,41,42,43,8,45,6,47,48,49,50,16,52,14,54,55,56,57,58,59,60,61,62,63,64,65,66,67,19,69,70,7,1,4,74,75,5,77,78,79,80,81,82,83,84,85,86,87,88,89,90,91,92,93,94,95,96,12,98,99] 
+[10,72,11,18,73,76,46,71,44,35,9,0,97,53,13,32,51,30,3,68,20,21,22,23,24,25,26,27,28,29,17,31,15,33,34,2,36,37,38,39,40,41,42,43,8,45,6,47,48,49,50,16,52,14,54,55,56,57,58,59,60,61,62,63,64,65,66,67,19,69,70,7,1,4,74,75,5,77,78,79,80,81,82,83,84,85,86,87,88,89,90,91,92,93,94,95,96,12,98,99] +[(3,-3),(1,-1),(99999999,-99999999)] +[(3,'A'),(1,NULL),(2,'a')] diff --git a/tests/queries/0_stateless/02523_array_shuffle.sql b/tests/queries/0_stateless/02523_array_shuffle.sql index ecbc9e649d4..dfeb75e01c5 100644 --- a/tests/queries/0_stateless/02523_array_shuffle.sql +++ b/tests/queries/0_stateless/02523_array_shuffle.sql @@ -18,6 +18,48 @@ SELECT arrayShuffle(groupArray(x),0xbad_cafe) FROM (SELECT number as x from syst SELECT arrayShuffle(groupArray(toUInt64(x)),0xbad_cafe) FROM (SELECT number as x from system.numbers LIMIT 100); SELECT arrayShuffle([tuple(1, -1), tuple(99999999, -99999999), tuple(3, -3)], 0xbad_cafe); SELECT arrayShuffle([tuple(1, NULL), tuple(2, 'a'), tuple(3, 'A')], 0xbad_cafe); +SELECT arrayPartialShuffle([]); -- trivial cases (equivalent to arrayShuffle) +SELECT arrayPartialShuffle([], 0); +SELECT arrayPartialShuffle([], 0, 0xbad_cafe); +SELECT arrayPartialShuffle([9223372036854775808]); +SELECT arrayPartialShuffle([9223372036854775808], 0); +SELECT arrayPartialShuffle([9223372036854775808], 0, 0xbad_cafe); +SELECT arrayPartialShuffle([1,2,3,4,5,6,7,8,9,10], 0, 0xbad_cafe); +SELECT arrayPartialShuffle([1,2,3,4,5,6,7,8,9,10.1], 0, 0xbad_cafe); +SELECT arrayPartialShuffle([1,2,3,4,5,6,7,8,9,9223372036854775808], 0, 0xbad_cafe); +SELECT arrayPartialShuffle([1,2,3,4,5,6,7,8,9,NULL], 0, 0xbad_cafe); +SELECT arrayPartialShuffle([toFixedString('123', 3), toFixedString('456', 3), toFixedString('789', 3), toFixedString('ABC', 3), toFixedString('000', 3)], 0, 0xbad_cafe); +SELECT arrayPartialShuffle([toFixedString('123', 3), toFixedString('456', 3), toFixedString('789', 3), toFixedString('ABC', 3), NULL], 0, 0xbad_cafe); +SELECT 
arrayPartialShuffle(['storage','tiger','imposter','terminal','uniform','sensation'], 0, 0xbad_cafe); +SELECT arrayPartialShuffle(['storage','tiger',NULL,'terminal','uniform','sensation'], 0, 0xbad_cafe); +SELECT arrayPartialShuffle([NULL]); +SELECT arrayPartialShuffle([NULL,NULL]); +SELECT arrayPartialShuffle([[1,2,3,4],[-1,-2,-3,-4],[10,20,30,40],[100,200,300,400,500,600,700,800,900],[2,4,8,16,32,64]], 0, 0xbad_cafe); +SELECT arrayPartialShuffle([[1,2,3,4],[NULL,-2,-3,-4],[10,20,30,40],[100,200,300,400,500,600,700,800,900],[2,4,8,16,32,64]], 0, 0xbad_cafe); +SELECT arrayPartialShuffle(groupArray(x),0,0xbad_cafe) FROM (SELECT number as x from system.numbers LIMIT 100); +SELECT arrayPartialShuffle(groupArray(toUInt64(x)),0,0xbad_cafe) FROM (SELECT number as x from system.numbers LIMIT 100); +SELECT arrayPartialShuffle([tuple(1, -1), tuple(99999999, -99999999), tuple(3, -3)], 0, 0xbad_cafe); +SELECT arrayPartialShuffle([tuple(1, NULL), tuple(2, 'a'), tuple(3, 'A')], 0, 0xbad_cafe); +SELECT arrayPartialShuffle([NULL,NULL,NULL], 2); -- other, mostly non-trivial cases +SELECT arrayPartialShuffle([1,2,3,4,5,6,7,8,9,10], 1, 0xbad_cafe); +SELECT arrayPartialShuffle([1,2,3,4,5,6,7,8,9,10], 2, 0xbad_cafe); +SELECT arrayPartialShuffle([1,2,3,4,5,6,7,8,9,10], 4, 0xbad_cafe); +SELECT arrayPartialShuffle([1,2,3,4,5,6,7,8,9,10], 8, 0xbad_cafe); +SELECT arrayPartialShuffle([1,2,3,4,5,6,7,8,9,10], 9, 0xbad_cafe); +SELECT arrayPartialShuffle([1,2,3,4,5,6,7,8,9,10], 10, 0xbad_cafe); +SELECT arrayPartialShuffle([1,2,3,4,5,6,7,8,9,10.1], 4, 0xbad_cafe); +SELECT arrayPartialShuffle([1,2,3,4,5,6,7,8,9,9223372036854775808], 4, 0xbad_cafe); +SELECT arrayPartialShuffle([1,2,3,4,5,6,7,8,9,NULL], 4, 0xbad_cafe); +SELECT arrayPartialShuffle([toFixedString('123', 3), toFixedString('456', 3), toFixedString('789', 3), toFixedString('ABC', 3), toFixedString('000', 3)], 3, 0xbad_cafe); +SELECT arrayPartialShuffle([toFixedString('123', 3), toFixedString('456', 3), toFixedString('789', 3), 
toFixedString('ABC', 3), NULL], 3, 0xbad_cafe); +SELECT arrayPartialShuffle(['storage','tiger','imposter','terminal','uniform','sensation'], 3, 0xbad_cafe); +SELECT arrayPartialShuffle(['storage','tiger',NULL,'terminal','uniform','sensation'], 3, 0xbad_cafe); +SELECT arrayPartialShuffle([[1,2,3,4],[-1,-2,-3,-4],[10,20,30,40],[100,200,300,400,500,600,700,800,900],[2,4,8,16,32,64]], 2, 0xbad_cafe); +SELECT arrayPartialShuffle([[1,2,3,4],[NULL,-2,-3,-4],[10,20,30,40],[100,200,300,400,500,600,700,800,900],[2,4,8,16,32,64]], 2, 0xbad_cafe); +SELECT arrayPartialShuffle(groupArray(x),20,0xbad_cafe) FROM (SELECT number as x from system.numbers LIMIT 100); +SELECT arrayPartialShuffle(groupArray(toUInt64(x)),20,0xbad_cafe) FROM (SELECT number as x from system.numbers LIMIT 100); +SELECT arrayPartialShuffle([tuple(1, -1), tuple(99999999, -99999999), tuple(3, -3)], 2, 0xbad_cafe); +SELECT arrayPartialShuffle([tuple(1, NULL), tuple(2, 'a'), tuple(3, 'A')], 2, 0xbad_cafe); SELECT arrayShuffle(1); -- { serverError ILLEGAL_TYPE_OF_ARGUMENT } SELECT arrayShuffle([1], 'a'); -- { serverError ILLEGAL_TYPE_OF_ARGUMENT } SELECT arrayShuffle([1], 1.1); -- { serverError ILLEGAL_TYPE_OF_ARGUMENT } From d378e453c14987a1297b5b3f11038ab785b3c751 Mon Sep 17 00:00:00 2001 From: Joanna Hulboj Date: Sat, 21 Jan 2023 11:20:56 +0000 Subject: [PATCH 14/21] FIXUP: fix in arrayPartialShuffle --- src/Functions/array/arrayShuffle.cpp | 10 +++++----- 1 file changed, 5 insertions(+), 5 deletions(-) diff --git a/src/Functions/array/arrayShuffle.cpp b/src/Functions/array/arrayShuffle.cpp index 47608a8524e..9f95c7f67f1 100644 --- a/src/Functions/array/arrayShuffle.cpp +++ b/src/Functions/array/arrayShuffle.cpp @@ -155,13 +155,13 @@ ColumnPtr FunctionArrayShuffleImpl::executeGeneric(const ColumnArray & a auto next_offset = offsets[i]; if constexpr (Traits::has_limit) { - if (limit && next_offset > limit) - { + if (limit) partial_shuffle(&permutation[current_offset], &permutation[next_offset], limit, rng); - 
break; - } + else + shuffle(&permutation[current_offset], &permutation[next_offset], rng); } - shuffle(&permutation[current_offset], &permutation[next_offset], rng); + else + shuffle(&permutation[current_offset], &permutation[next_offset], rng); current_offset = next_offset; } return ColumnArray::create(array.getData().permute(permutation, 0), array.getOffsetsPtr()); From b0ba8c02bef755e22e37c3eca192381607461fe2 Mon Sep 17 00:00:00 2001 From: Joanna Hulboj Date: Sat, 21 Jan 2023 11:35:17 +0000 Subject: [PATCH 15/21] FIXUP: Darwin compilation issue --- src/Functions/array/arrayShuffle.cpp | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/src/Functions/array/arrayShuffle.cpp b/src/Functions/array/arrayShuffle.cpp index 9f95c7f67f1..b2432e650c4 100644 --- a/src/Functions/array/arrayShuffle.cpp +++ b/src/Functions/array/arrayShuffle.cpp @@ -123,7 +123,7 @@ ColumnPtr FunctionArrayShuffleImpl::executeImpl(const ColumnsWithTypeAnd }(); pcg64_fast rng(seed); - size_t limit = [&] + size_t limit = [&]() -> size_t { if constexpr (Traits::has_limit) { @@ -133,7 +133,7 @@ ColumnPtr FunctionArrayShuffleImpl::executeImpl(const ColumnsWithTypeAnd return val->getUInt(0); } } - return static_cast(0); + return 0; }(); return executeGeneric(*array, rng, limit); From e1d281206f3696768fd89afb31196c522c2104db Mon Sep 17 00:00:00 2001 From: Joanna Hulboj Date: Sat, 21 Jan 2023 14:08:30 +0000 Subject: [PATCH 16/21] Clamp the limit for arrayPartialShuffle --- src/Functions/array/arrayShuffle.cpp | 5 ++++- tests/queries/0_stateless/02523_array_shuffle.reference | 1 + tests/queries/0_stateless/02523_array_shuffle.sql | 1 + 3 files changed, 6 insertions(+), 1 deletion(-) diff --git a/src/Functions/array/arrayShuffle.cpp b/src/Functions/array/arrayShuffle.cpp index b2432e650c4..b9c16fc9a07 100644 --- a/src/Functions/array/arrayShuffle.cpp +++ b/src/Functions/array/arrayShuffle.cpp @@ -156,7 +156,10 @@ ColumnPtr FunctionArrayShuffleImpl::executeGeneric(const ColumnArray & a 
if constexpr (Traits::has_limit) { if (limit) - partial_shuffle(&permutation[current_offset], &permutation[next_offset], limit, rng); + { + const auto effective_limit = std::min(limit, next_offset - current_offset); + partial_shuffle(&permutation[current_offset], &permutation[next_offset], effective_limit, rng); + } else shuffle(&permutation[current_offset], &permutation[next_offset], rng); } diff --git a/tests/queries/0_stateless/02523_array_shuffle.reference b/tests/queries/0_stateless/02523_array_shuffle.reference index 2263f8dc92a..db5d1b06342 100644 --- a/tests/queries/0_stateless/02523_array_shuffle.reference +++ b/tests/queries/0_stateless/02523_array_shuffle.reference @@ -47,6 +47,7 @@ [10,9,4,2,5,6,7,1,3,8] [10,9,4,2,5,6,7,1,8,3] [10,9,4,2,5,6,7,1,8,3] +[10,9,4,2,5,6,7,1,8,3] [10.1,9,4,2,5,6,7,8,3,1] [9223372036854775808,9,4,2,5,6,7,8,3,1] [NULL,9,4,2,5,6,7,8,3,1] diff --git a/tests/queries/0_stateless/02523_array_shuffle.sql b/tests/queries/0_stateless/02523_array_shuffle.sql index dfeb75e01c5..cec0b3fbd29 100644 --- a/tests/queries/0_stateless/02523_array_shuffle.sql +++ b/tests/queries/0_stateless/02523_array_shuffle.sql @@ -47,6 +47,7 @@ SELECT arrayPartialShuffle([1,2,3,4,5,6,7,8,9,10], 4, 0xbad_cafe); SELECT arrayPartialShuffle([1,2,3,4,5,6,7,8,9,10], 8, 0xbad_cafe); SELECT arrayPartialShuffle([1,2,3,4,5,6,7,8,9,10], 9, 0xbad_cafe); SELECT arrayPartialShuffle([1,2,3,4,5,6,7,8,9,10], 10, 0xbad_cafe); +SELECT arrayPartialShuffle([1,2,3,4,5,6,7,8,9,10], 100, 0xbad_cafe); SELECT arrayPartialShuffle([1,2,3,4,5,6,7,8,9,10.1], 4, 0xbad_cafe); SELECT arrayPartialShuffle([1,2,3,4,5,6,7,8,9,9223372036854775808], 4, 0xbad_cafe); SELECT arrayPartialShuffle([1,2,3,4,5,6,7,8,9,NULL], 4, 0xbad_cafe); From 3b472eb2dd18ef707fce0dfafaa2790f57158e7b Mon Sep 17 00:00:00 2001 From: Joanna Hulboj Date: Sun, 22 Jan 2023 13:38:48 +0000 Subject: [PATCH 17/21] FIXUP: Darwin compilation issue --- src/Functions/array/arrayShuffle.cpp | 2 +- 1 file changed, 1 insertion(+), 1 
deletion(-) diff --git a/src/Functions/array/arrayShuffle.cpp b/src/Functions/array/arrayShuffle.cpp index b9c16fc9a07..6532fb89ae8 100644 --- a/src/Functions/array/arrayShuffle.cpp +++ b/src/Functions/array/arrayShuffle.cpp @@ -157,7 +157,7 @@ ColumnPtr FunctionArrayShuffleImpl::executeGeneric(const ColumnArray & a { if (limit) { - const auto effective_limit = std::min(limit, next_offset - current_offset); + const auto effective_limit = std::min(limit, next_offset - current_offset); partial_shuffle(&permutation[current_offset], &permutation[next_offset], effective_limit, rng); } else From 8791a44e01d96ac5bbbacb9097eb49064724e926 Mon Sep 17 00:00:00 2001 From: Joanna Hulboj Date: Fri, 27 Jan 2023 18:01:54 +0000 Subject: [PATCH 18/21] FIXUP: Added info and test for materialized array --- src/Functions/array/arrayShuffle.cpp | 18 ++++++++++----- .../0_stateless/02523_array_shuffle.reference | 22 +++++++++++++++++++ .../0_stateless/02523_array_shuffle.sql | 4 ++++ 3 files changed, 39 insertions(+), 5 deletions(-) diff --git a/src/Functions/array/arrayShuffle.cpp b/src/Functions/array/arrayShuffle.cpp index 6532fb89ae8..0c5696d1d37 100644 --- a/src/Functions/array/arrayShuffle.cpp +++ b/src/Functions/array/arrayShuffle.cpp @@ -178,6 +178,9 @@ REGISTER_FUNCTION(ArrayShuffle) Returns an array of the same size as the original array containing the elements in shuffled order. Elements are being reordered in such a way that each possible permutation of those elements has equal probability of appearance. 
+Note: this function will not materialize constants: +[example:materialize] + If no seed is provided a random one will be used: [example:random_seed] @@ -186,19 +189,23 @@ It is possible to override the seed to produce stable results: )", Documentation::Examples{ {"random_seed", "SELECT arrayShuffle([1, 2, 3, 4])"}, - {"explicit_seed", "SELECT arrayShuffle([1, 2, 3, 4], 41)"}}, + {"explicit_seed", "SELECT arrayShuffle([1, 2, 3, 4], 41)"}, + {"materialize", "SELECT arrayShuffle(materialize([1, 2, 3]), 42), arrayShuffle([1, 2, 3], 42) FROM numbers(10)"}}, Documentation::Categories{"Array"} }, FunctionFactory::CaseInsensitive); factory.registerFunction>( { R"( -Returns an array of the same size as the original array where elements in range [0..limit) are a random -subset of the original array. Remaining [limit..n) shall contain the elements not in [0..limit) range in undefined order. -Value of limit shall be in range [0..n]. Values outside of that range are equivalent to performing full arrayShuffle: +Returns an array of the same size as the original array where elements in range [1..limit] are a random +subset of the original array. Remaining (limit..n] shall contain the elements not in [1..limit] range in undefined order. +Value of limit shall be in range [1..n]. 
Values outside of that range are equivalent to performing full arrayShuffle: [example:no_limit1] [example:no_limit2] +Note: this function will not materialize constants: +[example:materialize] + If no seed is provided a random one will be used: [example:random_seed] @@ -209,7 +216,8 @@ It is possible to override the seed to produce stable results: {"no_limit1", "SELECT arrayPartialShuffle([1, 2, 3, 4], 0)"}, {"no_limit2", "SELECT arrayPartialShuffle([1, 2, 3, 4])"}, {"random_seed", "SELECT arrayPartialShuffle([1, 2, 3, 4], 2)"}, - {"explicit_seed", "SELECT arrayShuffle([1, 2, 3, 4], 2, 41)"}}, + {"explicit_seed", "SELECT arrayPartialShuffle([1, 2, 3, 4], 2, 41)"}, + {"materialize", "SELECT arrayPartialShuffle(materialize([1, 2, 3, 4]), 2, 42), arrayPartialShuffle([1, 2, 3], 2, 42) FROM numbers(10)"}}, Documentation::Categories{"Array"} }, FunctionFactory::CaseInsensitive); diff --git a/tests/queries/0_stateless/02523_array_shuffle.reference b/tests/queries/0_stateless/02523_array_shuffle.reference index db5d1b06342..0504da61f9d 100644 --- a/tests/queries/0_stateless/02523_array_shuffle.reference +++ b/tests/queries/0_stateless/02523_array_shuffle.reference @@ -3,6 +3,7 @@ [9223372036854775808] [9223372036854775808] [10,9,4,2,5,6,7,1,8,3] +[10,9,4,2,5,6,7,1,8,3] [10.1,9,4,2,5,6,7,1,8,3] [9223372036854775808,9,4,2,5,6,7,1,8,3] [NULL,9,4,2,5,6,7,1,8,3] @@ -13,6 +14,7 @@ [NULL] [NULL,NULL] [[10,20,30,40],[1,2,3,4],[100,200,300,400,500,600,700,800,900],[2,4,8,16,32,64],[-1,-2,-3,-4]] +[[10,20,30,40],[1,2,3,4],[100,200,300,400,500,600,700,800,900],[2,4,8,16,32,64],[-1,-2,-3,-4]] [[10,20,30,40],[1,2,3,4],[100,200,300,400,500,600,700,800,900],[2,4,8,16,32,64],[NULL,-2,-3,-4]] [10,72,11,18,73,76,46,71,44,35,9,0,97,53,13,32,51,30,3,68,5,48,67,90,20,27,38,19,54,21,83,84,1,22,56,81,91,77,36,63,33,39,24,40,4,99,14,23,94,29,26,96,2,28,31,57,42,88,12,47,58,8,37,82,92,34,6,60,25,43,50,74,70,52,55,62,17,79,65,93,86,7,16,41,59,75,80,45,69,89,85,87,95,64,61,98,49,78,66,15] 
[10,72,11,18,73,76,46,71,44,35,9,0,97,53,13,32,51,30,3,68,5,48,67,90,20,27,38,19,54,21,83,84,1,22,56,81,91,77,36,63,33,39,24,40,4,99,14,23,94,29,26,96,2,28,31,57,42,88,12,47,58,8,37,82,92,34,6,60,25,43,50,74,70,52,55,62,17,79,65,93,86,7,16,41,59,75,80,45,69,89,85,87,95,64,61,98,49,78,66,15] @@ -61,3 +63,23 @@ [10,72,11,18,73,76,46,71,44,35,9,0,97,53,13,32,51,30,3,68,20,21,22,23,24,25,26,27,28,29,17,31,15,33,34,2,36,37,38,39,40,41,42,43,8,45,6,47,48,49,50,16,52,14,54,55,56,57,58,59,60,61,62,63,64,65,66,67,19,69,70,7,1,4,74,75,5,77,78,79,80,81,82,83,84,85,86,87,88,89,90,91,92,93,94,95,96,12,98,99] [(3,-3),(1,-1),(99999999,-99999999)] [(3,'A'),(1,NULL),(2,'a')] +[3,2,1] +[3,2,1] +[3,2,1] +[3,2,1] +[3,2,1] +[3,2,1] +[3,2,1] +[3,2,1] +[3,2,1] +[3,2,1] +[3,2,1] +[3,1,2] +[1,3,2] +[2,1,3] +[3,2,1] +[3,2,1] +[1,2,3] +[3,2,1] +[3,2,1] +[2,1,3] diff --git a/tests/queries/0_stateless/02523_array_shuffle.sql b/tests/queries/0_stateless/02523_array_shuffle.sql index cec0b3fbd29..9138657c842 100644 --- a/tests/queries/0_stateless/02523_array_shuffle.sql +++ b/tests/queries/0_stateless/02523_array_shuffle.sql @@ -3,6 +3,7 @@ SELECT arrayShuffle([], 0xbad_cafe); SELECT arrayShuffle([9223372036854775808]); SELECT arrayShuffle([9223372036854775808], 0xbad_cafe); SELECT arrayShuffle([1,2,3,4,5,6,7,8,9,10], 0xbad_cafe); +SELECT arrayShuffle(materialize([1,2,3,4,5,6,7,8,9,10]), 0xbad_cafe); SELECT arrayShuffle([1,2,3,4,5,6,7,8,9,10.1], 0xbad_cafe); SELECT arrayShuffle([1,2,3,4,5,6,7,8,9,9223372036854775808], 0xbad_cafe); SELECT arrayShuffle([1,2,3,4,5,6,7,8,9,NULL], 0xbad_cafe); @@ -13,6 +14,7 @@ SELECT arrayShuffle(['storage','tiger',NULL,'terminal','uniform','sensation'], 0 SELECT arrayShuffle([NULL]); SELECT arrayShuffle([NULL,NULL]); SELECT arrayShuffle([[1,2,3,4],[-1,-2,-3,-4],[10,20,30,40],[100,200,300,400,500,600,700,800,900],[2,4,8,16,32,64]], 0xbad_cafe); +SELECT 
arrayShuffle(materialize([[1,2,3,4],[-1,-2,-3,-4],[10,20,30,40],[100,200,300,400,500,600,700,800,900],[2,4,8,16,32,64]]), 0xbad_cafe); SELECT arrayShuffle([[1,2,3,4],[NULL,-2,-3,-4],[10,20,30,40],[100,200,300,400,500,600,700,800,900],[2,4,8,16,32,64]], 0xbad_cafe); SELECT arrayShuffle(groupArray(x),0xbad_cafe) FROM (SELECT number as x from system.numbers LIMIT 100); SELECT arrayShuffle(groupArray(toUInt64(x)),0xbad_cafe) FROM (SELECT number as x from system.numbers LIMIT 100); @@ -61,6 +63,8 @@ SELECT arrayPartialShuffle(groupArray(x),20,0xbad_cafe) FROM (SELECT number as x SELECT arrayPartialShuffle(groupArray(toUInt64(x)),20,0xbad_cafe) FROM (SELECT number as x from system.numbers LIMIT 100); SELECT arrayPartialShuffle([tuple(1, -1), tuple(99999999, -99999999), tuple(3, -3)], 2, 0xbad_cafe); SELECT arrayPartialShuffle([tuple(1, NULL), tuple(2, 'a'), tuple(3, 'A')], 2, 0xbad_cafe); +SELECT arrayShuffle([1, 2, 3], 42) FROM numbers(10); -- for constant array we do not materialize it and each row gets the same permutation +SELECT arrayShuffle(materialize([1, 2, 3]), 42) FROM numbers(10); SELECT arrayShuffle(1); -- { serverError ILLEGAL_TYPE_OF_ARGUMENT } SELECT arrayShuffle([1], 'a'); -- { serverError ILLEGAL_TYPE_OF_ARGUMENT } SELECT arrayShuffle([1], 1.1); -- { serverError ILLEGAL_TYPE_OF_ARGUMENT } From 9a559b5475f4c7a89b422e0f3c420291b2d048a0 Mon Sep 17 00:00:00 2001 From: Joanna Hulboj Date: Fri, 27 Jan 2023 18:03:13 +0000 Subject: [PATCH 19/21] FIXUP: More comments about shuffle --- src/Common/shuffle.h | 8 ++++++++ src/Functions/array/arrayShuffle.cpp | 4 ++-- 2 files changed, 10 insertions(+), 2 deletions(-) diff --git a/src/Common/shuffle.h b/src/Common/shuffle.h index f2477db0352..c21a3e4ea33 100644 --- a/src/Common/shuffle.h +++ b/src/Common/shuffle.h @@ -6,6 +6,10 @@ /* Reorders the elements in the given range [first, last) such that each * possible permutation of those elements has equal probability of appearance.
+ * + * for i ∈ [0, n-2]: + * j ← random index ∈ [i, n) + * swap arr[i] ↔ arr[j] */ template void shuffle(Iter first, Iter last, Rng && rng) @@ -28,6 +32,10 @@ void shuffle(Iter first, Iter last, Rng && rng) * [first, first + limit) is a random subset of the original range. * [first + limit, last) shall contain the elements not in [first, first + limit) * in undefined order. + * + * for i ∈ [0, limit): + * j ← random index ∈ [i, n) + * swap arr[i] ↔ arr[j] */ template void partial_shuffle(Iter first, Iter last, size_t limit, Rng && rng) diff --git a/src/Functions/array/arrayShuffle.cpp b/src/Functions/array/arrayShuffle.cpp index 0c5696d1d37..d78024236bf 100644 --- a/src/Functions/array/arrayShuffle.cpp +++ b/src/Functions/array/arrayShuffle.cpp @@ -36,7 +36,7 @@ struct FunctionArrayShuffleTraits static constexpr auto has_limit = false; // Permute the whole array static ColumnNumbers getArgumentsThatAreAlwaysConstant() { return {1}; } static constexpr auto max_num_params = 2; // array[, seed] - static constexpr auto seed_param_idx = 1; + static constexpr auto seed_param_idx = 1; // --------^^^^ }; /** Partial shuffle array elements @@ -50,7 +50,7 @@ struct FunctionArrayPartialShuffleTraits static constexpr auto has_limit = true; static ColumnNumbers getArgumentsThatAreAlwaysConstant() { return {1, 2}; } static constexpr auto max_num_params = 3; // array[, limit[, seed]] - static constexpr auto seed_param_idx = 2; + static constexpr auto seed_param_idx = 2; // ----------------^^^^ }; template From 67377dc81d2e6c04879b2cd54521b2ca64050002 Mon Sep 17 00:00:00 2001 From: Joanna Hulboj Date: Fri, 27 Jan 2023 18:04:31 +0000 Subject: [PATCH 20/21] FIXUP: Add arrayShuffle and arrayShufflePartial to fuzzer corpus --- tests/fuzz/all.dict | 2 ++ tests/fuzz/dictionaries/functions.dict | 2 ++ 2 files changed, 4 insertions(+) diff --git a/tests/fuzz/all.dict b/tests/fuzz/all.dict index 7977cb9ed21..17ef7d2ab1e 100644 --- a/tests/fuzz/all.dict +++ b/tests/fuzz/all.dict @@ -72,6
+72,7 @@ "arrayMap" "arrayMax" "arrayMin" +"arrayPartialShuffle" "arrayPopBack" "arrayPopFront" "arrayProduct" @@ -84,6 +85,7 @@ "arrayReverseFill" "arrayReverseSort" "arrayReverseSplit" +"arrayShuffle" "arraySlice" "arraySort" "arraySplit" diff --git a/tests/fuzz/dictionaries/functions.dict b/tests/fuzz/dictionaries/functions.dict index a07841f733e..e77a2a779fd 100644 --- a/tests/fuzz/dictionaries/functions.dict +++ b/tests/fuzz/dictionaries/functions.dict @@ -872,6 +872,8 @@ "nullIn" "MONTH" "arrayReverse" +"arrayShuffle" +"arrayPartialShuffle" "now64" "DATE" "addressToLine" From df030a56f0b3ad4a8c08371ea04e30cd2e39fdd9 Mon Sep 17 00:00:00 2001 From: Joanna Hulboj Date: Fri, 27 Jan 2023 21:27:09 +0000 Subject: [PATCH 21/21] FIXUP: Remove use of legacy exceptions api --- src/Functions/array/arrayShuffle.cpp | 21 +++++++++++---------- 1 file changed, 11 insertions(+), 10 deletions(-) diff --git a/src/Functions/array/arrayShuffle.cpp b/src/Functions/array/arrayShuffle.cpp index d78024236bf..9cf3ac8f3fe 100644 --- a/src/Functions/array/arrayShuffle.cpp +++ b/src/Functions/array/arrayShuffle.cpp @@ -89,8 +89,10 @@ public: WhichDataType which(arguments[param_idx]); if (!which.isUInt() && !which.isInt()) throw Exception( - "Illegal type " + arguments[param_idx]->getName() + " of argument of function " + getName() + " (must be UInt or Int)", - ErrorCodes::ILLEGAL_TYPE_OF_ARGUMENT); + ErrorCodes::ILLEGAL_TYPE_OF_ARGUMENT, + "Illegal type {} of arguments of function {} (must be UInt or Int)", + arguments[param_idx]->getName(), + getName()); }; for (size_t idx = 1; idx < arguments.size(); ++idx) @@ -111,7 +113,7 @@ ColumnPtr FunctionArrayShuffleImpl::executeImpl(const ColumnsWithTypeAnd const ColumnArray * array = checkAndGetColumn(arguments[0].column.get()); if (!array) throw Exception( - "Illegal column " + arguments[0].column->getName() + " of first argument of function " + getName(), ErrorCodes::ILLEGAL_COLUMN); + ErrorCodes::ILLEGAL_COLUMN, "Illegal column {} of first 
argument of function {}", arguments[0].column->getName(), getName()); const auto seed = [&]() -> uint64_t { @@ -187,12 +189,11 @@ If no seed is provided a random one will be used: It is possible to override the seed to produce stable results: [example:explicit_seed] )", - Documentation::Examples{ + Documentation::Examples{ {"random_seed", "SELECT arrayShuffle([1, 2, 3, 4])"}, {"explicit_seed", "SELECT arrayShuffle([1, 2, 3, 4], 41)"}, {"materialize", "SELECT arrayShuffle(materialize([1, 2, 3]), 42), arrayShuffle([1, 2, 3], 42) FROM numbers(10)"}}, - Documentation::Categories{"Array"} - }, + Documentation::Categories{"Array"}}, FunctionFactory::CaseInsensitive); factory.registerFunction>( { @@ -212,14 +213,14 @@ If no seed is provided a random one will be used: It is possible to override the seed to produce stable results: [example:explicit_seed] )", - Documentation::Examples{ + Documentation::Examples{ {"no_limit1", "SELECT arrayPartialShuffle([1, 2, 3, 4], 0)"}, {"no_limit2", "SELECT arrayPartialShuffle([1, 2, 3, 4])"}, {"random_seed", "SELECT arrayPartialShuffle([1, 2, 3, 4], 2)"}, {"explicit_seed", "SELECT arrayPartialShuffle([1, 2, 3, 4], 2, 41)"}, - {"materialize", "SELECT arrayPartialShuffle(materialize([1, 2, 3, 4]), 2, 42), arrayPartialShuffle([1, 2, 3], 2, 42) FROM numbers(10)"}}, - Documentation::Categories{"Array"} - }, + {"materialize", + "SELECT arrayPartialShuffle(materialize([1, 2, 3, 4]), 2, 42), arrayPartialShuffle([1, 2, 3], 2, 42) FROM numbers(10)"}}, + Documentation::Categories{"Array"}}, FunctionFactory::CaseInsensitive); }