diff --git a/docs/en/sql-reference/functions/array-functions.md b/docs/en/sql-reference/functions/array-functions.md index 40bfb65e4e8..02e5d1e5ae2 100644 --- a/docs/en/sql-reference/functions/array-functions.md +++ b/docs/en/sql-reference/functions/array-functions.md @@ -2172,80 +2172,6 @@ Result: └─────────────────────┘ ``` - -## arrayRandomSample - -Function `arrayRandomSample` returns a subset with `samples`-many random elements of an input array. If `samples` exceeds the size of the input array, the sample size is limited to the size of the array. In this case, all elements of the input array are returned, but the order is not guaranteed. The function can handle both flat arrays and nested arrays. - -**Syntax** - -```sql -arrayRandomSample(arr, samples) -``` - -**Arguments** - -- `arr` — The input array from which to sample elements. This may be flat or nested arrays. -- `samples` — An unsigned integer specifying the number of elements to include in the random sample. - -**Returned Value** - -- An array containing a random sample of elements from the input array. - -**Examples** - -Query: - -```sql -SELECT arrayRandomSample(['apple', 'banana', 'cherry', 'date'], 2) as res; -``` - -Result: -``` -┌─res────────────────┐ -│ ['banana','apple'] │ -└────────────────────┘ -``` - -Query: - -```sql -SELECT arrayRandomSample([[1, 2], [3, 4], [5, 6]], 2) as res; -``` - -Result: -``` -┌─res───────────┐ -│ [[3,4],[5,6]] │ -└───────────────┘ -``` - -Query: - -```sql -SELECT arrayRandomSample([1, 2, 3, 4, 5], 0) as res; -``` - -Result: -``` -┌─res─┐ -│ [] │ -└─────┘ -``` - -Query: - -```sql -SELECT arrayRandomSample([1, 2, 3], 5) as res; -``` - -Result: -``` -┌─res─────┐ -│ [3,1,2] │ -└─────────┘ -``` - ## Distance functions All supported functions are described in [distance functions documentation](../../sql-reference/functions/distance-functions.md). diff --git a/src/Functions/array/arrayRandomSample.cpp b/src/Functions/array/arrayRandomSample.cpp deleted file mode 100644 index 908ca9fa30a..00000000000 --- a/src/Functions/array/arrayRandomSample.cpp +++ /dev/null @@ -1,118 +0,0 @@ -#include -#include -#include -#include -#include -#include -#include -#include "Columns/ColumnsNumber.h" - -namespace DB -{ - -namespace ErrorCodes -{ - extern const int ILLEGAL_COLUMN; - extern const int ILLEGAL_TYPE_OF_ARGUMENT; -} - -/// arrayRandomSample(arr, k) - Returns k random elements from the input array -class FunctionArrayRandomSample : public IFunction -{ -public: - static constexpr auto name = "arrayRandomSample"; - - static FunctionPtr create(ContextPtr) { return std::make_shared(); } - - String getName() const override { return name; } - - size_t getNumberOfArguments() const override { return 2; } - ColumnNumbers getArgumentsThatAreAlwaysConstant() const override { return {1}; } - bool useDefaultImplementationForConstants() const override { return true; } - bool isSuitableForShortCircuitArgumentsExecution(const DataTypesWithConstInfo & /*arguments*/) const override { return true; } - - DataTypePtr getReturnTypeImpl(const ColumnsWithTypeAndName & arguments) const override - { - FunctionArgumentDescriptors args{ - {"array", &isArray, nullptr, "Array"}, - {"samples", &isUnsignedInteger, isColumnConst, "const UInt*"}, - }; - validateFunctionArgumentTypes(*this, arguments, args); - - // Return an array with the same nested type as the input array - const DataTypePtr & array_type = arguments[0].type; - const DataTypeArray * array_data_type = checkAndGetDataType(array_type.get()); - - // Get the nested data type of the array - const DataTypePtr & nested_type = array_data_type->getNestedType(); - - return std::make_shared(nested_type); - } - - ColumnPtr executeImpl(const ColumnsWithTypeAndName & arguments, const DataTypePtr &, size_t input_rows_count) const override - { - const ColumnArray * column_array = checkAndGetColumn(arguments[0].column.get()); - if (!column_array) - throw Exception(ErrorCodes::ILLEGAL_COLUMN, "First argument must be an array"); - - const IColumn * col_samples = arguments[1].column.get(); - if (!col_samples) - throw Exception(ErrorCodes::ILLEGAL_COLUMN, "The second argument is empty or null, type = {}", arguments[1].type->getName()); - - UInt64 samples; - try - { - samples = col_samples->getUInt(0); - } - catch (...) - { - throw Exception(ErrorCodes::ILLEGAL_TYPE_OF_ARGUMENT, - "Failed to fetch UInt64 from the second argument column, type = {}", - arguments[1].type->getName()); - } - - std::random_device rd; - std::mt19937 gen(rd()); - - auto nested_column = column_array->getDataPtr()->cloneEmpty(); - auto offsets_column = ColumnUInt64::create(); - - auto res_data = ColumnArray::create(std::move(nested_column), std::move(offsets_column)); - - const auto & input_offsets = column_array->getOffsets(); - auto & res_offsets = res_data->getOffsets(); - res_offsets.resize(input_rows_count); - - UInt64 cur_samples; - size_t current_offset = 0; - - for (size_t row = 0; row < input_rows_count; row++) - { - size_t row_size = input_offsets[row] - current_offset; - - std::vector indices(row_size); - std::iota(indices.begin(), indices.end(), 0); - std::shuffle(indices.begin(), indices.end(), gen); - - cur_samples = std::min(samples, static_cast(row_size)); - - for (UInt64 j = 0; j < cur_samples; j++) - { - size_t source_index = indices[j]; - res_data->getData().insertFrom(column_array->getData(), source_index); - } - - res_offsets[row] = current_offset + cur_samples; - current_offset += cur_samples; - } - - return res_data; - } -}; - -REGISTER_FUNCTION(ArrayRandomSample) -{ - factory.registerFunction(); -} - -} diff --git a/tests/queries/0_stateless/02415_all_new_functions_must_be_documented.reference b/tests/queries/0_stateless/02415_all_new_functions_must_be_documented.reference index 379eea4dbbb..589ea366030 100644 --- a/tests/queries/0_stateless/02415_all_new_functions_must_be_documented.reference +++ b/tests/queries/0_stateless/02415_all_new_functions_must_be_documented.reference @@ -126,7 +126,6 @@ arrayPopFront arrayProduct arrayPushBack arrayPushFront -arrayRandomSample arrayReduce arrayReduceInRanges arrayResize diff --git a/tests/queries/0_stateless/02874_array_random_sample.reference b/tests/queries/0_stateless/02874_array_random_sample.reference deleted file mode 100644 index 7dab23a37b4..00000000000 --- a/tests/queries/0_stateless/02874_array_random_sample.reference +++ /dev/null @@ -1,37 +0,0 @@ -Running iteration: 1 -Integer Test: Passed -String Test: Passed -Nested Array Test: Passed -Higher Sample Number Test: Passed -Multi-row Test with scalar k: Passed -Running iteration: 2 -Integer Test: Passed -String Test: Passed -Nested Array Test: Passed -Higher Sample Number Test: Passed -Multi-row Test with scalar k: Passed -Running iteration: 3 -Integer Test: Passed -String Test: Passed -Nested Array Test: Passed -Higher Sample Number Test: Passed -Multi-row Test with scalar k: Passed -Running iteration: 4 -Integer Test: Passed -String Test: Passed -Nested Array Test: Passed -Higher Sample Number Test: Passed -Multi-row Test with scalar k: Passed -Running iteration: 5 -Integer Test: Passed -String Test: Passed -Nested Array Test: Passed -Higher Sample Number Test: Passed -Multi-row Test with scalar k: Passed -Integer Test with K=0: Passed -Empty Array with K > 0 Test: Passed -Non-Unsigned-Integer K Test (Negative Integer): Passed -Non-Unsigned-Integer K Test (String): Passed -Non-Unsigned-Integer K Test (Floating-Point): Passed -Total tests: 30 -Passed tests: 30 diff --git a/tests/queries/0_stateless/02874_array_random_sample.sh b/tests/queries/0_stateless/02874_array_random_sample.sh deleted file mode 100755 index fe136d6d5d2..00000000000 --- a/tests/queries/0_stateless/02874_array_random_sample.sh +++ /dev/null @@ -1,258 +0,0 @@ -#!/usr/bin/env bash - -CUR_DIR=$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd) -# shellcheck source=../shell_config.sh -. "$CUR_DIR"/../shell_config.sh - -# Initialize variables -total_tests=0 -passed_tests=0 - - -# Test Function for Integer Arrays -run_integer_test() { - query_result=$(clickhouse-client -q "SELECT arrayRandomSample([1,2,3], 2)") - mapfile -t sorted_result < <(echo "$query_result" | tr -d '[]' | tr ',' '\n' | sort -n) - declare -A expected_outcomes - expected_outcomes["1 2"]=1 - expected_outcomes["1 3"]=1 - expected_outcomes["2 3"]=1 - expected_outcomes["2 1"]=1 - expected_outcomes["3 1"]=1 - expected_outcomes["3 2"]=1 - - sorted_result_str=$(echo "${sorted_result[*]}" | tr ' ' '\n' | sort -n | tr '\n' ' ' | sed 's/ $//') - if [[ -n "${expected_outcomes[$sorted_result_str]}" ]]; then - echo "Integer Test: Passed" - ((passed_tests++)) - else - echo "Integer Test: Failed" - echo "Output: $query_result" - fi - ((total_tests++)) -} - -# Test Function for String Arrays -run_string_test() { - query_result=$(clickhouse-client -q "SELECT arrayRandomSample(['a','b','c'], 2)") - mapfile -t sorted_result < <(echo "$query_result" | tr -d "[]'" | tr ',' '\n' | sort) - declare -A expected_outcomes - expected_outcomes["a b"]=1 - expected_outcomes["a c"]=1 - expected_outcomes["b c"]=1 - expected_outcomes["b a"]=1 - expected_outcomes["c a"]=1 - expected_outcomes["c b"]=1 - - sorted_result_str=$(echo "${sorted_result[*]}" | tr ' ' '\n' | sort | tr '\n' ' ' | sed 's/ $//') - if [[ -n "${expected_outcomes[$sorted_result_str]}" ]]; then - echo "String Test: Passed" - ((passed_tests++)) - else - echo "String Test: Failed" - echo "Output: $query_result" - fi - ((total_tests++)) -} - -# Test Function for Nested Arrays -run_nested_array_test() { - query_result=$(clickhouse-client -q "SELECT arrayRandomSample([[7,2],[3,4],[7,6]], 2)") - # Convert to a space-separated string for easy sorting. - converted_result=$(echo "$query_result" | tr -d '[]' | tr ',' ' ') - - # Sort the string. - sorted_result_str=$(echo "$converted_result" | tr ' ' '\n' | xargs -n2 | sort | tr '\n' ' ' | sed 's/ $//') - - # Define all possible expected outcomes, sorted - declare -A expected_outcomes - expected_outcomes["7 2 3 4"]=1 - expected_outcomes["7 2 7 6"]=1 - expected_outcomes["3 4 7 6"]=1 - expected_outcomes["3 4 7 2"]=1 - expected_outcomes["7 6 7 2"]=1 - expected_outcomes["7 6 3 4"]=1 - - if [[ -n "${expected_outcomes[$sorted_result_str]}" ]]; then - echo "Nested Array Test: Passed" - ((passed_tests++)) - else - echo "Nested Array Test: Failed" - echo "Output: $query_result" - echo "Processed Output: ${sorted_result_str}" - fi - ((total_tests++)) -} - - -# Test Function for K > array.size -run_higher_k_test() { - query_result=$(clickhouse-client -q "SELECT arrayRandomSample([1,2,3], 5)") - mapfile -t sorted_result < <(echo "$query_result" | tr -d '[]' | tr ',' '\n' | sort -n) - sorted_original=("1" "2" "3") - - are_arrays_equal=true - for i in "${!sorted_result[@]}"; do - if [[ "${sorted_result[$i]}" != "${sorted_original[$i]}" ]]; then - are_arrays_equal=false - break - fi - done - - if $are_arrays_equal; then - echo "Higher Sample Number Test: Passed" - ((passed_tests++)) - else - echo "Higher Sample Number Test: Failed" - echo "Output: $query_result" - fi - ((total_tests++)) -} - -# Test Function for Integer Arrays with samples = 0 -run_integer_with_samples_0_test() { - query_result=$(clickhouse-client -q "SELECT arrayRandomSample([1,2,3], 0)") - mapfile -t sorted_result < <(echo "$query_result" | tr -d '[]' | tr ',' '\n' | sort -n) - - # An empty array should produce an empty string after transformations - declare -A expected_outcomes - expected_outcomes["EMPTY_ARRAY"]=1 - - # Prepare the result string for comparison - sorted_result_str=$(echo "${sorted_result[*]}" | tr ' ' '\n' | sort -n | tr '\n' ' ' | sed 's/ $//') - - # Use "EMPTY_ARRAY" as a placeholder for an empty array - [[ -z "$sorted_result_str" ]] && sorted_result_str="EMPTY_ARRAY" - - # Compare - if [[ -n "${expected_outcomes[$sorted_result_str]}" ]]; then - echo "Integer Test with K=0: Passed" - ((passed_tests++)) - else - echo "Integer Test with K=0: Failed" - echo "Output: $query_result" - fi - ((total_tests++)) -} - -# Test Function for Empty Array with K > 0 -run_empty_array_with_k_test() { - query_result=$(clickhouse-client -q "SELECT arrayRandomSample([], 5)") - - if [[ "$query_result" == "[]" ]]; then - echo "Empty Array with K > 0 Test: Passed" - ((passed_tests++)) - else { - echo "Empty Array with K > 0 Test: Failed" - echo "Output: $query_result" - } - fi - ((total_tests++)) -} - -# Test Function for Non-Unsigned-Integer K -run_non_unsigned_integer_k_test() { - # Test with negative integer - query_result=$(clickhouse-client -q "SELECT arrayRandomSample([1, 2, 3], -5)" 2>&1) - if [[ "$query_result" == *"ILLEGAL_TYPE_OF_ARGUMENT"* ]]; then - echo "Non-Unsigned-Integer K Test (Negative Integer): Passed" - ((passed_tests++)) - else { - echo "Non-Unsigned-Integer K Test (Negative Integer): Failed" - echo "Output: $query_result" - } - fi - ((total_tests++)) - - # Test with string - query_result=$(clickhouse-client -q "SELECT arrayRandomSample([1, 2, 3], 'a')" 2>&1) - if [[ "$query_result" == *"ILLEGAL_TYPE_OF_ARGUMENT"* ]]; then - echo "Non-Unsigned-Integer K Test (String): Passed" - ((passed_tests++)) - else { - echo "Non-Unsigned-Integer K Test (String): Failed" - echo "Output: $query_result" - } - fi - ((total_tests++)) - - # Test with floating-point number - query_result=$(clickhouse-client -q "SELECT arrayRandomSample([1, 2, 3], 1.5)" 2>&1) - if [[ "$query_result" == *"ILLEGAL_TYPE_OF_ARGUMENT"* ]]; then - echo "Non-Unsigned-Integer K Test (Floating-Point): Passed" - ((passed_tests++)) - else { - echo "Non-Unsigned-Integer K Test (Floating-Point): Failed" - echo "Output: $query_result" - } - fi - ((total_tests++)) -} - -# Function to run a multi-row test with scalar 'k' -run_multi_row_scalar_k_test() { - # Create a table. Use a random database name as tests potentially run in parallel. - db=`tr -dc A-Za-z0-9