Merge pull request #69030 from bigo-sg/improve_array_zip

Add new function arrayZipUnaligned
This commit is contained in:
Anton Popov 2024-09-05 08:05:56 +00:00 committed by GitHub
commit 10d2631ec9
No known key found for this signature in database
GPG Key ID: B5690EEEBB952194
6 changed files with 189 additions and 39 deletions

View File

@ -2035,6 +2035,7 @@ Query:
SELECT arrayZip(['a', 'b', 'c'], [5, 2, 1]);
```
Result:
``` text
@ -2043,6 +2044,43 @@ Result:
└──────────────────────────────────────┘
```
## arrayZipUnaligned
Combines multiple arrays into a single array, allowing for unaligned arrays. The resulting array contains the corresponding elements of the source arrays grouped into tuples in the listed order of arguments.
**Syntax**
``` sql
arrayZipUnaligned(arr1, arr2, ..., arrN)
```
**Arguments**
- `arrN` — [Array](../data-types/array.md).
The function can take any number of arrays of different types.
**Returned value**
- Array with elements from the source arrays grouped into [tuples](../data-types/tuple.md). Data types in the tuple are the same as types of the input arrays and in the same order as arrays are passed. [Array](../data-types/array.md). If the arrays have different sizes, the shorter arrays will be padded with `null` values.
**Example**
Query:
``` sql
SELECT arrayZipUnaligned(['a'], [1, 2, 3]);
```
Result:
``` text
┌─arrayZipUnaligned(['a'], [1, 2, 3])─┐
│ [('a',1),(NULL,2),(NULL,3)] │
└─────────────────────────────────────┘
```
## arrayAUC
Calculate AUC (Area Under the Curve, which is a concept in machine learning, see more details: <https://en.wikipedia.org/wiki/Receiver_operating_characteristic#Area_under_the_curve>).

View File

@ -1,7 +1,8 @@
#include <Columns/ColumnTuple.h>
#include <Columns/ColumnArray.h>
#include <DataTypes/DataTypeTuple.h>
#include <Columns/ColumnNullable.h>
#include <Columns/ColumnTuple.h>
#include <DataTypes/DataTypeArray.h>
#include <DataTypes/DataTypeTuple.h>
#include <Functions/FunctionFactory.h>
#include <Functions/FunctionHelpers.h>
#include <IO/WriteHelpers.h>
@ -12,23 +13,22 @@ namespace DB
namespace ErrorCodes
{
extern const int ILLEGAL_TYPE_OF_ARGUMENT;
extern const int SIZES_OF_ARRAYS_DONT_MATCH;
extern const int TOO_FEW_ARGUMENTS_FOR_FUNCTION;
extern const int ILLEGAL_COLUMN;
extern const int ILLEGAL_TYPE_OF_ARGUMENT;
extern const int SIZES_OF_ARRAYS_DONT_MATCH;
extern const int TOO_FEW_ARGUMENTS_FOR_FUNCTION;
extern const int ILLEGAL_COLUMN;
}
/// arrayZip(['a', 'b', 'c'], ['d', 'e', 'f']) = [('a', 'd'), ('b', 'e'), ('c', 'f')]
/// arrayZipUnaligned(['a', 'b', 'c'], ['d', 'e']) = [('a', 'd'), ('b', 'e'), ('c', null)]
template <bool allow_unaligned>
class FunctionArrayZip : public IFunction
{
public:
static constexpr auto name = "arrayZip";
static constexpr auto name = allow_unaligned ? "arrayZipUnaligned" : "arrayZip";
static FunctionPtr create(ContextPtr) { return std::make_shared<FunctionArrayZip>(); }
String getName() const override
{
return name;
}
String getName() const override { return name; }
bool isVariadic() const override { return true; }
size_t getNumberOfArguments() const override { return 0; }
@ -39,8 +39,11 @@ public:
DataTypePtr getReturnTypeImpl(const ColumnsWithTypeAndName & arguments) const override
{
if (arguments.empty())
throw Exception(ErrorCodes::TOO_FEW_ARGUMENTS_FOR_FUNCTION,
"Function {} needs at least one argument; passed {}." , getName(), arguments.size());
throw Exception(
ErrorCodes::TOO_FEW_ARGUMENTS_FOR_FUNCTION,
"Function {} needs at least one argument; passed {}.",
getName(),
arguments.size());
DataTypes arguments_types;
for (size_t index = 0; index < arguments.size(); ++index)
@ -48,56 +51,142 @@ public:
const DataTypeArray * array_type = checkAndGetDataType<DataTypeArray>(arguments[index].type.get());
if (!array_type)
throw Exception(ErrorCodes::ILLEGAL_TYPE_OF_ARGUMENT, "Argument {} of function {} must be array. Found {} instead.",
toString(index + 1), getName(), arguments[0].type->getName());
throw Exception(
ErrorCodes::ILLEGAL_TYPE_OF_ARGUMENT,
"Argument {} of function {} must be array. Found {} instead.",
toString(index + 1),
getName(),
arguments[0].type->getName());
arguments_types.emplace_back(array_type->getNestedType());
auto nested_type = array_type->getNestedType();
if constexpr (allow_unaligned)
nested_type = makeNullable(nested_type);
arguments_types.emplace_back(nested_type);
}
return std::make_shared<DataTypeArray>(std::make_shared<DataTypeTuple>(arguments_types));
}
ColumnPtr executeImpl(const ColumnsWithTypeAndName & arguments, const DataTypePtr &, size_t /*input_rows_count*/) const override
ColumnPtr
executeImpl(const ColumnsWithTypeAndName & arguments, const DataTypePtr & /*result_type*/, size_t input_rows_count) const override
{
size_t num_arguments = arguments.size();
ColumnPtr first_array_column;
Columns holders(num_arguments);
Columns tuple_columns(num_arguments);
bool has_unaligned = false;
size_t unaligned_index = 0;
for (size_t i = 0; i < num_arguments; ++i)
{
/// Constant columns cannot be inside tuple. It's only possible to have constant tuple as a whole.
ColumnPtr holder = arguments[i].column->convertToFullColumnIfConst();
holders[i] = holder;
const ColumnArray * column_array = checkAndGetColumn<ColumnArray>(holder.get());
if (!column_array)
throw Exception(ErrorCodes::ILLEGAL_COLUMN, "Argument {} of function {} must be array. Found column {} instead.",
i + 1, getName(), holder->getName());
if (i == 0)
{
first_array_column = holder;
}
else if (!column_array->hasEqualOffsets(static_cast<const ColumnArray &>(*first_array_column)))
{
throw Exception(ErrorCodes::SIZES_OF_ARRAYS_DONT_MATCH,
"The argument 1 and argument {} of function {} have different array sizes",
i + 1, getName());
}
throw Exception(
ErrorCodes::ILLEGAL_COLUMN,
"Argument {} of function {} must be array. Found column {} instead.",
i + 1,
getName(),
holder->getName());
tuple_columns[i] = column_array->getDataPtr();
if (i && !column_array->hasEqualOffsets(static_cast<const ColumnArray &>(*holders[0])))
{
has_unaligned = true;
unaligned_index = i;
}
}
return ColumnArray::create(
ColumnTuple::create(tuple_columns), static_cast<const ColumnArray &>(*first_array_column).getOffsetsPtr());
if constexpr (!allow_unaligned)
{
if (has_unaligned)
throw Exception(
ErrorCodes::SIZES_OF_ARRAYS_DONT_MATCH,
"The argument 1 and argument {} of function {} have different array sizes",
unaligned_index + 1,
getName());
else
return ColumnArray::create(
ColumnTuple::create(std::move(tuple_columns)), static_cast<const ColumnArray &>(*holders[0]).getOffsetsPtr());
}
else
return executeUnaligned(holders, tuple_columns, input_rows_count, has_unaligned);
}
private:
ColumnPtr executeUnaligned(const Columns & holders, Columns & tuple_columns, size_t input_rows_count, bool has_unaligned) const
{
std::vector<const ColumnArray *> array_columns(holders.size());
for (size_t i = 0; i < holders.size(); ++i)
array_columns[i] = checkAndGetColumn<ColumnArray>(holders[i].get());
for (auto & tuple_column : tuple_columns)
tuple_column = makeNullable(tuple_column);
if (!has_unaligned)
return ColumnArray::create(ColumnTuple::create(std::move(tuple_columns)), array_columns[0]->getOffsetsPtr());
MutableColumns res_tuple_columns(tuple_columns.size());
for (size_t i = 0; i < tuple_columns.size(); ++i)
{
res_tuple_columns[i] = tuple_columns[i]->cloneEmpty();
res_tuple_columns[i]->reserve(tuple_columns[i]->size());
}
auto res_offsets_column = ColumnArray::ColumnOffsets::create(input_rows_count);
auto & res_offsets = assert_cast<ColumnArray::ColumnOffsets &>(*res_offsets_column).getData();
size_t curr_offset = 0;
for (size_t row_i = 0; row_i < input_rows_count; ++row_i)
{
size_t max_size = 0;
for (size_t arg_i = 0; arg_i < holders.size(); ++arg_i)
{
const auto * array_column = array_columns[arg_i];
const auto & offsets = array_column->getOffsets();
size_t array_offset = offsets[row_i - 1];
size_t array_size = offsets[row_i] - array_offset;
res_tuple_columns[arg_i]->insertRangeFrom(*tuple_columns[arg_i], array_offset, array_size);
max_size = std::max(max_size, array_size);
}
for (size_t arg_i = 0; arg_i < holders.size(); ++arg_i)
{
const auto * array_column = array_columns[arg_i];
const auto & offsets = array_column->getOffsets();
size_t array_offset = offsets[row_i - 1];
size_t array_size = offsets[row_i] - array_offset;
res_tuple_columns[arg_i]->insertManyDefaults(max_size - array_size);
}
curr_offset += max_size;
res_offsets[row_i] = curr_offset;
}
return ColumnArray::create(ColumnTuple::create(std::move(res_tuple_columns)), std::move(res_offsets_column));
}
};
REGISTER_FUNCTION(ArrayZip)
{
factory.registerFunction<FunctionArrayZip>();
factory.registerFunction<FunctionArrayZip<false>>(
{.description = R"(
Combines multiple arrays into a single array. The resulting array contains the corresponding elements of the source arrays grouped into tuples in the listed order of arguments.
)",
.categories{"String"}});
factory.registerFunction<FunctionArrayZip<true>>(
{.description = R"(
Combines multiple arrays into a single array, allowing for unaligned arrays. The resulting array contains the corresponding elements of the source arrays grouped into tuples in the listed order of arguments.
If the arrays have different sizes, the shorter arrays will be padded with `null` values.
)",
.categories{"String"}}
);
}
}

View File

@ -143,7 +143,6 @@ arrayStringConcat
arraySum
arrayUniq
arrayWithConstant
arrayZip
asinh
assumeNotNull
atan

View File

@ -0,0 +1,8 @@
[('a','d'),('b','e'),('c','f')] Array(Tuple(Nullable(String), Nullable(String)))
[('a','d','g'),('b','e','h'),('c','f','i')]
[('a','d'),('b','e'),('c','f'),(NULL,'g')]
[('a',1),(NULL,2),(NULL,3)]
[('a',1,1.1),('b',2,2.2),('c',NULL,3.3),(NULL,NULL,4.4)]
[('g'),('h'),('i')]
[('g'),('h'),('i')]
[('g'),('h'),('i')]

View File

@ -0,0 +1,15 @@
SELECT arrayZipUnaligned(['a', 'b', 'c'], ['d', 'e', 'f']) as x, toTypeName(x);
SELECT arrayZipUnaligned(['a', 'b', 'c'], ['d', 'e', 'f'], ['g', 'h', 'i']);
SELECT arrayZipUnaligned(); -- { serverError TOO_FEW_ARGUMENTS_FOR_FUNCTION }
SELECT arrayZipUnaligned('a', 'b', 'c'); -- { serverError ILLEGAL_TYPE_OF_ARGUMENT }
SELECT arrayZipUnaligned(['a', 'b', 'c'], ['d', 'e', 'f', 'g']);
SELECT arrayZipUnaligned(['a'], [1, 2, 3]);
SELECT arrayZipUnaligned(['a', 'b', 'c'], [1, 2], [1.1, 2.2, 3.3, 4.4]);
SELECT arrayZipUnaligned(materialize(['g', 'h', 'i'])) from numbers(3);

View File

@ -1209,6 +1209,7 @@ arraySum
arrayUniq
arrayWithConstant
arrayZip
arrayZipUnaligned
ascii
asin
asinh