mirror of
https://github.com/ClickHouse/ClickHouse.git
synced 2024-11-24 08:32:02 +00:00
Merge pull request #69030 from bigo-sg/improve_array_zip
Add new function arrayZipUnaligned
This commit is contained in:
commit
10d2631ec9
@ -2035,6 +2035,7 @@ Query:
|
||||
SELECT arrayZip(['a', 'b', 'c'], [5, 2, 1]);
|
||||
```
|
||||
|
||||
|
||||
Result:
|
||||
|
||||
``` text
|
||||
@ -2043,6 +2044,43 @@ Result:
|
||||
└──────────────────────────────────────┘
|
||||
```
|
||||
|
||||
## arrayZipUnaligned
|
||||
|
||||
Combines multiple arrays into a single array, allowing for unaligned arrays. The resulting array contains the corresponding elements of the source arrays grouped into tuples in the listed order of arguments.
|
||||
|
||||
**Syntax**
|
||||
|
||||
``` sql
|
||||
arrayZipUnaligned(arr1, arr2, ..., arrN)
|
||||
```
|
||||
|
||||
**Arguments**
|
||||
|
||||
- `arrN` — [Array](../data-types/array.md).
|
||||
|
||||
The function can take any number of arrays of different types.
|
||||
|
||||
**Returned value**
|
||||
|
||||
- Array with elements from the source arrays grouped into [tuples](../data-types/tuple.md). The data types in the tuple are the [Nullable](../data-types/nullable.md) versions of the element types of the input arrays, in the same order as the arrays are passed. [Array](../data-types/array.md). If the arrays have different sizes, the shorter arrays are padded with `NULL` values.
|
||||
|
||||
**Example**
|
||||
|
||||
Query:
|
||||
|
||||
``` sql
|
||||
SELECT arrayZipUnaligned(['a'], [1, 2, 3]);
|
||||
```
|
||||
|
||||
Result:
|
||||
|
||||
``` text
|
||||
┌─arrayZipUnaligned(['a'], [1, 2, 3])─┐
|
||||
│ [('a',1),(NULL,2),(NULL,3)] │
|
||||
└─────────────────────────────────────┘
|
||||
```
|
||||
|
||||
|
||||
## arrayAUC
|
||||
|
||||
Calculate AUC (Area Under the Curve, which is a concept in machine learning, see more details: <https://en.wikipedia.org/wiki/Receiver_operating_characteristic#Area_under_the_curve>).
|
||||
|
@ -1,7 +1,8 @@
|
||||
#include <Columns/ColumnTuple.h>
|
||||
#include <Columns/ColumnArray.h>
|
||||
#include <DataTypes/DataTypeTuple.h>
|
||||
#include <Columns/ColumnNullable.h>
|
||||
#include <Columns/ColumnTuple.h>
|
||||
#include <DataTypes/DataTypeArray.h>
|
||||
#include <DataTypes/DataTypeTuple.h>
|
||||
#include <Functions/FunctionFactory.h>
|
||||
#include <Functions/FunctionHelpers.h>
|
||||
#include <IO/WriteHelpers.h>
|
||||
@ -12,23 +13,22 @@ namespace DB
|
||||
|
||||
namespace ErrorCodes
|
||||
{
|
||||
extern const int ILLEGAL_TYPE_OF_ARGUMENT;
|
||||
extern const int SIZES_OF_ARRAYS_DONT_MATCH;
|
||||
extern const int TOO_FEW_ARGUMENTS_FOR_FUNCTION;
|
||||
extern const int ILLEGAL_COLUMN;
|
||||
extern const int ILLEGAL_TYPE_OF_ARGUMENT;
|
||||
extern const int SIZES_OF_ARRAYS_DONT_MATCH;
|
||||
extern const int TOO_FEW_ARGUMENTS_FOR_FUNCTION;
|
||||
extern const int ILLEGAL_COLUMN;
|
||||
}
|
||||
|
||||
/// arrayZip(['a', 'b', 'c'], ['d', 'e', 'f']) = [('a', 'd'), ('b', 'e'), ('c', 'f')]
|
||||
/// arrayZipUnaligned(['a', 'b', 'c'], ['d', 'e']) = [('a', 'd'), ('b', 'e'), ('c', NULL)]
|
||||
template <bool allow_unaligned>
|
||||
class FunctionArrayZip : public IFunction
|
||||
{
|
||||
public:
|
||||
static constexpr auto name = "arrayZip";
|
||||
static constexpr auto name = allow_unaligned ? "arrayZipUnaligned" : "arrayZip";
|
||||
static FunctionPtr create(ContextPtr) { return std::make_shared<FunctionArrayZip>(); }
|
||||
|
||||
String getName() const override
|
||||
{
|
||||
return name;
|
||||
}
|
||||
String getName() const override { return name; }
|
||||
|
||||
bool isVariadic() const override { return true; }
|
||||
size_t getNumberOfArguments() const override { return 0; }
|
||||
@ -39,8 +39,11 @@ public:
|
||||
DataTypePtr getReturnTypeImpl(const ColumnsWithTypeAndName & arguments) const override
|
||||
{
|
||||
if (arguments.empty())
|
||||
throw Exception(ErrorCodes::TOO_FEW_ARGUMENTS_FOR_FUNCTION,
|
||||
"Function {} needs at least one argument; passed {}." , getName(), arguments.size());
|
||||
throw Exception(
|
||||
ErrorCodes::TOO_FEW_ARGUMENTS_FOR_FUNCTION,
|
||||
"Function {} needs at least one argument; passed {}.",
|
||||
getName(),
|
||||
arguments.size());
|
||||
|
||||
DataTypes arguments_types;
|
||||
for (size_t index = 0; index < arguments.size(); ++index)
|
||||
@ -48,56 +51,142 @@ public:
|
||||
const DataTypeArray * array_type = checkAndGetDataType<DataTypeArray>(arguments[index].type.get());
|
||||
|
||||
/// Every argument must be an array. The message reports the type of the
/// offending argument itself (arguments[index]), not that of the first one.
if (!array_type)
    throw Exception(
        ErrorCodes::ILLEGAL_TYPE_OF_ARGUMENT,
        "Argument {} of function {} must be array. Found {} instead.",
        toString(index + 1),
        getName(),
        arguments[index].type->getName());
|
||||
|
||||
arguments_types.emplace_back(array_type->getNestedType());
|
||||
auto nested_type = array_type->getNestedType();
|
||||
if constexpr (allow_unaligned)
|
||||
nested_type = makeNullable(nested_type);
|
||||
arguments_types.emplace_back(nested_type);
|
||||
}
|
||||
|
||||
return std::make_shared<DataTypeArray>(std::make_shared<DataTypeTuple>(arguments_types));
|
||||
}
|
||||
|
||||
ColumnPtr executeImpl(const ColumnsWithTypeAndName & arguments, const DataTypePtr &, size_t /*input_rows_count*/) const override
|
||||
ColumnPtr
|
||||
executeImpl(const ColumnsWithTypeAndName & arguments, const DataTypePtr & /*result_type*/, size_t input_rows_count) const override
|
||||
{
|
||||
size_t num_arguments = arguments.size();
|
||||
|
||||
ColumnPtr first_array_column;
|
||||
Columns holders(num_arguments);
|
||||
Columns tuple_columns(num_arguments);
|
||||
|
||||
bool has_unaligned = false;
|
||||
size_t unaligned_index = 0;
|
||||
for (size_t i = 0; i < num_arguments; ++i)
|
||||
{
|
||||
/// Constant columns cannot be inside tuple. It's only possible to have constant tuple as a whole.
|
||||
ColumnPtr holder = arguments[i].column->convertToFullColumnIfConst();
|
||||
holders[i] = holder;
|
||||
|
||||
const ColumnArray * column_array = checkAndGetColumn<ColumnArray>(holder.get());
|
||||
|
||||
if (!column_array)
|
||||
throw Exception(ErrorCodes::ILLEGAL_COLUMN, "Argument {} of function {} must be array. Found column {} instead.",
|
||||
i + 1, getName(), holder->getName());
|
||||
|
||||
if (i == 0)
|
||||
{
|
||||
first_array_column = holder;
|
||||
}
|
||||
else if (!column_array->hasEqualOffsets(static_cast<const ColumnArray &>(*first_array_column)))
|
||||
{
|
||||
throw Exception(ErrorCodes::SIZES_OF_ARRAYS_DONT_MATCH,
|
||||
"The argument 1 and argument {} of function {} have different array sizes",
|
||||
i + 1, getName());
|
||||
}
|
||||
|
||||
throw Exception(
|
||||
ErrorCodes::ILLEGAL_COLUMN,
|
||||
"Argument {} of function {} must be array. Found column {} instead.",
|
||||
i + 1,
|
||||
getName(),
|
||||
holder->getName());
|
||||
tuple_columns[i] = column_array->getDataPtr();
|
||||
|
||||
if (i && !column_array->hasEqualOffsets(static_cast<const ColumnArray &>(*holders[0])))
|
||||
{
|
||||
has_unaligned = true;
|
||||
unaligned_index = i;
|
||||
}
|
||||
}
|
||||
|
||||
return ColumnArray::create(
|
||||
ColumnTuple::create(tuple_columns), static_cast<const ColumnArray &>(*first_array_column).getOffsetsPtr());
|
||||
if constexpr (!allow_unaligned)
|
||||
{
|
||||
if (has_unaligned)
|
||||
throw Exception(
|
||||
ErrorCodes::SIZES_OF_ARRAYS_DONT_MATCH,
|
||||
"The argument 1 and argument {} of function {} have different array sizes",
|
||||
unaligned_index + 1,
|
||||
getName());
|
||||
else
|
||||
return ColumnArray::create(
|
||||
ColumnTuple::create(std::move(tuple_columns)), static_cast<const ColumnArray &>(*holders[0]).getOffsetsPtr());
|
||||
}
|
||||
else
|
||||
return executeUnaligned(holders, tuple_columns, input_rows_count, has_unaligned);
|
||||
}
|
||||
|
||||
private:
|
||||
/// Builds the result column for the variant that accepts arrays of different lengths.
///
/// holders          - the argument columns; each one is read here as a ColumnArray.
/// tuple_columns    - the nested data columns of those arrays; wrapped with makeNullable in place.
/// input_rows_count - number of rows to produce.
/// has_unaligned    - whether the caller found an argument whose offsets differ from the first one.
///
/// Returns an array-of-tuples column. When nothing is unaligned, the nullable data columns are
/// reused together with the offsets of the first argument. Otherwise, for every row each
/// argument's elements are copied and then topped up with default values (via insertManyDefaults)
/// up to the length of the longest array in that row.
ColumnPtr executeUnaligned(const Columns & holders, Columns & tuple_columns, size_t input_rows_count, bool has_unaligned) const
{
    const size_t num_arrays = holders.size();

    std::vector<const ColumnArray *> arrays(num_arrays);
    for (size_t idx = 0; idx < num_arrays; ++idx)
        arrays[idx] = checkAndGetColumn<ColumnArray>(holders[idx].get());

    for (size_t idx = 0; idx < tuple_columns.size(); ++idx)
        tuple_columns[idx] = makeNullable(tuple_columns[idx]);

    /// Fast path: all offsets match, so no padding is required.
    if (!has_unaligned)
        return ColumnArray::create(ColumnTuple::create(std::move(tuple_columns)), arrays[0]->getOffsetsPtr());

    const size_t num_tuple_columns = tuple_columns.size();
    MutableColumns padded_columns(num_tuple_columns);
    for (size_t idx = 0; idx < num_tuple_columns; ++idx)
    {
        const auto & source = tuple_columns[idx];
        padded_columns[idx] = source->cloneEmpty();
        padded_columns[idx]->reserve(source->size());
    }

    auto result_offsets_column = ColumnArray::ColumnOffsets::create(input_rows_count);
    auto & result_offsets = assert_cast<ColumnArray::ColumnOffsets &>(*result_offsets_column).getData();

    size_t total_elements = 0;
    for (size_t row = 0; row < input_rows_count; ++row)
    {
        /// First pass: find the longest array in this row.
        /// NOTE(review): for row 0 this reads offsets[row - 1]; presumably the offsets container
        /// tolerates index -1 and yields 0 there - confirm against the container implementation.
        size_t longest = 0;
        for (size_t arg = 0; arg < num_arrays; ++arg)
        {
            const auto & offsets = arrays[arg]->getOffsets();
            const size_t begin = offsets[row - 1];
            const size_t length = offsets[row] - begin;
            longest = std::max(longest, length);
        }

        /// Second pass: copy the row's elements, then pad the shorter arrays.
        for (size_t arg = 0; arg < num_arrays; ++arg)
        {
            const auto & offsets = arrays[arg]->getOffsets();
            const size_t begin = offsets[row - 1];
            const size_t length = offsets[row] - begin;

            padded_columns[arg]->insertRangeFrom(*tuple_columns[arg], begin, length);
            padded_columns[arg]->insertManyDefaults(longest - length);
        }

        total_elements += longest;
        result_offsets[row] = total_elements;
    }

    return ColumnArray::create(ColumnTuple::create(std::move(padded_columns)), std::move(result_offsets_column));
}
|
||||
};
|
||||
|
||||
/// Registers both instantiations of FunctionArrayZip:
///  - FunctionArrayZip<false> as `arrayZip` (arrays must have equal sizes);
///  - FunctionArrayZip<true> as `arrayZipUnaligned` (shorter arrays are padded).
/// Both are array functions, so they are listed under the "Array" category.
REGISTER_FUNCTION(ArrayZip)
{
    factory.registerFunction<FunctionArrayZip<false>>(
        {.description = R"(
Combines multiple arrays into a single array. The resulting array contains the corresponding elements of the source arrays grouped into tuples in the listed order of arguments.
)",
         .categories{"Array"}});

    factory.registerFunction<FunctionArrayZip<true>>(
        {.description = R"(
Combines multiple arrays into a single array, allowing for unaligned arrays. The resulting array contains the corresponding elements of the source arrays grouped into tuples in the listed order of arguments.

If the arrays have different sizes, the shorter arrays will be padded with `null` values.
)",
         .categories{"Array"}});
}
|
||||
|
||||
}
|
||||
|
||||
|
@ -143,7 +143,6 @@ arrayStringConcat
|
||||
arraySum
|
||||
arrayUniq
|
||||
arrayWithConstant
|
||||
arrayZip
|
||||
asinh
|
||||
assumeNotNull
|
||||
atan
|
||||
|
@ -0,0 +1,8 @@
|
||||
[('a','d'),('b','e'),('c','f')] Array(Tuple(Nullable(String), Nullable(String)))
|
||||
[('a','d','g'),('b','e','h'),('c','f','i')]
|
||||
[('a','d'),('b','e'),('c','f'),(NULL,'g')]
|
||||
[('a',1),(NULL,2),(NULL,3)]
|
||||
[('a',1,1.1),('b',2,2.2),('c',NULL,3.3),(NULL,NULL,4.4)]
|
||||
[('g'),('h'),('i')]
|
||||
[('g'),('h'),('i')]
|
||||
[('g'),('h'),('i')]
|
15
tests/queries/0_stateless/03230_array_zip_unaligned.sql
Normal file
15
tests/queries/0_stateless/03230_array_zip_unaligned.sql
Normal file
@ -0,0 +1,15 @@
|
||||
SELECT arrayZipUnaligned(['a', 'b', 'c'], ['d', 'e', 'f']) as x, toTypeName(x);
|
||||
|
||||
SELECT arrayZipUnaligned(['a', 'b', 'c'], ['d', 'e', 'f'], ['g', 'h', 'i']);
|
||||
|
||||
SELECT arrayZipUnaligned(); -- { serverError TOO_FEW_ARGUMENTS_FOR_FUNCTION }
|
||||
|
||||
SELECT arrayZipUnaligned('a', 'b', 'c'); -- { serverError ILLEGAL_TYPE_OF_ARGUMENT }
|
||||
|
||||
SELECT arrayZipUnaligned(['a', 'b', 'c'], ['d', 'e', 'f', 'g']);
|
||||
|
||||
SELECT arrayZipUnaligned(['a'], [1, 2, 3]);
|
||||
|
||||
SELECT arrayZipUnaligned(['a', 'b', 'c'], [1, 2], [1.1, 2.2, 3.3, 4.4]);
|
||||
|
||||
SELECT arrayZipUnaligned(materialize(['g', 'h', 'i'])) from numbers(3);
|
@ -1209,6 +1209,7 @@ arraySum
|
||||
arrayUniq
|
||||
arrayWithConstant
|
||||
arrayZip
|
||||
arrayZipUnaligned
|
||||
ascii
|
||||
asin
|
||||
asinh
|
||||
|
Loading…
Reference in New Issue
Block a user