mirror of
https://github.com/ClickHouse/ClickHouse.git
synced 2024-09-19 16:20:50 +00:00
Merge pull request #65009 from yariks5s/group_concat
Aggregate function groupConcat (#63671)
This commit is contained in:
commit
fb50f9475d
265
src/AggregateFunctions/AggregateFunctionGroupConcat.cpp
Normal file
265
src/AggregateFunctions/AggregateFunctionGroupConcat.cpp
Normal file
@ -0,0 +1,265 @@
|
||||
#include <AggregateFunctions/IAggregateFunction.h>
|
||||
#include <AggregateFunctions/AggregateFunctionFactory.h>
|
||||
#include <AggregateFunctions/FactoryHelpers.h>
|
||||
|
||||
#include <Columns/IColumn.h>
|
||||
#include <Columns/ColumnNullable.h>
|
||||
#include <Columns/ColumnString.h>
|
||||
|
||||
#include <Core/ServerSettings.h>
|
||||
#include <Core/ColumnWithTypeAndName.h>
|
||||
|
||||
#include <Common/ArenaAllocator.h>
|
||||
#include <Common/assert_cast.h>
|
||||
#include <Interpreters/castColumn.h>
|
||||
|
||||
#include <DataTypes/IDataType.h>
|
||||
#include <DataTypes/DataTypeArray.h>
|
||||
#include <DataTypes/DataTypeString.h>
|
||||
#include <DataTypes/DataTypesNumber.h>
|
||||
|
||||
#include <IO/ReadHelpers.h>
|
||||
#include <IO/WriteHelpers.h>
|
||||
|
||||
|
||||
namespace DB
|
||||
{
|
||||
struct Settings;
|
||||
|
||||
namespace ErrorCodes
|
||||
{
|
||||
extern const int TOO_MANY_ARGUMENTS_FOR_FUNCTION;
|
||||
extern const int ILLEGAL_TYPE_OF_ARGUMENT;
|
||||
extern const int BAD_ARGUMENTS;
|
||||
}
|
||||
|
||||
namespace
|
||||
{
|
||||
|
||||
struct GroupConcatDataBase
|
||||
{
|
||||
UInt64 data_size = 0;
|
||||
UInt64 allocated_size = 0;
|
||||
char * data = nullptr;
|
||||
|
||||
void checkAndUpdateSize(UInt64 add, Arena * arena)
|
||||
{
|
||||
if (data_size + add >= allocated_size)
|
||||
{
|
||||
auto old_size = allocated_size;
|
||||
allocated_size = std::max(2 * allocated_size, data_size + add);
|
||||
data = arena->realloc(data, old_size, allocated_size);
|
||||
}
|
||||
}
|
||||
|
||||
void insertChar(const char * str, UInt64 str_size, Arena * arena)
|
||||
{
|
||||
checkAndUpdateSize(str_size, arena);
|
||||
memcpy(data + data_size, str, str_size);
|
||||
data_size += str_size;
|
||||
}
|
||||
|
||||
};
|
||||
|
||||
struct GroupConcatData : public GroupConcatDataBase
|
||||
{
|
||||
using Offset = UInt64;
|
||||
using Allocator = MixedAlignedArenaAllocator<alignof(Offset), 4096>;
|
||||
using Offsets = PODArray<Offset, 32, Allocator>;
|
||||
|
||||
/// offset[i * 2] - beginning of the i-th row, offset[i * 2 + 1] - end of the i-th row
|
||||
Offsets offsets;
|
||||
UInt64 num_rows = 0;
|
||||
|
||||
UInt64 getSize(size_t i) const { return offsets[i * 2 + 1] - offsets[i * 2]; }
|
||||
|
||||
UInt64 getString(size_t i) const { return offsets[i * 2]; }
|
||||
|
||||
void insert(const IColumn * column, const SerializationPtr & serialization, size_t row_num, Arena * arena)
|
||||
{
|
||||
WriteBufferFromOwnString buff;
|
||||
serialization->serializeText(*column, row_num, buff, {});
|
||||
auto string = buff.stringView();
|
||||
|
||||
checkAndUpdateSize(string.size(), arena);
|
||||
memcpy(data + data_size, string.data(), string.size());
|
||||
offsets.push_back(data_size, arena);
|
||||
data_size += string.size();
|
||||
offsets.push_back(data_size, arena);
|
||||
num_rows++;
|
||||
}
|
||||
};
|
||||
|
||||
template <bool has_limit>
|
||||
class GroupConcatImpl final
|
||||
: public IAggregateFunctionDataHelper<GroupConcatData, GroupConcatImpl<has_limit>>
|
||||
{
|
||||
static constexpr auto name = "groupConcat";
|
||||
|
||||
SerializationPtr serialization;
|
||||
UInt64 limit;
|
||||
const String delimiter;
|
||||
|
||||
public:
|
||||
GroupConcatImpl(const DataTypePtr & data_type_, const Array & parameters_, UInt64 limit_, const String & delimiter_)
|
||||
: IAggregateFunctionDataHelper<GroupConcatData, GroupConcatImpl<has_limit>>(
|
||||
{data_type_}, parameters_, std::make_shared<DataTypeString>())
|
||||
, serialization(this->argument_types[0]->getDefaultSerialization())
|
||||
, limit(limit_)
|
||||
, delimiter(delimiter_)
|
||||
{
|
||||
}
|
||||
|
||||
String getName() const override { return name; }
|
||||
|
||||
void add(AggregateDataPtr __restrict place, const IColumn ** columns, size_t row_num, Arena * arena) const override
|
||||
{
|
||||
auto & cur_data = this->data(place);
|
||||
|
||||
if constexpr (has_limit)
|
||||
if (cur_data.num_rows >= limit)
|
||||
return;
|
||||
|
||||
if (cur_data.data_size != 0)
|
||||
cur_data.insertChar(delimiter.c_str(), delimiter.size(), arena);
|
||||
|
||||
cur_data.insert(columns[0], serialization, row_num, arena);
|
||||
}
|
||||
|
||||
void merge(AggregateDataPtr __restrict place, ConstAggregateDataPtr rhs, Arena * arena) const override
|
||||
{
|
||||
auto & cur_data = this->data(place);
|
||||
auto & rhs_data = this->data(rhs);
|
||||
|
||||
if (rhs_data.data_size == 0)
|
||||
return;
|
||||
|
||||
if constexpr (has_limit)
|
||||
{
|
||||
UInt64 new_elems_count = std::min(rhs_data.num_rows, limit - cur_data.num_rows);
|
||||
for (UInt64 i = 0; i < new_elems_count; ++i)
|
||||
{
|
||||
if (cur_data.data_size != 0)
|
||||
cur_data.insertChar(delimiter.c_str(), delimiter.size(), arena);
|
||||
|
||||
cur_data.offsets.push_back(cur_data.data_size, arena);
|
||||
cur_data.insertChar(rhs_data.data + rhs_data.getString(i), rhs_data.getSize(i), arena);
|
||||
cur_data.num_rows++;
|
||||
cur_data.offsets.push_back(cur_data.data_size, arena);
|
||||
}
|
||||
}
|
||||
else
|
||||
{
|
||||
if (cur_data.data_size != 0)
|
||||
cur_data.insertChar(delimiter.c_str(), delimiter.size(), arena);
|
||||
|
||||
cur_data.insertChar(rhs_data.data, rhs_data.data_size, arena);
|
||||
}
|
||||
}
|
||||
|
||||
void serialize(ConstAggregateDataPtr __restrict place, WriteBuffer & buf, std::optional<size_t> /* version */) const override
|
||||
{
|
||||
auto & cur_data = this->data(place);
|
||||
|
||||
writeVarUInt(cur_data.data_size, buf);
|
||||
writeVarUInt(cur_data.allocated_size, buf);
|
||||
|
||||
buf.write(cur_data.data, cur_data.data_size);
|
||||
|
||||
if constexpr (has_limit)
|
||||
{
|
||||
writeVarUInt(cur_data.num_rows, buf);
|
||||
for (const auto & offset : cur_data.offsets)
|
||||
writeVarUInt(offset, buf);
|
||||
}
|
||||
}
|
||||
|
||||
void deserialize(AggregateDataPtr __restrict place, ReadBuffer & buf, std::optional<size_t> /* version */, Arena * arena) const override
|
||||
{
|
||||
auto & cur_data = this->data(place);
|
||||
|
||||
readVarUInt(cur_data.data_size, buf);
|
||||
readVarUInt(cur_data.allocated_size, buf);
|
||||
|
||||
buf.readStrict(cur_data.data, cur_data.data_size);
|
||||
|
||||
if constexpr (has_limit)
|
||||
{
|
||||
readVarUInt(cur_data.num_rows, buf);
|
||||
cur_data.offsets.resize_exact(cur_data.num_rows * 2, arena);
|
||||
for (auto & offset : cur_data.offsets)
|
||||
readVarUInt(offset, buf);
|
||||
}
|
||||
}
|
||||
|
||||
void insertResultInto(AggregateDataPtr __restrict place, IColumn & to, Arena *) const override
|
||||
{
|
||||
auto & cur_data = this->data(place);
|
||||
|
||||
if (cur_data.data_size == 0)
|
||||
{
|
||||
auto column_nullable = IColumn::mutate(makeNullable(to.getPtr()));
|
||||
column_nullable->insertDefault();
|
||||
return;
|
||||
}
|
||||
|
||||
auto & column_string = assert_cast<ColumnString &>(to);
|
||||
column_string.insertData(cur_data.data, cur_data.data_size);
|
||||
}
|
||||
|
||||
bool allocatesMemoryInArena() const override { return true; }
|
||||
};
|
||||
|
||||
AggregateFunctionPtr createAggregateFunctionGroupConcat(
|
||||
const std::string & name, const DataTypes & argument_types, const Array & parameters, const Settings *)
|
||||
{
|
||||
assertUnary(name, argument_types);
|
||||
|
||||
bool has_limit = false;
|
||||
UInt64 limit = 0;
|
||||
String delimiter;
|
||||
|
||||
if (parameters.size() > 2)
|
||||
throw Exception(ErrorCodes::TOO_MANY_ARGUMENTS_FOR_FUNCTION,
|
||||
"Incorrect number of parameters for aggregate function {}, should be 0, 1 or 2, got: {}", name, parameters.size());
|
||||
|
||||
if (!parameters.empty())
|
||||
{
|
||||
auto type = parameters[0].getType();
|
||||
if (type != Field::Types::String)
|
||||
throw Exception(ErrorCodes::ILLEGAL_TYPE_OF_ARGUMENT, "First parameter for aggregate function {} should be string", name);
|
||||
|
||||
delimiter = parameters[0].get<String>();
|
||||
}
|
||||
if (parameters.size() == 2)
|
||||
{
|
||||
auto type = parameters[1].getType();
|
||||
|
||||
if (type != Field::Types::Int64 && type != Field::Types::UInt64)
|
||||
throw Exception(ErrorCodes::BAD_ARGUMENTS, "Second parameter for aggregate function {} should be a positive number", name);
|
||||
|
||||
if ((type == Field::Types::Int64 && parameters[1].get<Int64>() <= 0) ||
|
||||
(type == Field::Types::UInt64 && parameters[1].get<UInt64>() == 0))
|
||||
throw Exception(ErrorCodes::BAD_ARGUMENTS, "Second parameter for aggregate function {} should be a positive number, got: {}", name, parameters[1].get<Int64>());
|
||||
|
||||
has_limit = true;
|
||||
limit = parameters[1].get<UInt64>();
|
||||
}
|
||||
|
||||
if (has_limit)
|
||||
return std::make_shared<GroupConcatImpl</* has_limit= */ true>>(argument_types[0], parameters, limit, delimiter);
|
||||
else
|
||||
return std::make_shared<GroupConcatImpl</* has_limit= */ false>>(argument_types[0], parameters, limit, delimiter);
|
||||
}
|
||||
|
||||
}
|
||||
|
||||
void registerAggregateFunctionGroupConcat(AggregateFunctionFactory & factory)
|
||||
{
|
||||
AggregateFunctionProperties properties = { .returns_default_when_only_null = false, .is_order_dependent = true };
|
||||
|
||||
factory.registerFunction("groupConcat", { createAggregateFunctionGroupConcat, properties });
|
||||
factory.registerAlias("group_concat", "groupConcat", AggregateFunctionFactory::CaseInsensitive);
|
||||
}
|
||||
|
||||
}
|
@ -19,6 +19,7 @@ void registerAggregateFunctionGroupArraySorted(AggregateFunctionFactory & factor
|
||||
void registerAggregateFunctionGroupUniqArray(AggregateFunctionFactory &);
|
||||
void registerAggregateFunctionGroupArrayInsertAt(AggregateFunctionFactory &);
|
||||
void registerAggregateFunctionGroupArrayIntersect(AggregateFunctionFactory &);
|
||||
void registerAggregateFunctionGroupConcat(AggregateFunctionFactory &);
|
||||
void registerAggregateFunctionsQuantile(AggregateFunctionFactory &);
|
||||
void registerAggregateFunctionsQuantileDeterministic(AggregateFunctionFactory &);
|
||||
void registerAggregateFunctionsQuantileExact(AggregateFunctionFactory &);
|
||||
@ -120,6 +121,7 @@ void registerAggregateFunctions()
|
||||
registerAggregateFunctionGroupUniqArray(factory);
|
||||
registerAggregateFunctionGroupArrayInsertAt(factory);
|
||||
registerAggregateFunctionGroupArrayIntersect(factory);
|
||||
registerAggregateFunctionGroupConcat(factory);
|
||||
registerAggregateFunctionsQuantile(factory);
|
||||
registerAggregateFunctionsQuantileDeterministic(factory);
|
||||
registerAggregateFunctionsQuantileExact(factory);
|
||||
|
14
tests/queries/0_stateless/03156_group_concat.reference
Normal file
14
tests/queries/0_stateless/03156_group_concat.reference
Normal file
@ -0,0 +1,14 @@
|
||||
0 95 abc [1,2,3]
|
||||
1 \N a [993,986,979,972]
|
||||
2 123 makson95 []
|
||||
95123
|
||||
abcamakson95
|
||||
[1,2,3][993,986,979,972][]
|
||||
95,123
|
||||
abc,a,makson95
|
||||
[1,2,3],[993,986,979,972]
|
||||
\N
|
||||
951239512395123
|
||||
abc,a,makson95,abc,a,makson95,abc,a,makson95
|
||||
[1,2,3][993,986,979,972][][1,2,3][993,986,979,972][][1,2,3][993,986,979,972][]
|
||||
488890
|
40
tests/queries/0_stateless/03156_group_concat.sql
Normal file
40
tests/queries/0_stateless/03156_group_concat.sql
Normal file
@ -0,0 +1,40 @@
|
||||
DROP TABLE IF EXISTS test_groupConcat;
|
||||
CREATE TABLE test_groupConcat
|
||||
(
|
||||
id UInt64,
|
||||
p_int Int32 NULL,
|
||||
p_string String,
|
||||
p_array Array(Int32)
|
||||
) ENGINE = MergeTree ORDER BY id;
|
||||
|
||||
SET max_insert_threads = 1, max_threads = 1, min_insert_block_size_rows = 0, min_insert_block_size_bytes = 0;
|
||||
INSERT INTO test_groupConcat VALUES (0, 95, 'abc', [1, 2, 3]), (1, NULL, 'a', [993, 986, 979, 972]), (2, 123, 'makson95', []);
|
||||
|
||||
SELECT * FROM test_groupConcat;
|
||||
|
||||
SELECT groupConcat(p_int) FROM test_groupConcat;
|
||||
SELECT groupConcat(p_string) FROM test_groupConcat;
|
||||
SELECT groupConcat(p_array) FROM test_groupConcat;
|
||||
|
||||
SELECT groupConcat(',')(p_int) FROM test_groupConcat;
|
||||
SELECT groupConcat(',')(p_string) FROM test_groupConcat;
|
||||
SELECT groupConcat(',', 2)(p_array) FROM test_groupConcat;
|
||||
|
||||
SELECT groupConcat(p_int) FROM test_groupConcat WHERE id = 1;
|
||||
|
||||
INSERT INTO test_groupConcat VALUES (0, 95, 'abc', [1, 2, 3]), (1, NULL, 'a', [993, 986, 979, 972]), (2, 123, 'makson95', []);
|
||||
INSERT INTO test_groupConcat VALUES (0, 95, 'abc', [1, 2, 3]), (1, NULL, 'a', [993, 986, 979, 972]), (2, 123, 'makson95', []);
|
||||
|
||||
SELECT groupConcat(p_int) FROM test_groupConcat;
|
||||
SELECT groupConcat(',')(p_string) FROM test_groupConcat;
|
||||
SELECT groupConcat(p_array) FROM test_groupConcat;
|
||||
|
||||
SELECT groupConcat(123)(number) FROM numbers(10); -- { serverError ILLEGAL_TYPE_OF_ARGUMENT }
|
||||
SELECT groupConcat(',', '3')(number) FROM numbers(10); -- { serverError BAD_ARGUMENTS }
|
||||
SELECT groupConcat(',', 0)(number) FROM numbers(10); -- { serverError BAD_ARGUMENTS }
|
||||
SELECT groupConcat(',', -1)(number) FROM numbers(10); -- { serverError BAD_ARGUMENTS }
|
||||
SELECT groupConcat(',', 3, 3)(number) FROM numbers(10); -- { serverError TOO_MANY_ARGUMENTS_FOR_FUNCTION }
|
||||
|
||||
SELECT length(groupConcat(number)) FROM numbers(100000);
|
||||
|
||||
DROP TABLE IF EXISTS test_groupConcat;
|
Loading…
Reference in New Issue
Block a user