mirror of
https://github.com/ClickHouse/ClickHouse.git
synced 2024-11-21 15:12:02 +00:00
[CLICKHOUSE-2830] add FunctionArrayDistinct (#2670)
* [CLICKHOUSE-2830] add FunctionArrayDistinct * [#CLICKHOUSE-2830] Add tests and fix * [CLICKHOUSE-2830] fix bug with array without NULL * [CLICKHOUSE-2830] fix test * [CLICKHOUSE-3844] Add support for Tuple and other types in Uniq, EnumerateUniq and Distinct * [CLICKHOUSE-3844] Add '\n' * [CLICKHOUSE-3844] fix * [CLICKHOUSE-2830] del Nullable return type * Update FunctionsArray.cpp * [CLICKHOUSE-3844] add new tests * [CLICKHOUSE-3844] add new tests
This commit is contained in:
parent
56523948bf
commit
b8ae9f7951
@ -41,6 +41,7 @@ generate_function_register(Array
|
||||
FunctionArrayEnumerate
|
||||
FunctionArrayEnumerateUniq
|
||||
FunctionArrayUniq
|
||||
FunctionArrayDistinct
|
||||
FunctionEmptyArrayUInt8
|
||||
FunctionEmptyArrayUInt16
|
||||
FunctionEmptyArrayUInt32
|
||||
|
@ -1062,9 +1062,7 @@ void FunctionArrayUniq::executeImpl(Block & block, const ColumnNumbers & argumen
|
||||
|| executeNumber<Float32>(first_array, first_null_map, res_values)
|
||||
|| executeNumber<Float64>(first_array, first_null_map, res_values)
|
||||
|| executeString(first_array, first_null_map, res_values)))
|
||||
throw Exception("Illegal column " + block.getByPosition(arguments[0]).column->getName()
|
||||
+ " of first argument of function " + getName(),
|
||||
ErrorCodes::ILLEGAL_COLUMN);
|
||||
executeHashed(*offsets, original_data_columns, res_values);
|
||||
}
|
||||
else
|
||||
{
|
||||
@ -1272,6 +1270,213 @@ void FunctionArrayUniq::executeHashed(
|
||||
}
|
||||
}
|
||||
|
||||
/// Implementation of FunctionArrayDistinct.
|
||||
|
||||
/// Creates a new instance of the function. The Context argument is not used.
FunctionPtr FunctionArrayDistinct::create(const Context &)
{
    FunctionPtr instance = std::make_shared<FunctionArrayDistinct>();
    return instance;
}
|
||||
|
||||
/// Returns the function name stored in the static `name` constant of the class.
String FunctionArrayDistinct::getName() const
{
    return String(name);
}
|
||||
|
||||
/** Deduces the result type from the type of the single argument.
  *
  * The argument must be an array; the result is an array of the same element type
  * with the Nullable wrapper (if any) removed from the elements.
  * Throws ILLEGAL_TYPE_OF_ARGUMENT if the argument is not an array.
  */
DataTypePtr FunctionArrayDistinct::getReturnTypeImpl(const DataTypes & arguments) const
{
    const DataTypePtr & argument_type = arguments[0];

    if (const DataTypeArray * array_type = checkAndGetDataType<DataTypeArray>(argument_type.get()))
    {
        /// The element type of the result is never Nullable.
        return std::make_shared<DataTypeArray>(removeNullable(array_type->getNestedType()));
    }

    throw Exception("Argument for function " + getName() + " must be array but it "
        " has type " + argument_type->getName() + ".",
        ErrorCodes::ILLEGAL_TYPE_OF_ARGUMENT);
}
|
||||
|
||||
/** Computes the function for the column block[arguments[0]] and stores the result column in block[result].
  *
  * The element column of the source array is unwrapped from Nullable (if it is Nullable) and then
  * dispatched by its concrete type: numeric columns go to executeNumber<T>, String columns to
  * executeString. If none of them accepts the column, executeHashed is used as the generic fallback;
  * it receives the original (possibly Nullable) element column.
  *
  * Throws ILLEGAL_COLUMN if the argument column is not a ColumnArray.
  */
void FunctionArrayDistinct::executeImpl(Block & block, const ColumnNumbers & arguments, size_t result, size_t /*input_rows_count*/)
{
    ColumnPtr array_ptr = block.getByPosition(arguments[0]).column;
    const ColumnArray * array = checkAndGetColumn<ColumnArray>(array_ptr.get());

    /// checkAndGetColumn yields nullptr on a type mismatch; report it instead of dereferencing a null pointer below.
    if (!array)
        throw Exception("Illegal column " + array_ptr->getName()
            + " of first argument of function " + getName(),
            ErrorCodes::ILLEGAL_COLUMN);

    const auto & return_type = block.getByPosition(result).type;

    auto res_ptr = return_type->createColumn();
    ColumnArray & res = static_cast<ColumnArray &>(*res_ptr);

    const IColumn & src_data = array->getData();
    const ColumnArray::Offsets & offsets = array->getOffsets();

    /// The generic fallback works on a list of columns; here it consists of the single element column.
    ColumnRawPtrs original_data_columns;
    original_data_columns.push_back(&src_data);

    IColumn & res_data = res.getData();
    ColumnArray::Offsets & res_offsets = res.getOffsets();

    /// For Nullable elements the typed implementations get the nested column plus the Nullable wrapper
    /// (for its null map); otherwise they get the element column itself and a null wrapper pointer.
    const ColumnNullable * nullable_col = nullptr;
    const IColumn * inner_col = &src_data;

    if (src_data.isColumnNullable())
    {
        nullable_col = static_cast<const ColumnNullable *>(&src_data);
        inner_col = &nullable_col->getNestedColumn();
    }

    /// Each typed implementation returns false without touching the result if the column type does not match.
    if (!(executeNumber<UInt8>(*inner_col, offsets, res_data, res_offsets, nullable_col)
        || executeNumber<UInt16>(*inner_col, offsets, res_data, res_offsets, nullable_col)
        || executeNumber<UInt32>(*inner_col, offsets, res_data, res_offsets, nullable_col)
        || executeNumber<UInt64>(*inner_col, offsets, res_data, res_offsets, nullable_col)
        || executeNumber<Int8>(*inner_col, offsets, res_data, res_offsets, nullable_col)
        || executeNumber<Int16>(*inner_col, offsets, res_data, res_offsets, nullable_col)
        || executeNumber<Int32>(*inner_col, offsets, res_data, res_offsets, nullable_col)
        || executeNumber<Int64>(*inner_col, offsets, res_data, res_offsets, nullable_col)
        || executeNumber<Float32>(*inner_col, offsets, res_data, res_offsets, nullable_col)
        || executeNumber<Float64>(*inner_col, offsets, res_data, res_offsets, nullable_col)
        || executeString(*inner_col, offsets, res_data, res_offsets, nullable_col)))
        executeHashed(offsets, original_data_columns, res_data, res_offsets);

    block.getByPosition(result).column = std::move(res_ptr);
}
|
||||
|
||||
/** Implementation for arrays whose elements are stored in a ColumnVector<T>.
  *
  * src_data     - element column of the source array (already unwrapped from Nullable);
  * src_offsets  - offsets of the source array column;
  * res_data_col - element column of the result, must be a ColumnVector<T>;
  * res_offsets  - offsets of the result array column, one value is appended per source array;
  * nullable_col - the Nullable wrapper of the source elements, or nullptr if they are not Nullable.
  *
  * Returns false (and changes nothing) if src_data is not a ColumnVector<T>, true otherwise.
  * For every source array the first occurrence of each value is kept, in the original order;
  * NULL elements are skipped.
  */
template <typename T>
bool FunctionArrayDistinct::executeNumber(const IColumn & src_data,
    const ColumnArray::Offsets & src_offsets,
    IColumn & res_data_col,
    ColumnArray::Offsets & res_offsets,
    const ColumnNullable * nullable_col)
{
    const ColumnVector<T> * src_data_concrete = checkAndGetColumn<ColumnVector<T>>(&src_data);

    if (!src_data_concrete)
        return false;

    const PaddedPODArray<T> & values = src_data_concrete->getData();
    PaddedPODArray<T> & res_data = typeid_cast<ColumnVector<T> &>(res_data_col).getData();

    const PaddedPODArray<UInt8> * src_null_map = nullptr;

    if (nullable_col)
        src_null_map = &static_cast<const ColumnUInt8 *>(&nullable_col->getNullMapColumn())->getData();

    using Set = ClearableHashSet<T,
        DefaultHash<T>,
        HashTableGrower<INITIAL_SIZE_DEGREE>,
        HashTableAllocatorWithStackMemory<(1ULL << INITIAL_SIZE_DEGREE) * sizeof(T)>>;

    Set set;
    size_t prev_src_offset = 0;

    for (size_t i = 0; i < src_offsets.size(); ++i)
    {
        /// The set of already seen values is per array.
        set.clear();

        const size_t curr_src_offset = src_offsets[i];
        for (size_t j = prev_src_offset; j < curr_src_offset; ++j)
        {
            /// NULL elements never get into the result.
            if (src_null_map && (*src_null_map)[j] != 0)
                continue;

            if (set.find(values[j]) == set.end())
            {
                res_data.emplace_back(values[j]);
                set.insert(values[j]);
            }
        }

        /// An offset is the end position of the array inside the *result* data, i.e. the number of
        /// elements written so far. It must not be derived from the source offset: the two diverge
        /// as soon as any earlier array contained a duplicate or a NULL.
        res_offsets.emplace_back(res_data.size());

        prev_src_offset = curr_src_offset;
    }

    return true;
}
|
||||
|
||||
/** Implementation for arrays whose elements are stored in a ColumnString.
  *
  * src_data     - element column of the source array (already unwrapped from Nullable);
  * src_offsets  - offsets of the source array column;
  * res_data_col - element column of the result, must be a ColumnString;
  * res_offsets  - offsets of the result array column, one value is appended per source array;
  * nullable_col - the Nullable wrapper of the source elements, or nullptr if they are not Nullable.
  *
  * Returns false (and changes nothing) if src_data is not a ColumnString, true otherwise.
  * For every source array the first occurrence of each string is kept, in the original order;
  * NULL elements are skipped.
  */
bool FunctionArrayDistinct::executeString(
    const IColumn & src_data,
    const ColumnArray::Offsets & src_offsets,
    IColumn & res_data_col,
    ColumnArray::Offsets & res_offsets,
    const ColumnNullable * nullable_col)
{
    const ColumnString * src_data_concrete = checkAndGetColumn<ColumnString>(&src_data);

    if (!src_data_concrete)
        return false;

    ColumnString & res_data_column_string = typeid_cast<ColumnString &>(res_data_col);

    using Set = ClearableHashSet<StringRef,
        StringRefHash,
        HashTableGrower<INITIAL_SIZE_DEGREE>,
        HashTableAllocatorWithStackMemory<(1ULL << INITIAL_SIZE_DEGREE) * sizeof(StringRef)>>;

    const PaddedPODArray<UInt8> * src_null_map = nullptr;

    if (nullable_col)
        src_null_map = &static_cast<const ColumnUInt8 *>(&nullable_col->getNullMapColumn())->getData();

    Set set;
    size_t prev_src_offset = 0;

    for (size_t i = 0; i < src_offsets.size(); ++i)
    {
        /// The set of already seen strings is per array.
        set.clear();

        const size_t curr_src_offset = src_offsets[i];
        for (size_t j = prev_src_offset; j < curr_src_offset; ++j)
        {
            /// NULL elements never get into the result.
            if (src_null_map && (*src_null_map)[j] != 0)
                continue;

            /// The reference points into the source column, which is not modified here.
            StringRef str_ref = src_data_concrete->getDataAt(j);

            if (set.find(str_ref) == set.end())
            {
                set.insert(str_ref);
                res_data_column_string.insertData(str_ref.data, str_ref.size);
            }
        }

        /// An offset is the end position of the array inside the *result* data, i.e. the number of
        /// strings written so far. It must not be derived from the source offset: the two diverge
        /// as soon as any earlier array contained a duplicate or a NULL.
        res_offsets.emplace_back(res_data_column_string.size());

        prev_src_offset = curr_src_offset;
    }

    return true;
}
|
||||
|
||||
void FunctionArrayDistinct::executeHashed(
|
||||
const ColumnArray::Offsets & offsets,
|
||||
const ColumnRawPtrs & columns,
|
||||
IColumn & res_data_col,
|
||||
ColumnArray::Offsets & res_offsets)
|
||||
{
|
||||
size_t count = columns.size();
|
||||
|
||||
using Set = ClearableHashSet<UInt128, UInt128TrivialHash, HashTableGrower<INITIAL_SIZE_DEGREE>,
|
||||
HashTableAllocatorWithStackMemory<(1ULL << INITIAL_SIZE_DEGREE) * sizeof(UInt128)>>;
|
||||
|
||||
Set set;
|
||||
size_t prev_off = 0;
|
||||
for (size_t i = 0; i < offsets.size(); ++i)
|
||||
{
|
||||
set.clear();
|
||||
size_t off = offsets[i];
|
||||
for (size_t j = prev_off; j < off; ++j)
|
||||
{
|
||||
auto hash = hash128(j, count, columns);
|
||||
if (set.find(hash) == set.end())
|
||||
{
|
||||
set.insert(hash);
|
||||
res_data_col.insertFrom(*columns[0], j);
|
||||
}
|
||||
}
|
||||
|
||||
res_offsets.emplace_back(set.size() + prev_off);
|
||||
prev_off = off;
|
||||
}
|
||||
}
|
||||
|
||||
/// Implementation of FunctionArrayEnumerateUniq.
|
||||
|
||||
FunctionPtr FunctionArrayEnumerateUniq::create(const Context &)
|
||||
@ -1334,13 +1539,7 @@ void FunctionArrayEnumerateUniq::executeImpl(Block & block, const ColumnNumbers
|
||||
ErrorCodes::SIZES_OF_ARRAYS_DOESNT_MATCH);
|
||||
|
||||
auto * array_data = &array->getData();
|
||||
if (auto * tuple_column = checkAndGetColumn<ColumnTuple>(array_data))
|
||||
{
|
||||
for (const auto & element : tuple_column->getColumns())
|
||||
data_columns.push_back(element.get());
|
||||
}
|
||||
else
|
||||
data_columns.push_back(array_data);
|
||||
data_columns.push_back(array_data);
|
||||
}
|
||||
|
||||
size_t num_columns = data_columns.size();
|
||||
@ -1383,9 +1582,7 @@ void FunctionArrayEnumerateUniq::executeImpl(Block & block, const ColumnNumbers
|
||||
|| executeNumber<Float32>(first_array, first_null_map, res_values)
|
||||
|| executeNumber<Float64>(first_array, first_null_map, res_values)
|
||||
|| executeString (first_array, first_null_map, res_values)))
|
||||
throw Exception("Illegal column " + block.getByPosition(arguments[0]).column->getName()
|
||||
+ " of first argument of function " + getName(),
|
||||
ErrorCodes::ILLEGAL_COLUMN);
|
||||
executeHashed(*offsets, original_data_columns, res_values);
|
||||
}
|
||||
else
|
||||
{
|
||||
|
@ -46,6 +46,8 @@ namespace ErrorCodes
|
||||
* arrayUniq(arr) - counts the number of different elements in the array,
|
||||
* arrayUniq(arr1, arr2, ...) - counts the number of different tuples from the elements in the corresponding positions in several arrays.
|
||||
*
|
||||
* arrayDistinct(arr) - returns the distinct elements of the array.
|
||||
*
|
||||
* arrayEnumerateUniq(arr)
|
||||
* - outputs an array parallel (having same size) to this, where for each element specified
|
||||
* how many times this element was encountered before (including this element) among elements with the same value.
|
||||
@ -1210,6 +1212,52 @@ private:
|
||||
};
|
||||
|
||||
|
||||
/// Find different elements in an array.
|
||||
class FunctionArrayDistinct : public IFunction
|
||||
{
|
||||
public:
|
||||
static constexpr auto name = "arrayDistinct";
|
||||
static FunctionPtr create(const Context & context);
|
||||
|
||||
String getName() const override;
|
||||
|
||||
bool isVariadic() const override { return false; }
|
||||
|
||||
size_t getNumberOfArguments() const override { return 1; }
|
||||
|
||||
bool useDefaultImplementationForConstants() const override { return true; }
|
||||
|
||||
DataTypePtr getReturnTypeImpl(const DataTypes & arguments) const override;
|
||||
|
||||
void executeImpl(Block & block, const ColumnNumbers & arguments, size_t result, size_t input_rows_count) override;
|
||||
|
||||
private:
|
||||
/// Initially allocate a piece of memory for 512 elements. NOTE: This is just a guess.
|
||||
static constexpr size_t INITIAL_SIZE_DEGREE = 9;
|
||||
|
||||
template <typename T>
|
||||
bool executeNumber(
|
||||
const IColumn & src_data,
|
||||
const ColumnArray::Offsets & src_offsets,
|
||||
IColumn & res_data_col,
|
||||
ColumnArray::Offsets & res_offsets,
|
||||
const ColumnNullable * nullable_col);
|
||||
|
||||
bool executeString(
|
||||
const IColumn & src_data,
|
||||
const ColumnArray::Offsets & src_offsets,
|
||||
IColumn & res_data_col,
|
||||
ColumnArray::Offsets & res_offsets,
|
||||
const ColumnNullable * nullable_col);
|
||||
|
||||
void executeHashed(
|
||||
const ColumnArray::Offsets & offsets,
|
||||
const ColumnRawPtrs & columns,
|
||||
IColumn & res_data_col,
|
||||
ColumnArray::Offsets & res_offsets);
|
||||
};
|
||||
|
||||
|
||||
class FunctionArrayEnumerateUniq : public IFunction
|
||||
{
|
||||
public:
|
||||
|
@ -1 +1,3 @@
|
||||
6
|
||||
[1,1,1,2]
|
||||
[1,1,2]
|
||||
|
@ -1,2 +1,5 @@
|
||||
SELECT max(arrayJoin(arr)) FROM (SELECT arrayEnumerateUniq(groupArray(intDiv(number, 54321)) AS nums, groupArray(toString(intDiv(number, 98765)))) AS arr FROM (SELECT number FROM system.numbers LIMIT 1000000) GROUP BY intHash32(number) % 100000)
|
||||
SELECT max(arrayJoin(arr)) FROM (SELECT arrayEnumerateUniq(groupArray(intDiv(number, 54321)) AS nums, groupArray(toString(intDiv(number, 98765)))) AS arr FROM (SELECT number FROM system.numbers LIMIT 1000000) GROUP BY intHash32(number) % 100000);
|
||||
|
||||
SELECT arrayEnumerateUniq([[1], [2], [34], [1]]);
|
||||
SELECT arrayEnumerateUniq([(1, 2), (3, 4), (1, 2)]);
|
||||
|
||||
|
12
dbms/tests/queries/0_stateless/00672_arrayDistinct.reference
Normal file
12
dbms/tests/queries/0_stateless/00672_arrayDistinct.reference
Normal file
@ -0,0 +1,12 @@
|
||||
[1,2,3]
|
||||
[1,2,3]
|
||||
[1,2,5]
|
||||
['1212','sef','343r4']
|
||||
['1212','sef','343r4']
|
||||
['1212','sef','343r4','232']
|
||||
[1,2,3]
|
||||
[21]
|
||||
['a','b','c']
|
||||
['123']
|
||||
[['1212'],['sef'],['343r4']]
|
||||
[(1,2),(1,3),(1,5)]
|
21
dbms/tests/queries/0_stateless/00672_arrayDistinct.sql
Normal file
21
dbms/tests/queries/0_stateless/00672_arrayDistinct.sql
Normal file
@ -0,0 +1,21 @@
|
||||
-- Tests for arrayDistinct. Every SELECT below produces the corresponding line(s) of the reference file.
USE test;

-- Constant arrays of numbers: no duplicates, with duplicates, with NULLs.
SELECT arrayDistinct([1, 2, 3]);
SELECT arrayDistinct([1, 2, 3, 2, 2]);
SELECT arrayDistinct([1, 2, NULL, 5, 2, NULL]);

-- Constant arrays of strings: no duplicates, with duplicates, with NULLs.
SELECT arrayDistinct(['1212', 'sef', '343r4']);
SELECT arrayDistinct(['1212', 'sef', '343r4', '1212']);
SELECT arrayDistinct(['1212', 'sef', '343r4', NULL, NULL, '232']);

-- Arrays read from table columns (two rows each).
DROP TABLE IF EXISTS arrayDistinct_test;
CREATE TABLE arrayDistinct_test(arr_int Array(UInt8), arr_string Array(String)) ENGINE=Memory;
INSERT INTO arrayDistinct_test values ([1, 2, 3], ['a', 'b', 'c']), ([21, 21, 21, 21], ['123', '123', '123']);

SELECT arrayDistinct(arr_int) FROM arrayDistinct_test;
SELECT arrayDistinct(arr_string) FROM arrayDistinct_test;

DROP TABLE arrayDistinct_test;

-- Element types other than numbers and strings: nested arrays and tuples.
SELECT arrayDistinct([['1212'], ['sef'], ['343r4'], ['1212']]);
SELECT arrayDistinct([(1, 2), (1, 3), (1, 2), (1, 2), (1, 2), (1, 5)]);
|
Loading…
Reference in New Issue
Block a user