[CLICKHOUSE-2830] add FunctionArrayDistinct (#2670)

* [CLICKHOUSE-2830] add FunctionArrayDistinct

* [#CLICKHOUSE-2830] Add tests and fix

* [CLICKHOUSE-2830] fix bug with array without NULL

* [CLICKHOUSE-2830] fix test

* [CLICKHOUSE-3844] Add support for Tuple and other types in Uniq, EnumerateUniq and Distinct

* [CLICKHOUSE-3844] Add '\n'

* [CLICKHOUSE-3844] fix

* [CLICKHOUSE-2830] del Nullable return type

* Update FunctionsArray.cpp

* [CLICKHOUSE-3844] add new tests

* [CLICKHOUSE-3844] add new tests
This commit is contained in:
Vadim 2018-07-19 16:53:32 +03:00 committed by alexey-milovidov
parent 56523948bf
commit b8ae9f7951
7 changed files with 298 additions and 14 deletions

View File

@ -41,6 +41,7 @@ generate_function_register(Array
FunctionArrayEnumerate
FunctionArrayEnumerateUniq
FunctionArrayUniq
FunctionArrayDistinct
FunctionEmptyArrayUInt8
FunctionEmptyArrayUInt16
FunctionEmptyArrayUInt32

View File

@ -1062,9 +1062,7 @@ void FunctionArrayUniq::executeImpl(Block & block, const ColumnNumbers & argumen
|| executeNumber<Float32>(first_array, first_null_map, res_values)
|| executeNumber<Float64>(first_array, first_null_map, res_values)
|| executeString(first_array, first_null_map, res_values)))
throw Exception("Illegal column " + block.getByPosition(arguments[0]).column->getName()
+ " of first argument of function " + getName(),
ErrorCodes::ILLEGAL_COLUMN);
executeHashed(*offsets, original_data_columns, res_values);
}
else
{
@ -1272,6 +1270,213 @@ void FunctionArrayUniq::executeHashed(
}
}
/// Implementation of FunctionArrayDistinct.
/// Creates a new instance of the function. The Context argument is not used.
FunctionPtr FunctionArrayDistinct::create(const Context & /*context*/)
{
    auto function = std::make_shared<FunctionArrayDistinct>();
    return function;
}
/// Returns the function's name, taken from the static `name` member.
String FunctionArrayDistinct::getName() const
{
    return String(name);
}
/// Deduces the result type from the single argument: an array whose element type is the
/// argument's element type passed through removeNullable().
/// Throws ILLEGAL_TYPE_OF_ARGUMENT when the argument is not an array.
DataTypePtr FunctionArrayDistinct::getReturnTypeImpl(const DataTypes & arguments) const
{
    const auto & argument_type = arguments[0];

    if (const DataTypeArray * array_type = checkAndGetDataType<DataTypeArray>(argument_type.get()))
        return std::make_shared<DataTypeArray>(removeNullable(array_type->getNestedType()));

    throw Exception("Argument for function " + getName() + " must be array but it  has type " + argument_type->getName() + ".",
        ErrorCodes::ILLEGAL_TYPE_OF_ARGUMENT);
}
/// Computes arrayDistinct for the column at arguments[0] and stores the resulting array column
/// at position `result` of the block.
///
/// The element column is unwrapped from Nullable (if it is Nullable) and dispatched to a typed
/// implementation for numeric and String elements; any other element column is handled by the
/// hash-based fallback.
///
/// Throws ILLEGAL_COLUMN if the argument column is not a ColumnArray.
void FunctionArrayDistinct::executeImpl(Block & block, const ColumnNumbers & arguments, size_t result, size_t /*input_rows_count*/)
{
    ColumnPtr array_ptr = block.getByPosition(arguments[0]).column;
    const ColumnArray * array = checkAndGetColumn<ColumnArray>(array_ptr.get());

    /// checkAndGetColumn returns nullptr on a type mismatch; fail loudly instead of dereferencing it.
    if (!array)
        throw Exception("Illegal column " + array_ptr->getName() + " of first argument of function " + getName(),
            ErrorCodes::ILLEGAL_COLUMN);

    const auto & return_type = block.getByPosition(result).type;
    auto res_ptr = return_type->createColumn();
    ColumnArray & res = static_cast<ColumnArray &>(*res_ptr);

    const IColumn & src_data = array->getData();
    const ColumnArray::Offsets & offsets = array->getOffsets();

    /// The hash-based fallback receives the element column exactly as stored in the array.
    ColumnRawPtrs original_data_columns;
    original_data_columns.push_back(&src_data);

    IColumn & res_data = res.getData();
    ColumnArray::Offsets & res_offsets = res.getOffsets();

    /// The typed implementations work on the nested column and use the null map to skip NULLs.
    const ColumnNullable * nullable_col = nullptr;
    const IColumn * inner_col = &src_data;
    if (src_data.isColumnNullable())
    {
        nullable_col = static_cast<const ColumnNullable *>(&src_data);
        inner_col = &nullable_col->getNestedColumn();
    }

    const bool handled_by_typed_implementation =
        executeNumber<UInt8>(*inner_col, offsets, res_data, res_offsets, nullable_col)
        || executeNumber<UInt16>(*inner_col, offsets, res_data, res_offsets, nullable_col)
        || executeNumber<UInt32>(*inner_col, offsets, res_data, res_offsets, nullable_col)
        || executeNumber<UInt64>(*inner_col, offsets, res_data, res_offsets, nullable_col)
        || executeNumber<Int8>(*inner_col, offsets, res_data, res_offsets, nullable_col)
        || executeNumber<Int16>(*inner_col, offsets, res_data, res_offsets, nullable_col)
        || executeNumber<Int32>(*inner_col, offsets, res_data, res_offsets, nullable_col)
        || executeNumber<Int64>(*inner_col, offsets, res_data, res_offsets, nullable_col)
        || executeNumber<Float32>(*inner_col, offsets, res_data, res_offsets, nullable_col)
        || executeNumber<Float64>(*inner_col, offsets, res_data, res_offsets, nullable_col)
        || executeString(*inner_col, offsets, res_data, res_offsets, nullable_col);

    /// NOTE(review): in the fallback the source column may still be a ColumnNullable while the
    /// result element column presumably is not (the return type strips Nullable) - verify that
    /// this combination is handled correctly for Nullable element types other than numbers/String.
    if (!handled_by_typed_implementation)
        executeHashed(offsets, original_data_columns, res_data, res_offsets);

    block.getByPosition(result).column = std::move(res_ptr);
}
/// Typed implementation for arrays whose elements are stored in a ColumnVector<T>.
///
/// For every source array, appends each non-NULL value to res_data_col the first time it is seen
/// within that array (order of first occurrence is kept) and appends the matching end offset to
/// res_offsets.
///
/// @param src_data      element column (already unwrapped from Nullable by the caller).
/// @param src_offsets   end offsets of the source arrays inside src_data.
/// @param res_data_col  result element column; must be a ColumnVector<T>.
/// @param res_offsets   result offsets, one entry appended per source array.
/// @param nullable_col  the Nullable wrapper of the elements, or nullptr if they are not Nullable.
/// @return false if src_data is not a ColumnVector<T> (nothing is written), true otherwise.
template <typename T>
bool FunctionArrayDistinct::executeNumber(const IColumn & src_data,
    const ColumnArray::Offsets & src_offsets,
    IColumn & res_data_col,
    ColumnArray::Offsets & res_offsets,
    const ColumnNullable * nullable_col)
{
    const ColumnVector<T> * src_data_concrete = checkAndGetColumn<ColumnVector<T>>(&src_data);
    if (!src_data_concrete)
        return false;

    const PaddedPODArray<T> & values = src_data_concrete->getData();
    PaddedPODArray<T> & res_data = typeid_cast<ColumnVector<T> &>(res_data_col).getData();

    const PaddedPODArray<UInt8> * src_null_map = nullptr;
    if (nullable_col)
        src_null_map = &static_cast<const ColumnUInt8 *>(&nullable_col->getNullMapColumn())->getData();

    using Set = ClearableHashSet<T,
        DefaultHash<T>,
        HashTableGrower<INITIAL_SIZE_DEGREE>,
        HashTableAllocatorWithStackMemory<(1ULL << INITIAL_SIZE_DEGREE) * sizeof(T)>>;

    Set set;

    size_t prev_src_offset = 0;
    /// Running end offset in the *result* column. It must not be derived from the source offsets:
    /// once any earlier array has lost elements, source and result positions diverge.
    size_t res_offset = 0;

    for (size_t i = 0; i < src_offsets.size(); ++i)
    {
        set.clear();

        const size_t curr_src_offset = src_offsets[i];
        for (size_t j = prev_src_offset; j < curr_src_offset; ++j)
        {
            /// NULL elements are never copied to the result.
            if (src_null_map && (*src_null_map)[j])
                continue;

            if (set.find(values[j]) == set.end())
            {
                res_data.emplace_back(values[j]);
                set.insert(values[j]);
            }
        }

        /// Exactly set.size() elements were appended for this array.
        res_offset += set.size();
        res_offsets.emplace_back(res_offset);

        prev_src_offset = curr_src_offset;
    }

    return true;
}
/// Typed implementation for arrays whose elements are stored in a ColumnString.
///
/// For every source array, appends each non-NULL string to res_data_col the first time it is seen
/// within that array (order of first occurrence is kept) and appends the matching end offset to
/// res_offsets.
///
/// @param src_data      element column (already unwrapped from Nullable by the caller).
/// @param src_offsets   end offsets of the source arrays inside src_data.
/// @param res_data_col  result element column; must be a ColumnString.
/// @param res_offsets   result offsets, one entry appended per source array.
/// @param nullable_col  the Nullable wrapper of the elements, or nullptr if they are not Nullable.
/// @return false if src_data is not a ColumnString (nothing is written), true otherwise.
bool FunctionArrayDistinct::executeString(
    const IColumn & src_data,
    const ColumnArray::Offsets & src_offsets,
    IColumn & res_data_col,
    ColumnArray::Offsets & res_offsets,
    const ColumnNullable * nullable_col)
{
    const ColumnString * src_data_concrete = checkAndGetColumn<ColumnString>(&src_data);
    if (!src_data_concrete)
        return false;

    ColumnString & res_data_column_string = typeid_cast<ColumnString &>(res_data_col);

    using Set = ClearableHashSet<StringRef,
        StringRefHash,
        HashTableGrower<INITIAL_SIZE_DEGREE>,
        HashTableAllocatorWithStackMemory<(1ULL << INITIAL_SIZE_DEGREE) * sizeof(StringRef)>>;

    const PaddedPODArray<UInt8> * src_null_map = nullptr;
    if (nullable_col)
        src_null_map = &static_cast<const ColumnUInt8 *>(&nullable_col->getNullMapColumn())->getData();

    /// The set stores references obtained from the source column via getDataAt().
    Set set;

    size_t prev_src_offset = 0;
    /// Running end offset in the *result* column. It must not be derived from the source offsets:
    /// once any earlier array has lost elements, source and result positions diverge.
    size_t res_offset = 0;

    for (size_t i = 0; i < src_offsets.size(); ++i)
    {
        set.clear();

        const size_t curr_src_offset = src_offsets[i];
        for (size_t j = prev_src_offset; j < curr_src_offset; ++j)
        {
            /// NULL elements are never copied to the result.
            if (src_null_map && (*src_null_map)[j])
                continue;

            StringRef str_ref = src_data_concrete->getDataAt(j);

            if (set.find(str_ref) == set.end())
            {
                set.insert(str_ref);
                res_data_column_string.insertData(str_ref.data, str_ref.size);
            }
        }

        /// Exactly set.size() strings were appended for this array.
        res_offset += set.size();
        res_offsets.emplace_back(res_offset);

        prev_src_offset = curr_src_offset;
    }

    return true;
}
/// Generic fallback used when no typed implementation matches the element column.
///
/// Elements are identified by the value returned from hash128() for their row; within each array
/// an element is copied from columns[0] to res_data_col the first time its hash is seen, and the
/// matching end offset is appended to res_offsets.
/// NOTE(review): elements are compared by hash only, and no null map is consulted here, unlike
/// the typed implementations - confirm both are intended.
///
/// @param offsets       end offsets of the source arrays.
/// @param columns       columns hashed together per row; result values are taken from columns[0].
/// @param res_data_col  result element column.
/// @param res_offsets   result offsets, one entry appended per source array.
void FunctionArrayDistinct::executeHashed(
    const ColumnArray::Offsets & offsets,
    const ColumnRawPtrs & columns,
    IColumn & res_data_col,
    ColumnArray::Offsets & res_offsets)
{
    const size_t count = columns.size();

    using Set = ClearableHashSet<UInt128, UInt128TrivialHash, HashTableGrower<INITIAL_SIZE_DEGREE>,
        HashTableAllocatorWithStackMemory<(1ULL << INITIAL_SIZE_DEGREE) * sizeof(UInt128)>>;

    Set set;

    size_t prev_src_offset = 0;
    /// Running end offset in the *result* column. It must not be derived from the source offsets:
    /// once any earlier array has lost elements, source and result positions diverge.
    size_t res_offset = 0;

    for (size_t i = 0; i < offsets.size(); ++i)
    {
        set.clear();

        const size_t curr_src_offset = offsets[i];
        for (size_t j = prev_src_offset; j < curr_src_offset; ++j)
        {
            auto hash = hash128(j, count, columns);
            if (set.find(hash) == set.end())
            {
                set.insert(hash);
                res_data_col.insertFrom(*columns[0], j);
            }
        }

        /// Exactly set.size() elements were appended for this array.
        res_offset += set.size();
        res_offsets.emplace_back(res_offset);

        prev_src_offset = curr_src_offset;
    }
}
/// Implementation of FunctionArrayEnumerateUniq.
FunctionPtr FunctionArrayEnumerateUniq::create(const Context &)
@ -1334,13 +1539,7 @@ void FunctionArrayEnumerateUniq::executeImpl(Block & block, const ColumnNumbers
ErrorCodes::SIZES_OF_ARRAYS_DOESNT_MATCH);
auto * array_data = &array->getData();
if (auto * tuple_column = checkAndGetColumn<ColumnTuple>(array_data))
{
for (const auto & element : tuple_column->getColumns())
data_columns.push_back(element.get());
}
else
data_columns.push_back(array_data);
data_columns.push_back(array_data);
}
size_t num_columns = data_columns.size();
@ -1383,9 +1582,7 @@ void FunctionArrayEnumerateUniq::executeImpl(Block & block, const ColumnNumbers
|| executeNumber<Float32>(first_array, first_null_map, res_values)
|| executeNumber<Float64>(first_array, first_null_map, res_values)
|| executeString (first_array, first_null_map, res_values)))
throw Exception("Illegal column " + block.getByPosition(arguments[0]).column->getName()
+ " of first argument of function " + getName(),
ErrorCodes::ILLEGAL_COLUMN);
executeHashed(*offsets, original_data_columns, res_values);
}
else
{

View File

@ -46,6 +46,8 @@ namespace ErrorCodes
* arrayUniq(arr) - counts the number of different elements in the array,
* arrayUniq(arr1, arr2, ...) - counts the number of different tuples from the elements in the corresponding positions in several arrays.
*
* arrayDistinct(arr) - returns an array containing only the distinct elements of the array.
*
* arrayEnumerateUniq(arr)
* - outputs an array parallel (having same size) to this, where for each element specified
* how many times this element was encountered before (including this element) among elements with the same value.
@ -1210,6 +1212,52 @@ private:
};
/// arrayDistinct(arr): builds an array of the different elements found in the argument array.
class FunctionArrayDistinct : public IFunction
{
public:
    static constexpr auto name = "arrayDistinct";

    static FunctionPtr create(const Context & context);

    String getName() const override;

    /// The function takes exactly one argument.
    size_t getNumberOfArguments() const override { return 1; }
    bool isVariadic() const override { return false; }

    /// Opts in to the default handling of constant arguments (presumably provided by IFunction).
    bool useDefaultImplementationForConstants() const override { return true; }

    DataTypePtr getReturnTypeImpl(const DataTypes & arguments) const override;

    void executeImpl(Block & block, const ColumnNumbers & arguments, size_t result, size_t input_rows_count) override;

private:
    /// The hash sets start with room for 2^9 = 512 elements. NOTE: This is just a guess.
    static constexpr size_t INITIAL_SIZE_DEGREE = 9;

    /// Handles arrays of numeric type T; returns false if src_data is not a ColumnVector<T>.
    template <typename T>
    bool executeNumber(const IColumn & src_data, const ColumnArray::Offsets & src_offsets,
        IColumn & res_data_col, ColumnArray::Offsets & res_offsets, const ColumnNullable * nullable_col);

    /// Handles arrays of strings; returns false if src_data is not a ColumnString.
    bool executeString(const IColumn & src_data, const ColumnArray::Offsets & src_offsets,
        IColumn & res_data_col, ColumnArray::Offsets & res_offsets, const ColumnNullable * nullable_col);

    /// Fallback for every other element column: elements are told apart by a 128-bit hash.
    void executeHashed(const ColumnArray::Offsets & offsets, const ColumnRawPtrs & columns,
        IColumn & res_data_col, ColumnArray::Offsets & res_offsets);
};
class FunctionArrayEnumerateUniq : public IFunction
{
public:

View File

@ -1 +1,3 @@
6
[1,1,1,2]
[1,1,2]

View File

@ -1,2 +1,5 @@
SELECT max(arrayJoin(arr)) FROM (SELECT arrayEnumerateUniq(groupArray(intDiv(number, 54321)) AS nums, groupArray(toString(intDiv(number, 98765)))) AS arr FROM (SELECT number FROM system.numbers LIMIT 1000000) GROUP BY intHash32(number) % 100000)
SELECT max(arrayJoin(arr)) FROM (SELECT arrayEnumerateUniq(groupArray(intDiv(number, 54321)) AS nums, groupArray(toString(intDiv(number, 98765)))) AS arr FROM (SELECT number FROM system.numbers LIMIT 1000000) GROUP BY intHash32(number) % 100000);
SELECT arrayEnumerateUniq([[1], [2], [34], [1]]);
SELECT arrayEnumerateUniq([(1, 2), (3, 4), (1, 2)]);

View File

@ -0,0 +1,12 @@
[1,2,3]
[1,2,3]
[1,2,5]
['1212','sef','343r4']
['1212','sef','343r4']
['1212','sef','343r4','232']
[1,2,3]
[21]
['a','b','c']
['123']
[['1212'],['sef'],['343r4']]
[(1,2),(1,3),(1,5)]

View File

@ -0,0 +1,21 @@
-- Functional test for arrayDistinct. Each SELECT's output is compared with the reference file.
USE test;
-- Constant numeric arrays: without duplicates, with duplicates, and with NULL elements
-- (the expected output for the last one contains no NULL).
SELECT arrayDistinct([1, 2, 3]);
SELECT arrayDistinct([1, 2, 3, 2, 2]);
SELECT arrayDistinct([1, 2, NULL, 5, 2, NULL]);
-- The same three cases for string arrays.
SELECT arrayDistinct(['1212', 'sef', '343r4']);
SELECT arrayDistinct(['1212', 'sef', '343r4', '1212']);
SELECT arrayDistinct(['1212', 'sef', '343r4', NULL, NULL, '232']);
-- Non-constant input: array columns read from a table with two rows.
DROP TABLE IF EXISTS arrayDistinct_test;
CREATE TABLE arrayDistinct_test(arr_int Array(UInt8), arr_string Array(String)) ENGINE=Memory;
INSERT INTO arrayDistinct_test values ([1, 2, 3], ['a', 'b', 'c']), ([21, 21, 21, 21], ['123', '123', '123']);
SELECT arrayDistinct(arr_int) FROM arrayDistinct_test;
SELECT arrayDistinct(arr_string) FROM arrayDistinct_test;
DROP TABLE arrayDistinct_test;
-- Element types that are neither numbers nor strings: nested arrays and tuples.
SELECT arrayDistinct([['1212'], ['sef'], ['343r4'], ['1212']]);
SELECT arrayDistinct([(1, 2), (1, 3), (1, 2), (1, 2), (1, 2), (1, 5)]);