ClickHouse/src/Functions/array/arrayEnumerateExtended.h

374 lines
14 KiB
C++
Raw Normal View History

2020-10-10 18:37:02 +00:00
#pragma once
2021-05-17 07:30:42 +00:00
#include <Functions/IFunction.h>
#include <Functions/FunctionHelpers.h>
#include <DataTypes/DataTypeArray.h>
#include <DataTypes/DataTypesNumber.h>
#include <Columns/ColumnArray.h>
#include <Columns/ColumnFixedString.h>
#include <Columns/ColumnNullable.h>
#include <Columns/ColumnString.h>
#include <Columns/ColumnsNumber.h>
#include <Interpreters/AggregationCommon.h>
#include <Common/HashTable/ClearableHashMap.h>
#include <Common/ColumnsHashing.h>
2018-09-09 20:57:54 +00:00
namespace DB
{
/// Error codes thrown by the functions in this header; only declared here.
namespace ErrorCodes
{
    /// Thrown by getReturnTypeImpl when no arguments are passed.
    extern const int NUMBER_OF_ARGUMENTS_DOESNT_MATCH;
    /// Thrown by executeImpl when an argument column is neither an array nor a constant array.
    extern const int ILLEGAL_COLUMN;
    /// Thrown by getReturnTypeImpl when an argument type is not an array.
    extern const int ILLEGAL_TYPE_OF_ARGUMENT;
    /// Thrown by executeImpl when the argument arrays have different offsets.
    extern const int SIZES_OF_ARRAYS_DOESNT_MATCH;
}
2018-09-09 21:15:40 +00:00
/// Forward declarations of the concrete function classes. Only the names are needed in this
/// header: FunctionArrayEnumerateUniq is compared against the `Derived` template parameter
/// (via std::is_same_v) to choose the numbering scheme at compile time.
class FunctionArrayEnumerateUniq;
class FunctionArrayEnumerateDense;
2018-09-09 20:57:54 +00:00
template <typename Derived>
class FunctionArrayEnumerateExtended : public IFunction
{
public:
2021-06-01 12:20:52 +00:00
static FunctionPtr create(ContextPtr) { return std::make_shared<Derived>(); }
2018-09-09 20:57:54 +00:00
String getName() const override { return Derived::name; }
bool isVariadic() const override { return true; }
size_t getNumberOfArguments() const override { return 0; }
bool useDefaultImplementationForConstants() const override { return true; }
bool isSuitableForShortCircuitArgumentsExecution(ColumnsWithTypeAndName & /*arguments*/) const override { return true; }
2018-09-09 20:57:54 +00:00
DataTypePtr getReturnTypeImpl(const DataTypes & arguments) const override
{
2020-03-08 21:40:00 +00:00
if (arguments.empty())
2018-09-09 20:57:54 +00:00
throw Exception("Number of arguments for function " + getName() + " doesn't match: passed "
+ toString(arguments.size()) + ", should be at least 1.",
ErrorCodes::NUMBER_OF_ARGUMENTS_DOESNT_MATCH);
for (size_t i = 0; i < arguments.size(); ++i)
{
const DataTypeArray * array_type = checkAndGetDataType<DataTypeArray>(arguments[i].get());
if (!array_type)
throw Exception("All arguments for function " + getName() + " must be arrays but argument " +
toString(i + 1) + " has type " + arguments[i]->getName() + ".", ErrorCodes::ILLEGAL_TYPE_OF_ARGUMENT);
}
return std::make_shared<DataTypeArray>(std::make_shared<DataTypeUInt32>());
}
ColumnPtr executeImpl(const ColumnsWithTypeAndName & arguments, const DataTypePtr &, size_t input_rows_count) const override;
2018-09-09 20:57:54 +00:00
private:
2020-06-08 19:39:49 +00:00
/// Initially allocate a piece of memory for 64 elements. NOTE: This is just a guess.
static constexpr size_t INITIAL_SIZE_DEGREE = 6;
2018-09-09 20:57:54 +00:00
template <typename T>
struct MethodOneNumber
{
2020-05-26 05:54:04 +00:00
using Set = ClearableHashMapWithStackMemory<T, UInt32, DefaultHash<T>,
INITIAL_SIZE_DEGREE>;
using Method = ColumnsHashing::HashMethodOneNumber<typename Set::value_type, UInt32, T, false>;
};
struct MethodString
{
2020-05-26 05:54:04 +00:00
using Set = ClearableHashMapWithStackMemory<StringRef, UInt32, StringRefHash,
INITIAL_SIZE_DEGREE>;
using Method = ColumnsHashing::HashMethodString<typename Set::value_type, UInt32, false, false>;
};
struct MethodFixedString
{
2020-05-26 05:54:04 +00:00
using Set = ClearableHashMapWithStackMemory<StringRef, UInt32, StringRefHash,
INITIAL_SIZE_DEGREE>;
using Method = ColumnsHashing::HashMethodFixedString<typename Set::value_type, UInt32, false, false>;
};
struct MethodFixed
{
2020-05-26 05:54:04 +00:00
using Set = ClearableHashMapWithStackMemory<UInt128, UInt32, UInt128HashCRC32,
INITIAL_SIZE_DEGREE>;
using Method = ColumnsHashing::HashMethodKeysFixed<typename Set::value_type, UInt128, UInt32, false, false, false>;
};
struct MethodHashed
{
2020-05-26 05:54:04 +00:00
using Set = ClearableHashMapWithStackMemory<UInt128, UInt32, UInt128TrivialHash,
INITIAL_SIZE_DEGREE>;
using Method = ColumnsHashing::HashMethodHashed<typename Set::value_type, UInt32, false>;
};
template <typename Method>
void executeMethod(const ColumnArray::Offsets & offsets, const ColumnRawPtrs & columns, const Sizes & key_sizes,
2020-07-21 13:58:07 +00:00
const NullMap * null_map, ColumnUInt32::Container & res_values) const;
template <typename Method, bool has_null_map>
void executeMethodImpl(const ColumnArray::Offsets & offsets, const ColumnRawPtrs & columns, const Sizes & key_sizes,
2020-07-21 13:58:07 +00:00
const NullMap * null_map, ColumnUInt32::Container & res_values) const;
2018-09-09 20:57:54 +00:00
template <typename T>
2020-07-21 13:58:07 +00:00
bool executeNumber(const ColumnArray::Offsets & offsets, const IColumn & data, const NullMap * null_map, ColumnUInt32::Container & res_values) const;
bool executeString(const ColumnArray::Offsets & offsets, const IColumn & data, const NullMap * null_map, ColumnUInt32::Container & res_values) const;
bool executeFixedString(const ColumnArray::Offsets & offsets, const IColumn & data, const NullMap * null_map, ColumnUInt32::Container & res_values) const;
bool execute128bit(const ColumnArray::Offsets & offsets, const ColumnRawPtrs & columns, ColumnUInt32::Container & res_values) const;
void executeHashed(const ColumnArray::Offsets & offsets, const ColumnRawPtrs & columns, ColumnUInt32::Container & res_values) const;
2018-09-09 20:57:54 +00:00
};
/** Builds the Array(UInt32) result for the given array arguments.
  *
  * Steps:
  *  1. Unwrap every argument to a ColumnArray (constant arrays are materialised) and check
  *     that all arrays share the same offsets; otherwise SIZES_OF_ARRAYS_DOESNT_MATCH is thrown.
  *     An argument that is neither an array nor a constant array raises ILLEGAL_COLUMN.
  *  2. For a single Nullable argument, split it into the nested column and its null map so that
  *     the type-specialised methods can be used.
  *  3. Pick the most specific hashing method that matches the key column(s) and let it fill
  *     the result values.
  *
  * The result reuses the offsets column of the first argument.
  * The caller is expected to pass at least one argument (enforced by getReturnTypeImpl).
  */
template <typename Derived>
ColumnPtr FunctionArrayEnumerateExtended<Derived>::executeImpl(const ColumnsWithTypeAndName & arguments, const DataTypePtr &, size_t /*input_rows_count*/) const
{
    const ColumnArray::Offsets * offsets = nullptr;
    const size_t num_arguments = arguments.size();
    ColumnRawPtrs data_columns(num_arguments);

    /// Owns the full columns produced from constant arguments for the duration of the call.
    Columns array_holders;
    ColumnPtr offsets_column;

    for (size_t i = 0; i < num_arguments; ++i)
    {
        const ColumnPtr & array_ptr = arguments[i].column;
        const ColumnArray * array = checkAndGetColumn<ColumnArray>(array_ptr.get());
        if (!array)
        {
            const ColumnConst * const_array = checkAndGetColumnConst<ColumnArray>(array_ptr.get());
            if (!const_array)
                throw Exception("Illegal column " + arguments[i].column->getName()
                    + " of " + toString(i + 1) + "-th argument of function " + getName(),
                    ErrorCodes::ILLEGAL_COLUMN);

            array_holders.emplace_back(const_array->convertToFullColumn());
            array = checkAndGetColumn<ColumnArray>(array_holders.back().get());
        }

        const ColumnArray::Offsets & offsets_i = array->getOffsets();
        if (i == 0)
        {
            offsets = &offsets_i;
            offsets_column = array->getOffsetsPtr();
        }
        else if (offsets_i != *offsets)
            throw Exception("Lengths of all arrays passed to " + getName() + " must be equal.",
                ErrorCodes::SIZES_OF_ARRAYS_DOESNT_MATCH);

        data_columns[i] = &array->getData();
    }

    /// Key columns with any Nullable wrapper left intact. The generic hashed method gets no
    /// null map, so it must see the Nullable column itself; otherwise NULL elements would be
    /// keyed by whatever value is stored in the nested column and could collide with real values.
    const ColumnRawPtrs hashed_columns = data_columns;

    /// Only the single-argument specialised methods understand a separate null map,
    /// so the Nullable wrapper is stripped only in that case.
    const NullMap * null_map = nullptr;
    if (num_arguments == 1)
    {
        if (const auto * nullable_col = checkAndGetColumn<ColumnNullable>(*data_columns[0]))
        {
            data_columns[0] = &nullable_col->getNestedColumn();
            null_map = &nullable_col->getNullMapData();
        }
    }

    auto res_nested = ColumnUInt32::create();
    ColumnUInt32::Container & res_values = res_nested->getData();
    /// The last offset is the total number of array elements.
    if (!offsets->empty())
        res_values.resize(offsets->back());

    if (num_arguments == 1)
    {
        const IColumn & keys = *data_columns[0];

        /// The first helper that recognises the column type does the work.
        const bool handled
            = executeNumber<UInt8>(*offsets, keys, null_map, res_values)
            || executeNumber<UInt16>(*offsets, keys, null_map, res_values)
            || executeNumber<UInt32>(*offsets, keys, null_map, res_values)
            || executeNumber<UInt64>(*offsets, keys, null_map, res_values)
            || executeNumber<Int8>(*offsets, keys, null_map, res_values)
            || executeNumber<Int16>(*offsets, keys, null_map, res_values)
            || executeNumber<Int32>(*offsets, keys, null_map, res_values)
            || executeNumber<Int64>(*offsets, keys, null_map, res_values)
            || executeNumber<Float32>(*offsets, keys, null_map, res_values)
            || executeNumber<Float64>(*offsets, keys, null_map, res_values)
            || executeString(*offsets, keys, null_map, res_values)
            || executeFixedString(*offsets, keys, null_map, res_values);

        if (!handled)
            executeHashed(*offsets, hashed_columns, res_values);
    }
    else
    {
        if (!execute128bit(*offsets, data_columns, res_values))
            executeHashed(*offsets, data_columns, res_values);
    }

    return ColumnArray::create(std::move(res_nested), offsets_column);
}
/** Core numbering loop. Processes the arrays one by one (boundaries come from `offsets`),
  * clearing the hash map between arrays, and writes one UInt32 per element into res_values.
  *
  * Unique mode (Derived is FunctionArrayEnumerateUniq): each element receives the number of
  * times its key has been seen so far within the array (1, 2, 3, ...); NULLs are counted
  * with a separate counter.
  *
  * Dense mode (otherwise): each distinct key receives the next rank on first occurrence and
  * every repetition gets the same rank; all NULLs share a single rank.
  *
  * `null_map` is dereferenced only when has_null_map is true.
  */
template <typename Derived>
template <typename Method, bool has_null_map>
void FunctionArrayEnumerateExtended<Derived>::executeMethodImpl(
    const ColumnArray::Offsets & offsets,
    const ColumnRawPtrs & columns,
    const Sizes & key_sizes,
    [[maybe_unused]] const NullMap * null_map,
    ColumnUInt32::Container & res_values) const
{
    typename Method::Set indices;
    typename Method::Method method(columns, key_sizes, nullptr);
    Arena pool; /// Won't use it;

    constexpr bool unique_mode = std::is_same_v<Derived, FunctionArrayEnumerateUniq>;

    ColumnArray::Offset array_begin = 0;
    for (size_t array_end : offsets)
    {
        /// Numbering restarts for every array.
        indices.clear();

        if constexpr (unique_mode)
        {
            UInt32 nulls_seen = 0;
            for (size_t pos = array_begin; pos < array_end; ++pos)
            {
                if constexpr (has_null_map)
                {
                    if ((*null_map)[pos])
                    {
                        res_values[pos] = ++nulls_seen;
                        continue;
                    }
                }

                /// The mapped value is the occurrence count so far; bump it and store it back.
                auto emplace_result = method.emplaceKey(indices, pos, pool);
                auto occurrence = emplace_result.getMapped() + 1;
                emplace_result.setMapped(occurrence);
                res_values[pos] = occurrence;
            }
        }
        else
        {
            UInt32 last_rank = 0;
            /// Zero means "no NULL seen yet in this array".
            [[maybe_unused]] UInt32 rank_of_null = 0;
            for (size_t pos = array_begin; pos < array_end; ++pos)
            {
                if constexpr (has_null_map)
                {
                    if ((*null_map)[pos])
                    {
                        if (!rank_of_null)
                            rank_of_null = ++last_rank;
                        res_values[pos] = rank_of_null;
                        continue;
                    }
                }

                /// A zero mapped value marks a key that has not been ranked yet.
                auto emplace_result = method.emplaceKey(indices, pos, pool);
                auto key_rank = emplace_result.getMapped();
                if (!key_rank)
                {
                    key_rank = ++last_rank;
                    emplace_result.setMapped(key_rank);
                }
                res_values[pos] = key_rank;
            }
        }

        array_begin = array_end;
    }
}
/// Turns the runtime presence of a null map into the compile-time `has_null_map` flag
/// of executeMethodImpl, so the per-element NULL check is compiled out when not needed.
template <typename Derived>
template <typename Method>
void FunctionArrayEnumerateExtended<Derived>::executeMethod(
    const ColumnArray::Offsets & offsets,
    const ColumnRawPtrs & columns,
    const Sizes & key_sizes,
    const NullMap * null_map,
    ColumnUInt32::Container & res_values) const
{
    if (null_map == nullptr)
    {
        executeMethodImpl<Method, false>(offsets, columns, key_sizes, null_map, res_values);
        return;
    }

    executeMethodImpl<Method, true>(offsets, columns, key_sizes, null_map, res_values);
}
/// Handles a single key column of type ColumnVector<T>.
/// Returns false (leaving res_values untouched) if `data` is a different column type.
template <typename Derived>
template <typename T>
bool FunctionArrayEnumerateExtended<Derived>::executeNumber(
    const ColumnArray::Offsets & offsets, const IColumn & data, const NullMap * null_map, ColumnUInt32::Container & res_values) const
{
    if (const auto * nested = checkAndGetColumn<ColumnVector<T>>(&data))
    {
        executeMethod<MethodOneNumber<T>>(offsets, {nested}, {}, null_map, res_values);
        return true;
    }

    return false;
}
template <typename Derived>
2018-12-22 15:40:51 +00:00
bool FunctionArrayEnumerateExtended<Derived>::executeString(
2020-07-21 13:58:07 +00:00
const ColumnArray::Offsets & offsets, const IColumn & data, const NullMap * null_map, ColumnUInt32::Container & res_values) const
2018-09-09 20:57:54 +00:00
{
const auto * nested = checkAndGetColumn<ColumnString>(&data);
if (nested)
executeMethod<MethodString>(offsets, {nested}, {}, null_map, res_values);
2018-09-09 20:57:54 +00:00
return nested;
}
2018-09-09 20:57:54 +00:00
/// Handles a single key column of type ColumnFixedString.
/// Returns false (leaving res_values untouched) if `data` is a different column type.
///
/// The type check must be for ColumnFixedString to match MethodFixedString. Checking for
/// ColumnString here would make this path unreachable, because executeString is tried first
/// and already claims every ColumnString; FixedString arrays would then always take the
/// generic hashed fallback.
template <typename Derived>
bool FunctionArrayEnumerateExtended<Derived>::executeFixedString(
    const ColumnArray::Offsets & offsets, const IColumn & data, const NullMap * null_map, ColumnUInt32::Container & res_values) const
{
    const auto * nested = checkAndGetColumn<ColumnFixedString>(&data);
    if (!nested)
        return false;

    executeMethod<MethodFixedString>(offsets, {nested}, {}, null_map, res_values);
    return true;
}
/// Multi-column fast path: usable when every key column is fixed-size and contiguous and the
/// value sizes add up to at most 16 bytes, i.e. all keys of a row fit into the UInt128 key
/// of MethodFixed. Returns false (leaving res_values untouched) when that is not the case.
template <typename Derived>
bool FunctionArrayEnumerateExtended<Derived>::execute128bit(
    const ColumnArray::Offsets & offsets,
    const ColumnRawPtrs & columns,
    ColumnUInt32::Container & res_values) const
{
    const size_t num_columns = columns.size();
    Sizes key_sizes(num_columns);
    size_t total_key_bytes = 0;

    for (size_t idx = 0; idx < num_columns; ++idx)
    {
        const auto * column = columns[idx];
        if (!column->isFixedAndContiguous())
            return false;

        const size_t value_size = column->sizeOfValueIfFixed();
        key_sizes[idx] = value_size;
        total_key_bytes += value_size;
    }

    /// 16 bytes is the width of the UInt128 key.
    if (total_key_bytes > 16)
        return false;

    executeMethod<MethodFixed>(offsets, columns, key_sizes, nullptr, res_values);
    return true;
}
/// Generic fallback that works through MethodHashed for whatever columns are passed.
/// No key sizes and no null map are supplied to the hashing method.
template <typename Derived>
void FunctionArrayEnumerateExtended<Derived>::executeHashed(
    const ColumnArray::Offsets & offsets,
    const ColumnRawPtrs & columns,
    ColumnUInt32::Container & res_values) const
{
    const Sizes no_key_sizes;
    const NullMap * const no_null_map = nullptr;
    executeMethod<MethodHashed>(offsets, columns, no_key_sizes, no_null_map, res_values);
}
}