Implemented limited support for NULLs in IN operator [#CLICKHOUSE-4].

This commit is contained in:
Alexey Milovidov 2017-03-28 06:00:33 +03:00
parent 130fce2c4c
commit a8f6a3127d
12 changed files with 346 additions and 23 deletions

View File

@ -779,6 +779,27 @@ public:
}
/// Check whether key `x` is present, computing its hash here.
/// Keys that Cell::isZero() reports as zero are answered by hasZero() and never probe `buf`.
bool ALWAYS_INLINE has(Key x) const
{
    if (!Cell::isZero(x, *this))
    {
        const size_t key_hash = hash(x);
        const size_t slot = findCell(x, key_hash, grower.place(key_hash));

        /// The key is present exactly when the cell selected by findCell() is not a zero cell.
        return !buf[slot].isZero(*this);
    }

    return this->hasZero();
}
/// Check whether key `x` is present, using a hash value supplied by the caller.
/// Keys that Cell::isZero() reports as zero are answered by hasZero(); `hash_value` is not used for them.
bool ALWAYS_INLINE has(Key x, size_t hash_value) const
{
    const bool key_is_zero = Cell::isZero(x, *this);
    if (key_is_zero)
        return this->hasZero();

    /// Start probing from the place derived from the caller's hash.
    const size_t slot = findCell(x, hash_value, grower.place(hash_value));
    const bool slot_is_zero = buf[slot].isZero(*this);
    return !slot_is_zero;
}
void write(DB::WriteBuffer & wb) const
{
Cell::State::write(wb);

View File

@ -1006,7 +1006,6 @@ public:
size_t getNumberOfArguments() const override { return 2; }
/// Получить типы результата по типам аргументов. Если функция неприменима для данных аргументов - кинуть исключение.
DataTypePtr getReturnTypeImpl(const DataTypes & arguments) const override
{
const DataTypeArray * array_type = typeid_cast<const DataTypeArray *>(arguments[0].get());
@ -1020,7 +1019,7 @@ public:
const IDataType * observed_type1 = DataTypeTraits::removeNullable(arguments[1]).get();
if (!(observed_type0->behavesAsNumber() && observed_type1->behavesAsNumber())
&& observed_type0->getName() != observed_type1->getName())
&& !observed_type0->equals(*observed_type1))
throw Exception("Types of array and 2nd argument of function "
+ getName() + " must be identical up to nullability. Passed: "
+ arguments[0]->getName() + " and " + arguments[1]->getName() + ".",

View File

@ -1658,7 +1658,7 @@ public:
return std::make_shared<DataTypeTuple>(std::move(result_tuple));
}
else if (arguments[1]->getName() != arguments[2]->getName())
else if (!arguments[1]->equals(*arguments[2]))
{
const DataTypeString * type_string1 = typeid_cast<const DataTypeString *>(arguments[1].get());
const DataTypeString * type_string2 = typeid_cast<const DataTypeString *>(arguments[2].get());

View File

@ -56,11 +56,28 @@ public:
size_t getTotalRowCount() const { return data.getTotalRowCount(); }
size_t getTotalByteCount() const { return data.getTotalByteCount(); }
using ConstNullMapPtr = const PaddedPODArray<UInt8> *;
private:
Sizes key_sizes;
SetVariants data;
/** How IN works with Nullable types.
*
* For simplicity reasons, all NULL values and any tuples with at least one NULL element are ignored in the Set.
* For left hand side values that are NULL or contain any NULL, IN returns 0 (meaning that the element is not in the Set) and, correspondingly, NOT IN returns 1.
*
* If we want more standard compliant behaviour, we must return NULL
* if lhs is NULL and set is not empty or if lhs is not in set, but set contains at least one NULL.
* It is more complicated with tuples.
* For example,
* (1, NULL, 2) IN ((1, NULL, 3)) must return 0,
* but (1, NULL, 2) IN ((1, 1111, 2)) must return NULL.
*
* We have not implemented such sophisticated behaviour.
*/
/** Типы данных, из которых было создано множество.
* При проверке на принадлежность множеству, типы проверяемых столбцов должны с ними совпадать.
*/
@ -77,7 +94,11 @@ private:
void executeArray(const ColumnArray * key_column, ColumnUInt8::Container_t & vec_res, bool negative) const;
/// Если в левой части набор столбцов тех же типов, что элементы множества.
void executeOrdinary(const ConstColumnPlainPtrs & key_columns, ColumnUInt8::Container_t & vec_res, bool negative) const;
/// 'null_map' may be nullptr, in which case no rows are treated as NULL.
/// Otherwise rows with a non-zero entry are not looked up in the set and get the value of 'negative' as the result.
/// The parameter is spelled with the ConstNullMapPtr alias, as in the definition and in the sibling declarations.
void executeOrdinary(
    const ConstColumnPlainPtrs & key_columns,
    ColumnUInt8::Container_t & vec_res,
    bool negative,
    ConstNullMapPtr null_map) const;
/// Проверить не превышены ли допустимые размеры множества ключей
bool checkSetSizeLimits() const;
@ -101,7 +122,16 @@ private:
Method & method,
const ConstColumnPlainPtrs & key_columns,
size_t rows,
SetVariants & variants);
SetVariants & variants,
ConstNullMapPtr null_map);
/// Worker behind insertFromBlockImpl, which picks the has_null_map variant depending on whether 'null_map' is nullptr.
/// With has_null_map == true, rows whose 'null_map' entry is non-zero are skipped and never inserted;
/// with has_null_map == false, 'null_map' is not read.
template <typename Method, bool has_null_map>
void insertFromBlockImplCase(
Method & method,
const ConstColumnPlainPtrs & key_columns,
size_t rows,
SetVariants & variants,
ConstNullMapPtr null_map);
template <typename Method>
void executeImpl(
@ -109,7 +139,17 @@ private:
const ConstColumnPlainPtrs & key_columns,
ColumnUInt8::Container_t & vec_res,
bool negative,
size_t rows) const;
size_t rows,
ConstNullMapPtr null_map) const;
/// Worker behind executeImpl, which picks the has_null_map variant depending on whether 'null_map' is nullptr.
/// With has_null_map == true, rows whose 'null_map' entry is non-zero get 'negative' as the result without a lookup;
/// every other row gets negative ^ (key is present in the set).
/// With has_null_map == false, 'null_map' is not read.
template <typename Method, bool has_null_map>
void executeImplCase(
Method & method,
const ConstColumnPlainPtrs & key_columns,
ColumnUInt8::Container_t & vec_res,
bool negative,
size_t rows,
ConstNullMapPtr null_map) const;
template <typename Method>
void executeArrayImpl(

View File

@ -319,7 +319,7 @@ struct SetVariants
std::unique_ptr<SetMethodKeysFixed<HashSet<UInt256, UInt256HashCRC32>>> keys256;
std::unique_ptr<SetMethodHashed<HashSet<UInt128, UInt128TrivialHash>>> hashed;
/// Support for nullable keys.
/// Support for nullable keys (for DISTINCT implementation).
std::unique_ptr<SetMethodKeysFixed<HashSet<UInt128, UInt128HashCRC32>, true>> nullable_keys128;
std::unique_ptr<SetMethodKeysFixed<HashSet<UInt256, UInt256HashCRC32>, true>> nullable_keys256;

View File

@ -2,6 +2,9 @@
#include <DB/Common/SipHash.h>
#include <DB/Common/NaNUtils.h>
#include <DB/Columns/ColumnNullable.h>
#include <DB/Columns/ColumnArray.h>
#include <DB/Columns/ColumnTuple.h>
#include <DB/Columns/ColumnAggregateFunction.h>
namespace DB
@ -10,6 +13,7 @@ namespace DB
namespace ErrorCodes
{
extern const int LOGICAL_ERROR;
extern const int ILLEGAL_COLUMN;
}
@ -17,14 +21,22 @@ ColumnNullable::ColumnNullable(ColumnPtr nested_column_, ColumnPtr null_map_)
: nested_column{nested_column_}, null_map{null_map_}
{
if (nested_column->isNullable())
throw Exception{"A nullable column cannot contain another nullable column", ErrorCodes::LOGICAL_ERROR};
throw Exception{"A nullable column cannot contain another nullable column", ErrorCodes::ILLEGAL_COLUMN};
/// TODO Also check for Nullable(Array(...)). But they are occasionally used somewhere in tests.
if (typeid_cast<const ColumnTuple *>(nested_column.get()))
throw Exception{"Nullable(Tuple(...)) is illegal", ErrorCodes::ILLEGAL_COLUMN};
if (typeid_cast<const ColumnAggregateFunction *>(nested_column.get()))
throw Exception{"Nullable(AggregateFunction(...)) is illegal", ErrorCodes::ILLEGAL_COLUMN};
/// ColumnNullable cannot have constant nested column. But constant argument could be passed. Materialize it.
if (auto nested_column_materialized = nested_column->convertToFullColumnIfConst())
nested_column = nested_column_materialized;
if (null_map->isConst())
throw Exception{"ColumnNullable cannot have constant null map", ErrorCodes::LOGICAL_ERROR};
throw Exception{"ColumnNullable cannot have constant null map", ErrorCodes::ILLEGAL_COLUMN};
}

View File

@ -231,7 +231,7 @@ static ColumnsAndDefaults parseColumns(
const auto & deduced_type = tmp_column.type;
/// type mismatch between explicitly specified and deduced type, add conversion for non-array types
if (explicit_type->getName() != deduced_type->getName())
if (!explicit_type->equals(*deduced_type))
{
col_decl_ptr->default_expression = makeASTFunction("CAST", col_decl_ptr->default_expression,
std::make_shared<ASTLiteral>(StringRange(), explicit_type->getName()));

View File

@ -828,7 +828,7 @@ void Join::checkTypesOfKeys(const Block & block_left, const Block & block_right)
size_t keys_size = key_names_left.size();
for (size_t i = 0; i < keys_size; ++i)
if (block_left.getByName(key_names_left[i]).type->getName() != block_right.getByName(key_names_right[i]).type->getName())
if (!block_left.getByName(key_names_left[i]).type->equals(*block_right.getByName(key_names_right[i]).type))
throw Exception("Type mismatch of columns to JOIN by: "
+ key_names_left[i] + " " + block_left.getByName(key_names_left[i]).type->getName() + " at left, "
+ key_names_right[i] + " " + block_right.getByName(key_names_right[i]).type->getName() + " at right",

View File

@ -43,12 +43,29 @@ bool Set::checkSetSizeLimits() const
return true;
}
template <typename Method>
void NO_INLINE Set::insertFromBlockImpl(
Method & method,
const ConstColumnPlainPtrs & key_columns,
size_t rows,
SetVariants & variants)
SetVariants & variants,
ConstNullMapPtr null_map)
{
if (null_map)
insertFromBlockImplCase<Method, true>(method, key_columns, rows, variants, null_map);
else
insertFromBlockImplCase<Method, false>(method, key_columns, rows, variants, null_map);
}
template <typename Method, bool has_null_map>
void NO_INLINE Set::insertFromBlockImplCase(
Method & method,
const ConstColumnPlainPtrs & key_columns,
size_t rows,
SetVariants & variants,
ConstNullMapPtr null_map)
{
typename Method::State state;
state.init(key_columns);
@ -57,6 +74,9 @@ void NO_INLINE Set::insertFromBlockImpl(
/// For all rows
for (size_t i = 0; i < rows; ++i)
{
if (has_null_map && (*null_map)[i])
continue;
/// Obtain a key to insert to the set
typename Method::Key key = state.getKey(key_columns, keys_size, i, key_sizes);
@ -70,6 +90,52 @@ void NO_INLINE Set::insertFromBlockImpl(
}
/** Strip the Nullable wrapper from the columns in 'key_columns': every Nullable entry is replaced
  *  by a pointer to its nested column.
  * 'null_map' receives a map that is non-zero at the positions where at least one key column was NULL.
  *  - Single key column: 'null_map' points directly at that column's own null map
  *    (and is left untouched if the column is not Nullable).
  *  - Otherwise: a clone of the first Nullable column's null map is stored in 'null_map_holder',
  *    the null maps of the remaining Nullable columns are OR-ed into it, and 'null_map' points at the result
  *    (nullptr if no column is Nullable).
  */
static void extractNestedColumnsAndNullMap(ConstColumnPlainPtrs & key_columns, ColumnPtr & null_map_holder, Set::ConstNullMapPtr & null_map)
{
    if (key_columns.size() == 1)
    {
        /// Nothing to merge: reference the column's own null map.
        auto & key = key_columns[0];
        if (key->isNullable())
        {
            const ColumnNullable & nullable_key = static_cast<const ColumnNullable &>(*key);
            null_map = &nullable_key.getNullMap();
            key = nullable_key.getNestedColumn().get();
        }
        return;
    }

    PaddedPODArray<UInt8> * merged_null_map = nullptr;

    for (auto & key : key_columns)
    {
        if (!key->isNullable())
            continue;

        const ColumnNullable & nullable_key = static_cast<const ColumnNullable &>(*key);
        key = nullable_key.getNestedColumn().get();

        if (null_map_holder)
        {
            /// A merged map already exists: fold this column's NULL positions into it.
            const PaddedPODArray<UInt8> & key_null_map = nullable_key.getNullMap();
            const size_t size = merged_null_map->size();
            for (size_t row = 0; row < size; ++row)
                (*merged_null_map)[row] |= key_null_map[row];
        }
        else
        {
            /// First Nullable column seen: start the merged map from a clone of its null map.
            null_map_holder = nullable_key.getNullMapColumn()->clone();
            merged_null_map = &static_cast<ColumnUInt8 &>(*null_map_holder).getData();
        }
    }

    null_map = merged_null_map;
}
bool Set::insertFromBlock(const Block & block, bool create_ordered_set)
{
Poco::ScopedWriteRWLock lock(rwlock);
@ -131,6 +197,11 @@ bool Set::insertFromBlock(const Block & block, bool create_ordered_set)
size_t rows = block.rows();
/// We will insert to the Set only keys, where all components are not NULL.
ColumnPtr null_map_holder;
ConstNullMapPtr null_map{};
extractNestedColumnsAndNullMap(key_columns, null_map_holder, null_map);
/// Choose data structure to use for the set.
if (empty())
data.init(data.chooseMethod(key_columns, key_sizes));
@ -141,7 +212,7 @@ bool Set::insertFromBlock(const Block & block, bool create_ordered_set)
break;
#define M(NAME) \
case SetVariants::Type::NAME: \
insertFromBlockImpl(*data.NAME, key_columns, rows, data); \
insertFromBlockImpl(*data.NAME, key_columns, rows, data, null_map); \
break;
APPLY_FOR_SET_VARIANTS(M)
#undef M
@ -286,10 +357,17 @@ ColumnPtr Set::execute(const Block & block, bool negative) const
if (array_type)
{
/// Special treatment of Arrays in left hand side of IN:
/// check that at least one array element is in Set.
/// This is deprecated functionality and will be removed.
if (data_types.size() != 1 || num_key_columns != 1)
throw Exception("Number of columns in section IN doesn't match.", ErrorCodes::NUMBER_OF_COLUMNS_DOESNT_MATCH);
if (DataTypeTraits::removeNullable(array_type->getNestedType())->getName() !=
DataTypeTraits::removeNullable(data_types[0])->getName())
if (array_type->getNestedType()->isNullable())
throw Exception("Array(Nullable(...)) for left hand side of IN is not supported.", ErrorCodes::NOT_IMPLEMENTED);
if (!array_type->getNestedType()->equals(*data_types[0]))
throw Exception(std::string() + "Types in section IN don't match: " + data_types[0]->getName() +
" on the right, " + array_type->getNestedType()->getName() + " on the left.",
ErrorCodes::TYPE_MISMATCH);
@ -340,7 +418,12 @@ ColumnPtr Set::execute(const Block & block, bool negative) const
}
}
executeOrdinary(key_columns, vec_res, negative);
/// We will check existence in Set only for keys, where all components are not NULL.
ColumnPtr null_map_holder;
ConstNullMapPtr null_map{};
extractNestedColumnsAndNullMap(key_columns, null_map_holder, null_map);
executeOrdinary(key_columns, vec_res, negative, null_map);
}
return res;
@ -353,7 +436,24 @@ void NO_INLINE Set::executeImpl(
const ConstColumnPlainPtrs & key_columns,
ColumnUInt8::Container_t & vec_res,
bool negative,
size_t rows) const
size_t rows,
ConstNullMapPtr null_map) const
{
if (null_map)
executeImplCase<Method, true>(method, key_columns, vec_res, negative, rows, null_map);
else
executeImplCase<Method, false>(method, key_columns, vec_res, negative, rows, null_map);
}
template <typename Method, bool has_null_map>
void NO_INLINE Set::executeImplCase(
Method & method,
const ConstColumnPlainPtrs & key_columns,
ColumnUInt8::Container_t & vec_res,
bool negative,
size_t rows,
ConstNullMapPtr null_map) const
{
typename Method::State state;
state.init(key_columns);
@ -364,9 +464,14 @@ void NO_INLINE Set::executeImpl(
/// Для всех строчек
for (size_t i = 0; i < rows; ++i)
{
/// Строим ключ
typename Method::Key key = state.getKey(key_columns, keys_size, i, key_sizes);
vec_res[i] = negative ^ (method.data.end() != method.data.find(key));
if (has_null_map && (*null_map)[i])
vec_res[i] = negative;
else
{
/// Строим ключ
typename Method::Key key = state.getKey(key_columns, keys_size, i, key_sizes);
vec_res[i] = negative ^ method.data.has(key);
}
}
}
@ -393,7 +498,7 @@ void NO_INLINE Set::executeArrayImpl(
{
/// Строим ключ
typename Method::Key key = state.getKey(key_columns, keys_size, j, key_sizes);
res |= negative ^ (method.data.end() != method.data.find(key));
res |= negative ^ method.data.has(key);
if (res)
break;
}
@ -403,7 +508,11 @@ void NO_INLINE Set::executeArrayImpl(
}
void Set::executeOrdinary(const ConstColumnPlainPtrs & key_columns, ColumnUInt8::Container_t & vec_res, bool negative) const
void Set::executeOrdinary(
const ConstColumnPlainPtrs & key_columns,
ColumnUInt8::Container_t & vec_res,
bool negative,
ConstNullMapPtr null_map) const
{
size_t rows = key_columns[0]->size();
@ -413,7 +522,7 @@ void Set::executeOrdinary(const ConstColumnPlainPtrs & key_columns, ColumnUInt8:
break;
#define M(NAME) \
case SetVariants::Type::NAME: \
executeImpl(*data.NAME, key_columns, vec_res, negative, rows); \
executeImpl(*data.NAME, key_columns, vec_res, negative, rows, null_map); \
break;
APPLY_FOR_SET_VARIANTS(M)
#undef M

View File

@ -151,6 +151,9 @@ Field convertFieldToTypeImpl(const Field & src, const IDataType & type)
Field convertFieldToType(const Field & from_value, const IDataType & to_type, const IDataType * from_type_hint)
{
if (from_value.isNull())
return from_value;
if (from_type_hint && from_type_hint->equals(to_type))
return from_value;

View File

@ -0,0 +1,110 @@
0
1
0
1
0
0
1
\N
1
0
0
1
\N
1
0
0
1
1
1
0
0
1
0
1
0
0
1
0
1
\N
0
1
0
1
0
0
1
\N
1
0
0
1
\N
1
0
0
1
1
1
0
0
1
0
1
0
0
1
0
1
\N
0
1
0
1
0
0
1
0
1
0
0
1
0
1
0
0
1
0
1
0
0
1
0
1
0
0
1
1
1
0
0
1
0
1
0
0
1
0
1
0
0
1
0
0
0
0
1
0
0
0

View File

@ -0,0 +1,29 @@
SELECT number IN (1, NULL, 3) FROM system.numbers LIMIT 5;
SELECT nullIf(number, 2) IN (1, NULL, 3) FROM system.numbers LIMIT 5;
SELECT nullIf(number, 2) IN (1, 2, 3) FROM system.numbers LIMIT 5;
SELECT number IN (SELECT number FROM system.numbers LIMIT 1, 3) AS res FROM system.numbers LIMIT 5;
SELECT number IN (SELECT nullIf(number, 2) FROM system.numbers LIMIT 1, 3) AS res FROM system.numbers LIMIT 5;
SELECT nullIf(number, 4) IN (SELECT nullIf(number, 2) FROM system.numbers LIMIT 1, 3) AS res FROM system.numbers LIMIT 5;
SELECT toString(number) IN ('1', NULL, '3') FROM system.numbers LIMIT 5;
SELECT nullIf(toString(number), '2') IN ('1', NULL, '3') FROM system.numbers LIMIT 5;
SELECT nullIf(toString(number), '2') IN ('1', '2', '3') FROM system.numbers LIMIT 5;
SELECT toString(number) IN (SELECT toString(number) FROM system.numbers LIMIT 1, 3) AS res FROM system.numbers LIMIT 5;
SELECT toString(number) IN (SELECT nullIf(toString(number), '2') FROM system.numbers LIMIT 1, 3) AS res FROM system.numbers LIMIT 5;
SELECT nullIf(toString(number), '4') IN (SELECT nullIf(toString(number), '2') FROM system.numbers LIMIT 1, 3) AS res FROM system.numbers LIMIT 5;
SELECT (number, -number) IN ((1, -1), (NULL, NULL), (3, -3)) FROM system.numbers LIMIT 5;
SELECT (nullIf(number, 2), -number) IN ((1, -1), (NULL, NULL), (3, -3)) FROM system.numbers LIMIT 5;
SELECT (nullIf(number, 2), -number) IN ((1, -1), (2, -2), (3, -3)) FROM system.numbers LIMIT 5;
SELECT (nullIf(number, 2), -nullIf(number, 2)) IN ((1, -1), (NULL, NULL), (3, -3)) FROM system.numbers LIMIT 5;
SELECT (nullIf(number, 2), -nullIf(number, 2)) IN ((1, -1), (2, -2), (3, -3)) FROM system.numbers LIMIT 5;
SELECT (number, -number) IN (SELECT number, -number FROM system.numbers LIMIT 1, 3) AS res FROM system.numbers LIMIT 5;
SELECT (number, -number) IN (SELECT nullIf(number, 2), -number FROM system.numbers LIMIT 1, 3) AS res FROM system.numbers LIMIT 5;
SELECT (nullIf(number, 4), -number) IN (SELECT nullIf(number, 2), -number FROM system.numbers LIMIT 1, 3) AS res FROM system.numbers LIMIT 5;
SELECT (number, -nullIf(number, 3)) IN (SELECT nullIf(number, 2), -number FROM system.numbers LIMIT 1, 3) AS res FROM system.numbers LIMIT 5;
SELECT (nullIf(number, 4), -nullIf(number, 3)) IN (SELECT nullIf(number, 2), -number FROM system.numbers LIMIT 1, 3) AS res FROM system.numbers LIMIT 5;