ClickHouse/dbms/src/Interpreters/Set.cpp

566 lines
16 KiB
C++
Raw Normal View History

2012-08-23 20:22:44 +00:00
#include <vector>

#include <DB/Core/Field.h>
#include <DB/Core/FieldVisitors.h>

#include <DB/Columns/ColumnString.h>
#include <DB/Columns/ColumnFixedString.h>
#include <DB/Columns/ColumnsNumber.h>

#include <DB/DataStreams/IProfilingBlockInputStream.h>
#include <DB/DataStreams/OneBlockInputStream.h>

#include <DB/DataTypes/DataTypeArray.h>

#include <DB/Parsers/ASTExpressionList.h>
#include <DB/Parsers/ASTFunction.h>
#include <DB/Parsers/ASTLiteral.h>

#include <DB/Interpreters/Set.h>
#include <DB/Interpreters/convertFieldToType.h>
#include <DB/Interpreters/evaluateConstantExpression.h>
2012-08-23 20:22:44 +00:00
namespace DB
{
2016-01-12 02:21:15 +00:00
/// Error codes referenced in this file. They are only declared here (no initializers).
namespace ErrorCodes
{
    extern const int UNKNOWN_SET_DATA_VARIANT;
    extern const int LOGICAL_ERROR;
    extern const int SET_SIZE_LIMIT_EXCEEDED;
    extern const int TYPE_MISMATCH;
    extern const int INCORRECT_ELEMENT_OF_SET;
    extern const int NUMBER_OF_COLUMNS_DOESNT_MATCH;
    /// Used by Set::execute when the left-hand array column has an unexpected type.
    extern const int ILLEGAL_COLUMN;
}
/// Remembers the requested variant and allocates the data structure that corresponds to it.
/// Type::EMPTY allocates nothing; a type that is not in APPLY_FOR_SET_VARIANTS is rejected.
void SetVariants::init(Type type_)
{
    type = type_;

    if (type == Type::EMPTY)
        return;

#define M(NAME) \
    if (type == Type::NAME) \
    { \
        NAME.reset(new decltype(NAME)::element_type); \
        return; \
    }
    APPLY_FOR_SET_VARIANTS(M)
#undef M

    throw Exception("Unknown Set variant.", ErrorCodes::UNKNOWN_SET_DATA_VARIANT);
}
/// Returns the number of elements stored in the active variant (0 for Type::EMPTY).
size_t SetVariants::getTotalRowCount() const
{
    if (type == Type::EMPTY)
        return 0;

#define M(NAME) \
    if (type == Type::NAME) \
        return NAME->data.size();
    APPLY_FOR_SET_VARIANTS(M)
#undef M

    throw Exception("Unknown Set variant.", ErrorCodes::UNKNOWN_SET_DATA_VARIANT);
}
/// Returns the buffer size in bytes reported by the active variant's table (0 for Type::EMPTY).
size_t SetVariants::getTotalByteCount() const
{
    if (type == Type::EMPTY)
        return 0;

#define M(NAME) \
    if (type == Type::NAME) \
        return NAME->data.getBufferSizeInBytes();
    APPLY_FOR_SET_VARIANTS(M)
#undef M

    throw Exception("Unknown Set variant.", ErrorCodes::UNKNOWN_SET_DATA_VARIANT);
}
/// Returns true while the set is within both limits. A limit equal to zero is not checked.
bool Set::checkSetSizeLimits() const
{
    const bool rows_exceeded = max_rows && data.getTotalRowCount() > max_rows;
    if (rows_exceeded)
        return false;

    const bool bytes_exceeded = max_bytes && data.getTotalByteCount() > max_bytes;
    return !bytes_exceeded;
}
2012-08-23 20:22:44 +00:00
/** Picks the set variant to use for the given key columns.
  * Fills key_sizes with the field sizes of the leading fixed-size keys
  * (filling stops at the first key that is not fixed-size).
  */
SetVariants::Type SetVariants::chooseMethod(const ConstColumnPlainPtrs & key_columns, Sizes & key_sizes)
{
    const size_t num_keys = key_columns.size();

    bool every_key_fixed = true;
    size_t total_key_bytes = 0;
    key_sizes.resize(num_keys);

    for (size_t idx = 0; idx < num_keys && every_key_fixed; ++idx)
    {
        if (key_columns[idx]->isFixed())
        {
            key_sizes[idx] = key_columns[idx]->sizeOfField();
            total_key_bytes += key_sizes[idx];
        }
        else
            every_key_fixed = false;
    }

    /// A single numeric key that fits into 64 bits.
    if (num_keys == 1 && key_columns[0]->isNumeric())
    {
        switch (key_columns[0]->sizeOfField())
        {
            case 1: return Type::key8;
            case 2: return Type::key16;
            case 4: return Type::key32;
            case 8: return Type::key64;
            default:
                throw Exception("Logical error: numeric column has sizeOfField not in 1, 2, 4, 8.", ErrorCodes::LOGICAL_ERROR);
        }
    }

    /// If the keys fit into N bits, use a hash table keyed by the keys packed into N bits.
    if (every_key_fixed)
    {
        if (total_key_bytes <= 16)
            return Type::keys128;
        if (total_key_bytes <= 32)
            return Type::keys256;
    }

    /// A single string key gets a hash table of its own.
    if (num_keys == 1)
    {
        const auto & only_key = key_columns[0];

        if (typeid_cast<const ColumnString *>(only_key) || typeid_cast<const ColumnConstString *>(only_key))
            return Type::key_string;

        if (typeid_cast<const ColumnFixedString *>(only_key))
            return Type::key_fixed_string;
    }

    /// Otherwise fall back to the generic variant for everything else.
    return Type::hashed;
}
/** Inserts the first `rows` rows of key_columns into the hash table of the given variant.
  * For every key that was not in the table before, method.onNewKey is called with
  * variants.string_pool.
  */
template <typename Method>
void NO_INLINE Set::insertFromBlockImpl(
    Method & method,
    const ConstColumnPlainPtrs & key_columns,
    size_t rows,
    SetVariants & variants)
{
    typename Method::State state;
    state.init(key_columns);
    const size_t keys_size = key_columns.size();

    /// For every row
    for (size_t i = 0; i < rows; ++i)
    {
        /// Build the key
        typename Method::Key key = state.getKey(key_columns, keys_size, i, key_sizes);

        /// emplace() reports the position of the key through `it`, so a separate find() beforehand
        /// would only look the key up a second time.
        typename Method::Data::iterator it;
        bool inserted;
        method.data.emplace(key, it, inserted);

        if (inserted)
            method.onNewKey(*it, keys_size, i, variants.string_pool);
    }
}
2015-01-27 00:52:03 +00:00
bool Set::insertFromBlock(const Block & block, bool create_ordered_set)
2012-08-23 20:22:44 +00:00
{
2015-01-27 00:52:03 +00:00
Poco::ScopedWriteRWLock lock(rwlock);
size_t keys_size = block.columns();
ConstColumnPlainPtrs key_columns(keys_size);
data_types.resize(keys_size);
/// Константные столбцы справа от IN поддерживается не напрямую. Для этого, они сначала материализуется.
Columns materialized_columns;
/// Запоминаем столбцы, с которыми будем работать
for (size_t i = 0; i < keys_size; ++i)
{
key_columns[i] = block.getByPosition(i).column;
data_types[i] = block.getByPosition(i).type;
if (auto converted = key_columns[i]->convertToFullColumnIfConst())
{
materialized_columns.emplace_back(converted);
key_columns[i] = materialized_columns.back().get();
}
}
2012-08-23 20:22:44 +00:00
size_t rows = block.rows();
/// Какую структуру данных для множества использовать?
2014-05-01 15:02:36 +00:00
if (empty())
data.init(data.chooseMethod(key_columns, key_sizes));
if (false) {}
#define M(NAME) \
else if (data.type == SetVariants::Type::NAME) \
insertFromBlockImpl(*data.NAME, key_columns, rows, data);
APPLY_FOR_SET_VARIANTS(M)
#undef M
else
throw Exception("Unknown set variant.", ErrorCodes::UNKNOWN_SET_DATA_VARIANT);
2012-08-23 23:49:28 +00:00
if (create_ordered_set)
for (size_t i = 0; i < rows; ++i)
ordered_set_elements->push_back((*key_columns[0])[i]); /// ordered_set для индекса работает только если IN одному ключу.
if (!checkSetSizeLimits())
2012-08-23 23:49:28 +00:00
{
if (overflow_mode == OverflowMode::THROW)
throw Exception("IN-set size exceeded."
" Rows: " + toString(data.getTotalRowCount()) +
", limit: " + toString(max_rows) +
". Bytes: " + toString(data.getTotalByteCount()) +
", limit: " + toString(max_bytes) + ".",
ErrorCodes::SET_SIZE_LIMIT_EXCEEDED);
if (overflow_mode == OverflowMode::BREAK)
return false;
throw Exception("Logical error: unknown overflow mode", ErrorCodes::LOGICAL_ERROR);
2012-08-23 23:49:28 +00:00
}
return true;
2012-08-23 20:22:44 +00:00
}
/// Turns one element of the IN list into a Field converted to `type`.
/// A literal is taken as is; a function node is evaluated as a constant expression; anything else is rejected.
static Field extractValueFromNode(ASTPtr & node, const IDataType & type, const Context & context)
{
    if (ASTLiteral * literal = typeid_cast<ASTLiteral *>(node.get()))
        return convertFieldToType(literal->value, type);

    if (typeid_cast<ASTFunction *>(node.get()))
        return convertFieldToType(evaluateConstantExpression(node, context), type);

    throw Exception("Incorrect element of set. Must be literal or constant expression.", ErrorCodes::INCORRECT_ELEMENT_OF_SET);
}
/** Builds the set from an explicit list of values (an ASTExpressionList).
  *
  * types              - data types of the set columns; stored into data_types.
  * node               - the expression list; with one column each child is a value,
  *                      with several columns each child must be a `tuple` function of matching size.
  * context            - passed on for evaluating constant expressions.
  * create_ordered_set - also fill ordered_set_elements and sort it afterwards.
  *
  * Throws INCORRECT_ELEMENT_OF_SET for elements of the wrong shape.
  */
void Set::createFromAST(DataTypes & types, ASTPtr node, const Context & context, bool create_ordered_set)
{
    data_types = types;
    const size_t num_columns = data_types.size();

    /// Put the set into a block.
    Block block;
    for (size_t i = 0; i < num_columns; ++i)
    {
        ColumnWithTypeAndName col;
        col.type = data_types[i];
        col.column = data_types[i]->createColumn();
        col.name = "_" + toString(i);

        block.insert(col);
    }

    ASTExpressionList & list = typeid_cast<ASTExpressionList &>(*node);

    /// Values of the tuple currently being processed; reused between iterations.
    std::vector<Field> tuple_values;

    for (ASTs::iterator it = list.children.begin(); it != list.children.end(); ++it)
    {
        if (num_columns == 1)
        {
            Field value = extractValueFromNode(*it, *data_types[0], context);

            /// Null values are not inserted (presumably they denote a value that cannot be converted to the column type).
            if (!value.isNull())
                block.getByPosition(0).column->insert(value);
        }
        else if (ASTFunction * func = typeid_cast<ASTFunction *>(&**it))
        {
            if (func->name != "tuple")
                throw Exception("Incorrect element of set. Must be tuple.", ErrorCodes::INCORRECT_ELEMENT_OF_SET);

            const size_t tuple_size = func->arguments->children.size();
            if (tuple_size != num_columns)
                throw Exception("Incorrect size of tuple in set.", ErrorCodes::INCORRECT_ELEMENT_OF_SET);

            /// Extract every element first, so that each one is still validated.
            tuple_values.clear();
            bool has_null = false;
            for (size_t j = 0; j < tuple_size; ++j)
            {
                tuple_values.push_back(extractValueFromNode(func->arguments->children[j], *data_types[j], context));
                if (tuple_values.back().isNull())
                    has_null = true;
            }

            /// Insert the tuple as a whole or not at all. Skipping only the null elements would leave
            /// the columns with different numbers of rows and mix values of different tuples.
            if (!has_null)
                for (size_t j = 0; j < tuple_size; ++j)
                    block.getByPosition(j).column->insert(tuple_values[j]);
        }
        else
            throw Exception("Incorrect element of set", ErrorCodes::INCORRECT_ELEMENT_OF_SET);
    }

    if (create_ordered_set)
        ordered_set_elements = OrderedSetElementsPtr(new OrderedSetElements());

    insertFromBlock(block, create_ordered_set);

    if (create_ordered_set)
        std::sort(ordered_set_elements->begin(), ordered_set_elements->end());
}
/** For every row of the block, checks whether it belongs to the set and returns a UInt8 column with the results.
  * If `negative` is true the results are inverted.
  * When the first column of the block is an array, a single array column is expected and its elements are checked.
  */
ColumnPtr Set::execute(const Block & block, bool negative) const
{
    const size_t num_key_columns = block.columns();
    if (num_key_columns == 0)
        throw Exception("Logical error: no columns passed to Set::execute method.", ErrorCodes::LOGICAL_ERROR);

    ColumnUInt8 * result_column = new ColumnUInt8;
    ColumnPtr res = result_column;
    ColumnUInt8::Container_t & vec_res = result_column->getData();
    vec_res.resize(block.getByPosition(0).column->size());

    Poco::ScopedReadRWLock lock(rwlock);

    /// The set is empty: every row gets the same answer.
    if (data_types.empty())
    {
        memset(&vec_res[0], negative ? 1 : 0, vec_res.size());
        return res;
    }

    const DataTypeArray * array_type = typeid_cast<const DataTypeArray *>(&*block.getByPosition(0).type);

    if (array_type)
    {
        if (data_types.size() != 1 || num_key_columns != 1)
            throw Exception("Number of columns in section IN doesn't match.", ErrorCodes::NUMBER_OF_COLUMNS_DOESNT_MATCH);

        const std::string right_type_name = data_types[0]->getName();
        const std::string left_type_name = array_type->getNestedType()->getName();

        if (left_type_name != right_type_name)
            throw Exception("Types in section IN don't match: " + right_type_name + " on the right, " + left_type_name + " on the left.", ErrorCodes::TYPE_MISMATCH);

        const IColumn * in_column = &*block.getByPosition(0).column;

        /// A constant column to the left of IN is not supported directly, so it is materialized first.
        ColumnPtr materialized_column = in_column->convertToFullColumnIfConst();
        if (materialized_column)
            in_column = materialized_column.get();

        const ColumnArray * array_column = typeid_cast<const ColumnArray *>(in_column);
        if (!array_column)
            throw Exception("Unexpected array column type: " + in_column->getName(), ErrorCodes::ILLEGAL_COLUMN);

        executeArray(array_column, vec_res, negative);
        return res;
    }

    if (data_types.size() != num_key_columns)
        throw Exception("Number of columns in section IN doesn't match.", ErrorCodes::NUMBER_OF_COLUMNS_DOESNT_MATCH);

    /// Remember the columns we will work with, checking that the data types match.
    ConstColumnPlainPtrs key_columns(num_key_columns);

    for (size_t pos = 0; pos < num_key_columns; ++pos)
    {
        const auto & elem = block.getByPosition(pos);
        key_columns[pos] = elem.column;

        if (data_types[pos]->getName() != elem.type->getName())
            throw Exception("Types of column " + toString(pos + 1) + " in section IN don't match: "
                + data_types[pos]->getName() + " on the right, " + elem.type->getName() + " on the left.",
                ErrorCodes::TYPE_MISMATCH);
    }

    /// Constant columns to the left of IN are not supported directly, so they are materialized first.
    Columns materialized_columns;

    for (size_t pos = 0; pos < num_key_columns; ++pos)
    {
        if (auto full_column = key_columns[pos]->convertToFullColumnIfConst())
        {
            materialized_columns.emplace_back(full_column);
            key_columns[pos] = materialized_columns.back().get();
        }
    }

    executeOrdinary(key_columns, vec_res, negative);
    return res;
}
2012-08-23 22:40:51 +00:00
/// For each of the first `rows` rows writes into vec_res whether the row's key is present in method.data,
/// inverted when `negative` is true.
template <typename Method>
void NO_INLINE Set::executeImpl(
    Method & method,
    const ConstColumnPlainPtrs & key_columns,
    ColumnUInt8::Container_t & vec_res,
    bool negative,
    size_t rows) const
{
    typename Method::State state;
    state.init(key_columns);
    const size_t keys_size = key_columns.size();

    /// NOTE No optimization for runs of consecutive identical values.

    for (size_t row = 0; row < rows; ++row)
    {
        /// Build the key and look it up.
        typename Method::Key key = state.getKey(key_columns, keys_size, row, key_sizes);
        const bool found = method.data.find(key) != method.data.end();
        vec_res[row] = negative ^ found;
    }
}
/** Array version of the lookup. Row i covers the nested elements [offsets[i - 1], offsets[i]).
  * vec_res[i] becomes 1 as soon as some element gives a positive answer
  * (membership, inverted when `negative` is true), and stays 0 otherwise.
  */
template <typename Method>
void NO_INLINE Set::executeArrayImpl(
    Method & method,
    const ConstColumnPlainPtrs & key_columns,
    const ColumnArray::Offsets_t & offsets,
    ColumnUInt8::Container_t & vec_res,
    bool negative,
    size_t rows) const
{
    typename Method::State state;
    state.init(key_columns);
    const size_t keys_size = key_columns.size();

    size_t element_begin = 0;

    /// For every row
    for (size_t row = 0; row < rows; ++row)
    {
        UInt8 row_result = 0;

        /// For every element of the array in this row
        for (size_t element = element_begin; element < offsets[row]; ++element)
        {
            /// Build the key and look it up.
            typename Method::Key key = state.getKey(key_columns, keys_size, element, key_sizes);
            const bool found = method.data.find(key) != method.data.end();

            if (negative ^ found)
            {
                row_result = 1;
                break;
            }
        }

        vec_res[row] = row_result;
        element_begin = offsets[row];
    }
}
void Set::executeOrdinary(const ConstColumnPlainPtrs & key_columns, ColumnUInt8::Container_t & vec_res, bool negative) const
{
size_t rows = key_columns[0]->size();
if (false) {}
#define M(NAME) \
else if (data.type == SetVariants::Type::NAME) \
executeImpl(*data.NAME, key_columns, vec_res, negative, rows);
APPLY_FOR_SET_VARIANTS(M)
#undef M
2012-08-23 20:35:05 +00:00
else
throw Exception("Unknown set variant.", ErrorCodes::UNKNOWN_SET_DATA_VARIANT);
2012-08-23 20:22:44 +00:00
}
/// Dispatches the lookup over the elements of an array column to executeArrayImpl for the active set variant.
void Set::executeArray(const ColumnArray * key_column, ColumnUInt8::Container_t & vec_res, bool negative) const
{
    const size_t rows = key_column->size();
    const ColumnArray::Offsets_t & offsets = key_column->getOffsets();
    const IColumn & nested_column = key_column->getData();

    switch (data.type)
    {
#define M(NAME) \
        case SetVariants::Type::NAME: \
            executeArrayImpl(*data.NAME, ConstColumnPlainPtrs{&nested_column}, offsets, vec_res, negative, rows); \
            break;
        APPLY_FOR_SET_VARIANTS(M)
#undef M
        default:
            throw Exception("Unknown set variant.", ErrorCodes::UNKNOWN_SET_DATA_VARIANT);
    }
}
2015-03-27 03:37:46 +00:00
/** For keys taken from `range`, tells whether `key IN set` can be true and whether it can be false.
  * Returns BoolMask(can_be_true, can_be_false).
  *
  * Works on ordered_set_elements, which must be sorted (createFromAST sorts it) and may contain duplicates.
  * Throws if the ordered set was not created.
  */
BoolMask Set::mayBeTrueInRange(const Range & range) const
{
    if (!ordered_set_elements)
        throw DB::Exception("Ordered set in not created.");

    if (ordered_set_elements->empty())
        return BoolMask(false, true);

    const Field & left = range.left;
    const Field & right = range.right;

    const auto set_begin = ordered_set_elements->begin();
    const auto set_end = ordered_set_elements->end();

    /// The range is a single key: IN is always true if the key is in the set and always false otherwise.
    if (range.left_bounded && range.right_bounded && range.right_included && range.left_included && left == right)
    {
        const bool found = std::binary_search(set_begin, set_end, left);
        return BoolMask(found, !found);
    }

    /// First set element that lies inside the range from the left side.
    /// For an excluded bound upper_bound skips every element equal to it, duplicates included.
    auto left_it = set_begin;
    if (range.left_bounded)
        left_it = range.left_included
            ? std::lower_bound(set_begin, set_end, left)
            : std::upper_bound(set_begin, set_end, left);

    /// One past the last set element that lies inside the range from the right side.
    /// For an excluded bound lower_bound stops before every element equal to it; nothing is dereferenced,
    /// so the end iterator is never touched.
    auto right_it = set_end;
    if (range.right_bounded)
        right_it = range.right_included
            ? std::upper_bound(set_begin, set_end, right)
            : std::lower_bound(set_begin, set_end, right);

    /// [left_it, right_it) are exactly the set elements inside the range.
    const bool can_be_true = left_it < right_it;

    /// A range wider than a single key is always assumed to contain keys outside the set.
    return BoolMask(can_be_true, true);
}
std::string Set::describe() const
{
if (!ordered_set_elements)
return "{}";
bool first = true;
std::stringstream ss;
ss << "{";
for (const Field & f : *ordered_set_elements)
{
ss << (first ? "" : ", ") << apply_visitor(FieldVisitorToString(), f);
first = false;
}
ss << "}";
return ss.str();
}
2012-08-23 20:22:44 +00:00
}