2015-01-21 04:17:02 +00:00
|
|
|
#include <DB/DataStreams/DistinctBlockInputStream.h>

#include <utility>
|
|
|
|
|
|
|
|
namespace DB
|
|
|
|
{
|
|
|
|
|
2016-01-11 21:46:36 +00:00
|
|
|
/// Error codes referenced in this translation unit; only declared here, defined elsewhere.
namespace ErrorCodes
{
    /// Thrown by readImpl() when the DISTINCT set outgrows its limits in THROW overflow mode.
    extern const int SET_SIZE_LIMIT_EXCEEDED;
    /// Thrown by readImpl() when the overflow mode is neither THROW nor BREAK.
    extern const int LOGICAL_ERROR;
}
|
|
|
|
|
2016-12-22 20:50:12 +00:00
|
|
|
/** Creates a stream that reads from input_ and lets through only rows whose key was not seen before.
  *
  * input_      - source stream; stored as the only child.
  * limits      - supplies max_rows_in_distinct, max_bytes_in_distinct and distinct_overflow_mode.
  * limit_hint_ - when non-zero, readImpl() stops once the set holds at least this many keys.
  * columns_    - names of the key columns; when empty, getKeyColumns() takes every column of the block.
  *
  * Both by-value arguments are moved into their destination instead of being copied a second time.
  */
DistinctBlockInputStream::DistinctBlockInputStream(BlockInputStreamPtr input_, const Limits & limits, size_t limit_hint_, Names columns_)
    : columns_names(std::move(columns_))
    , limit_hint(limit_hint_)
    , max_rows(limits.max_rows_in_distinct)
    , max_bytes(limits.max_bytes_in_distinct)
    , overflow_mode(limits.distinct_overflow_mode)
{
    children.push_back(std::move(input_));
}
|
|
|
|
|
2016-11-10 21:24:40 +00:00
|
|
|
/// Builds the identifier of this stream: the ID of the last child wrapped as "Distinct(<child id>)".
String DistinctBlockInputStream::getID() const
{
    std::stringstream id;

    id << "Distinct(";
    id << children.back()->getID();
    id << ")";

    return id.str();
}
|
2015-01-21 04:17:02 +00:00
|
|
|
|
|
|
|
/** Returns the next source block reduced to the rows whose key was added to the set by this call.
  *
  * An empty Block is returned when the source is exhausted, when limit_hint is non-zero and the set
  * already holds at least limit_hint keys, or when the set limits are exceeded in BREAK overflow mode.
  * In THROW overflow mode an exceeded limit raises SET_SIZE_LIMIT_EXCEEDED.
  */
Block DistinctBlockInputStream::readImpl()
{
    /// Keep pulling source blocks until one of them contributes new keys or the stream ends.
    while (true)
    {
        /// The requested number of distinct keys has been collected: stop reading.
        if (limit_hint && data.getTotalRowCount() >= limit_hint)
            return Block();

        Block block = children[0]->read();
        if (!block)
            return Block();

        const ConstColumnPlainPtrs key_columns(getKeyColumns(block));

        /// getKeyColumns() skips constant columns, so the list may be empty; the block is then passed through as is.
        /// NOTE(review): in that case every row is emitted, not just one distinct row - confirm this is intended.
        if (key_columns.empty())
            return block;

        if (data.empty())
            data.init(SetVariants::chooseMethod(key_columns, key_sizes));

        const size_t set_size_before = data.getTotalRowCount();
        const size_t num_rows = block.rows();

        IColumn::Filter filter(num_rows);

        /// Dispatch to buildFilter() instantiated for the set variant currently in use.
        switch (data.type)
        {
            case SetVariants::Type::EMPTY:
                break;
#define M(NAME) \
            case SetVariants::Type::NAME: \
                buildFilter(*data.NAME, key_columns, filter, num_rows, data); \
                break;
            APPLY_FOR_SET_VARIANTS(M)
#undef M
        }

        /// The set did not grow, so this block carries no new keys: try the next one.
        if (data.getTotalRowCount() == set_size_before)
            continue;

        const bool within_limits = checkLimits();
        if (!within_limits)
        {
            if (overflow_mode == OverflowMode::BREAK)
                return Block();

            if (!(overflow_mode == OverflowMode::THROW))
                throw Exception("Logical error: unknown overflow mode", ErrorCodes::LOGICAL_ERROR);

            throw Exception("DISTINCT-Set size limit exceeded."
                " Rows: " + toString(data.getTotalRowCount()) +
                ", limit: " + toString(max_rows) +
                ". Bytes: " + toString(data.getTotalByteCount()) +
                ", limit: " + toString(max_bytes) + ".",
                ErrorCodes::SET_SIZE_LIMIT_EXCEEDED);
        }

        /// Apply the same row filter to every column of the block.
        const size_t num_columns = block.columns();
        for (size_t pos = 0; pos < num_columns; ++pos)
            block.safeGetByPosition(pos).column = block.safeGetByPosition(pos).column->filter(filter, -1);

        return block;
    }
}
|
|
|
|
|
2016-12-22 17:00:23 +00:00
|
|
|
bool DistinctBlockInputStream::checkLimits() const
|
|
|
|
{
|
|
|
|
if (max_rows && data.getTotalRowCount() > max_rows)
|
|
|
|
return false;
|
|
|
|
if (max_bytes && data.getTotalByteCount() > max_bytes)
|
|
|
|
return false;
|
|
|
|
return true;
|
|
|
|
}
|
|
|
|
|
2016-12-22 08:51:34 +00:00
|
|
|
template <typename Method>
|
2016-12-22 17:00:23 +00:00
|
|
|
void DistinctBlockInputStream::buildFilter(
|
2016-12-22 08:51:34 +00:00
|
|
|
Method & method,
|
|
|
|
const ConstColumnPlainPtrs & columns,
|
|
|
|
IColumn::Filter & filter,
|
2017-01-12 15:34:41 +00:00
|
|
|
size_t rows,
|
|
|
|
SetVariants & variants) const
|
2016-12-22 08:51:34 +00:00
|
|
|
{
|
|
|
|
typename Method::State state;
|
|
|
|
state.init(columns);
|
|
|
|
|
|
|
|
for (size_t i = 0; i < rows; ++i)
|
|
|
|
{
|
2016-12-22 22:19:57 +00:00
|
|
|
/// Make a key.
|
2016-12-22 17:00:23 +00:00
|
|
|
typename Method::Key key = state.getKey(columns, columns.size(), i, key_sizes);
|
2016-12-22 08:51:34 +00:00
|
|
|
|
2017-01-12 15:34:41 +00:00
|
|
|
typename Method::Data::iterator it = method.data.find(key);
|
|
|
|
bool inserted;
|
|
|
|
method.data.emplace(key, it, inserted);
|
|
|
|
|
|
|
|
if (inserted)
|
|
|
|
method.onNewKey(*it, columns.size(), i, variants.string_pool);
|
|
|
|
|
2016-12-22 22:19:57 +00:00
|
|
|
/// Emit the record if there is no such key in the current set yet.
|
|
|
|
/// Skip it otherwise.
|
2017-01-12 15:34:41 +00:00
|
|
|
filter[i] = inserted;
|
2016-12-22 08:51:34 +00:00
|
|
|
}
|
|
|
|
}
|
|
|
|
|
2016-12-22 17:00:23 +00:00
|
|
|
/** Collects raw pointers to the columns of `block` that serve as the DISTINCT key.
  *
  * Candidates are the columns listed in columns_names, or every column of the block when that list is empty.
  * Constant columns are left out, so the result may be empty.
  */
ConstColumnPlainPtrs DistinctBlockInputStream::getKeyColumns(const Block & block) const
{
    const bool use_all_columns = columns_names.empty();
    const size_t num_candidates = use_all_columns ? block.columns() : columns_names.size();

    ConstColumnPlainPtrs key_columns;
    key_columns.reserve(num_candidates);

    for (size_t pos = 0; pos < num_candidates; ++pos)
    {
        const auto & candidate = use_all_columns
            ? block.safeGetByPosition(pos).column
            : block.getByName(columns_names[pos]).column;

        /// Ignore all constant columns.
        if (candidate->isConst())
            continue;

        key_columns.emplace_back(candidate.get());
    }

    return key_columns;
}
|
|
|
|
|
2015-01-21 04:17:02 +00:00
|
|
|
}
|