ClickHouse/dbms/src/Dictionaries/RangeDictionaryBlockInputStream.h

288 lines
11 KiB
C++
Raw Normal View History

2017-04-28 18:33:31 +00:00
#pragma once
#include <Columns/ColumnString.h>
#include <Columns/ColumnVector.h>
2017-04-28 18:33:31 +00:00
#include <Columns/IColumn.h>
#include <DataStreams/IBlockInputStream.h>
#include <DataTypes/DataTypeDate.h>
#include <DataTypes/DataTypesNumber.h>
#include <ext/range.h>
#include "DictionaryBlockInputStreamBase.h"
#include "DictionaryStructure.h"
#include "IDictionary.h"
#include "RangeHashedDictionary.h"
2017-04-28 18:33:31 +00:00
namespace DB
2017-04-28 18:33:31 +00:00
{
/*
* BlockInputStream implementation for external dictionaries
2017-04-28 18:33:31 +00:00
* read() returns single block consisting of the in-memory contents of the dictionaries
*/
template <typename DictionaryType, typename RangeType, typename Key>
2017-04-28 18:33:31 +00:00
class RangeDictionaryBlockInputStream : public DictionaryBlockInputStreamBase
{
public:
2018-08-24 05:21:53 +00:00
using DictionaryPtr = std::shared_ptr<DictionaryType const>;
RangeDictionaryBlockInputStream(
DictionaryPtr dictionary,
2019-02-10 16:55:12 +00:00
UInt64 max_block_size,
const Names & column_names,
PaddedPODArray<Key> && ids_to_fill,
PaddedPODArray<RangeType> && start_dates,
PaddedPODArray<RangeType> && end_dates);
2017-04-28 18:33:31 +00:00
String getName() const override { return "RangeDictionary"; }
2017-04-28 18:33:31 +00:00
protected:
Block getBlock(size_t start, size_t length) const override;
2017-04-28 18:33:31 +00:00
private:
2017-09-15 12:16:12 +00:00
template <typename Type>
using DictionaryGetter = void (DictionaryType::*)(
const std::string &, const PaddedPODArray<Key> &, const PaddedPODArray<Int64> &, PaddedPODArray<Type> &) const;
2017-04-28 18:33:31 +00:00
template <typename Type>
using DictionaryDecimalGetter = void (DictionaryType::*)(
const std::string &, const PaddedPODArray<Key> &, const PaddedPODArray<Int64> &, DecimalPaddedPODArray<Type> &) const;
template <typename AttributeType, typename Getter>
ColumnPtr getColumnFromAttribute(
Getter getter,
const PaddedPODArray<Key> & ids_to_fill,
const PaddedPODArray<Int64> & dates,
const DictionaryAttribute & attribute,
const DictionaryType & concrete_dictionary) const;
ColumnPtr getColumnFromAttributeString(
const PaddedPODArray<Key> & ids_to_fill,
const PaddedPODArray<Int64> & dates,
const DictionaryAttribute & attribute,
const DictionaryType & concrete_dictionary) const;
2017-09-15 12:16:12 +00:00
template <typename T>
2018-01-10 00:04:08 +00:00
ColumnPtr getColumnFromPODArray(const PaddedPODArray<T> & array) const;
template <typename DictionarySpecialAttributeType, typename T>
void addSpecialColumn(
const std::optional<DictionarySpecialAttributeType> & attribute,
DataTypePtr type,
const std::string & default_name,
const std::unordered_set<std::string> & column_names_set,
const PaddedPODArray<T> & values,
ColumnsWithTypeAndName & columns) const;
Block fillBlock(
const PaddedPODArray<Key> & ids_to_fill,
const PaddedPODArray<RangeType> & block_start_dates,
const PaddedPODArray<RangeType> & block_end_dates) const;
PaddedPODArray<Int64>
makeDateKey(const PaddedPODArray<RangeType> & block_start_dates, const PaddedPODArray<RangeType> & block_end_dates) const;
2018-08-24 05:21:53 +00:00
DictionaryPtr dictionary;
Names column_names;
PaddedPODArray<Key> ids;
PaddedPODArray<RangeType> start_dates;
PaddedPODArray<RangeType> end_dates;
2017-04-28 18:33:31 +00:00
};
/// Stores the dictionary, the requested column names and the three parallel row arrays.
///
/// dictionary_        - dictionary whose contents are streamed.
/// max_block_size     - forwarded to DictionaryBlockInputStreamBase together with the row count.
/// column_names_      - names of the columns that fillBlock() should emit.
/// ids_to_fill        - keys, one per row; moved into the stream.
/// block_start_dates  - range starts, one per row; moved into the stream.
/// block_end_dates    - range ends, one per row; moved into the stream.
///
/// The type of max_block_size is UInt64 to match the in-class declaration exactly; with a
/// different type the definition would not match the declaration on platforms where UInt64
/// and size_t are distinct types.
template <typename DictionaryType, typename RangeType, typename Key>
RangeDictionaryBlockInputStream<DictionaryType, RangeType, Key>::RangeDictionaryBlockInputStream(
    DictionaryPtr dictionary_,
    UInt64 max_block_size,
    const Names & column_names_,
    PaddedPODArray<Key> && ids_to_fill,
    PaddedPODArray<RangeType> && block_start_dates,
    PaddedPODArray<RangeType> && block_end_dates)
    /// The base class is initialized before any member, so the row count is read
    /// from ids_to_fill before that array is moved into `ids`.
    : DictionaryBlockInputStreamBase(ids_to_fill.size(), max_block_size)
    , dictionary(std::move(dictionary_))
    , column_names(column_names_)
    , ids(std::move(ids_to_fill))
    , start_dates(std::move(block_start_dates))
    , end_dates(std::move(block_end_dates))
{
}
/// Copies rows [start, start + length) of the stored ids / range starts / range ends
/// into temporary arrays and passes them to fillBlock().
/// No bounds checking is done here; the caller is expected to request a valid window.
template <typename DictionaryType, typename RangeType, typename Key>
Block RangeDictionaryBlockInputStream<DictionaryType, RangeType, Key>::getBlock(size_t start, size_t length) const
{
    const size_t end = start + length;

    PaddedPODArray<Key> ids_slice;
    ids_slice.reserve(length);

    PaddedPODArray<RangeType> starts_slice;
    starts_slice.reserve(length);

    PaddedPODArray<RangeType> ends_slice;
    ends_slice.reserve(length);

    for (size_t row = start; row < end; ++row)
    {
        ids_slice.push_back(ids[row]);
        starts_slice.push_back(start_dates[row]);
        ends_slice.push_back(end_dates[row]);
    }

    return fillBlock(ids_slice, starts_slice, ends_slice);
}
/// Creates a column with one element per key and asks the dictionary, through the
/// pointer-to-member `getter`, to fill it with the values of `attribute` for the
/// given keys and dates.
///
/// Decimal attribute types produce a ColumnDecimal, every other type a ColumnVector.
template <typename DictionaryType, typename RangeType, typename Key>
template <typename AttributeType, typename Getter>
ColumnPtr RangeDictionaryBlockInputStream<DictionaryType, RangeType, Key>::getColumnFromAttribute(
    Getter getter,
    const PaddedPODArray<Key> & ids_to_fill,
    const PaddedPODArray<Int64> & dates,
    const DictionaryAttribute & attribute,
    const DictionaryType & concrete_dictionary) const
{
    const auto rows = ids_to_fill.size();

    if constexpr (!IsDecimalNumber<AttributeType>)
    {
        auto result = ColumnVector<AttributeType>::create(rows);
        (concrete_dictionary.*getter)(attribute.name, ids_to_fill, dates, result->getData());
        return result;
    }
    else
    {
        /// NOTE: There's wrong scale here, but it's unused.
        auto result = ColumnDecimal<AttributeType>::create(rows, 0);
        (concrete_dictionary.*getter)(attribute.name, ids_to_fill, dates, result->getData());
        return result;
    }
}
/// Creates an empty ColumnString and lets DictionaryType::getString() fill it with the
/// values of `attribute` for the given keys and dates.
template <typename DictionaryType, typename RangeType, typename Key>
ColumnPtr RangeDictionaryBlockInputStream<DictionaryType, RangeType, Key>::getColumnFromAttributeString(
    const PaddedPODArray<Key> & ids_to_fill,
    const PaddedPODArray<Int64> & dates,
    const DictionaryAttribute & attribute,
    const DictionaryType & concrete_dictionary) const
{
    auto result = ColumnString::create();

    /// getString() receives the output column as a raw pointer.
    auto * const output = result.get();
    concrete_dictionary.getString(attribute.name, ids_to_fill, dates, output);

    return result;
}
template <typename DictionaryType, typename RangeType, typename Key>
2017-09-15 12:16:12 +00:00
template <typename T>
ColumnPtr RangeDictionaryBlockInputStream<DictionaryType, RangeType, Key>::getColumnFromPODArray(const PaddedPODArray<T> & array) const
{
auto column_vector = ColumnVector<T>::create();
column_vector->getData().reserve(array.size());
for (T value : array)
column_vector->insertValue(value);
return column_vector;
}
/// Appends one special (non-attribute) column to `columns`, if it was requested.
///
/// attribute        - optional description of the special attribute; when set, its name is used.
/// type             - data type recorded for the column.
/// default_name     - column name used when `attribute` is empty.
/// column_names_set - names of the requested columns; nothing is appended when the name is absent.
/// values           - data copied into the new column.
/// columns          - output list that receives the column.
template <typename DictionaryType, typename RangeType, typename Key>
template <typename DictionarySpecialAttributeType, typename T>
void RangeDictionaryBlockInputStream<DictionaryType, RangeType, Key>::addSpecialColumn(
    const std::optional<DictionarySpecialAttributeType> & attribute,
    DataTypePtr type,
    const std::string & default_name,
    const std::unordered_set<std::string> & column_names_set,
    const PaddedPODArray<T> & values,
    ColumnsWithTypeAndName & columns) const
{
    const std::string & name = attribute ? attribute->name : default_name;

    if (column_names_set.count(name) == 0)
        return;

    columns.emplace_back(getColumnFromPODArray(values), type, name);
}
/// Builds the per-row date that is passed to the dictionary getters: the range start when
/// RangeHashedDictionary::Range::isCorrectDate() accepts it, the range end otherwise.
/// The result has block_start_dates.size() elements; block_end_dates is indexed with the
/// same row numbers.
template <typename DictionaryType, typename RangeType, typename Key>
PaddedPODArray<Int64> RangeDictionaryBlockInputStream<DictionaryType, RangeType, Key>::makeDateKey(
    const PaddedPODArray<RangeType> & block_start_dates, const PaddedPODArray<RangeType> & block_end_dates) const
{
    const size_t rows = block_start_dates.size();
    PaddedPODArray<Int64> keys(rows);

    for (size_t row = 0; row < rows; ++row)
    {
        const RangeType & range_start = block_start_dates[row];
        keys[row] = RangeHashedDictionary::Range::isCorrectDate(range_start) ? range_start : block_end_dates[row];
    }

    return keys;
}
/// Assembles a Block for the given rows.
///
/// Column order: the special columns "ID", "Range Start" and "Range End" (or the names
/// configured in the dictionary structure), then the dictionary attributes in the order of
/// structure.attributes. Only columns whose name is listed in `column_names` are emitted.
/// Attribute values are read from the dictionary for the keys in ids_to_fill and the dates
/// produced by makeDateKey().
template <typename DictionaryType, typename RangeType, typename Key>
Block RangeDictionaryBlockInputStream<DictionaryType, RangeType, Key>::fillBlock(
    const PaddedPODArray<Key> & ids_to_fill,
    const PaddedPODArray<RangeType> & block_start_dates,
    const PaddedPODArray<RangeType> & block_end_dates) const
{
    const DictionaryStructure & structure = dictionary->getStructure();
    const std::unordered_set<std::string> requested(column_names.begin(), column_names.end());

    ColumnsWithTypeAndName columns;

    addSpecialColumn(structure.id, std::make_shared<DataTypeUInt64>(), "ID", requested, ids_to_fill, columns);
    /// NOTE(review): the range start column is typed with range_max->type, not range_min->type.
    /// Presumably both are required to be the same type elsewhere - confirm.
    addSpecialColumn(structure.range_min, structure.range_max->type, "Range Start", requested, block_start_dates, columns);
    addSpecialColumn(structure.range_max, structure.range_max->type, "Range End", requested, block_end_dates, columns);

    const auto date_key = makeDateKey(block_start_dates, block_end_dates);

    for (const DictionaryAttribute & attribute : structure.attributes)
    {
        if (requested.count(attribute.name) == 0)
            continue;

        ColumnPtr column;

/// Expands to one `case` of the switch below: selects the typed getter DictionaryType::get<TYPE>.
#define RANGE_DICTIONARY_FETCH_COLUMN(TYPE) \
    case AttributeUnderlyingType::TYPE: \
        column = getColumnFromAttribute<TYPE>(&DictionaryType::get##TYPE, ids_to_fill, date_key, attribute, *dictionary); \
        break;

        switch (attribute.underlying_type)
        {
            RANGE_DICTIONARY_FETCH_COLUMN(UInt8)
            RANGE_DICTIONARY_FETCH_COLUMN(UInt16)
            RANGE_DICTIONARY_FETCH_COLUMN(UInt32)
            RANGE_DICTIONARY_FETCH_COLUMN(UInt64)
            RANGE_DICTIONARY_FETCH_COLUMN(UInt128)
            RANGE_DICTIONARY_FETCH_COLUMN(Int8)
            RANGE_DICTIONARY_FETCH_COLUMN(Int16)
            RANGE_DICTIONARY_FETCH_COLUMN(Int32)
            RANGE_DICTIONARY_FETCH_COLUMN(Int64)
            RANGE_DICTIONARY_FETCH_COLUMN(Float32)
            RANGE_DICTIONARY_FETCH_COLUMN(Float64)
            RANGE_DICTIONARY_FETCH_COLUMN(Decimal32)
            RANGE_DICTIONARY_FETCH_COLUMN(Decimal64)
            RANGE_DICTIONARY_FETCH_COLUMN(Decimal128)

            case AttributeUnderlyingType::String:
                column = getColumnFromAttributeString(ids_to_fill, date_key, attribute, *dictionary);
                break;
        }

#undef RANGE_DICTIONARY_FETCH_COLUMN

        columns.emplace_back(column, attribute.type, attribute.name);
    }

    return Block(columns);
}
}