refactoring of SerializationInfo

Anton Popov 2021-10-29 20:21:02 +03:00
parent f1a5f79849
commit 0099dfd523
74 changed files with 798 additions and 615 deletions

src/Columns/ColumnSparse.h

@@ -30,7 +30,6 @@ private:
public:
static constexpr auto DEFAULT_ROWS_SEARCH_SAMPLE_RATIO = 0.1;
static constexpr auto DEFAULT_RATIO_FOR_SPARSE_SERIALIZATION = 0.95;
// static constexpr auto MIN_ROWS_TO_SEARCH_DEFAULTS = DEFAULT_ROWS_SEARCH_STEP * 16;
using Base = COWHelper<IColumn, ColumnSparse>;
static Ptr create(const ColumnPtr & values_, const ColumnPtr & offsets_, size_t size_)
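
These two constants drive the sparse-serialization heuristic used throughout this commit: columns are sampled at DEFAULT_ROWS_SEARCH_SAMPLE_RATIO, and a column is switched to sparse serialization once the observed share of default values exceeds DEFAULT_RATIO_FOR_SPARSE_SERIALIZATION. A standalone sketch of that threshold check (the helper name is invented; the real check is SerializationInfo::chooseKind further down):

#include <algorithm>
#include <cstddef>

// Sketch only: mirrors the ratio test in SerializationInfo::chooseKind.
// 960 defaults out of 1000 rows gives 0.96 > 0.95, so sparse wins.
bool shouldUseSparse(size_t num_defaults, size_t num_rows, double ratio_for_sparse = 0.95)
{
    double ratio = num_rows ? std::min(static_cast<double>(num_defaults) / num_rows, 1.0) : 0.0;
    return ratio > ratio_for_sparse;
}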

src/Columns/ColumnTuple.cpp

@@ -12,6 +12,7 @@
#include <base/sort.h>
#include <base/map.h>
#include <base/range.h>
#include <DataTypes/Serializations/SerializationInfoTuple.h>
namespace DB
@@ -555,4 +556,15 @@ void ColumnTuple::getIndicesOfNonDefaultRows(Offsets & indices, size_t from, siz
return getIndicesOfNonDefaultRowsImpl<ColumnTuple>(indices, from, limit);
}
SerializationInfoPtr ColumnTuple::getSerializationInfo() const
{
MutableSerializationInfos infos;
infos.reserve(columns.size());
for (const auto & column : columns)
infos.push_back(const_pointer_cast<SerializationInfo>(column->getSerializationInfo()));
return std::make_shared<SerializationInfoTuple>(std::move(infos), SerializationInfo::Settings{});
}
}

src/Columns/ColumnTuple.h

@@ -96,6 +96,7 @@ public:
ColumnPtr compress() const override;
double getRatioOfDefaultRows(double sample_ratio) const override;
void getIndicesOfNonDefaultRows(Offsets & indices, size_t from, size_t limit) const override;
SerializationInfoPtr getSerializationInfo() const override;
size_t tupleSize() const { return columns.size(); }

src/Columns/IColumn.cpp

@@ -4,6 +4,7 @@
#include <Columns/ColumnNullable.h>
#include <Columns/ColumnConst.h>
#include <Core/Field.h>
#include <DataTypes/Serializations/SerializationInfo.h>
namespace DB
@@ -63,6 +64,11 @@ ColumnPtr IColumn::createWithOffsets(const Offsets & offsets, const Field & defa
return res;
}
SerializationInfoPtr IColumn::getSerializationInfo() const
{
return std::make_shared<SerializationInfo>(ISerialization::getKind(*this), SerializationInfo::Settings{});
}
bool isColumnNullable(const IColumn & column)
{
return checkColumn<ColumnNullable>(column);

src/Columns/IColumn.h

@@ -26,8 +26,8 @@ class ColumnGathererStream;
class Field;
class WeakHash32;
class ISerialization;
using SerializationPtr = std::shared_ptr<const ISerialization>;
class SerializationInfo;
using SerializationInfoPtr = std::shared_ptr<const SerializationInfo>;
/*
* Represents a set of equal ranges in previous column to perform sorting in current column.
@@ -404,6 +404,8 @@ public:
/// Used to create full column from sparse.
virtual Ptr createWithOffsets(const Offsets & offsets, const Field & default_field, size_t total_rows, size_t shift) const;
virtual SerializationInfoPtr getSerializationInfo() const;
/// Compress column in memory to some representation that allows to decompress it back.
/// Return itself if compression is not applicable for this column type.
virtual Ptr compress() const

src/Compression/CompressionFactory.cpp

@@ -117,7 +117,7 @@ ASTPtr CompressionCodecFactory::validateCodecAndGetPreprocessedAST(
};
ISerialization::SubstreamPath path;
column_type->getDefaultSerialization()->enumerateStreams(path, callback, column_type, nullptr);
column_type->getDefaultSerialization()->enumerateStreams(path, callback, column_type);
if (!result_codec)
throw Exception(ErrorCodes::LOGICAL_ERROR, "Cannot find any substream with data type for type {}. It's a bug", column_type->getName());

src/DataTypes/DataTypeTuple.cpp

@@ -9,6 +9,7 @@
#include <DataTypes/Serializations/SerializationInfo.h>
#include <DataTypes/Serializations/SerializationTuple.h>
#include <DataTypes/Serializations/SerializationNamed.h>
#include <DataTypes/Serializations/SerializationInfoTuple.h>
#include <DataTypes/NestedUtils.h>
#include <Parsers/IAST.h>
#include <Parsers/ASTNameTypePair.h>
@@ -254,21 +255,32 @@ SerializationPtr DataTypeTuple::doGetDefaultSerialization() const
return std::make_shared<SerializationTuple>(std::move(serializations), use_explicit_names);
}
SerializationPtr DataTypeTuple::getSerialization(const String & column_name, const SerializationCallback & callback) const
SerializationPtr DataTypeTuple::getSerialization(const SerializationInfo & info) const
{
SerializationTuple::ElementSerializations serializations(elems.size());
const auto & info_tuple = assert_cast<const SerializationInfoTuple &>(info);
bool use_explicit_names = have_explicit_names && serialize_names;
for (size_t i = 0; i < elems.size(); ++i)
{
String elem_name = use_explicit_names ? names[i] : toString(i + 1);
auto subcolumn_name = Nested::concatenateName(column_name, elem_name);
auto serializaion = elems[i]->getSerialization(subcolumn_name, callback);
serializations[i] = std::make_shared<SerializationNamed>(serializaion, elem_name);
auto serialization = elems[i]->getSerialization(*info_tuple.getElementInfo(i));
serializations[i] = std::make_shared<SerializationNamed>(serialization, elem_name);
}
return std::make_shared<SerializationTuple>(std::move(serializations), use_explicit_names);
}
MutableSerializationInfoPtr DataTypeTuple::createSerializationInfo(const SerializationInfo::Settings & settings) const
{
MutableSerializationInfos infos;
infos.reserve(elems.size());
for (const auto & elem : elems)
infos.push_back(elem->createSerializationInfo(settings));
return std::make_shared<SerializationInfoTuple>(std::move(infos), settings);
}
static DataTypePtr create(const ASTPtr & arguments)
{

src/DataTypes/DataTypeTuple.h

@@ -54,8 +54,9 @@ public:
size_t getMaximumSizeOfValueInMemory() const override;
size_t getSizeOfValueInMemory() const override;
SerializationPtr getSerialization(const String & column_name, const SerializationCallback & callback) const override;
SerializationPtr doGetDefaultSerialization() const override;
SerializationPtr getSerialization(const SerializationInfo & info) const override;
MutableSerializationInfoPtr createSerializationInfo(const SerializationInfo::Settings & settings) const override;
const DataTypePtr & getElement(size_t i) const { return elems[i]; }
const DataTypes & getElements() const { return elems; }

src/DataTypes/IDataType.cpp

@@ -77,9 +77,7 @@ size_t IDataType::getSizeOfValueInMemory() const
void IDataType::forEachSubcolumn(
const SubcolumnCallback & callback,
const SerializationPtr & serialization,
const DataTypePtr & type,
const ColumnPtr & column)
const SubstreamData & data)
{
ISerialization::StreamCallback callback_with_data = [&](const auto & subpath)
{
@@ -88,66 +86,59 @@ void IDataType::forEachSubcolumn(
if (!subpath[i].visited && ISerialization::hasSubcolumnForPath(subpath, i + 1))
{
auto name = ISerialization::getSubcolumnNameForStream(subpath, i + 1);
auto data = ISerialization::createFromPath(subpath, i);
callback(subpath, name, data);
auto subdata = ISerialization::createFromPath(subpath, i);
callback(subpath, name, subdata);
}
subpath[i].visited = true;
}
};
ISerialization::SubstreamPath path;
serialization->enumerateStreams(path, callback_with_data, type, column);
SubstreamPath path;
data.serialization->enumerateStreams(path, callback_with_data, data);
}
DataTypePtr IDataType::tryGetSubcolumnType(const String & subcolumn_name) const
template <typename Ptr>
Ptr IDataType::getForSubcolumn(
const String & subcolumn_name,
const SubstreamData & data,
Ptr SubstreamData::*member,
bool throw_if_null) const
{
DataTypePtr res;
forEachSubcolumn([&](const auto &, const auto & name, const auto & data)
Ptr res;
forEachSubcolumn([&](const auto &, const auto & name, const auto & subdata)
{
if (name == subcolumn_name)
res = data.type;
}, getDefaultSerialization(), getPtr(), nullptr);
res = subdata.*member;
}, data);
if (!res && throw_if_null)
throw Exception(ErrorCodes::ILLEGAL_COLUMN, "There is no subcolumn {} in type {}", subcolumn_name, getName());
return res;
}
DataTypePtr IDataType::tryGetSubcolumnType(const String & subcolumn_name) const
{
SubstreamData data = { getDefaultSerialization(), getPtr(), nullptr, nullptr };
return getForSubcolumn<DataTypePtr>(subcolumn_name, data, &SubstreamData::type, false);
}
DataTypePtr IDataType::getSubcolumnType(const String & subcolumn_name) const
{
auto subcolumn_type = tryGetSubcolumnType(subcolumn_name);
if (subcolumn_type)
return subcolumn_type;
throw Exception(ErrorCodes::ILLEGAL_COLUMN, "There is no subcolumn {} in type {}", subcolumn_name, getName());
SubstreamData data = { getDefaultSerialization(), getPtr(), nullptr, nullptr };
return getForSubcolumn<DataTypePtr>(subcolumn_name, data, &SubstreamData::type);
}
SerializationPtr IDataType::getSubcolumnSerialization(const String & subcolumn_name, const SerializationPtr & serialization) const
{
SerializationPtr res;
forEachSubcolumn([&](const auto &, const auto & name, const auto & data)
{
if (name == subcolumn_name)
res = data.serialization;
}, serialization, nullptr, nullptr);
if (res)
return res;
throw Exception(ErrorCodes::ILLEGAL_COLUMN, "There is no subcolumn {} in type {}", subcolumn_name, getName());
SubstreamData data = { serialization, nullptr, nullptr, nullptr };
return getForSubcolumn<SerializationPtr>(subcolumn_name, data, &SubstreamData::serialization);
}
ColumnPtr IDataType::getSubcolumn(const String & subcolumn_name, const ColumnPtr & column) const
{
ColumnPtr res;
forEachSubcolumn([&](const auto &, const auto & name, const auto & data)
{
if (name == subcolumn_name)
res = data.column;
}, getDefaultSerialization(), nullptr, column);
if (res)
return res;
throw Exception(ErrorCodes::ILLEGAL_COLUMN, "There is no subcolumn {} in type {}", subcolumn_name, getName());
SubstreamData data = { getDefaultSerialization(), nullptr, column, nullptr };
return getForSubcolumn<ColumnPtr>(subcolumn_name, data, &SubstreamData::column);
}
Names IDataType::getSubcolumnNames() const
@@ -156,7 +147,7 @@ Names IDataType::getSubcolumnNames() const
forEachSubcolumn([&](const auto &, const auto & name, const auto &)
{
res.push_back(name);
}, getDefaultSerialization(), nullptr, nullptr);
}, { getDefaultSerialization(), nullptr, nullptr, nullptr });
return res;
}
@@ -175,6 +166,12 @@ void IDataType::setCustomization(DataTypeCustomDescPtr custom_desc_) const
custom_serialization = std::move(custom_desc_->serialization);
}
MutableSerializationInfoPtr IDataType::createSerializationInfo(
const SerializationInfo::Settings & settings) const
{
return std::make_shared<SerializationInfo>(ISerialization::Kind::DEFAULT, settings);
}
SerializationPtr IDataType::getDefaultSerialization() const
{
if (custom_serialization)
@@ -196,31 +193,9 @@ SerializationPtr IDataType::getSerialization(ISerialization::Kind kind) const
return getDefaultSerialization();
}
SerializationPtr IDataType::getSerialization(const IColumn & column) const
SerializationPtr IDataType::getSerialization(const SerializationInfo & info) const
{
return getSerialization(ISerialization::getKind(column));
}
SerializationPtr IDataType::getSerialization(const String & column_name, const SerializationInfo & info) const
{
return getSerialization(column_name, [&info](const auto & name) { return info.getKind(name); });
}
SerializationPtr IDataType::getSerialization(const String & column_name, const SerializationCallback & callback) const
{
return getSerialization(callback(column_name));
}
SerializationPtr IDataType::getSerialization(const ISerialization::Settings & settings) const
{
if (supportsSparseSerialization())
{
double ratio = settings.num_rows ? std::min(static_cast<double>(settings.num_default_rows) / settings.num_rows, 1.0) : 0.0;
if (ratio > settings.ratio_for_sparse_serialization)
return getSparseSerialization();
}
return getDefaultSerialization();
return getSerialization(info.getKind());
}
// static
@@ -229,11 +204,11 @@ SerializationPtr IDataType::getSerialization(const NameAndTypePair & column, con
if (column.isSubcolumn())
{
const auto & type_in_storage = column.getTypeInStorage();
auto default_serialization = type_in_storage->getDefaultSerialization();
return type_in_storage->getSubcolumnSerialization(column.getSubcolumnName(), default_serialization);
auto serialization = type_in_storage->getSerialization(info);
return type_in_storage->getSubcolumnSerialization(column.getSubcolumnName(), serialization);
}
return column.type->getSerialization(column.name, info);
return column.type->getSerialization(info);
}
}
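
The getForSubcolumn template above replaces three near-identical forEachSubcolumn loops (for type, serialization and column) with a single lookup parameterized by a pointer-to-member of SubstreamData. A self-contained illustration of the pointer-to-member pattern (simplified struct, not the real ClickHouse types):

#include <iostream>
#include <string>

struct SubstreamDataSketch
{
    std::string type;
    std::string column;
};

// One function body serves both lookups: the caller picks the member.
template <typename T>
T getMember(const SubstreamDataSketch & data, T SubstreamDataSketch::*member)
{
    return data.*member;
}

int main()
{
    SubstreamDataSketch data{"UInt64", "ColumnVector<UInt64>"};
    std::cout << getMember(data, &SubstreamDataSketch::type) << '\n';   // prints "UInt64"
    std::cout << getMember(data, &SubstreamDataSketch::column) << '\n'; // prints "ColumnVector<UInt64>"
}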

src/DataTypes/IDataType.h

@@ -7,6 +7,7 @@
#include <Core/TypeId.h>
#include <DataTypes/DataTypeCustom.h>
#include <DataTypes/Serializations/ISerialization.h>
#include <DataTypes/Serializations/SerializationInfo.h>
namespace DB
{
@@ -27,7 +28,6 @@ using DataTypePtr = std::shared_ptr<const IDataType>;
using DataTypes = std::vector<DataTypePtr>;
struct NameAndTypePair;
class SerializationInfo;
struct DataTypeWithConstInfo
{
@@ -84,19 +84,23 @@ public:
SerializationPtr getSubcolumnSerialization(const String & subcolumn_name, const SerializationPtr & serialization) const;
ColumnPtr getSubcolumn(const String & subcolumn_name, const ColumnPtr & column) const;
using SubstreamData = ISerialization::SubstreamData;
using SubstreamPath = ISerialization::SubstreamPath;
using SubcolumnCallback = std::function<void(
const ISerialization::SubstreamPath &,
const SubstreamPath &,
const String &,
const ISerialization::SubstreamData &)>;
const SubstreamData &)>;
static void forEachSubcolumn(
const SubcolumnCallback & callback,
const SerializationPtr & serialization,
const DataTypePtr & type,
const ColumnPtr & column);
const SubstreamData & data);
Names getSubcolumnNames() const;
virtual MutableSerializationInfoPtr createSerializationInfo(
const SerializationInfo::Settings & settings) const;
/// TODO: support more types.
virtual bool supportsSparseSerialization() const { return !haveSubtypes(); }
@@ -106,18 +110,8 @@ public:
/// Chooses serialization according to serialization kind.
SerializationPtr getSerialization(ISerialization::Kind kind) const;
/// Chooses serialization according to column content.
SerializationPtr getSerialization(const IColumn & column) const;
/// Chooses serialization according to collected information about content of columns.
SerializationPtr getSerialization(const String & column_name, const SerializationInfo & info) const;
/// Chooses serialization according to settings.
SerializationPtr getSerialization(const ISerialization::Settings & settings) const;
using SerializationCallback = std::function<ISerialization::Kind(const String &)>;
virtual SerializationPtr getSerialization(const String & column_name, const SerializationCallback & callback) const;
/// Chooses serialization according to collected information about content of column.
virtual SerializationPtr getSerialization(const SerializationInfo & info) const;
/// Chooses between subcolumn serialization and regular serialization according to @column.
/// This method typically should be used to get serialization for reading column or subcolumn.
@@ -302,6 +296,14 @@ protected:
public:
const IDataTypeCustomName * getCustomName() const { return custom_name.get(); }
const ISerialization * getCustomSerialization() const { return custom_serialization.get(); }
private:
template <typename Ptr>
Ptr getForSubcolumn(
const String & subcolumn_name,
const SubstreamData & data,
Ptr SubstreamData::*member,
bool throw_if_null = true) const;
};

src/DataTypes/Serializations/ISerialization.cpp

@@ -72,18 +72,22 @@ String ISerialization::SubstreamPath::toString() const
void ISerialization::enumerateStreams(
SubstreamPath & path,
const StreamCallback & callback,
DataTypePtr type,
ColumnPtr column) const
const SubstreamData & data) const
{
path.push_back(Substream::Regular);
path.back().data = {type, column, getPtr(), nullptr};
path.back().data = data;
callback(path);
path.pop_back();
}
void ISerialization::enumerateStreams(const StreamCallback & callback, SubstreamPath & path) const
{
enumerateStreams(path, callback, nullptr, nullptr);
enumerateStreams(path, callback, {getPtr(), nullptr, nullptr, nullptr});
}
void ISerialization::enumerateStreams(SubstreamPath & path, const StreamCallback & callback, const DataTypePtr & type) const
{
enumerateStreams(path, callback, {getPtr(), type, nullptr, nullptr});
}
void ISerialization::serializeBinaryBulk(const IColumn & column, WriteBuffer &, size_t, size_t) const
@@ -268,10 +272,9 @@ ISerialization::SubstreamData ISerialization::createFromPath(const SubstreamPath
assert(prefix_len < path.size());
SubstreamData res = path[prefix_len].data;
res.creator.reset();
for (ssize_t i = static_cast<ssize_t>(prefix_len) - 1; i >= 0; --i)
{
const auto & creator = path[i].data.creator;
const auto & creator = path[i].creator;
if (creator)
{
res.type = res.type ? creator->create(res.type) : res.type;

src/DataTypes/Serializations/ISerialization.h

@@ -32,6 +32,9 @@ using DataTypePtr = std::shared_ptr<const IDataType>;
class ISerialization;
using SerializationPtr = std::shared_ptr<const ISerialization>;
class SerializationInfo;
using SerializationInfoPtr = std::shared_ptr<const SerializationInfo>;
class Field;
struct FormatSettings;
@@ -98,10 +101,10 @@ public:
struct SubstreamData
{
SerializationPtr serialization;
DataTypePtr type;
ColumnPtr column;
SerializationPtr serialization;
SubcolumnCreatorPtr creator;
SerializationInfoPtr serialization_info;
};
struct Substream
@@ -136,6 +139,9 @@ public:
/// Data for current substream.
SubstreamData data;
/// Creator of subcolumn for current substream.
SubcolumnCreatorPtr creator = nullptr;
/// Flag, that may help to traverse substream paths.
mutable bool visited = false;
@@ -158,13 +164,14 @@ public:
virtual void enumerateStreams(
SubstreamPath & path,
const StreamCallback & callback,
DataTypePtr type,
ColumnPtr column) const;
const SubstreamData & data) const;
void enumerateStreams(const StreamCallback & callback, SubstreamPath & path) const;
void enumerateStreams(const StreamCallback & callback, SubstreamPath && path) const { enumerateStreams(callback, path); }
void enumerateStreams(const StreamCallback & callback) const { enumerateStreams(callback, {}); }
void enumerateStreams(SubstreamPath & path, const StreamCallback & callback, const DataTypePtr & type) const;
using OutputStreamGetter = std::function<WriteBuffer*(const SubstreamPath &)>;
using InputStreamGetter = std::function<ReadBuffer*(const SubstreamPath &)>;
@@ -208,13 +215,6 @@ public:
double avg_value_size_hint = 0;
};
struct Settings
{
size_t num_rows;
size_t num_default_rows;
double ratio_for_sparse_serialization;
};
/// Call before serializeBinaryBulkWithMultipleStreams chain to write something before first mark.
virtual void serializeBinaryBulkStatePrefix(
SerializeBinaryBulkSettings & /*settings*/,
@@ -339,6 +339,7 @@ protected:
using SerializationPtr = std::shared_ptr<const ISerialization>;
using Serializations = std::vector<SerializationPtr>;
using SerializationByName = std::unordered_map<String, SerializationPtr>;
template <typename State, typename StatePtr>
State * ISerialization::checkAndGetState(const StatePtr & state) const
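
Since every enumerateStreams override in this commit brace-initializes SubstreamData, the new member order above is load-bearing: serialization first, then type, column and serialization_info, while the subcolumn creator moved out of SubstreamData into Substream. A sketch of a conforming initializer (the four names stand for whatever the caller has at hand):

SubstreamData data =
{
    serialization,       /// SerializationPtr, now listed first
    type,                /// DataTypePtr, may be nullptr
    column,              /// ColumnPtr, may be nullptr
    serialization_info,  /// SerializationInfoPtr, may be nullptr
};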

src/DataTypes/Serializations/SerializationArray.cpp

@@ -198,33 +198,38 @@ ColumnPtr SerializationArray::SubcolumnCreator::create(const ColumnPtr & prev) c
void SerializationArray::enumerateStreams(
SubstreamPath & path,
const StreamCallback & callback,
DataTypePtr type,
ColumnPtr column) const
const SubstreamData & data) const
{
const auto * type_array = type ? &assert_cast<const DataTypeArray &>(*type) : nullptr;
const auto * column_array = column ? &assert_cast<const ColumnArray &>(*column) : nullptr;
const auto * type_array = data.type ? &assert_cast<const DataTypeArray &>(*data.type) : nullptr;
const auto * column_array = data.column ? &assert_cast<const ColumnArray &>(*data.column) : nullptr;
auto offsets_column = column_array ? column_array->getOffsetsPtr() : nullptr;
path.push_back(Substream::ArraySizes);
path.back().data =
{
type ? std::make_shared<DataTypeUInt64>() : nullptr,
offsets_column ? arrayOffsetsToSizes(*offsets_column) : nullptr,
std::make_shared<SerializationNamed>(
std::make_shared<SerializationNumber<UInt64>>(),
"size" + std::to_string(getArrayLevel(path)), false),
nullptr,
data.type ? std::make_shared<DataTypeUInt64>() : nullptr,
offsets_column ? arrayOffsetsToSizes(*offsets_column) : nullptr,
data.serialization_info,
};
callback(path);
path.back() = Substream::ArrayElements;
path.back().data = {type, column, getPtr(), std::make_shared<SubcolumnCreator>(offsets_column)};
path.back().data = data;
path.back().creator = std::make_shared<SubcolumnCreator>(offsets_column);
auto next_type = type_array ? type_array->getNestedType() : nullptr;
auto next_column = column_array ? column_array->getDataPtr() : nullptr;
SubstreamData next_data =
{
nested,
type_array ? type_array->getNestedType() : nullptr,
column_array ? column_array->getDataPtr() : nullptr,
data.serialization_info,
};
nested->enumerateStreams(path, callback, next_type, next_column);
nested->enumerateStreams(path, callback, next_data);
path.pop_back();
}

src/DataTypes/Serializations/SerializationArray.h

@@ -38,8 +38,7 @@ public:
void enumerateStreams(
SubstreamPath & path,
const StreamCallback & callback,
DataTypePtr type,
ColumnPtr column) const override;
const SubstreamData & data) const override;
void serializeBinaryBulkStatePrefix(
SerializeBinaryBulkSettings & settings,

src/DataTypes/Serializations/SerializationInfo.cpp

@@ -1,8 +1,10 @@
#include <DataTypes/Serializations/SerializationInfo.h>
#include <DataTypes/NestedUtils.h>
#include <Columns/ColumnSparse.h>
#include <IO/ReadHelpers.h>
#include <IO/WriteHelpers.h>
#include <IO/VarInt.h>
#include <base/EnumReflection.h>
#include <Poco/JSON/JSON.h>
#include <Poco/JSON/Object.h>
@@ -17,113 +19,14 @@ namespace ErrorCodes
{
extern const int LOGICAL_ERROR;
extern const int INCORRECT_DATA;
}
SerializationInfoBuilder::SerializationInfoBuilder(
double ratio_for_sparse_serialization_,
double default_rows_search_sample_ratio_)
: ratio_for_sparse_serialization(ratio_for_sparse_serialization_)
, default_rows_search_sample_ratio(default_rows_search_sample_ratio_)
, info(std::make_shared<SerializationInfo>())
{
}
void SerializationInfoBuilder::add(const Block & block)
{
size_t num_rows = block.rows();
info->number_of_rows += num_rows;
if (!canHaveSparseSerialization())
return;
for (const auto & elem : block)
{
/// Just skip column and always return default serialization.
if (!elem.type->supportsSparseSerialization())
continue;
/// Multiply by step to restore approximate number of default values.
info->columns[elem.name].num_defaults += static_cast<size_t>(
num_rows * elem.column->getRatioOfDefaultRows(default_rows_search_sample_ratio));
IDataType::forEachSubcolumn([&](const auto &, const auto & name, const auto & data)
{
if (!data.type->supportsSparseSerialization())
return;
auto parent_subcolumn_name = Nested::splitName(name, /*reverse=*/ true).first;
if (!parent_subcolumn_name.empty())
{
auto parent_subcolumn_type = elem.type->tryGetSubcolumnType(parent_subcolumn_name);
if (parent_subcolumn_type && !parent_subcolumn_type->supportsSparseSerialization())
return;
}
auto full_name = Nested::concatenateName(elem.name, name);
info->columns[full_name].num_defaults += static_cast<size_t>(
num_rows * data.column->getRatioOfDefaultRows(default_rows_search_sample_ratio));
}, elem.type->getDefaultSerialization(), elem.type, elem.column);
}
}
void SerializationInfoBuilder::add(const SerializationInfo & other)
{
info->number_of_rows += other.number_of_rows;
for (const auto & [name, column_info] : other.columns)
info->columns[name].num_defaults += column_info.num_defaults;
}
SerializationInfoPtr SerializationInfoBuilder::build() &&
{
size_t total_rows = info->number_of_rows;
for (auto & [_, column_info] : info->columns)
{
double ratio = total_rows ? std::min(static_cast<double>(column_info.num_defaults) / total_rows, 1.0) : 0.0;
if (ratio > ratio_for_sparse_serialization)
column_info.kind = ISerialization::Kind::SPARSE;
}
return std::move(info);
}
SerializationInfoPtr SerializationInfoBuilder::buildFrom(const SerializationInfo & other) &&
{
for (const auto & [name, column_info] : other.columns)
{
auto it = info->columns.find(name);
if (it == info->columns.end())
info->columns[name] = column_info;
else
it->second.kind = column_info.kind;
}
return std::move(info);
}
ISerialization::Kind SerializationInfo::getKind(const String & column_name) const
{
auto it = columns.find(column_name);
if (it == columns.end())
return ISerialization::Kind::DEFAULT;
return it->second.kind;
}
size_t SerializationInfo::getNumberOfDefaultRows(const String & column_name) const
{
auto it = columns.find(column_name);
if (it == columns.end())
return 0;
return it->second.num_defaults;
extern const int CORRUPTED_DATA;
}
namespace
{
constexpr auto KEY_VERSION = "version";
constexpr auto KEY_NUMBER_OF_ROWS = "number_of_rows";
constexpr auto KEY_NUM_ROWS = "num_rows";
constexpr auto KEY_COLUMNS = "columns";
constexpr auto KEY_NUM_DEFAULTS = "num_defaults";
constexpr auto KEY_KIND = "kind";
@@ -131,68 +34,181 @@ constexpr auto KEY_NAME = "name";
}
void SerializationInfo::fromJSON(const String & json_str)
void SerializationInfo::Data::add(const IColumn & column)
{
size_t rows = column.size();
double ratio = column.getRatioOfDefaultRows(ColumnSparse::DEFAULT_ROWS_SEARCH_SAMPLE_RATIO);
num_rows += rows;
num_defaults += static_cast<size_t>(ratio * rows);
}
void SerializationInfo::Data::add(const Data & other)
{
num_rows += other.num_rows;
num_defaults += other.num_defaults;
}
SerializationInfo::SerializationInfo(ISerialization::Kind kind_, const Settings & settings_)
: settings(settings_)
, kind(kind_)
{
}
void SerializationInfo::add(const IColumn & column)
{
data.add(column);
if (settings.choose_kind)
kind = chooseKind(data, settings);
}
void SerializationInfo::add(const SerializationInfo & other)
{
data.add(other.data);
if (settings.choose_kind)
kind = chooseKind(data, settings);
}
void SerializationInfo::replaceData(const SerializationInfo & other)
{
data = other.data;
}
MutableSerializationInfoPtr SerializationInfo::clone() const
{
auto res = std::make_shared<SerializationInfo>(kind, settings);
res->data = data;
return res;
}
void SerializationInfo::serialializeKindBinary(WriteBuffer & out) const
{
writeBinary(static_cast<UInt8>(kind), out);
}
void SerializationInfo::deserializeFromKindsBinary(ReadBuffer & in)
{
UInt8 kind_num;
readBinary(kind_num, in);
auto maybe_kind = magic_enum::enum_cast<ISerialization::Kind>(kind_num);
if (!maybe_kind)
throw Exception(ErrorCodes::LOGICAL_ERROR, "Unknown serialization kind " + std::to_string(kind_num));
kind = *maybe_kind;
}
Poco::JSON::Object SerializationInfo::toJSON() const
{
Poco::JSON::Object object;
object.set(KEY_KIND, ISerialization::kindToString(kind));
object.set(KEY_NUM_DEFAULTS, data.num_defaults);
object.set(KEY_NUM_ROWS, data.num_rows);
return object;
}
void SerializationInfo::fromJSON(const Poco::JSON::Object & object)
{
if (!object.has(KEY_KIND) || !object.has(KEY_NUM_DEFAULTS) || !object.has(KEY_NUM_ROWS))
throw Exception(ErrorCodes::LOGICAL_ERROR,
"Missed field '{}' or '{}' or '{}' in SerializationInfo of columns",
KEY_KIND, KEY_NUM_DEFAULTS, KEY_NUM_ROWS);
data.num_rows = object.getValue<size_t>(KEY_NUM_ROWS);
data.num_defaults = object.getValue<size_t>(KEY_NUM_DEFAULTS);
kind = ISerialization::stringToKind(object.getValue<String>(KEY_KIND));
}
ISerialization::Kind SerializationInfo::chooseKind(const Data & data, const Settings & settings)
{
double ratio = data.num_rows ? std::min(static_cast<double>(data.num_defaults) / data.num_rows, 1.0) : 0.0;
return ratio > settings.ratio_for_sparse ? ISerialization::Kind::SPARSE : ISerialization::Kind::DEFAULT;
}
SerializationInfoByName::SerializationInfoByName(
const NamesAndTypesList & columns,
const SerializationInfo::Settings & settings)
{
for (const auto & column : columns)
emplace(column.name, column.type->createSerializationInfo(settings));
}
void SerializationInfoByName::add(const Block & block)
{
for (const auto & column : block)
{
auto it = find(column.name);
if (it == end())
throw Exception(ErrorCodes::LOGICAL_ERROR,
"Not found column {} in serialization infos", column.name);
it->second->add(*column.column);
}
}
void SerializationInfoByName::add(const SerializationInfoByName & other)
{
for (const auto & [name, info] : other)
{
auto it = find(name);
if (it == end())
throw Exception(ErrorCodes::LOGICAL_ERROR,
"Not found column {} in serialization infos", name);
it->second->add(*info);
}
}
void SerializationInfoByName::writeText(WriteBuffer & out) const
{
Poco::JSON::Object object;
object.set(KEY_VERSION, SERIALIZATION_INFO_VERSION);
Poco::JSON::Array column_infos;
for (const auto & [name, info] : *this)
{
auto info_json = info->toJSON();
info_json.set(KEY_NAME, name);
column_infos.add(std::move(info_json));
}
object.set(KEY_COLUMNS, std::move(column_infos));
std::ostringstream oss; // STYLE_CHECK_ALLOW_STD_STRING_STREAM
oss.exceptions(std::ios::failbit);
Poco::JSON::Stringifier::stringify(object, oss);
writeString(oss.str(), out);
}
void SerializationInfoByName::readText(ReadBuffer & in)
{
String json_str;
readString(json_str, in);
Poco::JSON::Parser parser;
auto object = parser.parse(json_str).extract<Poco::JSON::Object::Ptr>();
if (object->has(KEY_NUMBER_OF_ROWS))
number_of_rows = object->getValue<size_t>(KEY_NUMBER_OF_ROWS);
if (object->has(KEY_COLUMNS))
{
auto array = object->getArray(KEY_COLUMNS);
for (const auto & elem : *array)
{
auto elem_object = elem.extract<Poco::JSON::Object::Ptr>();
if (!elem_object->has(KEY_NAME) || !elem_object->has(KEY_NUM_DEFAULTS) || !elem_object->has(KEY_KIND))
throw Exception(ErrorCodes::LOGICAL_ERROR,
"Missed field '{}' or '{}' or '{}' in SerializationInfo of columns",
KEY_NAME, KEY_NUM_DEFAULTS, KEY_KIND);
if (!elem_object->has(KEY_NAME))
throw Exception(ErrorCodes::CORRUPTED_DATA,
"Missed field '{}' in SerializationInfo of columns", KEY_NAME);
auto name = elem_object->getValue<String>(KEY_NAME);
auto kind = elem_object->getValue<String>(KEY_KIND);
auto num_defaults = elem_object->getValue<size_t>(KEY_NUM_DEFAULTS);
columns[name] = {ISerialization::stringToKind(kind), num_defaults};
auto it = find(name);
if (it == end())
throw Exception(ErrorCodes::CORRUPTED_DATA,
"There is not column {} in serialization infos", name);
it->second->fromJSON(*elem_object);
}
}
}
String SerializationInfo::toJSON() const
{
Poco::JSON::Object info;
info.set(KEY_VERSION, version);
info.set(KEY_NUMBER_OF_ROWS, number_of_rows);
Poco::JSON::Array column_infos;
for (const auto & [name, column_info] : columns)
{
Poco::JSON::Object column_info_json;
column_info_json.set(KEY_NAME, name);
column_info_json.set(KEY_KIND, ISerialization::kindToString(column_info.kind));
column_info_json.set(KEY_NUM_DEFAULTS, column_info.num_defaults);
column_infos.add(std::move(column_info_json));
}
info.set(KEY_COLUMNS, std::move(column_infos));
std::ostringstream oss; // STYLE_CHECK_ALLOW_STD_STRING_STREAM
oss.exceptions(std::ios::failbit);
Poco::JSON::Stringifier::stringify(info, oss);
return oss.str();
}
void SerializationInfo::readText(ReadBuffer & in)
{
String json_str;
readString(json_str, in);
fromJSON(json_str);
}
void SerializationInfo::writeText(WriteBuffer & out) const
{
writeString(toJSON(), out);
}
}
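
Putting the keys above together (KEY_VERSION, KEY_COLUMNS, KEY_NAME, KEY_KIND, KEY_NUM_DEFAULTS, KEY_NUM_ROWS), SerializationInfoByName::writeText emits a single JSON object. A hypothetical example for two columns, assuming ISerialization::kindToString renders the kinds as "Default" and "Sparse" (all values invented for illustration):

{"version":1,"columns":[
    {"name":"id","kind":"Default","num_defaults":0,"num_rows":1000},
    {"name":"value","kind":"Sparse","num_defaults":980,"num_rows":1000}]}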

src/DataTypes/Serializations/SerializationInfo.h

@@ -1,13 +1,21 @@
#pragma once
#include <Core/Block.h>
#include <Core/Types.h>
#include <DataTypes/Serializations/ISerialization.h>
#include <Columns/ColumnSparse.h>
#include <Poco/JSON/Object.h>
namespace DB
{
/** Contains information about kinds of serialization of columns.
class ReadBuffer;
class WriteBuffer;
class NamesAndTypesList;
class Block;
constexpr auto SERIALIZATION_INFO_VERSION = 1;
/** Contains information about kind of serialization of column and its subcolumns.
* Also contains information about content of columns,
* that helps to choose kind of serialization of column.
*
@@ -19,72 +27,70 @@ namespace DB
class SerializationInfo
{
public:
SerializationInfo() = default;
static constexpr auto version = 1;
size_t getNumberOfDefaultRows(const String & column_name) const;
ISerialization::Kind getKind(const String & column_name) const;
bool empty() const { return !number_of_rows && columns.empty(); }
size_t getNumberOfRows() const { return number_of_rows; }
void readText(ReadBuffer & in);
void writeText(WriteBuffer & out) const;
private:
void fromJSON(const String & json_str);
String toJSON() const;
/// Information about one column.
/// Can be extended, when new kinds of serialization will be implemented.
struct Column
struct Data
{
ISerialization::Kind kind = ISerialization::Kind::DEFAULT;
size_t num_rows = 0;
size_t num_defaults = 0;
void add(const IColumn & column);
void add(const Data & other);
};
using NameToColumn = std::unordered_map<String, Column>;
struct Settings
{
const double ratio_for_sparse = 1.0;
const bool choose_kind = false;
};
size_t number_of_rows = 0;
NameToColumn columns;
SerializationInfo(ISerialization::Kind kind_, const Settings & settings_);
friend class SerializationInfoBuilder;
virtual ~SerializationInfo() = default;
virtual bool hasCustomSerialization() const { return kind != ISerialization::Kind::DEFAULT; }
virtual void add(const IColumn & column);
virtual void add(const SerializationInfo & other);
virtual void replaceData(const SerializationInfo & other);
virtual std::shared_ptr<SerializationInfo> clone() const;
virtual void serialializeKindBinary(WriteBuffer & out) const;
virtual void deserializeFromKindsBinary(ReadBuffer & in);
virtual Poco::JSON::Object toJSON() const;
virtual void fromJSON(const Poco::JSON::Object & object);
const Settings & getSettings() const { return settings; }
const Data & getData() const { return data; }
ISerialization::Kind getKind() const { return kind; }
static ISerialization::Kind chooseKind(const Data & data, const Settings & settings);
protected:
const Settings settings;
ISerialization::Kind kind;
Data data;
};
using SerializationInfoPtr = std::shared_ptr<SerializationInfo>;
using SerializationInfoPtr = std::shared_ptr<const SerializationInfo>;
using MutableSerializationInfoPtr = std::shared_ptr<SerializationInfo>;
/// Builder, that helps to create SerializationInfo.
class SerializationInfoBuilder
using SerializationInfos = std::vector<SerializationInfoPtr>;
using MutableSerializationInfos = std::vector<MutableSerializationInfoPtr>;
class SerializationInfoByName : public std::unordered_map<String, MutableSerializationInfoPtr>
{
public:
SerializationInfoBuilder();
SerializationInfoBuilder(
double ratio_for_sparse_serialization_,
double default_rows_search_sample_ratio_ = ColumnSparse::DEFAULT_ROWS_SEARCH_SAMPLE_RATIO);
SerializationInfoByName() = default;
SerializationInfoByName(
const NamesAndTypesList & columns,
const SerializationInfo::Settings & settings);
/// Add information about column from block.
void add(const Block & block);
void add(const SerializationInfoByName & other);
/// Add information about column from other SerializationInfo.
void add(const SerializationInfo & other);
/// Choose kind of serialization for every column
/// according its content and return finalized SerializationInfo.
SerializationInfoPtr build() &&;
/// Create SerializationInfo from other.
/// Respects kinds of serialization for columns, that exist in other SerializationInfo,
/// but keeps information about content of column from current SerializationInfo.
SerializationInfoPtr buildFrom(const SerializationInfo & other) &&;
double getRatioForSparseSerialization() const { return ratio_for_sparse_serialization; }
bool canHaveSparseSerialization() const { return ratio_for_sparse_serialization < 1.0; }
private:
double ratio_for_sparse_serialization;
double default_rows_search_sample_ratio;
SerializationInfoPtr info;
void writeText(WriteBuffer & out) const;
void readText(ReadBuffer & in);
};
}
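
Taken together, SerializationInfoByName replaces the removed SerializationInfoBuilder: infos are created per column from the type, statistics are accumulated block by block, and each type then picks a serialization from its info. A usage sketch under the new API (columns_list and block are assumed to exist; not actual ClickHouse code):

SerializationInfo::Settings settings
{
    .ratio_for_sparse = 0.95,
    .choose_kind = true,
};

SerializationInfoByName infos(columns_list, settings);
infos.add(block); /// accumulates num_rows / num_defaults per column

for (const auto & column : columns_list)
{
    /// Kind::SPARSE was chosen iff defaults / rows exceeded ratio_for_sparse.
    auto serialization = column.type->getSerialization(*infos.at(column.name));
}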

src/DataTypes/Serializations/SerializationInfoTuple.cpp

@@ -0,0 +1,116 @@
#include <DataTypes/Serializations/SerializationInfoTuple.h>
#include <DataTypes/DataTypeTuple.h>
#include <Columns/ColumnTuple.h>
#include <Common/assert_cast.h>
namespace DB
{
namespace ErrorCodes
{
extern const int LOGICAL_ERROR;
extern const int CORRUPTED_DATA;
extern const int THERE_IS_NO_COLUMN;
}
SerializationInfoTuple::SerializationInfoTuple(
MutableSerializationInfos elems_, const Settings & settings_)
: SerializationInfo(ISerialization::Kind::DEFAULT, settings_)
, elems(std::move(elems_))
{
}
bool SerializationInfoTuple::hasCustomSerialization() const
{
return std::any_of(elems.begin(), elems.end(), [](const auto & elem) { return elem->hasCustomSerialization(); });
}
void SerializationInfoTuple::add(const IColumn & column)
{
SerializationInfo::add(column);
const auto & column_tuple = assert_cast<const ColumnTuple &>(column);
const auto & right_elems = column_tuple.getColumns();
assert(elems.size() == right_elems.size());
for (size_t i = 0; i < elems.size(); ++i)
elems[i]->add(*right_elems[i]);
}
void SerializationInfoTuple::add(const SerializationInfo & other)
{
SerializationInfo::add(other);
const auto & info_tuple = assert_cast<const SerializationInfoTuple &>(other);
assert(elems.size() == info_tuple.elems.size());
for (size_t i = 0; i < elems.size(); ++i)
elems[i]->add(*info_tuple.elems[i]);
}
void SerializationInfoTuple::replaceData(const SerializationInfo & other)
{
SerializationInfo::replaceData(other);
const auto & info_tuple = assert_cast<const SerializationInfoTuple &>(other);
assert(elems.size() == info_tuple.elems.size());
for (size_t i = 0; i < elems.size(); ++i)
elems[i]->replaceData(*info_tuple.elems[i]);
}
MutableSerializationInfoPtr SerializationInfoTuple::clone() const
{
MutableSerializationInfos elems_cloned;
elems_cloned.reserve(elems.size());
for (const auto & elem : elems)
elems_cloned.push_back(elem->clone());
return std::make_shared<SerializationInfoTuple>(std::move(elems_cloned), settings);
}
void SerializationInfoTuple::serialializeKindBinary(WriteBuffer & out) const
{
SerializationInfo::serialializeKindBinary(out);
for (const auto & elem : elems)
elem->serialializeKindBinary(out);
}
void SerializationInfoTuple::deserializeFromKindsBinary(ReadBuffer & in)
{
SerializationInfo::deserializeFromKindsBinary(in);
for (const auto & elem : elems)
elem->deserializeFromKindsBinary(in);
}
Poco::JSON::Object SerializationInfoTuple::toJSON() const
{
auto object = SerializationInfo::toJSON();
Poco::JSON::Array subcolumns;
for (const auto & elem : elems)
subcolumns.add(elem->toJSON());
object.set("subcolumns", std::move(subcolumns));
return object;
}
void SerializationInfoTuple::fromJSON(const Poco::JSON::Object & object)
{
SerializationInfo::fromJSON(object);
if (!object.has("subcolumns"))
throw Exception(ErrorCodes::CORRUPTED_DATA,
"Missing field 'subcolumns' in JSON of SerializationInfoTuple");
auto subcolumns = object.getArray("subcolumns");
if (elems.size() != subcolumns->size())
throw Exception(ErrorCodes::THERE_IS_NO_COLUMN,
"Mismatched number of subcolumns beetween JSON and SerializationInfoTuple."
"Expected: {}, got: {}", elems.size(), subcolumns->size());
for (size_t i = 0; i < elems.size(); ++i)
elems[i]->fromJSON(*subcolumns->getObject(i));
}
}
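
For tuples, toJSON above nests one object per element under the "subcolumns" array, and fromJSON checks that the array length matches the element count. A hypothetical serialized info for a Tuple(UInt64, String) column (values invented):

{"kind":"Default","num_defaults":0,"num_rows":100,
 "subcolumns":[
    {"kind":"Sparse","num_defaults":99,"num_rows":100},
    {"kind":"Default","num_defaults":0,"num_rows":100}]}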

src/DataTypes/Serializations/SerializationInfoTuple.h

@@ -0,0 +1,31 @@
#pragma once
#include <DataTypes/Serializations/SerializationInfo.h>
namespace DB
{
class SerializationInfoTuple : public SerializationInfo
{
public:
SerializationInfoTuple(MutableSerializationInfos elems_, const Settings & settings_);
bool hasCustomSerialization() const override;
void add(const IColumn & column) override;
void add(const SerializationInfo & other) override;
void replaceData(const SerializationInfo & other) override;
MutableSerializationInfoPtr clone() const override;
void serialializeKindBinary(WriteBuffer & out) const override;
void deserializeFromKindsBinary(ReadBuffer & in) override;
Poco::JSON::Object toJSON() const override;
void fromJSON(const Poco::JSON::Object & object) override;
MutableSerializationInfoPtr getElementInfo(size_t i) const { return elems[i]; }
ISerialization::Kind getElementKind(size_t i) const { return elems[i]->getKind(); }
private:
MutableSerializationInfos elems;
};
}

src/DataTypes/Serializations/SerializationLowCardinality.cpp

@@ -43,23 +43,23 @@ SerializationLowCardinality::SerializationLowCardinality(const DataTypePtr & dic
void SerializationLowCardinality::enumerateStreams(
SubstreamPath & path,
const StreamCallback & callback,
DataTypePtr type,
ColumnPtr column) const
const SubstreamData & data) const
{
const auto * column_lc = column ? &getColumnLowCardinality(*column) : nullptr;
SubstreamData data;
data.type = type ? dictionary_type : nullptr;
data.column = column_lc ? column_lc->getDictionary().getNestedColumn() : nullptr;
data.serialization = dict_inner_serialization;
const auto * column_lc = data.column ? &getColumnLowCardinality(*data.column) : nullptr;
path.push_back(Substream::DictionaryKeys);
path.back().data = data;
path.back().data =
{
dict_inner_serialization,
data.type ? dictionary_type : nullptr,
column_lc ? column_lc->getDictionary().getNestedColumn() : nullptr,
data.serialization_info,
};
dict_inner_serialization->enumerateStreams(path, callback, data.type, data.column);
dict_inner_serialization->enumerateStreams(path, callback, path.back().data);
path.back() = Substream::DictionaryIndexes;
path.back().data = {type, column, getPtr(), nullptr};
path.back().data = data;
callback(path);
path.pop_back();

src/DataTypes/Serializations/SerializationLowCardinality.h

@@ -20,8 +20,7 @@ public:
void enumerateStreams(
SubstreamPath & path,
const StreamCallback & callback,
DataTypePtr type,
ColumnPtr column) const override;
const SubstreamData & data) const override;
void serializeBinaryBulkStatePrefix(
SerializeBinaryBulkSettings & settings,

src/DataTypes/Serializations/SerializationMap.cpp

@@ -254,13 +254,17 @@ void SerializationMap::deserializeTextCSV(IColumn & column, ReadBuffer & istr, c
void SerializationMap::enumerateStreams(
SubstreamPath & path,
const StreamCallback & callback,
DataTypePtr type,
ColumnPtr column) const
const SubstreamData & data) const
{
auto next_type = type ? assert_cast<const DataTypeMap &>(*type).getNestedType() : nullptr;
auto next_column = column ? assert_cast<const ColumnMap &>(*column).getNestedColumnPtr() : nullptr;
SubstreamData next_data =
{
nested,
data.type ? assert_cast<const DataTypeMap &>(*data.type).getNestedType() : nullptr,
data.column ? assert_cast<const ColumnMap &>(*data.column).getNestedColumnPtr() : nullptr,
data.serialization_info,
};
nested->enumerateStreams(path, callback, next_type, next_column);
nested->enumerateStreams(path, callback, next_data);
}
void SerializationMap::serializeBinaryBulkStatePrefix(

src/DataTypes/Serializations/SerializationMap.h

@@ -34,8 +34,7 @@ public:
void enumerateStreams(
SubstreamPath & path,
const StreamCallback & callback,
DataTypePtr type,
ColumnPtr column) const override;
const SubstreamData & data) const override;
void serializeBinaryBulkStatePrefix(
SerializeBinaryBulkSettings & settings,

src/DataTypes/Serializations/SerializationNamed.cpp

@@ -6,12 +6,13 @@ namespace DB
void SerializationNamed::enumerateStreams(
SubstreamPath & path,
const StreamCallback & callback,
DataTypePtr type,
ColumnPtr column) const
const SubstreamData & data) const
{
addToPath(path);
path.back().data = {type, column, getPtr(), std::make_shared<SubcolumnCreator>(name, escape_delimiter)};
nested_serialization->enumerateStreams(path, callback, type, column);
path.back().data = data;
path.back().creator = std::make_shared<SubcolumnCreator>(name, escape_delimiter);
nested_serialization->enumerateStreams(path, callback, data);
path.pop_back();
}

src/DataTypes/Serializations/SerializationNamed.h

@@ -23,8 +23,7 @@ public:
void enumerateStreams(
SubstreamPath & path,
const StreamCallback & callback,
DataTypePtr type,
ColumnPtr column) const override;
const SubstreamData & data) const override;
void serializeBinaryBulkStatePrefix(
SerializeBinaryBulkSettings & settings,

src/DataTypes/Serializations/SerializationNullable.cpp

@@ -41,30 +41,35 @@ ColumnPtr SerializationNullable::SubcolumnCreator::create(const ColumnPtr & prev
void SerializationNullable::enumerateStreams(
SubstreamPath & path,
const StreamCallback & callback,
DataTypePtr type,
ColumnPtr column) const
const SubstreamData & data) const
{
const auto * type_nullable = type ? &assert_cast<const DataTypeNullable &>(*type) : nullptr;
const auto * column_nullable = column ? &assert_cast<const ColumnNullable &>(*column) : nullptr;
const auto * type_nullable = data.type ? &assert_cast<const DataTypeNullable &>(*data.type) : nullptr;
const auto * column_nullable = data.column ? &assert_cast<const ColumnNullable &>(*data.column) : nullptr;
path.push_back(Substream::NullMap);
path.back().data =
{
std::make_shared<SerializationNamed>(std::make_shared<SerializationNumber<UInt8>>(), "null", false),
type_nullable ? std::make_shared<DataTypeUInt8>() : nullptr,
column_nullable ? column_nullable->getNullMapColumnPtr() : nullptr,
std::make_shared<SerializationNamed>(std::make_shared<SerializationNumber<UInt8>>(), "null", false),
nullptr,
data.serialization_info,
};
callback(path);
path.back() = Substream::NullableElements;
path.back().data = {type, column, getPtr(), std::make_shared<SubcolumnCreator>(path.back().data.column)};
path.back().creator = std::make_shared<SubcolumnCreator>(path.back().data.column);
path.back().data = data;
auto next_type = type_nullable ? type_nullable->getNestedType() : nullptr;
auto next_column = column_nullable ? column_nullable->getNestedColumnPtr() : nullptr;
SubstreamData next_data =
{
nested,
type_nullable ? type_nullable->getNestedType() : nullptr,
column_nullable ? column_nullable->getNestedColumnPtr() : nullptr,
data.serialization_info,
};
nested->enumerateStreams(path, callback, next_type, next_column);
nested->enumerateStreams(path, callback, next_data);
path.pop_back();
}

src/DataTypes/Serializations/SerializationNullable.h

@@ -16,8 +16,7 @@ public:
void enumerateStreams(
SubstreamPath & path,
const StreamCallback & callback,
DataTypePtr type,
ColumnPtr column) const override;
const SubstreamData & data) const override;
void serializeBinaryBulkStatePrefix(
SerializeBinaryBulkSettings & settings,

src/DataTypes/Serializations/SerializationSparse.cpp

@@ -150,27 +150,36 @@ ColumnPtr SerializationSparse::SubcolumnCreator::create(const ColumnPtr & prev)
void SerializationSparse::enumerateStreams(
SubstreamPath & path,
const StreamCallback & callback,
DataTypePtr type,
ColumnPtr column) const
const SubstreamData & data) const
{
const auto * column_sparse = column ? &assert_cast<const ColumnSparse &>(*column) : nullptr;
const auto * column_sparse = data.column ? &assert_cast<const ColumnSparse &>(*data.column) : nullptr;
SubstreamData data;
data.type = type ? std::make_shared<DataTypeUInt64>() : nullptr;
data.serialization = std::make_shared<SerializationNumber<UInt64>>();
data.column = column_sparse ? column_sparse->getOffsetsPtr() : nullptr;
size_t column_size = column_sparse ? column_sparse->size() : 0;
path.push_back(Substream::SparseOffsets);
path.back().data = data;
path.back().data =
{
std::make_shared<SerializationNumber<UInt64>>(),
data.type ? std::make_shared<DataTypeUInt64>() : nullptr,
column_sparse ? column_sparse->getOffsetsPtr() : nullptr,
data.serialization_info,
};
callback(path);
path.back() = Substream::SparseElements;
path.back().data = {type, column, getPtr(), std::make_shared<SubcolumnCreator>(data.column, column_size)};
path.back().creator = std::make_shared<SubcolumnCreator>(path.back().data.column, column_size);
path.back().data = data;
auto next_column = column_sparse ? column_sparse->getValuesPtr() : nullptr;
nested->enumerateStreams(path, callback, type, next_column);
SubstreamData next_data =
{
nested,
data.type,
column_sparse ? column_sparse->getValuesPtr() : nullptr,
data.serialization_info,
};
nested->enumerateStreams(path, callback, next_data);
path.pop_back();
}

src/DataTypes/Serializations/SerializationSparse.h

@@ -30,8 +30,7 @@ public:
virtual void enumerateStreams(
SubstreamPath & path,
const StreamCallback & callback,
DataTypePtr type,
ColumnPtr column) const override;
const SubstreamData & data) const override;
void serializeBinaryBulkStatePrefix(
SerializeBinaryBulkSettings & settings,

src/DataTypes/Serializations/SerializationTuple.cpp

@@ -1,6 +1,7 @@
#include <base/map.h>
#include <base/range.h>
#include <DataTypes/Serializations/SerializationTuple.h>
#include <DataTypes/Serializations/SerializationInfoTuple.h>
#include <DataTypes/DataTypeTuple.h>
#include <Core/Field.h>
#include <Columns/ColumnTuple.h>
@@ -284,18 +285,23 @@ void SerializationTuple::deserializeTextCSV(IColumn & column, ReadBuffer & istr,
void SerializationTuple::enumerateStreams(
SubstreamPath & path,
const StreamCallback & callback,
DataTypePtr type,
ColumnPtr column) const
const SubstreamData & data) const
{
const auto * type_tuple = type ? &assert_cast<const DataTypeTuple &>(*type) : nullptr;
const auto * column_tuple = column ? &assert_cast<const ColumnTuple &>(*column) : nullptr;
const auto * type_tuple = data.type ? &assert_cast<const DataTypeTuple &>(*data.type) : nullptr;
const auto * column_tuple = data.column ? &assert_cast<const ColumnTuple &>(*data.column) : nullptr;
const auto * info_tuple = data.serialization_info ? &assert_cast<const SerializationInfoTuple &>(*data.serialization_info) : nullptr;
for (size_t i = 0; i < elems.size(); ++i)
{
auto next_type = type_tuple ? type_tuple->getElement(i) : nullptr;
auto next_column = column_tuple ? column_tuple->getColumnPtr(i) : nullptr;
SubstreamData next_data =
{
elems[i],
type_tuple ? type_tuple->getElement(i) : nullptr,
column_tuple ? column_tuple->getColumnPtr(i) : nullptr,
info_tuple ? info_tuple->getElementInfo(i) : nullptr,
};
elems[i]->enumerateStreams(path, callback, next_type, next_column);
elems[i]->enumerateStreams(path, callback, next_data);
}
}

src/DataTypes/Serializations/SerializationTuple.h

@@ -36,8 +36,7 @@ public:
void enumerateStreams(
SubstreamPath & path,
const StreamCallback & callback,
DataTypePtr type,
ColumnPtr column) const override;
const SubstreamData & data) const override;
void serializeBinaryBulkStatePrefix(
SerializeBinaryBulkSettings & settings,

src/DataTypes/Serializations/SerializationWrapper.cpp

@@ -7,10 +7,9 @@ namespace DB
void SerializationWrapper::enumerateStreams(
SubstreamPath & path,
const StreamCallback & callback,
DataTypePtr type,
ColumnPtr column) const
const SubstreamData & data) const
{
nested_serialization->enumerateStreams(path, callback, type, column);
nested_serialization->enumerateStreams(path, callback, data);
}
void SerializationWrapper::serializeBinaryBulkStatePrefix(

src/DataTypes/Serializations/SerializationWrapper.h

@@ -23,8 +23,7 @@ public:
void enumerateStreams(
SubstreamPath & path,
const StreamCallback & callback,
DataTypePtr type,
ColumnPtr column) const override;
const SubstreamData & data) const override;
void serializeBinaryBulkStatePrefix(
SerializeBinaryBulkSettings & settings,

src/Formats/NativeReader.cpp

@@ -149,16 +149,14 @@ Block NativeReader::read()
SerializationPtr serialization;
if (server_revision >= DBMS_MIN_REVISION_WITH_CUSTOM_SERIALIZATION)
{
serialization = column.type->getSerialization(column.name, [&](const String & /*name*/)
{
UInt8 kind_num;
readBinary(kind_num, istr);
auto kind = magic_enum::enum_cast<ISerialization::Kind>(kind_num);
if (!kind)
throw Exception(ErrorCodes::LOGICAL_ERROR, "Unknown serialization kind " + std::to_string(kind_num));
auto info = column.type->createSerializationInfo({});
return *kind;
});
UInt8 has_custom;
readBinary(has_custom, istr);
if (has_custom)
info->deserializeFromKindsBinary(istr);
serialization = column.type->getSerialization(*info);
}
else
{

src/Formats/NativeWriter.cpp

@@ -4,6 +4,7 @@
#include <IO/WriteHelpers.h>
#include <IO/VarInt.h>
#include <Compression/CompressedWriteBuffer.h>
#include <DataTypes/Serializations/SerializationInfo.h>
#include <Formats/MarkInCompressedFile.h>
#include <Formats/NativeWriter.h>
@@ -125,18 +126,13 @@ void NativeWriter::write(const Block & block)
SerializationPtr serialization;
if (client_revision >= DBMS_MIN_REVISION_WITH_CUSTOM_SERIALIZATION)
{
serialization = column.type->getSerialization(column.name, [&](const String & name)
{
auto split = Nested::splitName(name);
ISerialization::Kind kind;
if (!split.second.empty() && column.type->tryGetSubcolumnType(split.second))
kind = ISerialization::getKind(*column.type->getSubcolumn(split.second, column.column));
else
kind = ISerialization::getKind(*column.column);
auto info = column.column->getSerializationInfo();
serialization = column.type->getSerialization(*info);
writeBinary(static_cast<UInt8>(kind), ostr);
return kind;
});
bool has_custom = info->hasCustomSerialization();
writeBinary(static_cast<UInt8>(has_custom), ostr);
if (has_custom)
info->serialializeKindBinary(ostr);
}
else
{
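
The wire-format change here pairs with the NativeReader hunk above: instead of unconditionally writing one serialization-kind byte per column, the writer now emits a single has_custom flag and the recursive per-node kind bytes only when some (sub)column uses a non-default serialization. Schematically (a sketch of the framing, not a formal spec):

UInt8 has_custom           -- always written per column
if has_custom == 1:
    UInt8 kind             -- one byte per SerializationInfo node;
    ...                    -- SerializationInfoTuple recurses into its elements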

src/Interpreters/InterpreterDescribeQuery.cpp

@@ -150,7 +150,7 @@ BlockIO InterpreterDescribeQuery::execute()
res_columns[6]->insertDefault();
res_columns[7]->insert(1u);
}, column.type->getDefaultSerialization(), column.type, nullptr);
}, {column.type->getDefaultSerialization(), column.type, nullptr, nullptr});
}
}

src/Processors/Formats/Impl/PrettyBlockOutputFormat.cpp

@@ -160,7 +160,7 @@ void PrettyBlockOutputFormat::write(const Chunk & chunk, PortKind port_kind)
Serializations serializations(num_columns);
for (size_t i = 0; i < num_columns; ++i)
serializations[i] = header.getByPosition(i).type->getSerialization(*columns[i]);
serializations[i] = header.getByPosition(i).type->getSerialization(*columns[i]->getSerializationInfo());
WidthsPerColumn widths;
Widths max_widths;

src/Processors/Formats/Impl/PrettySpaceBlockOutputFormat.cpp

@@ -26,7 +26,7 @@ void PrettySpaceBlockOutputFormat::write(const Chunk & chunk, PortKind port_kind
Serializations serializations(num_columns);
for (size_t i = 0; i < num_columns; ++i)
serializations[i] = header.getByPosition(i).type->getSerialization(*columns[i]);
serializations[i] = header.getByPosition(i).type->getSerialization(*columns[i]->getSerializationInfo());
WidthsPerColumn widths;
Widths max_widths;

src/Storages/MergeTree/DataPartsExchange.cpp

@@ -588,8 +588,7 @@ MergeTreeData::MutableDataPartPtr Fetcher::downloadPartToMemory(
metadata_snapshot->projections.get(projection_name).metadata,
block.getNamesAndTypesList(),
{},
CompressionCodecFactory::instance().get("NONE", {}),
new_data_part->serialization_info);
CompressionCodecFactory::instance().get("NONE", {}));
part_out.write(block);
part_out.writeSuffixAndFinalizePart(new_projection_part);
@@ -613,8 +612,7 @@ MergeTreeData::MutableDataPartPtr Fetcher::downloadPartToMemory(
MergedBlockOutputStream part_out(
new_data_part, metadata_snapshot, block.getNamesAndTypesList(), {},
CompressionCodecFactory::instance().get("NONE", {}),
new_data_part->serialization_info);
CompressionCodecFactory::instance().get("NONE", {}));
part_out.write(block);
part_out.writeSuffixAndFinalizePart(new_data_part);

src/Storages/MergeTree/IMergeTreeDataPart.cpp

@@ -281,7 +281,6 @@ IMergeTreeDataPart::IMergeTreeDataPart(
, volume(parent_part_ ? parent_part_->volume : volume_)
, relative_path(relative_path_.value_or(name_))
, index_granularity_info(storage_, part_type_)
, serialization_info(std::make_shared<SerializationInfo>())
, part_type(part_type_)
, parent_part(parent_part_)
{
@@ -307,7 +306,6 @@ IMergeTreeDataPart::IMergeTreeDataPart(
, volume(parent_part_ ? parent_part_->volume : volume_)
, relative_path(relative_path_.value_or(name_))
, index_granularity_info(storage_, part_type_)
, serialization_info(std::make_shared<SerializationInfo>())
, part_type(part_type_)
, parent_part(parent_part_)
{
@@ -408,17 +406,47 @@ std::pair<time_t, time_t> IMergeTreeDataPart::getMinMaxTime() const
}
void IMergeTreeDataPart::setColumns(const NamesAndTypesList & new_columns)
void IMergeTreeDataPart::setColumns(const NamesAndTypesList & new_columns, const SerializationInfoByName & new_infos)
{
columns = new_columns;
column_name_to_position.clear();
column_name_to_position.reserve(new_columns.size());
size_t pos = 0;
for (const auto & column : columns)
{
auto & serialization = serializations[column.name];
column_name_to_position.emplace(column.name, pos);
for (const auto & subcolumn : column.type->getSubcolumnNames())
column_name_to_position.emplace(Nested::concatenateName(column.name, subcolumn), pos);
auto it = new_infos.find(column.name);
if (it != new_infos.end())
{
auto & old_info = serialization_infos[column.name];
const auto & new_info = it->second;
if (old_info)
{
old_info->replaceData(*new_info);
}
else
{
old_info = new_info->clone();
serialization = column.type->getSerialization(*old_info);
}
}
else
{
serialization = column.type->getDefaultSerialization();
}
IDataType::forEachSubcolumn([&](const auto &, const auto & subname, const auto & subdata)
{
auto subcolumn_name = Nested::concatenateName(column.name, subname);
column_name_to_position.emplace(subcolumn_name, pos);
serializations.emplace(subcolumn_name, subdata.serialization);
}, {serialization, column.type, nullptr, nullptr});
++pos;
}
}
@@ -589,7 +617,6 @@ void IMergeTreeDataPart::loadColumnsChecksumsIndexes(bool require_columns_checks
loadUUID();
loadColumns(require_columns_checksums);
loadChecksums(require_columns_checksums);
loadSerializationInfo();
loadIndexGranularity();
calculateColumnsAndSecondaryIndicesSizesOnDisk();
loadIndex(); /// Must be called after loadIndexGranularity as it uses the value of `index_granularity`
@@ -655,13 +682,13 @@ void IMergeTreeDataPart::loadIndex()
size_t marks_count = index_granularity.getMarksCount();
Serializations serializations(key_size);
Serializations key_serializations(key_size);
for (size_t j = 0; j < key_size; ++j)
serializations[j] = primary_key.data_types[j]->getDefaultSerialization();
key_serializations[j] = primary_key.data_types[j]->getDefaultSerialization();
for (size_t i = 0; i < marks_count; ++i) //-V756
for (size_t j = 0; j < key_size; ++j)
serializations[j]->deserializeBinary(*loaded_index[j], *index_file);
key_serializations[j]->deserializeBinary(*loaded_index[j], *index_file);
for (size_t i = 0; i < key_size; ++i)
{
@@ -752,9 +779,8 @@ CompressionCodecPtr IMergeTreeDataPart::detectDefaultCompressionCodec() const
auto column_size = getColumnSize(part_column.name, *part_column.type);
if (column_size.data_compressed != 0 && !storage_columns.hasCompressionCodec(part_column.name))
{
auto serialization = IDataType::getSerialization(part_column, *serialization_info);
String path_to_data_file;
serialization->enumerateStreams([&](const ISerialization::SubstreamPath & substream_path)
serializations.at(part_column.name)->enumerateStreams([&](const ISerialization::SubstreamPath & substream_path)
{
if (path_to_data_file.empty())
{
@ -888,7 +914,7 @@ void IMergeTreeDataPart::loadRowsCount()
/// Most trivial types
if (column.type->isValueRepresentedByNumber()
&& !column.type->haveSubtypes()
&& getSerializationForColumn(column)->getKind() == ISerialization::Kind::DEFAULT)
&& getSerialization(column.name)->getKind() == ISerialization::Kind::DEFAULT)
{
auto size = getColumnSize(column.name, *column.type);
@ -940,7 +966,7 @@ void IMergeTreeDataPart::loadRowsCount()
for (const NameAndTypePair & column : columns)
{
ColumnPtr column_col = column.type->createColumn(*getSerializationForColumn(column));
ColumnPtr column_col = column.type->createColumn(*serializations.at(column.name));
if (!column_col->isFixedAndContiguous() || column_col->lowCardinality())
continue;
@ -1014,16 +1040,6 @@ void IMergeTreeDataPart::loadUUID()
}
}
void IMergeTreeDataPart::loadSerializationInfo() const
{
String path = getFullRelativePath() + SERIALIZATION_FILE_NAME;
if (volume->getDisk()->exists(path))
{
auto in = openForReading(volume->getDisk(), path);
serialization_info->readText(*in);
}
}
void IMergeTreeDataPart::loadColumns(bool require)
{
String path = fs::path(getFullRelativePath()) / "columns.txt";
@ -1058,7 +1074,18 @@ void IMergeTreeDataPart::loadColumns(bool require)
loaded_columns.readText(*volume->getDisk()->readFile(path));
}
setColumns(loaded_columns);
SerializationInfo::Settings settings =
{
.ratio_for_sparse = storage.getSettings()->ratio_of_defaults_for_sparse_serialization,
.choose_kind = false,
};
SerializationInfoByName infos(loaded_columns, settings);
path = getFullRelativePath() + SERIALIZATION_FILE_NAME;
if (volume->getDisk()->exists(path))
infos.readText(*volume->getDisk()->readFile(path));
setColumns(loaded_columns, infos);
}
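Note the asymmetry in SerializationInfo::Settings between the read and write paths: when loading, choose_kind is false because the kinds recorded on disk are authoritative; when writing a new part (see MergeTreeDataWriter below), choose_kind is true and the infos sample the data to pick a kind. A side-by-side sketch, assuming only the two Settings constructions that appear in this diff:

    // Read path (this function): trust the kinds stored in the part.
    SerializationInfo::Settings read_settings
    {
        .ratio_for_sparse = storage.getSettings()->ratio_of_defaults_for_sparse_serialization,
        .choose_kind = false,
    };

    // Write path: decide kinds from the blocks as they are written.
    SerializationInfo::Settings write_settings
    {
        .ratio_for_sparse = storage.getSettings()->ratio_of_defaults_for_sparse_serialization,
        .choose_kind = true,
    };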
bool IMergeTreeDataPart::shallParticipateInMerges(const StoragePolicyPtr & storage_policy) const
@ -1563,11 +1590,6 @@ bool IMergeTreeDataPart::checkAllTTLCalculated(const StorageMetadataPtr & metada
return true;
}
SerializationPtr IMergeTreeDataPart::getSerializationForColumn(const NameAndTypePair & column) const
{
return IDataType::getSerialization(column, *serialization_info);
}
String IMergeTreeDataPart::getUniqueId() const
{
auto disk = volume->getDisk();

View File

@ -93,7 +93,6 @@ public:
const StorageMetadataPtr & metadata_snapshot,
const std::vector<MergeTreeIndexPtr> & indices_to_recalc,
const CompressionCodecPtr & default_codec_,
const SerializationInfoPtr & serialization_info_,
const MergeTreeWriterSettings & writer_settings,
const MergeTreeIndexGranularity & computed_index_granularity = {}) const = 0;
@ -127,9 +126,12 @@ public:
String getTypeName() const { return getType().toString(); }
void setColumns(const NamesAndTypesList & new_columns);
void setColumns(const NamesAndTypesList & new_columns, const SerializationInfoByName & new_infos = {});
const NamesAndTypesList & getColumns() const { return columns; }
const SerializationInfoByName & getSerializationInfos() const { return serialization_infos; }
const SerializationByName & getSerializations() const { return serializations; }
const SerializationPtr & getSerialization(const String & column_name) const { return serializations.at(column_name); }
/// Throws an exception if part is not stored in on-disk format.
void assertOnDisk() const;
@ -190,9 +192,6 @@ public:
mutable String relative_path;
MergeTreeIndexGranularityInfo index_granularity_info;
/// TODO: add comment
SerializationInfoPtr serialization_info;
size_t rows_count = 0;
time_t modification_time = 0;
@ -399,8 +398,8 @@ public:
/// part creation (using alter query with materialize_ttl setting).
bool checkAllTTLCalculated(const StorageMetadataPtr & metadata_snapshot) const;
/// Returns serialization for column according to serialization_info.
SerializationPtr getSerializationForColumn(const NameAndTypePair & column) const;
/// Return some uniq string for file
/// Required for distinguish different copies of the same part on S3
@ -424,6 +423,11 @@ protected:
/// Columns description. Cannot be changed, after part initialization.
NamesAndTypesList columns;
SerializationInfoByName serialization_infos;
SerializationByName serializations;
const Type part_type;
/// Not null when it's a projection part.
@ -470,8 +474,6 @@ private:
/// Loads ttl infos in json format from file ttl.txt. If file doesn't exists assigns ttl infos with all zeros
void loadTTLInfos();
void loadSerializationInfo() const;
void loadPartitionAndMinMaxIndex();
void calculateColumnsSizesOnDisk();

View File

@ -40,6 +40,7 @@ IMergeTreeReader::IMergeTreeReader(
, storage(data_part_->storage)
, metadata_snapshot(metadata_snapshot_)
, all_mark_ranges(all_mark_ranges_)
, serializations(data_part_->getSerializations())
, alter_conversions(storage.getAlterConversionsForPart(data_part))
{
if (isWidePart(data_part))

View File

@ -87,8 +87,7 @@ protected:
using ColumnPosition = std::optional<size_t>;
ColumnPosition findColumnForOffsets(const String & column_name) const;
using Serializations = std::map<std::string, SerializationPtr>;
Serializations serializations;
const SerializationByName & serializations;
friend class MergeTreeRangeReader::DelayedStream;

View File

@ -8,14 +8,16 @@ namespace DB
IMergedBlockOutputStream::IMergedBlockOutputStream(
const MergeTreeDataPartPtr & data_part,
const StorageMetadataPtr & metadata_snapshot_,
const SerializationInfoPtr & input_serialization_info_)
const NamesAndTypesList & columns_list,
bool reset_columns_)
: storage(data_part->storage)
, metadata_snapshot(metadata_snapshot_)
, volume(data_part->volume)
, part_path(data_part->isStoredOnDisk() ? data_part->getFullRelativePath() : "")
, input_serialization_info(input_serialization_info_)
, new_serialization_info(data_part->storage.getSettings()->ratio_of_defaults_for_sparse_serialization)
, reset_columns(reset_columns_)
{
if (reset_columns)
new_serialization_infos = SerializationInfoByName(columns_list, {});
}
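Note: reset_columns_ selects between two modes. When true, the stream starts from a fresh SerializationInfoByName built over columns_list and accumulates statistics from every written block; when false, it leaves the part's existing infos untouched. The accumulation itself is a one-liner in the write path (cf. MergedBlockOutputStream::writeImpl below):

    // Only gather statistics when this stream owns the part's column set;
    // otherwise the infos assigned via setColumns() are kept as-is.
    if (reset_columns)
        new_serialization_infos.add(block);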
NameSet IMergedBlockOutputStream::removeEmptyColumnsFromPart(
@ -32,18 +34,14 @@ NameSet IMergedBlockOutputStream::removeEmptyColumnsFromPart(
/// Collect counts for shared streams of different columns. As an example, Nested columns have shared stream with array sizes.
std::map<String, size_t> stream_counts;
std::unordered_map<String, SerializationPtr> serialziations;
for (const NameAndTypePair & column : columns)
const auto & serializations = data_part->getSerializations();
for (const auto & column : columns)
{
auto serialization = IDataType::getSerialization(column, *data_part->serialization_info);
serialization->enumerateStreams(
serializations.at(column.name)->enumerateStreams(
[&](const ISerialization::SubstreamPath & substream_path)
{
++stream_counts[ISerialization::getFileNameForStream(column, substream_path)];
});
serialziations[column.name] = std::move(serialization);
}
NameSet remove_files;
@ -65,7 +63,7 @@ NameSet IMergedBlockOutputStream::removeEmptyColumnsFromPart(
}
};
serialziations[column_name]->enumerateStreams(callback);
serializations.at(column_name)->enumerateStreams(callback);
}
/// Remove files on disk and checksums

View File

@ -14,7 +14,8 @@ public:
IMergedBlockOutputStream(
const MergeTreeDataPartPtr & data_part,
const StorageMetadataPtr & metadata_snapshot_,
const SerializationInfoPtr & input_serialization_info_);
const NamesAndTypesList & columns_list,
bool reset_columns_);
virtual ~IMergedBlockOutputStream() = default;
@ -46,8 +47,9 @@ protected:
String part_path;
IMergeTreeDataPart::MergeTreeWriterPtr writer;
SerializationInfoPtr input_serialization_info;
SerializationInfoBuilder new_serialization_info;
bool reset_columns = false;
SerializationInfoByName new_serialization_infos;
};
using IMergedBlockOutputStreamPtr = std::shared_ptr<IMergedBlockOutputStream>;

View File

@ -158,15 +158,19 @@ bool MergeTask::ExecuteAndFinalizeHorizontalPart::prepare()
global_ctx->parent_part);
global_ctx->new_data_part->uuid = global_ctx->future_part->uuid;
global_ctx->new_data_part->setColumns(global_ctx->storage_columns);
global_ctx->new_data_part->partition.assign(global_ctx->future_part->getPartition());
global_ctx->new_data_part->is_temp = global_ctx->parent_part == nullptr;
ctx->need_remove_expired_values = false;
ctx->force_ttl = false;
SerializationInfoBuilder serialization_info_builder(
global_ctx->data->getSettings()->ratio_of_defaults_for_sparse_serialization);
SerializationInfo::Settings info_settings =
{
.ratio_for_sparse = global_ctx->data->getSettings()->ratio_of_defaults_for_sparse_serialization,
.choose_kind = true,
};
SerializationInfoByName infos(global_ctx->storage_columns, info_settings);
for (const auto & part : global_ctx->future_part->parts)
{
@ -178,10 +182,10 @@ bool MergeTask::ExecuteAndFinalizeHorizontalPart::prepare()
ctx->force_ttl = true;
}
serialization_info_builder.add(*part->serialization_info);
infos.add(part->getSerializationInfos());
}
global_ctx->input_serialization_info = std::move(serialization_info_builder).build();
global_ctx->new_data_part->setColumns(global_ctx->storage_columns, infos);
const auto & local_part_min_ttl = global_ctx->new_data_part->ttl_infos.part_min_ttl;
if (local_part_min_ttl && local_part_min_ttl <= global_ctx->time_of_merge)
@ -256,7 +260,7 @@ bool MergeTask::ExecuteAndFinalizeHorizontalPart::prepare()
global_ctx->merging_columns,
MergeTreeIndexFactory::instance().getMany(global_ctx->metadata_snapshot->getSecondaryIndices()),
ctx->compression_codec,
global_ctx->input_serialization_info,
/*reset_columns=*/ true,
ctx->blocks_are_granules_size);
global_ctx->rows_written = 0;
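Note: merges follow an accumulate-then-assign pattern: counters from every source part are merged before kinds are chosen for the result, so a column may flip between sparse and default across a merge. A condensed sketch of the flow above, with `parts` standing in for global_ctx->future_part->parts:

    SerializationInfo::Settings info_settings
    {
        .ratio_for_sparse = global_ctx->data->getSettings()->ratio_of_defaults_for_sparse_serialization,
        .choose_kind = true,   // the merged part re-decides each column's kind
    };

    SerializationInfoByName infos(global_ctx->storage_columns, info_settings);
    for (const auto & part : parts)
        infos.add(part->getSerializationInfos());  // merge per-part counters

    global_ctx->new_data_part->setColumns(global_ctx->storage_columns, infos);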
@ -435,7 +439,6 @@ void MergeTask::VerticalMergeStage::prepareVerticalMergeForOneColumn() const
global_ctx->metadata_snapshot,
ctx->executor->getHeader(),
ctx->compression_codec,
global_ctx->input_serialization_info,
/// we don't need to recalc indices here
/// because all of them were already recalculated and written
/// as key part of vertical merge

View File

@ -157,7 +157,6 @@ private:
SyncGuardPtr sync_guard{nullptr};
MergeTreeData::MutableDataPartPtr new_data_part{nullptr};
SerializationInfoPtr input_serialization_info{nullptr};
size_t rows_written{0};
UInt64 watch_prev_elapsed{0};

View File

@ -59,7 +59,6 @@ IMergeTreeDataPart::MergeTreeWriterPtr MergeTreeDataPartCompact::getWriter(
const StorageMetadataPtr & metadata_snapshot,
const std::vector<MergeTreeIndexPtr> & indices_to_recalc,
const CompressionCodecPtr & default_codec_,
const SerializationInfoPtr & serialization_info_,
const MergeTreeWriterSettings & writer_settings,
const MergeTreeIndexGranularity & computed_index_granularity) const
{
@ -74,7 +73,7 @@ IMergeTreeDataPart::MergeTreeWriterPtr MergeTreeDataPartCompact::getWriter(
return std::make_unique<MergeTreeDataPartWriterCompact>(
shared_from_this(), ordered_columns_list, metadata_snapshot,
indices_to_recalc, index_granularity_info.marks_file_extension,
default_codec_, serialization_info_, writer_settings, computed_index_granularity);
default_codec_, writer_settings, computed_index_granularity);
}

View File

@ -51,7 +51,6 @@ public:
const StorageMetadataPtr & metadata_snapshot,
const std::vector<MergeTreeIndexPtr> & indices_to_recalc,
const CompressionCodecPtr & default_codec_,
const SerializationInfoPtr & serialization_info_,
const MergeTreeWriterSettings & writer_settings,
const MergeTreeIndexGranularity & computed_index_granularity) const override;

View File

@ -60,7 +60,6 @@ IMergeTreeDataPart::MergeTreeWriterPtr MergeTreeDataPartInMemory::getWriter(
const StorageMetadataPtr & metadata_snapshot,
const std::vector<MergeTreeIndexPtr> & /* indices_to_recalc */,
const CompressionCodecPtr & /* default_codec */,
const SerializationInfoPtr & /* serialization_info */,
const MergeTreeWriterSettings & writer_settings,
const MergeTreeIndexGranularity & /* computed_index_granularity */) const
{
@ -92,7 +91,7 @@ void MergeTreeDataPartInMemory::flushToDisk(const String & base_path, const Stri
auto compression_codec = storage.getContext()->chooseCompressionCodec(0, 0);
auto indices = MergeTreeIndexFactory::instance().getMany(metadata_snapshot->getSecondaryIndices());
MergedBlockOutputStream out(new_data_part, metadata_snapshot, columns, indices, compression_codec, new_data_part->serialization_info);
MergedBlockOutputStream out(new_data_part, metadata_snapshot, columns, indices, compression_codec);
out.write(block);
const auto & projections = metadata_snapshot->getProjections();
for (const auto & [projection_name, projection] : projection_parts)
@ -123,7 +122,7 @@ void MergeTreeDataPartInMemory::flushToDisk(const String & base_path, const Stri
auto projection_indices = MergeTreeIndexFactory::instance().getMany(desc.metadata->getSecondaryIndices());
MergedBlockOutputStream projection_out(
projection_data_part, desc.metadata, projection_part->columns, projection_indices,
projection_compression_codec, new_data_part->serialization_info);
projection_compression_codec);
projection_out.write(projection_part->block);
projection_out.writeSuffixAndFinalizePart(projection_data_part);

View File

@ -40,7 +40,6 @@ public:
const StorageMetadataPtr & metadata_snapshot,
const std::vector<MergeTreeIndexPtr> & indices_to_recalc,
const CompressionCodecPtr & default_codec_,
const SerializationInfoPtr & serialization_info,
const MergeTreeWriterSettings & writer_settings,
const MergeTreeIndexGranularity & computed_index_granularity) const override;

View File

@ -61,14 +61,13 @@ IMergeTreeDataPart::MergeTreeWriterPtr MergeTreeDataPartWide::getWriter(
const StorageMetadataPtr & metadata_snapshot,
const std::vector<MergeTreeIndexPtr> & indices_to_recalc,
const CompressionCodecPtr & default_codec_,
const SerializationInfoPtr & serialization_info_,
const MergeTreeWriterSettings & writer_settings,
const MergeTreeIndexGranularity & computed_index_granularity) const
{
return std::make_unique<MergeTreeDataPartWriterWide>(
shared_from_this(), columns_list, metadata_snapshot, indices_to_recalc,
index_granularity_info.marks_file_extension,
default_codec_, serialization_info_, writer_settings, computed_index_granularity);
default_codec_, writer_settings, computed_index_granularity);
}
@ -81,8 +80,7 @@ ColumnSize MergeTreeDataPartWide::getColumnSizeImpl(
if (checksums.empty())
return size;
auto serialization = getSerializationForColumn(column);
serialization->enumerateStreams([&](const ISerialization::SubstreamPath & substream_path)
serializations.at(column.name)->enumerateStreams([&](const ISerialization::SubstreamPath & substream_path)
{
String file_name = ISerialization::getFileNameForStream(column, substream_path);
@ -159,8 +157,7 @@ void MergeTreeDataPartWide::checkConsistency(bool require_part_metadata) const
{
for (const NameAndTypePair & name_type : columns)
{
auto serialization = getSerializationForColumn(name_type);
serialization->enumerateStreams([&](const ISerialization::SubstreamPath & substream_path)
serializations.at(name_type.name)->enumerateStreams([&](const ISerialization::SubstreamPath & substream_path)
{
String file_name = ISerialization::getFileNameForStream(name_type, substream_path);
String mrk_file_name = file_name + index_granularity_info.marks_file_extension;
@ -174,7 +171,6 @@ void MergeTreeDataPartWide::checkConsistency(bool require_part_metadata) const
});
}
}
}
else
{
@ -182,8 +178,7 @@ void MergeTreeDataPartWide::checkConsistency(bool require_part_metadata) const
std::optional<UInt64> marks_size;
for (const NameAndTypePair & name_type : columns)
{
auto serialization = IDataType::getSerialization(name_type, *serialization_info);
serialization->enumerateStreams([&](const ISerialization::SubstreamPath & substream_path)
serializations.at(name_type.name)->enumerateStreams([&](const ISerialization::SubstreamPath & substream_path)
{
auto file_path = path + ISerialization::getFileNameForStream(name_type, substream_path) + index_granularity_info.marks_file_extension;
@ -218,7 +213,7 @@ bool MergeTreeDataPartWide::hasColumnFiles(const NameAndTypePair & column) const
};
bool res = true;
auto serialization = IDataType::getSerialization(column, *serialization_info);
auto serialization = getSerialization(column.name);
serialization->enumerateStreams([&](const ISerialization::SubstreamPath & substream_path)
{
String file_name = ISerialization::getFileNameForStream(column, substream_path);
@ -232,8 +227,7 @@ bool MergeTreeDataPartWide::hasColumnFiles(const NameAndTypePair & column) const
String MergeTreeDataPartWide::getFileNameForColumn(const NameAndTypePair & column) const
{
String filename;
auto serialization = getSerializationForColumn(column);
serialization->enumerateStreams([&](const ISerialization::SubstreamPath & substream_path)
serializations.at(column.name)->enumerateStreams([&](const ISerialization::SubstreamPath & substream_path)
{
if (filename.empty())
filename = ISerialization::getFileNameForStream(column, substream_path);
@ -255,7 +249,7 @@ void MergeTreeDataPartWide::calculateEachColumnSizes(ColumnSizeByName & each_col
if (rows_count != 0
&& column.type->isValueRepresentedByNumber()
&& !column.type->haveSubtypes()
&& getSerializationForColumn(column)->getKind() == ISerialization::Kind::DEFAULT)
&& serializations.at(column.name)->getKind() == ISerialization::Kind::DEFAULT)
{
size_t rows_in_column = size.data_uncompressed / column.type->getSizeOfValueInMemory();
if (rows_in_column != rows_count)

View File

@ -45,7 +45,6 @@ public:
const StorageMetadataPtr & metadata_snapshot,
const std::vector<MergeTreeIndexPtr> & indices_to_recalc,
const CompressionCodecPtr & default_codec_,
const SerializationInfoPtr & serialization_info_,
const MergeTreeWriterSettings & writer_settings,
const MergeTreeIndexGranularity & computed_index_granularity) const override;

View File

@ -16,12 +16,11 @@ MergeTreeDataPartWriterCompact::MergeTreeDataPartWriterCompact(
const std::vector<MergeTreeIndexPtr> & indices_to_recalc_,
const String & marks_file_extension_,
const CompressionCodecPtr & default_codec_,
const SerializationInfoPtr & serialization_info_,
const MergeTreeWriterSettings & settings_,
const MergeTreeIndexGranularity & index_granularity_)
: MergeTreeDataPartWriterOnDisk(data_part_, columns_list_, metadata_snapshot_,
indices_to_recalc_, marks_file_extension_,
default_codec_, serialization_info_, settings_, index_granularity_)
default_codec_, settings_, index_granularity_)
, plain_file(data_part->volume->getDisk()->writeFile(
part_path + MergeTreeDataPartCompact::DATA_FILE_NAME_WITH_EXTENSION,
settings.max_compress_block_size,
@ -34,12 +33,8 @@ MergeTreeDataPartWriterCompact::MergeTreeDataPartWriterCompact(
, marks(*marks_file)
{
const auto & storage_columns = metadata_snapshot->getColumns();
serializations.reserve(columns_list.size());
for (const auto & column : columns_list)
{
serializations.emplace(column.name, column.type->getSerialization(column.name, *serialization_info));
addStreams(column, storage_columns.getCodecDescOrDefault(column.name, default_codec));
}
}
void MergeTreeDataPartWriterCompact::addStreams(const NameAndTypePair & column, const ASTPtr & effective_codec_desc)
@ -71,7 +66,7 @@ void MergeTreeDataPartWriterCompact::addStreams(const NameAndTypePair & column,
};
ISerialization::SubstreamPath path;
serializations[column.name]->enumerateStreams(path, callback, column.type, nullptr);
serializations.at(column.name)->enumerateStreams(path, callback, column.type);
}
namespace
@ -212,7 +207,7 @@ void MergeTreeDataPartWriterCompact::writeDataBlock(const Block & block, const G
writeIntBinary(UInt64(0), marks);
writeColumnSingleGranule(
block.getByName(name_and_type->name), serializations[name_and_type->name],
block.getByName(name_and_type->name), serializations.at(name_and_type->name),
stream_getter, granule.start_row, granule.rows_to_write);
/// Each type always has at least one substream

View File

@ -15,7 +15,6 @@ public:
const std::vector<MergeTreeIndexPtr> & indices_to_recalc,
const String & marks_file_extension,
const CompressionCodecPtr & default_codec,
const SerializationInfoPtr & serialization_info_,
const MergeTreeWriterSettings & settings,
const MergeTreeIndexGranularity & index_granularity);

View File

@ -68,7 +68,6 @@ MergeTreeDataPartWriterOnDisk::MergeTreeDataPartWriterOnDisk(
const MergeTreeIndices & indices_to_recalc_,
const String & marks_file_extension_,
const CompressionCodecPtr & default_codec_,
const SerializationInfoPtr & serialization_info_,
const MergeTreeWriterSettings & settings_,
const MergeTreeIndexGranularity & index_granularity_)
: IMergeTreeDataPartWriter(data_part_,
@ -76,8 +75,8 @@ MergeTreeDataPartWriterOnDisk::MergeTreeDataPartWriterOnDisk(
, skip_indices(indices_to_recalc_)
, part_path(data_part_->getFullRelativePath())
, marks_file_extension(marks_file_extension_)
, serializations(data_part_->getSerializations())
, default_codec(default_codec_)
, serialization_info(serialization_info_)
, compute_granularity(index_granularity.empty())
{
if (settings.blocks_are_granules_size && !index_granularity.empty())

View File

@ -87,7 +87,6 @@ public:
const std::vector<MergeTreeIndexPtr> & indices_to_recalc,
const String & marks_file_extension,
const CompressionCodecPtr & default_codec,
const SerializationInfoPtr & serialization_info_,
const MergeTreeWriterSettings & settings,
const MergeTreeIndexGranularity & index_granularity);
@ -124,8 +123,8 @@ protected:
const String part_path;
const String marks_file_extension;
const SerializationByName & serializations;
const CompressionCodecPtr default_codec;
const SerializationInfoPtr serialization_info;
const bool compute_granularity;
@ -133,9 +132,6 @@ protected:
MergeTreeIndexAggregators skip_indices_aggregators;
std::vector<size_t> skip_index_accumulated_marks;
using SerializationsMap = std::unordered_map<String, SerializationPtr>;
SerializationsMap serializations;
std::unique_ptr<WriteBufferFromFileBase> index_file_stream;
std::unique_ptr<HashingWriteBuffer> index_stream;
DataTypes index_types;

View File

@ -75,22 +75,17 @@ MergeTreeDataPartWriterWide::MergeTreeDataPartWriterWide(
const std::vector<MergeTreeIndexPtr> & indices_to_recalc_,
const String & marks_file_extension_,
const CompressionCodecPtr & default_codec_,
const SerializationInfoPtr & serialization_info_,
const MergeTreeWriterSettings & settings_,
const MergeTreeIndexGranularity & index_granularity_)
: MergeTreeDataPartWriterOnDisk(data_part_, columns_list_, metadata_snapshot_,
indices_to_recalc_, marks_file_extension_,
default_codec_, serialization_info_, settings_, index_granularity_)
default_codec_, settings_, index_granularity_)
{
const auto & columns = metadata_snapshot->getColumns();
for (const auto & it : columns_list)
{
serializations.emplace(it.name, it.type->getSerialization(it.name, *serialization_info));
addStreams(it, columns.getCodecDescOrDefault(it.name, default_codec));
}
}
void MergeTreeDataPartWriterWide::addStreams(
const NameAndTypePair & column,
const ASTPtr & effective_codec_desc)
@ -123,7 +118,7 @@ void MergeTreeDataPartWriterWide::addStreams(
};
ISerialization::SubstreamPath path;
serializations[column.name]->enumerateStreams(path, callback, column.type, nullptr);
serializations.at(column.name)->enumerateStreams(path, callback, column.type);
}
@ -218,7 +213,7 @@ void MergeTreeDataPartWriterWide::write(const Block & block, const IColumn::Perm
{
auto & column = block_to_write.getByName(it->name);
if (serializations[column.name]->getKind() != ISerialization::Kind::SPARSE)
if (serializations.at(column.name)->getKind() != ISerialization::Kind::SPARSE)
column.column = recursiveRemoveSparse(column.column);
if (permutation)
@ -280,7 +275,7 @@ StreamsWithMarks MergeTreeDataPartWriterWide::getCurrentMarksForColumn(
ISerialization::SubstreamPath & path)
{
StreamsWithMarks result;
serializations[column.name]->enumerateStreams([&] (const ISerialization::SubstreamPath & substream_path)
serializations.at(column.name)->enumerateStreams([&] (const ISerialization::SubstreamPath & substream_path)
{
bool is_offsets = !substream_path.empty() && substream_path.back().type == ISerialization::Substream::ArraySizes;
@ -315,7 +310,7 @@ void MergeTreeDataPartWriterWide::writeSingleGranule(
ISerialization::SerializeBinaryBulkSettings & serialize_settings,
const Granule & granule)
{
const auto & serialization = serializations[name_and_type.name];
const auto & serialization = serializations.at(name_and_type.name);
serialization->serializeBinaryBulkWithMultipleStreams(column, granule.start_row, granule.rows_to_write, serialize_settings, serialization_state);
/// So that, instead of the marks pointing to the end of the compressed block, there are marks pointing to the beginning of the next one.
@ -350,7 +345,7 @@ void MergeTreeDataPartWriterWide::writeColumn(
{
ISerialization::SerializeBinaryBulkSettings serialize_settings;
serialize_settings.getter = createStreamGetter(name_and_type, offset_columns);
serializations[name]->serializeBinaryBulkStatePrefix(serialize_settings, it->second);
serializations.at(name)->serializeBinaryBulkStatePrefix(serialize_settings, it->second);
}
const auto & global_settings = storage.getContext()->getSettingsRef();
@ -391,7 +386,7 @@ void MergeTreeDataPartWriterWide::writeColumn(
}
}
serializations[name]->enumerateStreams([&] (const ISerialization::SubstreamPath & substream_path)
serializations.at(name)->enumerateStreams([&] (const ISerialization::SubstreamPath & substream_path)
{
bool is_offsets = !substream_path.empty() && substream_path.back().type == ISerialization::Substream::ArraySizes;
if (is_offsets)
@ -405,7 +400,7 @@ void MergeTreeDataPartWriterWide::writeColumn(
void MergeTreeDataPartWriterWide::validateColumnOfFixedSize(const String & name, const IDataType & type)
{
const auto & serialization = serializations[name];
const auto & serialization = serializations.at(name);
if (!type.isValueRepresentedByNumber() || type.haveSubtypes() || serialization->getKind() != ISerialization::Kind::DEFAULT)
throw Exception(ErrorCodes::LOGICAL_ERROR, "Cannot validate column of non fixed type {}", type.getName());
@ -543,7 +538,7 @@ void MergeTreeDataPartWriterWide::finishDataSerialization(IMergeTreeDataPart::Ch
if (!serialization_states.empty())
{
serialize_settings.getter = createStreamGetter(*it, written_offset_columns ? *written_offset_columns : offset_columns);
serializations[it->name]->serializeBinaryBulkStateSuffix(serialize_settings, serialization_states[it->name]);
serializations.at(it->name)->serializeBinaryBulkStateSuffix(serialize_settings, serialization_states[it->name]);
}
if (write_final_mark)
@ -568,7 +563,7 @@ void MergeTreeDataPartWriterWide::finishDataSerialization(IMergeTreeDataPart::Ch
{
if (column.type->isValueRepresentedByNumber()
&& !column.type->haveSubtypes()
&& serializations[column.name]->getKind() == ISerialization::Kind::DEFAULT)
&& serializations.at(column.name)->getKind() == ISerialization::Kind::DEFAULT)
{
validateColumnOfFixedSize(column.name, *column.type);
}
@ -596,7 +591,7 @@ void MergeTreeDataPartWriterWide::writeFinalMark(
{
writeSingleMark(column, offset_columns, 0, path);
/// Memoize information about offsets
serializations[column.name]->enumerateStreams([&] (const ISerialization::SubstreamPath & substream_path)
serializations.at(column.name)->enumerateStreams([&] (const ISerialization::SubstreamPath & substream_path)
{
bool is_offsets = !substream_path.empty() && substream_path.back().type == ISerialization::Substream::ArraySizes;
if (is_offsets)

View File

@ -24,7 +24,6 @@ public:
const std::vector<MergeTreeIndexPtr> & indices_to_recalc,
const String & marks_file_extension,
const CompressionCodecPtr & default_codec,
const SerializationInfoPtr & serialization_info_,
const MergeTreeWriterSettings & settings,
const MergeTreeIndexGranularity & index_granularity);

View File

@ -359,7 +359,13 @@ MergeTreeData::MutableDataPartPtr MergeTreeDataWriter::writeTempPart(
if (data.storage_settings.get()->assign_part_uuids)
new_data_part->uuid = UUIDHelpers::generateV4();
new_data_part->setColumns(columns);
const auto & data_settings = data.getSettings();
SerializationInfo::Settings settings{data_settings->ratio_of_defaults_for_sparse_serialization, true};
SerializationInfoByName infos(columns, settings);
infos.add(block);
new_data_part->setColumns(columns, infos);
new_data_part->rows_count = block.rows();
new_data_part->partition = std::move(partition);
new_data_part->minmax_idx = std::move(minmax_idx);
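Note: for a freshly inserted part the kinds are decided from the very block being written. The positional Settings construction above is equivalent to the designated-initializer form used elsewhere in this commit; a hedged sketch of the three steps:

    // 1. Start with default infos for the physical columns, kind selection on.
    SerializationInfo::Settings settings{data_settings->ratio_of_defaults_for_sparse_serialization, /*choose_kind=*/ true};
    SerializationInfoByName infos(columns, settings);

    // 2. Sample the insert block: per-column default-value ratios feed the
    //    sparse-vs-default decision against ratio_of_defaults_for_sparse_serialization.
    infos.add(block);

    // 3. Fix the decision for the lifetime of the part.
    new_data_part->setColumns(columns, infos);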
@ -406,15 +412,9 @@ MergeTreeData::MutableDataPartPtr MergeTreeDataWriter::writeTempPart(
/// either default lz4 or compression method with zero thresholds on absolute and relative part size.
auto compression_codec = data.getContext()->chooseCompressionCodec(0, 0);
const auto & data_settings = data.getSettings();
SerializationInfoBuilder serialization_info(data_settings->ratio_of_defaults_for_sparse_serialization);
serialization_info.add(block);
const auto & index_factory = MergeTreeIndexFactory::instance();
MergedBlockOutputStream out(new_data_part, metadata_snapshot, columns,
index_factory.getMany(metadata_snapshot->getSecondaryIndices()),
compression_codec, std::move(serialization_info).build());
index_factory.getMany(metadata_snapshot->getSecondaryIndices()), compression_codec);
bool sync_on_insert = data_settings->fsync_after_insert;
@ -458,7 +458,11 @@ MergeTreeData::MutableDataPartPtr MergeTreeDataWriter::writeProjectionPartImpl(
new_data_part->is_temp = is_temp;
NamesAndTypesList columns = metadata_snapshot->getColumns().getAllPhysical().filter(block.getNames());
new_data_part->setColumns(columns);
SerializationInfo::Settings settings{data.getSettings()->ratio_of_defaults_for_sparse_serialization, true};
SerializationInfoByName infos(columns, settings);
infos.add(block);
new_data_part->setColumns(columns, infos);
if (new_data_part->isStoredOnDisk())
{
@ -506,16 +510,12 @@ MergeTreeData::MutableDataPartPtr MergeTreeDataWriter::writeProjectionPartImpl(
/// either default lz4 or compression method with zero thresholds on absolute and relative part size.
auto compression_codec = data.getContext()->chooseCompressionCodec(0, 0);
SerializationInfoBuilder serialization_info(data.getSettings()->ratio_of_defaults_for_sparse_serialization);
serialization_info.add(block);
MergedBlockOutputStream out(
new_data_part,
metadata_snapshot,
columns,
{},
compression_codec,
std::move(serialization_info).build());
compression_codec);
out.writeWithPermutation(block, perm_ptr);
out.writeSuffixAndFinalizePart(new_data_part);

View File

@ -64,16 +64,6 @@ MergeTreeReaderCompact::MergeTreeReaderCompact(
}
column_positions[i] = std::move(position);
if (column_from_part.isSubcolumn())
{
auto name_in_storage = column_from_part.getNameInStorage();
/// We have to read whole column and extract subcolumn.
serializations.emplace(name_in_storage, data_part->getSerializationForColumn(
{name_in_storage, column_from_part.getTypeInStorage()}));
}
serializations.emplace(column_from_part.name, data_part->getSerializationForColumn(column_from_part));
}
/// Do not use max_read_buffer_size, but try to lower buffer size with maximal size of granule to avoid reading much data.

View File

@ -109,7 +109,7 @@ size_t MergeTreeReaderWide::readRows(size_t from_mark, bool continue_reading, si
/// The column is already present in the block so we will append the values to the end.
bool append = res_columns[pos] != nullptr;
if (!append)
res_columns[pos] = type->createColumn(*serializations[name]);
res_columns[pos] = type->createColumn(*serializations.at(name));
auto & column = res_columns[pos];
try
@ -188,9 +188,7 @@ void MergeTreeReaderWide::addStreams(const NameAndTypePair & name_and_type,
profile_callback, clock_type));
};
auto serialization = data_part->getSerializationForColumn(name_and_type);
serialization->enumerateStreams(callback);
serializations.emplace(name_and_type.name, std::move(serialization));
serializations.at(name_and_type.name)->enumerateStreams(callback);
}
@ -231,7 +229,7 @@ void MergeTreeReaderWide::prefetch(
std::unordered_set<std::string> & prefetched_streams)
{
const auto & name = name_and_type.name;
auto & serialization = serializations[name];
auto & serialization = serializations.at(name);
serialization->enumerateStreams([&](const ISerialization::SubstreamPath & substream_path)
{
@ -259,7 +257,7 @@ void MergeTreeReaderWide::readData(
deserialize_settings.avg_value_size_hint = avg_value_size_hint;
const auto & name = name_and_type.name;
auto & serialization = serializations[name];
auto & serialization = serializations.at(name);
if (deserialize_binary_bulk_state_map.count(name) == 0)
{

View File

@ -195,8 +195,7 @@ MergeTreeData::MutableDataPartsVector MergeTreeWriteAheadLog::restore(const Stor
metadata_snapshot,
block.getNamesAndTypesList(),
{},
CompressionCodecFactory::instance().get("NONE", {}),
part->serialization_info);
CompressionCodecFactory::instance().get("NONE", {}));
part->minmax_idx->update(block, storage.getMinMaxColumnsNames(metadata_snapshot->getPartitionKey()));
part->partition.create(metadata_snapshot, block, 0, context);

View File

@ -18,9 +18,9 @@ MergedBlockOutputStream::MergedBlockOutputStream(
const NamesAndTypesList & columns_list_,
const MergeTreeIndices & skip_indices,
CompressionCodecPtr default_codec_,
const SerializationInfoPtr & input_serialization_info_,
bool reset_columns_,
bool blocks_are_granules_size)
: IMergedBlockOutputStream(data_part, metadata_snapshot_, input_serialization_info_)
: IMergedBlockOutputStream(data_part, metadata_snapshot_, columns_list_, reset_columns_)
, columns_list(columns_list_)
, default_codec(default_codec_)
{
@ -34,7 +34,7 @@ MergedBlockOutputStream::MergedBlockOutputStream(
if (!part_path.empty())
volume->getDisk()->createDirectories(part_path);
writer = data_part->getWriter(columns_list, metadata_snapshot, skip_indices, default_codec, input_serialization_info, writer_settings);
writer = data_part->getWriter(columns_list, metadata_snapshot, skip_indices, default_codec, writer_settings);
}
/// If data is pre-sorted.
@ -78,12 +78,12 @@ void MergedBlockOutputStream::writeSuffixAndFinalizePart(
else
part_columns = *total_columns_list;
new_part->serialization_info = std::move(new_serialization_info).buildFrom(*input_serialization_info);
if (reset_columns)
new_part->setColumns(part_columns, new_serialization_infos);
if (new_part->isStoredOnDisk())
finalizePartOnDisk(new_part, part_columns, checksums, sync);
new_part->setColumns(part_columns);
new_part->rows_count = rows_count;
new_part->modification_time = time(nullptr);
new_part->index = writer->releaseIndexColumns();
@ -168,11 +168,12 @@ void MergedBlockOutputStream::finalizePartOnDisk(
removeEmptyColumnsFromPart(new_part, part_columns, checksums);
if (new_part->serialization_info->getNumberOfRows() > 0)
const auto & serialization_infos = new_part->getSerializationInfos();
if (!serialization_infos.empty())
{
auto out = volume->getDisk()->writeFile(part_path + IMergeTreeDataPart::SERIALIZATION_FILE_NAME, 4096);
HashingWriteBuffer out_hashing(*out);
new_part->serialization_info->writeText(out_hashing);
serialization_infos.writeText(out_hashing);
checksums.files[IMergeTreeDataPart::SERIALIZATION_FILE_NAME].file_size = out_hashing.count();
checksums.files[IMergeTreeDataPart::SERIALIZATION_FILE_NAME].file_hash = out_hashing.getHash();
out->finalize();
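Note: the write gate changed from a row-count check on the old part-level info (serialization_info->getNumberOfRows() > 0) to an emptiness check on the per-column map. The file produced here is the one IMergeTreeDataPart::loadColumns reads back, so the round trip is, in sketch form:

    // Write side (this function): text form, checksummed via HashingWriteBuffer.
    serialization_infos.writeText(out_hashing);

    // Read side (loadColumns, earlier in this commit): parse the same file
    // into a SerializationInfoByName before setColumns() rebuilds serializations.
    infos.readText(*volume->getDisk()->readFile(path));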
@ -219,7 +220,8 @@ void MergedBlockOutputStream::writeImpl(const Block & block, const IColumn::Perm
return;
writer->write(block, permutation);
new_serialization_info.add(block);
if (reset_columns)
new_serialization_infos.add(block);
rows_count += rows;
}

View File

@ -19,7 +19,7 @@ public:
const NamesAndTypesList & columns_list_,
const MergeTreeIndices & skip_indices,
CompressionCodecPtr default_codec_,
const SerializationInfoPtr & input_serialization_info_,
bool reset_columns_ = false,
bool blocks_are_granules_size = false);
Block getHeader() const { return metadata_snapshot->getSampleBlock(); }

View File

@ -14,12 +14,11 @@ MergedColumnOnlyOutputStream::MergedColumnOnlyOutputStream(
const StorageMetadataPtr & metadata_snapshot_,
const Block & header_,
CompressionCodecPtr default_codec,
const SerializationInfoPtr & input_serialization_info_,
const MergeTreeIndices & indices_to_recalc,
WrittenOffsetColumns * offset_columns_,
const MergeTreeIndexGranularity & index_granularity,
const MergeTreeIndexGranularityInfo * index_granularity_info)
: IMergedBlockOutputStream(data_part, metadata_snapshot_, input_serialization_info_)
: IMergedBlockOutputStream(data_part, metadata_snapshot_, header_.getNamesAndTypesList(), /*reset_columns=*/ true)
, header(header_)
{
const auto & global_settings = data_part->storage.getContext()->getSettings();
@ -36,7 +35,6 @@ MergedColumnOnlyOutputStream::MergedColumnOnlyOutputStream(
metadata_snapshot_,
indices_to_recalc,
default_codec,
input_serialization_info,
std::move(writer_settings),
index_granularity);
@ -53,7 +51,7 @@ void MergedColumnOnlyOutputStream::write(const Block & block)
return;
writer->write(block, nullptr);
new_serialization_info.add(block);
new_serialization_infos.add(block);
}
MergeTreeData::DataPart::Checksums
@ -79,8 +77,7 @@ MergedColumnOnlyOutputStream::writeSuffixAndGetChecksums(
if (all_checksums.files.count(removed_file))
all_checksums.files.erase(removed_file);
new_part->setColumns(columns);
new_part->serialization_info = std::move(new_serialization_info).buildFrom(*input_serialization_info);
new_part->setColumns(columns, new_serialization_infos);
return checksums;
}

View File

@ -18,7 +18,6 @@ public:
const StorageMetadataPtr & metadata_snapshot_,
const Block & header_,
CompressionCodecPtr default_codec_,
const SerializationInfoPtr & input_serialization_info_,
const MergeTreeIndices & indices_to_recalc_,
WrittenOffsetColumns * offset_columns_ = nullptr,
const MergeTreeIndexGranularity & index_granularity = {},

View File

@ -313,8 +313,7 @@ NameSet collectFilesToSkip(
files_to_skip.insert(stream_name + mrk_extension);
};
auto serialization = source_part->getSerializationForColumn({entry.name, entry.type});
serialization->enumerateStreams(callback);
source_part->getSerialization(entry.name)->enumerateStreams(callback);
}
for (const auto & index : indices_to_recalc)
{
@ -339,8 +338,7 @@ static NameToNameVector collectFilesForRenames(
std::map<String, size_t> stream_counts;
for (const auto & column : source_part->getColumns())
{
auto serialization = source_part->getSerializationForColumn(column);
serialization->enumerateStreams(
source_part->getSerialization(column.name)->enumerateStreams(
[&](const ISerialization::SubstreamPath & substream_path)
{
++stream_counts[ISerialization::getFileNameForStream(column, substream_path)];
@ -384,10 +382,7 @@ static NameToNameVector collectFilesForRenames(
auto column = source_part->getColumns().tryGetByName(command.column_name);
if (column)
{
auto serialization = source_part->getSerializationForColumn(*column);
serialization->enumerateStreams(callback);
}
source_part->getSerialization(column->name)->enumerateStreams(callback);
}
else if (command.type == MutationCommand::Type::RENAME_COLUMN)
{
@ -409,10 +404,7 @@ static NameToNameVector collectFilesForRenames(
auto column = source_part->getColumns().tryGetByName(command.column_name);
if (column)
{
auto serialization = source_part->getSerializationForColumn(*column);
serialization->enumerateStreams(callback);
}
source_part->getSerialization(column->name)->enumerateStreams(callback);
}
}
@ -968,8 +960,7 @@ private:
ctx->metadata_snapshot,
ctx->new_data_part->getColumns(),
skip_part_indices,
ctx->compression_codec,
ctx->new_data_part->serialization_info);
ctx->compression_codec);
ctx->mutating_pipeline = QueryPipelineBuilder::getPipeline(std::move(builder));
ctx->mutating_executor = std::make_unique<PullingPipelineExecutor>(ctx->mutating_pipeline);
@ -1118,7 +1109,6 @@ private:
ctx->metadata_snapshot,
ctx->updated_header,
ctx->compression_codec,
ctx->source_part->serialization_info,
std::vector<MergeTreeIndexPtr>(ctx->indices_to_recalc.begin(), ctx->indices_to_recalc.end()),
nullptr,
ctx->source_part->index_granularity,
@ -1298,11 +1288,13 @@ bool MutateTask::prepare()
ctx->new_data_part->uuid = ctx->future_part->uuid;
ctx->new_data_part->is_temp = true;
ctx->new_data_part->ttl_infos = ctx->source_part->ttl_infos;
ctx->new_data_part->serialization_info = ctx->source_part->serialization_info;
/// It shouldn't be changed by mutation.
ctx->new_data_part->index_granularity_info = ctx->source_part->index_granularity_info;
ctx->new_data_part->setColumns(MergeTreeDataMergerMutator::getColumnsForNewDataPart(ctx->source_part, ctx->updated_header, ctx->storage_columns, ctx->for_file_renames));
ctx->new_data_part->setColumns(
MergeTreeDataMergerMutator::getColumnsForNewDataPart(ctx->source_part, ctx->updated_header, ctx->storage_columns, ctx->for_file_renames),
ctx->source_part->getSerializationInfos());
ctx->new_data_part->partition.assign(ctx->source_part->partition);
ctx->disk = ctx->new_data_part->volume->getDisk();

View File

@ -98,13 +98,13 @@ IMergeTreeDataPart::Checksums checkDataPart(
};
};
auto serialization_info = std::make_shared<SerializationInfo>();
SerializationInfoByName serialization_infos(columns_txt, {});
auto serialization_path = path + IMergeTreeDataPart::SERIALIZATION_FILE_NAME;
if (disk->exists(serialization_path))
{
auto serialization_file = disk->readFile(serialization_path);
serialization_info->readText(*serialization_file);
serialization_infos.readText(*serialization_file);
}
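Note: checkDataPart now keeps one info per column instead of a single shared SerializationInfo, and derives serializations through IDataType::getSerialization(const SerializationInfo &). A hedged sketch of the per-column lookup used in the stream checks below (columns_txt is the list constructed above; the lambda body is abbreviated):

    for (const auto & column : columns_txt)
    {
        // Every entry exists because serialization_infos was built from columns_txt.
        auto serialization = column.type->getSerialization(*serialization_infos.at(column.name));
        serialization->enumerateStreams([&](const ISerialization::SubstreamPath & substream_path)
        {
            // Each substream corresponds to one .bin file to checksum.
            String file_name = ISerialization::getFileNameForStream(column, substream_path) + ".bin";
        });
    }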
/// This function calculates only checksum of file content (compressed or uncompressed).
@ -141,7 +141,7 @@ IMergeTreeDataPart::Checksums checkDataPart(
const NamesAndTypesList & projection_columns_list = projection->getColumns();
for (const auto & projection_column : projection_columns_list)
{
auto serialization = IDataType::getSerialization(projection_column, *serialization_info);
auto serialization = projection_column.type->getSerialization(*serialization_infos.at(projection_column.name));
serialization->enumerateStreams(
[&](const ISerialization::SubstreamPath & substream_path)
{
@ -214,7 +214,7 @@ IMergeTreeDataPart::Checksums checkDataPart(
{
for (const auto & column : columns_list)
{
auto serialization = IDataType::getSerialization(column, *serialization_info);
auto serialization = column.type->getSerialization(*serialization_infos.at(column.name));
serialization->enumerateStreams([&](const ISerialization::SubstreamPath & substream_path)
{
String file_name = ISerialization::getFileNameForStream(column, substream_path) + ".bin";

View File

@ -166,7 +166,7 @@ void LogSource::readData(const NameAndTypePair & name_and_type, ColumnPtr & colu
{
ISerialization::DeserializeBinaryBulkSettings settings; /// TODO Use avg_value_size_hint.
const auto & [name, type] = name_and_type;
auto serialization = IDataType::getSerialization(name_and_type, {});
auto serialization = type->getDefaultSerialization();
auto create_stream_getter = [&](bool stream_for_prefix)
{

View File

@ -7109,10 +7109,6 @@ bool StorageReplicatedMergeTree::createEmptyPartInsteadOfLost(zkutil::ZooKeeperP
new_data_part->minmax_idx = std::move(minmax_idx);
new_data_part->is_temp = true;
/// Create empty serialization_info.
auto ratio = getSettings()->ratio_of_defaults_for_sparse_serialization;
new_data_part->serialization_info = SerializationInfoBuilder(ratio).build();
SyncGuardPtr sync_guard;
if (new_data_part->isStoredOnDisk())
{
@ -7138,8 +7134,7 @@ bool StorageReplicatedMergeTree::createEmptyPartInsteadOfLost(zkutil::ZooKeeperP
const auto & index_factory = MergeTreeIndexFactory::instance();
MergedBlockOutputStream out(new_data_part, metadata_snapshot, columns,
index_factory.getMany(metadata_snapshot->getSecondaryIndices()),
compression_codec, new_data_part->serialization_info);
index_factory.getMany(metadata_snapshot->getSecondaryIndices()), compression_codec);
bool sync_on_insert = settings->fsync_after_insert;

View File

@ -168,7 +168,7 @@ void TinyLogSource::readData(const NameAndTypePair & name_and_type,
{
ISerialization::DeserializeBinaryBulkSettings settings; /// TODO Use avg_value_size_hint.
const auto & [name, type] = name_and_type;
auto serialization = IDataType::getSerialization(name_and_type, {});
auto serialization = name_and_type.type->getDefaultSerialization();
settings.getter = [&] (const ISerialization::SubstreamPath & path) -> ReadBuffer *
{

View File

@ -222,7 +222,7 @@ void StorageSystemPartsColumns::processNextStorage(
if (columns_mask[src_index++])
columns[res_index++]->insert(column_size.marks);
auto serialization = part->getSerializationForColumn(column);
auto serialization = part->getSerialization(column.name);
if (columns_mask[src_index++])
columns[res_index++]->insert(ISerialization::kindToString(serialization->getKind()));
@ -235,7 +235,7 @@ void StorageSystemPartsColumns::processNextStorage(
subcolumn_names.push_back(name);
subcolumn_types.push_back(data.type->getName());
subcolumn_sers.push_back(ISerialization::kindToString(data.serialization->getKind()));
}, serialization, column.type, nullptr);
}, { serialization, column.type, nullptr, nullptr });
if (columns_mask[src_index++])
columns[res_index++]->insert(subcolumn_names);
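Note: forEachSubcolumn now takes a SubstreamData aggregate instead of loose trailing arguments; judging by setColumns earlier in this commit, its first two fields are the serialization and the type, and the two passed as nullptr here are presumably the column and its serialization info, which this system table does not need. Usage in sketch form:

    IDataType::forEachSubcolumn([&](const auto &, const auto & name, const auto & data)
    {
        subcolumn_names.push_back(name);
        subcolumn_types.push_back(data.type->getName());
        subcolumn_sers.push_back(ISerialization::kindToString(data.serialization->getKind()));
    }, { serialization, column.type, nullptr, nullptr });  // SubstreamData aggregate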

View File

@ -57,3 +57,10 @@ a
aaaaaa
a
aaaaaa
id [] [] []
t ['a','b','b.u','b.s'] ['UInt64','Tuple(u UInt32, s String)','UInt32','String'] ['Sparse','Default','Sparse','Default']
aaaaaa
a
aaaaaa
a
aaaaaa

View File

@ -40,4 +40,14 @@ SELECT t.a FROM sparse_tuple WHERE t.b.u != 0 ORDER BY id LIMIT 5;
SELECT t.b.s FROM sparse_tuple ORDER BY id LIMIT 5;
SELECT t.b.s FROM sparse_tuple WHERE t.b.u != 0 ORDER BY id LIMIT 5;
DETACH TABLE sparse_tuple;
ATTACH TABLE sparse_tuple;
SELECT column, subcolumns.names, subcolumns.types, subcolumns.serializations
FROM system.parts_columns
WHERE table = 'sparse_tuple' AND database = currentDatabase()
ORDER BY column;
SELECT t.b.s FROM sparse_tuple WHERE t.b.u != 0 ORDER BY id LIMIT 5;
DROP TABLE IF EXISTS sparse_tuple;