mirror of
https://github.com/ClickHouse/ClickHouse.git
synced 2024-09-21 01:00:48 +00:00
refactoring of SerializationInfo
This commit is contained in:
parent
f1a5f79849
commit
0099dfd523
@ -30,7 +30,6 @@ private:
|
||||
public:
|
||||
static constexpr auto DEFAULT_ROWS_SEARCH_SAMPLE_RATIO = 0.1;
|
||||
static constexpr auto DEFAULT_RATIO_FOR_SPARSE_SERIALIZATION = 0.95;
|
||||
// static constexpr auto MIN_ROWS_TO_SEARCH_DEFAULTS = DEFAULT_ROWS_SEARCH_STEP * 16;
|
||||
|
||||
using Base = COWHelper<IColumn, ColumnSparse>;
|
||||
static Ptr create(const ColumnPtr & values_, const ColumnPtr & offsets_, size_t size_)
|
||||
|
@ -12,6 +12,7 @@
|
||||
#include <base/sort.h>
|
||||
#include <base/map.h>
|
||||
#include <base/range.h>
|
||||
#include <DataTypes/Serializations/SerializationInfoTuple.h>
|
||||
|
||||
|
||||
namespace DB
|
||||
@ -555,4 +556,15 @@ void ColumnTuple::getIndicesOfNonDefaultRows(Offsets & indices, size_t from, siz
|
||||
return getIndicesOfNonDefaultRowsImpl<ColumnTuple>(indices, from, limit);
|
||||
}
|
||||
|
||||
SerializationInfoPtr ColumnTuple::getSerializationInfo() const
|
||||
{
|
||||
MutableSerializationInfos infos;
|
||||
infos.reserve(columns.size());
|
||||
|
||||
for (const auto & column : columns)
|
||||
infos.push_back(const_pointer_cast<SerializationInfo>(column->getSerializationInfo()));
|
||||
|
||||
return std::make_shared<SerializationInfoTuple>(std::move(infos), SerializationInfo::Settings{});
|
||||
}
|
||||
|
||||
}
|
||||
|
@ -96,6 +96,7 @@ public:
|
||||
ColumnPtr compress() const override;
|
||||
double getRatioOfDefaultRows(double sample_ratio) const override;
|
||||
void getIndicesOfNonDefaultRows(Offsets & indices, size_t from, size_t limit) const override;
|
||||
SerializationInfoPtr getSerializationInfo() const override;
|
||||
|
||||
size_t tupleSize() const { return columns.size(); }
|
||||
|
||||
|
@ -4,6 +4,7 @@
|
||||
#include <Columns/ColumnNullable.h>
|
||||
#include <Columns/ColumnConst.h>
|
||||
#include <Core/Field.h>
|
||||
#include <DataTypes/Serializations/SerializationInfo.h>
|
||||
|
||||
|
||||
namespace DB
|
||||
@ -63,6 +64,11 @@ ColumnPtr IColumn::createWithOffsets(const Offsets & offsets, const Field & defa
|
||||
return res;
|
||||
}
|
||||
|
||||
SerializationInfoPtr IColumn::getSerializationInfo() const
|
||||
{
|
||||
return std::make_shared<SerializationInfo>(ISerialization::getKind(*this), SerializationInfo::Settings{});
|
||||
}
|
||||
|
||||
bool isColumnNullable(const IColumn & column)
|
||||
{
|
||||
return checkColumn<ColumnNullable>(column);
|
||||
|
@ -26,8 +26,8 @@ class ColumnGathererStream;
|
||||
class Field;
|
||||
class WeakHash32;
|
||||
|
||||
class ISerialization;
|
||||
using SerializationPtr = std::shared_ptr<const ISerialization>;
|
||||
class SerializationInfo;
|
||||
using SerializationInfoPtr = std::shared_ptr<const SerializationInfo>;
|
||||
|
||||
/*
|
||||
* Represents a set of equal ranges in previous column to perform sorting in current column.
|
||||
@ -404,6 +404,8 @@ public:
|
||||
/// Used to create full column from sparse.
|
||||
virtual Ptr createWithOffsets(const Offsets & offsets, const Field & default_field, size_t total_rows, size_t shift) const;
|
||||
|
||||
virtual SerializationInfoPtr getSerializationInfo() const;
|
||||
|
||||
/// Compress column in memory to some representation that allows to decompress it back.
|
||||
/// Return itself if compression is not applicable for this column type.
|
||||
virtual Ptr compress() const
|
||||
|
@ -117,7 +117,7 @@ ASTPtr CompressionCodecFactory::validateCodecAndGetPreprocessedAST(
|
||||
};
|
||||
|
||||
ISerialization::SubstreamPath path;
|
||||
column_type->getDefaultSerialization()->enumerateStreams(path, callback, column_type, nullptr);
|
||||
column_type->getDefaultSerialization()->enumerateStreams(path, callback, column_type);
|
||||
|
||||
if (!result_codec)
|
||||
throw Exception(ErrorCodes::LOGICAL_ERROR, "Cannot find any substream with data type for type {}. It's a bug", column_type->getName());
|
||||
|
@ -9,6 +9,7 @@
|
||||
#include <DataTypes/Serializations/SerializationInfo.h>
|
||||
#include <DataTypes/Serializations/SerializationTuple.h>
|
||||
#include <DataTypes/Serializations/SerializationNamed.h>
|
||||
#include <DataTypes/Serializations/SerializationInfoTuple.h>
|
||||
#include <DataTypes/NestedUtils.h>
|
||||
#include <Parsers/IAST.h>
|
||||
#include <Parsers/ASTNameTypePair.h>
|
||||
@ -254,21 +255,32 @@ SerializationPtr DataTypeTuple::doGetDefaultSerialization() const
|
||||
return std::make_shared<SerializationTuple>(std::move(serializations), use_explicit_names);
|
||||
}
|
||||
|
||||
SerializationPtr DataTypeTuple::getSerialization(const String & column_name, const SerializationCallback & callback) const
|
||||
SerializationPtr DataTypeTuple::getSerialization(const SerializationInfo & info) const
|
||||
{
|
||||
SerializationTuple::ElementSerializations serializations(elems.size());
|
||||
const auto & info_tuple = assert_cast<const SerializationInfoTuple &>(info);
|
||||
bool use_explicit_names = have_explicit_names && serialize_names;
|
||||
|
||||
for (size_t i = 0; i < elems.size(); ++i)
|
||||
{
|
||||
String elem_name = use_explicit_names ? names[i] : toString(i + 1);
|
||||
auto subcolumn_name = Nested::concatenateName(column_name, elem_name);
|
||||
auto serializaion = elems[i]->getSerialization(subcolumn_name, callback);
|
||||
serializations[i] = std::make_shared<SerializationNamed>(serializaion, elem_name);
|
||||
auto serialization = elems[i]->getSerialization(*info_tuple.getElementInfo(i));
|
||||
serializations[i] = std::make_shared<SerializationNamed>(serialization, elem_name);
|
||||
}
|
||||
|
||||
return std::make_shared<SerializationTuple>(std::move(serializations), use_explicit_names);
|
||||
}
|
||||
|
||||
MutableSerializationInfoPtr DataTypeTuple::createSerializationInfo(const SerializationInfo::Settings & settings) const
|
||||
{
|
||||
MutableSerializationInfos infos;
|
||||
infos.reserve(elems.size());
|
||||
for (const auto & elem : elems)
|
||||
infos.push_back(elem->createSerializationInfo(settings));
|
||||
|
||||
return std::make_shared<SerializationInfoTuple>(std::move(infos), settings);
|
||||
}
|
||||
|
||||
|
||||
static DataTypePtr create(const ASTPtr & arguments)
|
||||
{
|
||||
|
@ -54,8 +54,9 @@ public:
|
||||
size_t getMaximumSizeOfValueInMemory() const override;
|
||||
size_t getSizeOfValueInMemory() const override;
|
||||
|
||||
SerializationPtr getSerialization(const String & column_name, const SerializationCallback & callback) const override;
|
||||
SerializationPtr doGetDefaultSerialization() const override;
|
||||
SerializationPtr getSerialization(const SerializationInfo & info) const override;
|
||||
MutableSerializationInfoPtr createSerializationInfo(const SerializationInfo::Settings & settings) const override;
|
||||
|
||||
const DataTypePtr & getElement(size_t i) const { return elems[i]; }
|
||||
const DataTypes & getElements() const { return elems; }
|
||||
|
@ -77,9 +77,7 @@ size_t IDataType::getSizeOfValueInMemory() const
|
||||
|
||||
void IDataType::forEachSubcolumn(
|
||||
const SubcolumnCallback & callback,
|
||||
const SerializationPtr & serialization,
|
||||
const DataTypePtr & type,
|
||||
const ColumnPtr & column)
|
||||
const SubstreamData & data)
|
||||
{
|
||||
ISerialization::StreamCallback callback_with_data = [&](const auto & subpath)
|
||||
{
|
||||
@ -88,66 +86,59 @@ void IDataType::forEachSubcolumn(
|
||||
if (!subpath[i].visited && ISerialization::hasSubcolumnForPath(subpath, i + 1))
|
||||
{
|
||||
auto name = ISerialization::getSubcolumnNameForStream(subpath, i + 1);
|
||||
auto data = ISerialization::createFromPath(subpath, i);
|
||||
callback(subpath, name, data);
|
||||
auto subdata = ISerialization::createFromPath(subpath, i);
|
||||
callback(subpath, name, subdata);
|
||||
}
|
||||
subpath[i].visited = true;
|
||||
}
|
||||
};
|
||||
|
||||
ISerialization::SubstreamPath path;
|
||||
serialization->enumerateStreams(path, callback_with_data, type, column);
|
||||
SubstreamPath path;
|
||||
data.serialization->enumerateStreams(path, callback_with_data, data);
|
||||
}
|
||||
|
||||
DataTypePtr IDataType::tryGetSubcolumnType(const String & subcolumn_name) const
|
||||
template <typename Ptr>
|
||||
Ptr IDataType::getForSubcolumn(
|
||||
const String & subcolumn_name,
|
||||
const SubstreamData & data,
|
||||
Ptr SubstreamData::*member,
|
||||
bool throw_if_null) const
|
||||
{
|
||||
DataTypePtr res;
|
||||
forEachSubcolumn([&](const auto &, const auto & name, const auto & data)
|
||||
Ptr res;
|
||||
forEachSubcolumn([&](const auto &, const auto & name, const auto & subdata)
|
||||
{
|
||||
if (name == subcolumn_name)
|
||||
res = data.type;
|
||||
}, getDefaultSerialization(), getPtr(), nullptr);
|
||||
res = subdata.*member;
|
||||
}, data);
|
||||
|
||||
if (!res && throw_if_null)
|
||||
throw Exception(ErrorCodes::ILLEGAL_COLUMN, "There is no subcolumn {} in type {}", subcolumn_name, getName());
|
||||
|
||||
return res;
|
||||
}
|
||||
|
||||
DataTypePtr IDataType::tryGetSubcolumnType(const String & subcolumn_name) const
|
||||
{
|
||||
SubstreamData data = { getDefaultSerialization(), getPtr(), nullptr, nullptr };
|
||||
return getForSubcolumn<DataTypePtr>(subcolumn_name, data, &SubstreamData::type, false);
|
||||
}
|
||||
|
||||
DataTypePtr IDataType::getSubcolumnType(const String & subcolumn_name) const
|
||||
{
|
||||
auto subcolumn_type = tryGetSubcolumnType(subcolumn_name);
|
||||
if (subcolumn_type)
|
||||
return subcolumn_type;
|
||||
|
||||
throw Exception(ErrorCodes::ILLEGAL_COLUMN, "There is no subcolumn {} in type {}", subcolumn_name, getName());
|
||||
SubstreamData data = { getDefaultSerialization(), getPtr(), nullptr, nullptr };
|
||||
return getForSubcolumn<DataTypePtr>(subcolumn_name, data, &SubstreamData::type);
|
||||
}
|
||||
|
||||
SerializationPtr IDataType::getSubcolumnSerialization(const String & subcolumn_name, const SerializationPtr & serialization) const
|
||||
{
|
||||
SerializationPtr res;
|
||||
forEachSubcolumn([&](const auto &, const auto & name, const auto & data)
|
||||
{
|
||||
if (name == subcolumn_name)
|
||||
res = data.serialization;
|
||||
}, serialization, nullptr, nullptr);
|
||||
|
||||
if (res)
|
||||
return res;
|
||||
|
||||
throw Exception(ErrorCodes::ILLEGAL_COLUMN, "There is no subcolumn {} in type {}", subcolumn_name, getName());
|
||||
SubstreamData data = { serialization, nullptr, nullptr, nullptr };
|
||||
return getForSubcolumn<SerializationPtr>(subcolumn_name, data, &SubstreamData::serialization);
|
||||
}
|
||||
|
||||
ColumnPtr IDataType::getSubcolumn(const String & subcolumn_name, const ColumnPtr & column) const
|
||||
{
|
||||
ColumnPtr res;
|
||||
forEachSubcolumn([&](const auto &, const auto & name, const auto & data)
|
||||
{
|
||||
if (name == subcolumn_name)
|
||||
res = data.column;
|
||||
}, getDefaultSerialization(), nullptr, column);
|
||||
|
||||
if (res)
|
||||
return res;
|
||||
|
||||
throw Exception(ErrorCodes::ILLEGAL_COLUMN, "There is no subcolumn {} in type {}", subcolumn_name, getName());
|
||||
SubstreamData data = { getDefaultSerialization(), nullptr, column, nullptr };
|
||||
return getForSubcolumn<ColumnPtr>(subcolumn_name, data, &SubstreamData::column);
|
||||
}
|
||||
|
||||
Names IDataType::getSubcolumnNames() const
|
||||
@ -156,7 +147,7 @@ Names IDataType::getSubcolumnNames() const
|
||||
forEachSubcolumn([&](const auto &, const auto & name, const auto &)
|
||||
{
|
||||
res.push_back(name);
|
||||
}, getDefaultSerialization(), nullptr, nullptr);
|
||||
}, { getDefaultSerialization(), nullptr, nullptr, nullptr });
|
||||
return res;
|
||||
}
|
||||
|
||||
@ -175,6 +166,12 @@ void IDataType::setCustomization(DataTypeCustomDescPtr custom_desc_) const
|
||||
custom_serialization = std::move(custom_desc_->serialization);
|
||||
}
|
||||
|
||||
MutableSerializationInfoPtr IDataType::createSerializationInfo(
|
||||
const SerializationInfo::Settings & settings) const
|
||||
{
|
||||
return std::make_shared<SerializationInfo>(ISerialization::Kind::DEFAULT, settings);
|
||||
}
|
||||
|
||||
SerializationPtr IDataType::getDefaultSerialization() const
|
||||
{
|
||||
if (custom_serialization)
|
||||
@ -196,31 +193,9 @@ SerializationPtr IDataType::getSerialization(ISerialization::Kind kind) const
|
||||
return getDefaultSerialization();
|
||||
}
|
||||
|
||||
SerializationPtr IDataType::getSerialization(const IColumn & column) const
|
||||
SerializationPtr IDataType::getSerialization(const SerializationInfo & info) const
|
||||
{
|
||||
return getSerialization(ISerialization::getKind(column));
|
||||
}
|
||||
|
||||
SerializationPtr IDataType::getSerialization(const String & column_name, const SerializationInfo & info) const
|
||||
{
|
||||
return getSerialization(column_name, [&info](const auto & name) { return info.getKind(name); });
|
||||
}
|
||||
|
||||
SerializationPtr IDataType::getSerialization(const String & column_name, const SerializationCallback & callback) const
|
||||
{
|
||||
return getSerialization(callback(column_name));
|
||||
}
|
||||
|
||||
SerializationPtr IDataType::getSerialization(const ISerialization::Settings & settings) const
|
||||
{
|
||||
if (supportsSparseSerialization())
|
||||
{
|
||||
double ratio = settings.num_rows ? std::min(static_cast<double>(settings.num_default_rows) / settings.num_rows, 1.0) : 0.0;
|
||||
if (ratio > settings.ratio_for_sparse_serialization)
|
||||
return getSparseSerialization();
|
||||
}
|
||||
|
||||
return getDefaultSerialization();
|
||||
return getSerialization(info.getKind());
|
||||
}
|
||||
|
||||
// static
|
||||
@ -229,11 +204,11 @@ SerializationPtr IDataType::getSerialization(const NameAndTypePair & column, con
|
||||
if (column.isSubcolumn())
|
||||
{
|
||||
const auto & type_in_storage = column.getTypeInStorage();
|
||||
auto default_serialization = type_in_storage->getDefaultSerialization();
|
||||
return type_in_storage->getSubcolumnSerialization(column.getSubcolumnName(), default_serialization);
|
||||
auto serialization = type_in_storage->getSerialization(info);
|
||||
return type_in_storage->getSubcolumnSerialization(column.getSubcolumnName(), serialization);
|
||||
}
|
||||
|
||||
return column.type->getSerialization(column.name, info);
|
||||
return column.type->getSerialization(info);
|
||||
}
|
||||
|
||||
}
|
||||
|
@ -7,6 +7,7 @@
|
||||
#include <Core/TypeId.h>
|
||||
#include <DataTypes/DataTypeCustom.h>
|
||||
#include <DataTypes/Serializations/ISerialization.h>
|
||||
#include <DataTypes/Serializations/SerializationInfo.h>
|
||||
|
||||
namespace DB
|
||||
{
|
||||
@ -27,7 +28,6 @@ using DataTypePtr = std::shared_ptr<const IDataType>;
|
||||
using DataTypes = std::vector<DataTypePtr>;
|
||||
|
||||
struct NameAndTypePair;
|
||||
class SerializationInfo;
|
||||
|
||||
struct DataTypeWithConstInfo
|
||||
{
|
||||
@ -84,19 +84,23 @@ public:
|
||||
SerializationPtr getSubcolumnSerialization(const String & subcolumn_name, const SerializationPtr & serialization) const;
|
||||
ColumnPtr getSubcolumn(const String & subcolumn_name, const ColumnPtr & column) const;
|
||||
|
||||
using SubstreamData = ISerialization::SubstreamData;
|
||||
using SubstreamPath = ISerialization::SubstreamPath;
|
||||
|
||||
using SubcolumnCallback = std::function<void(
|
||||
const ISerialization::SubstreamPath &,
|
||||
const SubstreamPath &,
|
||||
const String &,
|
||||
const ISerialization::SubstreamData &)>;
|
||||
const SubstreamData &)>;
|
||||
|
||||
static void forEachSubcolumn(
|
||||
const SubcolumnCallback & callback,
|
||||
const SerializationPtr & serialization,
|
||||
const DataTypePtr & type,
|
||||
const ColumnPtr & column);
|
||||
const SubstreamData & data);
|
||||
|
||||
Names getSubcolumnNames() const;
|
||||
|
||||
virtual MutableSerializationInfoPtr createSerializationInfo(
|
||||
const SerializationInfo::Settings & settings) const;
|
||||
|
||||
/// TODO: support more types.
|
||||
virtual bool supportsSparseSerialization() const { return !haveSubtypes(); }
|
||||
|
||||
@ -106,18 +110,8 @@ public:
|
||||
/// Chooses serialziation according to serialization kind.
|
||||
SerializationPtr getSerialization(ISerialization::Kind kind) const;
|
||||
|
||||
/// Chooses serialziation according to column content.
|
||||
SerializationPtr getSerialization(const IColumn & column) const;
|
||||
|
||||
/// Chooses serialization according to collected information about content of columns.
|
||||
SerializationPtr getSerialization(const String & column_name, const SerializationInfo & info) const;
|
||||
|
||||
/// Chooses serialization according to settings.
|
||||
SerializationPtr getSerialization(const ISerialization::Settings & settings) const;
|
||||
|
||||
using SerializationCallback = std::function<ISerialization::Kind(const String &)>;
|
||||
|
||||
virtual SerializationPtr getSerialization(const String & column_name, const SerializationCallback & callback) const;
|
||||
/// Chooses serialization according to collected information about content of column.
|
||||
virtual SerializationPtr getSerialization(const SerializationInfo & info) const;
|
||||
|
||||
/// Chooses between subcolumn serialization and regular serialization according to @column.
|
||||
/// This method typically should be used to get serialization for reading column or subcolumn.
|
||||
@ -302,6 +296,14 @@ protected:
|
||||
public:
|
||||
const IDataTypeCustomName * getCustomName() const { return custom_name.get(); }
|
||||
const ISerialization * getCustomSerialization() const { return custom_serialization.get(); }
|
||||
|
||||
private:
|
||||
template <typename Ptr>
|
||||
Ptr getForSubcolumn(
|
||||
const String & subcolumn_name,
|
||||
const SubstreamData & data,
|
||||
Ptr SubstreamData::*member,
|
||||
bool throw_if_null = true) const;
|
||||
};
|
||||
|
||||
|
||||
|
@ -72,18 +72,22 @@ String ISerialization::SubstreamPath::toString() const
|
||||
void ISerialization::enumerateStreams(
|
||||
SubstreamPath & path,
|
||||
const StreamCallback & callback,
|
||||
DataTypePtr type,
|
||||
ColumnPtr column) const
|
||||
const SubstreamData & data) const
|
||||
{
|
||||
path.push_back(Substream::Regular);
|
||||
path.back().data = {type, column, getPtr(), nullptr};
|
||||
path.back().data = data;
|
||||
callback(path);
|
||||
path.pop_back();
|
||||
}
|
||||
|
||||
void ISerialization::enumerateStreams(const StreamCallback & callback, SubstreamPath & path) const
|
||||
{
|
||||
enumerateStreams(path, callback, nullptr, nullptr);
|
||||
enumerateStreams(path, callback, {getPtr(), nullptr, nullptr, nullptr});
|
||||
}
|
||||
|
||||
void ISerialization::enumerateStreams(SubstreamPath & path, const StreamCallback & callback, const DataTypePtr & type) const
|
||||
{
|
||||
enumerateStreams(path, callback, {getPtr(), type, nullptr, nullptr});
|
||||
}
|
||||
|
||||
void ISerialization::serializeBinaryBulk(const IColumn & column, WriteBuffer &, size_t, size_t) const
|
||||
@ -268,10 +272,9 @@ ISerialization::SubstreamData ISerialization::createFromPath(const SubstreamPath
|
||||
assert(prefix_len < path.size());
|
||||
|
||||
SubstreamData res = path[prefix_len].data;
|
||||
res.creator.reset();
|
||||
for (ssize_t i = static_cast<ssize_t>(prefix_len) - 1; i >= 0; --i)
|
||||
{
|
||||
const auto & creator = path[i].data.creator;
|
||||
const auto & creator = path[i].creator;
|
||||
if (creator)
|
||||
{
|
||||
res.type = res.type ? creator->create(res.type) : res.type;
|
||||
|
@ -32,6 +32,9 @@ using DataTypePtr = std::shared_ptr<const IDataType>;
|
||||
class ISerialization;
|
||||
using SerializationPtr = std::shared_ptr<const ISerialization>;
|
||||
|
||||
class SerializationInfo;
|
||||
using SerializationInfoPtr = std::shared_ptr<const SerializationInfo>;
|
||||
|
||||
class Field;
|
||||
|
||||
struct FormatSettings;
|
||||
@ -98,10 +101,10 @@ public:
|
||||
|
||||
struct SubstreamData
|
||||
{
|
||||
SerializationPtr serialization;
|
||||
DataTypePtr type;
|
||||
ColumnPtr column;
|
||||
SerializationPtr serialization;
|
||||
SubcolumnCreatorPtr creator;
|
||||
SerializationInfoPtr serialization_info;
|
||||
};
|
||||
|
||||
struct Substream
|
||||
@ -136,6 +139,9 @@ public:
|
||||
/// Data for current substream.
|
||||
SubstreamData data;
|
||||
|
||||
/// Creator of subcolumn for current substream.
|
||||
SubcolumnCreatorPtr creator = nullptr;
|
||||
|
||||
/// Flag, that may help to traverse substream paths.
|
||||
mutable bool visited = false;
|
||||
|
||||
@ -158,13 +164,14 @@ public:
|
||||
virtual void enumerateStreams(
|
||||
SubstreamPath & path,
|
||||
const StreamCallback & callback,
|
||||
DataTypePtr type,
|
||||
ColumnPtr column) const;
|
||||
const SubstreamData & data) const;
|
||||
|
||||
void enumerateStreams(const StreamCallback & callback, SubstreamPath & path) const;
|
||||
void enumerateStreams(const StreamCallback & callback, SubstreamPath && path) const { enumerateStreams(callback, path); }
|
||||
void enumerateStreams(const StreamCallback & callback) const { enumerateStreams(callback, {}); }
|
||||
|
||||
void enumerateStreams(SubstreamPath & path, const StreamCallback & callback, const DataTypePtr & type) const;
|
||||
|
||||
using OutputStreamGetter = std::function<WriteBuffer*(const SubstreamPath &)>;
|
||||
using InputStreamGetter = std::function<ReadBuffer*(const SubstreamPath &)>;
|
||||
|
||||
@ -208,13 +215,6 @@ public:
|
||||
double avg_value_size_hint = 0;
|
||||
};
|
||||
|
||||
struct Settings
|
||||
{
|
||||
size_t num_rows;
|
||||
size_t num_default_rows;
|
||||
double ratio_for_sparse_serialization;
|
||||
};
|
||||
|
||||
/// Call before serializeBinaryBulkWithMultipleStreams chain to write something before first mark.
|
||||
virtual void serializeBinaryBulkStatePrefix(
|
||||
SerializeBinaryBulkSettings & /*settings*/,
|
||||
@ -339,6 +339,7 @@ protected:
|
||||
|
||||
using SerializationPtr = std::shared_ptr<const ISerialization>;
|
||||
using Serializations = std::vector<SerializationPtr>;
|
||||
using SerializationByName = std::unordered_map<String, SerializationPtr>;
|
||||
|
||||
template <typename State, typename StatePtr>
|
||||
State * ISerialization::checkAndGetState(const StatePtr & state) const
|
||||
|
@ -198,33 +198,38 @@ ColumnPtr SerializationArray::SubcolumnCreator::create(const ColumnPtr & prev) c
|
||||
void SerializationArray::enumerateStreams(
|
||||
SubstreamPath & path,
|
||||
const StreamCallback & callback,
|
||||
DataTypePtr type,
|
||||
ColumnPtr column) const
|
||||
const SubstreamData & data) const
|
||||
{
|
||||
const auto * type_array = type ? &assert_cast<const DataTypeArray &>(*type) : nullptr;
|
||||
const auto * column_array = column ? &assert_cast<const ColumnArray &>(*column) : nullptr;
|
||||
const auto * type_array = data.type ? &assert_cast<const DataTypeArray &>(*data.type) : nullptr;
|
||||
const auto * column_array = data.column ? &assert_cast<const ColumnArray &>(*data.column) : nullptr;
|
||||
auto offsets_column = column_array ? column_array->getOffsetsPtr() : nullptr;
|
||||
|
||||
path.push_back(Substream::ArraySizes);
|
||||
path.back().data =
|
||||
{
|
||||
type ? std::make_shared<DataTypeUInt64>() : nullptr,
|
||||
offsets_column ? arrayOffsetsToSizes(*offsets_column) : nullptr,
|
||||
std::make_shared<SerializationNamed>(
|
||||
std::make_shared<SerializationNumber<UInt64>>(),
|
||||
"size" + std::to_string(getArrayLevel(path)), false),
|
||||
nullptr,
|
||||
data.type ? std::make_shared<DataTypeUInt64>() : nullptr,
|
||||
offsets_column ? arrayOffsetsToSizes(*offsets_column) : nullptr,
|
||||
data.serialization_info,
|
||||
};
|
||||
|
||||
callback(path);
|
||||
|
||||
path.back() = Substream::ArrayElements;
|
||||
path.back().data = {type, column, getPtr(), std::make_shared<SubcolumnCreator>(offsets_column)};
|
||||
path.back().data = data;
|
||||
path.back().creator = std::make_shared<SubcolumnCreator>(offsets_column);
|
||||
|
||||
auto next_type = type_array ? type_array->getNestedType() : nullptr;
|
||||
auto next_column = column_array ? column_array->getDataPtr() : nullptr;
|
||||
SubstreamData next_data =
|
||||
{
|
||||
nested,
|
||||
type_array ? type_array->getNestedType() : nullptr,
|
||||
column_array ? column_array->getDataPtr() : nullptr,
|
||||
data.serialization_info,
|
||||
};
|
||||
|
||||
nested->enumerateStreams(path, callback, next_type, next_column);
|
||||
nested->enumerateStreams(path, callback, next_data);
|
||||
path.pop_back();
|
||||
}
|
||||
|
||||
|
@ -38,8 +38,7 @@ public:
|
||||
void enumerateStreams(
|
||||
SubstreamPath & path,
|
||||
const StreamCallback & callback,
|
||||
DataTypePtr type,
|
||||
ColumnPtr column) const override;
|
||||
const SubstreamData & data) const override;
|
||||
|
||||
void serializeBinaryBulkStatePrefix(
|
||||
SerializeBinaryBulkSettings & settings,
|
||||
|
@ -1,8 +1,10 @@
|
||||
#include <DataTypes/Serializations/SerializationInfo.h>
|
||||
#include <DataTypes/NestedUtils.h>
|
||||
#include <Columns/ColumnSparse.h>
|
||||
#include <IO/ReadHelpers.h>
|
||||
#include <IO/WriteHelpers.h>
|
||||
#include <IO/VarInt.h>
|
||||
#include <base/EnumReflection.h>
|
||||
|
||||
#include <Poco/JSON/JSON.h>
|
||||
#include <Poco/JSON/Object.h>
|
||||
@ -17,113 +19,14 @@ namespace ErrorCodes
|
||||
{
|
||||
extern const int LOGICAL_ERROR;
|
||||
extern const int INCORRECT_DATA;
|
||||
}
|
||||
|
||||
SerializationInfoBuilder::SerializationInfoBuilder(
|
||||
double ratio_for_sparse_serialization_,
|
||||
double default_rows_search_sample_ratio_)
|
||||
: ratio_for_sparse_serialization(ratio_for_sparse_serialization_)
|
||||
, default_rows_search_sample_ratio(default_rows_search_sample_ratio_)
|
||||
, info(std::make_shared<SerializationInfo>())
|
||||
{
|
||||
}
|
||||
|
||||
void SerializationInfoBuilder::add(const Block & block)
|
||||
{
|
||||
size_t num_rows = block.rows();
|
||||
info->number_of_rows += num_rows;
|
||||
|
||||
if (!canHaveSparseSerialization())
|
||||
return;
|
||||
|
||||
for (const auto & elem : block)
|
||||
{
|
||||
/// Just skip column and always return default serialization.
|
||||
if (!elem.type->supportsSparseSerialization())
|
||||
continue;
|
||||
|
||||
/// Multiply by step to restore approximate number of default values.
|
||||
info->columns[elem.name].num_defaults += static_cast<size_t>(
|
||||
num_rows * elem.column->getRatioOfDefaultRows(default_rows_search_sample_ratio));
|
||||
|
||||
IDataType::forEachSubcolumn([&](const auto &, const auto & name, const auto & data)
|
||||
{
|
||||
if (!data.type->supportsSparseSerialization())
|
||||
return;
|
||||
|
||||
auto parent_subcolumn_name = Nested::splitName(name, /*reverse=*/ true).first;
|
||||
if (!parent_subcolumn_name.empty())
|
||||
{
|
||||
auto parent_subcolumn_type = elem.type->tryGetSubcolumnType(parent_subcolumn_name);
|
||||
if (parent_subcolumn_type && !parent_subcolumn_type->supportsSparseSerialization())
|
||||
return;
|
||||
}
|
||||
|
||||
auto full_name = Nested::concatenateName(elem.name, name);
|
||||
info->columns[full_name].num_defaults += static_cast<size_t>(
|
||||
num_rows * data.column->getRatioOfDefaultRows(default_rows_search_sample_ratio));
|
||||
|
||||
}, elem.type->getDefaultSerialization(), elem.type, elem.column);
|
||||
}
|
||||
}
|
||||
|
||||
void SerializationInfoBuilder::add(const SerializationInfo & other)
|
||||
{
|
||||
info->number_of_rows += other.number_of_rows;
|
||||
for (const auto & [name, column_info] : other.columns)
|
||||
info->columns[name].num_defaults += column_info.num_defaults;
|
||||
}
|
||||
|
||||
SerializationInfoPtr SerializationInfoBuilder::build() &&
|
||||
{
|
||||
size_t total_rows = info->number_of_rows;
|
||||
for (auto & [_, column_info] : info->columns)
|
||||
{
|
||||
double ratio = total_rows ? std::min(static_cast<double>(column_info.num_defaults) / total_rows, 1.0) : 0.0;
|
||||
if (ratio > ratio_for_sparse_serialization)
|
||||
column_info.kind = ISerialization::Kind::SPARSE;
|
||||
}
|
||||
|
||||
return std::move(info);
|
||||
}
|
||||
|
||||
SerializationInfoPtr SerializationInfoBuilder::buildFrom(const SerializationInfo & other) &&
|
||||
{
|
||||
for (const auto & [name, column_info] : other.columns)
|
||||
{
|
||||
auto it = info->columns.find(name);
|
||||
if (it == info->columns.end())
|
||||
info->columns[name] = column_info;
|
||||
else
|
||||
it->second.kind = column_info.kind;
|
||||
}
|
||||
|
||||
return std::move(info);
|
||||
}
|
||||
|
||||
ISerialization::Kind SerializationInfo::getKind(const String & column_name) const
|
||||
{
|
||||
auto it = columns.find(column_name);
|
||||
if (it == columns.end())
|
||||
return ISerialization::Kind::DEFAULT;
|
||||
|
||||
return it->second.kind;
|
||||
}
|
||||
|
||||
size_t SerializationInfo::getNumberOfDefaultRows(const String & column_name) const
|
||||
{
|
||||
auto it = columns.find(column_name);
|
||||
if (it == columns.end())
|
||||
return 0;
|
||||
|
||||
return it->second.num_defaults;
|
||||
extern const int CORRUPTED_DATA;
|
||||
}
|
||||
|
||||
namespace
|
||||
{
|
||||
|
||||
constexpr auto KEY_VERSION = "version";
|
||||
constexpr auto KEY_NUMBER_OF_ROWS = "number_of_rows";
|
||||
constexpr auto KEY_NUM_ROWS = "num_rows";
|
||||
constexpr auto KEY_COLUMNS = "columns";
|
||||
constexpr auto KEY_NUM_DEFAULTS = "num_defaults";
|
||||
constexpr auto KEY_KIND = "kind";
|
||||
@ -131,68 +34,181 @@ constexpr auto KEY_NAME = "name";
|
||||
|
||||
}
|
||||
|
||||
void SerializationInfo::fromJSON(const String & json_str)
|
||||
void SerializationInfo::Data::add(const IColumn & column)
|
||||
{
|
||||
size_t rows = column.size();
|
||||
double ratio = column.getRatioOfDefaultRows(ColumnSparse::DEFAULT_ROWS_SEARCH_SAMPLE_RATIO);
|
||||
|
||||
num_rows += rows;
|
||||
num_defaults += static_cast<size_t>(ratio * rows);
|
||||
}
|
||||
|
||||
void SerializationInfo::Data::add(const Data & other)
|
||||
{
|
||||
num_rows += other.num_rows;
|
||||
num_defaults += other.num_defaults;
|
||||
}
|
||||
|
||||
SerializationInfo::SerializationInfo(ISerialization::Kind kind_, const Settings & settings_)
|
||||
: settings(settings_)
|
||||
, kind(kind_)
|
||||
{
|
||||
}
|
||||
|
||||
void SerializationInfo::add(const IColumn & column)
|
||||
{
|
||||
data.add(column);
|
||||
if (settings.choose_kind)
|
||||
kind = chooseKind(data, settings);
|
||||
}
|
||||
|
||||
void SerializationInfo::add(const SerializationInfo & other)
|
||||
{
|
||||
data.add(other.data);
|
||||
if (settings.choose_kind)
|
||||
kind = chooseKind(data, settings);
|
||||
}
|
||||
|
||||
void SerializationInfo::replaceData(const SerializationInfo & other)
|
||||
{
|
||||
data = other.data;
|
||||
}
|
||||
|
||||
MutableSerializationInfoPtr SerializationInfo::clone() const
|
||||
{
|
||||
auto res = std::make_shared<SerializationInfo>(kind, settings);
|
||||
res->data = data;
|
||||
return res;
|
||||
}
|
||||
|
||||
void SerializationInfo::serialializeKindBinary(WriteBuffer & out) const
|
||||
{
|
||||
writeBinary(static_cast<UInt8>(kind), out);
|
||||
}
|
||||
|
||||
void SerializationInfo::deserializeFromKindsBinary(ReadBuffer & in)
|
||||
{
|
||||
UInt8 kind_num;
|
||||
readBinary(kind_num, in);
|
||||
auto maybe_kind = magic_enum::enum_cast<ISerialization::Kind>(kind_num);
|
||||
if (!maybe_kind)
|
||||
throw Exception(ErrorCodes::LOGICAL_ERROR, "Unknown serialization kind " + std::to_string(kind_num));
|
||||
|
||||
kind = *maybe_kind;
|
||||
}
|
||||
|
||||
Poco::JSON::Object SerializationInfo::toJSON() const
|
||||
{
|
||||
Poco::JSON::Object object;
|
||||
object.set(KEY_KIND, ISerialization::kindToString(kind));
|
||||
object.set(KEY_NUM_DEFAULTS, data.num_defaults);
|
||||
object.set(KEY_NUM_ROWS, data.num_rows);
|
||||
return object;
|
||||
}
|
||||
|
||||
/// Restores kind and statistics from a JSON object produced by toJSON().
/// Throws if any mandatory field is absent.
void SerializationInfo::fromJSON(const Poco::JSON::Object & object)
{
    if (!object.has(KEY_KIND) || !object.has(KEY_NUM_DEFAULTS) || !object.has(KEY_NUM_ROWS))
        /// Missing fields in persisted JSON indicate broken on-disk data, so
        /// throw CORRUPTED_DATA — consistent with the same condition in
        /// SerializationInfoByName::readText (was LOGICAL_ERROR).
        throw Exception(ErrorCodes::CORRUPTED_DATA,
            "Missed field '{}' or '{}' or '{}' in SerializationInfo of columns",
            KEY_KIND, KEY_NUM_DEFAULTS, KEY_NUM_ROWS);

    data.num_rows = object.getValue<size_t>(KEY_NUM_ROWS);
    data.num_defaults = object.getValue<size_t>(KEY_NUM_DEFAULTS);
    kind = ISerialization::stringToKind(object.getValue<String>(KEY_KIND));
}
|
||||
|
||||
/// Chooses the serialization kind from accumulated statistics: SPARSE when
/// the fraction of default values among all rows exceeds the configured
/// threshold, DEFAULT otherwise (including the empty case).
ISerialization::Kind SerializationInfo::chooseKind(const Data & data, const Settings & settings)
{
    double defaults_ratio = 0.0;
    if (data.num_rows)
        defaults_ratio = std::min(static_cast<double>(data.num_defaults) / data.num_rows, 1.0);

    if (defaults_ratio > settings.ratio_for_sparse)
        return ISerialization::Kind::SPARSE;

    return ISerialization::Kind::DEFAULT;
}
|
||||
|
||||
/// Creates a map entry with a freshly created (type-specific)
/// SerializationInfo for every listed column.
SerializationInfoByName::SerializationInfoByName(
    const NamesAndTypesList & columns,
    const SerializationInfo::Settings & settings)
{
    for (const auto & column : columns)
        emplace(column.name, column.type->createSerializationInfo(settings));
}
|
||||
|
||||
void SerializationInfoByName::add(const Block & block)
|
||||
{
|
||||
for (const auto & column : block)
|
||||
{
|
||||
auto it = find(column.name);
|
||||
if (it == end())
|
||||
throw Exception(ErrorCodes::LOGICAL_ERROR,
|
||||
"Not found column {} in serialization infos", column.name);
|
||||
|
||||
it->second->add(*column.column);
|
||||
}
|
||||
}
|
||||
|
||||
void SerializationInfoByName::add(const SerializationInfoByName & other)
|
||||
{
|
||||
for (const auto & [name, info] : other)
|
||||
{
|
||||
auto it = find(name);
|
||||
if (it == end())
|
||||
throw Exception(ErrorCodes::LOGICAL_ERROR,
|
||||
"Not found column {} in serialization infos", name);
|
||||
|
||||
it->second->add(*info);
|
||||
}
|
||||
}
|
||||
|
||||
/// Writes all infos as a single JSON document of the form
/// { "version": ..., "columns": [ { "name": ..., "kind": ..., ... }, ... ] }.
void SerializationInfoByName::writeText(WriteBuffer & out) const
{
    Poco::JSON::Object object;
    object.set(KEY_VERSION, SERIALIZATION_INFO_VERSION);

    Poco::JSON::Array column_infos;
    for (const auto & [name, info] : *this)
    {
        auto info_json = info->toJSON();
        info_json.set(KEY_NAME, name);
        column_infos.add(std::move(info_json));
    }

    object.set(KEY_COLUMNS, std::move(column_infos));

    std::ostringstream oss; // STYLE_CHECK_ALLOW_STD_STRING_STREAM
    oss.exceptions(std::ios::failbit);
    Poco::JSON::Stringifier::stringify(object, oss);

    /// Was `return writeString(...)` — returning a void expression from a
    /// void function; dropped the redundant `return`.
    writeString(oss.str(), out);
}
|
||||
|
||||
/// Parses the JSON document written by writeText() and feeds each
/// per-column object into the corresponding info's fromJSON().
/// Throws CORRUPTED_DATA if a column entry has no name or names a column
/// that is not present in this map.
///
/// NOTE(review): the previous text of this function mixed in remnants of the
/// pre-refactoring implementation (`number_of_rows`, `columns[name]`, and a
/// duplicate missing-field check) that reference members this class no longer
/// has; those lines are removed here. Per-field validation of KIND/NUM_*
/// happens inside SerializationInfo::fromJSON.
void SerializationInfoByName::readText(ReadBuffer & in)
{
    String json_str;
    readString(json_str, in);

    Poco::JSON::Parser parser;
    auto object = parser.parse(json_str).extract<Poco::JSON::Object::Ptr>();

    if (object->has(KEY_COLUMNS))
    {
        auto array = object->getArray(KEY_COLUMNS);
        for (const auto & elem : *array)
        {
            auto elem_object = elem.extract<Poco::JSON::Object::Ptr>();

            if (!elem_object->has(KEY_NAME))
                throw Exception(ErrorCodes::CORRUPTED_DATA,
                    "Missed field '{}' in SerializationInfo of columns", KEY_NAME);

            auto name = elem_object->getValue<String>(KEY_NAME);
            auto it = find(name);

            if (it == end())
                throw Exception(ErrorCodes::CORRUPTED_DATA,
                    "There is no column {} in serialization infos", name);

            it->second->fromJSON(*elem_object);
        }
    }
}
|
||||
|
||||
/// NOTE(review): this appears to be the pre-refactoring implementation kept in
/// this chunk alongside the newer `Poco::JSON::Object toJSON() const` above.
/// It reads members (`version`, `number_of_rows`, `columns`) that the new
/// variant does not use — presumably the removed side of the diff; confirm
/// before relying on it.
/// Serializes the whole per-column map into a JSON string.
String SerializationInfo::toJSON() const
{
    Poco::JSON::Object info;
    info.set(KEY_VERSION, version);
    info.set(KEY_NUMBER_OF_ROWS, number_of_rows);

    Poco::JSON::Array column_infos;
    for (const auto & [name, column_info] : columns)
    {
        Poco::JSON::Object column_info_json;
        column_info_json.set(KEY_NAME, name);
        column_info_json.set(KEY_KIND, ISerialization::kindToString(column_info.kind));
        column_info_json.set(KEY_NUM_DEFAULTS, column_info.num_defaults);
        column_infos.add(std::move(column_info_json));
    }

    info.set(KEY_COLUMNS, std::move(column_infos));

    std::ostringstream oss; // STYLE_CHECK_ALLOW_STD_STRING_STREAM
    oss.exceptions(std::ios::failbit);
    Poco::JSON::Stringifier::stringify(info, oss);

    return oss.str();
}
|
||||
|
||||
/// NOTE(review): pre-refactoring counterpart of the legacy String toJSON();
/// calls a String-taking fromJSON overload — presumably removed-side diff code.
/// Reads a JSON string from the buffer and parses it.
void SerializationInfo::readText(ReadBuffer & in)
{
    String json_str;
    readString(json_str, in);
    fromJSON(json_str);
}
|
||||
|
||||
/// NOTE(review): pre-refactoring counterpart of the legacy String toJSON();
/// presumably removed-side diff code. Writes the JSON string to the buffer.
void SerializationInfo::writeText(WriteBuffer & out) const
{
    writeString(toJSON(), out);
}
|
||||
|
||||
}
|
||||
|
@ -1,13 +1,21 @@
|
||||
#pragma once
|
||||
|
||||
#include <Core/Block.h>
|
||||
#include <Core/Types.h>
|
||||
#include <DataTypes/Serializations/ISerialization.h>
|
||||
#include <Columns/ColumnSparse.h>
|
||||
#include <Poco/JSON/Object.h>
|
||||
|
||||
|
||||
namespace DB
|
||||
{
|
||||
|
||||
/** Contains information about kinds of serialization of columns.
|
||||
class ReadBuffer;
|
||||
class WriteBuffer;
|
||||
class NamesAndTypesList;
|
||||
class Block;
|
||||
|
||||
constexpr auto SERIALIZATION_INFO_VERSION = 1;
|
||||
|
||||
/** Contains information about kind of serialization of column and its subcolumns.
|
||||
* Also contains information about content of columns,
|
||||
* that helps to choose kind of serialization of column.
|
||||
*
|
||||
@ -19,72 +27,70 @@ namespace DB
|
||||
class SerializationInfo
|
||||
{
|
||||
public:
|
||||
SerializationInfo() = default;
|
||||
|
||||
static constexpr auto version = 1;
|
||||
size_t getNumberOfDefaultRows(const String & column_name) const;
|
||||
ISerialization::Kind getKind(const String & column_name) const;
|
||||
|
||||
bool empty() const { return !number_of_rows && columns.empty(); }
|
||||
size_t getNumberOfRows() const { return number_of_rows; }
|
||||
|
||||
void readText(ReadBuffer & in);
|
||||
void writeText(WriteBuffer & out) const;
|
||||
|
||||
private:
|
||||
void fromJSON(const String & json_str);
|
||||
String toJSON() const;
|
||||
|
||||
/// Information about one column.
|
||||
/// Can be extended, when new kinds of serialization will be implemented.
|
||||
struct Column
|
||||
struct Data
|
||||
{
|
||||
ISerialization::Kind kind = ISerialization::Kind::DEFAULT;
|
||||
size_t num_rows = 0;
|
||||
size_t num_defaults = 0;
|
||||
|
||||
void add(const IColumn & column);
|
||||
void add(const Data & other);
|
||||
};
|
||||
|
||||
using NameToColumn = std::unordered_map<String, Column>;
|
||||
struct Settings
|
||||
{
|
||||
const double ratio_for_sparse = 1.0;
|
||||
const bool choose_kind = false;
|
||||
};
|
||||
|
||||
size_t number_of_rows = 0;
|
||||
NameToColumn columns;
|
||||
SerializationInfo(ISerialization::Kind kind_, const Settings & settings_);
|
||||
|
||||
friend class SerializationInfoBuilder;
|
||||
virtual ~SerializationInfo() = default;
|
||||
|
||||
virtual bool hasCustomSerialization() const { return kind != ISerialization::Kind::DEFAULT; }
|
||||
|
||||
virtual void add(const IColumn & column);
|
||||
virtual void add(const SerializationInfo & other);
|
||||
virtual void replaceData(const SerializationInfo & other);
|
||||
virtual std::shared_ptr<SerializationInfo> clone() const;
|
||||
|
||||
virtual void serialializeKindBinary(WriteBuffer & out) const;
|
||||
virtual void deserializeFromKindsBinary(ReadBuffer & in);
|
||||
|
||||
virtual Poco::JSON::Object toJSON() const;
|
||||
virtual void fromJSON(const Poco::JSON::Object & object);
|
||||
|
||||
const Settings & getSettings() const { return settings; }
|
||||
const Data & getData() const { return data; }
|
||||
ISerialization::Kind getKind() const { return kind; }
|
||||
|
||||
static ISerialization::Kind chooseKind(const Data & data, const Settings & settings);
|
||||
|
||||
protected:
|
||||
const Settings settings;
|
||||
|
||||
ISerialization::Kind kind;
|
||||
Data data;
|
||||
};
|
||||
|
||||
using SerializationInfoPtr = std::shared_ptr<SerializationInfo>;
|
||||
using SerializationInfoPtr = std::shared_ptr<const SerializationInfo>;
|
||||
using MutableSerializationInfoPtr = std::shared_ptr<SerializationInfo>;
|
||||
|
||||
/// Builder, that helps to create SerializationInfo.
|
||||
class SerializationInfoBuilder
|
||||
using SerializationInfos = std::vector<SerializationInfoPtr>;
|
||||
using MutableSerializationInfos = std::vector<MutableSerializationInfoPtr>;
|
||||
|
||||
class SerializationInfoByName : public std::unordered_map<String, MutableSerializationInfoPtr>
|
||||
{
|
||||
public:
|
||||
SerializationInfoBuilder();
|
||||
SerializationInfoBuilder(
|
||||
double ratio_for_sparse_serialization_,
|
||||
double default_rows_search_sample_ratio_ = ColumnSparse::DEFAULT_ROWS_SEARCH_SAMPLE_RATIO);
|
||||
SerializationInfoByName() = default;
|
||||
SerializationInfoByName(
|
||||
const NamesAndTypesList & columns,
|
||||
const SerializationInfo::Settings & settings);
|
||||
|
||||
/// Add information about column from block.
|
||||
void add(const Block & block);
|
||||
void add(const SerializationInfoByName & other);
|
||||
|
||||
/// Add information about column from other SerializationInfo.
|
||||
void add(const SerializationInfo & other);
|
||||
|
||||
/// Choose kind of serialization for every column
|
||||
/// according its content and return finalized SerializationInfo.
|
||||
SerializationInfoPtr build() &&;
|
||||
|
||||
/// Create SerializationInfo from other.
|
||||
/// Respects kinds of serialization for columns, that exist in other SerializationInfo,
|
||||
/// but keeps information about content of column from current SerializationInfo.
|
||||
SerializationInfoPtr buildFrom(const SerializationInfo & other) &&;
|
||||
|
||||
double getRatioForSparseSerialization() const { return ratio_for_sparse_serialization; }
|
||||
bool canHaveSparseSerialization() const { return ratio_for_sparse_serialization < 1.0; }
|
||||
|
||||
private:
|
||||
double ratio_for_sparse_serialization;
|
||||
double default_rows_search_sample_ratio;
|
||||
|
||||
SerializationInfoPtr info;
|
||||
void writeText(WriteBuffer & out) const;
|
||||
void readText(ReadBuffer & in);
|
||||
};
|
||||
|
||||
}
|
||||
|
116
src/DataTypes/Serializations/SerializationInfoTuple.cpp
Normal file
116
src/DataTypes/Serializations/SerializationInfoTuple.cpp
Normal file
@ -0,0 +1,116 @@
|
||||
#include <DataTypes/Serializations/SerializationInfoTuple.h>
|
||||
#include <DataTypes/DataTypeTuple.h>
|
||||
#include <Columns/ColumnTuple.h>
|
||||
#include <Common/assert_cast.h>
|
||||
|
||||
namespace DB
|
||||
{
|
||||
|
||||
namespace ErrorCodes
|
||||
{
|
||||
extern const int LOGICAL_ERROR;
|
||||
extern const int CORRUPTED_DATA;
|
||||
extern const int THERE_IS_NO_COLUMN;
|
||||
}
|
||||
|
||||
/// Info for a tuple column: the base class tracks statistics of the tuple
/// column as a whole (starting from the DEFAULT kind), while `elems` holds
/// one info per tuple element.
SerializationInfoTuple::SerializationInfoTuple(
    MutableSerializationInfos elems_, const Settings & settings_)
    : SerializationInfo(ISerialization::Kind::DEFAULT, settings_)
    , elems(std::move(elems_))
{
}
|
||||
|
||||
bool SerializationInfoTuple::hasCustomSerialization() const
|
||||
{
|
||||
return std::any_of(elems.begin(), elems.end(), [](const auto & elem) { return elem->hasCustomSerialization(); });
|
||||
}
|
||||
|
||||
void SerializationInfoTuple::add(const IColumn & column)
|
||||
{
|
||||
SerializationInfo::add(column);
|
||||
|
||||
const auto & column_tuple = assert_cast<const ColumnTuple &>(column);
|
||||
const auto & right_elems = column_tuple.getColumns();
|
||||
assert(elems.size() == right_elems.size());
|
||||
|
||||
for (size_t i = 0; i < elems.size(); ++i)
|
||||
elems[i]->add(*right_elems[i]);
|
||||
}
|
||||
|
||||
void SerializationInfoTuple::add(const SerializationInfo & other)
|
||||
{
|
||||
SerializationInfo::add(other);
|
||||
|
||||
const auto & info_tuple = assert_cast<const SerializationInfoTuple &>(other);
|
||||
assert(elems.size() == info_tuple.elems.size());
|
||||
|
||||
for (size_t i = 0; i < elems.size(); ++i)
|
||||
elems[i]->add(*info_tuple.elems[i]);
|
||||
}
|
||||
|
||||
void SerializationInfoTuple::replaceData(const SerializationInfo & other)
|
||||
{
|
||||
SerializationInfo::add(other);
|
||||
|
||||
const auto & info_tuple = assert_cast<const SerializationInfoTuple &>(other);
|
||||
assert(elems.size() == info_tuple.elems.size());
|
||||
|
||||
for (size_t i = 0; i < elems.size(); ++i)
|
||||
elems[i]->replaceData(*info_tuple.elems[i]);
|
||||
}
|
||||
/// Deep copy: clones every element info and wraps them in a new tuple info
/// with the same settings.
MutableSerializationInfoPtr SerializationInfoTuple::clone() const
{
    MutableSerializationInfos cloned;
    cloned.reserve(elems.size());
    for (const auto & elem : elems)
        cloned.push_back(elem->clone());

    return std::make_shared<SerializationInfoTuple>(std::move(cloned), settings);
}
|
||||
|
||||
/// Writes the kind of the tuple itself followed by the kinds of all
/// elements, in the order expected by deserializeFromKindsBinary().
void SerializationInfoTuple::serialializeKindBinary(WriteBuffer & out) const
{
    SerializationInfo::serialializeKindBinary(out);

    for (const auto & elem : elems)
        elem->serialializeKindBinary(out);
}
|
||||
|
||||
/// Reads the kind of the tuple itself followed by the kinds of all
/// elements, in the order written by serialializeKindBinary().
void SerializationInfoTuple::deserializeFromKindsBinary(ReadBuffer & in)
{
    SerializationInfo::deserializeFromKindsBinary(in);

    for (const auto & elem : elems)
        elem->deserializeFromKindsBinary(in);
}
|
||||
|
||||
/// Serializes the base info and appends a "subcolumns" array with one
/// JSON object per tuple element (counterpart of fromJSON).
Poco::JSON::Object SerializationInfoTuple::toJSON() const
{
    auto object = SerializationInfo::toJSON();

    Poco::JSON::Array elems_json;
    for (const auto & elem : elems)
        elems_json.add(elem->toJSON());

    object.set("subcolumns", std::move(elems_json));
    return object;
}
|
||||
|
||||
/// Restores the base info and per-element infos from a JSON object
/// produced by toJSON(). Throws if "subcolumns" is missing or its size
/// does not match the number of tuple elements.
void SerializationInfoTuple::fromJSON(const Poco::JSON::Object & object)
{
    SerializationInfo::fromJSON(object);

    if (!object.has("subcolumns"))
        /// Bug fix: the message had a '{}' placeholder with no argument.
        throw Exception(ErrorCodes::CORRUPTED_DATA,
            "Missed field '{}' in SerializationInfoTuple", "subcolumns");

    auto subcolumns = object.getArray("subcolumns");
    if (elems.size() != subcolumns->size())
        /// Message fixes: typo "beetween" and missing space between the
        /// concatenated literals.
        throw Exception(ErrorCodes::THERE_IS_NO_COLUMN,
            "Mismatched number of subcolumns between JSON and SerializationInfoTuple. "
            "Expected: {}, got: {}", elems.size(), subcolumns->size());

    for (size_t i = 0; i < elems.size(); ++i)
        elems[i]->fromJSON(*subcolumns->getObject(i));
}
|
||||
|
||||
}
|
31
src/DataTypes/Serializations/SerializationInfoTuple.h
Normal file
31
src/DataTypes/Serializations/SerializationInfoTuple.h
Normal file
@ -0,0 +1,31 @@
|
||||
#pragma once
|
||||
#include <DataTypes/Serializations/SerializationInfo.h>
|
||||
|
||||
namespace DB
|
||||
{
|
||||
|
||||
/// SerializationInfo for tuple columns: the base class tracks statistics of
/// the tuple column as a whole, while `elems` holds one info per tuple
/// element, so each element may get its own serialization kind.
class SerializationInfoTuple : public SerializationInfo
{
public:
    SerializationInfoTuple(MutableSerializationInfos elems_, const Settings & settings_);

    /// True if this info or any element info requires a non-default serialization.
    bool hasCustomSerialization() const override;
    void add(const IColumn & column) override;
    void add(const SerializationInfo & other) override;
    void replaceData(const SerializationInfo & other) override;

    MutableSerializationInfoPtr clone() const override;
    void serialializeKindBinary(WriteBuffer & out) const override;
    void deserializeFromKindsBinary(ReadBuffer & in) override;

    Poco::JSON::Object toJSON() const override;
    void fromJSON(const Poco::JSON::Object & object) override;

    /// Per-element accessors; `i` is indexed into `elems` without a bounds
    /// check, so it must be a valid element index.
    MutableSerializationInfoPtr getElementInfo(size_t i) const { return elems[i]; }
    ISerialization::Kind getElementKind(size_t i) const { return elems[i]->getKind(); }

private:
    MutableSerializationInfos elems;
};
|
||||
|
||||
}
|
@ -43,23 +43,23 @@ SerializationLowCardinality::SerializationLowCardinality(const DataTypePtr & dic
|
||||
void SerializationLowCardinality::enumerateStreams(
|
||||
SubstreamPath & path,
|
||||
const StreamCallback & callback,
|
||||
DataTypePtr type,
|
||||
ColumnPtr column) const
|
||||
const SubstreamData & data) const
|
||||
{
|
||||
const auto * column_lc = column ? &getColumnLowCardinality(*column) : nullptr;
|
||||
|
||||
SubstreamData data;
|
||||
data.type = type ? dictionary_type : nullptr;
|
||||
data.column = column_lc ? column_lc->getDictionary().getNestedColumn() : nullptr;
|
||||
data.serialization = dict_inner_serialization;
|
||||
const auto * column_lc = data.column ? &getColumnLowCardinality(*data.column) : nullptr;
|
||||
|
||||
path.push_back(Substream::DictionaryKeys);
|
||||
path.back().data = data;
|
||||
path.back().data =
|
||||
{
|
||||
dict_inner_serialization,
|
||||
data.type ? dictionary_type : nullptr,
|
||||
column_lc ? column_lc->getDictionary().getNestedColumn() : nullptr,
|
||||
data.serialization_info,
|
||||
};
|
||||
|
||||
dict_inner_serialization->enumerateStreams(path, callback, data.type, data.column);
|
||||
dict_inner_serialization->enumerateStreams(path, callback, path.back().data);
|
||||
|
||||
path.back() = Substream::DictionaryIndexes;
|
||||
path.back().data = {type, column, getPtr(), nullptr};
|
||||
path.back().data = data;
|
||||
|
||||
callback(path);
|
||||
path.pop_back();
|
||||
|
@ -20,8 +20,7 @@ public:
|
||||
void enumerateStreams(
|
||||
SubstreamPath & path,
|
||||
const StreamCallback & callback,
|
||||
DataTypePtr type,
|
||||
ColumnPtr column) const override;
|
||||
const SubstreamData & data) const override;
|
||||
|
||||
void serializeBinaryBulkStatePrefix(
|
||||
SerializeBinaryBulkSettings & settings,
|
||||
|
@ -254,13 +254,17 @@ void SerializationMap::deserializeTextCSV(IColumn & column, ReadBuffer & istr, c
|
||||
void SerializationMap::enumerateStreams(
|
||||
SubstreamPath & path,
|
||||
const StreamCallback & callback,
|
||||
DataTypePtr type,
|
||||
ColumnPtr column) const
|
||||
const SubstreamData & data) const
|
||||
{
|
||||
auto next_type = type ? assert_cast<const DataTypeMap &>(*type).getNestedType() : nullptr;
|
||||
auto next_column = column ? assert_cast<const ColumnMap &>(*column).getNestedColumnPtr() : nullptr;
|
||||
SubstreamData next_data =
|
||||
{
|
||||
nested,
|
||||
data.type ? assert_cast<const DataTypeMap &>(*data.type).getNestedType() : nullptr,
|
||||
data.column ? assert_cast<const ColumnMap &>(*data.column).getNestedColumnPtr() : nullptr,
|
||||
data.serialization_info,
|
||||
};
|
||||
|
||||
nested->enumerateStreams(path, callback, next_type, next_column);
|
||||
nested->enumerateStreams(path, callback, next_data);
|
||||
}
|
||||
|
||||
void SerializationMap::serializeBinaryBulkStatePrefix(
|
||||
|
@ -34,8 +34,7 @@ public:
|
||||
void enumerateStreams(
|
||||
SubstreamPath & path,
|
||||
const StreamCallback & callback,
|
||||
DataTypePtr type,
|
||||
ColumnPtr column) const override;
|
||||
const SubstreamData & data) const override;
|
||||
|
||||
void serializeBinaryBulkStatePrefix(
|
||||
SerializeBinaryBulkSettings & settings,
|
||||
|
@ -6,12 +6,13 @@ namespace DB
|
||||
void SerializationNamed::enumerateStreams(
|
||||
SubstreamPath & path,
|
||||
const StreamCallback & callback,
|
||||
DataTypePtr type,
|
||||
ColumnPtr column) const
|
||||
const SubstreamData & data) const
|
||||
{
|
||||
addToPath(path);
|
||||
path.back().data = {type, column, getPtr(), std::make_shared<SubcolumnCreator>(name, escape_delimiter)};
|
||||
nested_serialization->enumerateStreams(path, callback, type, column);
|
||||
path.back().data = data;
|
||||
path.back().creator = std::make_shared<SubcolumnCreator>(name, escape_delimiter);
|
||||
|
||||
nested_serialization->enumerateStreams(path, callback, data);
|
||||
path.pop_back();
|
||||
}
|
||||
|
||||
|
@ -23,8 +23,7 @@ public:
|
||||
void enumerateStreams(
|
||||
SubstreamPath & path,
|
||||
const StreamCallback & callback,
|
||||
DataTypePtr type,
|
||||
ColumnPtr column) const override;
|
||||
const SubstreamData & data) const override;
|
||||
|
||||
void serializeBinaryBulkStatePrefix(
|
||||
SerializeBinaryBulkSettings & settings,
|
||||
|
@ -41,30 +41,35 @@ ColumnPtr SerializationNullable::SubcolumnCreator::create(const ColumnPtr & prev
|
||||
void SerializationNullable::enumerateStreams(
|
||||
SubstreamPath & path,
|
||||
const StreamCallback & callback,
|
||||
DataTypePtr type,
|
||||
ColumnPtr column) const
|
||||
const SubstreamData & data) const
|
||||
{
|
||||
const auto * type_nullable = type ? &assert_cast<const DataTypeNullable &>(*type) : nullptr;
|
||||
const auto * column_nullable = column ? &assert_cast<const ColumnNullable &>(*column) : nullptr;
|
||||
const auto * type_nullable = data.type ? &assert_cast<const DataTypeNullable &>(*data.type) : nullptr;
|
||||
const auto * column_nullable = data.column ? &assert_cast<const ColumnNullable &>(*data.column) : nullptr;
|
||||
|
||||
path.push_back(Substream::NullMap);
|
||||
path.back().data =
|
||||
{
|
||||
std::make_shared<SerializationNamed>(std::make_shared<SerializationNumber<UInt8>>(), "null", false),
|
||||
type_nullable ? std::make_shared<DataTypeUInt8>() : nullptr,
|
||||
column_nullable ? column_nullable->getNullMapColumnPtr() : nullptr,
|
||||
std::make_shared<SerializationNamed>(std::make_shared<SerializationNumber<UInt8>>(), "null", false),
|
||||
nullptr,
|
||||
data.serialization_info,
|
||||
};
|
||||
|
||||
callback(path);
|
||||
|
||||
path.back() = Substream::NullableElements;
|
||||
path.back().data = {type, column, getPtr(), std::make_shared<SubcolumnCreator>(path.back().data.column)};
|
||||
path.back().creator = std::make_shared<SubcolumnCreator>(path.back().data.column);
|
||||
path.back().data = data;
|
||||
|
||||
auto next_type = type_nullable ? type_nullable->getNestedType() : nullptr;
|
||||
auto next_column = column_nullable ? column_nullable->getNestedColumnPtr() : nullptr;
|
||||
SubstreamData next_data =
|
||||
{
|
||||
nested,
|
||||
type_nullable ? type_nullable->getNestedType() : nullptr,
|
||||
column_nullable ? column_nullable->getNestedColumnPtr() : nullptr,
|
||||
data.serialization_info,
|
||||
};
|
||||
|
||||
nested->enumerateStreams(path, callback, next_type, next_column);
|
||||
nested->enumerateStreams(path, callback, next_data);
|
||||
path.pop_back();
|
||||
}
|
||||
|
||||
|
@ -16,8 +16,7 @@ public:
|
||||
void enumerateStreams(
|
||||
SubstreamPath & path,
|
||||
const StreamCallback & callback,
|
||||
DataTypePtr type,
|
||||
ColumnPtr column) const override;
|
||||
const SubstreamData & data) const override;
|
||||
|
||||
void serializeBinaryBulkStatePrefix(
|
||||
SerializeBinaryBulkSettings & settings,
|
||||
|
@ -150,27 +150,36 @@ ColumnPtr SerializationSparse::SubcolumnCreator::create(const ColumnPtr & prev)
|
||||
void SerializationSparse::enumerateStreams(
|
||||
SubstreamPath & path,
|
||||
const StreamCallback & callback,
|
||||
DataTypePtr type,
|
||||
ColumnPtr column) const
|
||||
const SubstreamData & data) const
|
||||
{
|
||||
const auto * column_sparse = column ? &assert_cast<const ColumnSparse &>(*column) : nullptr;
|
||||
const auto * column_sparse = data.column ? &assert_cast<const ColumnSparse &>(*data.column) : nullptr;
|
||||
|
||||
SubstreamData data;
|
||||
data.type = type ? std::make_shared<DataTypeUInt64>() : nullptr;
|
||||
data.serialization = std::make_shared<SerializationNumber<UInt64>>();
|
||||
data.column = column_sparse ? column_sparse->getOffsetsPtr() : nullptr;
|
||||
size_t column_size = column_sparse ? column_sparse->size() : 0;
|
||||
|
||||
path.push_back(Substream::SparseOffsets);
|
||||
path.back().data = data;
|
||||
path.back().data =
|
||||
{
|
||||
std::make_shared<SerializationNumber<UInt64>>(),
|
||||
data.type ? std::make_shared<DataTypeUInt64>() : nullptr,
|
||||
column_sparse ? column_sparse->getOffsetsPtr() : nullptr,
|
||||
data.serialization_info,
|
||||
};
|
||||
|
||||
callback(path);
|
||||
|
||||
path.back() = Substream::SparseElements;
|
||||
path.back().data = {type, column, getPtr(), std::make_shared<SubcolumnCreator>(data.column, column_size)};
|
||||
path.back().creator = std::make_shared<SubcolumnCreator>(path.back().data.column, column_size);
|
||||
path.back().data = data;
|
||||
|
||||
auto next_column = column_sparse ? column_sparse->getValuesPtr() : nullptr;
|
||||
nested->enumerateStreams(path, callback, type, next_column);
|
||||
SubstreamData next_data =
|
||||
{
|
||||
nested,
|
||||
data.type,
|
||||
column_sparse ? column_sparse->getValuesPtr() : nullptr,
|
||||
data.serialization_info,
|
||||
};
|
||||
|
||||
nested->enumerateStreams(path, callback, next_data);
|
||||
path.pop_back();
|
||||
}
|
||||
|
||||
|
@ -30,8 +30,7 @@ public:
|
||||
virtual void enumerateStreams(
|
||||
SubstreamPath & path,
|
||||
const StreamCallback & callback,
|
||||
DataTypePtr type,
|
||||
ColumnPtr column) const override;
|
||||
const SubstreamData & data) const override;
|
||||
|
||||
void serializeBinaryBulkStatePrefix(
|
||||
SerializeBinaryBulkSettings & settings,
|
||||
|
@ -1,6 +1,7 @@
|
||||
#include <base/map.h>
|
||||
#include <base/range.h>
|
||||
#include <DataTypes/Serializations/SerializationTuple.h>
|
||||
#include <DataTypes/Serializations/SerializationInfoTuple.h>
|
||||
#include <DataTypes/DataTypeTuple.h>
|
||||
#include <Core/Field.h>
|
||||
#include <Columns/ColumnTuple.h>
|
||||
@ -284,18 +285,23 @@ void SerializationTuple::deserializeTextCSV(IColumn & column, ReadBuffer & istr,
|
||||
void SerializationTuple::enumerateStreams(
|
||||
SubstreamPath & path,
|
||||
const StreamCallback & callback,
|
||||
DataTypePtr type,
|
||||
ColumnPtr column) const
|
||||
const SubstreamData & data) const
|
||||
{
|
||||
const auto * type_tuple = type ? &assert_cast<const DataTypeTuple &>(*type) : nullptr;
|
||||
const auto * column_tuple = column ? &assert_cast<const ColumnTuple &>(*column) : nullptr;
|
||||
const auto * type_tuple = data.type ? &assert_cast<const DataTypeTuple &>(*data.type) : nullptr;
|
||||
const auto * column_tuple = data.column ? &assert_cast<const ColumnTuple &>(*data.column) : nullptr;
|
||||
const auto * info_tuple = data.serialization_info ? &assert_cast<const SerializationInfoTuple &>(*data.serialization_info) : nullptr;
|
||||
|
||||
for (size_t i = 0; i < elems.size(); ++i)
|
||||
{
|
||||
auto next_type = type_tuple ? type_tuple->getElement(i) : nullptr;
|
||||
auto next_column = column_tuple ? column_tuple->getColumnPtr(i) : nullptr;
|
||||
SubstreamData next_data =
|
||||
{
|
||||
elems[i],
|
||||
type_tuple ? type_tuple->getElement(i) : nullptr,
|
||||
column_tuple ? column_tuple->getColumnPtr(i) : nullptr,
|
||||
info_tuple ? info_tuple->getElementInfo(i) : nullptr,
|
||||
};
|
||||
|
||||
elems[i]->enumerateStreams(path, callback, next_type, next_column);
|
||||
elems[i]->enumerateStreams(path, callback, next_data);
|
||||
}
|
||||
}
|
||||
|
||||
|
@ -36,8 +36,7 @@ public:
|
||||
void enumerateStreams(
|
||||
SubstreamPath & path,
|
||||
const StreamCallback & callback,
|
||||
DataTypePtr type,
|
||||
ColumnPtr column) const override;
|
||||
const SubstreamData & data) const override;
|
||||
|
||||
void serializeBinaryBulkStatePrefix(
|
||||
SerializeBinaryBulkSettings & settings,
|
||||
|
@ -7,10 +7,9 @@ namespace DB
|
||||
void SerializationWrapper::enumerateStreams(
|
||||
SubstreamPath & path,
|
||||
const StreamCallback & callback,
|
||||
DataTypePtr type,
|
||||
ColumnPtr column) const
|
||||
const SubstreamData & data) const
|
||||
{
|
||||
nested_serialization->enumerateStreams(path, callback, type, column);
|
||||
nested_serialization->enumerateStreams(path, callback, data);
|
||||
}
|
||||
|
||||
void SerializationWrapper::serializeBinaryBulkStatePrefix(
|
||||
|
@ -23,8 +23,7 @@ public:
|
||||
void enumerateStreams(
|
||||
SubstreamPath & path,
|
||||
const StreamCallback & callback,
|
||||
DataTypePtr type,
|
||||
ColumnPtr column) const override;
|
||||
const SubstreamData & data) const override;
|
||||
|
||||
void serializeBinaryBulkStatePrefix(
|
||||
SerializeBinaryBulkSettings & settings,
|
||||
|
@ -149,16 +149,14 @@ Block NativeReader::read()
|
||||
SerializationPtr serialization;
|
||||
if (server_revision >= DBMS_MIN_REVISION_WITH_CUSTOM_SERIALIZATION)
|
||||
{
|
||||
serialization = column.type->getSerialization(column.name, [&](const String & /*name*/)
|
||||
{
|
||||
UInt8 kind_num;
|
||||
readBinary(kind_num, istr);
|
||||
auto kind = magic_enum::enum_cast<ISerialization::Kind>(kind_num);
|
||||
if (!kind)
|
||||
throw Exception(ErrorCodes::LOGICAL_ERROR, "Unknown serialization kind " + std::to_string(kind_num));
|
||||
auto info = column.type->createSerializationInfo({});
|
||||
|
||||
return *kind;
|
||||
});
|
||||
UInt8 has_custom;
|
||||
readBinary(has_custom, istr);
|
||||
if (has_custom)
|
||||
info->deserializeFromKindsBinary(istr);
|
||||
|
||||
serialization = column.type->getSerialization(*info);
|
||||
}
|
||||
else
|
||||
{
|
||||
|
@ -4,6 +4,7 @@
|
||||
#include <IO/WriteHelpers.h>
|
||||
#include <IO/VarInt.h>
|
||||
#include <Compression/CompressedWriteBuffer.h>
|
||||
#include <DataTypes/Serializations/SerializationInfo.h>
|
||||
|
||||
#include <Formats/MarkInCompressedFile.h>
|
||||
#include <Formats/NativeWriter.h>
|
||||
@ -125,18 +126,13 @@ void NativeWriter::write(const Block & block)
|
||||
SerializationPtr serialization;
|
||||
if (client_revision >= DBMS_MIN_REVISION_WITH_CUSTOM_SERIALIZATION)
|
||||
{
|
||||
serialization = column.type->getSerialization(column.name, [&](const String & name)
|
||||
{
|
||||
auto split = Nested::splitName(name);
|
||||
ISerialization::Kind kind;
|
||||
if (!split.second.empty() && column.type->tryGetSubcolumnType(split.second))
|
||||
kind = ISerialization::getKind(*column.type->getSubcolumn(split.second, column.column));
|
||||
else
|
||||
kind = ISerialization::getKind(*column.column);
|
||||
auto info = column.column->getSerializationInfo();
|
||||
serialization = column.type->getSerialization(*info);
|
||||
|
||||
writeBinary(static_cast<UInt8>(kind), ostr);
|
||||
return kind;
|
||||
});
|
||||
bool has_custom = info->hasCustomSerialization();
|
||||
writeBinary(static_cast<UInt8>(has_custom), ostr);
|
||||
if (has_custom)
|
||||
info->serialializeKindBinary(ostr);
|
||||
}
|
||||
else
|
||||
{
|
||||
|
@ -150,7 +150,7 @@ BlockIO InterpreterDescribeQuery::execute()
|
||||
res_columns[6]->insertDefault();
|
||||
|
||||
res_columns[7]->insert(1u);
|
||||
}, column.type->getDefaultSerialization(), column.type, nullptr);
|
||||
}, {column.type->getDefaultSerialization(), column.type, nullptr, nullptr});
|
||||
}
|
||||
}
|
||||
|
||||
|
@ -160,7 +160,7 @@ void PrettyBlockOutputFormat::write(const Chunk & chunk, PortKind port_kind)
|
||||
|
||||
Serializations serializations(num_columns);
|
||||
for (size_t i = 0; i < num_columns; ++i)
|
||||
serializations[i] = header.getByPosition(i).type->getSerialization(*columns[i]);
|
||||
serializations[i] = header.getByPosition(i).type->getSerialization(*columns[i]->getSerializationInfo());
|
||||
|
||||
WidthsPerColumn widths;
|
||||
Widths max_widths;
|
||||
|
@ -26,7 +26,7 @@ void PrettySpaceBlockOutputFormat::write(const Chunk & chunk, PortKind port_kind
|
||||
|
||||
Serializations serializations(num_columns);
|
||||
for (size_t i = 0; i < num_columns; ++i)
|
||||
serializations[i] = header.getByPosition(i).type->getSerialization(*columns[i]);
|
||||
serializations[i] = header.getByPosition(i).type->getSerialization(*columns[i]->getSerializationInfo());
|
||||
|
||||
WidthsPerColumn widths;
|
||||
Widths max_widths;
|
||||
|
@ -588,8 +588,7 @@ MergeTreeData::MutableDataPartPtr Fetcher::downloadPartToMemory(
|
||||
metadata_snapshot->projections.get(projection_name).metadata,
|
||||
block.getNamesAndTypesList(),
|
||||
{},
|
||||
CompressionCodecFactory::instance().get("NONE", {}),
|
||||
new_data_part->serialization_info);
|
||||
CompressionCodecFactory::instance().get("NONE", {}));
|
||||
|
||||
part_out.write(block);
|
||||
part_out.writeSuffixAndFinalizePart(new_projection_part);
|
||||
@ -613,8 +612,7 @@ MergeTreeData::MutableDataPartPtr Fetcher::downloadPartToMemory(
|
||||
|
||||
MergedBlockOutputStream part_out(
|
||||
new_data_part, metadata_snapshot, block.getNamesAndTypesList(), {},
|
||||
CompressionCodecFactory::instance().get("NONE", {}),
|
||||
new_data_part->serialization_info);
|
||||
CompressionCodecFactory::instance().get("NONE", {}));
|
||||
|
||||
part_out.write(block);
|
||||
part_out.writeSuffixAndFinalizePart(new_data_part);
|
||||
|
@ -281,7 +281,6 @@ IMergeTreeDataPart::IMergeTreeDataPart(
|
||||
, volume(parent_part_ ? parent_part_->volume : volume_)
|
||||
, relative_path(relative_path_.value_or(name_))
|
||||
, index_granularity_info(storage_, part_type_)
|
||||
, serialization_info(std::make_shared<SerializationInfo>())
|
||||
, part_type(part_type_)
|
||||
, parent_part(parent_part_)
|
||||
{
|
||||
@ -307,7 +306,6 @@ IMergeTreeDataPart::IMergeTreeDataPart(
|
||||
, volume(parent_part_ ? parent_part_->volume : volume_)
|
||||
, relative_path(relative_path_.value_or(name_))
|
||||
, index_granularity_info(storage_, part_type_)
|
||||
, serialization_info(std::make_shared<SerializationInfo>())
|
||||
, part_type(part_type_)
|
||||
, parent_part(parent_part_)
|
||||
{
|
||||
@ -408,17 +406,47 @@ std::pair<time_t, time_t> IMergeTreeDataPart::getMinMaxTime() const
|
||||
}
|
||||
|
||||
|
||||
void IMergeTreeDataPart::setColumns(const NamesAndTypesList & new_columns)
|
||||
void IMergeTreeDataPart::setColumns(const NamesAndTypesList & new_columns, const SerializationInfoByName & new_infos)
|
||||
{
|
||||
columns = new_columns;
|
||||
|
||||
column_name_to_position.clear();
|
||||
column_name_to_position.reserve(new_columns.size());
|
||||
size_t pos = 0;
|
||||
|
||||
for (const auto & column : columns)
|
||||
{
|
||||
auto & serialization = serializations[column.name];
|
||||
column_name_to_position.emplace(column.name, pos);
|
||||
for (const auto & subcolumn : column.type->getSubcolumnNames())
|
||||
column_name_to_position.emplace(Nested::concatenateName(column.name, subcolumn), pos);
|
||||
|
||||
auto it = new_infos.find(column.name);
|
||||
if (it != new_infos.end())
|
||||
{
|
||||
auto & old_info = serialization_infos[column.name];
|
||||
const auto & new_info = it->second;
|
||||
|
||||
if (old_info)
|
||||
{
|
||||
old_info->replaceData(*new_info);
|
||||
}
|
||||
else
|
||||
{
|
||||
old_info = new_info->clone();
|
||||
serialization = column.type->getSerialization(*old_info);
|
||||
}
|
||||
}
|
||||
else
|
||||
{
|
||||
serialization = column.type->getDefaultSerialization();
|
||||
}
|
||||
|
||||
IDataType::forEachSubcolumn([&](const auto &, const auto & subname, const auto & subdata)
|
||||
{
|
||||
auto subcolumn_name = Nested::concatenateName(column.name, subname);
|
||||
column_name_to_position.emplace(subcolumn_name, pos);
|
||||
serializations.emplace(subcolumn_name, subdata.serialization);
|
||||
}, {serialization, column.type, nullptr, nullptr});
|
||||
|
||||
++pos;
|
||||
}
|
||||
}
|
||||
@ -589,7 +617,6 @@ void IMergeTreeDataPart::loadColumnsChecksumsIndexes(bool require_columns_checks
|
||||
loadUUID();
|
||||
loadColumns(require_columns_checksums);
|
||||
loadChecksums(require_columns_checksums);
|
||||
loadSerializationInfo();
|
||||
loadIndexGranularity();
|
||||
calculateColumnsAndSecondaryIndicesSizesOnDisk();
|
||||
loadIndex(); /// Must be called after loadIndexGranularity as it uses the value of `index_granularity`
|
||||
@ -655,13 +682,13 @@ void IMergeTreeDataPart::loadIndex()
|
||||
|
||||
size_t marks_count = index_granularity.getMarksCount();
|
||||
|
||||
Serializations serializations(key_size);
|
||||
Serializations key_serializations(key_size);
|
||||
for (size_t j = 0; j < key_size; ++j)
|
||||
serializations[j] = primary_key.data_types[j]->getDefaultSerialization();
|
||||
key_serializations[j] = primary_key.data_types[j]->getDefaultSerialization();
|
||||
|
||||
for (size_t i = 0; i < marks_count; ++i) //-V756
|
||||
for (size_t j = 0; j < key_size; ++j)
|
||||
serializations[j]->deserializeBinary(*loaded_index[j], *index_file);
|
||||
key_serializations[j]->deserializeBinary(*loaded_index[j], *index_file);
|
||||
|
||||
for (size_t i = 0; i < key_size; ++i)
|
||||
{
|
||||
@ -752,9 +779,8 @@ CompressionCodecPtr IMergeTreeDataPart::detectDefaultCompressionCodec() const
|
||||
auto column_size = getColumnSize(part_column.name, *part_column.type);
|
||||
if (column_size.data_compressed != 0 && !storage_columns.hasCompressionCodec(part_column.name))
|
||||
{
|
||||
auto serialization = IDataType::getSerialization(part_column, *serialization_info);
|
||||
String path_to_data_file;
|
||||
serialization->enumerateStreams([&](const ISerialization::SubstreamPath & substream_path)
|
||||
serializations.at(part_column.name)->enumerateStreams([&](const ISerialization::SubstreamPath & substream_path)
|
||||
{
|
||||
if (path_to_data_file.empty())
|
||||
{
|
||||
@ -888,7 +914,7 @@ void IMergeTreeDataPart::loadRowsCount()
|
||||
/// Most trivial types
|
||||
if (column.type->isValueRepresentedByNumber()
|
||||
&& !column.type->haveSubtypes()
|
||||
&& getSerializationForColumn(column)->getKind() == ISerialization::Kind::DEFAULT)
|
||||
&& getSerialization(column.name)->getKind() == ISerialization::Kind::DEFAULT)
|
||||
{
|
||||
auto size = getColumnSize(column.name, *column.type);
|
||||
|
||||
@ -940,7 +966,7 @@ void IMergeTreeDataPart::loadRowsCount()
|
||||
|
||||
for (const NameAndTypePair & column : columns)
|
||||
{
|
||||
ColumnPtr column_col = column.type->createColumn(*getSerializationForColumn(column));
|
||||
ColumnPtr column_col = column.type->createColumn(*serializations.at(column.name));
|
||||
if (!column_col->isFixedAndContiguous() || column_col->lowCardinality())
|
||||
continue;
|
||||
|
||||
@ -1014,16 +1040,6 @@ void IMergeTreeDataPart::loadUUID()
|
||||
}
|
||||
}
|
||||
|
||||
void IMergeTreeDataPart::loadSerializationInfo() const
|
||||
{
|
||||
String path = getFullRelativePath() + SERIALIZATION_FILE_NAME;
|
||||
if (volume->getDisk()->exists(path))
|
||||
{
|
||||
auto in = openForReading(volume->getDisk(), path);
|
||||
serialization_info->readText(*in);
|
||||
}
|
||||
}
|
||||
|
||||
void IMergeTreeDataPart::loadColumns(bool require)
|
||||
{
|
||||
String path = fs::path(getFullRelativePath()) / "columns.txt";
|
||||
@ -1058,7 +1074,18 @@ void IMergeTreeDataPart::loadColumns(bool require)
|
||||
loaded_columns.readText(*volume->getDisk()->readFile(path));
|
||||
}
|
||||
|
||||
setColumns(loaded_columns);
|
||||
SerializationInfo::Settings settings =
|
||||
{
|
||||
.ratio_for_sparse = storage.getSettings()->ratio_of_defaults_for_sparse_serialization,
|
||||
.choose_kind = false,
|
||||
};
|
||||
|
||||
SerializationInfoByName infos(loaded_columns, settings);
|
||||
path = getFullRelativePath() + SERIALIZATION_FILE_NAME;
|
||||
if (volume->getDisk()->exists(path))
|
||||
infos.readText(*volume->getDisk()->readFile(path));
|
||||
|
||||
setColumns(loaded_columns, infos);
|
||||
}
|
||||
|
||||
bool IMergeTreeDataPart::shallParticipateInMerges(const StoragePolicyPtr & storage_policy) const
|
||||
@ -1563,11 +1590,6 @@ bool IMergeTreeDataPart::checkAllTTLCalculated(const StorageMetadataPtr & metada
|
||||
return true;
|
||||
}
|
||||
|
||||
SerializationPtr IMergeTreeDataPart::getSerializationForColumn(const NameAndTypePair & column) const
|
||||
{
|
||||
return IDataType::getSerialization(column, *serialization_info);
|
||||
}
|
||||
|
||||
String IMergeTreeDataPart::getUniqueId() const
|
||||
{
|
||||
auto disk = volume->getDisk();
|
||||
|
@ -93,7 +93,6 @@ public:
|
||||
const StorageMetadataPtr & metadata_snapshot,
|
||||
const std::vector<MergeTreeIndexPtr> & indices_to_recalc,
|
||||
const CompressionCodecPtr & default_codec_,
|
||||
const SerializationInfoPtr & serialization_info_,
|
||||
const MergeTreeWriterSettings & writer_settings,
|
||||
const MergeTreeIndexGranularity & computed_index_granularity = {}) const = 0;
|
||||
|
||||
@ -127,9 +126,12 @@ public:
|
||||
|
||||
String getTypeName() const { return getType().toString(); }
|
||||
|
||||
void setColumns(const NamesAndTypesList & new_columns);
|
||||
void setColumns(const NamesAndTypesList & new_columns, const SerializationInfoByName & new_infos = {});
|
||||
|
||||
const NamesAndTypesList & getColumns() const { return columns; }
|
||||
const SerializationInfoByName & getSerializationInfos() const { return serialization_infos; }
|
||||
const SerializationByName & getSerializations() const { return serializations; }
|
||||
const SerializationPtr & getSerialization(const String & column_name) const { return serializations.at(column_name); }
|
||||
|
||||
/// Throws an exception if part is not stored in on-disk format.
|
||||
void assertOnDisk() const;
|
||||
@ -190,9 +192,6 @@ public:
|
||||
mutable String relative_path;
|
||||
MergeTreeIndexGranularityInfo index_granularity_info;
|
||||
|
||||
/// TODO: add comment
|
||||
SerializationInfoPtr serialization_info;
|
||||
|
||||
size_t rows_count = 0;
|
||||
|
||||
time_t modification_time = 0;
|
||||
@ -399,8 +398,8 @@ public:
|
||||
/// part creation (using alter query with materialize_ttl setting).
|
||||
bool checkAllTTLCalculated(const StorageMetadataPtr & metadata_snapshot) const;
|
||||
|
||||
/// Returns serialization for column according to serialization_info.
|
||||
SerializationPtr getSerializationForColumn(const NameAndTypePair & column) const;
|
||||
// /// Returns serialization for column according to serialization_info.
|
||||
// SerializationPtr getSerializationForColumn(const NameAndTypePair & column) const;
|
||||
|
||||
/// Return some uniq string for file
|
||||
/// Required for distinguish different copies of the same part on S3
|
||||
@ -424,6 +423,11 @@ protected:
|
||||
|
||||
/// Columns description. Cannot be changed, after part initialization.
|
||||
NamesAndTypesList columns;
|
||||
|
||||
SerializationInfoByName serialization_infos;
|
||||
|
||||
SerializationByName serializations;
|
||||
|
||||
const Type part_type;
|
||||
|
||||
/// Not null when it's a projection part.
|
||||
@ -470,8 +474,6 @@ private:
|
||||
/// Loads ttl infos in json format from file ttl.txt. If file doesn't exists assigns ttl infos with all zeros
|
||||
void loadTTLInfos();
|
||||
|
||||
void loadSerializationInfo() const;
|
||||
|
||||
void loadPartitionAndMinMaxIndex();
|
||||
|
||||
void calculateColumnsSizesOnDisk();
|
||||
|
@ -40,6 +40,7 @@ IMergeTreeReader::IMergeTreeReader(
|
||||
, storage(data_part_->storage)
|
||||
, metadata_snapshot(metadata_snapshot_)
|
||||
, all_mark_ranges(all_mark_ranges_)
|
||||
, serializations(data_part_->getSerializations())
|
||||
, alter_conversions(storage.getAlterConversionsForPart(data_part))
|
||||
{
|
||||
if (isWidePart(data_part))
|
||||
|
@ -87,8 +87,7 @@ protected:
|
||||
using ColumnPosition = std::optional<size_t>;
|
||||
ColumnPosition findColumnForOffsets(const String & column_name) const;
|
||||
|
||||
using Serializations = std::map<std::string, SerializationPtr>;
|
||||
Serializations serializations;
|
||||
const SerializationByName & serializations;
|
||||
|
||||
friend class MergeTreeRangeReader::DelayedStream;
|
||||
|
||||
|
@ -8,14 +8,16 @@ namespace DB
|
||||
IMergedBlockOutputStream::IMergedBlockOutputStream(
|
||||
const MergeTreeDataPartPtr & data_part,
|
||||
const StorageMetadataPtr & metadata_snapshot_,
|
||||
const SerializationInfoPtr & input_serialization_info_)
|
||||
const NamesAndTypesList & columns_list,
|
||||
bool reset_columns_)
|
||||
: storage(data_part->storage)
|
||||
, metadata_snapshot(metadata_snapshot_)
|
||||
, volume(data_part->volume)
|
||||
, part_path(data_part->isStoredOnDisk() ? data_part->getFullRelativePath() : "")
|
||||
, input_serialization_info(input_serialization_info_)
|
||||
, new_serialization_info(data_part->storage.getSettings()->ratio_of_defaults_for_sparse_serialization)
|
||||
, reset_columns(reset_columns_)
|
||||
{
|
||||
if (reset_columns)
|
||||
new_serialization_infos = SerializationInfoByName(columns_list, {});
|
||||
}
|
||||
|
||||
NameSet IMergedBlockOutputStream::removeEmptyColumnsFromPart(
|
||||
@ -32,18 +34,14 @@ NameSet IMergedBlockOutputStream::removeEmptyColumnsFromPart(
|
||||
|
||||
/// Collect counts for shared streams of different columns. As an example, Nested columns have shared stream with array sizes.
|
||||
std::map<String, size_t> stream_counts;
|
||||
std::unordered_map<String, SerializationPtr> serialziations;
|
||||
for (const NameAndTypePair & column : columns)
|
||||
const auto & serializations = data_part->getSerializations();
|
||||
for (const auto & column : columns)
|
||||
{
|
||||
auto serialization = IDataType::getSerialization(column, *data_part->serialization_info);
|
||||
|
||||
serialization->enumerateStreams(
|
||||
serializations.at(column.name)->enumerateStreams(
|
||||
[&](const ISerialization::SubstreamPath & substream_path)
|
||||
{
|
||||
++stream_counts[ISerialization::getFileNameForStream(column, substream_path)];
|
||||
});
|
||||
|
||||
serialziations[column.name] = std::move(serialization);
|
||||
}
|
||||
|
||||
NameSet remove_files;
|
||||
@ -65,7 +63,7 @@ NameSet IMergedBlockOutputStream::removeEmptyColumnsFromPart(
|
||||
}
|
||||
};
|
||||
|
||||
serialziations[column_name]->enumerateStreams(callback);
|
||||
serializations.at(column_name)->enumerateStreams(callback);
|
||||
}
|
||||
|
||||
/// Remove files on disk and checksums
|
||||
|
@ -14,7 +14,8 @@ public:
|
||||
IMergedBlockOutputStream(
|
||||
const MergeTreeDataPartPtr & data_part,
|
||||
const StorageMetadataPtr & metadata_snapshot_,
|
||||
const SerializationInfoPtr & input_serialization_info_);
|
||||
const NamesAndTypesList & columns_list,
|
||||
bool reset_columns_);
|
||||
|
||||
virtual ~IMergedBlockOutputStream() = default;
|
||||
|
||||
@ -46,8 +47,9 @@ protected:
|
||||
String part_path;
|
||||
|
||||
IMergeTreeDataPart::MergeTreeWriterPtr writer;
|
||||
SerializationInfoPtr input_serialization_info;
|
||||
SerializationInfoBuilder new_serialization_info;
|
||||
|
||||
bool reset_columns = false;
|
||||
SerializationInfoByName new_serialization_infos;
|
||||
};
|
||||
|
||||
using IMergedBlockOutputStreamPtr = std::shared_ptr<IMergedBlockOutputStream>;
|
||||
|
@ -158,15 +158,19 @@ bool MergeTask::ExecuteAndFinalizeHorizontalPart::prepare()
|
||||
global_ctx->parent_part);
|
||||
|
||||
global_ctx->new_data_part->uuid = global_ctx->future_part->uuid;
|
||||
global_ctx->new_data_part->setColumns(global_ctx->storage_columns);
|
||||
global_ctx->new_data_part->partition.assign(global_ctx->future_part->getPartition());
|
||||
global_ctx->new_data_part->is_temp = global_ctx->parent_part == nullptr;
|
||||
|
||||
ctx->need_remove_expired_values = false;
|
||||
ctx->force_ttl = false;
|
||||
|
||||
SerializationInfoBuilder serialization_info_builder(
|
||||
global_ctx->data->getSettings()->ratio_of_defaults_for_sparse_serialization);
|
||||
SerializationInfo::Settings info_settings =
|
||||
{
|
||||
.ratio_for_sparse = global_ctx->data->getSettings()->ratio_of_defaults_for_sparse_serialization,
|
||||
.choose_kind = true,
|
||||
};
|
||||
|
||||
SerializationInfoByName infos(global_ctx->storage_columns, info_settings);
|
||||
|
||||
for (const auto & part : global_ctx->future_part->parts)
|
||||
{
|
||||
@ -178,10 +182,10 @@ bool MergeTask::ExecuteAndFinalizeHorizontalPart::prepare()
|
||||
ctx->force_ttl = true;
|
||||
}
|
||||
|
||||
serialization_info_builder.add(*part->serialization_info);
|
||||
infos.add(part->getSerializationInfos());
|
||||
}
|
||||
|
||||
global_ctx->input_serialization_info = std::move(serialization_info_builder).build();
|
||||
global_ctx->new_data_part->setColumns(global_ctx->storage_columns, infos);
|
||||
|
||||
const auto & local_part_min_ttl = global_ctx->new_data_part->ttl_infos.part_min_ttl;
|
||||
if (local_part_min_ttl && local_part_min_ttl <= global_ctx->time_of_merge)
|
||||
@ -256,7 +260,7 @@ bool MergeTask::ExecuteAndFinalizeHorizontalPart::prepare()
|
||||
global_ctx->merging_columns,
|
||||
MergeTreeIndexFactory::instance().getMany(global_ctx->metadata_snapshot->getSecondaryIndices()),
|
||||
ctx->compression_codec,
|
||||
global_ctx->input_serialization_info,
|
||||
/*reset_columns=*/ true,
|
||||
ctx->blocks_are_granules_size);
|
||||
|
||||
global_ctx->rows_written = 0;
|
||||
@ -435,7 +439,6 @@ void MergeTask::VerticalMergeStage::prepareVerticalMergeForOneColumn() const
|
||||
global_ctx->metadata_snapshot,
|
||||
ctx->executor->getHeader(),
|
||||
ctx->compression_codec,
|
||||
global_ctx->input_serialization_info,
|
||||
/// we don't need to recalc indices here
|
||||
/// because all of them were already recalculated and written
|
||||
/// as key part of vertical merge
|
||||
|
@ -157,7 +157,6 @@ private:
|
||||
|
||||
SyncGuardPtr sync_guard{nullptr};
|
||||
MergeTreeData::MutableDataPartPtr new_data_part{nullptr};
|
||||
SerializationInfoPtr input_serialization_info{nullptr};
|
||||
|
||||
size_t rows_written{0};
|
||||
UInt64 watch_prev_elapsed{0};
|
||||
|
@ -59,7 +59,6 @@ IMergeTreeDataPart::MergeTreeWriterPtr MergeTreeDataPartCompact::getWriter(
|
||||
const StorageMetadataPtr & metadata_snapshot,
|
||||
const std::vector<MergeTreeIndexPtr> & indices_to_recalc,
|
||||
const CompressionCodecPtr & default_codec_,
|
||||
const SerializationInfoPtr & serialization_info_,
|
||||
const MergeTreeWriterSettings & writer_settings,
|
||||
const MergeTreeIndexGranularity & computed_index_granularity) const
|
||||
{
|
||||
@ -74,7 +73,7 @@ IMergeTreeDataPart::MergeTreeWriterPtr MergeTreeDataPartCompact::getWriter(
|
||||
return std::make_unique<MergeTreeDataPartWriterCompact>(
|
||||
shared_from_this(), ordered_columns_list, metadata_snapshot,
|
||||
indices_to_recalc, index_granularity_info.marks_file_extension,
|
||||
default_codec_, serialization_info_, writer_settings, computed_index_granularity);
|
||||
default_codec_, writer_settings, computed_index_granularity);
|
||||
}
|
||||
|
||||
|
||||
|
@ -51,7 +51,6 @@ public:
|
||||
const StorageMetadataPtr & metadata_snapshot,
|
||||
const std::vector<MergeTreeIndexPtr> & indices_to_recalc,
|
||||
const CompressionCodecPtr & default_codec_,
|
||||
const SerializationInfoPtr & serialization_info_,
|
||||
const MergeTreeWriterSettings & writer_settings,
|
||||
const MergeTreeIndexGranularity & computed_index_granularity) const override;
|
||||
|
||||
|
@ -60,7 +60,6 @@ IMergeTreeDataPart::MergeTreeWriterPtr MergeTreeDataPartInMemory::getWriter(
|
||||
const StorageMetadataPtr & metadata_snapshot,
|
||||
const std::vector<MergeTreeIndexPtr> & /* indices_to_recalc */,
|
||||
const CompressionCodecPtr & /* default_codec */,
|
||||
const SerializationInfoPtr & /* serialization_info */,
|
||||
const MergeTreeWriterSettings & writer_settings,
|
||||
const MergeTreeIndexGranularity & /* computed_index_granularity */) const
|
||||
{
|
||||
@ -92,7 +91,7 @@ void MergeTreeDataPartInMemory::flushToDisk(const String & base_path, const Stri
|
||||
|
||||
auto compression_codec = storage.getContext()->chooseCompressionCodec(0, 0);
|
||||
auto indices = MergeTreeIndexFactory::instance().getMany(metadata_snapshot->getSecondaryIndices());
|
||||
MergedBlockOutputStream out(new_data_part, metadata_snapshot, columns, indices, compression_codec, new_data_part->serialization_info);
|
||||
MergedBlockOutputStream out(new_data_part, metadata_snapshot, columns, indices, compression_codec);
|
||||
out.write(block);
|
||||
const auto & projections = metadata_snapshot->getProjections();
|
||||
for (const auto & [projection_name, projection] : projection_parts)
|
||||
@ -123,7 +122,7 @@ void MergeTreeDataPartInMemory::flushToDisk(const String & base_path, const Stri
|
||||
auto projection_indices = MergeTreeIndexFactory::instance().getMany(desc.metadata->getSecondaryIndices());
|
||||
MergedBlockOutputStream projection_out(
|
||||
projection_data_part, desc.metadata, projection_part->columns, projection_indices,
|
||||
projection_compression_codec, new_data_part->serialization_info);
|
||||
projection_compression_codec);
|
||||
|
||||
projection_out.write(projection_part->block);
|
||||
projection_out.writeSuffixAndFinalizePart(projection_data_part);
|
||||
|
@ -40,7 +40,6 @@ public:
|
||||
const StorageMetadataPtr & metadata_snapshot,
|
||||
const std::vector<MergeTreeIndexPtr> & indices_to_recalc,
|
||||
const CompressionCodecPtr & default_codec_,
|
||||
const SerializationInfoPtr & serialization_info,
|
||||
const MergeTreeWriterSettings & writer_settings,
|
||||
const MergeTreeIndexGranularity & computed_index_granularity) const override;
|
||||
|
||||
|
@ -61,14 +61,13 @@ IMergeTreeDataPart::MergeTreeWriterPtr MergeTreeDataPartWide::getWriter(
|
||||
const StorageMetadataPtr & metadata_snapshot,
|
||||
const std::vector<MergeTreeIndexPtr> & indices_to_recalc,
|
||||
const CompressionCodecPtr & default_codec_,
|
||||
const SerializationInfoPtr & serialization_info_,
|
||||
const MergeTreeWriterSettings & writer_settings,
|
||||
const MergeTreeIndexGranularity & computed_index_granularity) const
|
||||
{
|
||||
return std::make_unique<MergeTreeDataPartWriterWide>(
|
||||
shared_from_this(), columns_list, metadata_snapshot, indices_to_recalc,
|
||||
index_granularity_info.marks_file_extension,
|
||||
default_codec_, serialization_info_, writer_settings, computed_index_granularity);
|
||||
default_codec_, writer_settings, computed_index_granularity);
|
||||
}
|
||||
|
||||
|
||||
@ -81,8 +80,7 @@ ColumnSize MergeTreeDataPartWide::getColumnSizeImpl(
|
||||
if (checksums.empty())
|
||||
return size;
|
||||
|
||||
auto serialization = getSerializationForColumn(column);
|
||||
serialization->enumerateStreams([&](const ISerialization::SubstreamPath & substream_path)
|
||||
serializations.at(column.name)->enumerateStreams([&](const ISerialization::SubstreamPath & substream_path)
|
||||
{
|
||||
String file_name = ISerialization::getFileNameForStream(column, substream_path);
|
||||
|
||||
@ -159,8 +157,7 @@ void MergeTreeDataPartWide::checkConsistency(bool require_part_metadata) const
|
||||
{
|
||||
for (const NameAndTypePair & name_type : columns)
|
||||
{
|
||||
auto serialization = getSerializationForColumn(name_type);
|
||||
serialization->enumerateStreams([&](const ISerialization::SubstreamPath & substream_path)
|
||||
serializations.at(name_type.name)->enumerateStreams([&](const ISerialization::SubstreamPath & substream_path)
|
||||
{
|
||||
String file_name = ISerialization::getFileNameForStream(name_type, substream_path);
|
||||
String mrk_file_name = file_name + index_granularity_info.marks_file_extension;
|
||||
@ -174,7 +171,6 @@ void MergeTreeDataPartWide::checkConsistency(bool require_part_metadata) const
|
||||
});
|
||||
}
|
||||
}
|
||||
|
||||
}
|
||||
else
|
||||
{
|
||||
@ -182,8 +178,7 @@ void MergeTreeDataPartWide::checkConsistency(bool require_part_metadata) const
|
||||
std::optional<UInt64> marks_size;
|
||||
for (const NameAndTypePair & name_type : columns)
|
||||
{
|
||||
auto serialization = IDataType::getSerialization(name_type, *serialization_info);
|
||||
serialization->enumerateStreams([&](const ISerialization::SubstreamPath & substream_path)
|
||||
serializations.at(name_type.name)->enumerateStreams([&](const ISerialization::SubstreamPath & substream_path)
|
||||
{
|
||||
auto file_path = path + ISerialization::getFileNameForStream(name_type, substream_path) + index_granularity_info.marks_file_extension;
|
||||
|
||||
@ -218,7 +213,7 @@ bool MergeTreeDataPartWide::hasColumnFiles(const NameAndTypePair & column) const
|
||||
};
|
||||
|
||||
bool res = true;
|
||||
auto serialization = IDataType::getSerialization(column, *serialization_info);
|
||||
auto serialization = getSerialization(column.name);
|
||||
serialization->enumerateStreams([&](const ISerialization::SubstreamPath & substream_path)
|
||||
{
|
||||
String file_name = ISerialization::getFileNameForStream(column, substream_path);
|
||||
@ -232,8 +227,7 @@ bool MergeTreeDataPartWide::hasColumnFiles(const NameAndTypePair & column) const
|
||||
String MergeTreeDataPartWide::getFileNameForColumn(const NameAndTypePair & column) const
|
||||
{
|
||||
String filename;
|
||||
auto serialization = getSerializationForColumn(column);
|
||||
serialization->enumerateStreams([&](const ISerialization::SubstreamPath & substream_path)
|
||||
serializations.at(column.name)->enumerateStreams([&](const ISerialization::SubstreamPath & substream_path)
|
||||
{
|
||||
if (filename.empty())
|
||||
filename = ISerialization::getFileNameForStream(column, substream_path);
|
||||
@ -255,7 +249,7 @@ void MergeTreeDataPartWide::calculateEachColumnSizes(ColumnSizeByName & each_col
|
||||
if (rows_count != 0
|
||||
&& column.type->isValueRepresentedByNumber()
|
||||
&& !column.type->haveSubtypes()
|
||||
&& getSerializationForColumn(column)->getKind() == ISerialization::Kind::DEFAULT)
|
||||
&& serializations.at(column.name)->getKind() == ISerialization::Kind::DEFAULT)
|
||||
{
|
||||
size_t rows_in_column = size.data_uncompressed / column.type->getSizeOfValueInMemory();
|
||||
if (rows_in_column != rows_count)
|
||||
|
@ -45,7 +45,6 @@ public:
|
||||
const StorageMetadataPtr & metadata_snapshot,
|
||||
const std::vector<MergeTreeIndexPtr> & indices_to_recalc,
|
||||
const CompressionCodecPtr & default_codec_,
|
||||
const SerializationInfoPtr & serialization_info_,
|
||||
const MergeTreeWriterSettings & writer_settings,
|
||||
const MergeTreeIndexGranularity & computed_index_granularity) const override;
|
||||
|
||||
|
@ -16,12 +16,11 @@ MergeTreeDataPartWriterCompact::MergeTreeDataPartWriterCompact(
|
||||
const std::vector<MergeTreeIndexPtr> & indices_to_recalc_,
|
||||
const String & marks_file_extension_,
|
||||
const CompressionCodecPtr & default_codec_,
|
||||
const SerializationInfoPtr & serialization_info_,
|
||||
const MergeTreeWriterSettings & settings_,
|
||||
const MergeTreeIndexGranularity & index_granularity_)
|
||||
: MergeTreeDataPartWriterOnDisk(data_part_, columns_list_, metadata_snapshot_,
|
||||
indices_to_recalc_, marks_file_extension_,
|
||||
default_codec_, serialization_info_, settings_, index_granularity_)
|
||||
default_codec_, settings_, index_granularity_)
|
||||
, plain_file(data_part->volume->getDisk()->writeFile(
|
||||
part_path + MergeTreeDataPartCompact::DATA_FILE_NAME_WITH_EXTENSION,
|
||||
settings.max_compress_block_size,
|
||||
@ -34,12 +33,8 @@ MergeTreeDataPartWriterCompact::MergeTreeDataPartWriterCompact(
|
||||
, marks(*marks_file)
|
||||
{
|
||||
const auto & storage_columns = metadata_snapshot->getColumns();
|
||||
serializations.reserve(columns_list.size());
|
||||
for (const auto & column : columns_list)
|
||||
{
|
||||
serializations.emplace(column.name, column.type->getSerialization(column.name, *serialization_info));
|
||||
addStreams(column, storage_columns.getCodecDescOrDefault(column.name, default_codec));
|
||||
}
|
||||
}
|
||||
|
||||
void MergeTreeDataPartWriterCompact::addStreams(const NameAndTypePair & column, const ASTPtr & effective_codec_desc)
|
||||
@ -71,7 +66,7 @@ void MergeTreeDataPartWriterCompact::addStreams(const NameAndTypePair & column,
|
||||
};
|
||||
|
||||
ISerialization::SubstreamPath path;
|
||||
serializations[column.name]->enumerateStreams(path, callback, column.type, nullptr);
|
||||
serializations.at(column.name)->enumerateStreams(path, callback, column.type);
|
||||
}
|
||||
|
||||
namespace
|
||||
@ -212,7 +207,7 @@ void MergeTreeDataPartWriterCompact::writeDataBlock(const Block & block, const G
|
||||
writeIntBinary(UInt64(0), marks);
|
||||
|
||||
writeColumnSingleGranule(
|
||||
block.getByName(name_and_type->name), serializations[name_and_type->name],
|
||||
block.getByName(name_and_type->name), serializations.at(name_and_type->name),
|
||||
stream_getter, granule.start_row, granule.rows_to_write);
|
||||
|
||||
/// Each type always have at least one substream
|
||||
|
@ -15,7 +15,6 @@ public:
|
||||
const std::vector<MergeTreeIndexPtr> & indices_to_recalc,
|
||||
const String & marks_file_extension,
|
||||
const CompressionCodecPtr & default_codec,
|
||||
const SerializationInfoPtr & serialization_info_,
|
||||
const MergeTreeWriterSettings & settings,
|
||||
const MergeTreeIndexGranularity & index_granularity);
|
||||
|
||||
|
@ -68,7 +68,6 @@ MergeTreeDataPartWriterOnDisk::MergeTreeDataPartWriterOnDisk(
|
||||
const MergeTreeIndices & indices_to_recalc_,
|
||||
const String & marks_file_extension_,
|
||||
const CompressionCodecPtr & default_codec_,
|
||||
const SerializationInfoPtr & serialization_info_,
|
||||
const MergeTreeWriterSettings & settings_,
|
||||
const MergeTreeIndexGranularity & index_granularity_)
|
||||
: IMergeTreeDataPartWriter(data_part_,
|
||||
@ -76,8 +75,8 @@ MergeTreeDataPartWriterOnDisk::MergeTreeDataPartWriterOnDisk(
|
||||
, skip_indices(indices_to_recalc_)
|
||||
, part_path(data_part_->getFullRelativePath())
|
||||
, marks_file_extension(marks_file_extension_)
|
||||
, serializations(data_part_->getSerializations())
|
||||
, default_codec(default_codec_)
|
||||
, serialization_info(serialization_info_)
|
||||
, compute_granularity(index_granularity.empty())
|
||||
{
|
||||
if (settings.blocks_are_granules_size && !index_granularity.empty())
|
||||
|
@ -87,7 +87,6 @@ public:
|
||||
const std::vector<MergeTreeIndexPtr> & indices_to_recalc,
|
||||
const String & marks_file_extension,
|
||||
const CompressionCodecPtr & default_codec,
|
||||
const SerializationInfoPtr & serialization_info_,
|
||||
const MergeTreeWriterSettings & settings,
|
||||
const MergeTreeIndexGranularity & index_granularity);
|
||||
|
||||
@ -124,8 +123,8 @@ protected:
|
||||
|
||||
const String part_path;
|
||||
const String marks_file_extension;
|
||||
const SerializationByName & serializations;
|
||||
const CompressionCodecPtr default_codec;
|
||||
const SerializationInfoPtr serialization_info;
|
||||
|
||||
const bool compute_granularity;
|
||||
|
||||
@ -133,9 +132,6 @@ protected:
|
||||
MergeTreeIndexAggregators skip_indices_aggregators;
|
||||
std::vector<size_t> skip_index_accumulated_marks;
|
||||
|
||||
using SerializationsMap = std::unordered_map<String, SerializationPtr>;
|
||||
SerializationsMap serializations;
|
||||
|
||||
std::unique_ptr<WriteBufferFromFileBase> index_file_stream;
|
||||
std::unique_ptr<HashingWriteBuffer> index_stream;
|
||||
DataTypes index_types;
|
||||
|
@ -75,22 +75,17 @@ MergeTreeDataPartWriterWide::MergeTreeDataPartWriterWide(
|
||||
const std::vector<MergeTreeIndexPtr> & indices_to_recalc_,
|
||||
const String & marks_file_extension_,
|
||||
const CompressionCodecPtr & default_codec_,
|
||||
const SerializationInfoPtr & serialization_info_,
|
||||
const MergeTreeWriterSettings & settings_,
|
||||
const MergeTreeIndexGranularity & index_granularity_)
|
||||
: MergeTreeDataPartWriterOnDisk(data_part_, columns_list_, metadata_snapshot_,
|
||||
indices_to_recalc_, marks_file_extension_,
|
||||
default_codec_, serialization_info_, settings_, index_granularity_)
|
||||
default_codec_, settings_, index_granularity_)
|
||||
{
|
||||
const auto & columns = metadata_snapshot->getColumns();
|
||||
for (const auto & it : columns_list)
|
||||
{
|
||||
serializations.emplace(it.name, it.type->getSerialization(it.name, *serialization_info));
|
||||
addStreams(it, columns.getCodecDescOrDefault(it.name, default_codec));
|
||||
}
|
||||
}
|
||||
|
||||
|
||||
void MergeTreeDataPartWriterWide::addStreams(
|
||||
const NameAndTypePair & column,
|
||||
const ASTPtr & effective_codec_desc)
|
||||
@ -123,7 +118,7 @@ void MergeTreeDataPartWriterWide::addStreams(
|
||||
};
|
||||
|
||||
ISerialization::SubstreamPath path;
|
||||
serializations[column.name]->enumerateStreams(path, callback, column.type, nullptr);
|
||||
serializations.at(column.name)->enumerateStreams(path, callback, column.type);
|
||||
}
|
||||
|
||||
|
||||
@ -218,7 +213,7 @@ void MergeTreeDataPartWriterWide::write(const Block & block, const IColumn::Perm
|
||||
{
|
||||
auto & column = block_to_write.getByName(it->name);
|
||||
|
||||
if (serializations[column.name]->getKind() != ISerialization::Kind::SPARSE)
|
||||
if (serializations.at(column.name)->getKind() != ISerialization::Kind::SPARSE)
|
||||
column.column = recursiveRemoveSparse(column.column);
|
||||
|
||||
if (permutation)
|
||||
@ -280,7 +275,7 @@ StreamsWithMarks MergeTreeDataPartWriterWide::getCurrentMarksForColumn(
|
||||
ISerialization::SubstreamPath & path)
|
||||
{
|
||||
StreamsWithMarks result;
|
||||
serializations[column.name]->enumerateStreams([&] (const ISerialization::SubstreamPath & substream_path)
|
||||
serializations.at(column.name)->enumerateStreams([&] (const ISerialization::SubstreamPath & substream_path)
|
||||
{
|
||||
bool is_offsets = !substream_path.empty() && substream_path.back().type == ISerialization::Substream::ArraySizes;
|
||||
|
||||
@ -315,7 +310,7 @@ void MergeTreeDataPartWriterWide::writeSingleGranule(
|
||||
ISerialization::SerializeBinaryBulkSettings & serialize_settings,
|
||||
const Granule & granule)
|
||||
{
|
||||
const auto & serialization = serializations[name_and_type.name];
|
||||
const auto & serialization = serializations.at(name_and_type.name);
|
||||
serialization->serializeBinaryBulkWithMultipleStreams(column, granule.start_row, granule.rows_to_write, serialize_settings, serialization_state);
|
||||
|
||||
/// So that instead of the marks pointing to the end of the compressed block, there were marks pointing to the beginning of the next one.
|
||||
@ -350,7 +345,7 @@ void MergeTreeDataPartWriterWide::writeColumn(
|
||||
{
|
||||
ISerialization::SerializeBinaryBulkSettings serialize_settings;
|
||||
serialize_settings.getter = createStreamGetter(name_and_type, offset_columns);
|
||||
serializations[name]->serializeBinaryBulkStatePrefix(serialize_settings, it->second);
|
||||
serializations.at(name)->serializeBinaryBulkStatePrefix(serialize_settings, it->second);
|
||||
}
|
||||
|
||||
const auto & global_settings = storage.getContext()->getSettingsRef();
|
||||
@ -391,7 +386,7 @@ void MergeTreeDataPartWriterWide::writeColumn(
|
||||
}
|
||||
}
|
||||
|
||||
serializations[name]->enumerateStreams([&] (const ISerialization::SubstreamPath & substream_path)
|
||||
serializations.at(name)->enumerateStreams([&] (const ISerialization::SubstreamPath & substream_path)
|
||||
{
|
||||
bool is_offsets = !substream_path.empty() && substream_path.back().type == ISerialization::Substream::ArraySizes;
|
||||
if (is_offsets)
|
||||
@ -405,7 +400,7 @@ void MergeTreeDataPartWriterWide::writeColumn(
|
||||
|
||||
void MergeTreeDataPartWriterWide::validateColumnOfFixedSize(const String & name, const IDataType & type)
|
||||
{
|
||||
const auto & serialization = serializations[name];
|
||||
const auto & serialization = serializations.at(name);
|
||||
|
||||
if (!type.isValueRepresentedByNumber() || type.haveSubtypes() || serialization->getKind() != ISerialization::Kind::DEFAULT)
|
||||
throw Exception(ErrorCodes::LOGICAL_ERROR, "Cannot validate column of non fixed type {}", type.getName());
|
||||
@ -543,7 +538,7 @@ void MergeTreeDataPartWriterWide::finishDataSerialization(IMergeTreeDataPart::Ch
|
||||
if (!serialization_states.empty())
|
||||
{
|
||||
serialize_settings.getter = createStreamGetter(*it, written_offset_columns ? *written_offset_columns : offset_columns);
|
||||
serializations[it->name]->serializeBinaryBulkStateSuffix(serialize_settings, serialization_states[it->name]);
|
||||
serializations.at(it->name)->serializeBinaryBulkStateSuffix(serialize_settings, serialization_states[it->name]);
|
||||
}
|
||||
|
||||
if (write_final_mark)
|
||||
@ -568,7 +563,7 @@ void MergeTreeDataPartWriterWide::finishDataSerialization(IMergeTreeDataPart::Ch
|
||||
{
|
||||
if (column.type->isValueRepresentedByNumber()
|
||||
&& !column.type->haveSubtypes()
|
||||
&& serializations[column.name]->getKind() == ISerialization::Kind::DEFAULT)
|
||||
&& serializations.at(column.name)->getKind() == ISerialization::Kind::DEFAULT)
|
||||
{
|
||||
validateColumnOfFixedSize(column.name, *column.type);
|
||||
}
|
||||
@ -596,7 +591,7 @@ void MergeTreeDataPartWriterWide::writeFinalMark(
|
||||
{
|
||||
writeSingleMark(column, offset_columns, 0, path);
|
||||
/// Memoize information about offsets
|
||||
serializations[column.name]->enumerateStreams([&] (const ISerialization::SubstreamPath & substream_path)
|
||||
serializations.at(column.name)->enumerateStreams([&] (const ISerialization::SubstreamPath & substream_path)
|
||||
{
|
||||
bool is_offsets = !substream_path.empty() && substream_path.back().type == ISerialization::Substream::ArraySizes;
|
||||
if (is_offsets)
|
||||
|
@ -24,7 +24,6 @@ public:
|
||||
const std::vector<MergeTreeIndexPtr> & indices_to_recalc,
|
||||
const String & marks_file_extension,
|
||||
const CompressionCodecPtr & default_codec,
|
||||
const SerializationInfoPtr & serialization_info_,
|
||||
const MergeTreeWriterSettings & settings,
|
||||
const MergeTreeIndexGranularity & index_granularity);
|
||||
|
||||
|
@ -359,7 +359,13 @@ MergeTreeData::MutableDataPartPtr MergeTreeDataWriter::writeTempPart(
|
||||
if (data.storage_settings.get()->assign_part_uuids)
|
||||
new_data_part->uuid = UUIDHelpers::generateV4();
|
||||
|
||||
new_data_part->setColumns(columns);
|
||||
const auto & data_settings = data.getSettings();
|
||||
|
||||
SerializationInfo::Settings settings{data_settings->ratio_of_defaults_for_sparse_serialization, true};
|
||||
SerializationInfoByName infos(columns, settings);
|
||||
infos.add(block);
|
||||
|
||||
new_data_part->setColumns(columns, infos);
|
||||
new_data_part->rows_count = block.rows();
|
||||
new_data_part->partition = std::move(partition);
|
||||
new_data_part->minmax_idx = std::move(minmax_idx);
|
||||
@ -406,15 +412,9 @@ MergeTreeData::MutableDataPartPtr MergeTreeDataWriter::writeTempPart(
|
||||
/// either default lz4 or compression method with zero thresholds on absolute and relative part size.
|
||||
auto compression_codec = data.getContext()->chooseCompressionCodec(0, 0);
|
||||
|
||||
const auto & data_settings = data.getSettings();
|
||||
|
||||
SerializationInfoBuilder serialization_info(data_settings->ratio_of_defaults_for_sparse_serialization);
|
||||
serialization_info.add(block);
|
||||
|
||||
const auto & index_factory = MergeTreeIndexFactory::instance();
|
||||
MergedBlockOutputStream out(new_data_part, metadata_snapshot,columns,
|
||||
index_factory.getMany(metadata_snapshot->getSecondaryIndices()),
|
||||
compression_codec, std::move(serialization_info).build());
|
||||
index_factory.getMany(metadata_snapshot->getSecondaryIndices()), compression_codec);
|
||||
|
||||
bool sync_on_insert = data_settings->fsync_after_insert;
|
||||
|
||||
@ -458,7 +458,11 @@ MergeTreeData::MutableDataPartPtr MergeTreeDataWriter::writeProjectionPartImpl(
|
||||
new_data_part->is_temp = is_temp;
|
||||
|
||||
NamesAndTypesList columns = metadata_snapshot->getColumns().getAllPhysical().filter(block.getNames());
|
||||
new_data_part->setColumns(columns);
|
||||
SerializationInfo::Settings settings{data.getSettings()->ratio_of_defaults_for_sparse_serialization, true};
|
||||
SerializationInfoByName infos(columns, settings);
|
||||
infos.add(block);
|
||||
|
||||
new_data_part->setColumns(columns, infos);
|
||||
|
||||
if (new_data_part->isStoredOnDisk())
|
||||
{
|
||||
@ -506,16 +510,12 @@ MergeTreeData::MutableDataPartPtr MergeTreeDataWriter::writeProjectionPartImpl(
|
||||
/// either default lz4 or compression method with zero thresholds on absolute and relative part size.
|
||||
auto compression_codec = data.getContext()->chooseCompressionCodec(0, 0);
|
||||
|
||||
SerializationInfoBuilder serialization_info(data.getSettings()->ratio_of_defaults_for_sparse_serialization);
|
||||
serialization_info.add(block);
|
||||
|
||||
MergedBlockOutputStream out(
|
||||
new_data_part,
|
||||
metadata_snapshot,
|
||||
columns,
|
||||
{},
|
||||
compression_codec,
|
||||
std::move(serialization_info).build());
|
||||
compression_codec);
|
||||
|
||||
out.writeWithPermutation(block, perm_ptr);
|
||||
out.writeSuffixAndFinalizePart(new_data_part);
|
||||
|
@ -64,16 +64,6 @@ MergeTreeReaderCompact::MergeTreeReaderCompact(
|
||||
}
|
||||
|
||||
column_positions[i] = std::move(position);
|
||||
|
||||
if (column_from_part.isSubcolumn())
|
||||
{
|
||||
auto name_in_storage = column_from_part.getNameInStorage();
|
||||
/// We have to read whole column and extract subcolumn.
|
||||
serializations.emplace(name_in_storage, data_part->getSerializationForColumn(
|
||||
{name_in_storage, column_from_part.getTypeInStorage()}));
|
||||
}
|
||||
|
||||
serializations.emplace(column_from_part.name, data_part->getSerializationForColumn(column_from_part));
|
||||
}
|
||||
|
||||
/// Do not use max_read_buffer_size, but try to lower buffer size with maximal size of granule to avoid reading much data.
|
||||
|
@ -109,7 +109,7 @@ size_t MergeTreeReaderWide::readRows(size_t from_mark, bool continue_reading, si
|
||||
/// The column is already present in the block so we will append the values to the end.
|
||||
bool append = res_columns[pos] != nullptr;
|
||||
if (!append)
|
||||
res_columns[pos] = type->createColumn(*serializations[name]);
|
||||
res_columns[pos] = type->createColumn(*serializations.at(name));
|
||||
|
||||
auto & column = res_columns[pos];
|
||||
try
|
||||
@ -188,9 +188,7 @@ void MergeTreeReaderWide::addStreams(const NameAndTypePair & name_and_type,
|
||||
profile_callback, clock_type));
|
||||
};
|
||||
|
||||
auto serialization = data_part->getSerializationForColumn(name_and_type);
|
||||
serialization->enumerateStreams(callback);
|
||||
serializations.emplace(name_and_type.name, std::move(serialization));
|
||||
serializations.at(name_and_type.name)->enumerateStreams(callback);
|
||||
}
|
||||
|
||||
|
||||
@ -231,7 +229,7 @@ void MergeTreeReaderWide::prefetch(
|
||||
std::unordered_set<std::string> & prefetched_streams)
|
||||
{
|
||||
const auto & name = name_and_type.name;
|
||||
auto & serialization = serializations[name];
|
||||
auto & serialization = serializations.at(name);
|
||||
|
||||
serialization->enumerateStreams([&](const ISerialization::SubstreamPath & substream_path)
|
||||
{
|
||||
@ -259,7 +257,7 @@ void MergeTreeReaderWide::readData(
|
||||
deserialize_settings.avg_value_size_hint = avg_value_size_hint;
|
||||
|
||||
const auto & name = name_and_type.name;
|
||||
auto & serialization = serializations[name];
|
||||
auto & serialization = serializations.at(name);
|
||||
|
||||
if (deserialize_binary_bulk_state_map.count(name) == 0)
|
||||
{
|
||||
|
@ -195,8 +195,7 @@ MergeTreeData::MutableDataPartsVector MergeTreeWriteAheadLog::restore(const Stor
|
||||
metadata_snapshot,
|
||||
block.getNamesAndTypesList(),
|
||||
{},
|
||||
CompressionCodecFactory::instance().get("NONE", {}),
|
||||
part->serialization_info);
|
||||
CompressionCodecFactory::instance().get("NONE", {}));
|
||||
|
||||
part->minmax_idx->update(block, storage.getMinMaxColumnsNames(metadata_snapshot->getPartitionKey()));
|
||||
part->partition.create(metadata_snapshot, block, 0, context);
|
||||
|
@ -18,9 +18,9 @@ MergedBlockOutputStream::MergedBlockOutputStream(
|
||||
const NamesAndTypesList & columns_list_,
|
||||
const MergeTreeIndices & skip_indices,
|
||||
CompressionCodecPtr default_codec_,
|
||||
const SerializationInfoPtr & input_serialization_info_,
|
||||
bool reset_columns_,
|
||||
bool blocks_are_granules_size)
|
||||
: IMergedBlockOutputStream(data_part, metadata_snapshot_, input_serialization_info_)
|
||||
: IMergedBlockOutputStream(data_part, metadata_snapshot_, columns_list_, reset_columns_)
|
||||
, columns_list(columns_list_)
|
||||
, default_codec(default_codec_)
|
||||
{
|
||||
@ -34,7 +34,7 @@ MergedBlockOutputStream::MergedBlockOutputStream(
|
||||
if (!part_path.empty())
|
||||
volume->getDisk()->createDirectories(part_path);
|
||||
|
||||
writer = data_part->getWriter(columns_list, metadata_snapshot, skip_indices, default_codec, input_serialization_info, writer_settings);
|
||||
writer = data_part->getWriter(columns_list, metadata_snapshot, skip_indices, default_codec, writer_settings);
|
||||
}
|
||||
|
||||
/// If data is pre-sorted.
|
||||
@ -78,12 +78,12 @@ void MergedBlockOutputStream::writeSuffixAndFinalizePart(
|
||||
else
|
||||
part_columns = *total_columns_list;
|
||||
|
||||
new_part->serialization_info = std::move(new_serialization_info).buildFrom(*input_serialization_info);
|
||||
if (reset_columns)
|
||||
new_part->setColumns(part_columns, new_serialization_infos);
|
||||
|
||||
if (new_part->isStoredOnDisk())
|
||||
finalizePartOnDisk(new_part, part_columns, checksums, sync);
|
||||
|
||||
new_part->setColumns(part_columns);
|
||||
new_part->rows_count = rows_count;
|
||||
new_part->modification_time = time(nullptr);
|
||||
new_part->index = writer->releaseIndexColumns();
|
||||
@ -168,11 +168,12 @@ void MergedBlockOutputStream::finalizePartOnDisk(
|
||||
|
||||
removeEmptyColumnsFromPart(new_part, part_columns, checksums);
|
||||
|
||||
if (new_part->serialization_info->getNumberOfRows() > 0)
|
||||
const auto & serialization_infos = new_part->getSerializationInfos();
|
||||
if (!serialization_infos.empty())
|
||||
{
|
||||
auto out = volume->getDisk()->writeFile(part_path + IMergeTreeDataPart::SERIALIZATION_FILE_NAME, 4096);
|
||||
HashingWriteBuffer out_hashing(*out);
|
||||
new_part->serialization_info->writeText(out_hashing);
|
||||
serialization_infos.writeText(out_hashing);
|
||||
checksums.files[IMergeTreeDataPart::SERIALIZATION_FILE_NAME].file_size = out_hashing.count();
|
||||
checksums.files[IMergeTreeDataPart::SERIALIZATION_FILE_NAME].file_hash = out_hashing.getHash();
|
||||
out->finalize();
|
||||
@ -219,7 +220,8 @@ void MergedBlockOutputStream::writeImpl(const Block & block, const IColumn::Perm
|
||||
return;
|
||||
|
||||
writer->write(block, permutation);
|
||||
new_serialization_info.add(block);
|
||||
if (reset_columns)
|
||||
new_serialization_infos.add(block);
|
||||
|
||||
rows_count += rows;
|
||||
}
|
||||
|
@ -19,7 +19,7 @@ public:
|
||||
const NamesAndTypesList & columns_list_,
|
||||
const MergeTreeIndices & skip_indices,
|
||||
CompressionCodecPtr default_codec_,
|
||||
const SerializationInfoPtr & input_serialization_info_,
|
||||
bool reset_columns_ = false,
|
||||
bool blocks_are_granules_size = false);
|
||||
|
||||
Block getHeader() const { return metadata_snapshot->getSampleBlock(); }
|
||||
|
@ -14,12 +14,11 @@ MergedColumnOnlyOutputStream::MergedColumnOnlyOutputStream(
|
||||
const StorageMetadataPtr & metadata_snapshot_,
|
||||
const Block & header_,
|
||||
CompressionCodecPtr default_codec,
|
||||
const SerializationInfoPtr & input_serialization_info_,
|
||||
const MergeTreeIndices & indices_to_recalc,
|
||||
WrittenOffsetColumns * offset_columns_,
|
||||
const MergeTreeIndexGranularity & index_granularity,
|
||||
const MergeTreeIndexGranularityInfo * index_granularity_info)
|
||||
: IMergedBlockOutputStream(data_part, metadata_snapshot_, input_serialization_info_)
|
||||
: IMergedBlockOutputStream(data_part, metadata_snapshot_, header_.getNamesAndTypesList(), /*reset_columns=*/ true)
|
||||
, header(header_)
|
||||
{
|
||||
const auto & global_settings = data_part->storage.getContext()->getSettings();
|
||||
@ -36,7 +35,6 @@ MergedColumnOnlyOutputStream::MergedColumnOnlyOutputStream(
|
||||
metadata_snapshot_,
|
||||
indices_to_recalc,
|
||||
default_codec,
|
||||
input_serialization_info,
|
||||
std::move(writer_settings),
|
||||
index_granularity);
|
||||
|
||||
@ -53,7 +51,7 @@ void MergedColumnOnlyOutputStream::write(const Block & block)
|
||||
return;
|
||||
|
||||
writer->write(block, nullptr);
|
||||
new_serialization_info.add(block);
|
||||
new_serialization_infos.add(block);
|
||||
}
|
||||
|
||||
MergeTreeData::DataPart::Checksums
|
||||
@ -79,8 +77,7 @@ MergedColumnOnlyOutputStream::writeSuffixAndGetChecksums(
|
||||
if (all_checksums.files.count(removed_file))
|
||||
all_checksums.files.erase(removed_file);
|
||||
|
||||
new_part->setColumns(columns);
|
||||
new_part->serialization_info = std::move(new_serialization_info).buildFrom(*input_serialization_info);
|
||||
new_part->setColumns(columns, new_serialization_infos);
|
||||
return checksums;
|
||||
}
|
||||
|
||||
|
@ -18,7 +18,6 @@ public:
|
||||
const StorageMetadataPtr & metadata_snapshot_,
|
||||
const Block & header_,
|
||||
CompressionCodecPtr default_codec_,
|
||||
const SerializationInfoPtr & input_serialization_info_,
|
||||
const MergeTreeIndices & indices_to_recalc_,
|
||||
WrittenOffsetColumns * offset_columns_ = nullptr,
|
||||
const MergeTreeIndexGranularity & index_granularity = {},
|
||||
|
@ -313,8 +313,7 @@ NameSet collectFilesToSkip(
|
||||
files_to_skip.insert(stream_name + mrk_extension);
|
||||
};
|
||||
|
||||
auto serialization = source_part->getSerializationForColumn({entry.name, entry.type});
|
||||
serialization->enumerateStreams(callback);
|
||||
source_part->getSerialization(entry.name)->enumerateStreams(callback);
|
||||
}
|
||||
for (const auto & index : indices_to_recalc)
|
||||
{
|
||||
@ -339,8 +338,7 @@ static NameToNameVector collectFilesForRenames(
|
||||
std::map<String, size_t> stream_counts;
|
||||
for (const auto & column : source_part->getColumns())
|
||||
{
|
||||
auto serialization = source_part->getSerializationForColumn(column);
|
||||
serialization->enumerateStreams(
|
||||
source_part->getSerialization(column.name)->enumerateStreams(
|
||||
[&](const ISerialization::SubstreamPath & substream_path)
|
||||
{
|
||||
++stream_counts[ISerialization::getFileNameForStream(column, substream_path)];
|
||||
@ -384,10 +382,7 @@ static NameToNameVector collectFilesForRenames(
|
||||
|
||||
auto column = source_part->getColumns().tryGetByName(command.column_name);
|
||||
if (column)
|
||||
{
|
||||
auto serialization = source_part->getSerializationForColumn(*column);
|
||||
serialization->enumerateStreams(callback);
|
||||
}
|
||||
source_part->getSerialization(column->name)->enumerateStreams(callback);
|
||||
}
|
||||
else if (command.type == MutationCommand::Type::RENAME_COLUMN)
|
||||
{
|
||||
@ -409,10 +404,7 @@ static NameToNameVector collectFilesForRenames(
|
||||
|
||||
auto column = source_part->getColumns().tryGetByName(command.column_name);
|
||||
if (column)
|
||||
{
|
||||
auto serialization = source_part->getSerializationForColumn(*column);
|
||||
serialization->enumerateStreams(callback);
|
||||
}
|
||||
source_part->getSerialization(column->name)->enumerateStreams(callback);
|
||||
}
|
||||
}
|
||||
|
||||
@ -968,8 +960,7 @@ private:
|
||||
ctx->metadata_snapshot,
|
||||
ctx->new_data_part->getColumns(),
|
||||
skip_part_indices,
|
||||
ctx->compression_codec,
|
||||
ctx->new_data_part->serialization_info);
|
||||
ctx->compression_codec);
|
||||
|
||||
ctx->mutating_pipeline = QueryPipelineBuilder::getPipeline(std::move(builder));
|
||||
ctx->mutating_executor = std::make_unique<PullingPipelineExecutor>(ctx->mutating_pipeline);
|
||||
@ -1118,7 +1109,6 @@ private:
|
||||
ctx->metadata_snapshot,
|
||||
ctx->updated_header,
|
||||
ctx->compression_codec,
|
||||
ctx->source_part->serialization_info,
|
||||
std::vector<MergeTreeIndexPtr>(ctx->indices_to_recalc.begin(), ctx->indices_to_recalc.end()),
|
||||
nullptr,
|
||||
ctx->source_part->index_granularity,
|
||||
@ -1298,11 +1288,13 @@ bool MutateTask::prepare()
|
||||
ctx->new_data_part->uuid = ctx->future_part->uuid;
|
||||
ctx->new_data_part->is_temp = true;
|
||||
ctx->new_data_part->ttl_infos = ctx->source_part->ttl_infos;
|
||||
ctx->new_data_part->serialization_info = ctx->source_part->serialization_info;
|
||||
|
||||
/// It shouldn't be changed by mutation.
|
||||
ctx->new_data_part->index_granularity_info = ctx->source_part->index_granularity_info;
|
||||
ctx->new_data_part->setColumns(MergeTreeDataMergerMutator::getColumnsForNewDataPart(ctx->source_part, ctx->updated_header, ctx->storage_columns, ctx->for_file_renames));
|
||||
ctx->new_data_part->setColumns(
|
||||
MergeTreeDataMergerMutator::getColumnsForNewDataPart(ctx->source_part, ctx->updated_header, ctx->storage_columns, ctx->for_file_renames),
|
||||
ctx->source_part->getSerializationInfos());
|
||||
|
||||
ctx->new_data_part->partition.assign(ctx->source_part->partition);
|
||||
|
||||
ctx->disk = ctx->new_data_part->volume->getDisk();
|
||||
|
@ -98,13 +98,13 @@ IMergeTreeDataPart::Checksums checkDataPart(
|
||||
};
|
||||
};
|
||||
|
||||
auto serialization_info = std::make_shared<SerializationInfo>();
|
||||
SerializationInfoByName serialization_infos(columns_txt, {});
|
||||
auto serialization_path = path + IMergeTreeDataPart::SERIALIZATION_FILE_NAME;
|
||||
|
||||
if (disk->exists(serialization_path))
|
||||
{
|
||||
auto serialization_file = disk->readFile(serialization_path);
|
||||
serialization_info->readText(*serialization_file);
|
||||
serialization_infos.readText(*serialization_file);
|
||||
}
|
||||
|
||||
/// This function calculates only checksum of file content (compressed or uncompressed).
|
||||
@ -141,7 +141,7 @@ IMergeTreeDataPart::Checksums checkDataPart(
|
||||
const NamesAndTypesList & projection_columns_list = projection->getColumns();
|
||||
for (const auto & projection_column : projection_columns_list)
|
||||
{
|
||||
auto serialization = IDataType::getSerialization(projection_column, *serialization_info);
|
||||
auto serialization = projection_column.type->getSerialization(*serialization_infos.at(projection_column.name));
|
||||
serialization->enumerateStreams(
|
||||
[&](const ISerialization::SubstreamPath & substream_path)
|
||||
{
|
||||
@ -214,7 +214,7 @@ IMergeTreeDataPart::Checksums checkDataPart(
|
||||
{
|
||||
for (const auto & column : columns_list)
|
||||
{
|
||||
auto serialization = IDataType::getSerialization(column, *serialization_info);
|
||||
auto serialization = column.type->getSerialization(*serialization_infos.at(column .name));
|
||||
serialization->enumerateStreams([&](const ISerialization::SubstreamPath & substream_path)
|
||||
{
|
||||
String file_name = ISerialization::getFileNameForStream(column, substream_path) + ".bin";
|
||||
|
@ -166,7 +166,7 @@ void LogSource::readData(const NameAndTypePair & name_and_type, ColumnPtr & colu
|
||||
{
|
||||
ISerialization::DeserializeBinaryBulkSettings settings; /// TODO Use avg_value_size_hint.
|
||||
const auto & [name, type] = name_and_type;
|
||||
auto serialization = IDataType::getSerialization(name_and_type, {});
|
||||
auto serialization = type->getDefaultSerialization();
|
||||
|
||||
auto create_stream_getter = [&](bool stream_for_prefix)
|
||||
{
|
||||
|
@ -7109,10 +7109,6 @@ bool StorageReplicatedMergeTree::createEmptyPartInsteadOfLost(zkutil::ZooKeeperP
|
||||
new_data_part->minmax_idx = std::move(minmax_idx);
|
||||
new_data_part->is_temp = true;
|
||||
|
||||
/// Create empty serialization_info.
|
||||
auto ratio = getSettings()->ratio_of_defaults_for_sparse_serialization;
|
||||
new_data_part->serialization_info = SerializationInfoBuilder(ratio).build();
|
||||
|
||||
SyncGuardPtr sync_guard;
|
||||
if (new_data_part->isStoredOnDisk())
|
||||
{
|
||||
@ -7138,8 +7134,7 @@ bool StorageReplicatedMergeTree::createEmptyPartInsteadOfLost(zkutil::ZooKeeperP
|
||||
|
||||
const auto & index_factory = MergeTreeIndexFactory::instance();
|
||||
MergedBlockOutputStream out(new_data_part, metadata_snapshot, columns,
|
||||
index_factory.getMany(metadata_snapshot->getSecondaryIndices()),
|
||||
compression_codec, new_data_part->serialization_info);
|
||||
index_factory.getMany(metadata_snapshot->getSecondaryIndices()), compression_codec);
|
||||
|
||||
bool sync_on_insert = settings->fsync_after_insert;
|
||||
|
||||
|
@ -168,7 +168,7 @@ void TinyLogSource::readData(const NameAndTypePair & name_and_type,
|
||||
{
|
||||
ISerialization::DeserializeBinaryBulkSettings settings; /// TODO Use avg_value_size_hint.
|
||||
const auto & [name, type] = name_and_type;
|
||||
auto serialization = IDataType::getSerialization(name_and_type, {});
|
||||
auto serialization = name_and_type.type->getDefaultSerialization();
|
||||
|
||||
settings.getter = [&] (const ISerialization::SubstreamPath & path) -> ReadBuffer *
|
||||
{
|
||||
|
@ -222,7 +222,7 @@ void StorageSystemPartsColumns::processNextStorage(
|
||||
if (columns_mask[src_index++])
|
||||
columns[res_index++]->insert(column_size.marks);
|
||||
|
||||
auto serialization = part->getSerializationForColumn(column);
|
||||
auto serialization = part->getSerialization(column.name);
|
||||
if (columns_mask[src_index++])
|
||||
columns[res_index++]->insert(ISerialization::kindToString(serialization->getKind()));
|
||||
|
||||
@ -235,7 +235,7 @@ void StorageSystemPartsColumns::processNextStorage(
|
||||
subcolumn_names.push_back(name);
|
||||
subcolumn_types.push_back(data.type->getName());
|
||||
subcolumn_sers.push_back(ISerialization::kindToString(data.serialization->getKind()));
|
||||
}, serialization, column.type, nullptr);
|
||||
}, { serialization, column.type, nullptr, nullptr });
|
||||
|
||||
if (columns_mask[src_index++])
|
||||
columns[res_index++]->insert(subcolumn_names);
|
||||
|
@ -57,3 +57,10 @@ a
|
||||
aaaaaa
|
||||
a
|
||||
aaaaaa
|
||||
id [] [] []
|
||||
t ['a','b','b.u','b.s'] ['UInt64','Tuple(u UInt32, s String)','UInt32','String'] ['Sparse','Default','Sparse','Default']
|
||||
aaaaaa
|
||||
a
|
||||
aaaaaa
|
||||
a
|
||||
aaaaaa
|
||||
|
@ -40,4 +40,14 @@ SELECT t.a FROM sparse_tuple WHERE t.b.u != 0 ORDER BY id LIMIT 5;
|
||||
SELECT t.b.s FROM sparse_tuple ORDER BY id LIMIT 5;
|
||||
SELECT t.b.s FROM sparse_tuple WHERE t.b.u != 0 ORDER BY id LIMIT 5;
|
||||
|
||||
DETACH TABLE sparse_tuple;
|
||||
ATTACH TABLE sparse_tuple;
|
||||
|
||||
SELECT column, subcolumns.names, subcolumns.types, subcolumns.serializations
|
||||
FROM system.parts_columns
|
||||
WHERE table = 'sparse_tuple' AND database = currentDatabase()
|
||||
ORDER BY column;
|
||||
|
||||
SELECT t.b.s FROM sparse_tuple WHERE t.b.u != 0 ORDER BY id LIMIT 5;
|
||||
|
||||
DROP TABLE IF EXISTS sparse_tuple;
|
||||
|
Loading…
Reference in New Issue
Block a user