mirror of
https://github.com/ClickHouse/ClickHouse.git
synced 2024-11-26 01:22:04 +00:00
Merge pull request #33849 from CurtizJ/fix-sparse-columns
Fix sparse columns
This commit is contained in:
commit
43f2e2c178
@ -158,6 +158,19 @@ void SerializationInfoByName::add(const SerializationInfoByName & other)
|
|||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
|
void SerializationInfoByName::replaceData(const SerializationInfoByName & other)
|
||||||
|
{
|
||||||
|
for (const auto & [name, new_info] : other)
|
||||||
|
{
|
||||||
|
auto & old_info = (*this)[name];
|
||||||
|
|
||||||
|
if (old_info)
|
||||||
|
old_info->replaceData(*new_info);
|
||||||
|
else
|
||||||
|
old_info = new_info->clone();
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
void SerializationInfoByName::writeJSON(WriteBuffer & out) const
|
void SerializationInfoByName::writeJSON(WriteBuffer & out) const
|
||||||
{
|
{
|
||||||
Poco::JSON::Object object;
|
Poco::JSON::Object object;
|
||||||
|
@ -89,6 +89,11 @@ public:
|
|||||||
void add(const Block & block);
|
void add(const Block & block);
|
||||||
void add(const SerializationInfoByName & other);
|
void add(const SerializationInfoByName & other);
|
||||||
|
|
||||||
|
/// Takes data from @other, but keeps current serialization kinds.
|
||||||
|
/// If column exists in @other infos, but not in current infos,
|
||||||
|
/// it's cloned to current infos.
|
||||||
|
void replaceData(const SerializationInfoByName & other);
|
||||||
|
|
||||||
void writeJSON(WriteBuffer & out) const;
|
void writeJSON(WriteBuffer & out) const;
|
||||||
void readJSON(ReadBuffer & in);
|
void readJSON(ReadBuffer & in);
|
||||||
};
|
};
|
||||||
|
@ -416,7 +416,7 @@ std::pair<time_t, time_t> IMergeTreeDataPart::getMinMaxTime() const
|
|||||||
}
|
}
|
||||||
|
|
||||||
|
|
||||||
void IMergeTreeDataPart::setColumns(const NamesAndTypesList & new_columns, const SerializationInfoByName & new_infos)
|
void IMergeTreeDataPart::setColumns(const NamesAndTypesList & new_columns)
|
||||||
{
|
{
|
||||||
columns = new_columns;
|
columns = new_columns;
|
||||||
|
|
||||||
@ -425,21 +425,12 @@ void IMergeTreeDataPart::setColumns(const NamesAndTypesList & new_columns, const
|
|||||||
size_t pos = 0;
|
size_t pos = 0;
|
||||||
|
|
||||||
for (const auto & column : columns)
|
for (const auto & column : columns)
|
||||||
{
|
|
||||||
column_name_to_position.emplace(column.name, pos++);
|
column_name_to_position.emplace(column.name, pos++);
|
||||||
|
}
|
||||||
|
|
||||||
auto it = new_infos.find(column.name);
|
void IMergeTreeDataPart::setSerializationInfos(const SerializationInfoByName & new_infos)
|
||||||
if (it != new_infos.end())
|
{
|
||||||
{
|
serialization_infos = new_infos;
|
||||||
auto & old_info = serialization_infos[column.name];
|
|
||||||
const auto & new_info = it->second;
|
|
||||||
|
|
||||||
if (old_info)
|
|
||||||
old_info->replaceData(*new_info);
|
|
||||||
else
|
|
||||||
old_info = new_info->clone();
|
|
||||||
}
|
|
||||||
}
|
|
||||||
}
|
}
|
||||||
|
|
||||||
SerializationPtr IMergeTreeDataPart::getSerialization(const NameAndTypePair & column) const
|
SerializationPtr IMergeTreeDataPart::getSerialization(const NameAndTypePair & column) const
|
||||||
@ -1098,7 +1089,8 @@ void IMergeTreeDataPart::loadColumns(bool require)
|
|||||||
if (volume->getDisk()->exists(path))
|
if (volume->getDisk()->exists(path))
|
||||||
infos.readJSON(*volume->getDisk()->readFile(path));
|
infos.readJSON(*volume->getDisk()->readFile(path));
|
||||||
|
|
||||||
setColumns(loaded_columns, infos);
|
setColumns(loaded_columns);
|
||||||
|
setSerializationInfos(infos);
|
||||||
}
|
}
|
||||||
|
|
||||||
bool IMergeTreeDataPart::shallParticipateInMerges(const StoragePolicyPtr & storage_policy) const
|
bool IMergeTreeDataPart::shallParticipateInMerges(const StoragePolicyPtr & storage_policy) const
|
||||||
|
@ -128,11 +128,14 @@ public:
|
|||||||
|
|
||||||
String getTypeName() const { return getType().toString(); }
|
String getTypeName() const { return getType().toString(); }
|
||||||
|
|
||||||
void setColumns(const NamesAndTypesList & new_columns, const SerializationInfoByName & new_infos = {});
|
void setColumns(const NamesAndTypesList & new_columns);
|
||||||
|
|
||||||
const NamesAndTypesList & getColumns() const { return columns; }
|
const NamesAndTypesList & getColumns() const { return columns; }
|
||||||
|
|
||||||
|
void setSerializationInfos(const SerializationInfoByName & new_infos);
|
||||||
|
|
||||||
const SerializationInfoByName & getSerializationInfos() const { return serialization_infos; }
|
const SerializationInfoByName & getSerializationInfos() const { return serialization_infos; }
|
||||||
SerializationInfoByName & getSerializationInfos() { return serialization_infos; }
|
|
||||||
SerializationPtr getSerialization(const NameAndTypePair & column) const;
|
SerializationPtr getSerialization(const NameAndTypePair & column) const;
|
||||||
|
|
||||||
/// Throws an exception if part is not stored in on-disk format.
|
/// Throws an exception if part is not stored in on-disk format.
|
||||||
|
@ -186,7 +186,8 @@ bool MergeTask::ExecuteAndFinalizeHorizontalPart::prepare()
|
|||||||
infos.add(part->getSerializationInfos());
|
infos.add(part->getSerializationInfos());
|
||||||
}
|
}
|
||||||
|
|
||||||
global_ctx->new_data_part->setColumns(global_ctx->storage_columns, infos);
|
global_ctx->new_data_part->setColumns(global_ctx->storage_columns);
|
||||||
|
global_ctx->new_data_part->setSerializationInfos(infos);
|
||||||
|
|
||||||
const auto & local_part_min_ttl = global_ctx->new_data_part->ttl_infos.part_min_ttl;
|
const auto & local_part_min_ttl = global_ctx->new_data_part->ttl_infos.part_min_ttl;
|
||||||
if (local_part_min_ttl && local_part_min_ttl <= global_ctx->time_of_merge)
|
if (local_part_min_ttl && local_part_min_ttl <= global_ctx->time_of_merge)
|
||||||
|
@ -370,7 +370,8 @@ MergeTreeData::MutableDataPartPtr MergeTreeDataWriter::writeTempPart(
|
|||||||
SerializationInfoByName infos(columns, settings);
|
SerializationInfoByName infos(columns, settings);
|
||||||
infos.add(block);
|
infos.add(block);
|
||||||
|
|
||||||
new_data_part->setColumns(columns, infos);
|
new_data_part->setColumns(columns);
|
||||||
|
new_data_part->setSerializationInfos(infos);
|
||||||
new_data_part->rows_count = block.rows();
|
new_data_part->rows_count = block.rows();
|
||||||
new_data_part->partition = std::move(partition);
|
new_data_part->partition = std::move(partition);
|
||||||
new_data_part->minmax_idx = std::move(minmax_idx);
|
new_data_part->minmax_idx = std::move(minmax_idx);
|
||||||
@ -468,7 +469,8 @@ MergeTreeData::MutableDataPartPtr MergeTreeDataWriter::writeProjectionPartImpl(
|
|||||||
SerializationInfoByName infos(columns, settings);
|
SerializationInfoByName infos(columns, settings);
|
||||||
infos.add(block);
|
infos.add(block);
|
||||||
|
|
||||||
new_data_part->setColumns(columns, infos);
|
new_data_part->setColumns(columns);
|
||||||
|
new_data_part->setSerializationInfos(infos);
|
||||||
|
|
||||||
if (new_data_part->isStoredOnDisk())
|
if (new_data_part->isStoredOnDisk())
|
||||||
{
|
{
|
||||||
|
@ -72,21 +72,20 @@ void MergedBlockOutputStream::writeSuffixAndFinalizePart(
|
|||||||
projection_part->checksums.getTotalSizeOnDisk(),
|
projection_part->checksums.getTotalSizeOnDisk(),
|
||||||
projection_part->checksums.getTotalChecksumUInt128());
|
projection_part->checksums.getTotalChecksumUInt128());
|
||||||
|
|
||||||
NamesAndTypesList part_columns;
|
if (reset_columns)
|
||||||
if (!total_columns_list)
|
{
|
||||||
part_columns = columns_list;
|
auto part_columns = total_columns_list ? *total_columns_list : columns_list;
|
||||||
else
|
auto serialization_infos = new_part->getSerializationInfos();
|
||||||
part_columns = *total_columns_list;
|
|
||||||
|
|
||||||
auto & serialization_infos = reset_columns
|
serialization_infos.replaceData(new_serialization_infos);
|
||||||
? new_serialization_infos
|
removeEmptyColumnsFromPart(new_part, part_columns, serialization_infos, checksums);
|
||||||
: new_part->getSerializationInfos();
|
|
||||||
|
new_part->setColumns(part_columns);
|
||||||
|
new_part->setSerializationInfos(serialization_infos);
|
||||||
|
}
|
||||||
|
|
||||||
if (new_part->isStoredOnDisk())
|
if (new_part->isStoredOnDisk())
|
||||||
finalizePartOnDisk(new_part, part_columns, serialization_infos, checksums, sync);
|
finalizePartOnDisk(new_part, checksums, sync);
|
||||||
|
|
||||||
if (reset_columns)
|
|
||||||
new_part->setColumns(part_columns, serialization_infos);
|
|
||||||
|
|
||||||
new_part->rows_count = rows_count;
|
new_part->rows_count = rows_count;
|
||||||
new_part->modification_time = time(nullptr);
|
new_part->modification_time = time(nullptr);
|
||||||
@ -102,9 +101,7 @@ void MergedBlockOutputStream::writeSuffixAndFinalizePart(
|
|||||||
}
|
}
|
||||||
|
|
||||||
void MergedBlockOutputStream::finalizePartOnDisk(
|
void MergedBlockOutputStream::finalizePartOnDisk(
|
||||||
const MergeTreeData::MutableDataPartPtr & new_part,
|
const MergeTreeData::DataPartPtr & new_part,
|
||||||
NamesAndTypesList & part_columns,
|
|
||||||
SerializationInfoByName & serialization_infos,
|
|
||||||
MergeTreeData::DataPart::Checksums & checksums,
|
MergeTreeData::DataPart::Checksums & checksums,
|
||||||
bool sync)
|
bool sync)
|
||||||
{
|
{
|
||||||
@ -171,13 +168,11 @@ void MergedBlockOutputStream::finalizePartOnDisk(
|
|||||||
out->sync();
|
out->sync();
|
||||||
}
|
}
|
||||||
|
|
||||||
removeEmptyColumnsFromPart(new_part, part_columns, serialization_infos, checksums);
|
if (!new_part->getSerializationInfos().empty())
|
||||||
|
|
||||||
if (!serialization_infos.empty())
|
|
||||||
{
|
{
|
||||||
auto out = volume->getDisk()->writeFile(part_path + IMergeTreeDataPart::SERIALIZATION_FILE_NAME, 4096);
|
auto out = volume->getDisk()->writeFile(part_path + IMergeTreeDataPart::SERIALIZATION_FILE_NAME, 4096);
|
||||||
HashingWriteBuffer out_hashing(*out);
|
HashingWriteBuffer out_hashing(*out);
|
||||||
serialization_infos.writeJSON(out_hashing);
|
new_part->getSerializationInfos().writeJSON(out_hashing);
|
||||||
checksums.files[IMergeTreeDataPart::SERIALIZATION_FILE_NAME].file_size = out_hashing.count();
|
checksums.files[IMergeTreeDataPart::SERIALIZATION_FILE_NAME].file_size = out_hashing.count();
|
||||||
checksums.files[IMergeTreeDataPart::SERIALIZATION_FILE_NAME].file_hash = out_hashing.getHash();
|
checksums.files[IMergeTreeDataPart::SERIALIZATION_FILE_NAME].file_hash = out_hashing.getHash();
|
||||||
out->finalize();
|
out->finalize();
|
||||||
@ -188,7 +183,7 @@ void MergedBlockOutputStream::finalizePartOnDisk(
|
|||||||
{
|
{
|
||||||
/// Write a file with a description of columns.
|
/// Write a file with a description of columns.
|
||||||
auto out = volume->getDisk()->writeFile(fs::path(part_path) / "columns.txt", 4096);
|
auto out = volume->getDisk()->writeFile(fs::path(part_path) / "columns.txt", 4096);
|
||||||
part_columns.writeText(*out);
|
new_part->getColumns().writeText(*out);
|
||||||
out->finalize();
|
out->finalize();
|
||||||
if (sync)
|
if (sync)
|
||||||
out->sync();
|
out->sync();
|
||||||
|
@ -47,9 +47,7 @@ private:
|
|||||||
void writeImpl(const Block & block, const IColumn::Permutation * permutation);
|
void writeImpl(const Block & block, const IColumn::Permutation * permutation);
|
||||||
|
|
||||||
void finalizePartOnDisk(
|
void finalizePartOnDisk(
|
||||||
const MergeTreeData::MutableDataPartPtr & new_part,
|
const MergeTreeData::DataPartPtr & new_part,
|
||||||
NamesAndTypesList & part_columns,
|
|
||||||
SerializationInfoByName & serialization_infos,
|
|
||||||
MergeTreeData::DataPart::Checksums & checksums,
|
MergeTreeData::DataPart::Checksums & checksums,
|
||||||
bool sync);
|
bool sync);
|
||||||
|
|
||||||
|
@ -71,13 +71,17 @@ MergedColumnOnlyOutputStream::writeSuffixAndGetChecksums(
|
|||||||
projection_part->checksums.getTotalChecksumUInt128());
|
projection_part->checksums.getTotalChecksumUInt128());
|
||||||
|
|
||||||
auto columns = new_part->getColumns();
|
auto columns = new_part->getColumns();
|
||||||
|
auto serialization_infos = new_part->getSerializationInfos();
|
||||||
|
serialization_infos.replaceData(new_serialization_infos);
|
||||||
|
|
||||||
auto removed_files = removeEmptyColumnsFromPart(new_part, columns, new_serialization_infos, checksums);
|
auto removed_files = removeEmptyColumnsFromPart(new_part, columns, serialization_infos, checksums);
|
||||||
for (const String & removed_file : removed_files)
|
for (const String & removed_file : removed_files)
|
||||||
if (all_checksums.files.count(removed_file))
|
if (all_checksums.files.count(removed_file))
|
||||||
all_checksums.files.erase(removed_file);
|
all_checksums.files.erase(removed_file);
|
||||||
|
|
||||||
new_part->setColumns(columns, new_serialization_infos);
|
new_part->setColumns(columns);
|
||||||
|
new_part->setSerializationInfos(serialization_infos);
|
||||||
|
|
||||||
return checksums;
|
return checksums;
|
||||||
}
|
}
|
||||||
|
|
||||||
|
@ -1295,7 +1295,8 @@ bool MutateTask::prepare()
|
|||||||
ctx->source_part, ctx->updated_header, ctx->storage_columns,
|
ctx->source_part, ctx->updated_header, ctx->storage_columns,
|
||||||
ctx->source_part->getSerializationInfos(), ctx->commands_for_part);
|
ctx->source_part->getSerializationInfos(), ctx->commands_for_part);
|
||||||
|
|
||||||
ctx->new_data_part->setColumns(new_columns, new_infos);
|
ctx->new_data_part->setColumns(new_columns);
|
||||||
|
ctx->new_data_part->setSerializationInfos(new_infos);
|
||||||
ctx->new_data_part->partition.assign(ctx->source_part->partition);
|
ctx->new_data_part->partition.assign(ctx->source_part->partition);
|
||||||
|
|
||||||
ctx->disk = ctx->new_data_part->volume->getDisk();
|
ctx->disk = ctx->new_data_part->volume->getDisk();
|
||||||
|
@ -0,0 +1,12 @@
|
|||||||
|
1000
|
||||||
|
id Default
|
||||||
|
s Sparse
|
||||||
|
1000
|
||||||
|
id Default
|
||||||
|
s Sparse
|
||||||
|
1000
|
||||||
|
id Default
|
||||||
|
s Sparse
|
||||||
|
1000
|
||||||
|
id Default
|
||||||
|
s Sparse
|
53
tests/queries/0_stateless/02179_sparse_columns_detach.sql
Normal file
53
tests/queries/0_stateless/02179_sparse_columns_detach.sql
Normal file
@ -0,0 +1,53 @@
|
|||||||
|
DROP TABLE IF EXISTS t_sparse_detach;
|
||||||
|
|
||||||
|
CREATE TABLE t_sparse_detach(id UInt64, s String)
|
||||||
|
ENGINE = MergeTree ORDER BY id
|
||||||
|
SETTINGS ratio_of_defaults_for_sparse_serialization = 0.9;
|
||||||
|
|
||||||
|
INSERT INTO t_sparse_detach SELECT number, number % 20 = 0 ? toString(number) : '' FROM numbers(10000);
|
||||||
|
INSERT INTO t_sparse_detach SELECT number, number % 20 = 0 ? toString(number) : '' FROM numbers(10000);
|
||||||
|
|
||||||
|
OPTIMIZE TABLE t_sparse_detach FINAL;
|
||||||
|
|
||||||
|
SELECT count() FROM t_sparse_detach WHERE s != '';
|
||||||
|
|
||||||
|
SELECT column, serialization_kind FROM system.parts_columns
|
||||||
|
WHERE table = 't_sparse_detach' AND database = currentDatabase() AND active
|
||||||
|
ORDER BY column;
|
||||||
|
|
||||||
|
DETACH TABLE t_sparse_detach;
|
||||||
|
ATTACH TABLE t_sparse_detach;
|
||||||
|
|
||||||
|
SELECT count() FROM t_sparse_detach WHERE s != '';
|
||||||
|
|
||||||
|
SELECT column, serialization_kind FROM system.parts_columns
|
||||||
|
WHERE table = 't_sparse_detach' AND database = currentDatabase() AND active
|
||||||
|
ORDER BY column;
|
||||||
|
|
||||||
|
TRUNCATE TABLE t_sparse_detach;
|
||||||
|
|
||||||
|
ALTER TABLE t_sparse_detach
|
||||||
|
MODIFY SETTING vertical_merge_algorithm_min_rows_to_activate = 1,
|
||||||
|
vertical_merge_algorithm_min_columns_to_activate = 1;
|
||||||
|
|
||||||
|
INSERT INTO t_sparse_detach SELECT number, number % 20 = 0 ? toString(number) : '' FROM numbers(10000);
|
||||||
|
INSERT INTO t_sparse_detach SELECT number, number % 20 = 0 ? toString(number) : '' FROM numbers(10000);
|
||||||
|
|
||||||
|
OPTIMIZE TABLE t_sparse_detach FINAL;
|
||||||
|
|
||||||
|
SELECT count() FROM t_sparse_detach WHERE s != '';
|
||||||
|
|
||||||
|
SELECT column, serialization_kind FROM system.parts_columns
|
||||||
|
WHERE table = 't_sparse_detach' AND database = currentDatabase() AND active
|
||||||
|
ORDER BY column;
|
||||||
|
|
||||||
|
DETACH TABLE t_sparse_detach;
|
||||||
|
ATTACH TABLE t_sparse_detach;
|
||||||
|
|
||||||
|
SELECT count() FROM t_sparse_detach WHERE s != '';
|
||||||
|
|
||||||
|
SELECT column, serialization_kind FROM system.parts_columns
|
||||||
|
WHERE table = 't_sparse_detach' AND database = currentDatabase() AND active
|
||||||
|
ORDER BY column;
|
||||||
|
|
||||||
|
DROP TABLE t_sparse_detach;
|
Loading…
Reference in New Issue
Block a user