Backport #70328 to 24.8: Fix ALTER of Dynamic type with reducing max_types parameter

This commit is contained in:
robot-clickhouse 2024-10-07 05:08:51 +00:00
parent fa3d99b00a
commit 87e3c55bca
3 changed files with 118 additions and 9 deletions

View File

@ -4487,26 +4487,124 @@ private:
return [to_max_types]
(ColumnsWithTypeAndName & arguments, const DataTypePtr &, const ColumnNullable *, size_t) -> ColumnPtr
{
const auto & column_dynamic = assert_cast<const ColumnDynamic &>(*arguments[0].column);
const auto & dynamic_column = assert_cast<const ColumnDynamic &>(*arguments[0].column);
/// We should use the same limit as already used in column and change only global limit.
/// It's needed because shared variant should contain values only when limit is exceeded,
/// so if there are already some data, we cannot increase the limit.
return ColumnDynamic::create(column_dynamic.getVariantColumnPtr(), column_dynamic.getVariantInfo(), column_dynamic.getMaxDynamicTypes(), to_max_types);
return ColumnDynamic::create(dynamic_column.getVariantColumnPtr(), dynamic_column.getVariantInfo(), dynamic_column.getMaxDynamicTypes(), to_max_types);
};
}
return [to_max_types]
(ColumnsWithTypeAndName & arguments, const DataTypePtr &, const ColumnNullable *, size_t) -> ColumnPtr
{
const auto & column_dynamic = assert_cast<const ColumnDynamic &>(*arguments[0].column);
const auto & dynamic_column = assert_cast<const ColumnDynamic &>(*arguments[0].column);
/// If real limit in the column is not greater than desired, just use the same variant column.
if (column_dynamic.getMaxDynamicTypes() <= to_max_types)
return ColumnDynamic::create(column_dynamic.getVariantColumnPtr(), column_dynamic.getVariantInfo(), column_dynamic.getMaxDynamicTypes(), to_max_types);
if (dynamic_column.getMaxDynamicTypes() <= to_max_types)
return ColumnDynamic::create(dynamic_column.getVariantColumnPtr(), dynamic_column.getVariantInfo(), dynamic_column.getMaxDynamicTypes(), to_max_types);
/// Otherwise some variants should go to the shared variant. We try to keep the most frequent variants.
const auto & variant_info = dynamic_column.getVariantInfo();
const auto & variants = assert_cast<const DataTypeVariant &>(*variant_info.variant_type).getVariants();
const auto & statistics = dynamic_column.getStatistics();
const auto & variant_column = dynamic_column.getVariantColumn();
auto shared_variant_discr = dynamic_column.getSharedVariantDiscriminator();
std::vector<std::tuple<size_t, String, DataTypePtr>> variants_with_sizes;
variants_with_sizes.reserve(variant_info.variant_names.size());
for (const auto & [name, discr] : variant_info.variant_name_to_discriminator)
{
/// Don't include shared variant.
if (discr == shared_variant_discr)
continue;
size_t size = variant_column.getVariantByGlobalDiscriminator(discr).size();
/// If column has statistics from the data part, use size from it for consistency.
/// It's important to keep the same dynamic structure of the result column during ALTER.
if (statistics)
{
auto statistics_it = statistics->variants_statistics.find(name);
if (statistics_it != statistics->variants_statistics.end())
size = statistics_it->second;
}
variants_with_sizes.emplace_back(size, name, variants[discr]);
}
std::sort(variants_with_sizes.begin(), variants_with_sizes.end(), std::greater());
DataTypes result_variants;
result_variants.reserve(to_max_types + 1); /// +1 for shared variant.
/// Add new variants from sorted list until we reach to_max_types.
for (const auto & [size, name, type] : variants_with_sizes)
{
if (result_variants.size() < to_max_types)
result_variants.push_back(type);
else
break;
}
/// Add shared variant.
result_variants.push_back(ColumnDynamic::getSharedVariantDataType());
/// Create resulting Variant type and Dynamic column.
auto result_variant_type = std::make_shared<DataTypeVariant>(result_variants);
auto result_dynamic_column = ColumnDynamic::create(result_variant_type->createColumn(), result_variant_type, to_max_types, to_max_types);
const auto & result_variant_info = result_dynamic_column->getVariantInfo();
auto & result_variant_column = result_dynamic_column->getVariantColumn();
auto result_shared_variant_discr = result_dynamic_column->getSharedVariantDiscriminator();
/// Create mapping from old discriminators to the new ones.
std::vector<ColumnVariant::Discriminator> old_to_new_discriminators;
old_to_new_discriminators.resize(variant_info.variant_name_to_discriminator.size(), result_shared_variant_discr);
for (const auto & [name, discr] : result_variant_info.variant_name_to_discriminator)
{
auto old_discr = variant_info.variant_name_to_discriminator.at(name);
old_to_new_discriminators[old_discr] = discr;
/// Reuse old variant column if it's not shared variant.
if (discr != result_shared_variant_discr)
result_variant_column.getVariantPtrByGlobalDiscriminator(discr) = variant_column.getVariantPtrByGlobalDiscriminator(old_discr);
}
const auto & local_discriminators = variant_column.getLocalDiscriminators();
const auto & offsets = variant_column.getOffsets();
const auto & shared_variant = dynamic_column.getSharedVariant();
auto & result_local_discriminators = result_variant_column.getLocalDiscriminators();
result_local_discriminators.reserve(local_discriminators.size());
auto & result_offsets = result_variant_column.getOffsets();
result_offsets.reserve(offsets.size());
auto & result_shared_variant = result_dynamic_column->getSharedVariant();
for (size_t i = 0; i != local_discriminators.size(); ++i)
{
auto global_discr = variant_column.globalDiscriminatorByLocal(local_discriminators[i]);
if (global_discr == ColumnVariant::NULL_DISCRIMINATOR)
{
result_local_discriminators.push_back(ColumnVariant::NULL_DISCRIMINATOR);
result_offsets.emplace_back();
}
else if (global_discr == shared_variant_discr)
{
result_local_discriminators.push_back(result_variant_column.localDiscriminatorByGlobal(result_shared_variant_discr));
result_offsets.push_back(result_shared_variant.size());
result_shared_variant.insertFrom(shared_variant, offsets[i]);
}
else
{
auto result_global_discr = old_to_new_discriminators[global_discr];
if (result_global_discr == result_shared_variant_discr)
{
result_local_discriminators.push_back(result_variant_column.localDiscriminatorByGlobal(result_shared_variant_discr));
result_offsets.push_back(result_shared_variant.size());
ColumnDynamic::serializeValueIntoSharedVariant(
result_shared_variant,
variant_column.getVariantByGlobalDiscriminator(global_discr),
variants[global_discr],
variants[global_discr]->getDefaultSerialization(),
offsets[i]);
}
else
{
result_local_discriminators.push_back(result_variant_column.localDiscriminatorByGlobal(result_global_discr));
result_offsets.push_back(offsets[i]);
}
}
}
/// Otherwise some variants should go to the shared variant. In this case we can just insert all
/// the data into resulting column and it will do all the logic with shared variant.
auto result_dynamic_column = ColumnDynamic::create(to_max_types);
result_dynamic_column->insertRangeFrom(column_dynamic, 0, column_dynamic.size());
return result_dynamic_column;
};
}

View File

@ -0,0 +1,2 @@
600000 UInt64 false
400000 String true

View File

@ -0,0 +1,9 @@
-- Regression test: ALTER of a Dynamic column that reduces its max_types parameter.
set allow_experimental_dynamic_type=1;
drop table if exists test;
-- The wide-part thresholds are lowered to 1 row / 1 byte for this table.
create table test (d Dynamic) engine=MergeTree order by tuple() settings min_rows_for_wide_part=1, min_bytes_for_wide_part=1;
-- 600000 UInt64 values followed by 400000 String values.
insert into test select number < 600000 ? number::Dynamic : ('str_' || number)::Dynamic from numbers(1000000);
-- Shrink the limit to a single type.
alter table test modify column d Dynamic(max_types=1);
-- GROUP BY gives no ordering guarantee on its own, so sort explicitly to keep the output stable:
-- the two groups have different sizes (600000 vs 400000), so count() desc is a total order here.
select count(), dynamicType(d), isDynamicElementInSharedData(d) from test group by dynamicType(d), isDynamicElementInSharedData(d) order by count() desc;
drop table test;