Update new prepareForSquashing method for ColumnDynamic

This commit is contained in:
avogar 2024-08-12 21:29:26 +00:00
parent 03182c7a8f
commit 8136e6a452
4 changed files with 35 additions and 27 deletions

View File

@ -987,7 +987,8 @@ void ColumnDynamic::prepareForSquashing(const Columns & source_columns)
/// Internal variants of source dynamic columns may differ.
/// We want to preallocate memory for all variants we will have after squashing.
/// It may happen that the total number of variants in source columns will
/// exceed the limit, in this case we will choose the most frequent variants.
/// exceed the limit, in this case we will choose the most frequent variants
/// and insert the remaining types into the shared variant.
/// First, preallocate memory for variant discriminators and offsets.
size_t new_size = size();
@ -1030,17 +1031,14 @@ void ColumnDynamic::prepareForSquashing(const Columns & source_columns)
DataTypePtr result_variant_type;
/// Check if the number of all variants exceeds the limit.
if (all_variants.size() > max_dynamic_types || (all_variants.size() == max_dynamic_types && !total_variant_sizes.contains("String")))
if (!canAddNewVariants(0, all_variants.size()))
{
/// We want to keep the most frequent variants in the resulting dynamic column.
DataTypes result_variants;
result_variants.reserve(max_dynamic_types);
result_variants.reserve(max_dynamic_types + 1); /// +1 for shared variant.
/// Add variants from current variant column as we will not rewrite it.
for (const auto & variant : assert_cast<const DataTypeVariant &>(*variant_info.variant_type).getVariants())
result_variants.push_back(variant);
/// Add String variant in advance (if we didn't add it yet) as we must have it across variants when we reach the limit.
if (!variant_info.variant_name_to_discriminator.contains("String"))
result_variants.push_back(std::make_shared<DataTypeString>());
/// Create list of remaining variants with their sizes and sort it.
std::vector<std::pair<size_t, DataTypePtr>> variants_with_sizes;
@ -1049,15 +1047,18 @@ void ColumnDynamic::prepareForSquashing(const Columns & source_columns)
{
/// Add variant to the list only if we haven't added it yet.
auto variant_name = variant->getName();
if (variant_name != "String" && !variant_info.variant_name_to_discriminator.contains(variant_name))
variants_with_sizes.emplace_back(total_variant_sizes[variant->getName()], variant);
if (!variant_info.variant_name_to_discriminator.contains(variant_name))
variants_with_sizes.emplace_back(total_variant_sizes[variant_name], variant);
}
std::sort(variants_with_sizes.begin(), variants_with_sizes.end(), std::greater());
/// Add the most frequent variants until we reach max_dynamic_types.
size_t num_new_variants = max_dynamic_types - result_variants.size();
for (size_t i = 0; i != num_new_variants; ++i)
result_variants.push_back(variants_with_sizes[i].second);
for (const auto & [_, new_variant] : variants_with_sizes)
{
if (!canAddNewVariant(result_variants.size()))
break;
result_variants.push_back(new_variant);
}
result_variant_type = std::make_shared<DataTypeVariant>(result_variants);
}

View File

@ -117,7 +117,7 @@ bool DataTypeVariant::equals(const IDataType & rhs) const
/// The same data types with different custom names considered different.
/// For example, UInt8 and Bool.
if ((variants[i]->hasCustomName() || rhs_variant.variants[i]) && variants[i]->getName() != rhs_variant.variants[i]->getName())
if ((variants[i]->hasCustomName() || rhs_variant.variants[i]->hasCustomName()) && variants[i]->getName() != rhs_variant.variants[i]->getName())
return false;
}

View File

@ -1,8 +1,12 @@
Array(UInt8)
None
UInt64
None
String
UInt64
String
UInt64
1
Array(UInt8) true
None false
UInt64 false
2
Array(UInt8) true
None false
UInt64 false
3
Array(UInt8) true
String false
UInt64 true

View File

@ -4,17 +4,20 @@ set max_block_size = 1000;
drop table if exists test;
create table test (d Dynamic) engine=MergeTree order by tuple();
insert into test select multiIf(number < 1000, NULL::Dynamic(max_types=2), number < 3000, range(number % 5)::Dynamic(max_types=2), number::Dynamic(max_types=2)) from numbers(1000000);
select distinct dynamicType(d) as type from test order by type;
insert into test select multiIf(number < 1000, NULL::Dynamic(max_types=1), number < 3000, range(number % 5)::Dynamic(max_types=1), number::Dynamic(max_types=1)) from numbers(1000000);
select '1';
select distinct dynamicType(d) as type, isDynamicElementInSharedData(d) as flag from test order by type;
drop table test;
create table test (d Dynamic(max_types=2)) engine=MergeTree order by tuple();
insert into test select multiIf(number < 1000, NULL::Dynamic(max_types=2), number < 3000, range(number % 5)::Dynamic(max_types=2), number::Dynamic(max_types=2)) from numbers(1000000);
select distinct dynamicType(d) as type from test order by type;
create table test (d Dynamic(max_types=1)) engine=MergeTree order by tuple();
insert into test select multiIf(number < 1000, NULL::Dynamic(max_types=1), number < 3000, range(number % 5)::Dynamic(max_types=1), number::Dynamic(max_types=1)) from numbers(1000000);
select '2';
select distinct dynamicType(d) as type, isDynamicElementInSharedData(d) as flag from test order by type;
truncate table test;
insert into test select multiIf(number < 1000, 'Str'::Dynamic(max_types=2), number < 3000, range(number % 5)::Dynamic(max_types=2), number::Dynamic(max_types=2)) from numbers(1000000);
select distinct dynamicType(d) as type from test order by type;
insert into test select multiIf(number < 1000, 'Str'::Dynamic(max_types=1), number < 3000, range(number % 5)::Dynamic(max_types=1), number::Dynamic(max_types=1)) from numbers(1000000);
select '3';
select distinct dynamicType(d) as type, isDynamicElementInSharedData(d) as flag from test order by type;
drop table test;