From 57c1d7a1011f96cea21ca66a3064b7481f8ce40b Mon Sep 17 00:00:00 2001 From: Anton Popov Date: Fri, 19 Jul 2024 12:36:57 +0000 Subject: [PATCH] fix filling of empty Nested --- src/DataTypes/IDataType.cpp | 4 +- src/DataTypes/ObjectUtils.cpp | 31 ++++++++++ src/DataTypes/ObjectUtils.h | 3 + src/DataTypes/Serializations/ISerialization.h | 3 +- src/Interpreters/inplaceBlockConversions.cpp | 57 ++++++++++++++++--- src/Storages/MergeTree/IMergeTreeReader.cpp | 7 ++- src/Storages/MergeTree/IMergeTreeReader.h | 3 + 7 files changed, 95 insertions(+), 13 deletions(-) diff --git a/src/DataTypes/IDataType.cpp b/src/DataTypes/IDataType.cpp index 1cb64b65d3a..824bc6e33b0 100644 --- a/src/DataTypes/IDataType.cpp +++ b/src/DataTypes/IDataType.cpp @@ -90,7 +90,9 @@ void IDataType::forEachSubcolumn( { auto name = ISerialization::getSubcolumnNameForStream(subpath, prefix_len); auto subdata = ISerialization::createFromPath(subpath, prefix_len); - callback(subpath, name, subdata); + auto path_copy = subpath; + path_copy.resize(prefix_len); + callback(path_copy, name, subdata); } subpath[i].visited = true; } diff --git a/src/DataTypes/ObjectUtils.cpp b/src/DataTypes/ObjectUtils.cpp index 1d525e5987f..356e609e77a 100644 --- a/src/DataTypes/ObjectUtils.cpp +++ b/src/DataTypes/ObjectUtils.cpp @@ -8,6 +8,7 @@ #include #include #include +#include #include #include #include @@ -66,6 +67,36 @@ DataTypePtr getBaseTypeOfArray(const DataTypePtr & type) return last_array ? last_array->getNestedType() : type; } +DataTypePtr getBaseTypeOfArray(DataTypePtr type, const Names & tuple_elements) +{ + auto it = tuple_elements.begin(); + while (true) + { + if (const auto * type_array = typeid_cast(type.get())) + { + type = type_array->getNestedType(); + } + else if (const auto * type_tuple = typeid_cast(type.get())) + { + if (it == tuple_elements.end()) + break; + + auto pos = type_tuple->tryGetPositionByName(*it); + if (!pos) + break; + + ++it; + type = type_tuple->getElement(*pos); + } + else + { + break; + } + } + + return type; +} + ColumnPtr getBaseColumnOfArray(const ColumnPtr & column) { /// Get raw pointers to avoid extra copying of column pointers. diff --git a/src/DataTypes/ObjectUtils.h b/src/DataTypes/ObjectUtils.h index 6599d8adef1..21e5c3b2f59 100644 --- a/src/DataTypes/ObjectUtils.h +++ b/src/DataTypes/ObjectUtils.h @@ -27,6 +27,9 @@ size_t getNumberOfDimensions(const IColumn & column); /// Returns type of scalars of Array of arbitrary dimensions. DataTypePtr getBaseTypeOfArray(const DataTypePtr & type); +/// The same as above but takes into account Tuples of Nested. +DataTypePtr getBaseTypeOfArray(DataTypePtr type, const Names & tuple_elements); + /// Returns Array type with requested scalar type and number of dimensions. DataTypePtr createArrayOfType(DataTypePtr type, size_t num_dimensions); diff --git a/src/DataTypes/Serializations/ISerialization.h b/src/DataTypes/Serializations/ISerialization.h index 255dbbfadd2..5d0bf60c59f 100644 --- a/src/DataTypes/Serializations/ISerialization.h +++ b/src/DataTypes/Serializations/ISerialization.h @@ -195,7 +195,7 @@ public: /// Types of substreams that can have arbitrary name. static const std::set named_types; - Type type; + Type type = Type::Regular; /// The name of a variant element type. String variant_element_name; @@ -212,6 +212,7 @@ public: /// Flag, that may help to traverse substream paths. mutable bool visited = false; + Substream() = default; Substream(Type type_) : type(type_) {} /// NOLINT String toString() const; }; diff --git a/src/Interpreters/inplaceBlockConversions.cpp b/src/Interpreters/inplaceBlockConversions.cpp index f7d8a2a2daf..ce3f25d16f8 100644 --- a/src/Interpreters/inplaceBlockConversions.cpp +++ b/src/Interpreters/inplaceBlockConversions.cpp @@ -283,6 +283,9 @@ static ColumnPtr createColumnWithDefaultValue(const IDataType & data_type, const { auto column = data_type.createColumnConstWithDefaultValue(num_rows); + /// We must turn a constant column into a full column because the interpreter could infer + /// that it is constant everywhere but in some blocks (from other parts) it can be a full column. + if (subcolumn_name.empty()) return column->convertToFullColumnIfConst(); @@ -293,6 +296,35 @@ static ColumnPtr createColumnWithDefaultValue(const IDataType & data_type, const return ColumnConst::create(std::move(column), num_rows)->convertToFullColumnIfConst(); } +static bool hasDefault(const StorageMetadataPtr & metadata_snapshot, const NameAndTypePair & column) +{ + if (!metadata_snapshot) + return false; + + const auto & columns = metadata_snapshot->getColumns(); + if (columns.has(column.name)) + return columns.hasDefault(column.name); + + auto name_in_storage = column.getNameInStorage(); + return columns.hasDefault(name_in_storage); +} + +static String removeTupleElementsFromSubcolumn(String subcolumn_name, const Names & tuple_elements) +{ + subcolumn_name += "."; + for (const auto & elem : tuple_elements) + { + auto pos = subcolumn_name.find(elem + "."); + if (pos != std::string::npos) + subcolumn_name.erase(pos, elem.size()); + } + + if (subcolumn_name.ends_with(".")) + subcolumn_name.pop_back(); + + return subcolumn_name; +} + void fillMissingColumns( Columns & res_columns, size_t num_rows, @@ -321,10 +353,8 @@ void fillMissingColumns( if (res_columns[i] && partially_read_columns.contains(requested_column->name)) res_columns[i] = nullptr; - if (res_columns[i]) - continue; - - if (metadata_snapshot && metadata_snapshot->getColumns().hasDefault(requested_column->getNameInStorage())) + /// Nothing to fill or default should be filled in evaluateMissingDefaults + if (res_columns[i] || hasDefault(metadata_snapshot, *requested_column)) continue; std::vector current_offsets; @@ -365,19 +395,30 @@ void fillMissingColumns( if (!current_offsets.empty()) { + + Names tuple_elements; + auto serialization = IDataType::getSerialization(*requested_column); + + IDataType::forEachSubcolumn([&](const auto & path, const auto &, const auto &) + { + if (path.back().type == ISerialization::Substream::TupleElement) + tuple_elements.push_back(path.back().name_of_substream); + }, ISerialization::SubstreamData(serialization)); + size_t num_empty_dimensions = num_dimensions - current_offsets.size(); - auto scalar_type = createArrayOfType(getBaseTypeOfArray(requested_column->getTypeInStorage()), num_empty_dimensions); + auto base_type = getBaseTypeOfArray(requested_column->getTypeInStorage(), tuple_elements); + auto scalar_type = createArrayOfType(base_type, num_empty_dimensions); size_t data_size = assert_cast(*current_offsets.back()).getData().back(); - res_columns[i] = createColumnWithDefaultValue(*scalar_type, requested_column->getSubcolumnName(), data_size); + auto subcolumn_name = removeTupleElementsFromSubcolumn(requested_column->getSubcolumnName(), tuple_elements); + + res_columns[i] = createColumnWithDefaultValue(*scalar_type, subcolumn_name, data_size); for (auto it = current_offsets.rbegin(); it != current_offsets.rend(); ++it) res_columns[i] = ColumnArray::create(res_columns[i], *it); } else { - /// We must turn a constant column into a full column because the interpreter could infer - /// that it is constant everywhere but in some blocks (from other parts) it can be a full column. res_columns[i] = createColumnWithDefaultValue(*requested_column->getTypeInStorage(), requested_column->getSubcolumnName(), num_rows); } } diff --git a/src/Storages/MergeTree/IMergeTreeReader.cpp b/src/Storages/MergeTree/IMergeTreeReader.cpp index aff1001163e..5f36e4c7c13 100644 --- a/src/Storages/MergeTree/IMergeTreeReader.cpp +++ b/src/Storages/MergeTree/IMergeTreeReader.cpp @@ -44,6 +44,7 @@ IMergeTreeReader::IMergeTreeReader( , alter_conversions(data_part_info_for_read->getAlterConversions()) /// For wide parts convert plain arrays of Nested to subcolumns /// to allow to use shared offset column from cache. + , original_requested_columns(columns_) , requested_columns(data_part_info_for_read->isWidePart() ? Nested::convertToSubcolumns(columns_) : columns_) @@ -139,7 +140,7 @@ void IMergeTreeReader::evaluateMissingDefaults(Block additional_columns, Columns { try { - size_t num_columns = requested_columns.size(); + size_t num_columns = original_requested_columns.size(); if (res_columns.size() != num_columns) throw Exception(ErrorCodes::LOGICAL_ERROR, "invalid number of columns passed to MergeTreeReader::fillMissingColumns. " @@ -151,7 +152,7 @@ void IMergeTreeReader::evaluateMissingDefaults(Block additional_columns, Columns /// Convert columns list to block. And convert subcolumns to full columns. /// TODO: rewrite with columns interface. It will be possible after changes in ExpressionActions. - auto it = requested_columns.begin(); + auto it = original_requested_columns.begin(); for (size_t pos = 0; pos < num_columns; ++pos, ++it) { auto name_in_storage = it->getNameInStorage(); @@ -178,7 +179,7 @@ void IMergeTreeReader::evaluateMissingDefaults(Block additional_columns, Columns } /// Move columns from block. - it = requested_columns.begin(); + it = original_requested_columns.begin(); for (size_t pos = 0; pos < num_columns; ++pos, ++it) { auto name_in_storage = it->getNameInStorage(); diff --git a/src/Storages/MergeTree/IMergeTreeReader.h b/src/Storages/MergeTree/IMergeTreeReader.h index a1ec0339fd6..d799ce57b40 100644 --- a/src/Storages/MergeTree/IMergeTreeReader.h +++ b/src/Storages/MergeTree/IMergeTreeReader.h @@ -112,6 +112,9 @@ protected: private: /// Columns that are requested to read. + NamesAndTypesList original_requested_columns; + + /// The same as above but with converted Arrays to subcolumns of Nested. NamesAndTypesList requested_columns; /// Actual columns description in part.