From 06d5b87bc9fe90688bf819f7d4399947651abb7e Mon Sep 17 00:00:00 2001 From: Anton Popov Date: Mon, 7 Dec 2020 22:02:26 +0300 Subject: [PATCH] fix nested and subcolumns --- src/DataTypes/NestedUtils.cpp | 40 ++++++++++++++++--- src/DataTypes/NestedUtils.h | 3 ++ .../MergeTree/MergeTreeDataPartInMemory.cpp | 2 +- .../MergeTree/MergeTreeDataPartWide.cpp | 2 +- src/Storages/StorageBuffer.cpp | 20 ++++++---- src/Storages/StorageLog.cpp | 3 +- src/Storages/StorageTinyLog.cpp | 3 +- 7 files changed, 54 insertions(+), 19 deletions(-) diff --git a/src/DataTypes/NestedUtils.cpp b/src/DataTypes/NestedUtils.cpp index 1d080d1032a..6c13eea0a1b 100644 --- a/src/DataTypes/NestedUtils.cpp +++ b/src/DataTypes/NestedUtils.cpp @@ -86,7 +86,7 @@ Block flatten(const Block & block) for (const auto & elem : block) { const DataTypeArray * type_arr = typeid_cast(elem.type.get()); - if (!isNested(elem.type) && type_arr) + if (type_arr) { const DataTypeTuple * type_tuple = typeid_cast(type_arr->getNestedType().get()); if (type_tuple && type_tuple->haveExplicitNames()) @@ -130,12 +130,14 @@ Block flatten(const Block & block) return res; } - -NamesAndTypesList collect(const NamesAndTypesList & names_and_types) +namespace { - NamesAndTypesList res = names_and_types; - std::map nested; +using NameToDataType = std::map; + +NameToDataType getSubcolumnsOfNested(const NamesAndTypesList & names_and_types) +{ + std::unordered_map nested; for (const auto & name_type : names_and_types) { const DataTypeArray * type_arr = typeid_cast(name_type.type.get()); @@ -149,10 +151,36 @@ NamesAndTypesList collect(const NamesAndTypesList & names_and_types) } } - std::unordered_map nested_types; + std::map nested_types; + for (const auto & [name, elems] : nested) nested_types.emplace(name, createNested(elems.getTypes(), elems.getNames())); + return nested_types; +} + +} + +NamesAndTypesList collect(const NamesAndTypesList & names_and_types) +{ + NamesAndTypesList res; + auto nested_types = getSubcolumnsOfNested(names_and_types); + + for (const auto & name_type : names_and_types) + if (!nested_types.count(splitName(name_type.name).first)) + res.push_back(name_type); + + for (const auto & name_type : nested_types) + res.emplace_back(name_type.first, name_type.second); + + return res; +} + +NamesAndTypesList convertToSubcolumns(const NamesAndTypesList & names_and_types) +{ + auto nested_types = getSubcolumnsOfNested(names_and_types); + auto res = names_and_types; + for (auto & name_type : res) { auto split = splitName(name_type.name); diff --git a/src/DataTypes/NestedUtils.h b/src/DataTypes/NestedUtils.h index 3039fd7f118..b8428b96d3e 100644 --- a/src/DataTypes/NestedUtils.h +++ b/src/DataTypes/NestedUtils.h @@ -23,6 +23,9 @@ namespace Nested /// Collect Array columns in a form of `column_name.element_name` to single Array(Tuple(...)) column. NamesAndTypesList collect(const NamesAndTypesList & names_and_types); + /// Convert old-style nested (single arrays with same prefix, `n.a`, `n.b`...) to subcolumns of data type Nested. + NamesAndTypesList convertToSubcolumns(const NamesAndTypesList & names_and_types); + /// Check that sizes of arrays - elements of nested data structures - are equal. void validateArraySizes(const Block & block); } diff --git a/src/Storages/MergeTree/MergeTreeDataPartInMemory.cpp b/src/Storages/MergeTree/MergeTreeDataPartInMemory.cpp index eb191219acc..96fa411339c 100644 --- a/src/Storages/MergeTree/MergeTreeDataPartInMemory.cpp +++ b/src/Storages/MergeTree/MergeTreeDataPartInMemory.cpp @@ -51,7 +51,7 @@ IMergeTreeDataPart::MergeTreeReaderPtr MergeTreeDataPartInMemory::getReader( { auto ptr = std::static_pointer_cast(shared_from_this()); return std::make_unique( - ptr, Nested::collect(columns_to_read), metadata_snapshot, mark_ranges, reader_settings); + ptr, columns_to_read, metadata_snapshot, mark_ranges, reader_settings); } IMergeTreeDataPart::MergeTreeWriterPtr MergeTreeDataPartInMemory::getWriter( diff --git a/src/Storages/MergeTree/MergeTreeDataPartWide.cpp b/src/Storages/MergeTree/MergeTreeDataPartWide.cpp index aea53c36bde..a38df9dc89c 100644 --- a/src/Storages/MergeTree/MergeTreeDataPartWide.cpp +++ b/src/Storages/MergeTree/MergeTreeDataPartWide.cpp @@ -50,7 +50,7 @@ IMergeTreeDataPart::MergeTreeReaderPtr MergeTreeDataPartWide::getReader( { auto ptr = std::static_pointer_cast(shared_from_this()); return std::make_unique( - ptr, Nested::collect(columns_to_read), metadata_snapshot, uncompressed_cache, + ptr, Nested::convertToSubcolumns(columns_to_read), metadata_snapshot, uncompressed_cache, mark_cache, mark_ranges, reader_settings, avg_value_size_hints, profile_callback); } diff --git a/src/Storages/StorageBuffer.cpp b/src/Storages/StorageBuffer.cpp index 549caf427ea..752cef78d71 100644 --- a/src/Storages/StorageBuffer.cpp +++ b/src/Storages/StorageBuffer.cpp @@ -95,7 +95,7 @@ public: BufferSource(const Names & column_names_, StorageBuffer::Buffer & buffer_, const StorageBuffer & storage, const StorageMetadataPtr & metadata_snapshot) : SourceWithProgress( metadata_snapshot->getSampleBlockForColumns(column_names_, storage.getVirtuals(), storage.getStorageID())) - , column_names(column_names_.begin(), column_names_.end()) + , column_names_and_types(metadata_snapshot->getColumns().getAllWithSubcolumns().addTypes(column_names_)) , buffer(buffer_) {} String getName() const override { return "Buffer"; } @@ -115,10 +115,16 @@ protected: return res; Columns columns; - columns.reserve(column_names.size()); + columns.reserve(column_names_and_types.size()); - for (const auto & name : column_names) - columns.push_back(buffer.data.getByName(name).column); + for (const auto & elem : column_names_and_types) + { + const auto & current_column = buffer.data.getByName(elem.getStorageName()).column; + if (elem.isSubcolumn()) + columns.emplace_back(elem.getStorageType()->getSubcolumn(elem.getSubcolumnName(), *current_column->assumeMutable())); + else + columns.emplace_back(std::move(current_column)); + } UInt64 size = columns.at(0)->size(); res.setColumns(std::move(columns), size); @@ -127,7 +133,7 @@ protected: } private: - Names column_names; + NamesAndTypesList column_names_and_types; StorageBuffer::Buffer & buffer; bool has_been_read = false; }; @@ -188,8 +194,8 @@ void StorageBuffer::read( { const auto & dest_columns = destination_metadata_snapshot->getColumns(); const auto & our_columns = metadata_snapshot->getColumns(); - return dest_columns.hasPhysical(column_name) && - dest_columns.get(column_name).type->equals(*our_columns.get(column_name).type); + return dest_columns.hasPhysicalOrSubcolumn(column_name) && + dest_columns.getPhysicalOrSubcolumn(column_name).type->equals(*our_columns.getPhysicalOrSubcolumn(column_name).type); }); if (dst_has_same_structure) diff --git a/src/Storages/StorageLog.cpp b/src/Storages/StorageLog.cpp index b67b5d2291a..6b2845702aa 100644 --- a/src/Storages/StorageLog.cpp +++ b/src/Storages/StorageLog.cpp @@ -57,7 +57,6 @@ public: for (const auto & name_type : columns) res.insert({ name_type.type->createColumn(), name_type.type, name_type.name }); - // return Nested::flatten(res); return res; } @@ -628,7 +627,7 @@ Pipe StorageLog::read( loadMarks(); auto all_columns = metadata_snapshot->getColumns().getAllWithSubcolumns().addTypes(column_names); - all_columns = Nested::collect(all_columns); + all_columns = Nested::convertToSubcolumns(all_columns); std::shared_lock lock(rwlock); diff --git a/src/Storages/StorageTinyLog.cpp b/src/Storages/StorageTinyLog.cpp index 6a08f4f301f..38705bdf0ec 100644 --- a/src/Storages/StorageTinyLog.cpp +++ b/src/Storages/StorageTinyLog.cpp @@ -219,7 +219,6 @@ Chunk TinyLogSource::generate() streams.clear(); } - // auto flatten = Nested::flatten(res); return Chunk(res.getColumns(), res.rows()); } @@ -449,7 +448,7 @@ Pipe StorageTinyLog::read( // When reading, we lock the entire storage, because we only have one file // per column and can't modify it concurrently. return Pipe(std::make_shared( - max_block_size, Nested::collect(all_columns), + max_block_size, Nested::convertToSubcolumns(all_columns), *this, context.getSettingsRef().max_read_buffer_size)); }