From 0dbbca5b3e6ff8959604bfb3c69b097261c147b1 Mon Sep 17 00:00:00 2001 From: Anton Popov Date: Wed, 11 May 2022 19:33:45 +0000 Subject: [PATCH] fix filling of missed Nested columns with multiple levels --- .../Serializations/SerializationArray.cpp | 38 +++++----- .../Serializations/SerializationArray.h | 1 + src/Interpreters/inplaceBlockConversions.cpp | 73 ++++++++++++++----- src/Interpreters/inplaceBlockConversions.h | 1 + src/Storages/MergeTree/IMergeTreeReader.cpp | 6 +- src/Storages/StorageMemory.cpp | 2 +- .../0_stateless/01825_type_json_17.reference | 7 ++ .../0_stateless/01825_type_json_17.sql | 18 +++++ 8 files changed, 108 insertions(+), 38 deletions(-) create mode 100644 tests/queries/0_stateless/01825_type_json_17.reference create mode 100644 tests/queries/0_stateless/01825_type_json_17.sql diff --git a/src/DataTypes/Serializations/SerializationArray.cpp b/src/DataTypes/Serializations/SerializationArray.cpp index 30ee5e98b74..aebfb1b27b2 100644 --- a/src/DataTypes/Serializations/SerializationArray.cpp +++ b/src/DataTypes/Serializations/SerializationArray.cpp @@ -132,29 +132,29 @@ namespace offset_values.resize(i); } +} - ColumnPtr arraySizesToOffsets(const IColumn & column) - { - const auto & column_sizes = assert_cast(column); - MutableColumnPtr column_offsets = column_sizes.cloneEmpty(); - - if (column_sizes.empty()) - return column_offsets; - - const auto & sizes_data = column_sizes.getData(); - auto & offsets_data = assert_cast(*column_offsets).getData(); - - offsets_data.resize(sizes_data.size()); - - IColumn::Offset prev_offset = 0; - for (size_t i = 0, size = sizes_data.size(); i < size; ++i) - { - prev_offset += sizes_data[i]; - offsets_data[i] = prev_offset; - } +ColumnPtr arraySizesToOffsets(const IColumn & column) +{ + const auto & column_sizes = assert_cast(column); + MutableColumnPtr column_offsets = column_sizes.cloneEmpty(); + if (column_sizes.empty()) return column_offsets; + + const auto & sizes_data = column_sizes.getData(); + auto & offsets_data = assert_cast(*column_offsets).getData(); + + offsets_data.resize(sizes_data.size()); + + IColumn::Offset prev_offset = 0; + for (size_t i = 0, size = sizes_data.size(); i < size; ++i) + { + prev_offset += sizes_data[i]; + offsets_data[i] = prev_offset; } + + return column_offsets; } ColumnPtr arrayOffsetsToSizes(const IColumn & column) diff --git a/src/DataTypes/Serializations/SerializationArray.h b/src/DataTypes/Serializations/SerializationArray.h index 3769f8a4513..9179988bf10 100644 --- a/src/DataTypes/Serializations/SerializationArray.h +++ b/src/DataTypes/Serializations/SerializationArray.h @@ -80,5 +80,6 @@ private: }; ColumnPtr arrayOffsetsToSizes(const IColumn & column); +ColumnPtr arraySizesToOffsets(const IColumn & column); } diff --git a/src/Interpreters/inplaceBlockConversions.cpp b/src/Interpreters/inplaceBlockConversions.cpp index 1bde6fe5a8c..bc40d64de16 100644 --- a/src/Interpreters/inplaceBlockConversions.cpp +++ b/src/Interpreters/inplaceBlockConversions.cpp @@ -12,6 +12,7 @@ #include #include #include +#include #include #include #include @@ -198,6 +199,9 @@ static bool arrayHasNoElementsRead(const IColumn & column) if (!size) return false; + if (const auto * nested_array = typeid_cast(&column_array->getData())) + return arrayHasNoElementsRead(*nested_array); + size_t data_size = column_array->getData().size(); if (data_size) return false; @@ -210,6 +214,7 @@ void fillMissingColumns( Columns & res_columns, size_t num_rows, const NamesAndTypesList & requested_columns, + const NamesAndTypesList & available_columns, StorageMetadataPtr metadata_snapshot) { size_t num_columns = requested_columns.size(); @@ -224,26 +229,35 @@ void fillMissingColumns( /// First, collect offset columns for all arrays in the block. std::unordered_map offset_columns; - auto requested_column = requested_columns.begin(); - for (size_t i = 0; i < num_columns; ++i, ++requested_column) + auto available_column = available_columns.begin(); + for (size_t i = 0; i < num_columns; ++i, ++available_column) { if (res_columns[i] == nullptr) continue; - if (const auto * array = typeid_cast(res_columns[i].get())) + auto serialization = IDataType::getSerialization(*available_column); + auto name_in_storage = Nested::extractTableName(available_column->name); + + ISerialization::SubstreamPath path; + serialization->enumerateStreams(path, [&](const auto & subpath) { - String offsets_name = Nested::extractTableName(requested_column->name); - auto & offsets_column = offset_columns[offsets_name]; + if (subpath.empty() || subpath.back().type != ISerialization::Substream::ArraySizes) + return; + + auto subname = ISerialization::getSubcolumnNameForStream(subpath); + auto & offsets_column = offset_columns[Nested::concatenateName(name_in_storage, subname)]; /// If for some reason multiple offsets columns are present for the same nested data structure, /// choose the one that is not empty. + /// TODO: more optimal if (!offsets_column || offsets_column->empty()) - offsets_column = array->getOffsetsPtr(); - } + offsets_column = arraySizesToOffsets(*subpath.back().data.column); + + }, {serialization, available_column->type, res_columns[i], nullptr}); } /// insert default values only for columns without default expressions - requested_column = requested_columns.begin(); + auto requested_column = requested_columns.begin(); for (size_t i = 0; i < num_columns; ++i, ++requested_column) { const auto & [name, type] = *requested_column; @@ -256,19 +270,44 @@ void fillMissingColumns( if (metadata_snapshot && metadata_snapshot->getColumns().hasDefault(name)) continue; - String offsets_name = Nested::extractTableName(name); - auto offset_it = offset_columns.find(offsets_name); + std::vector current_offsets; + bool has_all_offsets = true; + const auto * array_type = typeid_cast(type.get()); - if (offset_it != offset_columns.end() && array_type) + if (array_type) { - const auto & nested_type = array_type->getNestedType(); - ColumnPtr offsets_column = offset_it->second; - size_t nested_rows = typeid_cast(*offsets_column).getData().back(); + auto serialization = IDataType::getSerialization(*requested_column); + auto name_in_storage = Nested::extractTableName(requested_column->name); - ColumnPtr nested_column = - nested_type->createColumnConstWithDefaultValue(nested_rows)->convertToFullColumnIfConst(); + ISerialization::SubstreamPath path; + serialization->enumerateStreams(path, [&](const auto & subpath) + { + if (!has_all_offsets) + return; - res_columns[i] = ColumnArray::create(nested_column, offsets_column); + if (subpath.empty() || subpath.back().type != ISerialization::Substream::ArraySizes) + return; + + auto subname = ISerialization::getSubcolumnNameForStream(subpath); + auto it = offset_columns.find(Nested::concatenateName(name_in_storage, subname)); + if (it != offset_columns.end()) + current_offsets.emplace_back(it->second); + else + has_all_offsets = false; + + }, {serialization, type, nullptr, nullptr}); + } + + if (array_type && has_all_offsets) + { + assert(!current_offsets.empty()); + auto scalar_type = getBaseTypeOfArray(type); + + size_t data_size = assert_cast(*current_offsets.back()).getData().back(); + res_columns[i] = scalar_type->createColumnConstWithDefaultValue(data_size)->convertToFullColumnIfConst(); + + for (auto it = current_offsets.rbegin(); it != current_offsets.rend(); ++it) + res_columns[i] = ColumnArray::create(res_columns[i], *it); } else { diff --git a/src/Interpreters/inplaceBlockConversions.h b/src/Interpreters/inplaceBlockConversions.h index b3113ddfa5c..70187d5aace 100644 --- a/src/Interpreters/inplaceBlockConversions.h +++ b/src/Interpreters/inplaceBlockConversions.h @@ -43,6 +43,7 @@ void fillMissingColumns( Columns & res_columns, size_t num_rows, const NamesAndTypesList & requested_columns, + const NamesAndTypesList & available_columns, StorageMetadataPtr metadata_snapshot); } diff --git a/src/Storages/MergeTree/IMergeTreeReader.cpp b/src/Storages/MergeTree/IMergeTreeReader.cpp index 3a823345dda..4eff1653d1e 100644 --- a/src/Storages/MergeTree/IMergeTreeReader.cpp +++ b/src/Storages/MergeTree/IMergeTreeReader.cpp @@ -66,7 +66,11 @@ void IMergeTreeReader::fillMissingColumns(Columns & res_columns, bool & should_e { try { - DB::fillMissingColumns(res_columns, num_rows, columns, metadata_snapshot); + NamesAndTypesList available_columns; + for (const auto & column : columns) + available_columns.push_back(getColumnFromPart(column)); + + DB::fillMissingColumns(res_columns, num_rows, columns, available_columns, metadata_snapshot); should_evaluate_missing_defaults = std::any_of( res_columns.begin(), res_columns.end(), [](const auto & column) { return column == nullptr; }); } diff --git a/src/Storages/StorageMemory.cpp b/src/Storages/StorageMemory.cpp index e7911125383..20ca84452e7 100644 --- a/src/Storages/StorageMemory.cpp +++ b/src/Storages/StorageMemory.cpp @@ -91,7 +91,7 @@ protected: ++name_and_type; } - fillMissingColumns(columns, src.rows(), column_names_and_types, /*metadata_snapshot=*/ nullptr); + fillMissingColumns(columns, src.rows(), column_names_and_types, column_names_and_types, /*metadata_snapshot=*/ nullptr); assert(std::all_of(columns.begin(), columns.end(), [](const auto & column) { return column != nullptr; })); return Chunk(std::move(columns), src.rows()); diff --git a/tests/queries/0_stateless/01825_type_json_17.reference b/tests/queries/0_stateless/01825_type_json_17.reference new file mode 100644 index 00000000000..96e58224f32 --- /dev/null +++ b/tests/queries/0_stateless/01825_type_json_17.reference @@ -0,0 +1,7 @@ +Tuple(arr Nested(k1 Nested(k2 String, k3 String, k4 Int8), k5 Tuple(k6 String)), id Int8) +{"obj":{"arr":[{"k1":[{"k2":"aaa","k3":"bbb","k4":0},{"k2":"ccc","k3":"","k4":0}],"k5":{"k6":""}}],"id":1}} +{"obj":{"arr":[{"k1":[{"k2":"","k3":"ddd","k4":10},{"k2":"","k3":"","k4":20}],"k5":{"k6":"foo"}}],"id":2}} +[['bbb','']] [['aaa','ccc']] +[['ddd','']] [['','']] +[[0,0]] +[[10,20]] diff --git a/tests/queries/0_stateless/01825_type_json_17.sql b/tests/queries/0_stateless/01825_type_json_17.sql new file mode 100644 index 00000000000..b34357f8ef1 --- /dev/null +++ b/tests/queries/0_stateless/01825_type_json_17.sql @@ -0,0 +1,18 @@ +-- Tags: no-fasttest + +DROP TABLE IF EXISTS t_json_17; +SET allow_experimental_object_type = 1; +SET output_format_json_named_tuples_as_objects = 1; + +CREATE TABLE t_json_17(obj JSON) +ENGINE = MergeTree ORDER BY tuple(); + +INSERT INTO t_json_17 FORMAT JSONAsObject {"id": 1, "arr": [{"k1": [{"k2": "aaa", "k3": "bbb"}, {"k2": "ccc"}]}]} +INSERT INTO t_json_17 FORMAT JSONAsObject {"id": 2, "arr": [{"k1": [{"k3": "ddd", "k4": 10}, {"k4": 20}], "k5": {"k6": "foo"}}]} + +SELECT toTypeName(obj) FROM t_json_17 LIMIT 1; +SELECT obj FROM t_json_17 ORDER BY obj.id FORMAT JSONEachRow; +SELECT obj.arr.k1.k3, obj.arr.k1.k2 FROM t_json_17 ORDER BY obj.id; +SELECT obj.arr.k1.k4 FROM t_json_17 ORDER BY obj.id; + +DROP TABLE IF EXISTS t_json_17;