From 0c210a831cd6d4c705de12fea52abff24b2cd5f7 Mon Sep 17 00:00:00 2001 From: Anton Popov Date: Mon, 21 Mar 2022 21:12:26 +0000 Subject: [PATCH] add sizes of subcolumns to system.parts_columns table --- .../System/StorageSystemPartsColumns.cpp | 53 +++++++++++++++++-- .../02242_subcolumns_sizes.reference | 8 +++ .../0_stateless/02242_subcolumns_sizes.sql | 32 +++++++++++ 3 files changed, 88 insertions(+), 5 deletions(-) create mode 100644 tests/queries/0_stateless/02242_subcolumns_sizes.reference create mode 100644 tests/queries/0_stateless/02242_subcolumns_sizes.sql diff --git a/src/Storages/System/StorageSystemPartsColumns.cpp b/src/Storages/System/StorageSystemPartsColumns.cpp index f5e9b82c136..a9341abb9cd 100644 --- a/src/Storages/System/StorageSystemPartsColumns.cpp +++ b/src/Storages/System/StorageSystemPartsColumns.cpp @@ -7,6 +7,7 @@ #include #include #include +#include #include #include #include @@ -64,7 +65,11 @@ StorageSystemPartsColumns::StorageSystemPartsColumns(const StorageID & table_id_ {"serialization_kind", std::make_shared()}, {"subcolumns.names", std::make_shared(std::make_shared())}, {"subcolumns.types", std::make_shared(std::make_shared())}, - {"subcolumns.serializations", std::make_shared(std::make_shared())} + {"subcolumns.serializations", std::make_shared(std::make_shared())}, + {"subcolumns.bytes_on_disk", std::make_shared(std::make_shared())}, + {"subcolumns.data_compressed_bytes", std::make_shared(std::make_shared())}, + {"subcolumns.data_uncompressed_bytes", std::make_shared(std::make_shared())}, + {"subcolumns.marks_bytes", std::make_shared(std::make_shared())}, } ) { @@ -228,13 +233,43 @@ void StorageSystemPartsColumns::processNextStorage( Array subcolumn_names; Array subcolumn_types; - Array subcolumn_sers; + Array subcolumn_serializations; + Array subcolumn_bytes_on_disk; + Array subcolumn_data_compressed_bytes; + Array subcolumn_data_uncompressed_bytes; + Array subcolumn_marks_bytes; - IDataType::forEachSubcolumn([&](const auto &, 
const auto & name, const auto & data) + IDataType::forEachSubcolumn([&](const auto & subpath, const auto & name, const auto & data) { + /// We count only final subcolumns, which are represented by files on disk + /// and skip intermediate subcolumns of types Tuple and Nested. + if (isTuple(data.type) || isNested(data.type)) + return; + subcolumn_names.push_back(name); subcolumn_types.push_back(data.type->getName()); - subcolumn_sers.push_back(ISerialization::kindToString(data.serialization->getKind())); + subcolumn_serializations.push_back(ISerialization::kindToString(data.serialization->getKind())); + + ColumnSize size; + NameAndTypePair subcolumn(column.name, name, column.type, data.type); + String file_name = ISerialization::getFileNameForStream(subcolumn, subpath); + + auto bin_checksum = part->checksums.files.find(file_name + ".bin"); + if (bin_checksum != part->checksums.files.end()) + { + size.data_compressed += bin_checksum->second.file_size; + size.data_uncompressed += bin_checksum->second.uncompressed_size; + } + + auto mrk_checksum = part->checksums.files.find(file_name + part->index_granularity_info.marks_file_extension); + if (mrk_checksum != part->checksums.files.end()) + size.marks += mrk_checksum->second.file_size; + + subcolumn_bytes_on_disk.push_back(size.data_compressed + size.marks); + subcolumn_data_compressed_bytes.push_back(size.data_compressed); + subcolumn_data_uncompressed_bytes.push_back(size.data_uncompressed); + subcolumn_marks_bytes.push_back(size.marks); + }, { serialization, column.type, nullptr, nullptr }); if (columns_mask[src_index++]) @@ -242,7 +277,15 @@ void StorageSystemPartsColumns::processNextStorage( if (columns_mask[src_index++]) columns[res_index++]->insert(subcolumn_types); if (columns_mask[src_index++]) - columns[res_index++]->insert(subcolumn_sers); + columns[res_index++]->insert(subcolumn_serializations); + if (columns_mask[src_index++]) + columns[res_index++]->insert(subcolumn_bytes_on_disk); + if 
(columns_mask[src_index++]) + columns[res_index++]->insert(subcolumn_data_compressed_bytes); + if (columns_mask[src_index++]) + columns[res_index++]->insert(subcolumn_data_uncompressed_bytes); + if (columns_mask[src_index++]) + columns[res_index++]->insert(subcolumn_marks_bytes); if (has_state_column) columns[res_index++]->insert(part->stateString()); diff --git a/tests/queries/0_stateless/02242_subcolumns_sizes.reference b/tests/queries/0_stateless/02242_subcolumns_sizes.reference new file mode 100644 index 00000000000..124b6341116 --- /dev/null +++ b/tests/queries/0_stateless/02242_subcolumns_sizes.reference @@ -0,0 +1,8 @@ +arr size0 UInt64 1 +d k1 String 1 +d k2.k3 Array(String) 1 +d k2.k4 Array(String) 1 +d k2.k5 Array(Int8) 1 +d k2.size0 UInt64 1 +n null UInt8 1 +1 1 1 1 diff --git a/tests/queries/0_stateless/02242_subcolumns_sizes.sql b/tests/queries/0_stateless/02242_subcolumns_sizes.sql new file mode 100644 index 00000000000..f6277e96468 --- /dev/null +++ b/tests/queries/0_stateless/02242_subcolumns_sizes.sql @@ -0,0 +1,32 @@ +DROP TABLE IF EXISTS t_subcolumns_sizes; + +SET allow_experimental_object_type = 1; + +CREATE TABLE t_subcolumns_sizes (id UInt64, arr Array(UInt64), n Nullable(String), d JSON) +ENGINE = MergeTree ORDER BY id +SETTINGS min_bytes_for_wide_part = 0; + +INSERT INTO t_subcolumns_sizes FORMAT JSONEachRow {"id": 1, "arr": [1, 2, 3], "n": null, "d": {"k1": "v1", "k2": [{"k3": 1, "k4": "v2"}, {"k3": 3}]}} +INSERT INTO t_subcolumns_sizes FORMAT JSONEachRow {"id": 2, "arr": [0], "n": "foo", "d": {"k1": "v3", "k2": [{"k4": "v4"}, {"k3": "v5", "k5": 5}]}} + +OPTIMIZE TABLE t_subcolumns_sizes FINAL; + +SELECT + column, + subcolumns.names AS sname, + subcolumns.types AS stype, + subcolumns.bytes_on_disk > 0 +FROM system.parts_columns ARRAY JOIN subcolumns +WHERE database = currentDatabase() AND table = 't_subcolumns_sizes' AND active +ORDER BY column, sname, stype; + +SELECT + any(column_bytes_on_disk) = sum(subcolumns.bytes_on_disk), + 
any(column_data_compressed_bytes) = sum(subcolumns.data_compressed_bytes), + any(column_data_uncompressed_bytes) = sum(subcolumns.data_uncompressed_bytes), + any(column_marks_bytes) = sum(subcolumns.marks_bytes) +FROM system.parts_columns ARRAY JOIN subcolumns +WHERE database = currentDatabase() AND table = 't_subcolumns_sizes' +AND active AND column = 'd'; + +DROP TABLE IF EXISTS t_subcolumns_sizes;