add sizes of subcolumns to system.parts_columns table

This commit is contained in:
Anton Popov 2022-03-21 21:12:26 +00:00
parent 8a04ed72af
commit 0c210a831c
3 changed files with 88 additions and 5 deletions

View File

@ -7,6 +7,7 @@
#include <DataTypes/DataTypeDateTime.h>
#include <DataTypes/DataTypeDate.h>
#include <DataTypes/DataTypeArray.h>
#include <DataTypes/DataTypeNested.h>
#include <DataTypes/NestedUtils.h>
#include <DataTypes/DataTypeUUID.h>
#include <Storages/VirtualColumnUtils.h>
@ -64,7 +65,11 @@ StorageSystemPartsColumns::StorageSystemPartsColumns(const StorageID & table_id_
{"serialization_kind", std::make_shared<DataTypeString>()},
{"subcolumns.names", std::make_shared<DataTypeArray>(std::make_shared<DataTypeString>())},
{"subcolumns.types", std::make_shared<DataTypeArray>(std::make_shared<DataTypeString>())},
{"subcolumns.serializations", std::make_shared<DataTypeArray>(std::make_shared<DataTypeString>())}
{"subcolumns.serializations", std::make_shared<DataTypeArray>(std::make_shared<DataTypeString>())},
{"subcolumns.bytes_on_disk", std::make_shared<DataTypeArray>(std::make_shared<DataTypeUInt64>())},
{"subcolumns.data_compressed_bytes", std::make_shared<DataTypeArray>(std::make_shared<DataTypeUInt64>())},
{"subcolumns.data_uncompressed_bytes", std::make_shared<DataTypeArray>(std::make_shared<DataTypeUInt64>())},
{"subcolumns.marks_bytes", std::make_shared<DataTypeArray>(std::make_shared<DataTypeUInt64>())},
}
)
{
@ -228,13 +233,43 @@ void StorageSystemPartsColumns::processNextStorage(
Array subcolumn_names;
Array subcolumn_types;
Array subcolumn_sers;
Array subcolumn_serializations;
Array subcolumn_bytes_on_disk;
Array subcolumn_data_compressed_bytes;
Array subcolumn_data_uncompressed_bytes;
Array subcolumn_marks_bytes;
IDataType::forEachSubcolumn([&](const auto &, const auto & name, const auto & data)
IDataType::forEachSubcolumn([&](const auto & subpath, const auto & name, const auto & data)
{
/// We count only final subcolumns, which are represented by files on disk
/// and skip intermediate subcolumns of types Tuple and Nested.
if (isTuple(data.type) || isNested(data.type))
return;
subcolumn_names.push_back(name);
subcolumn_types.push_back(data.type->getName());
subcolumn_sers.push_back(ISerialization::kindToString(data.serialization->getKind()));
subcolumn_serializations.push_back(ISerialization::kindToString(data.serialization->getKind()));
ColumnSize size;
NameAndTypePair subcolumn(column.name, name, column.type, data.type);
String file_name = ISerialization::getFileNameForStream(subcolumn, subpath);
auto bin_checksum = part->checksums.files.find(file_name + ".bin");
if (bin_checksum != part->checksums.files.end())
{
size.data_compressed += bin_checksum->second.file_size;
size.data_uncompressed += bin_checksum->second.uncompressed_size;
}
auto mrk_checksum = part->checksums.files.find(file_name + part->index_granularity_info.marks_file_extension);
if (mrk_checksum != part->checksums.files.end())
size.marks += mrk_checksum->second.file_size;
subcolumn_bytes_on_disk.push_back(size.data_compressed + size.marks);
subcolumn_data_compressed_bytes.push_back(size.data_compressed);
subcolumn_data_uncompressed_bytes.push_back(size.data_uncompressed);
subcolumn_marks_bytes.push_back(size.marks);
}, { serialization, column.type, nullptr, nullptr });
if (columns_mask[src_index++])
@ -242,7 +277,15 @@ void StorageSystemPartsColumns::processNextStorage(
if (columns_mask[src_index++])
columns[res_index++]->insert(subcolumn_types);
if (columns_mask[src_index++])
columns[res_index++]->insert(subcolumn_sers);
columns[res_index++]->insert(subcolumn_serializations);
if (columns_mask[src_index++])
columns[res_index++]->insert(subcolumn_bytes_on_disk);
if (columns_mask[src_index++])
columns[res_index++]->insert(subcolumn_data_compressed_bytes);
if (columns_mask[src_index++])
columns[res_index++]->insert(subcolumn_data_uncompressed_bytes);
if (columns_mask[src_index++])
columns[res_index++]->insert(subcolumn_marks_bytes);
if (has_state_column)
columns[res_index++]->insert(part->stateString());

View File

@ -0,0 +1,8 @@
arr size0 UInt64 1
d k1 String 1
d k2.k3 Array(String) 1
d k2.k4 Array(String) 1
d k2.k5 Array(Int8) 1
d k2.size0 UInt64 1
n null UInt8 1
1 1 1 1

View File

@ -0,0 +1,32 @@
-- Test for the per-subcolumn size arrays of system.parts_columns
-- (subcolumns.bytes_on_disk, subcolumns.data_compressed_bytes,
-- subcolumns.data_uncompressed_bytes, subcolumns.marks_bytes).
-- Start from a clean state in case a previous run left the table behind.
DROP TABLE IF EXISTS t_subcolumns_sizes;
-- Enabled before creating the table, which declares a JSON column `d`.
SET allow_experimental_object_type = 1;
-- One column per kind of subcolumn the test looks at: an Array (`arr`),
-- a Nullable (`n`) and a JSON object (`d`).
-- NOTE(review): min_bytes_for_wide_part = 0 presumably forces parts to be written
-- with a separate file per stream, so that each subcolumn has its own size - confirm.
CREATE TABLE t_subcolumns_sizes (id UInt64, arr Array(UInt64), n Nullable(String), d JSON)
ENGINE = MergeTree ORDER BY id
SETTINGS min_bytes_for_wide_part = 0;
-- Two separate inserts with different shapes of `d`: `k2.k3` is a number in the first
-- row and a string in the second, `k2.k5` appears only in the second row.
-- The inline JSONEachRow data must stay on the same line as its INSERT.
INSERT INTO t_subcolumns_sizes FORMAT JSONEachRow {"id": 1, "arr": [1, 2, 3], "n": null, "d": {"k1": "v1", "k2": [{"k3": 1, "k4": "v2"}, {"k3": 3}]}}
INSERT INTO t_subcolumns_sizes FORMAT JSONEachRow {"id": 2, "arr": [0], "n": "foo", "d": {"k1": "v3", "k2": [{"k4": "v4"}, {"k3": "v5", "k5": 5}]}}
-- NOTE(review): presumably merges the parts created by the two inserts into a single
-- active part, so the queries below see one row per subcolumn - confirm.
OPTIMIZE TABLE t_subcolumns_sizes FINAL;
-- Check 1: for every reported subcolumn of the active parts of this table, print the
-- parent column, the subcolumn name and type, and whether its size on disk is non-zero.
-- ARRAY JOIN unfolds the parallel `subcolumns.*` arrays into one row per subcolumn;
-- the ORDER BY keeps the output stable for comparison with the reference file.
SELECT
column,
subcolumns.names AS sname,
subcolumns.types AS stype,
subcolumns.bytes_on_disk > 0
FROM system.parts_columns ARRAY JOIN subcolumns
WHERE database = currentDatabase() AND table = 't_subcolumns_sizes' AND active
ORDER BY column, sname, stype;
-- Check 2: for the JSON column `d`, each column-level size must equal the sum of the
-- corresponding sizes of its subcolumns. After ARRAY JOIN the column-level value is
-- repeated on every row, hence any(); the expected output is four 1s.
SELECT
any(column_bytes_on_disk) = sum(subcolumns.bytes_on_disk),
any(column_data_compressed_bytes) = sum(subcolumns.data_compressed_bytes),
any(column_data_uncompressed_bytes) = sum(subcolumns.data_uncompressed_bytes),
any(column_marks_bytes) = sum(subcolumns.marks_bytes)
FROM system.parts_columns ARRAY JOIN subcolumns
WHERE database = currentDatabase() AND table = 't_subcolumns_sizes'
AND active AND column = 'd';
-- Clean up so the test leaves nothing behind.
DROP TABLE IF EXISTS t_subcolumns_sizes;