From 9fa955403bd6acebf63ed6b086dff95587eb3d12 Mon Sep 17 00:00:00 2001 From: Amos Bird Date: Wed, 17 Jul 2019 01:13:12 +0800 Subject: [PATCH 1/4] Optimize count() Choose the smallest column to count if possible. --- dbms/src/Columns/IColumn.h | 14 ++++++++++ dbms/src/Interpreters/ExpressionAnalyzer.cpp | 19 ++++++++++++- dbms/src/Storages/IStorage.h | 2 ++ .../MergeTree/MergeTreeBlockReadUtils.cpp | 2 +- dbms/src/Storages/MergeTree/MergeTreeData.cpp | 8 +++--- dbms/src/Storages/MergeTree/MergeTreeData.h | 3 +-- .../Storages/MergeTree/MergeTreeDataPart.cpp | 6 ++--- .../Storages/MergeTree/MergeTreeDataPart.h | 14 ---------- .../Storages/System/StorageSystemColumns.cpp | 6 +---- .../Storages/System/StorageSystemParts.cpp | 2 +- .../System/StorageSystemPartsColumns.cpp | 2 +- dbms/tests/performance/count.xml | 27 +++++++++++++++++++ 12 files changed, 73 insertions(+), 32 deletions(-) create mode 100644 dbms/tests/performance/count.xml diff --git a/dbms/src/Columns/IColumn.h b/dbms/src/Columns/IColumn.h index 3955ed4b528..17c62fbd2d8 100644 --- a/dbms/src/Columns/IColumn.h +++ b/dbms/src/Columns/IColumn.h @@ -24,6 +24,20 @@ namespace ErrorCodes class Arena; class ColumnGathererStream; +struct ColumnSize +{ + size_t marks = 0; + size_t data_compressed = 0; + size_t data_uncompressed = 0; + + void add(const ColumnSize & other) + { + marks += other.marks; + data_compressed += other.data_compressed; + data_uncompressed += other.data_uncompressed; + } +}; + /// Declares interface to store columns in memory. class IColumn : public COW { diff --git a/dbms/src/Interpreters/ExpressionAnalyzer.cpp b/dbms/src/Interpreters/ExpressionAnalyzer.cpp index 20b4beb0789..380060eba84 100644 --- a/dbms/src/Interpreters/ExpressionAnalyzer.cpp +++ b/dbms/src/Interpreters/ExpressionAnalyzer.cpp @@ -1040,7 +1040,24 @@ void ExpressionAnalyzer::collectUsedColumns() /// You need to read at least one column to find the number of rows. if (select_query && required.empty()) - required.insert(ExpressionActions::getSmallestColumn(source_columns)); + { + size_t min_data_compressed = 0; + String min_column_name; + if (storage) + { + auto column_sizes = storage->getColumnSizes(); + for (auto & [column_name, column_size] : column_sizes) + if (min_data_compressed == 0 || min_data_compressed > column_size.data_compressed) + { + min_data_compressed = column_size.data_compressed; + min_column_name = column_name; + } + } + if (min_data_compressed > 0) + required.insert(min_column_name); + else + required.insert(ExpressionActions::getSmallestColumn(source_columns)); + } NameSet unknown_required_source_columns = required; diff --git a/dbms/src/Storages/IStorage.h b/dbms/src/Storages/IStorage.h index 9e66ee5088e..7c45401e338 100644 --- a/dbms/src/Storages/IStorage.h +++ b/dbms/src/Storages/IStorage.h @@ -82,6 +82,8 @@ public: /// Returns true if the storage supports deduplication of inserted data blocks. virtual bool supportsDeduplication() const { return false; } + using ColumnSizeByName = std::unordered_map; + virtual ColumnSizeByName getColumnSizes() const { return {}; } public: /// thread-unsafe part. lockStructure must be acquired const ColumnsDescription & getColumns() const; /// returns combined set of columns diff --git a/dbms/src/Storages/MergeTree/MergeTreeBlockReadUtils.cpp b/dbms/src/Storages/MergeTree/MergeTreeBlockReadUtils.cpp index 96e1ba1429f..043adc7a259 100644 --- a/dbms/src/Storages/MergeTree/MergeTreeBlockReadUtils.cpp +++ b/dbms/src/Storages/MergeTree/MergeTreeBlockReadUtils.cpp @@ -119,7 +119,7 @@ void MergeTreeBlockSizePredictor::initialize(const Block & sample_block, const N ColumnInfo info; info.name = column_name; /// If column isn't fixed and doesn't have checksum, than take first - MergeTreeDataPart::ColumnSize column_size = data_part->getColumnSize( + ColumnSize column_size = data_part->getColumnSize( column_name, *column_with_type_and_name.type); info.bytes_per_row_global = column_size.data_uncompressed diff --git a/dbms/src/Storages/MergeTree/MergeTreeData.cpp b/dbms/src/Storages/MergeTree/MergeTreeData.cpp index d1e9e470e57..b32470f9f77 100644 --- a/dbms/src/Storages/MergeTree/MergeTreeData.cpp +++ b/dbms/src/Storages/MergeTree/MergeTreeData.cpp @@ -2378,8 +2378,8 @@ void MergeTreeData::addPartContributionToColumnSizes(const DataPartPtr & part) for (const auto & column : part->columns) { - DataPart::ColumnSize & total_column_size = column_sizes[column.name]; - DataPart::ColumnSize part_column_size = part->getColumnSize(column.name, *column.type); + ColumnSize & total_column_size = column_sizes[column.name]; + ColumnSize part_column_size = part->getColumnSize(column.name, *column.type); total_column_size.add(part_column_size); } } @@ -2390,8 +2390,8 @@ void MergeTreeData::removePartContributionToColumnSizes(const DataPartPtr & part for (const auto & column : part->columns) { - DataPart::ColumnSize & total_column_size = column_sizes[column.name]; - DataPart::ColumnSize part_column_size = part->getColumnSize(column.name, *column.type); + ColumnSize & total_column_size = column_sizes[column.name]; + ColumnSize part_column_size = part->getColumnSize(column.name, *column.type); auto log_subtract = [&](size_t & from, size_t value, const char * field) { diff --git a/dbms/src/Storages/MergeTree/MergeTreeData.h b/dbms/src/Storages/MergeTree/MergeTreeData.h index 2ebddb886f6..29962382749 100644 --- a/dbms/src/Storages/MergeTree/MergeTreeData.h +++ b/dbms/src/Storages/MergeTree/MergeTreeData.h @@ -547,8 +547,7 @@ public: return it == std::end(column_sizes) ? 0 : it->second.data_compressed; } - using ColumnSizeByName = std::unordered_map; - ColumnSizeByName getColumnSizes() const + ColumnSizeByName getColumnSizes() const override { auto lock = lockParts(); return column_sizes; diff --git a/dbms/src/Storages/MergeTree/MergeTreeDataPart.cpp b/dbms/src/Storages/MergeTree/MergeTreeDataPart.cpp index fc4c5fef130..7b8be970e1d 100644 --- a/dbms/src/Storages/MergeTree/MergeTreeDataPart.cpp +++ b/dbms/src/Storages/MergeTree/MergeTreeDataPart.cpp @@ -153,7 +153,7 @@ MergeTreeDataPart::MergeTreeDataPart(const MergeTreeData & storage_, const Strin /// Takes into account the fact that several columns can e.g. share their .size substreams. /// When calculating totals these should be counted only once. -MergeTreeDataPart::ColumnSize MergeTreeDataPart::getColumnSizeImpl( +ColumnSize MergeTreeDataPart::getColumnSizeImpl( const String & column_name, const IDataType & type, std::unordered_set * processed_substreams) const { ColumnSize size; @@ -182,12 +182,12 @@ MergeTreeDataPart::ColumnSize MergeTreeDataPart::getColumnSizeImpl( return size; } -MergeTreeDataPart::ColumnSize MergeTreeDataPart::getColumnSize(const String & column_name, const IDataType & type) const +ColumnSize MergeTreeDataPart::getColumnSize(const String & column_name, const IDataType & type) const { return getColumnSizeImpl(column_name, type, nullptr); } -MergeTreeDataPart::ColumnSize MergeTreeDataPart::getTotalColumnsSize() const +ColumnSize MergeTreeDataPart::getTotalColumnsSize() const { ColumnSize totals; std::unordered_set processed_substreams; diff --git a/dbms/src/Storages/MergeTree/MergeTreeDataPart.h b/dbms/src/Storages/MergeTree/MergeTreeDataPart.h index f775e5bc085..053e9428861 100644 --- a/dbms/src/Storages/MergeTree/MergeTreeDataPart.h +++ b/dbms/src/Storages/MergeTree/MergeTreeDataPart.h @@ -39,20 +39,6 @@ struct MergeTreeDataPart /// If no checksums are present returns the name of the first physically existing column. String getColumnNameWithMinumumCompressedSize() const; - struct ColumnSize - { - size_t marks = 0; - size_t data_compressed = 0; - size_t data_uncompressed = 0; - - void add(const ColumnSize & other) - { - marks += other.marks; - data_compressed += other.data_compressed; - data_uncompressed += other.data_uncompressed; - } - }; - /// NOTE: Returns zeros if column files are not found in checksums. /// NOTE: You must ensure that no ALTERs are in progress when calculating ColumnSizes. /// (either by locking columns_lock, or by locking table structure). diff --git a/dbms/src/Storages/System/StorageSystemColumns.cpp b/dbms/src/Storages/System/StorageSystemColumns.cpp index 26749775d23..3ba1128c245 100644 --- a/dbms/src/Storages/System/StorageSystemColumns.cpp +++ b/dbms/src/Storages/System/StorageSystemColumns.cpp @@ -121,11 +121,7 @@ protected: cols_required_for_primary_key = storage->getColumnsRequiredForPrimaryKey(); cols_required_for_sampling = storage->getColumnsRequiredForSampling(); - /** Info about sizes of columns for tables of MergeTree family. - * NOTE: It is possible to add getter for this info to IStorage interface. - */ - if (auto storage_concrete = dynamic_cast(storage.get())) - column_sizes = storage_concrete->getColumnSizes(); + column_sizes = storage->getColumnSizes(); } for (const auto & column : columns) diff --git a/dbms/src/Storages/System/StorageSystemParts.cpp b/dbms/src/Storages/System/StorageSystemParts.cpp index ebcf36abfb7..f8fffd2d9c9 100644 --- a/dbms/src/Storages/System/StorageSystemParts.cpp +++ b/dbms/src/Storages/System/StorageSystemParts.cpp @@ -68,7 +68,7 @@ void StorageSystemParts::processNextStorage(MutableColumns & columns, const Stor const auto & part = all_parts[part_number]; auto part_state = all_parts_state[part_number]; - MergeTreeDataPart::ColumnSize columns_size = part->getTotalColumnsSize(); + ColumnSize columns_size = part->getTotalColumnsSize(); size_t i = 0; { diff --git a/dbms/src/Storages/System/StorageSystemPartsColumns.cpp b/dbms/src/Storages/System/StorageSystemPartsColumns.cpp index 6b8cb6ae836..ab688b514e7 100644 --- a/dbms/src/Storages/System/StorageSystemPartsColumns.cpp +++ b/dbms/src/Storages/System/StorageSystemPartsColumns.cpp @@ -151,7 +151,7 @@ void StorageSystemPartsColumns::processNextStorage(MutableColumns & columns, con columns[j++]->insertDefault(); } - MergeTreeDataPart::ColumnSize column_size = part->getColumnSize(column.name, *column.type); + ColumnSize column_size = part->getColumnSize(column.name, *column.type); columns[j++]->insert(column_size.data_compressed + column_size.marks); columns[j++]->insert(column_size.data_compressed); columns[j++]->insert(column_size.data_uncompressed); diff --git a/dbms/tests/performance/count.xml b/dbms/tests/performance/count.xml new file mode 100644 index 00000000000..9ce3c0a98e4 --- /dev/null +++ b/dbms/tests/performance/count.xml @@ -0,0 +1,27 @@ + + count + + loop + + + + 30000 + + + 6000 + 60000 + + + + + + + + CREATE TABLE data(k UInt64, v UInt64) ENGINE = MergeTree ORDER BY k + + INSERT INTO data SELECT number, 1 from numbers(10000000) + + SELECT count() FROM data + + DROP TABLE IF EXISTS data + From d0c088b9cc0acaff09e06f982bcc6472cf6b0e9f Mon Sep 17 00:00:00 2001 From: alexey-milovidov Date: Tue, 16 Jul 2019 22:16:25 +0300 Subject: [PATCH 2/4] Update ExpressionAnalyzer.cpp --- dbms/src/Interpreters/ExpressionAnalyzer.cpp | 2 ++ 1 file changed, 2 insertions(+) diff --git a/dbms/src/Interpreters/ExpressionAnalyzer.cpp b/dbms/src/Interpreters/ExpressionAnalyzer.cpp index 380060eba84..362bf404a71 100644 --- a/dbms/src/Interpreters/ExpressionAnalyzer.cpp +++ b/dbms/src/Interpreters/ExpressionAnalyzer.cpp @@ -1047,11 +1047,13 @@ void ExpressionAnalyzer::collectUsedColumns() { auto column_sizes = storage->getColumnSizes(); for (auto & [column_name, column_size] : column_sizes) + { if (min_data_compressed == 0 || min_data_compressed > column_size.data_compressed) { min_data_compressed = column_size.data_compressed; min_column_name = column_name; } + } } if (min_data_compressed > 0) required.insert(min_column_name); From 503556ae538f84587de95e6b0e20de649ff6c155 Mon Sep 17 00:00:00 2001 From: alexey-milovidov Date: Tue, 16 Jul 2019 22:22:05 +0300 Subject: [PATCH 3/4] Update ExpressionAnalyzer.cpp --- dbms/src/Interpreters/ExpressionAnalyzer.cpp | 2 ++ 1 file changed, 2 insertions(+) diff --git a/dbms/src/Interpreters/ExpressionAnalyzer.cpp b/dbms/src/Interpreters/ExpressionAnalyzer.cpp index 362bf404a71..e8143f3c41a 100644 --- a/dbms/src/Interpreters/ExpressionAnalyzer.cpp +++ b/dbms/src/Interpreters/ExpressionAnalyzer.cpp @@ -1041,6 +1041,7 @@ void ExpressionAnalyzer::collectUsedColumns() /// You need to read at least one column to find the number of rows. if (select_query && required.empty()) { + /// We will find a column with minimum compressed size. Because it is the column that is cheapest to read. size_t min_data_compressed = 0; String min_column_name; if (storage) @@ -1058,6 +1059,7 @@ void ExpressionAnalyzer::collectUsedColumns() if (min_data_compressed > 0) required.insert(min_column_name); else + /// If we have no information about columns sizes, choose a column of minimum size of its data type. required.insert(ExpressionActions::getSmallestColumn(source_columns)); } From 71233e11a9d6daf10cbcc1aa1638b152b647d816 Mon Sep 17 00:00:00 2001 From: Amos Bird Date: Wed, 17 Jul 2019 09:24:37 +0800 Subject: [PATCH 4/4] minor updates. --- dbms/src/Columns/IColumn.h | 14 -------------- dbms/src/Storages/IStorage.h | 15 +++++++++++++++ dbms/src/Storages/MergeTree/MergeTreeDataPart.h | 1 + 3 files changed, 16 insertions(+), 14 deletions(-) diff --git a/dbms/src/Columns/IColumn.h b/dbms/src/Columns/IColumn.h index 17c62fbd2d8..3955ed4b528 100644 --- a/dbms/src/Columns/IColumn.h +++ b/dbms/src/Columns/IColumn.h @@ -24,20 +24,6 @@ namespace ErrorCodes class Arena; class ColumnGathererStream; -struct ColumnSize -{ - size_t marks = 0; - size_t data_compressed = 0; - size_t data_uncompressed = 0; - - void add(const ColumnSize & other) - { - marks += other.marks; - data_compressed += other.data_compressed; - data_uncompressed += other.data_uncompressed; - } -}; - /// Declares interface to store columns in memory. class IColumn : public COW { diff --git a/dbms/src/Storages/IStorage.h b/dbms/src/Storages/IStorage.h index 7c45401e338..477c4456b17 100644 --- a/dbms/src/Storages/IStorage.h +++ b/dbms/src/Storages/IStorage.h @@ -38,6 +38,19 @@ class AlterCommands; class MutationCommands; class PartitionCommands; +struct ColumnSize +{ + size_t marks = 0; + size_t data_compressed = 0; + size_t data_uncompressed = 0; + + void add(const ColumnSize & other) + { + marks += other.marks; + data_compressed += other.data_compressed; + data_uncompressed += other.data_uncompressed; + } +}; /** Storage. Describes the table. Responsible for * - storage of the table data; @@ -82,6 +95,8 @@ public: /// Returns true if the storage supports deduplication of inserted data blocks. virtual bool supportsDeduplication() const { return false; } + /// Optional size information of each physical column. + /// Currently it's only used by the MergeTree family for query optimizations. using ColumnSizeByName = std::unordered_map; virtual ColumnSizeByName getColumnSizes() const { return {}; } diff --git a/dbms/src/Storages/MergeTree/MergeTreeDataPart.h b/dbms/src/Storages/MergeTree/MergeTreeDataPart.h index 053e9428861..f41ea8af424 100644 --- a/dbms/src/Storages/MergeTree/MergeTreeDataPart.h +++ b/dbms/src/Storages/MergeTree/MergeTreeDataPart.h @@ -22,6 +22,7 @@ namespace DB { +struct ColumnSize; class MergeTreeData;