From 714420fc6713d8e1f1a6af29bd37ad932d86059f Mon Sep 17 00:00:00 2001 From: Alexey Milovidov Date: Sun, 19 May 2024 09:00:35 +0200 Subject: [PATCH 1/8] Speed up Set index a little --- src/Storages/MergeTree/MergeTreeIndexSet.cpp | 74 ++++++-------------- src/Storages/MergeTree/MergeTreeIndexSet.h | 1 - 2 files changed, 23 insertions(+), 52 deletions(-) diff --git a/src/Storages/MergeTree/MergeTreeIndexSet.cpp b/src/Storages/MergeTree/MergeTreeIndexSet.cpp index 1bd42518fdd..0b7e2e1f942 100644 --- a/src/Storages/MergeTree/MergeTreeIndexSet.cpp +++ b/src/Storages/MergeTree/MergeTreeIndexSet.cpp @@ -35,8 +35,7 @@ MergeTreeIndexGranuleSet::MergeTreeIndexGranuleSet( size_t max_rows_) : index_name(index_name_) , max_rows(max_rows_) - , index_sample_block(index_sample_block_) - , block(index_sample_block) + , block(index_sample_block_) { } @@ -47,8 +46,7 @@ MergeTreeIndexGranuleSet::MergeTreeIndexGranuleSet( MutableColumns && mutable_columns_) : index_name(index_name_) , max_rows(max_rows_) - , index_sample_block(index_sample_block_) - , block(index_sample_block.cloneWithColumns(std::move(mutable_columns_))) + , block(index_sample_block_.cloneWithColumns(std::move(mutable_columns_))) { } @@ -67,10 +65,11 @@ void MergeTreeIndexGranuleSet::serializeBinary(WriteBuffer & ostr) const } size_serialization->serializeBinary(size(), ostr, {}); + size_t num_columns = block.columns(); - for (size_t i = 0; i < index_sample_block.columns(); ++i) + for (size_t i = 0; i < num_columns; ++i) { - const auto & type = index_sample_block.getByPosition(i).type; + const auto & type = block.getByPosition(i).type; ISerialization::SerializeBinaryBulkSettings settings; settings.getter = [&ostr](ISerialization::SubstreamPath) -> WriteBuffer * { return &ostr; }; @@ -92,8 +91,6 @@ void MergeTreeIndexGranuleSet::deserializeBinary(ReadBuffer & istr, MergeTreeInd if (version != 1) throw Exception(ErrorCodes::LOGICAL_ERROR, "Unknown index version {}.", version); - block.clear(); - Field field_rows; const auto & size_type = DataTypePtr(std::make_shared()); size_type->getDefaultSerialization()->deserializeBinary(field_rows, istr, {}); @@ -102,24 +99,22 @@ void MergeTreeIndexGranuleSet::deserializeBinary(ReadBuffer & istr, MergeTreeInd if (rows_to_read == 0) return; - for (size_t i = 0; i < index_sample_block.columns(); ++i) + size_t num_columns = block.columns(); + + ISerialization::DeserializeBinaryBulkSettings settings; + settings.getter = [&](ISerialization::SubstreamPath) -> ReadBuffer * { return &istr; }; + settings.position_independent_encoding = false; + + for (size_t i = 0; i < num_columns; ++i) { - const auto & column = index_sample_block.getByPosition(i); - const auto & type = column.type; - ColumnPtr new_column = type->createColumn(); - - - ISerialization::DeserializeBinaryBulkSettings settings; - settings.getter = [&](ISerialization::SubstreamPath) -> ReadBuffer * { return &istr; }; - settings.position_independent_encoding = false; + auto & elem = block.getByPosition(i); + elem.column = elem.column->cloneEmpty(); ISerialization::DeserializeBinaryBulkStatePtr state; - auto serialization = type->getDefaultSerialization(); + auto serialization = elem.type->getDefaultSerialization(); serialization->deserializeBinaryBulkStatePrefix(settings, state); - serialization->deserializeBinaryBulkWithMultipleStreams(new_column, rows_to_read, settings, state, nullptr); - - block.insert(ColumnWithTypeAndName(new_column, type, column.name)); + serialization->deserializeBinaryBulkWithMultipleStreams(elem.column, rows_to_read, settings, state, nullptr); } } @@ -284,42 +279,19 @@ bool MergeTreeIndexConditionSet::mayBeTrueOnGranule(MergeTreeIndexGranulePtr idx if (isUseless()) return true; - auto granule = std::dynamic_pointer_cast(idx_granule); - if (!granule) - throw Exception(ErrorCodes::LOGICAL_ERROR, - "Set index condition got a granule with the wrong type"); + const MergeTreeIndexGranuleSet & granule = assert_cast(*idx_granule); - if (isUseless() || granule->empty() || (max_rows != 0 && granule->size() > max_rows)) + size_t size = granule.size(); + if (size == 0 || (max_rows != 0 && size > max_rows)) return true; - Block result = granule->block; + Block result = granule.block; actions->execute(result); - const auto & filter_node_name = actions->getActionsDAG().getOutputs().at(0)->result_name; - auto column = result.getByName(filter_node_name).column->convertToFullColumnIfConst()->convertToFullColumnIfLowCardinality(); + const auto & column = result.getByPosition(result.columns() - 1).column; - if (column->onlyNull()) - return false; - - const auto * col_uint8 = typeid_cast(column.get()); - - const NullMap * null_map = nullptr; - - if (const auto * col_nullable = checkAndGetColumn(&*column)) - { - col_uint8 = typeid_cast(&col_nullable->getNestedColumn()); - null_map = &col_nullable->getNullMapData(); - } - - if (!col_uint8) - throw Exception(ErrorCodes::LOGICAL_ERROR, - "ColumnUInt8 expected as Set index condition result"); - - const auto & condition = col_uint8->getData(); - size_t column_size = column->size(); - - for (size_t i = 0; i < column_size; ++i) - if ((!null_map || (*null_map)[i] == 0) && condition[i] & 1) + for (size_t i = 0; i < size; ++i) + if (column->getBool(i)) return true; return false; diff --git a/src/Storages/MergeTree/MergeTreeIndexSet.h b/src/Storages/MergeTree/MergeTreeIndexSet.h index 7c66ba1a867..3348b5fbe34 100644 --- a/src/Storages/MergeTree/MergeTreeIndexSet.h +++ b/src/Storages/MergeTree/MergeTreeIndexSet.h @@ -34,7 +34,6 @@ struct MergeTreeIndexGranuleSet final : public IMergeTreeIndexGranule const String index_name; const size_t max_rows; - const Block index_sample_block; Block block; }; From 500475f2b81e74276f6316e710ff7313244928e0 Mon Sep 17 00:00:00 2001 From: Alexey Milovidov Date: Sun, 19 May 2024 10:45:05 +0200 Subject: [PATCH 2/8] Add a test --- tests/performance/set_index_analysis.xml | 14 ++++++++++++++ 1 file changed, 14 insertions(+) create mode 100644 tests/performance/set_index_analysis.xml diff --git a/tests/performance/set_index_analysis.xml b/tests/performance/set_index_analysis.xml new file mode 100644 index 00000000000..64d0af6690b --- /dev/null +++ b/tests/performance/set_index_analysis.xml @@ -0,0 +1,14 @@ + + + CREATE TABLE test_set (k UInt32, x UInt32, INDEX idx (x) TYPE set(10) GRANULARITY 1) ENGINE = MergeTree ORDER BY k SETTINGS index_granularity = 111; + + SYSTEM STOP MERGES + INSERT INTO test_set SELECT number, number DIV 100 + rand() % 7 FROM numbers(3000000) SETTINGS max_insert_threads = 4; + + + SELECT count() FROM test_set WHERE x = 1234 SETTINGS max_threads = 8; + + + SYSTEM START MERGES + DROP TABLE IF EXISTS test_set + From 2a9795f4e39e6b8e2ef0aee3d2e97f396416662e Mon Sep 17 00:00:00 2001 From: Alexey Milovidov Date: Sun, 19 May 2024 10:45:19 +0200 Subject: [PATCH 3/8] Minor changes --- .../MergeTree/MergeTreeDataSelectExecutor.cpp | 11 +++++------ 1 file changed, 5 insertions(+), 6 deletions(-) diff --git a/src/Storages/MergeTree/MergeTreeDataSelectExecutor.cpp b/src/Storages/MergeTree/MergeTreeDataSelectExecutor.cpp index de769c59d33..949807bb88b 100644 --- a/src/Storages/MergeTree/MergeTreeDataSelectExecutor.cpp +++ b/src/Storages/MergeTree/MergeTreeDataSelectExecutor.cpp @@ -1296,8 +1296,7 @@ MarkRanges MergeTreeDataSelectExecutor::filterMarksUsingIndex( size_t last_index_mark = 0; PostingsCacheForStore cache_in_store; - - if (dynamic_cast(&*index_helper) != nullptr) + if (dynamic_cast(index_helper.get())) cache_in_store.store = GinIndexStoreFactory::instance().get(index_helper->getFileName(), part->getDataPartStoragePtr()); for (size_t i = 0; i < ranges.size(); ++i) @@ -1315,12 +1314,12 @@ MarkRanges MergeTreeDataSelectExecutor::filterMarksUsingIndex( auto ann_condition = std::dynamic_pointer_cast(condition); if (ann_condition != nullptr) { - // vector of indexes of useful ranges + /// An array of indices of useful ranges. auto result = ann_condition->getUsefulRanges(granule); for (auto range : result) { - // range for corresponding index + /// The range for the corresponding index. MarkRange data_range( std::max(ranges[i].begin, index_mark * index_granularity + range), std::min(ranges[i].end, index_mark * index_granularity + range + 1)); @@ -1344,8 +1343,8 @@ MarkRanges MergeTreeDataSelectExecutor::filterMarksUsingIndex( continue; MarkRange data_range( - std::max(ranges[i].begin, index_mark * index_granularity), - std::min(ranges[i].end, (index_mark + 1) * index_granularity)); + std::max(ranges[i].begin, index_mark * index_granularity), + std::min(ranges[i].end, (index_mark + 1) * index_granularity)); if (res.empty() || data_range.begin - res.back().end > min_marks_for_seek) res.push_back(data_range); From 332ec7c51fe260d43bcd9b9480daaa2e95179dcb Mon Sep 17 00:00:00 2001 From: Alexey Milovidov Date: Sun, 19 May 2024 14:28:04 +0300 Subject: [PATCH 4/8] Update MergeTreeIndexSet.cpp --- src/Storages/MergeTree/MergeTreeIndexSet.cpp | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/Storages/MergeTree/MergeTreeIndexSet.cpp b/src/Storages/MergeTree/MergeTreeIndexSet.cpp index 0b7e2e1f942..e9dc638341a 100644 --- a/src/Storages/MergeTree/MergeTreeIndexSet.cpp +++ b/src/Storages/MergeTree/MergeTreeIndexSet.cpp @@ -35,7 +35,7 @@ MergeTreeIndexGranuleSet::MergeTreeIndexGranuleSet( size_t max_rows_) : index_name(index_name_) , max_rows(max_rows_) - , block(index_sample_block_) + , block(index_sample_block_.cloneEmpty()) { } From 31f0b2f741e8a8c7b06e2271cfd5838a8d16fb32 Mon Sep 17 00:00:00 2001 From: Alexey Milovidov Date: Sun, 19 May 2024 14:52:51 +0300 Subject: [PATCH 5/8] Update MergeTreeIndexSet.cpp --- src/Storages/MergeTree/MergeTreeIndexSet.cpp | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/Storages/MergeTree/MergeTreeIndexSet.cpp b/src/Storages/MergeTree/MergeTreeIndexSet.cpp index e9dc638341a..797455816f0 100644 --- a/src/Storages/MergeTree/MergeTreeIndexSet.cpp +++ b/src/Storages/MergeTree/MergeTreeIndexSet.cpp @@ -291,7 +291,7 @@ bool MergeTreeIndexConditionSet::mayBeTrueOnGranule(MergeTreeIndexGranulePtr idx const auto & column = result.getByPosition(result.columns() - 1).column; for (size_t i = 0; i < size; ++i) - if (column->getBool(i)) + if (column->getUInt(i) & 1) return true; return false; From e18fa68f3d72a0dbed4257c4922a6c534fdb677e Mon Sep 17 00:00:00 2001 From: Alexey Milovidov Date: Sun, 19 May 2024 15:00:14 +0300 Subject: [PATCH 6/8] Update MergeTreeIndexSet.cpp --- src/Storages/MergeTree/MergeTreeIndexSet.cpp | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/Storages/MergeTree/MergeTreeIndexSet.cpp b/src/Storages/MergeTree/MergeTreeIndexSet.cpp index 797455816f0..068e08f6819 100644 --- a/src/Storages/MergeTree/MergeTreeIndexSet.cpp +++ b/src/Storages/MergeTree/MergeTreeIndexSet.cpp @@ -291,7 +291,7 @@ bool MergeTreeIndexConditionSet::mayBeTrueOnGranule(MergeTreeIndexGranulePtr idx const auto & column = result.getByPosition(result.columns() - 1).column; for (size_t i = 0; i < size; ++i) - if (column->getUInt(i) & 1) + if (!column->isNullAt(i) && (column->get64(i) & 1)) return true; return false; From 11af3fd54f6ee3ed0291fee9ed88a852f03a252a Mon Sep 17 00:00:00 2001 From: Alexey Milovidov Date: Sun, 19 May 2024 16:13:41 +0300 Subject: [PATCH 7/8] Update MergeTreeIndexSet.cpp --- src/Storages/MergeTree/MergeTreeIndexSet.cpp | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/src/Storages/MergeTree/MergeTreeIndexSet.cpp b/src/Storages/MergeTree/MergeTreeIndexSet.cpp index 068e08f6819..3e5cbb34556 100644 --- a/src/Storages/MergeTree/MergeTreeIndexSet.cpp +++ b/src/Storages/MergeTree/MergeTreeIndexSet.cpp @@ -267,6 +267,8 @@ MergeTreeIndexConditionSet::MergeTreeIndexConditionSet( filter_actions_dag->removeUnusedActions(); actions = std::make_shared(filter_actions_dag); + + actions_output_column_name = filter_actions_dag->getOutputs().at(0)->result_name; } bool MergeTreeIndexConditionSet::alwaysUnknownOrTrue() const @@ -288,7 +290,7 @@ bool MergeTreeIndexConditionSet::mayBeTrueOnGranule(MergeTreeIndexGranulePtr idx Block result = granule.block; actions->execute(result); - const auto & column = result.getByPosition(result.columns() - 1).column; + const auto & column = result.getByName(actions_output_column_name).column; for (size_t i = 0; i < size; ++i) if (!column->isNullAt(i) && (column->get64(i) & 1)) From a28309689f26e161dfbaa014bc51dea7460de30f Mon Sep 17 00:00:00 2001 From: Alexey Milovidov Date: Sun, 19 May 2024 16:13:58 +0300 Subject: [PATCH 8/8] Update MergeTreeIndexSet.h --- src/Storages/MergeTree/MergeTreeIndexSet.h | 1 + 1 file changed, 1 insertion(+) diff --git a/src/Storages/MergeTree/MergeTreeIndexSet.h b/src/Storages/MergeTree/MergeTreeIndexSet.h index 3348b5fbe34..901653e47d6 100644 --- a/src/Storages/MergeTree/MergeTreeIndexSet.h +++ b/src/Storages/MergeTree/MergeTreeIndexSet.h @@ -126,6 +126,7 @@ private: std::unordered_set key_columns; ExpressionActionsPtr actions; + String actions_output_column_name; };