diff --git a/src/Core/SettingsChangesHistory.cpp b/src/Core/SettingsChangesHistory.cpp index 9ddf40e87b1..392f0dbc2ee 100644 --- a/src/Core/SettingsChangesHistory.cpp +++ b/src/Core/SettingsChangesHistory.cpp @@ -94,7 +94,7 @@ static std::initializer_list::buildOutput() {} template<> void AddedColumns::buildJoinGetOutput() {} -<<<<<<< HEAD template<> template void AddedColumns::buildOutputFromBlocks() {} -======= ->>>>>>> add threshold for table rows template<> void AddedColumns::buildOutput() @@ -35,15 +32,9 @@ void AddedColumns::buildOutput() buildOutputFromBlocks(); else { -<<<<<<< HEAD if (join_data_avg_perkey_rows < output_by_row_list_threshold) buildOutputFromBlocks(); - else -======= - if (join_data_avg_perkey_rows < sort_right_perkey_rows_threshold) - buildOutputFromBlocks(); else if (join_data_sorted) ->>>>>>> add threshold for table rows { for (size_t i = 0; i < this->size(); ++i) { @@ -53,19 +44,31 @@ void AddedColumns::buildOutput() if (row_ref_i) { const RowRefList * row_ref_list = reinterpret_cast(row_ref_i); -<<<<<<< HEAD - for (auto it = row_ref_list->begin(); it.ok(); ++it) - col->insertFrom(*it->block->getByPosition(right_indexes[i]).column, it->row_num); -======= col->insertRangeFrom(*row_ref_list->block->getByPosition(right_indexes[i]).column, row_ref_list->row_num, row_ref_list->rows); ->>>>>>> add threshold for table rows } else type_name[i].type->insertDefaultInto(*col); } } } -<<<<<<< HEAD + else + { + for (size_t i = 0; i < this->size(); ++i) + { + auto & col = columns[i]; + for (auto row_ref_i : lazy_output.row_refs) + { + if (row_ref_i) + { + const RowRefList * row_ref_list = reinterpret_cast(row_ref_i); + for (auto it = row_ref_list->begin(); it.ok(); ++it) + col->insertFrom(*it->block->getByPosition(right_indexes[i]).column, it->row_num); + } + else + type_name[i].type->insertDefaultInto(*col); + } + } + } } } @@ -88,25 +91,6 @@ void AddedColumns::buildJoinGetOutput() nullable_col->insertFromNotNullable(*column_from_block.column, row_ref->row_num); else col->insertFrom(*column_from_block.column, row_ref->row_num); -======= - else - { - for (size_t i = 0; i < this->size(); ++i) - { - auto & col = columns[i]; - for (auto row_ref_i : lazy_output.row_refs) - { - if (row_ref_i) - { - const RowRefList * row_ref_list = reinterpret_cast(row_ref_i); - for (auto it = row_ref_list->begin(); it.ok(); ++it) - col->insertFrom(*it->block->getByPosition(right_indexes[i]).column, it->row_num); - } - else - type_name[i].type->insertDefaultInto(*col); - } - } ->>>>>>> add threshold for table rows } } } @@ -115,11 +99,7 @@ template<> template void AddedColumns::buildOutputFromBlocks() { -<<<<<<< HEAD if (this->size() == 0) -======= - if (this->size() == 0) ->>>>>>> add threshold for table rows return; std::vector blocks; std::vector row_nums; @@ -160,32 +140,6 @@ void AddedColumns::buildOutputFromBlocks() col->insertFrom(*blocks[j]->getByPosition(right_indexes[i]).column, row_nums[j]); else type_name[i].type->insertDefaultInto(*col); -<<<<<<< HEAD -======= - } - } -} - -template<> -void AddedColumns::buildJoinGetOutput() -{ - for (size_t i = 0; i < this->size(); ++i) - { - auto & col = columns[i]; - for (auto row_ref_i : lazy_output.row_refs) - { - if (!row_ref_i) - { - type_name[i].type->insertDefaultInto(*col); - continue; - } - const auto * row_ref = reinterpret_cast(row_ref_i); - const auto & column_from_block = row_ref->block->getByPosition(right_indexes[i]); - if (auto * nullable_col = typeid_cast(col.get()); nullable_col && !column_from_block.column->isNullable()) - nullable_col->insertFromNotNullable(*column_from_block.column, row_ref->row_num); - else - col->insertFrom(*column_from_block.column, row_ref->row_num); ->>>>>>> add threshold for table rows } } } diff --git a/src/Interpreters/HashJoin/AddedColumns.h b/src/Interpreters/HashJoin/AddedColumns.h index 5ae69fbbf66..3f90b215602 100644 --- a/src/Interpreters/HashJoin/AddedColumns.h +++ b/src/Interpreters/HashJoin/AddedColumns.h @@ -115,6 +115,7 @@ public: } join_data_avg_perkey_rows = join.getJoinedData()->avgPerKeyRows(); output_by_row_list_threshold = join.getTableJoin().outputByRowListPerkeyRowsThreshold(); + join_data_sorted = join.getJoinedData()->sorted; } size_t size() const { return columns.size(); } @@ -147,6 +148,7 @@ public: std::unique_ptr offsets_to_replicate; bool need_filter = false; bool output_by_row_list = false; + bool join_data_sorted = false; size_t join_data_avg_perkey_rows = 0; size_t output_by_row_list_threshold = 0; IColumn::Filter filter; @@ -196,12 +198,6 @@ private: } } - /** Build output from the blocks that extract from `RowRef` or `RowRefList`, to avoid block cache miss which may cause performance slow down. - * And This problem would happen it we directly build output from `RowRef` or `RowRefList`. - */ - template - void buildOutputFromBlocks(); - MutableColumns columns; bool is_join_get; std::vector right_indexes; diff --git a/src/Interpreters/HashJoin/HashJoin.cpp b/src/Interpreters/HashJoin/HashJoin.cpp index e394b9913b5..59888d7a71d 100644 --- a/src/Interpreters/HashJoin/HashJoin.cpp +++ b/src/Interpreters/HashJoin/HashJoin.cpp @@ -1422,12 +1422,12 @@ void HashJoin::tryRerangeRightTableData() if ((kind != JoinKind::Inner && kind != JoinKind::Left) || strictness != JoinStrictness::All || table_join->getMixedJoinExpression()) return; - if (!data || data->sorted || data->blocks.empty() || data->maps.size() > 1) + if (!data || data->sorted || data->blocks.empty() || data->maps.size() > 1 || data->rows_to_join > table_join->sortRightTableRowsThreshold() || data->avgPerKeyRows() < table_join->sortRightPerkeyRowsThreshold()) return; if (data->keys_to_join == 0) data->keys_to_join = getTotalRowCount(); - if (sample_block_with_columns_to_add.columns() == 0 || data->rows_to_join > table_join->sortRightTableRowsThreshold() || data->avgPerKeyRows() < table_join->sortRightPerkeyRowsThreshold()) + if (sample_block_with_columns_to_add.columns() == 0) { LOG_DEBUG(log, "The joined right table total rows :{}, total keys :{}, columns added:{}", data->rows_to_join, data->keys_to_join, sample_block_with_columns_to_add.columns()); diff --git a/src/Interpreters/TableJoin.cpp b/src/Interpreters/TableJoin.cpp index 138085f0710..8bcaef77939 100644 --- a/src/Interpreters/TableJoin.cpp +++ b/src/Interpreters/TableJoin.cpp @@ -116,6 +116,8 @@ TableJoin::TableJoin(const Settings & settings, VolumePtr tmp_volume_, Temporary , max_files_to_merge(settings.join_on_disk_max_files_to_merge) , temporary_files_codec(settings.temporary_files_codec) , output_by_rowlist_perkey_rows_threshold(settings.join_output_by_rowlist_perkey_rows_threshold) + , sort_right_perkey_rows_threshold(settings.join_to_sort_perkey_rows_threshold) + , sort_right_table_rows_threshold(settings.join_to_sort_table_rows_threshold) , max_memory_usage(settings.max_memory_usage) , tmp_volume(tmp_volume_) , tmp_data(tmp_data_) diff --git a/src/Interpreters/TableJoin.h b/src/Interpreters/TableJoin.h index 4d626084d81..09d7f0f2b2a 100644 --- a/src/Interpreters/TableJoin.h +++ b/src/Interpreters/TableJoin.h @@ -149,6 +149,8 @@ private: const size_t max_files_to_merge = 0; const String temporary_files_codec = "LZ4"; const size_t output_by_rowlist_perkey_rows_threshold = 0; + const size_t sort_right_perkey_rows_threshold = 0; + const size_t sort_right_table_rows_threshold = 0; /// Value if setting max_memory_usage for query, can be used when max_bytes_in_join is not specified. size_t max_memory_usage = 0; @@ -297,6 +299,8 @@ public: } size_t outputByRowListPerkeyRowsThreshold() const { return output_by_rowlist_perkey_rows_threshold; } + size_t sortRightPerkeyRowsThreshold() const { return sort_right_perkey_rows_threshold; } + size_t sortRightTableRowsThreshold() const { return sort_right_table_rows_threshold; } size_t defaultMaxBytes() const { return default_max_bytes; } size_t maxJoinedBlockRows() const { return max_joined_block_rows; } size_t maxRowsInRightBlock() const { return partial_merge_join_rows_in_right_blocks; } diff --git a/tests/performance/all_join_opt.xml b/tests/performance/all_join_opt.xml index 2ecd76ee976..0ab9c39f67c 100644 --- a/tests/performance/all_join_opt.xml +++ b/tests/performance/all_join_opt.xml @@ -5,10 +5,10 @@ INSERT INTO test SELECT number % 10000, number % 10000, number % 10000 FROM numbers(10000000) INSERT INTO test1 SELECT number % 1000 , number % 1000, number % 1000 FROM numbers(100000) - SELECT MAX(test1.a) FROM test INNER JOIN test1 on test.b = test1.b SETTINGS join_to_sort_table_rows_threshold=100000 - SELECT MAX(test1.a) FROM test LEFT JOIN test1 on test.b = test1.b SETTINGS join_to_sort_table_rows_threshold=100000 - SELECT MAX(test1.a) FROM test RIGHT JOIN test1 on test.b = test1.b SETTINGS join_to_sort_table_rows_threshold=100000 - SELECT MAX(test1.a) FROM test FULL JOIN test1 on test.b = test1.b SETTINGS join_to_sort_table_rows_threshold=100000 + SELECT MAX(test1.a) FROM test INNER JOIN test1 on test.b = test1.b + SELECT MAX(test1.a) FROM test LEFT JOIN test1 on test.b = test1.b + SELECT MAX(test1.a) FROM test RIGHT JOIN test1 on test.b = test1.b + SELECT MAX(test1.a) FROM test FULL JOIN test1 on test.b = test1.b DROP TABLE IF EXISTS test DROP TABLE IF EXISTS test1