rebase and resolve conflicts

This commit is contained in:
kevinyhzou 2024-08-20 17:33:08 +08:00
parent cfa4ca6fb1
commit add486b62a
7 changed files with 33 additions and 77 deletions

View File

@ -94,7 +94,7 @@ static std::initializer_list<std::pair<ClickHouseVersion, SettingsChangesHistory
{"type_json_skip_duplicated_paths", false, false, "Allow to skip duplicated paths during JSON parsing"}, {"type_json_skip_duplicated_paths", false, false, "Allow to skip duplicated paths during JSON parsing"},
{"join_output_by_rowlist_perkey_rows_threshold", 0, 5, "The lower limit of per-key average rows in the right table to determine whether to output by row list in hash join."}, {"join_output_by_rowlist_perkey_rows_threshold", 0, 5, "The lower limit of per-key average rows in the right table to determine whether to output by row list in hash join."},
{"allow_experimental_vector_similarity_index", false, false, "Added new setting to allow experimental vector similarity indexes"}, {"allow_experimental_vector_similarity_index", false, false, "Added new setting to allow experimental vector similarity indexes"},
{"input_format_try_infer_datetimes_only_datetime64", true, false, "Allow to infer DateTime instead of DateTime64 in data formats"} {"input_format_try_infer_datetimes_only_datetime64", true, false, "Allow to infer DateTime instead of DateTime64 in data formats"},
{"join_to_sort_perkey_rows_threshold", 0, 40, "The lower limit of per-key average rows in the right table to determine whether to sort it in hash join."}, {"join_to_sort_perkey_rows_threshold", 0, 40, "The lower limit of per-key average rows in the right table to determine whether to sort it in hash join."},
{"join_to_sort_table_rows_threshold", 0, 10000, "The upper limit of rows in the right table to determine whether to sort it in hash join."}, {"join_to_sort_table_rows_threshold", 0, 10000, "The upper limit of rows in the right table to determine whether to sort it in hash join."},
} }

View File

@ -20,13 +20,10 @@ void AddedColumns<false>::buildOutput() {}
template<> template<>
void AddedColumns<false>::buildJoinGetOutput() {} void AddedColumns<false>::buildJoinGetOutput() {}
<<<<<<< HEAD
template<> template<>
template<bool from_row_list> template<bool from_row_list>
void AddedColumns<false>::buildOutputFromBlocks() {} void AddedColumns<false>::buildOutputFromBlocks() {}
=======
>>>>>>> add threshold for table rows
template<> template<>
void AddedColumns<true>::buildOutput() void AddedColumns<true>::buildOutput()
@ -35,15 +32,9 @@ void AddedColumns<true>::buildOutput()
buildOutputFromBlocks<false>(); buildOutputFromBlocks<false>();
else else
{ {
<<<<<<< HEAD
if (join_data_avg_perkey_rows < output_by_row_list_threshold) if (join_data_avg_perkey_rows < output_by_row_list_threshold)
buildOutputFromBlocks<true>(); buildOutputFromBlocks<true>();
else
=======
if (join_data_avg_perkey_rows < sort_right_perkey_rows_threshold)
buildOutputFromBlocks<true>();
else if (join_data_sorted) else if (join_data_sorted)
>>>>>>> add threshold for table rows
{ {
for (size_t i = 0; i < this->size(); ++i) for (size_t i = 0; i < this->size(); ++i)
{ {
@ -53,19 +44,31 @@ void AddedColumns<true>::buildOutput()
if (row_ref_i) if (row_ref_i)
{ {
const RowRefList * row_ref_list = reinterpret_cast<const RowRefList *>(row_ref_i); const RowRefList * row_ref_list = reinterpret_cast<const RowRefList *>(row_ref_i);
<<<<<<< HEAD
for (auto it = row_ref_list->begin(); it.ok(); ++it)
col->insertFrom(*it->block->getByPosition(right_indexes[i]).column, it->row_num);
=======
col->insertRangeFrom(*row_ref_list->block->getByPosition(right_indexes[i]).column, row_ref_list->row_num, row_ref_list->rows); col->insertRangeFrom(*row_ref_list->block->getByPosition(right_indexes[i]).column, row_ref_list->row_num, row_ref_list->rows);
>>>>>>> add threshold for table rows
} }
else else
type_name[i].type->insertDefaultInto(*col); type_name[i].type->insertDefaultInto(*col);
} }
} }
} }
<<<<<<< HEAD else
{
for (size_t i = 0; i < this->size(); ++i)
{
auto & col = columns[i];
for (auto row_ref_i : lazy_output.row_refs)
{
if (row_ref_i)
{
const RowRefList * row_ref_list = reinterpret_cast<const RowRefList *>(row_ref_i);
for (auto it = row_ref_list->begin(); it.ok(); ++it)
col->insertFrom(*it->block->getByPosition(right_indexes[i]).column, it->row_num);
}
else
type_name[i].type->insertDefaultInto(*col);
}
}
}
} }
} }
@ -88,25 +91,6 @@ void AddedColumns<true>::buildJoinGetOutput()
nullable_col->insertFromNotNullable(*column_from_block.column, row_ref->row_num); nullable_col->insertFromNotNullable(*column_from_block.column, row_ref->row_num);
else else
col->insertFrom(*column_from_block.column, row_ref->row_num); col->insertFrom(*column_from_block.column, row_ref->row_num);
=======
else
{
for (size_t i = 0; i < this->size(); ++i)
{
auto & col = columns[i];
for (auto row_ref_i : lazy_output.row_refs)
{
if (row_ref_i)
{
const RowRefList * row_ref_list = reinterpret_cast<const RowRefList *>(row_ref_i);
for (auto it = row_ref_list->begin(); it.ok(); ++it)
col->insertFrom(*it->block->getByPosition(right_indexes[i]).column, it->row_num);
}
else
type_name[i].type->insertDefaultInto(*col);
}
}
>>>>>>> add threshold for table rows
} }
} }
} }
@ -115,11 +99,7 @@ template<>
template<bool from_row_list> template<bool from_row_list>
void AddedColumns<true>::buildOutputFromBlocks() void AddedColumns<true>::buildOutputFromBlocks()
{ {
<<<<<<< HEAD
if (this->size() == 0) if (this->size() == 0)
=======
if (this->size() == 0)
>>>>>>> add threshold for table rows
return; return;
std::vector<const Block *> blocks; std::vector<const Block *> blocks;
std::vector<UInt32> row_nums; std::vector<UInt32> row_nums;
@ -160,32 +140,6 @@ void AddedColumns<true>::buildOutputFromBlocks()
col->insertFrom(*blocks[j]->getByPosition(right_indexes[i]).column, row_nums[j]); col->insertFrom(*blocks[j]->getByPosition(right_indexes[i]).column, row_nums[j]);
else else
type_name[i].type->insertDefaultInto(*col); type_name[i].type->insertDefaultInto(*col);
<<<<<<< HEAD
=======
}
}
}
template<>
void AddedColumns<true>::buildJoinGetOutput()
{
for (size_t i = 0; i < this->size(); ++i)
{
auto & col = columns[i];
for (auto row_ref_i : lazy_output.row_refs)
{
if (!row_ref_i)
{
type_name[i].type->insertDefaultInto(*col);
continue;
}
const auto * row_ref = reinterpret_cast<const RowRef *>(row_ref_i);
const auto & column_from_block = row_ref->block->getByPosition(right_indexes[i]);
if (auto * nullable_col = typeid_cast<ColumnNullable *>(col.get()); nullable_col && !column_from_block.column->isNullable())
nullable_col->insertFromNotNullable(*column_from_block.column, row_ref->row_num);
else
col->insertFrom(*column_from_block.column, row_ref->row_num);
>>>>>>> add threshold for table rows
} }
} }
} }

View File

@ -115,6 +115,7 @@ public:
} }
join_data_avg_perkey_rows = join.getJoinedData()->avgPerKeyRows(); join_data_avg_perkey_rows = join.getJoinedData()->avgPerKeyRows();
output_by_row_list_threshold = join.getTableJoin().outputByRowListPerkeyRowsThreshold(); output_by_row_list_threshold = join.getTableJoin().outputByRowListPerkeyRowsThreshold();
join_data_sorted = join.getJoinedData()->sorted;
} }
size_t size() const { return columns.size(); } size_t size() const { return columns.size(); }
@ -147,6 +148,7 @@ public:
std::unique_ptr<IColumn::Offsets> offsets_to_replicate; std::unique_ptr<IColumn::Offsets> offsets_to_replicate;
bool need_filter = false; bool need_filter = false;
bool output_by_row_list = false; bool output_by_row_list = false;
bool join_data_sorted = false;
size_t join_data_avg_perkey_rows = 0; size_t join_data_avg_perkey_rows = 0;
size_t output_by_row_list_threshold = 0; size_t output_by_row_list_threshold = 0;
IColumn::Filter filter; IColumn::Filter filter;
@ -196,12 +198,6 @@ private:
} }
} }
/** Build the output from blocks extracted from `RowRef` or `RowRefList`, to avoid block cache misses, which may cause a performance slowdown.
 * This problem would happen if we built the output directly from `RowRef` or `RowRefList`.
*/
template<bool from_row_list>
void buildOutputFromBlocks();
MutableColumns columns; MutableColumns columns;
bool is_join_get; bool is_join_get;
std::vector<size_t> right_indexes; std::vector<size_t> right_indexes;

View File

@ -1422,12 +1422,12 @@ void HashJoin::tryRerangeRightTableData()
if ((kind != JoinKind::Inner && kind != JoinKind::Left) || strictness != JoinStrictness::All || table_join->getMixedJoinExpression()) if ((kind != JoinKind::Inner && kind != JoinKind::Left) || strictness != JoinStrictness::All || table_join->getMixedJoinExpression())
return; return;
if (!data || data->sorted || data->blocks.empty() || data->maps.size() > 1) if (!data || data->sorted || data->blocks.empty() || data->maps.size() > 1 || data->rows_to_join > table_join->sortRightTableRowsThreshold() || data->avgPerKeyRows() < table_join->sortRightPerkeyRowsThreshold())
return; return;
if (data->keys_to_join == 0) if (data->keys_to_join == 0)
data->keys_to_join = getTotalRowCount(); data->keys_to_join = getTotalRowCount();
if (sample_block_with_columns_to_add.columns() == 0 || data->rows_to_join > table_join->sortRightTableRowsThreshold() || data->avgPerKeyRows() < table_join->sortRightPerkeyRowsThreshold()) if (sample_block_with_columns_to_add.columns() == 0)
{ {
LOG_DEBUG(log, "The joined right table total rows :{}, total keys :{}, columns added:{}", LOG_DEBUG(log, "The joined right table total rows :{}, total keys :{}, columns added:{}",
data->rows_to_join, data->keys_to_join, sample_block_with_columns_to_add.columns()); data->rows_to_join, data->keys_to_join, sample_block_with_columns_to_add.columns());

View File

@ -116,6 +116,8 @@ TableJoin::TableJoin(const Settings & settings, VolumePtr tmp_volume_, Temporary
, max_files_to_merge(settings.join_on_disk_max_files_to_merge) , max_files_to_merge(settings.join_on_disk_max_files_to_merge)
, temporary_files_codec(settings.temporary_files_codec) , temporary_files_codec(settings.temporary_files_codec)
, output_by_rowlist_perkey_rows_threshold(settings.join_output_by_rowlist_perkey_rows_threshold) , output_by_rowlist_perkey_rows_threshold(settings.join_output_by_rowlist_perkey_rows_threshold)
, sort_right_perkey_rows_threshold(settings.join_to_sort_perkey_rows_threshold)
, sort_right_table_rows_threshold(settings.join_to_sort_table_rows_threshold)
, max_memory_usage(settings.max_memory_usage) , max_memory_usage(settings.max_memory_usage)
, tmp_volume(tmp_volume_) , tmp_volume(tmp_volume_)
, tmp_data(tmp_data_) , tmp_data(tmp_data_)

View File

@ -149,6 +149,8 @@ private:
const size_t max_files_to_merge = 0; const size_t max_files_to_merge = 0;
const String temporary_files_codec = "LZ4"; const String temporary_files_codec = "LZ4";
const size_t output_by_rowlist_perkey_rows_threshold = 0; const size_t output_by_rowlist_perkey_rows_threshold = 0;
const size_t sort_right_perkey_rows_threshold = 0;
const size_t sort_right_table_rows_threshold = 0;
/// Value if setting max_memory_usage for query, can be used when max_bytes_in_join is not specified. /// Value if setting max_memory_usage for query, can be used when max_bytes_in_join is not specified.
size_t max_memory_usage = 0; size_t max_memory_usage = 0;
@ -297,6 +299,8 @@ public:
} }
size_t outputByRowListPerkeyRowsThreshold() const { return output_by_rowlist_perkey_rows_threshold; } size_t outputByRowListPerkeyRowsThreshold() const { return output_by_rowlist_perkey_rows_threshold; }
size_t sortRightPerkeyRowsThreshold() const { return sort_right_perkey_rows_threshold; }
size_t sortRightTableRowsThreshold() const { return sort_right_table_rows_threshold; }
size_t defaultMaxBytes() const { return default_max_bytes; } size_t defaultMaxBytes() const { return default_max_bytes; }
size_t maxJoinedBlockRows() const { return max_joined_block_rows; } size_t maxJoinedBlockRows() const { return max_joined_block_rows; }
size_t maxRowsInRightBlock() const { return partial_merge_join_rows_in_right_blocks; } size_t maxRowsInRightBlock() const { return partial_merge_join_rows_in_right_blocks; }

View File

@ -5,10 +5,10 @@
<fill_query>INSERT INTO test SELECT number % 10000, number % 10000, number % 10000 FROM numbers(10000000)</fill_query> <fill_query>INSERT INTO test SELECT number % 10000, number % 10000, number % 10000 FROM numbers(10000000)</fill_query>
<fill_query>INSERT INTO test1 SELECT number % 1000 , number % 1000, number % 1000 FROM numbers(100000)</fill_query> <fill_query>INSERT INTO test1 SELECT number % 1000 , number % 1000, number % 1000 FROM numbers(100000)</fill_query>
<query tag='INNER'>SELECT MAX(test1.a) FROM test INNER JOIN test1 on test.b = test1.b SETTINGS join_to_sort_table_rows_threshold=100000</query> <query tag='INNER'>SELECT MAX(test1.a) FROM test INNER JOIN test1 on test.b = test1.b</query>
<query tag='LEFT'>SELECT MAX(test1.a) FROM test LEFT JOIN test1 on test.b = test1.b SETTINGS join_to_sort_table_rows_threshold=100000</query> <query tag='LEFT'>SELECT MAX(test1.a) FROM test LEFT JOIN test1 on test.b = test1.b</query>
<query tag='RIGHT'>SELECT MAX(test1.a) FROM test RIGHT JOIN test1 on test.b = test1.b SETTINGS join_to_sort_table_rows_threshold=100000</query> <query tag='RIGHT'>SELECT MAX(test1.a) FROM test RIGHT JOIN test1 on test.b = test1.b</query>
<query tag='FULL'>SELECT MAX(test1.a) FROM test FULL JOIN test1 on test.b = test1.b SETTINGS join_to_sort_table_rows_threshold=100000</query> <query tag='FULL'>SELECT MAX(test1.a) FROM test FULL JOIN test1 on test.b = test1.b</query>
<drop_query>DROP TABLE IF EXISTS test</drop_query> <drop_query>DROP TABLE IF EXISTS test</drop_query>
<drop_query>DROP TABLE IF EXISTS test1</drop_query> <drop_query>DROP TABLE IF EXISTS test1</drop_query>