review

2024-09-19 16:20:50 +00:00 · 2024-09-10 16:32:52 +08:00 · 2024-09-10 16:32:52 +08:00 · 597181c45e
commit 597181c45e
parent 02e129f0d1
5 changed files with 13 additions and 13 deletions
--- a/src/Core/Settings.h
+++ b/src/Core/Settings.h
@ -922,9 +922,9 @@ class IColumn;
    M(Bool, implicit_transaction, false, "If enabled and not already inside a transaction, wraps the query inside a full transaction (begin + commit or rollback)", 0) \
    M(UInt64, grace_hash_join_initial_buckets, 1, "Initial number of grace hash join buckets", 0) \
    M(UInt64, grace_hash_join_max_buckets, 1024, "Limit on the number of grace hash join buckets", 0) \
-    M(Int32, join_to_sort_perkey_rows_threshold, 40, "Rerange the right table by key in left or inner hash join when the per-key average rows of it exceed this value (means the table keys is dense) and its number of rows is not too many(controlled by `join_to_sort_table_rows_threshold`), to make the join output by the data batch of key, which would improve performance.", 0) \
-    M(Int32, join_to_sort_table_rows_threshold, 10000, "Rerange the right table by key in left or inner hash join when its number of rows not exceed this value and the table keys is dense (controlled by `join_to_sort_perkey_rows_threshold`), to make the join performance improve as output by the data batch of key, but not cost too much on the table reranging.", 0) \
-    M(Bool, allow_experimental_join_right_table_sorting, false, "If it is set to true, and the conditions of `join_to_sort_perkey_rows_threshold` and `join_to_sort_perkey_rows_threshold` are met, then we will try to rerange the right table by key to improve the performance in left or inner hash join.", 0) \
+    M(Int32, join_to_sort_minimum_perkey_rows, 40, "The lower limit of per-key average rows in the right table to determine whether to rerange the right table by key in left or inner join. This setting ensures that the optimization is not applied for sparse table keys", 0) \
+    M(Int32, join_to_sort_maximum_table_rows, 10000, "The maximum number of rows in the right table to determine whether to rerange the right table by key in left or inner join.", 0) \
+    M(Bool, allow_experimental_join_right_table_sorting, false, "If it is set to true, and the conditions of `join_to_sort_minimum_perkey_rows` and `join_to_sort_maximum_table_rows` are met, rerange the right table by key to improve the performance in left or inner hash join.", 0) \
    M(Timezone, session_timezone, "", "This setting can be removed in the future due to potential caveats. It is experimental and is not suitable for production usage. The default timezone for current session or query. The server default timezone if empty.", 0) \
    M(Bool, use_hive_partitioning, false, "Allows to use hive partitioning for File, URL, S3, AzureBlobStorage and HDFS engines.", 0)\
    \
--- a/src/Core/SettingsChangesHistory.cpp
+++ b/src/Core/SettingsChangesHistory.cpp
@ -95,9 +95,9 @@ static std::initializer_list<std::pair<ClickHouseVersion, SettingsChangesHistory
            {"join_output_by_rowlist_perkey_rows_threshold", 0, 5, "The lower limit of per-key average rows in the right table to determine whether to output by row list in hash join."},
            {"allow_experimental_vector_similarity_index", false, false, "Added new setting to allow experimental vector similarity indexes"},
            {"input_format_try_infer_datetimes_only_datetime64", true, false, "Allow to infer DateTime instead of DateTime64 in data formats"},
-            {"join_to_sort_perkey_rows_threshold", 0, 40, "Rerange the right table by key in left or inner hash join when the per-key average rows of it exceed this value (means the table keys is dense) and its number of rows is not too many(controlled by `join_to_sort_table_rows_threshold`), to make the join output by the data batch of key, which would improve performance."},
-            {"join_to_sort_table_rows_threshold", 0, 10000, "Rerange the right table by key in left or inner hash join when its number of rows not exceed this value and the table keys is dense (controlled by `join_to_sort_perkey_rows_threshold`), to make the join performance improve as output by the data batch of key, but not cost too much on the table reranging."},
-            {"allow_experimental_join_right_table_sorting", false, false, "If it is set to true, and the conditions of `join_to_sort_perkey_rows_threshold` and `join_to_sort_perkey_rows_threshold` are met, then we will try to rerange the right table by key to improve the performance in left or inner hash join."},
+            {"join_to_sort_minimum_perkey_rows", 0, 40, "The lower limit of per-key average rows in the right table to determine whether to rerange the right table by key in left or inner join. This setting ensures that the optimization is not applied for sparse table keys"},
+            {"join_to_sort_maximum_table_rows", 0, 10000, "The maximum number of rows in the right table to determine whether to rerange the right table by key in left or inner join."},
+            {"allow_experimental_join_right_table_sorting", false, false, "If it is set to true, and the conditions of `join_to_sort_minimum_perkey_rows` and `join_to_sort_maximum_table_rows` are met, rerange the right table by key to improve the performance in left or inner hash join."},
        }
    },
    {"24.7",
--- a/src/Interpreters/HashJoin/HashJoin.cpp
+++ b/src/Interpreters/HashJoin/HashJoin.cpp
@ -1429,7 +1429,7 @@ void HashJoin::tryRerangeRightTableData()
    /// 2. the join clauses size is greater than 1, like `...join on a.key1=b.key1 or a.key2=b.key2`, we can not rerange the right table on different set of keys.
    /// 3. the number of right table rows exceed the threshold, which may result in a significant cost for reranging and lead to performance degradation.
    /// 4. the keys of right table is very sparse, which may result in insignificant performance improvement after reranging by key.
-    if (!data || data->sorted || data->blocks.empty() || data->maps.size() > 1 || data->rows_to_join > table_join->sortRightTableRowsThreshold() ||  data->avgPerKeyRows() < table_join->sortRightPerkeyRowsThreshold())
+    if (!data || data->sorted || data->blocks.empty() || data->maps.size() > 1 || data->rows_to_join > table_join->sortRightMaximumTableRows() ||  data->avgPerKeyRows() < table_join->sortRightMinimumPerkeyRows())
        return;

    if (data->keys_to_join == 0)
--- a/src/Interpreters/TableJoin.cpp
+++ b/src/Interpreters/TableJoin.cpp
@ -116,8 +116,8 @@ TableJoin::TableJoin(const Settings & settings, VolumePtr tmp_volume_, Temporary
    , max_files_to_merge(settings.join_on_disk_max_files_to_merge)
    , temporary_files_codec(settings.temporary_files_codec)
    , output_by_rowlist_perkey_rows_threshold(settings.join_output_by_rowlist_perkey_rows_threshold)
-    , sort_right_perkey_rows_threshold(settings.join_to_sort_perkey_rows_threshold)
-    , sort_right_table_rows_threshold(settings.join_to_sort_table_rows_threshold)
+    , sort_right_minimum_perkey_rows(settings.join_to_sort_minimum_perkey_rows)
+    , sort_right_maximum_table_rows(settings.join_to_sort_maximum_table_rows)
    , allow_join_sorting(settings.allow_experimental_join_right_table_sorting)
    , max_memory_usage(settings.max_memory_usage)
    , tmp_volume(tmp_volume_)
--- a/src/Interpreters/TableJoin.h
+++ b/src/Interpreters/TableJoin.h
@ -149,8 +149,8 @@ private:
    const size_t max_files_to_merge = 0;
    const String temporary_files_codec = "LZ4";
    const size_t output_by_rowlist_perkey_rows_threshold = 0;
-    const size_t sort_right_perkey_rows_threshold = 0;
-    const size_t sort_right_table_rows_threshold = 0;
+    const size_t sort_right_minimum_perkey_rows = 0;
+    const size_t sort_right_maximum_table_rows = 0;
    const bool allow_join_sorting = false;

    /// Value if setting max_memory_usage for query, can be used when max_bytes_in_join is not specified.
@ -300,8 +300,8 @@ public:
    }

    size_t outputByRowListPerkeyRowsThreshold() const { return output_by_rowlist_perkey_rows_threshold; }
-    size_t sortRightPerkeyRowsThreshold() const { return sort_right_perkey_rows_threshold; }
-    size_t sortRightTableRowsThreshold() const { return sort_right_table_rows_threshold; }
+    size_t sortRightMinimumPerkeyRows() const { return sort_right_minimum_perkey_rows; }
+    size_t sortRightMaximumTableRows() const { return sort_right_maximum_table_rows; }
    bool allowJoinSorting() const { return allow_join_sorting; }
    size_t defaultMaxBytes() const { return default_max_bytes; }
    size_t maxJoinedBlockRows() const { return max_joined_block_rows; }