Add optimize_sorting_for_input_stream setting and perf tests

This commit is contained in:
Igor Nikonov 2022-07-19 16:58:15 +00:00
parent 828f3711d2
commit 1fe83cc8d8
6 changed files with 36 additions and 24 deletions

View File

@ -613,6 +613,7 @@ static constexpr UInt64 operator""_GiB(unsigned long long value)
M(Bool, compatibility_ignore_auto_increment_in_create_table, false, "Ignore AUTO_INCREMENT keyword in column declaration if true, otherwise return error. It simplifies migration from MySQL", 0) \
M(Bool, multiple_joins_try_to_keep_original_names, false, "Do not add aliases to top level expression list on multiple joins rewrite", 0) \
M(Bool, optimize_distinct_in_order, true, "Enable DISTINCT optimization if some columns in DISTINCT form a prefix of sorting. For example, prefix of sorting key in merge tree or ORDER BY statement", 0) \
M(Bool, optimize_sorting_for_input_stream, true, "Optimize sorting to sorting properties of input stream", 0) \
// End of COMMON_SETTINGS
// Please add settings related to formats into the FORMAT_FACTORY_SETTINGS and move obsolete settings to OBSOLETE_SETTINGS.

View File

@ -1367,7 +1367,8 @@ void InterpreterSelectQuery::executeImpl(QueryPlan & query_plan, std::optional<P
settings.remerge_sort_lowered_memory_bytes_ratio,
settings.max_bytes_before_external_sort,
this->context->getTemporaryVolume(),
settings.min_free_disk_space_for_temporary_data);
settings.min_free_disk_space_for_temporary_data,
settings.optimize_sorting_for_input_stream);
sorting_step->setStepDescription(fmt::format("Sort {} before JOIN", is_right ? "right" : "left"));
plan.addStep(std::move(sorting_step));
};
@ -2497,7 +2498,8 @@ void InterpreterSelectQuery::executeWindow(QueryPlan & query_plan)
settings.remerge_sort_lowered_memory_bytes_ratio,
settings.max_bytes_before_external_sort,
context->getTemporaryVolume(),
settings.min_free_disk_space_for_temporary_data);
settings.min_free_disk_space_for_temporary_data,
settings.optimize_sorting_for_input_stream);
sorting_step->setStepDescription("Sorting for window '" + window.window_name + "'");
query_plan.addStep(std::move(sorting_step));
}
@ -2555,7 +2557,8 @@ void InterpreterSelectQuery::executeOrder(QueryPlan & query_plan, InputOrderInfo
settings.remerge_sort_lowered_memory_bytes_ratio,
settings.max_bytes_before_external_sort,
context->getTemporaryVolume(),
settings.min_free_disk_space_for_temporary_data);
settings.min_free_disk_space_for_temporary_data,
settings.optimize_sorting_for_input_stream);
sorting_step->setStepDescription("Sorting for ORDER BY");
query_plan.addStep(std::move(sorting_step));

View File

@ -38,7 +38,8 @@ SortingStep::SortingStep(
double remerge_lowered_memory_bytes_ratio_,
size_t max_bytes_before_external_sort_,
VolumePtr tmp_volume_,
size_t min_free_disk_space_)
size_t min_free_disk_space_,
bool optimize_sorting_for_input_stream_)
: ITransformingStep(input_stream, input_stream.header, getTraits(limit_))
, type(Type::Auto)
, result_description(description_)
@ -50,6 +51,7 @@ SortingStep::SortingStep(
, max_bytes_before_external_sort(max_bytes_before_external_sort_)
, tmp_volume(tmp_volume_)
, min_free_disk_space(min_free_disk_space_)
, optimize_sorting_for_input_stream(optimize_sorting_for_input_stream_)
{
/// TODO: check whether input_stream is already partially sorted by the same description.
output_stream->sort_description = result_description;
@ -249,15 +251,18 @@ void SortingStep::transformPipeline(QueryPipelineBuilder & pipeline, const Build
LOG_DEBUG(getLogger(), "Prefix({}): {}", prefix_description.size(), dumpSortDescription(prefix_description));
LOG_DEBUG(getLogger(), "Result({}): {}", result_description.size(), dumpSortDescription(result_description));
if (input_sort_mode == DataStream::SortMode::Stream && input_sort_desc.hasPrefix(result_description))
return;
/// merge sorted
if (input_sort_mode == DataStream::SortMode::Port && input_sort_desc.hasPrefix(result_description))
if (optimize_sorting_for_input_stream)
{
LOG_DEBUG(getLogger(), "MergingSorted, SortMode::Port");
mergingSorted(pipeline, result_description, limit);
return;
if (input_sort_mode == DataStream::SortMode::Stream && input_sort_desc.hasPrefix(result_description))
return;
/// merge sorted
if (input_sort_mode == DataStream::SortMode::Port && input_sort_desc.hasPrefix(result_description))
{
LOG_DEBUG(getLogger(), "MergingSorted, SortMode::Port");
mergingSorted(pipeline, result_description, limit);
return;
}
}
if (type == Type::MergingSorted)
@ -277,7 +282,7 @@ void SortingStep::transformPipeline(QueryPipelineBuilder & pipeline, const Build
return;
}
if (input_sort_mode == DataStream::SortMode::Chunk)
if (optimize_sorting_for_input_stream && input_sort_mode == DataStream::SortMode::Chunk)
{
if (input_sort_desc.hasPrefix(result_description))
{
@ -285,14 +290,6 @@ void SortingStep::transformPipeline(QueryPipelineBuilder & pipeline, const Build
fullSort(pipeline, result_description, limit, true);
return;
}
if (result_description.hasPrefix(input_sort_desc))
{
LOG_DEBUG(getLogger(), "FinishSorting, SortMode::Chunk");
mergeSorting(pipeline, input_sort_desc, 0);
mergingSorted(pipeline, input_sort_desc, 0);
finishSorting(pipeline, input_sort_desc, result_description, limit);
return;
}
}
LOG_DEBUG(getLogger(), "FullSort");

View File

@ -22,7 +22,8 @@ public:
double remerge_lowered_memory_bytes_ratio_,
size_t max_bytes_before_external_sort_,
VolumePtr tmp_volume_,
size_t min_free_disk_space_);
size_t min_free_disk_space_,
bool optimize_sorting_for_input_stream_);
/// FinishSorting
SortingStep(
@ -86,6 +87,7 @@ private:
size_t max_bytes_before_external_sort = 0;
VolumePtr tmp_volume;
size_t min_free_disk_space = 0;
const bool optimize_sorting_for_input_stream = false;
};
}

View File

@ -3,8 +3,6 @@
<!-- FIXME this should have been an EXPLAIN test, no point in measuring performance to deduce that the query was rewritten -->
<query>SELECT * FROM (SELECT CounterID, EventDate FROM hits_10m_single ORDER BY CounterID DESC) ORDER BY EventDate, CounterID FORMAT Null</query>
<query>SELECT * FROM (SELECT CounterID, EventDate FROM hits_10m_single ORDER BY CounterID, EventDate) ORDER BY CounterID FORMAT Null</query>
<query>SELECT * FROM (SELECT CounterID, EventDate FROM hits_10m_single ORDER BY CounterID) ORDER BY CounterID, EventDate FORMAT Null</query>
<query>SELECT DISTINCT * FROM (SELECT DISTINCT CounterID, EventDate FROM hits_10m_single) FORMAT Null</query>
<query>SELECT DISTINCT * FROM (SELECT DISTINCT CounterID, EventDate FROM hits_10m_single ORDER BY CounterID DESC) ORDER BY toStartOfWeek(EventDate) FORMAT Null</query>
</test>

View File

@ -1,4 +1,15 @@
<test>
<settings><optimize_sorting_for_input_stream>1</optimize_sorting_for_input_stream></settings>
<!-- ORDER BY key is prefix of MergeTree sorting key -->
<query>SELECT CounterID, EventDate FROM hits_10m_single ORDER BY CounterID SETTINGS optimize_read_in_order=1 FORMAT Null</query>
<query>SELECT CounterID, EventDate FROM hits_10m_single ORDER BY CounterID SETTINGS optimize_read_in_order=0 FORMAT Null</query>
<!-- MergeTree sorting key is prefix of ORDER BY key -->
<query>SELECT CounterID, EventTime FROM hits_10m_single ORDER BY CounterID, EventTime SETTINGS optimize_read_in_order=1 format Null</query>
<query>SELECT CounterID, EventTime FROM hits_10m_single ORDER BY CounterID, EventTime SETTINGS optimize_read_in_order=0 format Null</query>
<!-- sorting step gets its sort description from the subquery -->
<query>SELECT * FROM (SELECT CounterID, EventDate FROM hits_10m_single) ORDER BY CounterID SETTINGS optimize_read_in_order=1 FORMAT Null</query>
<query>SELECT * FROM (SELECT CounterID, EventDate FROM hits_10m_single) ORDER BY CounterID SETTINGS optimize_read_in_order=0 FORMAT Null</query>
<query>SELECT * FROM (SELECT CounterID, EventDate FROM hits_10m_single ORDER BY CounterID, EventDate) ORDER BY CounterID SETTINGS optimize_duplicate_order_by_and_distinct=1 FORMAT Null</query>