Merge pull request #9113 from dimarub2000/group_by_in_order_optimization

[WIP] Optimization of GROUP BY with respect to table sorting key.
Anton Popov 2020-06-06 14:25:59 +03:00 committed by GitHub
commit 5c42408add
44 changed files with 994 additions and 161 deletions
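For context: the new setting optimize_aggregation_in_order (introduced below) lets aggregation consume data in the table's sorting-key order when the GROUP BY keys match a prefix of that key, emitting finished groups incrementally instead of building the whole hash table first. A minimal sketch of the intended usage (the table and column names are hypothetical):

CREATE TABLE t (a UInt64, b UInt64, v UInt64) ENGINE = MergeTree() ORDER BY (a, b);
SELECT a, sum(v) FROM t GROUP BY a SETTINGS optimize_aggregation_in_order = 1;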

View File

@ -150,6 +150,8 @@ public:
virtual void addBatchSinglePlaceNotNull(
size_t batch_size, AggregateDataPtr place, const IColumn ** columns, const UInt8 * null_map, Arena * arena) const = 0;
virtual void addBatchSinglePlaceFromInterval(size_t batch_begin, size_t batch_end, AggregateDataPtr place, const IColumn ** columns, Arena * arena) const = 0;
/** In addition to addBatch, this method collects multiple rows of arguments into array "places"
* as long as they are between offsets[i-1] and offsets[i]. This is used for arrayReduce and
* -Array combinator. It might also be used generally to break data dependency when array
@ -214,6 +216,12 @@ public:
static_cast<const Derived *>(this)->add(place, columns, i, arena);
}
void addBatchSinglePlaceFromInterval(size_t batch_begin, size_t batch_end, AggregateDataPtr place, const IColumn ** columns, Arena * arena) const override
{
for (size_t i = batch_begin; i < batch_end; ++i)
static_cast<const Derived *>(this)->add(place, columns, i, arena);
}
void addBatchArray(
size_t batch_size, AggregateDataPtr * places, size_t place_offset, const IColumn ** columns, const UInt64 * offsets, Arena * arena)
const override

View File

@ -150,7 +150,7 @@ public:
return res;
}
/// Get peice of memory with alignment
/// Get piece of memory with alignment
char * alignedAlloc(size_t size, size_t alignment)
{
do

View File

@ -388,6 +388,7 @@ struct Settings : public SettingsCollection<Settings>
M(SettingBool, enable_debug_queries, false, "Enables debug queries such as AST.", 0) \
M(SettingBool, enable_unaligned_array_join, false, "Allow ARRAY JOIN with multiple arrays that have different sizes. When this setting is enabled, arrays will be resized to the longest one.", 0) \
M(SettingBool, optimize_read_in_order, true, "Enable ORDER BY optimization for reading data in corresponding order in MergeTree tables.", 0) \
M(SettingBool, optimize_aggregation_in_order, false, "Enable GROUP BY optimization for aggregating data in corresponding order in MergeTree tables.", 0) \
M(SettingBool, low_cardinality_allow_in_native_format, true, "Use LowCardinality type in Native format. Otherwise, convert LowCardinality columns to ordinary for select query, and convert ordinary columns to required LowCardinality for insert query.", 0) \
M(SettingBool, cancel_http_readonly_queries_on_client_close, false, "Cancel HTTP readonly queries when a client closes the connection without waiting for response.", 0) \
M(SettingBool, external_table_functions_use_nulls, true, "If it is set to true, external table functions will implicitly use Nullable type if needed. Otherwise NULLs will be substituted with default values. Currently supported only by 'mysql' and 'odbc' table functions.", 0) \

View File

@ -63,7 +63,7 @@ struct SortCursorImpl
for (auto & column_desc : desc)
{
if (!column_desc.column_name.empty())
throw Exception("SortDesctiption should contain column position if SortCursor was used without header.",
throw Exception("SortDescription should contain column position if SortCursor was used without header.",
ErrorCodes::LOGICAL_ERROR);
}
reset(columns, {});

View File

@ -59,6 +59,13 @@ struct SortColumnDescription
{
return !(*this == other);
}
std::string dump() const
{
std::stringstream ss;
ss << column_name << ":" << column_number << ":dir " << direction << " nulls " << nulls_direction;
return ss.str();
}
};
/// Description of the sorting rule for several columns.

View File

@ -530,63 +530,33 @@ void NO_INLINE Aggregator::executeWithoutKeyImpl(
}
bool Aggregator::executeOnBlock(const Block & block, AggregatedDataVariants & result,
ColumnRawPtrs & key_columns, AggregateColumns & aggregate_columns, bool & no_more_keys)
void NO_INLINE Aggregator::executeOnIntervalWithoutKeyImpl(
AggregatedDataWithoutKey & res,
size_t row_begin,
size_t row_end,
AggregateFunctionInstruction * aggregate_instructions,
Arena * arena)
{
UInt64 num_rows = block.rows();
return executeOnBlock(block.getColumns(), num_rows, result, key_columns, aggregate_columns, no_more_keys);
/// Adding values
for (AggregateFunctionInstruction * inst = aggregate_instructions; inst->that; ++inst)
{
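/// When 'offsets' is set (the function aggregates underlying array values), translate the row interval into the corresponding interval of values.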
if (inst->offsets)
inst->batch_that->addBatchSinglePlaceFromInterval(inst->offsets[row_begin], inst->offsets[row_end - 1], res + inst->state_offset, inst->batch_arguments, arena);
else
inst->batch_that->addBatchSinglePlaceFromInterval(row_begin, row_end, res + inst->state_offset, inst->batch_arguments, arena);
}
}
bool Aggregator::executeOnBlock(Columns columns, UInt64 num_rows, AggregatedDataVariants & result,
ColumnRawPtrs & key_columns, AggregateColumns & aggregate_columns, bool & no_more_keys)
void Aggregator::prepareAggregateInstructions(Columns columns, AggregateColumns & aggregate_columns, Columns & materialized_columns,
AggregateFunctionInstructions & aggregate_functions_instructions, NestedColumnsHolder & nested_columns_holder)
{
if (isCancelled())
return true;
/// `result` will destroy the states of aggregate functions in the destructor
result.aggregator = this;
/// How to perform the aggregation?
if (result.empty())
{
result.init(method_chosen);
result.keys_size = params.keys_size;
result.key_sizes = key_sizes;
LOG_TRACE(log, "Aggregation method: {}", result.getMethodName());
}
if (isCancelled())
return true;
for (size_t i = 0; i < params.aggregates_size; ++i)
aggregate_columns[i].resize(params.aggregates[i].arguments.size());
/** Constant columns are not supported directly during aggregation.
* To make them work anyway, we materialize them.
*/
Columns materialized_columns;
/// Remember the columns we will work with
for (size_t i = 0; i < params.keys_size; ++i)
{
materialized_columns.push_back(columns.at(params.keys[i])->convertToFullColumnIfConst());
key_columns[i] = materialized_columns.back().get();
if (!result.isLowCardinality())
{
auto column_no_lc = recursiveRemoveLowCardinality(key_columns[i]->getPtr());
if (column_no_lc.get() != key_columns[i])
{
materialized_columns.emplace_back(std::move(column_no_lc));
key_columns[i] = materialized_columns.back().get();
}
}
}
AggregateFunctionInstructions aggregate_functions_instructions(params.aggregates_size + 1);
aggregate_functions_instructions.resize(params.aggregates_size + 1);
aggregate_functions_instructions[params.aggregates_size].that = nullptr;
std::vector<std::vector<const IColumn *>> nested_columns_holder;
for (size_t i = 0; i < params.aggregates_size; ++i)
{
for (size_t j = 0; j < aggregate_columns[i].size(); ++j)
@ -627,6 +597,62 @@ bool Aggregator::executeOnBlock(Columns columns, UInt64 num_rows, AggregatedData
aggregate_functions_instructions[i].batch_that = that;
}
}
bool Aggregator::executeOnBlock(const Block & block, AggregatedDataVariants & result,
ColumnRawPtrs & key_columns, AggregateColumns & aggregate_columns, bool & no_more_keys)
{
UInt64 num_rows = block.rows();
return executeOnBlock(block.getColumns(), num_rows, result, key_columns, aggregate_columns, no_more_keys);
}
bool Aggregator::executeOnBlock(Columns columns, UInt64 num_rows, AggregatedDataVariants & result,
ColumnRawPtrs & key_columns, AggregateColumns & aggregate_columns, bool & no_more_keys)
{
if (isCancelled())
return true;
/// `result` will destroy the states of aggregate functions in the destructor
result.aggregator = this;
/// How to perform the aggregation?
if (result.empty())
{
result.init(method_chosen);
result.keys_size = params.keys_size;
result.key_sizes = key_sizes;
LOG_TRACE(log, "Aggregation method: {}", result.getMethodName());
}
if (isCancelled())
return true;
/** Constant columns are not supported directly during aggregation.
* To make them work anyway, we materialize them.
*/
Columns materialized_columns;
/// Remember the columns we will work with
for (size_t i = 0; i < params.keys_size; ++i)
{
materialized_columns.push_back(columns.at(params.keys[i])->convertToFullColumnIfConst());
key_columns[i] = materialized_columns.back().get();
if (!result.isLowCardinality())
{
auto column_no_lc = recursiveRemoveLowCardinality(key_columns[i]->getPtr());
if (column_no_lc.get() != key_columns[i])
{
materialized_columns.emplace_back(std::move(column_no_lc));
key_columns[i] = materialized_columns.back().get();
}
}
}
NestedColumnsHolder nested_columns_holder;
AggregateFunctionInstructions aggregate_functions_instructions;
prepareAggregateInstructions(columns, aggregate_columns, materialized_columns, aggregate_functions_instructions, nested_columns_holder);
if (isCancelled())
return true;
@ -1112,7 +1138,39 @@ Block Aggregator::prepareBlockAndFill(
return res;
}
void Aggregator::fillAggregateColumnsWithSingleKey(
AggregatedDataVariants & data_variants,
MutableColumns & final_aggregate_columns)
{
AggregatedDataWithoutKey & data = data_variants.without_key;
for (size_t i = 0; i < params.aggregates_size; ++i)
{
ColumnAggregateFunction & column_aggregate_func = assert_cast<ColumnAggregateFunction &>(*final_aggregate_columns[i]);
for (auto & pool : data_variants.aggregates_pools)
{
column_aggregate_func.addArena(pool);
}
column_aggregate_func.getData().push_back(data + offsets_of_aggregate_states[i]);
}
data = nullptr;
}
void Aggregator::createStatesAndFillKeyColumnsWithSingleKey(
AggregatedDataVariants & data_variants,
Columns & key_columns,
size_t key_row,
MutableColumns & final_key_columns)
{
AggregateDataPtr place = data_variants.aggregates_pool->alignedAlloc(total_size_of_aggregate_states, align_aggregate_states);
createAggregateStates(place);
data_variants.without_key = place;
for (size_t i = 0; i < params.keys_size; ++i)
{
final_key_columns[i]->insertFrom(*key_columns[i].get(), key_row);
}
}
Block Aggregator::prepareBlockAndFillWithoutKey(AggregatedDataVariants & data_variants, bool final, bool is_overflows) const
{

View File

@ -1002,6 +1002,7 @@ protected:
friend class MergingAndConvertingBlockInputStream;
friend class ConvertingAggregatedToChunksTransform;
friend class ConvertingAggregatedToChunksSource;
friend class AggregatingInOrderTransform;
Params params;
@ -1033,12 +1034,13 @@ protected:
};
using AggregateFunctionInstructions = std::vector<AggregateFunctionInstruction>;
using NestedColumnsHolder = std::vector<std::vector<const IColumn *>>;
Sizes offsets_of_aggregate_states; /// The offset to the n-th aggregate function in a row of aggregate functions.
size_t total_size_of_aggregate_states = 0; /// The total size of the row from the aggregate functions.
// add info to track alignment requirement
// If there are states whose alignmentment are v1, ..vn, align_aggregate_states will be max(v1, ... vn)
// If there are states whose alignment are v1, ..vn, align_aggregate_states will be max(v1, ... vn)
size_t align_aggregate_states = 1;
bool all_aggregates_has_trivial_destructor = false;
@ -1105,6 +1107,13 @@ protected:
AggregateFunctionInstruction * aggregate_instructions,
Arena * arena);
static void executeOnIntervalWithoutKeyImpl(
AggregatedDataWithoutKey & res,
size_t row_begin,
size_t row_end,
AggregateFunctionInstruction * aggregate_instructions,
Arena * arena);
template <typename Method>
void writeToTemporaryFileImpl(
AggregatedDataVariants & data_variants,
@ -1250,6 +1259,22 @@ protected:
* - sets the variable no_more_keys to true.
*/
bool checkLimits(size_t result_size, bool & no_more_keys) const;
void prepareAggregateInstructions(
Columns columns,
AggregateColumns & aggregate_columns,
Columns & materialized_columns,
AggregateFunctionInstructions & instructions,
NestedColumnsHolder & nested_columns_holder);
void fillAggregateColumnsWithSingleKey(
AggregatedDataVariants & data_variants,
MutableColumns & final_aggregate_columns);
void createStatesAndFillKeyColumnsWithSingleKey(
AggregatedDataVariants & data_variants,
Columns & key_columns, size_t key_row,
MutableColumns & final_key_columns);
};

View File

@ -726,7 +726,8 @@ bool SelectQueryExpressionAnalyzer::appendWhere(ExpressionActionsChain & chain,
return true;
}
bool SelectQueryExpressionAnalyzer::appendGroupBy(ExpressionActionsChain & chain, bool only_types)
bool SelectQueryExpressionAnalyzer::appendGroupBy(ExpressionActionsChain & chain, bool only_types, bool optimize_aggregation_in_order,
ManyExpressionActions & group_by_elements_actions)
{
const auto * select_query = getAggregatingQuery();
@ -743,6 +744,20 @@ bool SelectQueryExpressionAnalyzer::appendGroupBy(ExpressionActionsChain & chain
getRootActions(ast, only_types, step.actions);
}
if (optimize_aggregation_in_order)
{
auto all_columns = sourceWithJoinedColumns();
for (auto & child : asts)
{
group_by_elements_actions.emplace_back(std::make_shared<ExpressionActions>(all_columns, context));
getRootActions(child, only_types, group_by_elements_actions.back());
}
// std::cerr << "group_by_elements_actions\n";
// for (const auto & elem : group_by_elements_actions) {
// std::cerr << elem->dumpActions() << "\n";
// }
}
return true;
}
@ -834,8 +849,11 @@ bool SelectQueryExpressionAnalyzer::appendOrderBy(ExpressionActionsChain & chain
order_by_elements_actions.emplace_back(std::make_shared<ExpressionActions>(all_columns, context));
getRootActions(child, only_types, order_by_elements_actions.back());
}
// std::cerr << "order_by_elements_actions\n";
// for (const auto & elem : order_by_elements_actions) {
// std::cerr << elem->dumpActions() << "\n";
// }
}
return true;
}
@ -1115,7 +1133,12 @@ ExpressionAnalysisResult::ExpressionAnalysisResult(
if (need_aggregate)
{
query_analyzer.appendGroupBy(chain, only_types || !first_stage);
/// TODO correct conditions
optimize_aggregation_in_order =
context.getSettingsRef().optimize_aggregation_in_order
&& storage && query.groupBy();
query_analyzer.appendGroupBy(chain, only_types || !first_stage, optimize_aggregation_in_order, group_by_elements_actions);
query_analyzer.appendAggregateFunctionsArguments(chain, only_types || !first_stage);
before_aggregation = chain.getLastActions();
@ -1128,13 +1151,13 @@ ExpressionAnalysisResult::ExpressionAnalysisResult(
}
}
bool has_stream_with_non_joned_rows = (before_join && before_join->getTableJoinAlgo()->hasStreamWithNonJoinedRows());
bool has_stream_with_non_joined_rows = (before_join && before_join->getTableJoinAlgo()->hasStreamWithNonJoinedRows());
optimize_read_in_order =
settings.optimize_read_in_order
&& storage && query.orderBy()
&& !query_analyzer.hasAggregation()
&& !query.final()
&& !has_stream_with_non_joned_rows;
&& !has_stream_with_non_joined_rows;
/// If there is aggregation, we execute expressions in SELECT and ORDER BY on the initiating server, otherwise on the source servers.
query_analyzer.appendSelect(chain, only_types || (need_aggregate ? !second_stage : !first_stage));

View File

@ -174,6 +174,7 @@ struct ExpressionAnalysisResult
bool remove_where_filter = false;
bool optimize_read_in_order = false;
bool optimize_aggregation_in_order = false;
ExpressionActionsPtr before_join; /// including JOIN
ExpressionActionsPtr before_where;
@ -195,6 +196,7 @@ struct ExpressionAnalysisResult
ConstantFilterDescription where_constant_filter_description;
/// Actions by every element of ORDER BY
ManyExpressionActions order_by_elements_actions;
ManyExpressionActions group_by_elements_actions;
ExpressionAnalysisResult() = default;
@ -303,7 +305,7 @@ private:
/// Columns in `additional_required_columns` will not be removed (they can be used for e.g. sampling or FINAL modifier).
bool appendPrewhere(ExpressionActionsChain & chain, bool only_types, const Names & additional_required_columns);
bool appendWhere(ExpressionActionsChain & chain, bool only_types);
bool appendGroupBy(ExpressionActionsChain & chain, bool only_types);
bool appendGroupBy(ExpressionActionsChain & chain, bool only_types, bool optimize_aggregation_in_order, ManyExpressionActions &);
void appendAggregateFunctionsArguments(ExpressionActionsChain & chain, bool only_types);
/// After aggregation:

View File

@ -71,6 +71,8 @@
#include <Processors/Pipe.h>
#include <Processors/Sources/SourceFromSingleChunk.h>
#include <Processors/Transforms/ConvertingTransform.h>
#include <Processors/Transforms/AggregatingInOrderTransform.h>
#include <Processors/Merges/AggregatingSortedTransform.h>
namespace DB
@ -601,6 +603,20 @@ static SortDescription getSortDescription(const ASTSelectQuery & query, const Co
return order_descr;
}
static SortDescription getSortDescriptionFromGroupBy(const ASTSelectQuery & query)
{
SortDescription order_descr;
order_descr.reserve(query.groupBy()->children.size());
for (const auto & elem : query.groupBy()->children)
{
String name = elem->getColumnName();
order_descr.emplace_back(name, 1, 1);
}
return order_descr;
}
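Each GROUP BY element thus becomes an ascending sort column. A hedged illustration, assuming a hypothetical table t with sorting key (a, b, c):

-- GROUP BY (a, b) yields the sort description (a ASC, b ASC), a prefix of the
-- sorting key, which ReadInOrderOptimizer can match to enable in-order reads:
SELECT a, b, count() FROM t GROUP BY a, b SETTINGS optimize_aggregation_in_order = 1;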
static UInt64 getLimitUIntValue(const ASTPtr & node, const Context & context, const std::string & expr)
{
const auto & [field, type] = evaluateConstantExpression(node, context);
@ -739,7 +755,7 @@ void InterpreterSelectQuery::executeImpl(QueryPipeline & pipeline, const BlockIn
if (!expressions.second_stage && !expressions.need_aggregate && !expressions.hasHaving())
{
if (expressions.has_order_by)
executeOrder(pipeline, query_info.input_sorting_info);
executeOrder(pipeline, query_info.input_order_info);
if (expressions.has_order_by && query.limitLength())
executeDistinct(pipeline, false, expressions.selected_columns);
@ -832,7 +848,11 @@ void InterpreterSelectQuery::executeImpl(QueryPipeline & pipeline, const BlockIn
executeWhere(pipeline, expressions.before_where, expressions.remove_where_filter);
if (expressions.need_aggregate)
executeAggregation(pipeline, expressions.before_aggregation, aggregate_overflow_row, aggregate_final);
{
executeAggregation(pipeline, expressions.before_aggregation, aggregate_overflow_row, aggregate_final, query_info.input_order_info);
/// We need to reset input order info, so that executeOrder can't use it
query_info.input_order_info.reset();
}
else
{
executeExpression(pipeline, expressions.before_order_and_select);
@ -898,7 +918,7 @@ void InterpreterSelectQuery::executeImpl(QueryPipeline & pipeline, const BlockIn
if (!expressions.first_stage && !expressions.need_aggregate && !(query.group_by_with_totals && !aggregate_final))
executeMergeSorted(pipeline);
else /// Otherwise, just sort.
executeOrder(pipeline, query_info.input_sorting_info);
executeOrder(pipeline, query_info.input_order_info);
}
/** Optimization - if there are several sources and there is LIMIT, then first apply the preliminary LIMIT,
@ -1262,15 +1282,21 @@ void InterpreterSelectQuery::executeFetchColumns(
query_info.prewhere_info = prewhere_info;
/// Create optimizer with prepared actions.
/// Maybe we will need to calc input_sorting_info later, e.g. while reading from StorageMerge.
if (analysis_result.optimize_read_in_order)
/// Maybe we will need to calc input_order_info later, e.g. while reading from StorageMerge.
if (analysis_result.optimize_read_in_order || analysis_result.optimize_aggregation_in_order)
{
query_info.order_by_optimizer = std::make_shared<ReadInOrderOptimizer>(
analysis_result.order_by_elements_actions,
getSortDescription(query, *context),
query_info.syntax_analyzer_result);
if (analysis_result.optimize_read_in_order)
query_info.order_optimizer = std::make_shared<ReadInOrderOptimizer>(
analysis_result.order_by_elements_actions,
getSortDescription(query, *context),
query_info.syntax_analyzer_result);
else
query_info.order_optimizer = std::make_shared<ReadInOrderOptimizer>(
analysis_result.group_by_elements_actions,
getSortDescriptionFromGroupBy(query),
query_info.syntax_analyzer_result);
query_info.input_sorting_info = query_info.order_by_optimizer->getInputOrder(storage);
query_info.input_order_info = query_info.order_optimizer->getInputOrder(storage);
}
Pipes pipes = storage->read(required_columns, query_info, *context, processing_stage, max_block_size, max_streams);
@ -1376,7 +1402,7 @@ void InterpreterSelectQuery::executeWhere(QueryPipeline & pipeline, const Expres
}
void InterpreterSelectQuery::executeAggregation(QueryPipeline & pipeline, const ExpressionActionsPtr & expression, bool overflow_row, bool final)
void InterpreterSelectQuery::executeAggregation(QueryPipeline & pipeline, const ExpressionActionsPtr & expression, bool overflow_row, bool final, InputOrderInfoPtr group_by_info)
{
pipeline.addSimpleTransform([&](const Block & header)
{
@ -1414,6 +1440,62 @@ void InterpreterSelectQuery::executeAggregation(QueryPipeline & pipeline, const
/// Forget about current totals and extremes. They will be calculated again after aggregation if needed.
pipeline.dropTotalsAndExtremes();
if (group_by_info && settings.optimize_aggregation_in_order)
{
auto & query = getSelectQuery();
SortDescription group_by_descr = getSortDescriptionFromGroupBy(query);
bool need_finish_sorting = (group_by_info->order_key_prefix_descr.size() < group_by_descr.size());
if (need_finish_sorting)
{
/// Finish sorting here would be too slow; fall back to regular aggregation below.
}
else
{
if (pipeline.getNumStreams() > 1)
{
auto many_data = std::make_shared<ManyAggregatedData>(pipeline.getNumStreams());
size_t counter = 0;
pipeline.addSimpleTransform([&](const Block & header)
{
return std::make_shared<AggregatingInOrderTransform>(header, transform_params, group_by_descr, settings.max_block_size, many_data, counter++);
});
for (auto & column_description : group_by_descr)
{
if (!column_description.column_name.empty())
{
column_description.column_number = pipeline.getHeader().getPositionByName(column_description.column_name);
column_description.column_name.clear();
}
}
auto transform = std::make_shared<AggregatingSortedTransform>(
pipeline.getHeader(),
pipeline.getNumStreams(),
group_by_descr,
settings.max_block_size);
pipeline.addPipe({ std::move(transform) });
}
else
{
pipeline.addSimpleTransform([&](const Block & header)
{
return std::make_shared<AggregatingInOrderTransform>(header, transform_params, group_by_descr, settings.max_block_size);
});
}
pipeline.addSimpleTransform([&](const Block & header)
{
return std::make_shared<FinalizingSimpleTransform>(header, transform_params);
});
pipeline.enableQuotaForCurrentStreams();
return;
}
}
/// If there are several sources, then we perform parallel aggregation
if (pipeline.getNumStreams() > 1)
{
@ -1576,7 +1658,47 @@ void InterpreterSelectQuery::executeExpression(QueryPipeline & pipeline, const E
}
void InterpreterSelectQuery::executeOrder(QueryPipeline & pipeline, InputSortingInfoPtr input_sorting_info)
void InterpreterSelectQuery::executeOrderOptimized(QueryPipeline & pipeline, InputOrderInfoPtr input_sorting_info, UInt64 limit, SortDescription & output_order_descr)
{
const Settings & settings = context->getSettingsRef();
bool need_finish_sorting = (input_sorting_info->order_key_prefix_descr.size() < output_order_descr.size());
if (pipeline.getNumStreams() > 1)
{
UInt64 limit_for_merging = (need_finish_sorting ? 0 : limit);
auto transform = std::make_shared<MergingSortedTransform>(
pipeline.getHeader(),
pipeline.getNumStreams(),
input_sorting_info->order_key_prefix_descr,
settings.max_block_size, limit_for_merging);
pipeline.addPipe({ std::move(transform) });
}
pipeline.enableQuotaForCurrentStreams();
if (need_finish_sorting)
{
pipeline.addSimpleTransform([&](const Block & header, QueryPipeline::StreamType stream_type) -> ProcessorPtr
{
if (stream_type != QueryPipeline::StreamType::Main)
return nullptr;
return std::make_shared<PartialSortingTransform>(header, output_order_descr, limit);
});
/// NOTE limits are not applied to the size of temporary sets in FinishSortingTransform
pipeline.addSimpleTransform([&](const Block & header) -> ProcessorPtr
{
return std::make_shared<FinishSortingTransform>(
header, input_sorting_info->order_key_prefix_descr,
output_order_descr, settings.max_block_size, limit);
});
}
}
void InterpreterSelectQuery::executeOrder(QueryPipeline & pipeline, InputOrderInfoPtr input_sorting_info)
{
auto & query = getSelectQuery();
SortDescription output_order_descr = getSortDescription(query, *context);
@ -1596,43 +1718,7 @@ void InterpreterSelectQuery::executeOrder(QueryPipeline & pipeline, InputSorting
* and then merge them into one sorted stream.
* At this stage we merge per-thread streams into one.
*/
bool need_finish_sorting = (input_sorting_info->order_key_prefix_descr.size() < output_order_descr.size());
if (pipeline.getNumStreams() > 1)
{
UInt64 limit_for_merging = (need_finish_sorting ? 0 : limit);
auto transform = std::make_shared<MergingSortedTransform>(
pipeline.getHeader(),
pipeline.getNumStreams(),
input_sorting_info->order_key_prefix_descr,
settings.max_block_size, limit_for_merging);
pipeline.addPipe({ std::move(transform) });
}
pipeline.enableQuotaForCurrentStreams();
if (need_finish_sorting)
{
pipeline.addSimpleTransform([&](const Block & header, QueryPipeline::StreamType stream_type) -> ProcessorPtr
{
if (stream_type != QueryPipeline::StreamType::Main)
return nullptr;
return std::make_shared<PartialSortingTransform>(header, output_order_descr, limit);
});
/// NOTE limits are not applied to the size of temporary sets in FinishSortingTransform
pipeline.addSimpleTransform([&](const Block & header) -> ProcessorPtr
{
return std::make_shared<FinishSortingTransform>(
header, input_sorting_info->order_key_prefix_descr,
output_order_descr, settings.max_block_size, limit);
});
}
executeOrderOptimized(pipeline, input_sorting_info, limit, output_order_descr);
return;
}
@ -1905,8 +1991,8 @@ void InterpreterSelectQuery::executeExtremes(QueryPipeline & pipeline)
void InterpreterSelectQuery::executeSubqueriesInSetsAndJoins(QueryPipeline & pipeline, const SubqueriesForSets & subqueries_for_sets)
{
if (query_info.input_sorting_info)
executeMergeSorted(pipeline, query_info.input_sorting_info->order_key_prefix_descr, 0);
if (query_info.input_order_info)
executeMergeSorted(pipeline, query_info.input_order_info->order_key_prefix_descr, 0);
const Settings & settings = context->getSettingsRef();

View File

@ -113,12 +113,13 @@ private:
const Names & columns_to_remove_after_prewhere);
void executeWhere(QueryPipeline & pipeline, const ExpressionActionsPtr & expression, bool remove_filter);
void executeAggregation(QueryPipeline & pipeline, const ExpressionActionsPtr & expression, bool overflow_row, bool final);
void executeAggregation(QueryPipeline & pipeline, const ExpressionActionsPtr & expression, bool overflow_row, bool final, InputOrderInfoPtr group_by_info);
void executeMergeAggregated(QueryPipeline & pipeline, bool overflow_row, bool final);
void executeTotalsAndHaving(QueryPipeline & pipeline, bool has_having, const ExpressionActionsPtr & expression, bool overflow_row, bool final);
void executeHaving(QueryPipeline & pipeline, const ExpressionActionsPtr & expression);
static void executeExpression(QueryPipeline & pipeline, const ExpressionActionsPtr & expression);
void executeOrder(QueryPipeline & pipeline, InputSortingInfoPtr sorting_info);
void executeOrder(QueryPipeline & pipeline, InputOrderInfoPtr sorting_info);
void executeOrderOptimized(QueryPipeline & pipeline, InputOrderInfoPtr sorting_info, UInt64 limit, SortDescription & output_order_descr);
void executeWithFill(QueryPipeline & pipeline);
void executeMergeSorted(QueryPipeline & pipeline);
void executePreLimit(QueryPipeline & pipeline, bool do_not_skip_offset);

View File

@ -158,11 +158,11 @@ public:
static std::string statusToName(Status status);
/** Method 'prepare' is responsible for all cheap ("instantenous": O(1) of data volume, no wait) calculations.
/** Method 'prepare' is responsible for all cheap ("instantaneous": O(1) of data volume, no wait) calculations.
*
* It may access input and output ports,
* indicate the need for work by another processor by returning NeedData or PortFull,
* or indicate the absense of work by returning Finished or Unneeded,
* or indicate the absence of work by returning Finished or Unneeded,
* it may pull data from input ports and push data to output ports.
*
* The method is not thread-safe and must be called from a single thread in one moment of time,

View File

@ -0,0 +1,244 @@
#include <Processors/Transforms/AggregatingInOrderTransform.h>
#include <DataTypes/DataTypeLowCardinality.h>
namespace DB
{
AggregatingInOrderTransform::AggregatingInOrderTransform(
Block header, AggregatingTransformParamsPtr params_,
const SortDescription & group_by_description_, size_t res_block_size_)
: AggregatingInOrderTransform(std::move(header), std::move(params_)
, group_by_description_, res_block_size_, std::make_unique<ManyAggregatedData>(1), 0)
{
}
AggregatingInOrderTransform::AggregatingInOrderTransform(
Block header, AggregatingTransformParamsPtr params_,
const SortDescription & group_by_description_, size_t res_block_size_,
ManyAggregatedDataPtr many_data_, size_t current_variant)
: IProcessor({std::move(header)}, {params_->getCustomHeader(false)})
, res_block_size(res_block_size_)
, params(std::move(params_))
, group_by_description(group_by_description_)
, aggregate_columns(params->params.aggregates_size)
, many_data(std::move(many_data_))
, variants(*many_data->variants[current_variant])
{
/// We won't finalize the states here, so that equal states (generated by multi-threaded execution) can be merged in AggregatingSortedTransform.
res_header = params->getCustomHeader(false);
/// Replace column names with column positions in group_by_description.
for (auto & column_description : group_by_description)
{
if (!column_description.column_name.empty())
{
column_description.column_number = res_header.getPositionByName(column_description.column_name);
column_description.column_name.clear();
}
}
}
AggregatingInOrderTransform::~AggregatingInOrderTransform() = default;
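/// Returns true if row 'i' of 'lhs' sorts strictly before row 'j' of 'rhs' according to 'descr'.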
static bool less(const MutableColumns & lhs, const Columns & rhs, size_t i, size_t j, const SortDescription & descr)
{
for (const auto & elem : descr)
{
size_t ind = elem.column_number;
int res = elem.direction * lhs[ind]->compareAt(i, j, *rhs[ind], elem.nulls_direction);
if (res < 0)
return true;
else if (res > 0)
return false;
}
return false;
}
void AggregatingInOrderTransform::consume(Chunk chunk)
{
size_t rows = chunk.getNumRows();
if (rows == 0)
return;
if (!is_consume_started)
{
LOG_TRACE(log, "Aggregating in order");
is_consume_started = true;
}
src_rows += rows;
src_bytes += chunk.bytes();
Columns materialized_columns;
Columns key_columns(params->params.keys_size);
for (size_t i = 0; i < params->params.keys_size; ++i)
{
materialized_columns.push_back(chunk.getColumns().at(params->params.keys[i])->convertToFullColumnIfConst());
key_columns[i] = materialized_columns.back();
}
Aggregator::NestedColumnsHolder nested_columns_holder;
Aggregator::AggregateFunctionInstructions aggregate_function_instructions;
params->aggregator.prepareAggregateInstructions(chunk.getColumns(), aggregate_columns, materialized_columns, aggregate_function_instructions, nested_columns_holder);
size_t key_end = 0;
size_t key_begin = 0;
/// If we don't have a block yet, create it and fill the key columns with the first key
if (!cur_block_size)
{
res_key_columns.resize(params->params.keys_size);
res_aggregate_columns.resize(params->params.aggregates_size);
for (size_t i = 0; i < params->params.keys_size; ++i)
{
res_key_columns[i] = res_header.safeGetByPosition(i).type->createColumn();
}
for (size_t i = 0; i < params->params.aggregates_size; ++i)
{
res_aggregate_columns[i] = res_header.safeGetByPosition(i + params->params.keys_size).type->createColumn();
}
params->aggregator.createStatesAndFillKeyColumnsWithSingleKey(variants, key_columns, key_begin, res_key_columns);
++cur_block_size;
}
size_t mid = 0;
size_t high = 0;
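/// 'low' starts at -1: unsigned wraparound makes 'high - low' and '(low + high) / 2' behave as in signed arithmetic.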
size_t low = -1;
/// Will split block into segments with the same key
while (key_end != rows)
{
high = rows;
/// Find the first position of new (not current) key in current chunk
while (high - low > 1)
{
mid = (low + high) / 2;
if (!less(res_key_columns, key_columns, cur_block_size - 1, mid, group_by_description))
low = mid;
else
high = mid;
}
key_end = high;
/// Add data to the aggregation state if the interval is not empty. It is empty when the current key was not found in the new block.
if (key_begin != key_end)
{
params->aggregator.executeOnIntervalWithoutKeyImpl(variants.without_key, key_begin, key_end, aggregate_function_instructions.data(), variants.aggregates_pool);
}
low = key_begin = key_end;
/// We finalize the aggregation state of the last key if a new key was found.
if (key_begin != rows)
{
params->aggregator.fillAggregateColumnsWithSingleKey(variants, res_aggregate_columns);
/// If res_block_size is reached, we have to stop consuming and generate the block. Save the extra rows into a new chunk.
if (cur_block_size == res_block_size)
{
Columns source_columns = chunk.detachColumns();
for (auto & source_column : source_columns)
source_column = source_column->cut(key_begin, rows - key_begin);
current_chunk = Chunk(source_columns, rows - key_begin);
block_end_reached = true;
need_generate = true;
cur_block_size = 0;
return;
}
/// We create a new state for the new key and update res_key_columns
params->aggregator.createStatesAndFillKeyColumnsWithSingleKey(variants, key_columns, key_begin, res_key_columns);
++cur_block_size;
}
}
block_end_reached = false;
}
void AggregatingInOrderTransform::work()
{
if (is_consume_finished || need_generate)
{
generate();
}
else
{
consume(std::move(current_chunk));
}
}
IProcessor::Status AggregatingInOrderTransform::prepare()
{
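/// Single-input, single-output port negotiation. 'block_end_reached' means a result block was filled mid-chunk and the tail of the input chunk is still pending in current_chunk.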
auto & output = outputs.front();
auto & input = inputs.back();
/// Check can output.
if (output.isFinished())
{
input.close();
return Status::Finished;
}
if (!output.canPush())
{
input.setNotNeeded();
return Status::PortFull;
}
if (block_end_reached)
{
if (need_generate)
{
return Status::Ready;
}
else
{
output.push(std::move(to_push_chunk));
return Status::Ready;
}
}
else
{
if (is_consume_finished)
{
output.push(std::move(to_push_chunk));
output.finish();
LOG_TRACE(log, "Aggregated. {} to {} rows (from {})", src_rows, res_rows,
formatReadableSizeWithBinarySuffix(src_bytes));
return Status::Finished;
}
if (input.isFinished())
{
is_consume_finished = true;
return Status::Ready;
}
}
if (!input.hasData())
{
input.setNeeded();
return Status::NeedData;
}
current_chunk = input.pull(!is_consume_finished);
return Status::Ready;
}
void AggregatingInOrderTransform::generate()
{
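/// If the input ended while a result block is still open, finalize the aggregation state of its last key.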
if (cur_block_size && is_consume_finished)
params->aggregator.fillAggregateColumnsWithSingleKey(variants, res_aggregate_columns);
Block res = res_header.cloneEmpty();
for (size_t i = 0; i < res_key_columns.size(); ++i)
{
res.getByPosition(i).column = std::move(res_key_columns[i]);
}
for (size_t i = 0; i < res_aggregate_columns.size(); ++i)
{
res.getByPosition(i + res_key_columns.size()).column = std::move(res_aggregate_columns[i]);
}
to_push_chunk = convertToChunk(res);
res_rows += to_push_chunk.getNumRows();
need_generate = false;
}
}

View File

@ -0,0 +1,92 @@
#pragma once
#include <Core/SortDescription.h>
#include <Interpreters/Aggregator.h>
#include <Processors/ISimpleTransform.h>
#include <Processors/Transforms/AggregatingTransform.h>
#include <Processors/Transforms/TotalsHavingTransform.h>
namespace DB
{
class AggregatingInOrderTransform : public IProcessor
{
public:
AggregatingInOrderTransform(Block header, AggregatingTransformParamsPtr params,
const SortDescription & group_by_description, size_t res_block_size,
ManyAggregatedDataPtr many_data, size_t current_variant);
AggregatingInOrderTransform(Block header, AggregatingTransformParamsPtr params,
const SortDescription & group_by_description, size_t res_block_size);
~AggregatingInOrderTransform() override;
String getName() const override { return "AggregatingInOrderTransform"; }
Status prepare() override;
void work() override;
void consume(Chunk chunk);
private:
void generate();
size_t res_block_size;
size_t cur_block_size = 0;
MutableColumns res_key_columns;
MutableColumns res_aggregate_columns;
AggregatingTransformParamsPtr params;
SortDescription group_by_description;
Aggregator::AggregateColumns aggregate_columns;
ManyAggregatedDataPtr many_data;
AggregatedDataVariants & variants;
UInt64 src_rows = 0;
UInt64 src_bytes = 0;
UInt64 res_rows = 0;
bool need_generate = false;
bool block_end_reached = false;
bool is_consume_started = false;
bool is_consume_finished = false;
Block res_header;
Chunk current_chunk;
Chunk to_push_chunk;
Poco::Logger * log = &Poco::Logger::get("AggregatingInOrderTransform");
};
class FinalizingSimpleTransform : public ISimpleTransform
{
public:
FinalizingSimpleTransform(Block header, AggregatingTransformParamsPtr params_)
: ISimpleTransform({std::move(header)}, {params_->getHeader()}, true)
, params(params_) {}
void transform(Chunk & chunk) override
{
if (params->final)
finalizeChunk(chunk);
else if (!chunk.getChunkInfo())
{
auto info = std::make_shared<AggregatedChunkInfo>();
chunk.setChunkInfo(std::move(info));
}
}
String getName() const override { return "FinalizingSimpleTransform"; }
private:
AggregatingTransformParamsPtr params;
};
}

View File

@ -19,23 +19,23 @@ namespace ErrorCodes
extern const int LOGICAL_ERROR;
}
/// Convert block to chunk.
/// Adds additional info about aggregation.
Chunk convertToChunk(const Block & block)
{
auto info = std::make_shared<AggregatedChunkInfo>();
info->bucket_num = block.info.bucket_num;
info->is_overflows = block.info.is_overflows;
UInt64 num_rows = block.rows();
Chunk chunk(block.getColumns(), num_rows);
chunk.setChunkInfo(std::move(info));
return chunk;
}
namespace
{
/// Convert block to chunk.
/// Adds additional info about aggregation.
Chunk convertToChunk(const Block & block)
{
auto info = std::make_shared<AggregatedChunkInfo>();
info->bucket_num = block.info.bucket_num;
info->is_overflows = block.info.is_overflows;
UInt64 num_rows = block.rows();
Chunk chunk(block.getColumns(), num_rows);
chunk.setChunkInfo(std::move(info));
return chunk;
}
const AggregatedChunkInfo * getInfoFromChunk(const Chunk & chunk)
{
const auto & info = chunk.getChunkInfo();

View File

@ -28,6 +28,8 @@ struct AggregatingTransformParams
: params(params_), aggregator(params), final(final_) {}
Block getHeader() const { return aggregator.getHeader(final); }
Block getCustomHeader(bool final_) const { return aggregator.getHeader(final_); }
};
struct ManyAggregatedData
@ -117,4 +119,6 @@ private:
void initGenerate();
};
Chunk convertToChunk(const Block & block);
}

View File

@ -112,7 +112,7 @@ void FinishSortingTransform::consume(Chunk chunk)
}
}
/// If we reach here, that means that current cunk is first in portion
/// If we reach here, that means that current chunk is first in portion
/// or it all consists of rows with the same key as tail of a previous chunk.
chunks.push_back(std::move(chunk));
}

View File

@ -1,5 +1,6 @@
#include <Processors/ISimpleTransform.h>
#pragma once
#include <Processors/ISimpleTransform.h>
#include <Common/Arena.h>
namespace DB

View File

@ -134,6 +134,7 @@ SRCS(
Transforms/RollupTransform.cpp
Transforms/SortingTransform.cpp
Transforms/TotalsHavingTransform.cpp
Transforms/AggregatingInOrderTransform.cpp
)
END()

View File

@ -637,9 +637,9 @@ Pipes MergeTreeDataSelectExecutor::readFromParts(
reader_settings,
result_projection);
}
else if (settings.optimize_read_in_order && query_info.input_sorting_info)
else if ((settings.optimize_read_in_order || settings.optimize_aggregation_in_order) && query_info.input_order_info)
{
size_t prefix_size = query_info.input_sorting_info->order_key_prefix_descr.size();
size_t prefix_size = query_info.input_order_info->order_key_prefix_descr.size();
auto order_key_prefix_ast = data.getSortingKey().expression_list_ast->clone();
order_key_prefix_ast->children.resize(prefix_size);
@ -855,7 +855,8 @@ Pipes MergeTreeDataSelectExecutor::spreadMarkRangesAmongStreamsWithOrder(
ExpressionActionsPtr & out_projection) const
{
size_t sum_marks = 0;
const InputSortingInfoPtr & input_sorting_info = query_info.input_sorting_info;
const InputOrderInfoPtr & input_order_info = query_info.input_order_info;
size_t adaptive_parts = 0;
std::vector<size_t> sum_marks_in_parts(parts.size());
const auto data_settings = data.getSettings();
@ -998,10 +999,9 @@ Pipes MergeTreeDataSelectExecutor::spreadMarkRangesAmongStreamsWithOrder(
}
parts.emplace_back(part);
}
ranges_to_get_from_part = split_ranges(ranges_to_get_from_part, input_order_info->direction);
ranges_to_get_from_part = split_ranges(ranges_to_get_from_part, input_sorting_info->direction);
if (input_sorting_info->direction == 1)
if (input_order_info->direction == 1)
{
pipes.emplace_back(std::make_shared<MergeTreeSelectProcessor>(
data, part.data_part, max_block_size, settings.preferred_block_size_bytes,
@ -1024,9 +1024,9 @@ Pipes MergeTreeDataSelectExecutor::spreadMarkRangesAmongStreamsWithOrder(
if (pipes.size() > 1)
{
SortDescription sort_description;
for (size_t j = 0; j < input_sorting_info->order_key_prefix_descr.size(); ++j)
for (size_t j = 0; j < input_order_info->order_key_prefix_descr.size(); ++j)
sort_description.emplace_back(data.getSortingKey().column_names[j],
input_sorting_info->direction, 1);
input_order_info->direction, 1);
/// Drop temporary columns, added by 'sorting_key_prefix_expr'
out_projection = createProjection(pipes.back(), data);

View File

@ -30,7 +30,7 @@ ReadInOrderOptimizer::ReadInOrderOptimizer(
forbidden_columns.insert(elem.first);
}
InputSortingInfoPtr ReadInOrderOptimizer::getInputOrder(const StoragePtr & storage) const
InputOrderInfoPtr ReadInOrderOptimizer::getInputOrder(const StoragePtr & storage) const
{
Names sorting_key_columns;
if (const auto * merge_tree = dynamic_cast<const MergeTreeData *>(storage.get()))
@ -122,7 +122,7 @@ InputSortingInfoPtr ReadInOrderOptimizer::getInputOrder(const StoragePtr & stora
if (order_key_prefix_descr.empty())
return {};
return std::make_shared<InputSortingInfo>(std::move(order_key_prefix_descr), read_direction);
return std::make_shared<InputOrderInfo>(std::move(order_key_prefix_descr), read_direction);
}
}

View File

@ -20,10 +20,10 @@ public:
const SortDescription & required_sort_description,
const SyntaxAnalyzerResultPtr & syntax_result);
InputSortingInfoPtr getInputOrder(const StoragePtr & storage) const;
InputOrderInfoPtr getInputOrder(const StoragePtr & storage) const;
private:
/// Actions for every element of order expression to analyze functions for monotonicicy
/// Actions for every element of order expression to analyze functions for monotonicity
ManyExpressionActions elements_actions;
NameSet forbidden_columns;
SortDescription required_sort_description;

View File

@ -2,6 +2,7 @@
#include <Interpreters/PreparedSets.h>
#include <Core/SortDescription.h>
#include <Core/Names.h>
#include <memory>
namespace DB
@ -35,25 +36,25 @@ struct FilterInfo
bool do_remove_column = false;
};
struct InputSortingInfo
struct InputOrderInfo
{
SortDescription order_key_prefix_descr;
int direction;
InputSortingInfo(const SortDescription & order_key_prefix_descr_, int direction_)
InputOrderInfo(const SortDescription & order_key_prefix_descr_, int direction_)
: order_key_prefix_descr(order_key_prefix_descr_), direction(direction_) {}
bool operator ==(const InputSortingInfo & other) const
bool operator ==(const InputOrderInfo & other) const
{
return order_key_prefix_descr == other.order_key_prefix_descr && direction == other.direction;
}
bool operator !=(const InputSortingInfo & other) const { return !(*this == other); }
bool operator !=(const InputOrderInfo & other) const { return !(*this == other); }
};
using PrewhereInfoPtr = std::shared_ptr<PrewhereInfo>;
using FilterInfoPtr = std::shared_ptr<FilterInfo>;
using InputSortingInfoPtr = std::shared_ptr<const InputSortingInfo>;
using InputOrderInfoPtr = std::shared_ptr<const InputOrderInfo>;
struct SyntaxAnalyzerResult;
using SyntaxAnalyzerResultPtr = std::shared_ptr<const SyntaxAnalyzerResult>;
@ -61,6 +62,7 @@ using SyntaxAnalyzerResultPtr = std::shared_ptr<const SyntaxAnalyzerResult>;
class ReadInOrderOptimizer;
using ReadInOrderOptimizerPtr = std::shared_ptr<const ReadInOrderOptimizer>;
/** Query along with some additional data,
* that can be used during query processing
* inside storage engines.
@ -73,9 +75,9 @@ struct SelectQueryInfo
PrewhereInfoPtr prewhere_info;
ReadInOrderOptimizerPtr order_by_optimizer;
ReadInOrderOptimizerPtr order_optimizer;
/// We can modify it while reading from storage
mutable InputSortingInfoPtr input_sorting_info;
mutable InputOrderInfoPtr input_order_info;
/// Prepared sets are used for indices by storage engine.
/// Example: x IN (1, 2, 3)

View File

@ -171,8 +171,8 @@ Pipes StorageBuffer::read(
if (dst_has_same_structure)
{
if (query_info.order_by_optimizer)
query_info.input_sorting_info = query_info.order_by_optimizer->getInputOrder(destination);
if (query_info.order_optimizer)
query_info.input_order_info = query_info.order_optimizer->getInputOrder(destination);
/// The destination table has the same structure of the requested columns and we can simply read blocks from there.
pipes_from_dst = destination->read(column_names, query_info, context, processed_stage, max_block_size, num_streams);

View File

@ -180,8 +180,8 @@ Pipes StorageMaterializedView::read(
auto lock = storage->lockStructureForShare(
false, context.getCurrentQueryId(), context.getSettingsRef().lock_acquire_timeout);
if (query_info.order_by_optimizer)
query_info.input_sorting_info = query_info.order_by_optimizer->getInputOrder(storage);
if (query_info.order_optimizer)
query_info.input_order_info = query_info.order_optimizer->getInputOrder(storage);
Pipes pipes = storage->read(column_names, query_info, context, processed_stage, max_block_size, num_streams);

View File

@ -172,12 +172,12 @@ Pipes StorageMerge::read(
num_streams *= num_streams_multiplier;
size_t remaining_streams = num_streams;
InputSortingInfoPtr input_sorting_info;
if (query_info.order_by_optimizer)
InputOrderInfoPtr input_sorting_info;
if (query_info.order_optimizer)
{
for (auto it = selected_tables.begin(); it != selected_tables.end(); ++it)
{
auto current_info = query_info.order_by_optimizer->getInputOrder(std::get<0>(*it));
auto current_info = query_info.order_optimizer->getInputOrder(std::get<0>(*it));
if (it == selected_tables.begin())
input_sorting_info = current_info;
else if (!current_info || (input_sorting_info && *current_info != *input_sorting_info))
@ -187,7 +187,7 @@ Pipes StorageMerge::read(
break;
}
query_info.input_sorting_info = input_sorting_info;
query_info.input_order_info = input_sorting_info;
}
for (const auto & table : selected_tables)

View File

@ -0,0 +1,23 @@
<test>
<preconditions>
<table_exists>hits_10m_single</table_exists>
<table_exists>hits_100m_single</table_exists>
</preconditions>
<settings><optimize_aggregation_in_order>1</optimize_aggregation_in_order></settings>
<substitutions>
<substitution>
<name>table</name>
<values>
<value>hits_10m_single</value>
<value>hits_100m_single</value>
</values>
</substitution>
</substitutions>
<query>SELECT avg(length(URL)) as x from hits_100m_single GROUP BY CounterID FORMAT Null</query>
<query>SELECT avg(length(URL)) as x from {table} GROUP BY CounterID, EventDate FORMAT Null</query>
<query>SELECT avg(length(URL)) as x from hits_10m_single GROUP BY CounterID, EventDate, intHash32(UserID) FORMAT Null</query>
</test>

View File

@ -0,0 +1,41 @@
1 1
1 2
1 3
1 4
1 5
1 6
2 1
2 2
2 3
2 4
1
2
1 1 101 1
1 2 102 1
1 3 103 1
1 4 104 1
1 5 104 1
1 6 105 1
2 1 213 2
2 2 107 2
2 3 108 2
2 4 109 2
1 619 1
2 537 2
1 619 1
2 537 2
2019-05-05 00:00:00 -45363190
2019-05-05 00:00:00 -1249512288
2019-05-05 00:00:00 345522721
2019-05-05 00:00:00 486601715
2019-05-05 00:00:00 1449669396
2019-05-05 00:00:00 45
2019-05-06 00:00:00 46
2019-05-07 00:00:00 47
2019-05-08 00:00:00 48
2019-05-09 00:00:00 49
2019-05-05 00:00:00 0 1900940608
2019-05-06 00:00:00 1 1857737272
2019-05-07 00:00:00 2 1996614413
2019-05-08 00:00:00 3 1873725230
2019-05-09 00:00:00 4 1831412253

View File

@ -0,0 +1,33 @@
DROP TABLE IF EXISTS pk_order;
SET optimize_aggregation_in_order = 1;
CREATE TABLE pk_order(a UInt64, b UInt64, c UInt64, d UInt64) ENGINE=MergeTree() ORDER BY (a, b);
INSERT INTO pk_order(a, b, c, d) VALUES (1, 1, 101, 1), (1, 2, 102, 1), (1, 3, 103, 1), (1, 4, 104, 1);
INSERT INTO pk_order(a, b, c, d) VALUES (1, 5, 104, 1), (1, 6, 105, 1), (2, 1, 106, 2), (2, 1, 107, 2);
INSERT INTO pk_order(a, b, c, d) VALUES (2, 2, 107, 2), (2, 3, 108, 2), (2, 4, 109, 2);
-- The result order after GROUP BY with the in-order optimization is deterministic
SELECT a, b FROM pk_order GROUP BY a, b;
SELECT a FROM pk_order GROUP BY a;
SELECT a, b, sum(c), avg(d) FROM pk_order GROUP BY a, b;
SELECT a, sum(c), avg(d) FROM pk_order GROUP BY a;
SELECT a, sum(c), avg(d) FROM pk_order GROUP BY -a;
DROP TABLE IF EXISTS pk_order;
CREATE TABLE pk_order (d DateTime, a Int32, b Int32) ENGINE = MergeTree ORDER BY (d, a)
PARTITION BY toDate(d) SETTINGS index_granularity=1;
INSERT INTO pk_order
SELECT toDateTime('2019-05-05 00:00:00') + INTERVAL number % 10 DAY, number, intHash32(number) from numbers(100);
set max_block_size = 1;
SELECT d, max(b) FROM pk_order GROUP BY d, a LIMIT 5;
SELECT d, avg(a) FROM pk_order GROUP BY toString(d) LIMIT 5;
SELECT toStartOfHour(d) as d1, min(a), max(b) FROM pk_order GROUP BY d1 LIMIT 5;
DROP TABLE pk_order;

View File

@ -8,3 +8,13 @@
59183 85379
33010362 77807
800784 77492
1704509 523264
732797 475698
598875 337212
792887 252197
3807842 196036
25703952 147211
716829 90109
59183 85379
33010362 77807
800784 77492

View File

@ -1 +1,2 @@
SELECT CounterID, count() AS c FROM test.hits GROUP BY CounterID ORDER BY c DESC LIMIT 10
SELECT CounterID, count() AS c FROM test.hits GROUP BY CounterID ORDER BY c DESC LIMIT 10;
SELECT CounterID, count() AS c FROM test.hits GROUP BY CounterID ORDER BY c DESC LIMIT 10 SETTINGS optimize_aggregation_in_order = 1

View File

@ -98,3 +98,103 @@
7901143 10022 █▌
194599 9997 █▌
21052498 9780 █▍
1704509 523264 ████████████████████████████████████████████████████████████████████████████████
732797 475698 ████████████████████████████████████████████████████████████████████████▋
598875 337212 ███████████████████████████████████████████████████▌
792887 252197 ██████████████████████████████████████▌
3807842 196036 █████████████████████████████▊
25703952 147211 ██████████████████████▌
716829 90109 █████████████▋
59183 85379 █████████████
33010362 77807 ███████████▊
800784 77492 ███████████▋
20810645 73213 ███████████▏
25843850 68945 ██████████▌
23447120 67570 ██████████▎
14739804 64174 █████████▋
32077710 60456 █████████▏
22446879 58389 ████████▊
170282 57017 ████████▋
11482817 52345 ████████
63469 52142 ███████▊
29103473 47758 ███████▎
10136747 44080 ██████▋
27528801 43395 ██████▋
10581377 43279 ██████▌
9841201 40581 ██████▏
20310963 37562 █████▋
17337667 34301 █████▏
28600281 32776 █████
32046685 28788 ████▍
10130880 26603 ████
8676831 25733 ███▊
53230 25595 ███▊
20271226 25585 ███▊
17420663 25496 ███▊
631207 25270 ███▋
633130 24744 ███▋
14324015 23349 ███▌
8537965 21270 ███▎
11285298 20825 ███▏
14937615 20788 ███▏
185050 20785 ███▏
16368233 19897 ███
81602 19724 ███
62896 19717 ███
12967664 19402 ██▊
15996597 18557 ██▋
4379238 18370 ██▋
90982 17443 ██▋
18211045 17390 ██▋
14625884 17302 ██▋
12864910 17279 ██▋
126096 16959 ██▌
30296134 16849 ██▌
26360482 16175 ██▍
17788950 16017 ██▍
5928716 15340 ██▎
15469035 15171 ██▎
29732125 15146 ██▎
32946244 15104 ██▎
20957241 14719 ██▎
9495695 14584 ██▏
29241146 14540 ██▏
109805 14199 ██▏
26905788 13972 ██▏
212019 13930 ██▏
171509 13792 ██
23913162 13615 ██
1861993 13509 ██
125776 13308 ██
11312316 13181 ██
32667326 13181 ██
28628973 12922 █▊
122804 12520 █▊
12322758 12352 █▊
1301819 12283 █▊
10769545 12183 █▋
21566939 12170 █▋
28905364 12158 █▋
4250765 12049 █▋
15009727 11818 █▋
12761932 11733 █▋
26995888 11658 █▋
12759346 11514 █▋
1507911 11452 █▋
968488 11444 █▋
15736172 11358 █▋
54310 11193 █▋
17027391 11047 █▋
17439919 10936 █▋
4480860 10747 █▋
26738469 10738 █▋
9986231 10656 █▋
1539995 10655 █▋
214556 10625 █▌
219339 10522 █▌
3266 10503 █▌
30563429 10128 █▌
1960469 10098 █▌
7901143 10022 █▌
194599 9997 █▌
21052498 9780 █▍

View File

@ -1 +1,2 @@
SELECT CounterID, count() AS c, bar(c, 0, 523264) FROM test.hits GROUP BY CounterID ORDER BY c DESC, CounterID ASC LIMIT 100
SELECT CounterID, count() AS c, bar(c, 0, 523264) FROM test.hits GROUP BY CounterID ORDER BY c DESC, CounterID ASC LIMIT 100;
SELECT CounterID, count() AS c, bar(c, 0, 523264) FROM test.hits GROUP BY CounterID ORDER BY c DESC, CounterID ASC LIMIT 100 SETTINGS optimize_aggregation_in_order = 1

View File

@ -18,3 +18,23 @@
11482817 52345 я скачать игры
63469 52142 яндекс марте рокус надписями я любимому у полосы фото минск
29103473 47758
1704509 523264 نيك امريكي نيك افلام سكس جامد
732797 475698 نيك سكس سيحاق
598875 337212 سکس باصات
792887 252197 №2267 отзыв
3807842 196036 ярмаркетовара 200кг купить по неделю тебелье
25703952 147211
716829 90109 яндекс повыш
59183 85379 франция машину угловы крузер из кофе
33010362 77807 ярмаркетовара 200кг купить по неделю тебелье
800784 77492 ярмаркур смерти теплицы из чего
20810645 73213 ярмаркетовара 200кг купить по неделю тебе перево метиков детский
25843850 68945 электросчет-фактура
23447120 67570 южная степанов
14739804 64174 штангал волк
32077710 60456
22446879 58389 فیلم سكس امريكي نيك
170282 57017 ل افلام السكس
11482817 52345 я скачать игры
63469 52142 яндекс марте рокус надписями я любимому у полосы фото минск
29103473 47758

View File

@ -1 +1,2 @@
SELECT CounterID, count(), maxIf(SearchPhrase, notEmpty(SearchPhrase)) FROM test.hits GROUP BY CounterID ORDER BY count() DESC LIMIT 20
SELECT CounterID, count(), maxIf(SearchPhrase, notEmpty(SearchPhrase)) FROM test.hits GROUP BY CounterID ORDER BY count() DESC LIMIT 20;
SELECT CounterID, count(), maxIf(SearchPhrase, notEmpty(SearchPhrase)) FROM test.hits GROUP BY CounterID ORDER BY count() DESC LIMIT 20 SETTINGS optimize_aggregation_in_order = 1

View File

@ -18,3 +18,23 @@
11482817 4611708000353743073 9223337838355779113
63469 4611695097019173921 9223353530156141191
29103473 4611744585914335132 9223333530281362537
1704509 4611700827100483880 9223360787015464643
732797 4611701940806302259 9223355550934604746
598875 4611701407242345792 9223362250391155632
792887 4611699550286611812 9223290551912005343
3807842 4611710821592843606 9223326163906184987
25703952 4611709443519524003 9223353913449113943
716829 4611852156092872082 9223361623076951140
59183 4611730685242027332 9223354909338698162
33010362 4611704682869732882 9223268545373999677
800784 4611752907938305166 9223340418389788041
20810645 4611712185532639162 9223218900001937412
25843850 4611690025407720929 9223346023778617822
23447120 4611796031755620254 9223329309291309758
14739804 4611692230555590277 9223313509005166531
32077710 4611884228437061959 9223352444952988904
22446879 4611846229717089436 9223124373140579096
170282 4611833225706935900 9223371583739401906
11482817 4611708000353743073 9223337838355779113
63469 4611695097019173921 9223353530156141191
29103473 4611744585914335132 9223333530281362537

View File

@ -1 +1,2 @@
SELECT CounterID, min(WatchID), max(WatchID) FROM test.hits GROUP BY CounterID ORDER BY count() DESC LIMIT 20
SELECT CounterID, min(WatchID), max(WatchID) FROM test.hits GROUP BY CounterID ORDER BY count() DESC LIMIT 20;
SELECT CounterID, min(WatchID), max(WatchID) FROM test.hits GROUP BY CounterID ORDER BY count() DESC LIMIT 20 SETTINGS optimize_aggregation_in_order = 1

View File

@ -18,3 +18,23 @@
11482817 52345 [] [] []
63469 52142 [] [] []
29103473 47758 [6185451] [] [6185451]
1704509 523264 [271264] [] [271264]
732797 475698 [] [] []
598875 337212 [] [] []
792887 252197 [2094893,2028343] [] [1272031]
3807842 196036 [1710269] [] [1134660]
25703952 147211 [] [] []
716829 90109 [4186138] [] [1800405]
59183 85379 [] [] []
33010362 77807 [] [] []
800784 77492 [4002316] [] [1270480]
20810645 73213 [] [] []
25843850 68945 [4028285] [] [4028285]
23447120 67570 [6503091,2762273] [] [2098132]
14739804 64174 [4180720] [] [664490]
32077710 60456 [] [] []
22446879 58389 [] [] []
170282 57017 [4166114] [] [34386,1240412,1248634,1616213,2928740,1458582]
11482817 52345 [] [] []
63469 52142 [] [] []
29103473 47758 [6185451] [] [6185451]

View File

@ -1 +1,2 @@
SELECT CounterID, count(), max(GoalsReached), min(GoalsReached), minIf(GoalsReached, notEmpty(GoalsReached)) FROM test.hits GROUP BY CounterID ORDER BY count() DESC LIMIT 20
SELECT CounterID, count(), max(GoalsReached), min(GoalsReached), minIf(GoalsReached, notEmpty(GoalsReached)) FROM test.hits GROUP BY CounterID ORDER BY count() DESC LIMIT 20;
SELECT CounterID, count(), max(GoalsReached), min(GoalsReached), minIf(GoalsReached, notEmpty(GoalsReached)) FROM test.hits GROUP BY CounterID ORDER BY count() DESC LIMIT 20 SETTINGS optimize_aggregation_in_order = 1

View File

@ -1,3 +1,5 @@
SET max_rows_to_read = 1000;
SELECT CounterID, uniq(UserID) FROM test.hits WHERE 0 != 0 GROUP BY CounterID;
SELECT CounterID, uniq(UserID) FROM test.hits WHERE 0 != 0 GROUP BY CounterID SETTINGS optimize_aggregation_in_order = 1;
SELECT CounterID, uniq(UserID) FROM test.hits WHERE 0 AND CounterID = 1704509 GROUP BY CounterID;
SELECT CounterID, uniq(UserID) FROM test.hits WHERE 0 AND CounterID = 1704509 GROUP BY CounterID SETTINGS optimize_aggregation_in_order = 1;

View File

@ -1 +1,2 @@
10726001768429413598
10726001768429413598

View File

@ -1 +1,2 @@
SELECT sum(cityHash64(*)) FROM (SELECT CounterID, quantileTiming(0.5)(SendTiming), count() FROM remote('127.0.0.{1,2,3,4,5,6,7,8,9,10}', test.hits) WHERE SendTiming != -1 GROUP BY CounterID);
SELECT sum(cityHash64(*)) FROM (SELECT CounterID, quantileTiming(0.5)(SendTiming), count() FROM remote('127.0.0.{1,2,3,4,5,6,7,8,9,10}', test.hits) WHERE SendTiming != -1 GROUP BY CounterID) SETTINGS optimize_aggregation_in_order = 1;

View File

@ -1 +1,2 @@
4379238 1868 1879 5755 0.006
4379238 1868 1879 5755 0.006

View File

@ -1 +1,2 @@
SELECT CounterID, quantileTiming(0.5)(SendTiming) AS qt, least(30000, quantileExact(0.5)(SendTiming)) AS qe, count() AS c, round(abs(qt - qe) / greatest(qt, qe) AS diff, 3) AS rounded_diff FROM test.hits WHERE SendTiming != -1 GROUP BY CounterID HAVING diff != 0 ORDER BY diff DESC;
SELECT CounterID, quantileTiming(0.5)(SendTiming) AS qt, least(30000, quantileExact(0.5)(SendTiming)) AS qe, count() AS c, round(abs(qt - qe) / greatest(qt, qe) AS diff, 3) AS rounded_diff FROM test.hits WHERE SendTiming != -1 GROUP BY CounterID HAVING diff != 0 ORDER BY diff DESC SETTINGS optimize_aggregation_in_order = 1;