Mirror of https://github.com/ClickHouse/ClickHouse.git
Merge pull request #13677 from hagen1778/merge-tree-fail-fast-on-rows-limit
[mergeTree]: fail fast if max_rows_to_read limit exceeded on parts scan
Commit 23ccb0b6be
@@ -79,6 +79,7 @@ namespace ErrorCodes
    extern const int ILLEGAL_TYPE_OF_COLUMN_FOR_FILTER;
    extern const int ILLEGAL_COLUMN;
    extern const int ARGUMENT_OUT_OF_BOUND;
    extern const int TOO_MANY_ROWS;
}

@@ -573,6 +574,8 @@ Pipe MergeTreeDataSelectExecutor::readFromParts(

    /// Let's find what range to read from each part.
    {
        std::atomic<size_t> total_rows {0};

        auto process_part = [&](size_t part_index)
        {
            auto & part = parts[part_index];

@@ -599,7 +602,23 @@ Pipe MergeTreeDataSelectExecutor::readFromParts(
                    index_and_condition.first, index_and_condition.second, part, ranges.ranges, settings, reader_settings, log);

            if (!ranges.ranges.empty())
            {
                if (settings.read_overflow_mode == OverflowMode::THROW && settings.max_rows_to_read)
                {
                    /// Fail fast if estimated number of rows to read exceeds the limit
                    auto current_rows_estimate = ranges.getRowsCount();
                    size_t prev_total_rows_estimate = total_rows.fetch_add(current_rows_estimate);
                    size_t total_rows_estimate = current_rows_estimate + prev_total_rows_estimate;
                    if (total_rows_estimate > settings.max_rows_to_read)
                        throw Exception(
                            "Limit for rows (controlled by 'max_rows_to_read' setting) exceeded, max rows: "
                            + formatReadableQuantity(settings.max_rows_to_read)
                            + ", estimated rows to read (at least): " + formatReadableQuantity(total_rows_estimate),
                            ErrorCodes::TOO_MANY_ROWS);
                }

                parts_with_ranges[part_index] = std::move(ranges);
            }
        };

        size_t num_threads = std::min(size_t(num_streams), parts.size());
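For readers skimming the diff, here is a minimal, self-contained sketch of the pattern the hunk above introduces: each worker thread atomically adds its per-part row estimate to a shared counter and throws as soon as the running total crosses the limit, instead of waiting for the whole scan to finish. This is not ClickHouse code; PartEstimate, checkRowsLimit and the hard-coded numbers are illustrative stand-ins.

#include <atomic>
#include <cstddef>
#include <iostream>
#include <stdexcept>
#include <string>
#include <thread>
#include <vector>

/// Simplified stand-in for the per-part row estimate produced by the index scan.
struct PartEstimate
{
    size_t rows;
};

/// Fail-fast check: add this part's estimate to the shared counter and throw
/// as soon as the running total exceeds the limit, before any data is read.
void checkRowsLimit(std::atomic<size_t> & total_rows, size_t current_rows, size_t max_rows_to_read)
{
    size_t prev_total = total_rows.fetch_add(current_rows);
    size_t total = prev_total + current_rows;
    if (total > max_rows_to_read)
        throw std::runtime_error(
            "Limit for rows exceeded, max rows: " + std::to_string(max_rows_to_read)
            + ", estimated rows to read (at least): " + std::to_string(total));
}

int main()
{
    /// Hypothetical per-part estimates; in the real code these come from the
    /// primary-key and skip-index range analysis.
    std::vector<PartEstimate> parts = {{1000}, {2000}, {4000}, {500}};
    const size_t max_rows_to_read = 5000;

    std::atomic<size_t> total_rows{0};
    std::atomic<bool> failed{false};

    std::vector<std::thread> workers;
    for (const auto & part : parts)
    {
        workers.emplace_back([&total_rows, &failed, &part, max_rows_to_read]
        {
            try
            {
                checkRowsLimit(total_rows, part.rows, max_rows_to_read);
            }
            catch (const std::exception & e)
            {
                /// In ClickHouse the exception propagates to the query pipeline;
                /// here we only record that the limit was hit.
                if (!failed.exchange(true))
                    std::cerr << e.what() << '\n';
            }
        });
    }
    for (auto & w : workers)
        w.join();

    std::cout << (failed.load() ? "query rejected early\n" : "query allowed\n");
    return 0;
}

The fetch_add makes the check race-free across worker threads: every thread observes a running total that already includes its own contribution, so if the combined estimate exceeds the limit, at least one thread is guaranteed to see it and throw.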
@@ -12,6 +12,7 @@ DROP TABLE IF EXISTS mv_checkouts2target;

-- that is the final table, which is filled incrementally from 2 different sources

CREATE TABLE target_table Engine=SummingMergeTree() ORDER BY id
SETTINGS index_granularity=128
AS
SELECT
    number as id,

@@ -85,12 +86,20 @@ INSERT INTO logins SELECT number as id, '2000-01-01 08:00:00' from numbers(50
INSERT INTO checkouts SELECT number as id, '2000-01-01 10:00:00' from numbers(50000);

-- ensure that we don't read whole target table during join
set max_rows_to_read = 2000;
-- by this time we should have 3 parts for target_table because of prev inserts
-- and we plan to make two more inserts. With index_granularity=128 and max id=1000
-- we expect to read not more than:
-- (1000/128) marks per part * (3 + 2) parts * 128 granularity = 5120 rows
set max_rows_to_read = 5120;

INSERT INTO logins SELECT number as id, '2000-01-01 11:00:00' from numbers(1000);
INSERT INTO checkouts SELECT number as id, '2000-01-01 11:10:00' from numbers(1000);

set max_rows_to_read = 10;
-- by this time we should have 5 parts for target_table because of prev inserts
-- and we plan to make two more inserts. With index_granularity=128 and max id=1
-- we expect to read not more than:
-- 1 mark per part * (5 + 2) parts * 128 granularity = 896 rows
set max_rows_to_read = 896;

INSERT INTO logins SELECT number+2 as id, '2001-01-01 11:10:01' from numbers(1);
INSERT INTO checkouts SELECT number+2 as id, '2001-01-01 11:10:02' from numbers(1);
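The row bounds in the test comments above come from a simple estimate: a query over ids 0..N-1 touches at most ceil(max_id / index_granularity) marks per part, and each mark covers index_granularity rows. A small sketch of that arithmetic, under the same assumptions as the comments (maxRowsRead is a hypothetical helper, not part of the test or of ClickHouse):

#include <cstddef>
#include <iostream>

/// Upper bound on rows read: ceil(max_id / granularity) marks per part,
/// times the number of parts, times rows per mark (the granularity).
size_t maxRowsRead(size_t max_id, size_t granularity, size_t parts)
{
    size_t marks_per_part = (max_id + granularity - 1) / granularity;
    return marks_per_part * parts * granularity;
}

int main()
{
    /// Matches the comments in the test:
    /// 8 marks * 5 parts * 128 = 5120, and 1 mark * 7 parts * 128 = 896.
    std::cout << maxRowsRead(1000, 128, 5) << '\n'; // 5120
    std::cout << maxRowsRead(1, 128, 7) << '\n';    // 896
    return 0;
}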