Merge pull request #25333 from MaxWk/exact-rows-before-limit

Add setting exact_rows_before_limit. When enabled, the rows_before_limit_at_least statistic reports an exact value, at the cost of reading all data before the limit.
Commit c489ad7d71 by Alexey Milovidov, 2022-07-24 02:30:31 +03:00 (committed by GitHub)
5 changed files with 97 additions and 3 deletions

src/Core/Settings.h

@@ -769,6 +769,7 @@ static constexpr UInt64 operator""_GiB(unsigned long long value)
     M(Bool, output_format_pretty_row_numbers, false, "Add row numbers before each row for pretty output format", 0) \
     M(Bool, insert_distributed_one_random_shard, false, "If setting is enabled, inserting into distributed table will choose a random shard to write when there is no sharding key", 0) \
     \
+    M(Bool, exact_rows_before_limit, false, "When enabled, ClickHouse will provide exact value for rows_before_limit_at_least statistic, but with the cost that the data before limit will have to be read completely", 0) \
     M(UInt64, cross_to_inner_join_rewrite, 1, "Use inner join instead of comma/cross join if there're joining expressions in the WHERE section. Values: 0 - no rewrite, 1 - apply if possible for comma/cross, 2 - force rewrite all comma joins, cross - if possible", 0) \
     \
     M(Bool, output_format_arrow_low_cardinality_as_dictionary, false, "Enable output LowCardinality type as Dictionary Arrow type", 0) \

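For orientation, a minimal usage sketch (not part of the commit): rows_before_limit_at_least is reported by the JSON-family output formats, and the new setting trades extra reading for an exact value.

-- Hedged sketch, not from this commit: with the setting enabled the engine
-- reads all data before the limit, so the statistic should be exact (10000 here);
-- with it disabled, the reported value may be only a lower bound.
select number from numbers(10000) limit 1 format JSON settings exact_rows_before_limit = 1;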
src/Interpreters/InterpreterSelectQuery.cpp

@@ -2712,11 +2712,14 @@ void InterpreterSelectQuery::executePreLimit(QueryPlan & query_plan, bool do_not_skip_offset)
             limit_offset = 0;
         }
-        auto limit = std::make_unique<LimitStep>(query_plan.getCurrentDataStream(), limit_length, limit_offset);
+        const Settings & settings = context->getSettingsRef();
+        auto limit = std::make_unique<LimitStep>(query_plan.getCurrentDataStream(), limit_length, limit_offset, settings.exact_rows_before_limit);
         if (do_not_skip_offset)
             limit->setStepDescription("preliminary LIMIT (with OFFSET)");
         else
             limit->setStepDescription("preliminary LIMIT (without OFFSET)");
         query_plan.addStep(std::move(limit));
     }
 }
@@ -2778,7 +2781,8 @@ void InterpreterSelectQuery::executeLimit(QueryPlan & query_plan)
      * if there is WITH TOTALS and there is no ORDER BY, then read the data to the end,
      * otherwise TOTALS is counted according to incomplete data.
      */
-    bool always_read_till_end = false;
+    const Settings & settings = context->getSettingsRef();
+    bool always_read_till_end = settings.exact_rows_before_limit;
     if (query.group_by_with_totals && !query.orderBy())
         always_read_till_end = true;

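The comment in the hunk above describes the pre-existing special case that this change generalizes; a hedged SQL illustration, using the test table defined later in this commit:

-- WITH TOTALS and no ORDER BY already forced reading to the end, so that
-- TOTALS is computed over complete rather than partial data:
select f1, count() from test_rows_compact_part group by f1 with totals limit 1;
-- The new setting requests the same read-till-end behavior unconditionally.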
src/Interpreters/InterpreterSelectWithUnionQuery.cpp

@@ -344,7 +344,7 @@ void InterpreterSelectWithUnionQuery::buildQueryPlan(QueryPlan & query_plan)
     {
         if (settings.limit > 0)
         {
-            auto limit = std::make_unique<LimitStep>(query_plan.getCurrentDataStream(), settings.limit, settings.offset);
+            auto limit = std::make_unique<LimitStep>(query_plan.getCurrentDataStream(), settings.limit, settings.offset, settings.exact_rows_before_limit);
             limit->setStepDescription("LIMIT OFFSET for SETTINGS");
             query_plan.addStep(std::move(limit));
         }

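This code path handles LIMIT and OFFSET supplied as settings rather than as query clauses; a hedged sketch of a query that exercises it:

-- `limit` and `offset` given as settings are applied by the LimitStep built
-- in buildQueryPlan above, which now also honors exact_rows_before_limit:
select number from numbers(100) settings limit = 5, offset = 10, exact_rows_before_limit = 1;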
New file: test reference (expected JSONCompact output for the queries below)

@@ -0,0 +1,72 @@
{
"meta":
[
{
"name": "0",
"type": "UInt8"
}
],
"data":
[
[0]
],
"rows": 1,
"rows_before_limit_at_least": 10000
}
{
"meta":
[
{
"name": "0",
"type": "UInt8"
}
],
"data":
[
[0]
],
"rows": 1,
"rows_before_limit_at_least": 20000
}
{
"meta":
[
{
"name": "0",
"type": "UInt8"
}
],
"data":
[
[0]
],
"rows": 1,
"rows_before_limit_at_least": 1
}
{
"meta":
[
{
"name": "0",
"type": "UInt8"
}
],
"data":
[
[0]
],
"rows": 1,
"rows_before_limit_at_least": 20000
}

New file: test queries (SQL)

@@ -0,0 +1,17 @@
-- Tags: no-parallel

drop table if exists test_rows_compact_part;
create table test_rows_compact_part(f1 int, f2 int) engine = MergeTree partition by f1 order by f2 settings min_bytes_for_wide_part = 10485760;
insert into test_rows_compact_part select 0, arrayJoin(range(10000));
insert into test_rows_compact_part select 1, arrayJoin(range(10000));
select 0 from test_rows_compact_part limit 1 FORMAT JSONCompact settings exact_rows_before_limit = 0, output_format_write_statistics = 0;
select 0 from test_rows_compact_part limit 1 FORMAT JSONCompact settings exact_rows_before_limit = 1, output_format_write_statistics = 0;
drop table if exists test_rows_compact_part;

drop table if exists test_rows_wide_part;
create table test_rows_wide_part(f1 int, f2 int) engine = MergeTree partition by f1 order by f2 settings min_bytes_for_wide_part = 0;
insert into test_rows_wide_part select 0, arrayJoin(range(10000));
insert into test_rows_wide_part select 1, arrayJoin(range(10000));
select 0 from test_rows_wide_part limit 1 FORMAT JSONCompact settings exact_rows_before_limit = 0, output_format_write_statistics = 0;
select 0 from test_rows_wide_part limit 1 FORMAT JSONCompact settings exact_rows_before_limit = 1, output_format_write_statistics = 0;
drop table if exists test_rows_wide_part;
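The four JSON blocks in the reference file correspond, in order, to the four SELECTs above: compact part with the setting off (lower bound 10000) and on (exact 20000), then wide part with the setting off (lower bound 1) and on (exact 20000). The two table definitions differ only in min_bytes_for_wide_part, which selects the part format; a hedged way to confirm the format actually used:

-- Not part of the commit: part_type in system.parts shows Compact or Wide
-- for each active data part of the test tables.
select table, part_type from system.parts where table like 'test_rows_%' and active;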