simplify a loop and make RANGE frame the default

This commit is contained in:
Alexander Kuzmenkov 2021-01-28 20:05:01 +03:00
parent e553eb112f
commit 3ace39fbf7
6 changed files with 127 additions and 152 deletions

View File

@ -46,10 +46,10 @@ struct WindowFrame
// This flag signifies that the frame properties were not set explicitly by
// the user, but the fields of this structure still have to contain proper values
// for the default frame of ROWS BETWEEN UNBOUNDED PRECEDING AND CURRENT ROW.
// for the default frame of RANGE BETWEEN UNBOUNDED PRECEDING AND CURRENT ROW.
bool is_default = true;
FrameType type = FrameType::Rows;
FrameType type = FrameType::Range;
/*
* We don't need these yet.

View File

@ -561,7 +561,6 @@ static bool tryParseFrameDefinition(ASTWindowDefinition * node, IParser::Pos & p
{
return false;
}
}
else
{

View File

@ -263,68 +263,48 @@ void WindowTransform::advanceFrameEnd()
// The only frame end we have for now is CURRENT ROW.
advanceFrameEndCurrentRow();
// We might not have advanced the frame end if we found out we reached the
// end of input or the partition, or if we still don't know the frame start.
if (frame_end_before == frame_end)
{
return;
}
// Add the columns over which we advanced the frame to the aggregate function
// states.
std::vector<const IColumn *> argument_columns;
for (auto & ws : workspaces)
// We could have advanced over at most the entire last block.
uint64_t last_row = frame_end.row;
if (frame_end.row == 0)
{
const auto & f = ws.window_function;
const auto * a = f.aggregate_function.get();
auto * buf = ws.aggregate_function_state.data();
// FIXME we don't need these complex loops, because frame_end advances
// by one block at most.
// We use two explicit loops here instead of using advanceRowNumber(),
// because we want to cache the argument columns array per block. Later
// we also use batch add.
// Unfortunately this leads to tricky loop conditions, because the
// frame_end might be either a past-the-end block, or a valid block, in
// which case we also have to process its head.
// And we also have to remember to reset the row number when moving to
// the next block.
uint64_t past_the_end_block;
// Note that the past-the-end row is not in the past-the-end block, but
// in the block before it.
uint64_t past_the_end_row;
if (frame_end.block < first_block_number + blocks.size())
{
// The past-the-end row is in some valid block.
past_the_end_block = frame_end.block + 1;
past_the_end_row = frame_end.row;
assert(frame_end == blocksEnd());
last_row = blockRowsNumber(frame_end_before);
}
else
{
// The past-the-end row is at the total end of data.
past_the_end_block = first_block_number + blocks.size();
// It's in the previous block!
past_the_end_row = blocks.back().numRows();
assert(frame_end_before.block == frame_end.block);
}
for (auto r = frame_end_before;
r.block < past_the_end_block;
++r.block, r.row = 0)
{
const auto & block = blocks[r.block - first_block_number];
assert(frame_end_before.row < last_row);
argument_columns.clear();
for (auto & ws : workspaces)
{
if (frame_end_before.block != ws.cached_block_number)
{
const auto & block
= blocks[frame_end_before.block - first_block_number];
ws.argument_columns.clear();
for (const auto i : ws.argument_column_indices)
{
argument_columns.push_back(block.input_columns[i].get());
ws.argument_columns.push_back(block.input_columns[i].get());
}
ws.cached_block_number = frame_end_before.block;
}
// We process all rows of intermediate blocks, and the head of the
// last block.
const auto end = ((r.block + 1) == past_the_end_block)
? past_the_end_row
: block.numRows();
for (; r.row < end; ++r.row)
const auto * a = ws.window_function.aggregate_function.get();
auto * buf = ws.aggregate_function_state.data();
auto * columns = ws.argument_columns.data();
for (auto row = frame_end_before.row; row < last_row; ++row)
{
a->add(buf,
argument_columns.data(),
r.row,
arena.get());
}
a->add(buf, columns, row, arena.get());
}
}
}

View File

@ -22,11 +22,9 @@ struct WindowFunctionWorkspace
AlignedBuffer aggregate_function_state;
std::vector<size_t> argument_column_indices;
/*
// Argument and result columns. Be careful, they are per-chunk.
// Argument columns. Be careful, this is a per-block cache.
std::vector<const IColumn *> argument_columns;
MutableColumnPtr result_column;
*/
uint64_t cached_block_number = std::numeric_limits<uint64_t>::max();
};
struct WindowTransformBlock

View File

@ -2,7 +2,7 @@
set allow_experimental_window_functions = 1;
-- just something basic
select number, count() over (partition by intDiv(number, 3) order by number) from numbers(10);
select number, count() over (partition by intDiv(number, 3) order by number rows unbounded preceding) from numbers(10);
0 1
1 2
2 3
@ -14,7 +14,7 @@ select number, count() over (partition by intDiv(number, 3) order by number) fro
8 3
9 1
-- proper calculation across blocks
select number, max(number) over (partition by intDiv(number, 3) order by number desc) from numbers(10) settings max_block_size = 2;
select number, max(number) over (partition by intDiv(number, 3) order by number desc rows unbounded preceding) from numbers(10) settings max_block_size = 2;
2 2
1 2
0 2
@ -26,9 +26,9 @@ select number, max(number) over (partition by intDiv(number, 3) order by number
6 8
9 9
-- not a window function
select number, abs(number) over (partition by toString(intDiv(number, 3))) from numbers(10); -- { serverError 63 }
select number, abs(number) over (partition by toString(intDiv(number, 3)) rows unbounded preceding) from numbers(10); -- { serverError 63 }
-- no partition by
select number, avg(number) over (order by number) from numbers(10);
select number, avg(number) over (order by number rows unbounded preceding) from numbers(10);
0 0
1 0.5
2 1
@ -40,7 +40,7 @@ select number, avg(number) over (order by number) from numbers(10);
8 4
9 4.5
-- no order by
select number, quantileExact(number) over (partition by intDiv(number, 3)) from numbers(10);
select number, quantileExact(number) over (partition by intDiv(number, 3) rows unbounded preceding) from numbers(10);
0 0
1 1
2 1
@ -52,7 +52,7 @@ select number, quantileExact(number) over (partition by intDiv(number, 3)) from
8 7
9 9
-- can add an alias after window spec
select number, quantileExact(number) over (partition by intDiv(number, 3)) q from numbers(10);
select number, quantileExact(number) over (partition by intDiv(number, 3) rows unbounded preceding) q from numbers(10);
0 0
1 1
2 1
@ -65,14 +65,14 @@ select number, quantileExact(number) over (partition by intDiv(number, 3)) q fro
9 9
-- can't reference it yet -- the window functions are calculated at the
-- last stage of select, after all other functions.
select q * 10, quantileExact(number) over (partition by intDiv(number, 3)) q from numbers(10); -- { serverError 47 }
select q * 10, quantileExact(number) over (partition by intDiv(number, 3) rows unbounded preceding) q from numbers(10); -- { serverError 47 }
-- must work in WHERE if you wrap it in a subquery
select * from (select count(*) over () c from numbers(3)) where c > 0;
select * from (select count(*) over (rows unbounded preceding) c from numbers(3)) where c > 0;
1
2
3
-- should work in ORDER BY
select number, max(number) over (partition by intDiv(number, 3) order by number desc) m from numbers(10) order by m desc, number;
select number, max(number) over (partition by intDiv(number, 3) order by number desc rows unbounded preceding) m from numbers(10) order by m desc, number;
9 9
6 8
7 8
@ -84,14 +84,14 @@ select number, max(number) over (partition by intDiv(number, 3) order by number
1 2
2 2
-- also works in ORDER BY if you wrap it in a subquery
select * from (select count(*) over () c from numbers(3)) order by c;
select * from (select count(*) over (rows unbounded preceding) c from numbers(3)) order by c;
1
2
3
-- Example with window function only in ORDER BY. Here we make a rank of all
-- numbers sorted descending, and then sort by this rank descending, and must get
-- the ascending order.
select * from (select * from numbers(5) order by rand()) order by count() over (order by number desc) desc;
select * from (select * from numbers(5) order by rand()) order by count() over (order by number desc rows unbounded preceding) desc;
0
1
2
@ -100,23 +100,23 @@ select * from (select * from numbers(5) order by rand()) order by count() over (
-- Aggregate functions as window function arguments. This query is semantically
-- the same as the above one, only we replace `number` with
-- `any(number) group by number` and so on.
select * from (select * from numbers(5) order by rand()) group by number order by sum(any(number + 1)) over (order by min(number) desc) desc;
select * from (select * from numbers(5) order by rand()) group by number order by sum(any(number + 1)) over (order by min(number) desc rows unbounded preceding) desc;
0
1
2
3
4
-- some more simple cases w/aggregate functions
select sum(any(number)) over () from numbers(1);
select sum(any(number)) over (rows unbounded preceding) from numbers(1);
0
select sum(any(number) + 1) over () from numbers(1);
select sum(any(number) + 1) over (rows unbounded preceding) from numbers(1);
1
select sum(any(number + 1)) over () from numbers(1);
select sum(any(number + 1)) over (rows unbounded preceding) from numbers(1);
1
-- different windows
-- an explain test would also be helpful, but it's too immature now and I don't
-- want to change reference all the time
select number, max(number) over (partition by intDiv(number, 3) order by number desc), count(number) over (partition by intDiv(number, 5) order by number) as m from numbers(31) order by number settings max_block_size = 2;
select number, max(number) over (partition by intDiv(number, 3) order by number desc rows unbounded preceding), count(number) over (partition by intDiv(number, 5) order by number rows unbounded preceding) as m from numbers(31) order by number settings max_block_size = 2;
0 2 1
1 2 2
2 2 3
@ -151,7 +151,7 @@ select number, max(number) over (partition by intDiv(number, 3) order by number
-- two functions over the same window
-- an explain test would also be helpful, but it's too immature now and I don't
-- want to change reference all the time
select number, max(number) over (partition by intDiv(number, 3) order by number desc), count(number) over (partition by intDiv(number, 3) order by number desc) as m from numbers(7) order by number settings max_block_size = 2;
select number, max(number) over (partition by intDiv(number, 3) order by number desc rows unbounded preceding), count(number) over (partition by intDiv(number, 3) order by number desc rows unbounded preceding) as m from numbers(7) order by number settings max_block_size = 2;
0 2 3
1 2 2
2 2 1
@ -163,22 +163,26 @@ select number, max(number) over (partition by intDiv(number, 3) order by number
select median(x) over (partition by x) from (select 1 x);
1
-- an empty window definition is valid as well
select groupArray(number) over () from numbers(3);
select groupArray(number) over (rows unbounded preceding) from numbers(3);
[0]
[0,1]
[0,1,2]
select groupArray(number) over () from numbers(3);
[0,1,2]
[0,1,2]
[0,1,2]
-- This one tests we properly process the window function arguments.
-- Seen errors like 'column `1` not found' from count(1).
select count(1) over (), max(number + 1) over () from numbers(3);
select count(1) over (rows unbounded preceding), max(number + 1) over () from numbers(3);
1 3
-- Should work in DISTINCT
select distinct sum(0) over () from numbers(2);
select distinct sum(0) over (rows unbounded preceding) from numbers(2);
0
select distinct any(number) over () from numbers(2);
select distinct any(number) over (rows unbounded preceding) from numbers(2);
0
-- Various kinds of aliases are properly substituted into various parts of window
-- function definition.
with number + 1 as x select intDiv(number, 3) as y, sum(x + y) over (partition by y order by x) from numbers(7);
with number + 1 as x select intDiv(number, 3) as y, sum(x + y) over (partition by y order by x rows unbounded preceding) from numbers(7);
0 1
0 3
0 6
@ -192,8 +196,8 @@ select 1 window w1 as ();
select sum(number) over w1, sum(number) over w2
from numbers(10)
window
w1 as (),
w2 as (partition by intDiv(number, 3))
w1 as (rows unbounded preceding),
w2 as (partition by intDiv(number, 3) rows unbounded preceding)
;
0 0
1 1
@ -205,12 +209,14 @@ window
28 13
36 21
45 9
-- FIXME both functions should use the same window, but they don't. Add an
-- EXPLAIN test for this.
select
sum(number) over w1,
sum(number) over (partition by intDiv(number, 3))
sum(number) over (partition by intDiv(number, 3) rows unbounded preceding)
from numbers(10)
window
w1 as (partition by intDiv(number, 3))
w1 as (partition by intDiv(number, 3) rows unbounded preceding)
;
0 0
1 1
@ -222,25 +228,17 @@ window
13 13
21 21
9 9
-- ROWS frame
select
sum(number)
over (order by number rows between unbounded preceding and current row)
from numbers(3);
0
1
3
--select
-- sum(number)
-- over (order by number groups between unbounded preceding and current row)
--from numbers(3);
-- RANGE frame
-- It's the default
select sum(number) over () from numbers(3);
3
3
3
-- Try some mutually prime sizes of partition, group and block, for the number
-- of rows that is their least common multiple so that we see all the interesting
-- corner cases.
-- of rows that is their least common multiple + 1, so that we see all the
-- interesting corner cases.
select number, intDiv(number, 3) p, mod(number, 2) o, count(number) over w as c
from numbers(30)
from numbers(31)
window w as (partition by p order by o range unbounded preceding)
order by number
settings max_block_size = 5
@ -275,8 +273,9 @@ settings max_block_size = 5
27 9 1 3
28 9 0 1
29 9 1 3
30 10 0 1
select number, intDiv(number, 5) p, mod(number, 3) o, count(number) over w as c
from numbers(30)
from numbers(31)
window w as (partition by p order by o range unbounded preceding)
order by number
settings max_block_size = 2
@ -311,8 +310,9 @@ settings max_block_size = 2
27 5 0 1
28 5 1 3
29 5 2 5
30 6 0 1
select number, intDiv(number, 5) p, mod(number, 2) o, count(number) over w as c
from numbers(30)
from numbers(31)
window w as (partition by p order by o range unbounded preceding)
order by number
settings max_block_size = 3
@ -347,8 +347,9 @@ settings max_block_size = 3
27 5 1 5
28 5 0 2
29 5 1 5
30 6 0 1
select number, intDiv(number, 3) p, mod(number, 5) o, count(number) over w as c
from numbers(30)
from numbers(31)
window w as (partition by p order by o range unbounded preceding)
order by number
settings max_block_size = 2
@ -383,8 +384,9 @@ settings max_block_size = 2
27 9 2 1
28 9 3 2
29 9 4 3
30 10 0 1
select number, intDiv(number, 2) p, mod(number, 5) o, count(number) over w as c
from numbers(30)
from numbers(31)
window w as (partition by p order by o range unbounded preceding)
order by number
settings max_block_size = 3
@ -419,8 +421,9 @@ settings max_block_size = 3
27 13 2 2
28 14 3 1
29 14 4 2
30 15 0 1
select number, intDiv(number, 2) p, mod(number, 3) o, count(number) over w as c
from numbers(30)
from numbers(31)
window w as (partition by p order by o range unbounded preceding)
order by number
settings max_block_size = 5
@ -455,3 +458,4 @@ settings max_block_size = 5
27 13 0 1
28 14 1 1
29 14 2 2
30 15 0 1

View File

@ -3,77 +3,78 @@
set allow_experimental_window_functions = 1;
-- just something basic
select number, count() over (partition by intDiv(number, 3) order by number) from numbers(10);
select number, count() over (partition by intDiv(number, 3) order by number rows unbounded preceding) from numbers(10);
-- proper calculation across blocks
select number, max(number) over (partition by intDiv(number, 3) order by number desc) from numbers(10) settings max_block_size = 2;
select number, max(number) over (partition by intDiv(number, 3) order by number desc rows unbounded preceding) from numbers(10) settings max_block_size = 2;
-- not a window function
select number, abs(number) over (partition by toString(intDiv(number, 3))) from numbers(10); -- { serverError 63 }
select number, abs(number) over (partition by toString(intDiv(number, 3)) rows unbounded preceding) from numbers(10); -- { serverError 63 }
-- no partition by
select number, avg(number) over (order by number) from numbers(10);
select number, avg(number) over (order by number rows unbounded preceding) from numbers(10);
-- no order by
select number, quantileExact(number) over (partition by intDiv(number, 3)) from numbers(10);
select number, quantileExact(number) over (partition by intDiv(number, 3) rows unbounded preceding) from numbers(10);
-- can add an alias after window spec
select number, quantileExact(number) over (partition by intDiv(number, 3)) q from numbers(10);
select number, quantileExact(number) over (partition by intDiv(number, 3) rows unbounded preceding) q from numbers(10);
-- can't reference it yet -- the window functions are calculated at the
-- last stage of select, after all other functions.
select q * 10, quantileExact(number) over (partition by intDiv(number, 3)) q from numbers(10); -- { serverError 47 }
select q * 10, quantileExact(number) over (partition by intDiv(number, 3) rows unbounded preceding) q from numbers(10); -- { serverError 47 }
-- must work in WHERE if you wrap it in a subquery
select * from (select count(*) over () c from numbers(3)) where c > 0;
select * from (select count(*) over (rows unbounded preceding) c from numbers(3)) where c > 0;
-- should work in ORDER BY
select number, max(number) over (partition by intDiv(number, 3) order by number desc) m from numbers(10) order by m desc, number;
select number, max(number) over (partition by intDiv(number, 3) order by number desc rows unbounded preceding) m from numbers(10) order by m desc, number;
-- also works in ORDER BY if you wrap it in a subquery
select * from (select count(*) over () c from numbers(3)) order by c;
select * from (select count(*) over (rows unbounded preceding) c from numbers(3)) order by c;
-- Example with window function only in ORDER BY. Here we make a rank of all
-- numbers sorted descending, and then sort by this rank descending, and must get
-- the ascending order.
select * from (select * from numbers(5) order by rand()) order by count() over (order by number desc) desc;
select * from (select * from numbers(5) order by rand()) order by count() over (order by number desc rows unbounded preceding) desc;
-- Aggregate functions as window function arguments. This query is semantically
-- the same as the above one, only we replace `number` with
-- `any(number) group by number` and so on.
select * from (select * from numbers(5) order by rand()) group by number order by sum(any(number + 1)) over (order by min(number) desc) desc;
select * from (select * from numbers(5) order by rand()) group by number order by sum(any(number + 1)) over (order by min(number) desc rows unbounded preceding) desc;
-- some more simple cases w/aggregate functions
select sum(any(number)) over () from numbers(1);
select sum(any(number) + 1) over () from numbers(1);
select sum(any(number + 1)) over () from numbers(1);
select sum(any(number)) over (rows unbounded preceding) from numbers(1);
select sum(any(number) + 1) over (rows unbounded preceding) from numbers(1);
select sum(any(number + 1)) over (rows unbounded preceding) from numbers(1);
-- different windows
-- an explain test would also be helpful, but it's too immature now and I don't
-- want to change reference all the time
select number, max(number) over (partition by intDiv(number, 3) order by number desc), count(number) over (partition by intDiv(number, 5) order by number) as m from numbers(31) order by number settings max_block_size = 2;
select number, max(number) over (partition by intDiv(number, 3) order by number desc rows unbounded preceding), count(number) over (partition by intDiv(number, 5) order by number rows unbounded preceding) as m from numbers(31) order by number settings max_block_size = 2;
-- two functions over the same window
-- an explain test would also be helpful, but it's too immature now and I don't
-- want to change reference all the time
select number, max(number) over (partition by intDiv(number, 3) order by number desc), count(number) over (partition by intDiv(number, 3) order by number desc) as m from numbers(7) order by number settings max_block_size = 2;
select number, max(number) over (partition by intDiv(number, 3) order by number desc rows unbounded preceding), count(number) over (partition by intDiv(number, 3) order by number desc rows unbounded preceding) as m from numbers(7) order by number settings max_block_size = 2;
-- check that we can work with constant columns
select median(x) over (partition by x) from (select 1 x);
-- an empty window definition is valid as well
select groupArray(number) over (rows unbounded preceding) from numbers(3);
select groupArray(number) over () from numbers(3);
-- This one tests we properly process the window function arguments.
-- Seen errors like 'column `1` not found' from count(1).
select count(1) over (), max(number + 1) over () from numbers(3);
select count(1) over (rows unbounded preceding), max(number + 1) over () from numbers(3);
-- Should work in DISTINCT
select distinct sum(0) over () from numbers(2);
select distinct any(number) over () from numbers(2);
select distinct sum(0) over (rows unbounded preceding) from numbers(2);
select distinct any(number) over (rows unbounded preceding) from numbers(2);
-- Various kinds of aliases are properly substituted into various parts of window
-- function definition.
with number + 1 as x select intDiv(number, 3) as y, sum(x + y) over (partition by y order by x) from numbers(7);
with number + 1 as x select intDiv(number, 3) as y, sum(x + y) over (partition by y order by x rows unbounded preceding) from numbers(7);
-- WINDOW clause
select 1 window w1 as ();
@ -81,71 +82,64 @@ select 1 window w1 as ();
select sum(number) over w1, sum(number) over w2
from numbers(10)
window
w1 as (),
w2 as (partition by intDiv(number, 3))
w1 as (rows unbounded preceding),
w2 as (partition by intDiv(number, 3) rows unbounded preceding)
;
-- FIXME both functions should use the same window, but they don't. Add an
-- EXPLAIN test for this.
select
sum(number) over w1,
sum(number) over (partition by intDiv(number, 3))
sum(number) over (partition by intDiv(number, 3) rows unbounded preceding)
from numbers(10)
window
w1 as (partition by intDiv(number, 3))
w1 as (partition by intDiv(number, 3) rows unbounded preceding)
;
-- ROWS frame
select
sum(number)
over (order by number rows between unbounded preceding and current row)
from numbers(3);
--select
-- sum(number)
-- over (order by number groups between unbounded preceding and current row)
--from numbers(3);
-- RANGE frame
-- It's the default
select sum(number) over () from numbers(3);
-- Try some mutually prime sizes of partition, group and block, for the number
-- of rows that is their least common multiple so that we see all the interesting
-- corner cases.
-- of rows that is their least common multiple + 1, so that we see all the
-- interesting corner cases.
select number, intDiv(number, 3) p, mod(number, 2) o, count(number) over w as c
from numbers(30)
from numbers(31)
window w as (partition by p order by o range unbounded preceding)
order by number
settings max_block_size = 5
;
select number, intDiv(number, 5) p, mod(number, 3) o, count(number) over w as c
from numbers(30)
from numbers(31)
window w as (partition by p order by o range unbounded preceding)
order by number
settings max_block_size = 2
;
select number, intDiv(number, 5) p, mod(number, 2) o, count(number) over w as c
from numbers(30)
from numbers(31)
window w as (partition by p order by o range unbounded preceding)
order by number
settings max_block_size = 3
;
select number, intDiv(number, 3) p, mod(number, 5) o, count(number) over w as c
from numbers(30)
from numbers(31)
window w as (partition by p order by o range unbounded preceding)
order by number
settings max_block_size = 2
;
select number, intDiv(number, 2) p, mod(number, 5) o, count(number) over w as c
from numbers(30)
from numbers(31)
window w as (partition by p order by o range unbounded preceding)
order by number
settings max_block_size = 3
;
select number, intDiv(number, 2) p, mod(number, 3) o, count(number) over w as c
from numbers(30)
from numbers(31)
window w as (partition by p order by o range unbounded preceding)
order by number
settings max_block_size = 5