From fad1a4ccb3c79958699b8292da6aecc63efcca5e Mon Sep 17 00:00:00 2001 From: "philip.han" Date: Fri, 5 Mar 2021 21:46:09 +0900 Subject: [PATCH] Fix comparator of sequenceNextNode for more deterministic sorting --- .../AggregateFunctionSequenceNextNode.cpp | 2 +- .../AggregateFunctionSequenceNextNode.h | 49 ++++++------------- .../01656_sequence_next_node.reference | 16 +++--- .../0_stateless/01656_sequence_next_node.sql | 37 ++++++++------ 4 files changed, 45 insertions(+), 59 deletions(-) diff --git a/src/AggregateFunctions/AggregateFunctionSequenceNextNode.cpp b/src/AggregateFunctions/AggregateFunctionSequenceNextNode.cpp index 1513e5b3213..9d1d8aaa075 100644 --- a/src/AggregateFunctions/AggregateFunctionSequenceNextNode.cpp +++ b/src/AggregateFunctions/AggregateFunctionSequenceNextNode.cpp @@ -115,7 +115,7 @@ auto createAggregateFunctionSequenceNodeMaxArgs(UInt64 max_args) void registerAggregateFunctionSequenceNextNode(AggregateFunctionFactory & factory) { AggregateFunctionProperties properties = { .returns_default_when_only_null = true, .is_order_dependent = false }; - factory.registerFunction("sequenceNextNode", { createAggregateFunctionSequenceNodeMaxArgs(31), properties }); + factory.registerFunction("sequenceNextNode", { createAggregateFunctionSequenceNodeMaxArgs(MAX_EVENTS_SIZE), properties }); factory.registerFunction("sequenceFirstNode", { createAggregateFunctionSequenceNodeMaxArgs(0), properties }); } diff --git a/src/AggregateFunctions/AggregateFunctionSequenceNextNode.h b/src/AggregateFunctions/AggregateFunctionSequenceNextNode.h index e188ae58045..b8cf7ef5f5d 100644 --- a/src/AggregateFunctions/AggregateFunctionSequenceNextNode.h +++ b/src/AggregateFunctions/AggregateFunctionSequenceNextNode.h @@ -27,19 +27,7 @@ namespace DB { -/** - * When sorting the list of events the EMPTY_EVENTS_BITSET will be moved to the last. - * In the case of events, - * dt action - * 2020-01-01 00:00:01 'D' - * 2020-01-01 00:00:01 'A' - * 2020-01-01 00:00:01 'B' - * 2020-01-01 00:00:01 'C' - * The next node of a chain of events 'A' -> 'B' -> 'C' is expected to be the 'D'. - * Because EMPTY_EVENTS_BITSET is 0x80000000 the order of the sorted events is ['A", 'B', 'C', 'D']. The result value of this aggregation is 'D'. - * If EMPTY_EVENTS_BITSET is 0 hen the order of the sorted events is ['D', 'A', 'B', 'C']. This time, the result value is NULL. - */ -static const UInt32 EMPTY_EVENTS_BITSET = 0x80000000; +const UInt32 MAX_EVENTS_SIZE = 64; /// NodeBase used to implement a linked list for storage of SequenceNextNodeImpl template @@ -48,9 +36,7 @@ struct NodeBase UInt64 size; /// size of payload DataTypeDateTime::FieldType event_time; - UInt32 events_bitset; /// Bitsets of UInt32 are easy to compare. (< operator on bitsets) - /// Nodes in the list must be sorted in order to find a chain of events at the method getNextNodeIndex(). - /// While sorting, events_bitset is one of sorting criteria. + std::bitset events_bitset; char * data() { return reinterpret_cast(this) + sizeof(Node); } @@ -68,7 +54,8 @@ struct NodeBase buf.write(data(), size); writeBinary(event_time, buf); - writeBinary(events_bitset, buf); + UInt64 ulong_bitset = events_bitset.to_ulong(); + writeBinary(ulong_bitset, buf); } static Node * read(ReadBuffer & buf, Arena * arena) @@ -81,7 +68,9 @@ struct NodeBase buf.read(node->data(), size); readBinary(node->event_time, buf); - readBinary(node->events_bitset, buf); + UInt64 ulong_bitset; + readBinary(ulong_bitset, buf); + node->events_bitset = ulong_bitset; return node; } @@ -130,13 +119,9 @@ struct SequenceNextNodeGeneralData bool operator()(const Node * lhs, const Node * rhs) const { if constexpr (Descending) - return lhs->event_time == rhs->event_time ? - (lhs->events_bitset == rhs->events_bitset ? lhs->compare(rhs) : lhs->events_bitset < rhs->events_bitset) - : lhs->event_time > rhs->event_time; + return lhs->event_time == rhs->event_time ? !lhs->compare(rhs) : lhs->event_time > rhs->event_time; else - return lhs->event_time == rhs->event_time ? - (lhs->events_bitset == rhs->events_bitset ? lhs->compare(rhs) : lhs->events_bitset < rhs->events_bitset) - : lhs->event_time < rhs->event_time; + return lhs->event_time == rhs->event_time ? lhs->compare(rhs) : lhs->event_time < rhs->event_time; } }; @@ -179,7 +164,7 @@ public: AggregateFunctionPtr getOwnNullAdapter( const AggregateFunctionPtr & nested_function, const DataTypes & arguments, const Array & params, - const AggregateFunctionProperties & /*properties*/) const override + const AggregateFunctionProperties &) const override { /// This aggregate function sets insertion_requires_nullable_column on. /// Even though some values are mapped to aggregating key, it could return nulls for the below case. @@ -213,14 +198,11 @@ public: /// 0x00000000 /// + 1 (bit of event1) /// + 4 (bit of event3) - UInt32 events_bitset = 0; + node->events_bitset.reset(); for (UInt8 i = 0; i < events_size; ++i) if (assert_cast *>(columns[2 + i])->getData()[row_num]) - events_bitset += (1 << i); - if (events_bitset == 0) events_bitset = EMPTY_EVENTS_BITSET; // Any events are not matched. - + node->events_bitset.set(i); node->event_time = timestamp; - node->events_bitset = events_bitset; data(place).value.push_back(node, arena); } @@ -295,7 +277,7 @@ public: { UInt32 k = 0; for (; k < events_size - j; ++k) - if (data.value[i - j]->events_bitset & (1 << (events_size - 1 - j - k))) + if (data.value[i - j]->events_bitset.test(events_size - 1 - j - k)) return k; return k; } @@ -322,7 +304,7 @@ public: for (; j < events_size; ++j) /// It compares each matched events. /// The lower bitmask is the former matched event. - if (!(data.value[i - j]->events_bitset & (1 << (events_size - 1 - j)))) + if (data.value[i - j]->events_bitset.test(events_size - 1 - j) == false) break; /// If the chain of events are matched returns the index of result value. @@ -413,12 +395,11 @@ public: is_first = false; } - if (is_first) { Node * node = Node::allocate(*columns[1], row_num, arena); node->event_time = timestamp; - node->events_bitset = EMPTY_EVENTS_BITSET; + node->events_bitset.reset(); data(place).value.push_back(node, arena); } diff --git a/tests/queries/0_stateless/01656_sequence_next_node.reference b/tests/queries/0_stateless/01656_sequence_next_node.reference index 50755232cb9..da6ec2d97bf 100644 --- a/tests/queries/0_stateless/01656_sequence_next_node.reference +++ b/tests/queries/0_stateless/01656_sequence_next_node.reference @@ -124,14 +124,14 @@ (0, A) id >= 10 10 B (0, A) id >= 10 10 B (0, A) id >= 10 10 A -(0, A) id = 11 0 -(0, C) id = 11 0 -(0, B->C) id = 11 0 -(0, A->B->C) id = 11 0 -(0, A) id = 11 0 -(0, C) id = 11 0 -(0, C->B) id = 11 0 -(0, C->B->A) id = 11 0 +(0, A) id = 11 1 +(0, C) id = 11 1 +(0, B->C) id = 11 1 +(0, A->B->C) id = 11 1 +(0, A) id = 11 1 +(0, C) id = 11 1 +(0, C->B) id = 11 1 +(0, C->B->A) id = 11 1 (0) id < 10 1 A (0) id < 10 2 A (0) id < 10 3 A diff --git a/tests/queries/0_stateless/01656_sequence_next_node.sql b/tests/queries/0_stateless/01656_sequence_next_node.sql index c2270bc10ac..2c16f33aa0e 100644 --- a/tests/queries/0_stateless/01656_sequence_next_node.sql +++ b/tests/queries/0_stateless/01656_sequence_next_node.sql @@ -61,14 +61,19 @@ SELECT '(0, A) id >= 10', id, sequenceNextNode(1)(dt, action, action = 'C') AS n SELECT '(0, A) id >= 10', id, sequenceNextNode(1)(dt, action, action = 'D', action = 'C') AS next_node FROM test_sequenceNextNode_Nullable WHERE id >= 10 GROUP BY id ORDER BY id; SELECT '(0, A) id >= 10', id, sequenceNextNode(1)(dt, action, action = 'C', action = 'B') AS next_node FROM test_sequenceNextNode_Nullable WHERE id >= 10 GROUP BY id ORDER BY id; -SELECT '(0, A) id = 11', count() FROM (SELECT id, sequenceNextNode(0)(dt, action, action = 'A') AS next_node FROM test_sequenceNextNode_Nullable WHERE id = 11 GROUP BY id HAVING next_node in ('B', 'C', 'D')); -SELECT '(0, C) id = 11', count() FROM (SELECT id, sequenceNextNode(0)(dt, action, action = 'C') AS next_node FROM test_sequenceNextNode_Nullable WHERE id = 11 GROUP BY id HAVING next_node in ('B', 'A', 'D')); -SELECT '(0, B->C) id = 11', count() FROM (SELECT id, sequenceNextNode(0)(dt, action, action = 'B', action ='C') AS next_node FROM test_sequenceNextNode_Nullable WHERE id = 11 GROUP BY id HAVING next_node in ('A', 'D')); -SELECT '(0, A->B->C) id = 11', count() FROM (SELECT id, sequenceNextNode(0)(dt, action, action = 'A', action = 'B', action = 'C') AS next_node FROM test_sequenceNextNode_Nullable WHERE id = 11 GROUP BY id HAVING next_node in ('D')); -SELECT '(0, A) id = 11', count() FROM (SELECT id, sequenceNextNode(1)(dt, action, action = 'A') AS next_node FROM test_sequenceNextNode_Nullable WHERE id = 11 GROUP BY id HAVING next_node in ('B', 'C', 'D')); -SELECT '(0, C) id = 11', count() FROM (SELECT id, sequenceNextNode(1)(dt, action, action = 'C') AS next_node FROM test_sequenceNextNode_Nullable WHERE id = 11 GROUP BY id HAVING next_node in ('B', 'A', 'D')); -SELECT '(0, C->B) id = 11', count() FROM (SELECT id, sequenceNextNode(1)(dt, action, action = 'C', action ='B') AS next_node FROM test_sequenceNextNode_Nullable WHERE id = 11 GROUP BY id HAVING next_node in ('A', 'D')); -SELECT '(0, C->B->A) id = 11', count() FROM (SELECT id, sequenceNextNode(1)(dt, action, action = 'C', action = 'B', action = 'A') AS next_node FROM test_sequenceNextNode_Nullable WHERE id = 11 GROUP BY id HAVING next_node in ('D')); +INSERT INTO test_sequenceNextNode_Nullable values ('1970-01-01 09:00:01',11,'A'); +INSERT INTO test_sequenceNextNode_Nullable values ('1970-01-01 09:00:01',11,'B'); +INSERT INTO test_sequenceNextNode_Nullable values ('1970-01-01 09:00:01',11,'C'); +INSERT INTO test_sequenceNextNode_Nullable values ('1970-01-01 09:00:01',11,'D'); + +SELECT '(0, A) id = 11', count() FROM (SELECT id, sequenceNextNode(0)(dt, action, action = 'A') AS next_node FROM test_sequenceNextNode_Nullable WHERE id = 11 GROUP BY id HAVING next_node = 'B'); +SELECT '(0, C) id = 11', count() FROM (SELECT id, sequenceNextNode(0)(dt, action, action = 'C') AS next_node FROM test_sequenceNextNode_Nullable WHERE id = 11 GROUP BY id HAVING next_node = 'D'); +SELECT '(0, B->C) id = 11', count() FROM (SELECT id, sequenceNextNode(0)(dt, action, action = 'B', action ='C') AS next_node FROM test_sequenceNextNode_Nullable WHERE id = 11 GROUP BY id HAVING next_node = 'D'); +SELECT '(0, A->B->C) id = 11', count() FROM (SELECT id, sequenceNextNode(0)(dt, action, action = 'A', action = 'B', action = 'C') AS next_node FROM test_sequenceNextNode_Nullable WHERE id = 11 GROUP BY id HAVING next_node = 'D'); +SELECT '(0, A) id = 11', count() FROM (SELECT id, sequenceNextNode(1)(dt, action, action = 'A') AS next_node FROM test_sequenceNextNode_Nullable WHERE id = 11 GROUP BY id HAVING next_node is NULL); +SELECT '(0, C) id = 11', count() FROM (SELECT id, sequenceNextNode(1)(dt, action, action = 'C') AS next_node FROM test_sequenceNextNode_Nullable WHERE id = 11 GROUP BY id HAVING next_node = 'B'); +SELECT '(0, C->B) id = 11', count() FROM (SELECT id, sequenceNextNode(1)(dt, action, action = 'C', action ='B') AS next_node FROM test_sequenceNextNode_Nullable WHERE id = 11 GROUP BY id HAVING next_node = 'A'); +SELECT '(0, C->B->A) id = 11', count() FROM (SELECT id, sequenceNextNode(1)(dt, action, action = 'C', action = 'B', action = 'A') AS next_node FROM test_sequenceNextNode_Nullable WHERE id = 11 GROUP BY id HAVING next_node is null); SELECT '(0) id < 10', id, sequenceNextNode(0)(dt, action) AS next_node FROM test_sequenceNextNode_Nullable WHERE id < 10 GROUP BY id ORDER BY id; SELECT '(0) id < 10', id, sequenceFirstNode(0)(dt, action) AS next_node FROM test_sequenceNextNode_Nullable WHERE id < 10 GROUP BY id ORDER BY id; @@ -146,14 +151,14 @@ INSERT INTO test_sequenceNextNode values ('1970-01-01 09:00:01',11,'B'); INSERT INTO test_sequenceNextNode values ('1970-01-01 09:00:01',11,'C'); INSERT INTO test_sequenceNextNode values ('1970-01-01 09:00:01',11,'D'); -SELECT '(0, A) id = 11', count() FROM (SELECT id, sequenceNextNode(0)(dt, action, action = 'A') AS next_node FROM test_sequenceNextNode WHERE id = 11 GROUP BY id HAVING next_node in ('B', 'C', 'D')); -SELECT '(0, C) id = 11', count() FROM (SELECT id, sequenceNextNode(0)(dt, action, action = 'C') AS next_node FROM test_sequenceNextNode WHERE id = 11 GROUP BY id HAVING next_node in ('B', 'A', 'D')); -SELECT '(0, B->C) id = 11', count() FROM (SELECT id, sequenceNextNode(0)(dt, action, action = 'B', action ='C') AS next_node FROM test_sequenceNextNode WHERE id = 11 GROUP BY id HAVING next_node in ('A', 'D')); -SELECT '(0, A->B->C) id = 11', count() FROM (SELECT id, sequenceNextNode(0)(dt, action, action = 'A', action = 'B', action = 'C') AS next_node FROM test_sequenceNextNode WHERE id = 11 GROUP BY id HAVING next_node in ('D')); -SELECT '(0, A) id = 11', count() FROM (SELECT id, sequenceNextNode(1)(dt, action, action = 'A') AS next_node FROM test_sequenceNextNode WHERE id = 11 GROUP BY id HAVING next_node in ('B', 'C', 'D')); -SELECT '(0, C) id = 11', count() FROM (SELECT id, sequenceNextNode(1)(dt, action, action = 'C') AS next_node FROM test_sequenceNextNode WHERE id = 11 GROUP BY id HAVING next_node in ('B', 'A', 'D')); -SELECT '(0, C->B) id = 11', count() FROM (SELECT id, sequenceNextNode(1)(dt, action, action = 'C', action ='B') AS next_node FROM test_sequenceNextNode WHERE id = 11 GROUP BY id HAVING next_node in ('A', 'D')); -SELECT '(0, C->B->A) id = 11', count() FROM (SELECT id, sequenceNextNode(1)(dt, action, action = 'C', action = 'B', action = 'A') AS next_node FROM test_sequenceNextNode WHERE id = 11 GROUP BY id HAVING next_node in ('D')); +SELECT '(0, A) id = 11', count() FROM (SELECT id, sequenceNextNode(0)(dt, action, action = 'A') AS next_node FROM test_sequenceNextNode WHERE id = 11 GROUP BY id HAVING next_node = 'B'); +SELECT '(0, C) id = 11', count() FROM (SELECT id, sequenceNextNode(0)(dt, action, action = 'C') AS next_node FROM test_sequenceNextNode WHERE id = 11 GROUP BY id HAVING next_node = 'D'); +SELECT '(0, B->C) id = 11', count() FROM (SELECT id, sequenceNextNode(0)(dt, action, action = 'B', action ='C') AS next_node FROM test_sequenceNextNode WHERE id = 11 GROUP BY id HAVING next_node = 'D'); +SELECT '(0, A->B->C) id = 11', count() FROM (SELECT id, sequenceNextNode(0)(dt, action, action = 'A', action = 'B', action = 'C') AS next_node FROM test_sequenceNextNode WHERE id = 11 GROUP BY id HAVING next_node = 'D'); +SELECT '(0, A) id = 11', count() FROM (SELECT id, sequenceNextNode(1)(dt, action, action = 'A') AS next_node FROM test_sequenceNextNode WHERE id = 11 GROUP BY id HAVING next_node is NULL); +SELECT '(0, C) id = 11', count() FROM (SELECT id, sequenceNextNode(1)(dt, action, action = 'C') AS next_node FROM test_sequenceNextNode WHERE id = 11 GROUP BY id HAVING next_node = 'B'); +SELECT '(0, C->B) id = 11', count() FROM (SELECT id, sequenceNextNode(1)(dt, action, action = 'C', action ='B') AS next_node FROM test_sequenceNextNode WHERE id = 11 GROUP BY id HAVING next_node = 'A'); +SELECT '(0, C->B->A) id = 11', count() FROM (SELECT id, sequenceNextNode(1)(dt, action, action = 'C', action = 'B', action = 'A') AS next_node FROM test_sequenceNextNode WHERE id = 11 GROUP BY id HAVING next_node is null); SELECT '(0) id < 10', id, sequenceNextNode(0)(dt, action) AS next_node FROM test_sequenceNextNode WHERE id < 10 GROUP BY id ORDER BY id; SELECT '(0) id < 10', id, sequenceFirstNode(0)(dt, action) AS next_node FROM test_sequenceNextNode WHERE id < 10 GROUP BY id ORDER BY id;