From e3a96cc870c2f9b34f06b88e65ad109d8334ef2c Mon Sep 17 00:00:00 2001 From: Jakub Kuklis Date: Thu, 29 Jul 2021 14:30:07 +0200 Subject: [PATCH 01/12] Checking whether all deterministic parts can be matched in the right order in sequenceMatch --- .../AggregateFunctionSequenceMatch.h | 82 ++++++++++++++++++- 1 file changed, 79 insertions(+), 3 deletions(-) diff --git a/src/AggregateFunctions/AggregateFunctionSequenceMatch.h b/src/AggregateFunctions/AggregateFunctionSequenceMatch.h index d05a4ca314d..29fae66e291 100644 --- a/src/AggregateFunctions/AggregateFunctionSequenceMatch.h +++ b/src/AggregateFunctions/AggregateFunctionSequenceMatch.h @@ -518,6 +518,77 @@ protected: return action_it == action_end; } + /// Splits the pattern into deterministic parts separated by non-deterministic fragments + /// (time constraints and Kleene stars), and tries to match the deterministic parts in their specified order, + /// ignoring the non-deterministic fragments. + /// This function can quickly check that a full match is not possible if some deterministic fragment is missing. + template + bool couldMatchDeterministicParts(const EventEntry events_begin, const EventEntry events_end, bool limit_iterations = true) const + { + size_t events_processed = 0; + auto events_it = events_begin; + std::vector det_part; + + auto find_deterministic_part = [&events_it, &events_end, &events_processed, &det_part, limit_iterations]() + { + auto events_it_init = events_it; + const auto det_part_begin = std::begin(det_part); + const auto det_part_end = std::end(det_part); + auto det_part_it = det_part_begin; + + while (det_part_it != det_part_end && events_it != events_end) + { + /// matching any event + if (*det_part_it == 0) + ++events_it, ++det_part_it; + + /// matching specific event + else { + if (events_it->second.test(*det_part_it - 1)) + ++events_it, ++det_part_it; + + else + { + events_it = ++events_it_init; + det_part_it = det_part_begin; + } + } + + if (limit_iterations && ++events_processed > sequence_match_max_iterations) { + throw Exception{"Pattern application proves too difficult, exceeding max iterations (" + toString(sequence_match_max_iterations) + ")", + ErrorCodes::TOO_SLOW}; + } + } + + det_part.clear(); + return det_part_it == det_part_end; + }; + + for (auto action : actions) { + switch(action.type) { + /// mark AnyEvent action with 0 and SpecificEvent with positive numbers corresponding to the events + case PatternActionType::SpecificEvent: + det_part.push_back(action.extra + 1); + break; + case PatternActionType::AnyEvent: + det_part.push_back(0); + break; + case PatternActionType::KleeneStar: + case PatternActionType::TimeLessOrEqual: + case PatternActionType::TimeLess: + case PatternActionType::TimeGreaterOrEqual: + case PatternActionType::TimeGreater: + case PatternActionType::TimeEqual: + if (!find_deterministic_part()) + return false; + default: + throw Exception{"Unknown PatternActionType", ErrorCodes::LOGICAL_ERROR}; + } + } + + return find_deterministic_part(); + } + private: enum class DFATransition : char { @@ -592,7 +663,8 @@ public: const auto events_end = std::end(data_ref.events_list); auto events_it = events_begin; - bool match = this->pattern_has_time ? this->backtrackingMatch(events_it, events_end) : this->dfaMatch(events_it, events_end); + bool couldMatch = this->couldMatchDeterministicParts(events_begin, events_end, this->pattern_has_time); + bool match = couldMatch && (this->pattern_has_time ? this->backtrackingMatch(events_it, events_end) : this->dfaMatch(events_it, events_end)); assert_cast(to).getData().push_back(match); } }; @@ -628,8 +700,12 @@ private: auto events_it = events_begin; size_t count = 0; - while (events_it != events_end && this->backtrackingMatch(events_it, events_end)) - ++count; + // check if there is a chance of matching the sequence at least once + bool couldMatch = this->couldMatchDeterministicParts(events_begin, events_end); + if (couldMatch) { + while (events_it != events_end && this->backtrackingMatch(events_it, events_end)) + ++count; + } return count; } From 2fad1dd8c51c726dbe77a15b9d2c552b92f4916a Mon Sep 17 00:00:00 2001 From: Jakub Kuklis Date: Thu, 29 Jul 2021 14:39:51 +0200 Subject: [PATCH 02/12] Adding a break before default in switch --- src/AggregateFunctions/AggregateFunctionSequenceMatch.h | 1 + 1 file changed, 1 insertion(+) diff --git a/src/AggregateFunctions/AggregateFunctionSequenceMatch.h b/src/AggregateFunctions/AggregateFunctionSequenceMatch.h index 29fae66e291..4959a716c9c 100644 --- a/src/AggregateFunctions/AggregateFunctionSequenceMatch.h +++ b/src/AggregateFunctions/AggregateFunctionSequenceMatch.h @@ -581,6 +581,7 @@ protected: case PatternActionType::TimeEqual: if (!find_deterministic_part()) return false; + break; default: throw Exception{"Unknown PatternActionType", ErrorCodes::LOGICAL_ERROR}; } From 6a39546e5b566d43271a8bee9dd8b6f495edf733 Mon Sep 17 00:00:00 2001 From: Jakub Kuklis Date: Thu, 29 Jul 2021 16:28:22 +0200 Subject: [PATCH 03/12] Removing default placed after an exhaustive sweep through enum values --- src/AggregateFunctions/AggregateFunctionSequenceMatch.h | 3 --- 1 file changed, 3 deletions(-) diff --git a/src/AggregateFunctions/AggregateFunctionSequenceMatch.h b/src/AggregateFunctions/AggregateFunctionSequenceMatch.h index 4959a716c9c..08c7c7a6a6a 100644 --- a/src/AggregateFunctions/AggregateFunctionSequenceMatch.h +++ b/src/AggregateFunctions/AggregateFunctionSequenceMatch.h @@ -581,9 +581,6 @@ protected: case PatternActionType::TimeEqual: if (!find_deterministic_part()) return false; - break; - default: - throw Exception{"Unknown PatternActionType", ErrorCodes::LOGICAL_ERROR}; } } From 24db6494de4b913bff0471b8165001a99cbd9655 Mon Sep 17 00:00:00 2001 From: Jakub Kuklis Date: Mon, 2 Aug 2021 16:23:23 +0200 Subject: [PATCH 04/12] Events conditions met using bitsets --- .../AggregateFunctionSequenceMatch.h | 16 ++++++++++++++++ 1 file changed, 16 insertions(+) diff --git a/src/AggregateFunctions/AggregateFunctionSequenceMatch.h b/src/AggregateFunctions/AggregateFunctionSequenceMatch.h index 08c7c7a6a6a..2a34b40741e 100644 --- a/src/AggregateFunctions/AggregateFunctionSequenceMatch.h +++ b/src/AggregateFunctions/AggregateFunctionSequenceMatch.h @@ -48,6 +48,7 @@ struct AggregateFunctionSequenceMatchData final bool sorted = true; PODArrayWithStackMemory events_list; + std::bitset conditions_met; void add(const Timestamp timestamp, const Events & events) { @@ -56,6 +57,7 @@ struct AggregateFunctionSequenceMatchData final { events_list.emplace_back(timestamp, events); sorted = false; + conditions_met |= events; } } @@ -87,6 +89,8 @@ struct AggregateFunctionSequenceMatchData final } sorted = true; + + conditions_met |= other.conditions_met; } void sort() @@ -290,6 +294,7 @@ private: dfa_states.back().transition = DFATransition::SpecificEvent; dfa_states.back().event = event_number - 1; dfa_states.emplace_back(); + conditions_in_pattern.set(event_number - 1); } if (!match(")")) @@ -627,6 +632,7 @@ private: protected: /// `True` if the parsed pattern contains time assertions (?t...), `false` otherwise. bool pattern_has_time; + std::bitset conditions_in_pattern; private: std::string pattern; @@ -653,6 +659,11 @@ public: void insertResultInto(AggregateDataPtr __restrict place, IColumn & to, Arena *) const override { + if ((this->conditions_in_pattern & this->data(place).conditions_met) != this->conditions_in_pattern) + { + assert_cast(to).getData().push_back(false); + return; + } this->data(place).sort(); const auto & data_ref = this->data(place); @@ -684,6 +695,11 @@ public: void insertResultInto(AggregateDataPtr __restrict place, IColumn & to, Arena *) const override { + if ((this->conditions_in_pattern & this->data(place).conditions_met) != this->conditions_in_pattern) + { + assert_cast(to).getData().push_back(0); + return; + } this->data(place).sort(); assert_cast(to).getData().push_back(count(place)); } From 5abf24df9e464271d0cae27e6642d3917e3ed222 Mon Sep 17 00:00:00 2001 From: Jakub Kuklis Date: Tue, 3 Aug 2021 09:47:20 +0200 Subject: [PATCH 05/12] No sorting if unnecessary --- .../AggregateFunctionSequenceMatch.h | 24 +------------------ 1 file changed, 1 insertion(+), 23 deletions(-) diff --git a/src/AggregateFunctions/AggregateFunctionSequenceMatch.h b/src/AggregateFunctions/AggregateFunctionSequenceMatch.h index 2a34b40741e..16d5f49de1e 100644 --- a/src/AggregateFunctions/AggregateFunctionSequenceMatch.h +++ b/src/AggregateFunctions/AggregateFunctionSequenceMatch.h @@ -66,30 +66,8 @@ struct AggregateFunctionSequenceMatchData final if (other.events_list.empty()) return; - const auto size = events_list.size(); - events_list.insert(std::begin(other.events_list), std::end(other.events_list)); - - /// either sort whole container or do so partially merging ranges afterwards - if (!sorted && !other.sorted) - std::sort(std::begin(events_list), std::end(events_list), Comparator{}); - else - { - const auto begin = std::begin(events_list); - const auto middle = std::next(begin, size); - const auto end = std::end(events_list); - - if (!sorted) - std::sort(begin, middle, Comparator{}); - - if (!other.sorted) - std::sort(middle, end, Comparator{}); - - std::inplace_merge(begin, middle, end, Comparator{}); - } - - sorted = true; - + sorted = false; conditions_met |= other.conditions_met; } From 3f813e700db75bae0012bfd0e70979e32864fd33 Mon Sep 17 00:00:00 2001 From: Jakub Kuklis Date: Wed, 4 Aug 2021 10:58:59 +0200 Subject: [PATCH 06/12] Code clearance --- src/AggregateFunctions/AggregateFunctionSequenceMatch.h | 9 +++++---- 1 file changed, 5 insertions(+), 4 deletions(-) diff --git a/src/AggregateFunctions/AggregateFunctionSequenceMatch.h b/src/AggregateFunctions/AggregateFunctionSequenceMatch.h index 16d5f49de1e..c136512bc78 100644 --- a/src/AggregateFunctions/AggregateFunctionSequenceMatch.h +++ b/src/AggregateFunctions/AggregateFunctionSequenceMatch.h @@ -530,6 +530,7 @@ protected: if (events_it->second.test(*det_part_it - 1)) ++events_it, ++det_part_it; + /// abandon current matching, try to match the deterministic fragment further in the list else { events_it = ++events_it_init; @@ -650,8 +651,9 @@ public: const auto events_end = std::end(data_ref.events_list); auto events_it = events_begin; - bool couldMatch = this->couldMatchDeterministicParts(events_begin, events_end, this->pattern_has_time); - bool match = couldMatch && (this->pattern_has_time ? this->backtrackingMatch(events_it, events_end) : this->dfaMatch(events_it, events_end)); + bool match = (this->pattern_has_time ? + (this->couldMatchDeterministicParts(events_begin, events_end) && this->backtrackingMatch(events_it, events_end)) : + this->dfaMatch(events_it, events_end)); assert_cast(to).getData().push_back(match); } }; @@ -693,8 +695,7 @@ private: size_t count = 0; // check if there is a chance of matching the sequence at least once - bool couldMatch = this->couldMatchDeterministicParts(events_begin, events_end); - if (couldMatch) { + if (this->couldMatchDeterministicParts(events_begin, events_end)) { while (events_it != events_end && this->backtrackingMatch(events_it, events_end)) ++count; } From edac57b08dd66aa0b0ef8fbf122876f78e2d9d56 Mon Sep 17 00:00:00 2001 From: Jakub Kuklis Date: Thu, 5 Aug 2021 09:34:30 +0200 Subject: [PATCH 07/12] Correction for style guidelines --- .../AggregateFunctionSequenceMatch.h | 55 ++++++++++--------- 1 file changed, 30 insertions(+), 25 deletions(-) diff --git a/src/AggregateFunctions/AggregateFunctionSequenceMatch.h b/src/AggregateFunctions/AggregateFunctionSequenceMatch.h index c136512bc78..52a258c2f94 100644 --- a/src/AggregateFunctions/AggregateFunctionSequenceMatch.h +++ b/src/AggregateFunctions/AggregateFunctionSequenceMatch.h @@ -526,7 +526,8 @@ protected: ++events_it, ++det_part_it; /// matching specific event - else { + else + { if (events_it->second.test(*det_part_it - 1)) ++events_it, ++det_part_it; @@ -538,33 +539,34 @@ protected: } } - if (limit_iterations && ++events_processed > sequence_match_max_iterations) { + if (limit_iterations && ++events_processed > sequence_match_max_iterations) throw Exception{"Pattern application proves too difficult, exceeding max iterations (" + toString(sequence_match_max_iterations) + ")", ErrorCodes::TOO_SLOW}; - } } det_part.clear(); return det_part_it == det_part_end; }; - for (auto action : actions) { - switch(action.type) { - /// mark AnyEvent action with 0 and SpecificEvent with positive numbers corresponding to the events - case PatternActionType::SpecificEvent: - det_part.push_back(action.extra + 1); - break; - case PatternActionType::AnyEvent: - det_part.push_back(0); - break; - case PatternActionType::KleeneStar: - case PatternActionType::TimeLessOrEqual: - case PatternActionType::TimeLess: - case PatternActionType::TimeGreaterOrEqual: - case PatternActionType::TimeGreater: - case PatternActionType::TimeEqual: - if (!find_deterministic_part()) - return false; + for (auto action : actions) + { + switch (action.type) + { + /// mark AnyEvent action with 0 and SpecificEvent with positive numbers corresponding to the events + case PatternActionType::SpecificEvent: + det_part.push_back(action.extra + 1); + break; + case PatternActionType::AnyEvent: + det_part.push_back(0); + break; + case PatternActionType::KleeneStar: + case PatternActionType::TimeLessOrEqual: + case PatternActionType::TimeLess: + case PatternActionType::TimeGreaterOrEqual: + case PatternActionType::TimeGreater: + case PatternActionType::TimeEqual: + if (!find_deterministic_part()) + return false; } } @@ -638,9 +640,10 @@ public: void insertResultInto(AggregateDataPtr __restrict place, IColumn & to, Arena *) const override { + auto & output = assert_cast(to).getData(); if ((this->conditions_in_pattern & this->data(place).conditions_met) != this->conditions_in_pattern) { - assert_cast(to).getData().push_back(false); + output.push_back(false); return; } this->data(place).sort(); @@ -654,7 +657,7 @@ public: bool match = (this->pattern_has_time ? (this->couldMatchDeterministicParts(events_begin, events_end) && this->backtrackingMatch(events_it, events_end)) : this->dfaMatch(events_it, events_end)); - assert_cast(to).getData().push_back(match); + output.push_back(match); } }; @@ -675,13 +678,14 @@ public: void insertResultInto(AggregateDataPtr __restrict place, IColumn & to, Arena *) const override { + auto & output = assert_cast(to).getData(); if ((this->conditions_in_pattern & this->data(place).conditions_met) != this->conditions_in_pattern) { - assert_cast(to).getData().push_back(0); + output.push_back(0); return; } this->data(place).sort(); - assert_cast(to).getData().push_back(count(place)); + output.push_back(count(place)); } private: @@ -695,7 +699,8 @@ private: size_t count = 0; // check if there is a chance of matching the sequence at least once - if (this->couldMatchDeterministicParts(events_begin, events_end)) { + if (this->couldMatchDeterministicParts(events_begin, events_end)) + { while (events_it != events_end && this->backtrackingMatch(events_it, events_end)) ++count; } From b9bb2b577b4a04830168645231e5d1fb5585f033 Mon Sep 17 00:00:00 2001 From: Jakub Kuklis Date: Thu, 5 Aug 2021 11:34:39 +0200 Subject: [PATCH 08/12] Simplifying couldMatchDeterministicParts --- .../AggregateFunctionSequenceMatch.h | 44 +++++++------------ 1 file changed, 15 insertions(+), 29 deletions(-) diff --git a/src/AggregateFunctions/AggregateFunctionSequenceMatch.h b/src/AggregateFunctions/AggregateFunctionSequenceMatch.h index 52a258c2f94..1f0227a9b6f 100644 --- a/src/AggregateFunctions/AggregateFunctionSequenceMatch.h +++ b/src/AggregateFunctions/AggregateFunctionSequenceMatch.h @@ -510,25 +510,26 @@ protected: { size_t events_processed = 0; auto events_it = events_begin; - std::vector det_part; - auto find_deterministic_part = [&events_it, &events_end, &events_processed, &det_part, limit_iterations]() + const auto actions_end = std::end(actions); + auto actions_it = std::begin(actions); + auto det_part_begin = actions_it; + + auto match_deterministic_part = [&events_it, events_end, &events_processed, det_part_begin, actions_it, limit_iterations]() { auto events_it_init = events_it; - const auto det_part_begin = std::begin(det_part); - const auto det_part_end = std::end(det_part); auto det_part_it = det_part_begin; - while (det_part_it != det_part_end && events_it != events_end) + while (det_part_it != actions_it && events_it != events_end) { /// matching any event - if (*det_part_it == 0) + if (det_part_it->type == PatternActionType::AnyEvent) ++events_it, ++det_part_it; /// matching specific event else { - if (events_it->second.test(*det_part_it - 1)) + if (events_it->second.test(det_part_it->extra)) ++events_it, ++det_part_it; /// abandon current matching, try to match the deterministic fragment further in the list @@ -544,33 +545,18 @@ protected: ErrorCodes::TOO_SLOW}; } - det_part.clear(); - return det_part_it == det_part_end; + return det_part_it == actions_it; }; - for (auto action : actions) - { - switch (action.type) + for (; actions_it != actions_end; ++actions_it) + if (actions_it->type != PatternActionType::SpecificEvent && actions_it->type != PatternActionType::AnyEvent) { - /// mark AnyEvent action with 0 and SpecificEvent with positive numbers corresponding to the events - case PatternActionType::SpecificEvent: - det_part.push_back(action.extra + 1); - break; - case PatternActionType::AnyEvent: - det_part.push_back(0); - break; - case PatternActionType::KleeneStar: - case PatternActionType::TimeLessOrEqual: - case PatternActionType::TimeLess: - case PatternActionType::TimeGreaterOrEqual: - case PatternActionType::TimeGreater: - case PatternActionType::TimeEqual: - if (!find_deterministic_part()) - return false; + if (!match_deterministic_part()) + return false; + det_part_begin = std::next(actions_it); } - } - return find_deterministic_part(); + return match_deterministic_part(); } private: From caf40d77edcdfd44a71100af3bb5e03e5bf9d309 Mon Sep 17 00:00:00 2001 From: Jakub Kuklis Date: Mon, 23 Aug 2021 11:03:59 +0200 Subject: [PATCH 09/12] Adding a performance test with queries presented in the PR --- tests/performance/sequence_match.xml | 21 +++++++++++++++++++++ 1 file changed, 21 insertions(+) create mode 100644 tests/performance/sequence_match.xml diff --git a/tests/performance/sequence_match.xml b/tests/performance/sequence_match.xml new file mode 100644 index 00000000000..dd70c186dc7 --- /dev/null +++ b/tests/performance/sequence_match.xml @@ -0,0 +1,21 @@ + + + hits_100m_single + + + SELECT COUNT(*) FROM hits_100m_single GROUP BY EventTime HAVING sequenceMatch('(?1)(?t<1)(?2)')(EventTime, Age >= 0, Age = -1) + + SELECT 1 FROM hits_100m_single WHERE RefererCategories != [] GROUP BY ClientIP, RequestNum HAVING + sequenceMatch('(?1)(?t>1000)(?3)')( + EventTime, hasAny(RefererCategories, [9]), hasAny(RefererCategories, [3849, 2, 3, 4, 5, 6, 7]), hasAll(RefererCategories, [1, 9]), hasAny(RefererCategories, [1, 2326, 5496])) + SELECT 1 FROM hits_100m_single WHERE RefererCategories != [] GROUP BY ClientIP, RequestNum HAVING + sequenceMatch('(?1)(?t<10000)(?2)')( + EventTime, hasAny(RefererCategories, [3849, 2, 3, 4, 5, 6, 7]), hasAny(RefererCategories, [1, 2])) + SELECT 1 FROM hits_100m_single WHERE RefererCategories != [] GROUP BY ClientIP, RequestNum HAVING + sequenceMatch('(?1)(?3)(?1)(?3)')( + EventTime, hasAny(RefererCategories, [9]), hasAny(RefererCategories, [3849, 2, 3, 4, 5, 6, 7]), hasAll(RefererCategories, [1, 9]), hasAny(RefererCategories, [1, 2326, 5496])) + SELECT 1 FROM hits_100m_single WHERE RefererCategories != [] GROUP BY ClientIP, RequestNum HAVING + sequenceMatch('(?1)(?2)(?1)(?2)(?1)')( + EventTime, hasAny(RefererCategories, [3849, 2, 3, 4, 5, 6, 7]), hasAny(RefererCategories, [1, 2])) + + \ No newline at end of file From 39ba4f660b6ea569551db41d008d3e8bb388f270 Mon Sep 17 00:00:00 2001 From: jkuklis Date: Wed, 25 Aug 2021 09:26:48 +0200 Subject: [PATCH 10/12] Use smaller dataset in the performance test so that it doesn't time out in PR --- tests/performance/sequence_match.xml | 14 +++++++------- 1 file changed, 7 insertions(+), 7 deletions(-) diff --git a/tests/performance/sequence_match.xml b/tests/performance/sequence_match.xml index dd70c186dc7..537a92b5526 100644 --- a/tests/performance/sequence_match.xml +++ b/tests/performance/sequence_match.xml @@ -1,21 +1,21 @@ - hits_100m_single + hits_10m_single - SELECT COUNT(*) FROM hits_100m_single GROUP BY EventTime HAVING sequenceMatch('(?1)(?t<1)(?2)')(EventTime, Age >= 0, Age = -1) + SELECT COUNT(*) FROM hits_10m_single GROUP BY EventTime HAVING sequenceMatch('(?1)(?t<1)(?2)')(EventTime, Age >= 0, Age = -1) - SELECT 1 FROM hits_100m_single WHERE RefererCategories != [] GROUP BY ClientIP, RequestNum HAVING + SELECT 1 FROM hits_10m_single WHERE RefererCategories != [] GROUP BY ClientIP, RequestNum HAVING sequenceMatch('(?1)(?t>1000)(?3)')( EventTime, hasAny(RefererCategories, [9]), hasAny(RefererCategories, [3849, 2, 3, 4, 5, 6, 7]), hasAll(RefererCategories, [1, 9]), hasAny(RefererCategories, [1, 2326, 5496])) - SELECT 1 FROM hits_100m_single WHERE RefererCategories != [] GROUP BY ClientIP, RequestNum HAVING + SELECT 1 FROM hits_10m_single WHERE RefererCategories != [] GROUP BY ClientIP, RequestNum HAVING sequenceMatch('(?1)(?t<10000)(?2)')( EventTime, hasAny(RefererCategories, [3849, 2, 3, 4, 5, 6, 7]), hasAny(RefererCategories, [1, 2])) - SELECT 1 FROM hits_100m_single WHERE RefererCategories != [] GROUP BY ClientIP, RequestNum HAVING + SELECT 1 FROM hits_10m_single WHERE RefererCategories != [] GROUP BY ClientIP, RequestNum HAVING sequenceMatch('(?1)(?3)(?1)(?3)')( EventTime, hasAny(RefererCategories, [9]), hasAny(RefererCategories, [3849, 2, 3, 4, 5, 6, 7]), hasAll(RefererCategories, [1, 9]), hasAny(RefererCategories, [1, 2326, 5496])) - SELECT 1 FROM hits_100m_single WHERE RefererCategories != [] GROUP BY ClientIP, RequestNum HAVING + SELECT 1 FROM hits_10m_single WHERE RefererCategories != [] GROUP BY ClientIP, RequestNum HAVING sequenceMatch('(?1)(?2)(?1)(?2)(?1)')( EventTime, hasAny(RefererCategories, [3849, 2, 3, 4, 5, 6, 7]), hasAny(RefererCategories, [1, 2])) - \ No newline at end of file + From c60e935830b1d8891a069b167917130b8d080e22 Mon Sep 17 00:00:00 2001 From: Jakub Kuklis Date: Thu, 26 Aug 2021 16:42:39 +0200 Subject: [PATCH 11/12] Adding Format Null to performance test queries, style improvement for the test, additional comments in the code --- .../AggregateFunctionSequenceMatch.h | 2 + tests/performance/sequence_match.xml | 39 ++++++++++++++----- 2 files changed, 32 insertions(+), 9 deletions(-) diff --git a/src/AggregateFunctions/AggregateFunctionSequenceMatch.h b/src/AggregateFunctions/AggregateFunctionSequenceMatch.h index 1f0227a9b6f..5dfe820b6be 100644 --- a/src/AggregateFunctions/AggregateFunctionSequenceMatch.h +++ b/src/AggregateFunctions/AggregateFunctionSequenceMatch.h @@ -48,6 +48,7 @@ struct AggregateFunctionSequenceMatchData final bool sorted = true; PODArrayWithStackMemory events_list; + /// sequenceMatch conditions met at least once in events_list std::bitset conditions_met; void add(const Timestamp timestamp, const Events & events) @@ -599,6 +600,7 @@ private: protected: /// `True` if the parsed pattern contains time assertions (?t...), `false` otherwise. bool pattern_has_time; + /// sequenceMatch conditions met at least once in the pattern std::bitset conditions_in_pattern; private: diff --git a/tests/performance/sequence_match.xml b/tests/performance/sequence_match.xml index 537a92b5526..8f2008d30fc 100644 --- a/tests/performance/sequence_match.xml +++ b/tests/performance/sequence_match.xml @@ -3,19 +3,40 @@ hits_10m_single - SELECT COUNT(*) FROM hits_10m_single GROUP BY EventTime HAVING sequenceMatch('(?1)(?t<1)(?2)')(EventTime, Age >= 0, Age = -1) + + + SELECT 1 FROM hits_10m_single GROUP BY EventTime HAVING + sequenceMatch('(?1)(?t<1)(?2)')( + EventTime, Age >= 0, Age = -1) + FORMAT Null + - SELECT 1 FROM hits_10m_single WHERE RefererCategories != [] GROUP BY ClientIP, RequestNum HAVING + + + SELECT 1 FROM hits_10m_single WHERE RefererCategories != [] GROUP BY ClientIP, RequestNum HAVING sequenceMatch('(?1)(?t>1000)(?3)')( - EventTime, hasAny(RefererCategories, [9]), hasAny(RefererCategories, [3849, 2, 3, 4, 5, 6, 7]), hasAll(RefererCategories, [1, 9]), hasAny(RefererCategories, [1, 2326, 5496])) - SELECT 1 FROM hits_10m_single WHERE RefererCategories != [] GROUP BY ClientIP, RequestNum HAVING + EventTime, hasAny(RefererCategories, [9]), hasAny(RefererCategories, [3849, 2, 3, 4, 5, 6, 7]), hasAll(RefererCategories, [1, 9]), hasAny(RefererCategories, [1, 2326, 5496])) + FORMAT Null + + + SELECT 1 FROM hits_10m_single WHERE RefererCategories != [] GROUP BY ClientIP, RequestNum HAVING sequenceMatch('(?1)(?t<10000)(?2)')( - EventTime, hasAny(RefererCategories, [3849, 2, 3, 4, 5, 6, 7]), hasAny(RefererCategories, [1, 2])) - SELECT 1 FROM hits_10m_single WHERE RefererCategories != [] GROUP BY ClientIP, RequestNum HAVING + EventTime, hasAny(RefererCategories, [3849, 2, 3, 4, 5, 6, 7]), hasAny(RefererCategories, [1, 2])) + FORMAT Null + + + + + SELECT 1 FROM hits_10m_single WHERE RefererCategories != [] GROUP BY ClientIP, RequestNum HAVING sequenceMatch('(?1)(?3)(?1)(?3)')( - EventTime, hasAny(RefererCategories, [9]), hasAny(RefererCategories, [3849, 2, 3, 4, 5, 6, 7]), hasAll(RefererCategories, [1, 9]), hasAny(RefererCategories, [1, 2326, 5496])) - SELECT 1 FROM hits_10m_single WHERE RefererCategories != [] GROUP BY ClientIP, RequestNum HAVING + EventTime, hasAny(RefererCategories, [9]), hasAny(RefererCategories, [3849, 2, 3, 4, 5, 6, 7]), hasAll(RefererCategories, [1, 9]), hasAny(RefererCategories, [1, 2326, 5496])) + FORMAT Null + + + SELECT 1 FROM hits_10m_single WHERE RefererCategories != [] GROUP BY ClientIP, RequestNum HAVING sequenceMatch('(?1)(?2)(?1)(?2)(?1)')( - EventTime, hasAny(RefererCategories, [3849, 2, 3, 4, 5, 6, 7]), hasAny(RefererCategories, [1, 2])) + EventTime, hasAny(RefererCategories, [3849, 2, 3, 4, 5, 6, 7]), hasAny(RefererCategories, [1, 2])) + FORMAT Null + From 7e3e0500034f6d75d8589ae6fb26edf421613661 Mon Sep 17 00:00:00 2001 From: Jakub Kuklis Date: Fri, 27 Aug 2021 10:14:17 +0200 Subject: [PATCH 12/12] Changing the tests to use mainly test.hits dataset, as hits_10[0]m_single datasets have a slightly different schema --- tests/performance/sequence_match.xml | 47 +++++++++++++++++++++++++--- 1 file changed, 42 insertions(+), 5 deletions(-) diff --git a/tests/performance/sequence_match.xml b/tests/performance/sequence_match.xml index 8f2008d30fc..35a2734df60 100644 --- a/tests/performance/sequence_match.xml +++ b/tests/performance/sequence_match.xml @@ -1,25 +1,62 @@ hits_10m_single + test.hits - + + + SELECT 1 FROM hits_10m_single GROUP BY EventTime HAVING + sequenceMatch('(?1)(?t<1)(?2)')( + EventTime, Age BETWEEN 20 AND 30, Age BETWEEN 35 AND 50) + FORMAT Null + + + SELECT 1 FROM test.hits GROUP BY EventTime HAVING + sequenceMatch('(?1)(?t<1)(?2)')( + EventTime, Age BETWEEN 20 AND 30, Age BETWEEN 35 AND 50) + FORMAT Null + + + + + SELECT 1 FROM hits_10m_single GROUP BY EventTime HAVING + sequenceMatch('(?1)(?t<1)(?2)')( + EventTime, Age BETWEEN 20 AND 30, Age BETWEEN 35 AND 50, Age >= 0) + FORMAT Null + + + SELECT 1 FROM test.hits GROUP BY EventTime HAVING + sequenceMatch('(?1)(?t<1)(?2)')( + EventTime, Age BETWEEN 20 AND 30, Age BETWEEN 35 AND 50, Age >= 0) + FORMAT Null + + + SELECT 1 FROM hits_10m_single GROUP BY EventTime HAVING sequenceMatch('(?1)(?t<1)(?2)')( EventTime, Age >= 0, Age = -1) FORMAT Null + + SELECT 1 FROM test.hits GROUP BY EventTime HAVING + sequenceMatch('(?1)(?t<1)(?2)')( + EventTime, Age >= 0, Age = -1) + FORMAT Null + + + - SELECT 1 FROM hits_10m_single WHERE RefererCategories != [] GROUP BY ClientIP, RequestNum HAVING + SELECT 1 FROM test.hits WHERE RefererCategories != [] GROUP BY ClientIP, RequestNum HAVING sequenceMatch('(?1)(?t>1000)(?3)')( EventTime, hasAny(RefererCategories, [9]), hasAny(RefererCategories, [3849, 2, 3, 4, 5, 6, 7]), hasAll(RefererCategories, [1, 9]), hasAny(RefererCategories, [1, 2326, 5496])) FORMAT Null - SELECT 1 FROM hits_10m_single WHERE RefererCategories != [] GROUP BY ClientIP, RequestNum HAVING + SELECT 1 FROM test.hits WHERE RefererCategories != [] GROUP BY ClientIP, RequestNum HAVING sequenceMatch('(?1)(?t<10000)(?2)')( EventTime, hasAny(RefererCategories, [3849, 2, 3, 4, 5, 6, 7]), hasAny(RefererCategories, [1, 2])) FORMAT Null @@ -27,13 +64,13 @@ - SELECT 1 FROM hits_10m_single WHERE RefererCategories != [] GROUP BY ClientIP, RequestNum HAVING + SELECT 1 FROM test.hits WHERE RefererCategories != [] GROUP BY ClientIP, RequestNum HAVING sequenceMatch('(?1)(?3)(?1)(?3)')( EventTime, hasAny(RefererCategories, [9]), hasAny(RefererCategories, [3849, 2, 3, 4, 5, 6, 7]), hasAll(RefererCategories, [1, 9]), hasAny(RefererCategories, [1, 2326, 5496])) FORMAT Null - SELECT 1 FROM hits_10m_single WHERE RefererCategories != [] GROUP BY ClientIP, RequestNum HAVING + SELECT 1 FROM test.hits WHERE RefererCategories != [] GROUP BY ClientIP, RequestNum HAVING sequenceMatch('(?1)(?2)(?1)(?2)(?1)')( EventTime, hasAny(RefererCategories, [3849, 2, 3, 4, 5, 6, 7]), hasAny(RefererCategories, [1, 2])) FORMAT Null