Merge pull request #27729 from ContentSquare/sequenceMatchQuickCheck

Sequence match quick check
This commit is contained in:
Kruglov Pavel 2021-08-30 13:04:23 +03:00 committed by GitHub
commit 95fd6197dc
No known key found for this signature in database
GPG Key ID: 4AEE18F83AFDEB23
2 changed files with 168 additions and 27 deletions

View File

@ -48,6 +48,8 @@ struct AggregateFunctionSequenceMatchData final
bool sorted = true;
PODArrayWithStackMemory<TimestampEvents, 64> events_list;
/// sequenceMatch conditions met at least once in events_list
std::bitset<max_events> conditions_met;
void add(const Timestamp timestamp, const Events & events)
{
@ -56,6 +58,7 @@ struct AggregateFunctionSequenceMatchData final
{
events_list.emplace_back(timestamp, events);
sorted = false;
conditions_met |= events;
}
}
@ -64,29 +67,9 @@ struct AggregateFunctionSequenceMatchData final
if (other.events_list.empty())
return;
const auto size = events_list.size();
events_list.insert(std::begin(other.events_list), std::end(other.events_list));
/// either sort whole container or do so partially merging ranges afterwards
if (!sorted && !other.sorted)
std::sort(std::begin(events_list), std::end(events_list), Comparator{});
else
{
const auto begin = std::begin(events_list);
const auto middle = std::next(begin, size);
const auto end = std::end(events_list);
if (!sorted)
std::sort(begin, middle, Comparator{});
if (!other.sorted)
std::sort(middle, end, Comparator{});
std::inplace_merge(begin, middle, end, Comparator{});
}
sorted = true;
sorted = false;
conditions_met |= other.conditions_met;
}
void sort()
@ -290,6 +273,7 @@ private:
dfa_states.back().transition = DFATransition::SpecificEvent;
dfa_states.back().event = event_number - 1;
dfa_states.emplace_back();
conditions_in_pattern.set(event_number - 1);
}
if (!match(")"))
@ -518,6 +502,64 @@ protected:
return action_it == action_end;
}
/// Quick necessary-condition check used before running the full backtracking match.
///
/// Splits the pattern into deterministic parts (runs of specific-event / any-event actions) separated by
/// non-deterministic fragments (time constraints and Kleene stars), and tries to match the deterministic
/// parts one after another, in their specified order, ignoring the non-deterministic fragments.
///
/// @param events_begin      iterator to the first event to consider
/// @param events_end        iterator past the last event to consider
/// @param limit_iterations  when true, throws TOO_SLOW once more than sequence_match_max_iterations
///                          matching steps have been performed
/// @return `false` if some deterministic part cannot be found after the previous one, i.e. a full match is
///         impossible; `true` if a full match is still possible (it is not guaranteed).
template <typename EventEntry>
bool couldMatchDeterministicParts(const EventEntry events_begin, const EventEntry events_end, bool limit_iterations = true) const
{
    size_t events_processed = 0;
    auto events_it = events_begin;

    const auto actions_end = std::end(actions);
    auto actions_it = std::begin(actions);
    auto det_part_begin = actions_it;

    /// Tries to find the deterministic part [det_part_begin, actions_it) as a contiguous run of events,
    /// starting the search at the current events_it. On success events_it points past the matched run.
    ///
    /// det_part_begin and actions_it MUST be captured by reference: they are advanced by the loop below.
    /// Capturing them by value would freeze both at std::begin(actions), making every part look empty
    /// and turning this whole check into a no-op that always returns true.
    auto match_deterministic_part = [&events_it, events_end, &events_processed, &det_part_begin, &actions_it, limit_iterations]()
    {
        auto events_it_init = events_it;
        auto det_part_it = det_part_begin;

        while (det_part_it != actions_it && events_it != events_end)
        {
            /// matching any event
            if (det_part_it->type == PatternActionType::AnyEvent)
            {
                ++events_it;
                ++det_part_it;
            }
            /// matching specific event
            else if (events_it->second.test(det_part_it->extra))
            {
                ++events_it;
                ++det_part_it;
            }
            /// abandon current matching, try to match the deterministic fragment further in the list
            else
            {
                events_it = ++events_it_init;
                det_part_it = det_part_begin;
            }

            if (limit_iterations && ++events_processed > sequence_match_max_iterations)
                throw Exception{"Pattern application proves too difficult, exceeding max iterations (" + toString(sequence_match_max_iterations) + ")",
                    ErrorCodes::TOO_SLOW};
        }

        /// The part is matched only if every one of its actions has been consumed.
        return det_part_it == actions_it;
    };

    for (; actions_it != actions_end; ++actions_it)
    {
        /// A non-deterministic action closes the deterministic part accumulated so far.
        if (actions_it->type != PatternActionType::SpecificEvent && actions_it->type != PatternActionType::AnyEvent)
        {
            if (!match_deterministic_part())
                return false;
            det_part_begin = std::next(actions_it);
        }
    }

    /// Trailing deterministic part (empty if the pattern ends with a non-deterministic action).
    return match_deterministic_part();
}
private:
enum class DFATransition : char
{
@ -558,6 +600,8 @@ private:
protected:
/// `True` if the parsed pattern contains time assertions (?t...), `false` otherwise.
bool pattern_has_time;
/// sequenceMatch conditions referenced at least once in the pattern
std::bitset<max_events> conditions_in_pattern;
private:
std::string pattern;
@ -584,6 +628,12 @@ public:
void insertResultInto(AggregateDataPtr __restrict place, IColumn & to, Arena *) const override
{
auto & output = assert_cast<ColumnUInt8 &>(to).getData();
if ((this->conditions_in_pattern & this->data(place).conditions_met) != this->conditions_in_pattern)
{
output.push_back(false);
return;
}
this->data(place).sort();
const auto & data_ref = this->data(place);
@ -592,8 +642,10 @@ public:
const auto events_end = std::end(data_ref.events_list);
auto events_it = events_begin;
bool match = this->pattern_has_time ? this->backtrackingMatch(events_it, events_end) : this->dfaMatch(events_it, events_end);
assert_cast<ColumnUInt8 &>(to).getData().push_back(match);
bool match = (this->pattern_has_time ?
(this->couldMatchDeterministicParts(events_begin, events_end) && this->backtrackingMatch(events_it, events_end)) :
this->dfaMatch(events_it, events_end));
output.push_back(match);
}
};
@ -614,8 +666,14 @@ public:
/// Appends the number of pattern matches for the aggregation state `place` to the UInt64 result column `to`.
void insertResultInto(AggregateDataPtr __restrict place, IColumn & to, Arena *) const override
{
auto & output = assert_cast<ColumnUInt64 &>(to).getData();
/// Quick rejection: if some condition used in the pattern was never met by any added event,
/// no match is possible, so report 0 without sorting or running the matcher.
if ((this->conditions_in_pattern & this->data(place).conditions_met) != this->conditions_in_pattern)
{
output.push_back(0);
return;
}
this->data(place).sort();
/// NOTE(review): the next two statements look like the removed and the added form of the same line shown
/// together by the diff view; presumably only the second one exists in the actual file -- confirm.
assert_cast<ColumnUInt64 &>(to).getData().push_back(count(place));
output.push_back(count(place));
}
private:
@ -628,8 +686,12 @@ private:
auto events_it = events_begin;
size_t count = 0;
while (events_it != events_end && this->backtrackingMatch(events_it, events_end))
++count;
// check if there is a chance of matching the sequence at least once
if (this->couldMatchDeterministicParts(events_begin, events_end))
{
while (events_it != events_end && this->backtrackingMatch(events_it, events_end))
++count;
}
return count;
}

View File

@ -0,0 +1,79 @@
<test>
<preconditions>
<table_exists>hits_10m_single</table_exists>
<table_exists>test.hits</table_exists>
</preconditions>
<!-- Queries with some matching rows -->
<query>
SELECT 1 FROM hits_10m_single GROUP BY EventTime HAVING
sequenceMatch('(?1)(?t&lt;1)(?2)')(
EventTime, Age BETWEEN 20 AND 30, Age BETWEEN 35 AND 50)
FORMAT Null
</query>
<query>
SELECT 1 FROM test.hits GROUP BY EventTime HAVING
sequenceMatch('(?1)(?t&lt;1)(?2)')(
EventTime, Age BETWEEN 20 AND 30, Age BETWEEN 35 AND 50)
FORMAT Null
</query>
<!-- Same queries as above, but with all rows matching the last condition -->
<query>
SELECT 1 FROM hits_10m_single GROUP BY EventTime HAVING
sequenceMatch('(?1)(?t&lt;1)(?2)')(
EventTime, Age BETWEEN 20 AND 30, Age BETWEEN 35 AND 50, Age >= 0)
FORMAT Null
</query>
<query>
SELECT 1 FROM test.hits GROUP BY EventTime HAVING
sequenceMatch('(?1)(?t&lt;1)(?2)')(
EventTime, Age BETWEEN 20 AND 30, Age BETWEEN 35 AND 50, Age >= 0)
FORMAT Null
</query>
<!-- Queries with no rows matching (Age is never negative) -->
<query>
SELECT 1 FROM hits_10m_single GROUP BY EventTime HAVING
sequenceMatch('(?1)(?t&lt;1)(?2)')(
EventTime, Age >= 0, Age = -1)
FORMAT Null
</query>
<query>
SELECT 1 FROM test.hits GROUP BY EventTime HAVING
sequenceMatch('(?1)(?t&lt;1)(?2)')(
EventTime, Age >= 0, Age = -1)
FORMAT Null
</query>
<!-- The tests below use array conditions (only available in test.hits) to cover a wide range of matched-row percentages -->
<!-- Queries with time constraints -->
<query>
SELECT 1 FROM test.hits WHERE RefererCategories != [] GROUP BY ClientIP, RequestNum HAVING
sequenceMatch('(?1)(?t&gt;1000)(?3)')(
EventTime, hasAny(RefererCategories, [9]), hasAny(RefererCategories, [3849, 2, 3, 4, 5, 6, 7]), hasAll(RefererCategories, [1, 9]), hasAny(RefererCategories, [1, 2326, 5496]))
FORMAT Null
</query>
<query>
SELECT 1 FROM test.hits WHERE RefererCategories != [] GROUP BY ClientIP, RequestNum HAVING
sequenceMatch('(?1)(?t&lt;10000)(?2)')(
EventTime, hasAny(RefererCategories, [3849, 2, 3, 4, 5, 6, 7]), hasAny(RefererCategories, [1, 2]))
FORMAT Null
</query>
<!-- Queries without time constraints -->
<query>
SELECT 1 FROM test.hits WHERE RefererCategories != [] GROUP BY ClientIP, RequestNum HAVING
sequenceMatch('(?1)(?3)(?1)(?3)')(
EventTime, hasAny(RefererCategories, [9]), hasAny(RefererCategories, [3849, 2, 3, 4, 5, 6, 7]), hasAll(RefererCategories, [1, 9]), hasAny(RefererCategories, [1, 2326, 5496]))
FORMAT Null
</query>
<query>
SELECT 1 FROM test.hits WHERE RefererCategories != [] GROUP BY ClientIP, RequestNum HAVING
sequenceMatch('(?1)(?2)(?1)(?2)(?1)')(
EventTime, hasAny(RefererCategories, [3849, 2, 3, 4, 5, 6, 7]), hasAny(RefererCategories, [1, 2]))
FORMAT Null
</query>
</test>