mirror of
https://github.com/ClickHouse/ClickHouse.git
synced 2024-11-24 00:22:29 +00:00
Merge pull request #27729 from ContentSquare/sequenceMatchQuickCheck
Sequence match quick check
This commit is contained in:
commit
95fd6197dc
@ -48,6 +48,8 @@ struct AggregateFunctionSequenceMatchData final
|
||||
|
||||
bool sorted = true;
|
||||
PODArrayWithStackMemory<TimestampEvents, 64> events_list;
|
||||
/// sequenceMatch conditions met at least once in events_list
|
||||
std::bitset<max_events> conditions_met;
|
||||
|
||||
void add(const Timestamp timestamp, const Events & events)
|
||||
{
|
||||
@ -56,6 +58,7 @@ struct AggregateFunctionSequenceMatchData final
|
||||
{
|
||||
events_list.emplace_back(timestamp, events);
|
||||
sorted = false;
|
||||
conditions_met |= events;
|
||||
}
|
||||
}
|
||||
|
||||
@ -64,29 +67,9 @@ struct AggregateFunctionSequenceMatchData final
|
||||
if (other.events_list.empty())
|
||||
return;
|
||||
|
||||
const auto size = events_list.size();
|
||||
|
||||
events_list.insert(std::begin(other.events_list), std::end(other.events_list));
|
||||
|
||||
/// either sort whole container or do so partially merging ranges afterwards
|
||||
if (!sorted && !other.sorted)
|
||||
std::sort(std::begin(events_list), std::end(events_list), Comparator{});
|
||||
else
|
||||
{
|
||||
const auto begin = std::begin(events_list);
|
||||
const auto middle = std::next(begin, size);
|
||||
const auto end = std::end(events_list);
|
||||
|
||||
if (!sorted)
|
||||
std::sort(begin, middle, Comparator{});
|
||||
|
||||
if (!other.sorted)
|
||||
std::sort(middle, end, Comparator{});
|
||||
|
||||
std::inplace_merge(begin, middle, end, Comparator{});
|
||||
}
|
||||
|
||||
sorted = true;
|
||||
sorted = false;
|
||||
conditions_met |= other.conditions_met;
|
||||
}
|
||||
|
||||
void sort()
|
||||
@ -290,6 +273,7 @@ private:
|
||||
dfa_states.back().transition = DFATransition::SpecificEvent;
|
||||
dfa_states.back().event = event_number - 1;
|
||||
dfa_states.emplace_back();
|
||||
conditions_in_pattern.set(event_number - 1);
|
||||
}
|
||||
|
||||
if (!match(")"))
|
||||
@ -518,6 +502,64 @@ protected:
|
||||
return action_it == action_end;
|
||||
}
|
||||
|
||||
/// Splits the pattern into deterministic parts separated by non-deterministic fragments
|
||||
/// (time constraints and Kleene stars), and tries to match the deterministic parts in their specified order,
|
||||
/// ignoring the non-deterministic fragments.
|
||||
/// This function can quickly check that a full match is not possible if some deterministic fragment is missing.
|
||||
template <typename EventEntry>
|
||||
bool couldMatchDeterministicParts(const EventEntry events_begin, const EventEntry events_end, bool limit_iterations = true) const
|
||||
{
|
||||
size_t events_processed = 0;
|
||||
auto events_it = events_begin;
|
||||
|
||||
const auto actions_end = std::end(actions);
|
||||
auto actions_it = std::begin(actions);
|
||||
auto det_part_begin = actions_it;
|
||||
|
||||
auto match_deterministic_part = [&events_it, events_end, &events_processed, det_part_begin, actions_it, limit_iterations]()
|
||||
{
|
||||
auto events_it_init = events_it;
|
||||
auto det_part_it = det_part_begin;
|
||||
|
||||
while (det_part_it != actions_it && events_it != events_end)
|
||||
{
|
||||
/// matching any event
|
||||
if (det_part_it->type == PatternActionType::AnyEvent)
|
||||
++events_it, ++det_part_it;
|
||||
|
||||
/// matching specific event
|
||||
else
|
||||
{
|
||||
if (events_it->second.test(det_part_it->extra))
|
||||
++events_it, ++det_part_it;
|
||||
|
||||
/// abandon current matching, try to match the deterministic fragment further in the list
|
||||
else
|
||||
{
|
||||
events_it = ++events_it_init;
|
||||
det_part_it = det_part_begin;
|
||||
}
|
||||
}
|
||||
|
||||
if (limit_iterations && ++events_processed > sequence_match_max_iterations)
|
||||
throw Exception{"Pattern application proves too difficult, exceeding max iterations (" + toString(sequence_match_max_iterations) + ")",
|
||||
ErrorCodes::TOO_SLOW};
|
||||
}
|
||||
|
||||
return det_part_it == actions_it;
|
||||
};
|
||||
|
||||
for (; actions_it != actions_end; ++actions_it)
|
||||
if (actions_it->type != PatternActionType::SpecificEvent && actions_it->type != PatternActionType::AnyEvent)
|
||||
{
|
||||
if (!match_deterministic_part())
|
||||
return false;
|
||||
det_part_begin = std::next(actions_it);
|
||||
}
|
||||
|
||||
return match_deterministic_part();
|
||||
}
|
||||
|
||||
private:
|
||||
enum class DFATransition : char
|
||||
{
|
||||
@ -558,6 +600,8 @@ private:
|
||||
protected:
|
||||
/// `True` if the parsed pattern contains time assertions (?t...), `false` otherwise.
|
||||
bool pattern_has_time;
|
||||
/// sequenceMatch conditions met at least once in the pattern
|
||||
std::bitset<max_events> conditions_in_pattern;
|
||||
|
||||
private:
|
||||
std::string pattern;
|
||||
@ -584,6 +628,12 @@ public:
|
||||
|
||||
void insertResultInto(AggregateDataPtr __restrict place, IColumn & to, Arena *) const override
|
||||
{
|
||||
auto & output = assert_cast<ColumnUInt8 &>(to).getData();
|
||||
if ((this->conditions_in_pattern & this->data(place).conditions_met) != this->conditions_in_pattern)
|
||||
{
|
||||
output.push_back(false);
|
||||
return;
|
||||
}
|
||||
this->data(place).sort();
|
||||
|
||||
const auto & data_ref = this->data(place);
|
||||
@ -592,8 +642,10 @@ public:
|
||||
const auto events_end = std::end(data_ref.events_list);
|
||||
auto events_it = events_begin;
|
||||
|
||||
bool match = this->pattern_has_time ? this->backtrackingMatch(events_it, events_end) : this->dfaMatch(events_it, events_end);
|
||||
assert_cast<ColumnUInt8 &>(to).getData().push_back(match);
|
||||
bool match = (this->pattern_has_time ?
|
||||
(this->couldMatchDeterministicParts(events_begin, events_end) && this->backtrackingMatch(events_it, events_end)) :
|
||||
this->dfaMatch(events_it, events_end));
|
||||
output.push_back(match);
|
||||
}
|
||||
};
|
||||
|
||||
@ -614,8 +666,14 @@ public:
|
||||
|
||||
/// Appends the result of `count(place)` for the aggregation state at `place` to the result column `to`.
/// Exactly one value is appended per call.
void insertResultInto(AggregateDataPtr __restrict place, IColumn & to, Arena *) const override
{
    auto & output = assert_cast<ColumnUInt64 &>(to).getData();

    /// Quick rejection: if some condition used in the pattern was never met by any event of this state,
    /// the result is zero and neither sorting nor matching is needed.
    if ((this->conditions_in_pattern & this->data(place).conditions_met) != this->conditions_in_pattern)
    {
        output.push_back(0);
        return;
    }

    this->data(place).sort();

    /// Push the count once; a second push_back here would add an extra row to the result column
    /// and run the matching twice.
    output.push_back(count(place));
}
|
||||
|
||||
private:
|
||||
@ -628,8 +686,12 @@ private:
|
||||
auto events_it = events_begin;
|
||||
|
||||
size_t count = 0;
|
||||
while (events_it != events_end && this->backtrackingMatch(events_it, events_end))
|
||||
++count;
|
||||
// check if there is a chance of matching the sequence at least once
|
||||
if (this->couldMatchDeterministicParts(events_begin, events_end))
|
||||
{
|
||||
while (events_it != events_end && this->backtrackingMatch(events_it, events_end))
|
||||
++count;
|
||||
}
|
||||
|
||||
return count;
|
||||
}
|
||||
|
79
tests/performance/sequence_match.xml
Normal file
79
tests/performance/sequence_match.xml
Normal file
@ -0,0 +1,79 @@
|
||||
<test>
|
||||
<preconditions>
|
||||
<table_exists>hits_10m_single</table_exists>
|
||||
<table_exists>test.hits</table_exists>
|
||||
</preconditions>
|
||||
|
||||
<!-- Queries with some matching rows -->
|
||||
<query>
|
||||
SELECT 1 FROM hits_10m_single GROUP BY EventTime HAVING
|
||||
sequenceMatch('(?1)(?t<1)(?2)')(
|
||||
EventTime, Age BETWEEN 20 AND 30, Age BETWEEN 35 AND 50)
|
||||
FORMAT Null
|
||||
</query>
|
||||
<query>
|
||||
SELECT 1 FROM test.hits GROUP BY EventTime HAVING
|
||||
sequenceMatch('(?1)(?t<1)(?2)')(
|
||||
EventTime, Age BETWEEN 20 AND 30, Age BETWEEN 35 AND 50)
|
||||
FORMAT Null
|
||||
</query>
|
||||
|
||||
<!-- Same queries as above, but with all rows matching the last condition -->
|
||||
<query>
|
||||
SELECT 1 FROM hits_10m_single GROUP BY EventTime HAVING
|
||||
sequenceMatch('(?1)(?t<1)(?2)')(
|
||||
EventTime, Age BETWEEN 20 AND 30, Age BETWEEN 35 AND 50, Age >= 0)
|
||||
FORMAT Null
|
||||
</query>
|
||||
<query>
|
||||
SELECT 1 FROM test.hits GROUP BY EventTime HAVING
|
||||
sequenceMatch('(?1)(?t<1)(?2)')(
|
||||
EventTime, Age BETWEEN 20 AND 30, Age BETWEEN 35 AND 50, Age >= 0)
|
||||
FORMAT Null
|
||||
</query>
|
||||
|
||||
<!-- Queries with no rows matching (Age is never negative) -->
|
||||
<query>
|
||||
SELECT 1 FROM hits_10m_single GROUP BY EventTime HAVING
|
||||
sequenceMatch('(?1)(?t<1)(?2)')(
|
||||
EventTime, Age >= 0, Age = -1)
|
||||
FORMAT Null
|
||||
</query>
|
||||
<query>
|
||||
SELECT 1 FROM test.hits GROUP BY EventTime HAVING
|
||||
sequenceMatch('(?1)(?t<1)(?2)')(
|
||||
EventTime, Age >= 0, Age = -1)
|
||||
FORMAT Null
|
||||
</query>
|
||||
|
||||
<!-- Using array conditions in further tests (only available for test.hits) for good diversity of matched rows percentage -->
|
||||
|
||||
<!-- Queries with time constraints -->
|
||||
<query>
|
||||
SELECT 1 FROM test.hits WHERE RefererCategories != [] GROUP BY ClientIP, RequestNum HAVING
|
||||
sequenceMatch('(?1)(?t>1000)(?3)')(
|
||||
EventTime, hasAny(RefererCategories, [9]), hasAny(RefererCategories, [3849, 2, 3, 4, 5, 6, 7]), hasAll(RefererCategories, [1, 9]), hasAny(RefererCategories, [1, 2326, 5496]))
|
||||
FORMAT Null
|
||||
</query>
|
||||
<query>
|
||||
SELECT 1 FROM test.hits WHERE RefererCategories != [] GROUP BY ClientIP, RequestNum HAVING
|
||||
sequenceMatch('(?1)(?t<10000)(?2)')(
|
||||
EventTime, hasAny(RefererCategories, [3849, 2, 3, 4, 5, 6, 7]), hasAny(RefererCategories, [1, 2]))
|
||||
FORMAT Null
|
||||
</query>
|
||||
|
||||
<!-- Queries without time constraints -->
|
||||
<query>
|
||||
SELECT 1 FROM test.hits WHERE RefererCategories != [] GROUP BY ClientIP, RequestNum HAVING
|
||||
sequenceMatch('(?1)(?3)(?1)(?3)')(
|
||||
EventTime, hasAny(RefererCategories, [9]), hasAny(RefererCategories, [3849, 2, 3, 4, 5, 6, 7]), hasAll(RefererCategories, [1, 9]), hasAny(RefererCategories, [1, 2326, 5496]))
|
||||
FORMAT Null
|
||||
</query>
|
||||
<query>
|
||||
SELECT 1 FROM test.hits WHERE RefererCategories != [] GROUP BY ClientIP, RequestNum HAVING
|
||||
sequenceMatch('(?1)(?2)(?1)(?2)(?1)')(
|
||||
EventTime, hasAny(RefererCategories, [3849, 2, 3, 4, 5, 6, 7]), hasAny(RefererCategories, [1, 2]))
|
||||
FORMAT Null
|
||||
</query>
|
||||
|
||||
</test>
|
Loading…
Reference in New Issue
Block a user