mirror of
https://github.com/ClickHouse/ClickHouse.git
synced 2024-11-27 18:12:02 +00:00
Merge pull request #27729 from ContentSquare/sequenceMatchQuickCheck
Sequence match quick check
This commit is contained in:
commit
95fd6197dc
@ -48,6 +48,8 @@ struct AggregateFunctionSequenceMatchData final
|
|||||||
|
|
||||||
bool sorted = true;
|
bool sorted = true;
|
||||||
PODArrayWithStackMemory<TimestampEvents, 64> events_list;
|
PODArrayWithStackMemory<TimestampEvents, 64> events_list;
|
||||||
|
/// sequenceMatch conditions met at least once in events_list
|
||||||
|
std::bitset<max_events> conditions_met;
|
||||||
|
|
||||||
void add(const Timestamp timestamp, const Events & events)
|
void add(const Timestamp timestamp, const Events & events)
|
||||||
{
|
{
|
||||||
@ -56,6 +58,7 @@ struct AggregateFunctionSequenceMatchData final
|
|||||||
{
|
{
|
||||||
events_list.emplace_back(timestamp, events);
|
events_list.emplace_back(timestamp, events);
|
||||||
sorted = false;
|
sorted = false;
|
||||||
|
conditions_met |= events;
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
@ -64,29 +67,9 @@ struct AggregateFunctionSequenceMatchData final
|
|||||||
if (other.events_list.empty())
|
if (other.events_list.empty())
|
||||||
return;
|
return;
|
||||||
|
|
||||||
const auto size = events_list.size();
|
|
||||||
|
|
||||||
events_list.insert(std::begin(other.events_list), std::end(other.events_list));
|
events_list.insert(std::begin(other.events_list), std::end(other.events_list));
|
||||||
|
sorted = false;
|
||||||
/// either sort whole container or do so partially merging ranges afterwards
|
conditions_met |= other.conditions_met;
|
||||||
if (!sorted && !other.sorted)
|
|
||||||
std::sort(std::begin(events_list), std::end(events_list), Comparator{});
|
|
||||||
else
|
|
||||||
{
|
|
||||||
const auto begin = std::begin(events_list);
|
|
||||||
const auto middle = std::next(begin, size);
|
|
||||||
const auto end = std::end(events_list);
|
|
||||||
|
|
||||||
if (!sorted)
|
|
||||||
std::sort(begin, middle, Comparator{});
|
|
||||||
|
|
||||||
if (!other.sorted)
|
|
||||||
std::sort(middle, end, Comparator{});
|
|
||||||
|
|
||||||
std::inplace_merge(begin, middle, end, Comparator{});
|
|
||||||
}
|
|
||||||
|
|
||||||
sorted = true;
|
|
||||||
}
|
}
|
||||||
|
|
||||||
void sort()
|
void sort()
|
||||||
@ -290,6 +273,7 @@ private:
|
|||||||
dfa_states.back().transition = DFATransition::SpecificEvent;
|
dfa_states.back().transition = DFATransition::SpecificEvent;
|
||||||
dfa_states.back().event = event_number - 1;
|
dfa_states.back().event = event_number - 1;
|
||||||
dfa_states.emplace_back();
|
dfa_states.emplace_back();
|
||||||
|
conditions_in_pattern.set(event_number - 1);
|
||||||
}
|
}
|
||||||
|
|
||||||
if (!match(")"))
|
if (!match(")"))
|
||||||
@ -518,6 +502,64 @@ protected:
|
|||||||
return action_it == action_end;
|
return action_it == action_end;
|
||||||
}
|
}
|
||||||
|
|
||||||
|
/// Splits the pattern into deterministic parts separated by non-deterministic fragments
|
||||||
|
/// (time constraints and Kleene stars), and tries to match the deterministic parts in their specified order,
|
||||||
|
/// ignoring the non-deterministic fragments.
|
||||||
|
/// This function can quickly check that a full match is not possible if some deterministic fragment is missing.
|
||||||
|
template <typename EventEntry>
|
||||||
|
bool couldMatchDeterministicParts(const EventEntry events_begin, const EventEntry events_end, bool limit_iterations = true) const
|
||||||
|
{
|
||||||
|
size_t events_processed = 0;
|
||||||
|
auto events_it = events_begin;
|
||||||
|
|
||||||
|
const auto actions_end = std::end(actions);
|
||||||
|
auto actions_it = std::begin(actions);
|
||||||
|
auto det_part_begin = actions_it;
|
||||||
|
|
||||||
|
auto match_deterministic_part = [&events_it, events_end, &events_processed, det_part_begin, actions_it, limit_iterations]()
|
||||||
|
{
|
||||||
|
auto events_it_init = events_it;
|
||||||
|
auto det_part_it = det_part_begin;
|
||||||
|
|
||||||
|
while (det_part_it != actions_it && events_it != events_end)
|
||||||
|
{
|
||||||
|
/// matching any event
|
||||||
|
if (det_part_it->type == PatternActionType::AnyEvent)
|
||||||
|
++events_it, ++det_part_it;
|
||||||
|
|
||||||
|
/// matching specific event
|
||||||
|
else
|
||||||
|
{
|
||||||
|
if (events_it->second.test(det_part_it->extra))
|
||||||
|
++events_it, ++det_part_it;
|
||||||
|
|
||||||
|
/// abandon current matching, try to match the deterministic fragment further in the list
|
||||||
|
else
|
||||||
|
{
|
||||||
|
events_it = ++events_it_init;
|
||||||
|
det_part_it = det_part_begin;
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
if (limit_iterations && ++events_processed > sequence_match_max_iterations)
|
||||||
|
throw Exception{"Pattern application proves too difficult, exceeding max iterations (" + toString(sequence_match_max_iterations) + ")",
|
||||||
|
ErrorCodes::TOO_SLOW};
|
||||||
|
}
|
||||||
|
|
||||||
|
return det_part_it == actions_it;
|
||||||
|
};
|
||||||
|
|
||||||
|
for (; actions_it != actions_end; ++actions_it)
|
||||||
|
if (actions_it->type != PatternActionType::SpecificEvent && actions_it->type != PatternActionType::AnyEvent)
|
||||||
|
{
|
||||||
|
if (!match_deterministic_part())
|
||||||
|
return false;
|
||||||
|
det_part_begin = std::next(actions_it);
|
||||||
|
}
|
||||||
|
|
||||||
|
return match_deterministic_part();
|
||||||
|
}
|
||||||
|
|
||||||
private:
|
private:
|
||||||
enum class DFATransition : char
|
enum class DFATransition : char
|
||||||
{
|
{
|
||||||
@ -558,6 +600,8 @@ private:
|
|||||||
protected:
|
protected:
|
||||||
/// `True` if the parsed pattern contains time assertions (?t...), `false` otherwise.
|
/// `True` if the parsed pattern contains time assertions (?t...), `false` otherwise.
|
||||||
bool pattern_has_time;
|
bool pattern_has_time;
|
||||||
|
/// sequenceMatch conditions referenced at least once in the pattern
|
||||||
|
std::bitset<max_events> conditions_in_pattern;
|
||||||
|
|
||||||
private:
|
private:
|
||||||
std::string pattern;
|
std::string pattern;
|
||||||
@ -584,6 +628,12 @@ public:
|
|||||||
|
|
||||||
void insertResultInto(AggregateDataPtr __restrict place, IColumn & to, Arena *) const override
|
void insertResultInto(AggregateDataPtr __restrict place, IColumn & to, Arena *) const override
|
||||||
{
|
{
|
||||||
|
auto & output = assert_cast<ColumnUInt8 &>(to).getData();
|
||||||
|
if ((this->conditions_in_pattern & this->data(place).conditions_met) != this->conditions_in_pattern)
|
||||||
|
{
|
||||||
|
output.push_back(false);
|
||||||
|
return;
|
||||||
|
}
|
||||||
this->data(place).sort();
|
this->data(place).sort();
|
||||||
|
|
||||||
const auto & data_ref = this->data(place);
|
const auto & data_ref = this->data(place);
|
||||||
@ -592,8 +642,10 @@ public:
|
|||||||
const auto events_end = std::end(data_ref.events_list);
|
const auto events_end = std::end(data_ref.events_list);
|
||||||
auto events_it = events_begin;
|
auto events_it = events_begin;
|
||||||
|
|
||||||
bool match = this->pattern_has_time ? this->backtrackingMatch(events_it, events_end) : this->dfaMatch(events_it, events_end);
|
bool match = (this->pattern_has_time ?
|
||||||
assert_cast<ColumnUInt8 &>(to).getData().push_back(match);
|
(this->couldMatchDeterministicParts(events_begin, events_end) && this->backtrackingMatch(events_it, events_end)) :
|
||||||
|
this->dfaMatch(events_it, events_end));
|
||||||
|
output.push_back(match);
|
||||||
}
|
}
|
||||||
};
|
};
|
||||||
|
|
||||||
@ -614,8 +666,14 @@ public:
|
|||||||
|
|
||||||
void insertResultInto(AggregateDataPtr __restrict place, IColumn & to, Arena *) const override
|
void insertResultInto(AggregateDataPtr __restrict place, IColumn & to, Arena *) const override
|
||||||
{
|
{
|
||||||
|
auto & output = assert_cast<ColumnUInt64 &>(to).getData();
|
||||||
|
if ((this->conditions_in_pattern & this->data(place).conditions_met) != this->conditions_in_pattern)
|
||||||
|
{
|
||||||
|
output.push_back(0);
|
||||||
|
return;
|
||||||
|
}
|
||||||
this->data(place).sort();
|
this->data(place).sort();
|
||||||
assert_cast<ColumnUInt64 &>(to).getData().push_back(count(place));
|
output.push_back(count(place));
|
||||||
}
|
}
|
||||||
|
|
||||||
private:
|
private:
|
||||||
@ -628,8 +686,12 @@ private:
|
|||||||
auto events_it = events_begin;
|
auto events_it = events_begin;
|
||||||
|
|
||||||
size_t count = 0;
|
size_t count = 0;
|
||||||
|
// check if there is a chance of matching the sequence at least once
|
||||||
|
if (this->couldMatchDeterministicParts(events_begin, events_end))
|
||||||
|
{
|
||||||
while (events_it != events_end && this->backtrackingMatch(events_it, events_end))
|
while (events_it != events_end && this->backtrackingMatch(events_it, events_end))
|
||||||
++count;
|
++count;
|
||||||
|
}
|
||||||
|
|
||||||
return count;
|
return count;
|
||||||
}
|
}
|
||||||
|
79
tests/performance/sequence_match.xml
Normal file
79
tests/performance/sequence_match.xml
Normal file
@ -0,0 +1,79 @@
|
|||||||
|
<test>
|
||||||
|
<preconditions>
|
||||||
|
<table_exists>hits_10m_single</table_exists>
|
||||||
|
<table_exists>test.hits</table_exists>
|
||||||
|
</preconditions>
|
||||||
|
|
||||||
|
<!-- Queries with some matching rows -->
|
||||||
|
<query>
|
||||||
|
SELECT 1 FROM hits_10m_single GROUP BY EventTime HAVING
|
||||||
|
sequenceMatch('(?1)(?t<1)(?2)')(
|
||||||
|
EventTime, Age BETWEEN 20 AND 30, Age BETWEEN 35 AND 50)
|
||||||
|
FORMAT Null
|
||||||
|
</query>
|
||||||
|
<query>
|
||||||
|
SELECT 1 FROM test.hits GROUP BY EventTime HAVING
|
||||||
|
sequenceMatch('(?1)(?t<1)(?2)')(
|
||||||
|
EventTime, Age BETWEEN 20 AND 30, Age BETWEEN 35 AND 50)
|
||||||
|
FORMAT Null
|
||||||
|
</query>
|
||||||
|
|
||||||
|
<!-- Same queries as above, but with all rows matching the last condition -->
|
||||||
|
<query>
|
||||||
|
SELECT 1 FROM hits_10m_single GROUP BY EventTime HAVING
|
||||||
|
sequenceMatch('(?1)(?t<1)(?2)')(
|
||||||
|
EventTime, Age BETWEEN 20 AND 30, Age BETWEEN 35 AND 50, Age >= 0)
|
||||||
|
FORMAT Null
|
||||||
|
</query>
|
||||||
|
<query>
|
||||||
|
SELECT 1 FROM test.hits GROUP BY EventTime HAVING
|
||||||
|
sequenceMatch('(?1)(?t<1)(?2)')(
|
||||||
|
EventTime, Age BETWEEN 20 AND 30, Age BETWEEN 35 AND 50, Age >= 0)
|
||||||
|
FORMAT Null
|
||||||
|
</query>
|
||||||
|
|
||||||
|
<!-- Queries with no rows matching (Age is never negative) -->
|
||||||
|
<query>
|
||||||
|
SELECT 1 FROM hits_10m_single GROUP BY EventTime HAVING
|
||||||
|
sequenceMatch('(?1)(?t<1)(?2)')(
|
||||||
|
EventTime, Age >= 0, Age = -1)
|
||||||
|
FORMAT Null
|
||||||
|
</query>
|
||||||
|
<query>
|
||||||
|
SELECT 1 FROM test.hits GROUP BY EventTime HAVING
|
||||||
|
sequenceMatch('(?1)(?t<1)(?2)')(
|
||||||
|
EventTime, Age >= 0, Age = -1)
|
||||||
|
FORMAT Null
|
||||||
|
</query>
|
||||||
|
|
||||||
|
<!-- Using array conditions in further tests (only available for test.hits) for good diversity of matched rows percentage -->
|
||||||
|
|
||||||
|
<!-- Queries with time constraints -->
|
||||||
|
<query>
|
||||||
|
SELECT 1 FROM test.hits WHERE RefererCategories != [] GROUP BY ClientIP, RequestNum HAVING
|
||||||
|
sequenceMatch('(?1)(?t>1000)(?3)')(
|
||||||
|
EventTime, hasAny(RefererCategories, [9]), hasAny(RefererCategories, [3849, 2, 3, 4, 5, 6, 7]), hasAll(RefererCategories, [1, 9]), hasAny(RefererCategories, [1, 2326, 5496]))
|
||||||
|
FORMAT Null
|
||||||
|
</query>
|
||||||
|
<query>
|
||||||
|
SELECT 1 FROM test.hits WHERE RefererCategories != [] GROUP BY ClientIP, RequestNum HAVING
|
||||||
|
sequenceMatch('(?1)(?t<10000)(?2)')(
|
||||||
|
EventTime, hasAny(RefererCategories, [3849, 2, 3, 4, 5, 6, 7]), hasAny(RefererCategories, [1, 2]))
|
||||||
|
FORMAT Null
|
||||||
|
</query>
|
||||||
|
|
||||||
|
<!-- Queries without time constraints -->
|
||||||
|
<query>
|
||||||
|
SELECT 1 FROM test.hits WHERE RefererCategories != [] GROUP BY ClientIP, RequestNum HAVING
|
||||||
|
sequenceMatch('(?1)(?3)(?1)(?3)')(
|
||||||
|
EventTime, hasAny(RefererCategories, [9]), hasAny(RefererCategories, [3849, 2, 3, 4, 5, 6, 7]), hasAll(RefererCategories, [1, 9]), hasAny(RefererCategories, [1, 2326, 5496]))
|
||||||
|
FORMAT Null
|
||||||
|
</query>
|
||||||
|
<query>
|
||||||
|
SELECT 1 FROM test.hits WHERE RefererCategories != [] GROUP BY ClientIP, RequestNum HAVING
|
||||||
|
sequenceMatch('(?1)(?2)(?1)(?2)(?1)')(
|
||||||
|
EventTime, hasAny(RefererCategories, [3849, 2, 3, 4, 5, 6, 7]), hasAny(RefererCategories, [1, 2]))
|
||||||
|
FORMAT Null
|
||||||
|
</query>
|
||||||
|
|
||||||
|
</test>
|
Loading…
Reference in New Issue
Block a user