mirror of
https://github.com/ClickHouse/ClickHouse.git
synced 2024-11-21 15:12:02 +00:00
Merge pull request #22025 from vdimir/window-funncel-strict-increase
Add option strict_increase to windowFunnel
This commit is contained in:
commit
5ada14082f
@ -243,7 +243,7 @@ The function works according to the algorithm:
|
||||
**Syntax**
|
||||
|
||||
``` sql
|
||||
windowFunnel(window, [mode])(timestamp, cond1, cond2, ..., condN)
|
||||
windowFunnel(window, [mode, [mode, ... ]])(timestamp, cond1, cond2, ..., condN)
|
||||
```
|
||||
|
||||
**Arguments**
|
||||
@ -254,8 +254,10 @@ windowFunnel(window, [mode])(timestamp, cond1, cond2, ..., condN)
|
||||
**Parameters**
|
||||
|
||||
- `window` — Length of the sliding window. The unit of `window` depends on the `timestamp` itself and varies. Determined using the expression `timestamp of cond2 <= timestamp of cond1 + window`.
|
||||
- `mode` — It is an optional argument.
|
||||
- `'strict'` — When the `'strict'` is set, the windowFunnel() applies conditions only for the unique values.
|
||||
- `mode` — It is an optional argument. One or more modes can be set.
|
||||
- `'strict'` — If same condition holds for sequence of events then such non-unique events would be skipped.
|
||||
- `'strict_order'` — Don't allow interventions of other events. E.g. in the case of `A->B->D->C`, it stops finding `A->B->C` at the `D` and the max event level is 2.
|
||||
- `'strict_increase'` — Apply conditions only to events with strictly increasing timestamps.
|
||||
|
||||
**Returned value**
|
||||
|
||||
|
@ -243,7 +243,7 @@ SELECT sequenceCount('(?1).*(?2)')(time, number = 1, number = 2) FROM t
|
||||
**Синтаксис**
|
||||
|
||||
``` sql
|
||||
windowFunnel(window, [mode])(timestamp, cond1, cond2, ..., condN)
|
||||
windowFunnel(window, [mode, [mode, ... ]])(timestamp, cond1, cond2, ..., condN)
|
||||
```
|
||||
|
||||
**Аргументы**
|
||||
@ -254,7 +254,10 @@ windowFunnel(window, [mode])(timestamp, cond1, cond2, ..., condN)
|
||||
**Параметры**
|
||||
|
||||
- `window` — ширина скользящего окна по времени. Единица измерения зависит от `timestamp` и может варьироваться. Должно соблюдаться условие `timestamp события cond2 <= timestamp события cond1 + window`.
|
||||
- `mode` — необязательный параметр. Если установлено значение `'strict'`, то функция `windowFunnel()` применяет условия только для уникальных значений.
|
||||
- `mode` — необязательный параметр. Может быть установленно несколько значений одновременно.
|
||||
- `'strict'` — не учитывать подряд идущие повторяющиеся события.
|
||||
- `'strict_order'` — запрещает посторонние события в искомой последовательности. Например, при поиске цепочки `A->B->C` в `A->B->D->C` поиск будет остановлен на `D` и функция вернет 2.
|
||||
- `'strict_increase'` — условия прменяются только для событий со строго возрастающими временными метками.
|
||||
|
||||
**Возвращаемое значение**
|
||||
|
||||
|
@ -6,7 +6,6 @@
|
||||
#include <DataTypes/DataTypeDateTime.h>
|
||||
|
||||
#include <ext/range.h>
|
||||
#include "registerAggregateFunctions.h"
|
||||
|
||||
|
||||
namespace DB
|
||||
|
@ -29,6 +29,7 @@ struct ComparePair final
|
||||
};
|
||||
|
||||
static constexpr auto max_events = 32;
|
||||
|
||||
template <typename T>
|
||||
struct AggregateFunctionWindowFunnelData
|
||||
{
|
||||
@ -46,7 +47,7 @@ struct AggregateFunctionWindowFunnelData
|
||||
|
||||
void add(T timestamp, UInt8 event)
|
||||
{
|
||||
// Since most events should have already been sorted by timestamp.
|
||||
/// Since most events should have already been sorted by timestamp.
|
||||
if (sorted && events_list.size() > 0)
|
||||
{
|
||||
if (events_list.back().first == timestamp)
|
||||
@ -145,14 +146,20 @@ class AggregateFunctionWindowFunnel final
|
||||
private:
|
||||
UInt64 window;
|
||||
UInt8 events_size;
|
||||
UInt8 strict; // When the 'strict' is set, it applies conditions only for the not repeating values.
|
||||
UInt8 strict_order; // When the 'strict_order' is set, it doesn't allow interventions of other events.
|
||||
// In the case of 'A->B->D->C', it stops finding 'A->B->C' at the 'D' and the max event level is 2.
|
||||
/// When the 'strict' is set, it applies conditions only for the not repeating values.
|
||||
bool strict;
|
||||
|
||||
// Loop through the entire events_list, update the event timestamp value
|
||||
// The level path must be 1---2---3---...---check_events_size, find the max event level that satisfied the path in the sliding window.
|
||||
// If found, returns the max event level, else return 0.
|
||||
// The Algorithm complexity is O(n).
|
||||
/// When the 'strict_order' is set, it doesn't allow interventions of other events.
|
||||
/// In the case of 'A->B->D->C', it stops finding 'A->B->C' at the 'D' and the max event level is 2.
|
||||
bool strict_order;
|
||||
|
||||
/// Applies conditions only to events with strictly increasing timestamps
|
||||
bool strict_increase;
|
||||
|
||||
/// Loop through the entire events_list, update the event timestamp value
|
||||
/// The level path must be 1---2---3---...---check_events_size, find the max event level that satisfied the path in the sliding window.
|
||||
/// If found, returns the max event level, else return 0.
|
||||
/// The Algorithm complexity is O(n).
|
||||
UInt8 getEventLevel(Data & data) const
|
||||
{
|
||||
if (data.size() == 0)
|
||||
@ -162,16 +169,13 @@ private:
|
||||
|
||||
data.sort();
|
||||
|
||||
/// events_timestamp stores the timestamp that latest i-th level event happen within time window after previous level event.
|
||||
/// timestamp defaults to -1, which unsigned timestamp value never meet
|
||||
/// there may be some bugs when UInt64 type timstamp overflows Int64, but it works on most cases.
|
||||
std::vector<Int64> events_timestamp(events_size, -1);
|
||||
/// events_timestamp stores the timestamp of the first and previous i-th level event happen within time window
|
||||
std::vector<std::optional<std::pair<UInt64, UInt64>>> events_timestamp(events_size);
|
||||
bool first_event = false;
|
||||
for (const auto & pair : data.events_list)
|
||||
{
|
||||
const T & timestamp = pair.first;
|
||||
const auto & event_idx = pair.second - 1;
|
||||
|
||||
if (strict_order && event_idx == -1)
|
||||
{
|
||||
if (first_event)
|
||||
@ -181,31 +185,39 @@ private:
|
||||
}
|
||||
else if (event_idx == 0)
|
||||
{
|
||||
events_timestamp[0] = timestamp;
|
||||
events_timestamp[0] = std::make_pair(timestamp, timestamp);
|
||||
first_event = true;
|
||||
}
|
||||
else if (strict && events_timestamp[event_idx] >= 0)
|
||||
else if (strict && events_timestamp[event_idx].has_value())
|
||||
{
|
||||
return event_idx + 1;
|
||||
}
|
||||
else if (strict_order && first_event && events_timestamp[event_idx - 1] == -1)
|
||||
else if (strict_order && first_event && !events_timestamp[event_idx - 1].has_value())
|
||||
{
|
||||
for (size_t event = 0; event < events_timestamp.size(); ++event)
|
||||
{
|
||||
if (events_timestamp[event] == -1)
|
||||
if (!events_timestamp[event].has_value())
|
||||
return event;
|
||||
}
|
||||
}
|
||||
else if (events_timestamp[event_idx - 1] >= 0 && timestamp <= events_timestamp[event_idx - 1] + window)
|
||||
else if (events_timestamp[event_idx - 1].has_value())
|
||||
{
|
||||
events_timestamp[event_idx] = events_timestamp[event_idx - 1];
|
||||
if (event_idx + 1 == events_size)
|
||||
return events_size;
|
||||
auto first_timestamp = events_timestamp[event_idx - 1]->first;
|
||||
bool time_matched = timestamp <= first_timestamp + window;
|
||||
if (strict_increase)
|
||||
time_matched = time_matched && events_timestamp[event_idx - 1]->second < timestamp;
|
||||
if (time_matched)
|
||||
{
|
||||
events_timestamp[event_idx] = std::make_pair(first_timestamp, timestamp);
|
||||
if (event_idx + 1 == events_size)
|
||||
return events_size;
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
for (size_t event = events_timestamp.size(); event > 0; --event)
|
||||
{
|
||||
if (events_timestamp[event - 1] >= 0)
|
||||
if (events_timestamp[event - 1].has_value())
|
||||
return event;
|
||||
}
|
||||
return 0;
|
||||
@ -223,15 +235,18 @@ public:
|
||||
events_size = arguments.size() - 1;
|
||||
window = params.at(0).safeGet<UInt64>();
|
||||
|
||||
strict = 0;
|
||||
strict_order = 0;
|
||||
strict = false;
|
||||
strict_order = false;
|
||||
strict_increase = false;
|
||||
for (size_t i = 1; i < params.size(); ++i)
|
||||
{
|
||||
String option = params.at(i).safeGet<String>();
|
||||
if (option.compare("strict") == 0)
|
||||
strict = 1;
|
||||
else if (option.compare("strict_order") == 0)
|
||||
strict_order = 1;
|
||||
if (option == "strict")
|
||||
strict = true;
|
||||
else if (option == "strict_order")
|
||||
strict_order = true;
|
||||
else if (option == "strict_increase")
|
||||
strict_increase = true;
|
||||
else
|
||||
throw Exception{"Aggregate function " + getName() + " doesn't support a parameter: " + option, ErrorCodes::BAD_ARGUMENTS};
|
||||
}
|
||||
@ -253,7 +268,7 @@ public:
|
||||
{
|
||||
bool has_event = false;
|
||||
const auto timestamp = assert_cast<const ColumnVector<T> *>(columns[0])->getData()[row_num];
|
||||
// reverse iteration and stable sorting are needed for events that are qualified by more than one condition.
|
||||
/// reverse iteration and stable sorting are needed for events that are qualified by more than one condition.
|
||||
for (auto i = events_size; i > 0; --i)
|
||||
{
|
||||
auto event = assert_cast<const ColumnVector<UInt8> *>(columns[i])->getData()[row_num];
|
||||
|
@ -57,3 +57,7 @@
|
||||
[2, 0]
|
||||
[3, 1]
|
||||
[4, 1]
|
||||
1
|
||||
1
|
||||
1
|
||||
1
|
||||
|
@ -79,3 +79,11 @@ select u, windowFunnel(86400)(dt, a is null and b is null) as s from funnel_test
|
||||
select u, windowFunnel(86400)(dt, a is null, b = 'b3') as s from funnel_test_non_null group by u order by u format JSONCompactEachRow;
|
||||
select u, windowFunnel(86400, 'strict_order')(dt, a is null, b = 'b3') as s from funnel_test_non_null group by u order by u format JSONCompactEachRow;
|
||||
drop table funnel_test_non_null;
|
||||
|
||||
create table funnel_test_strict_increase (timestamp UInt32, event UInt32) engine=Memory;
|
||||
insert into funnel_test_strict_increase values (0,1000),(1,1001),(1,1002),(1,1003),(2,1004);
|
||||
|
||||
select 5 = windowFunnel(10000)(timestamp, event = 1000, event = 1001, event = 1002, event = 1003, event = 1004) from funnel_test_strict_increase;
|
||||
select 2 = windowFunnel(10000, 'strict_increase')(timestamp, event = 1000, event = 1001, event = 1002, event = 1003, event = 1004) from funnel_test_strict_increase;
|
||||
select 3 = windowFunnel(10000)(timestamp, event = 1004, event = 1004, event = 1004) from funnel_test_strict_increase;
|
||||
select 1 = windowFunnel(10000, 'strict_increase')(timestamp, event = 1004, event = 1004, event = 1004) from funnel_test_strict_increase;
|
||||
|
Loading…
Reference in New Issue
Block a user