2017-03-10 17:52:36 +00:00
|
|
|
#pragma once
|
2018-08-30 18:40:46 +00:00
|
|
|
|
2019-03-31 20:55:36 +00:00
|
|
|
#include <map>
|
2019-03-29 01:02:05 +00:00
|
|
|
#include <memory>
|
2019-03-31 20:55:36 +00:00
|
|
|
#include <mutex>
|
2019-03-29 01:02:05 +00:00
|
|
|
#include <optional>
|
|
|
|
#include <string>
|
|
|
|
#include <utility>
|
|
|
|
#include <vector>
|
2019-03-23 19:40:16 +00:00
|
|
|
#include <Functions/likePatternToRegexp.h>
|
2019-10-13 13:22:09 +00:00
|
|
|
#include <Common/Exception.h>
|
2019-03-23 19:40:16 +00:00
|
|
|
#include <Common/OptimizedRegularExpression.h>
|
2017-04-01 09:19:00 +00:00
|
|
|
#include <Common/ProfileEvents.h>
|
2022-01-18 07:21:41 +00:00
|
|
|
#include <Common/config.h>
|
2022-08-15 18:58:46 +00:00
|
|
|
#include <base/defines.h>
|
2021-10-02 07:13:14 +00:00
|
|
|
#include <base/StringRef.h>
|
2022-08-16 09:56:53 +00:00
|
|
|
#include <boost/container_hash/hash.hpp>
|
2017-03-11 00:27:59 +00:00
|
|
|
|
2021-10-27 23:10:39 +00:00
|
|
|
#include "config_functions.h"
|
2019-03-31 20:55:36 +00:00
|
|
|
|
2022-06-17 10:15:19 +00:00
|
|
|
#if USE_VECTORSCAN
|
2020-04-16 12:31:57 +00:00
|
|
|
# include <hs.h>
|
2019-03-23 19:40:16 +00:00
|
|
|
#endif
|
2017-03-10 17:52:36 +00:00
|
|
|
|
|
|
|
namespace ProfileEvents
|
|
|
|
{
|
2019-03-23 19:40:16 +00:00
|
|
|
extern const Event RegexpCreated;
|
2017-03-10 17:52:36 +00:00
|
|
|
}
|
|
|
|
|
|
|
|
|
2018-01-10 00:04:08 +00:00
|
|
|
namespace DB
|
|
|
|
{
|
2019-03-23 19:40:16 +00:00
|
|
|
namespace ErrorCodes
|
|
|
|
{
|
|
|
|
extern const int CANNOT_ALLOCATE_MEMORY;
|
|
|
|
extern const int LOGICAL_ERROR;
|
2019-12-30 14:46:02 +00:00
|
|
|
extern const int BAD_ARGUMENTS;
|
2019-03-23 19:40:16 +00:00
|
|
|
}
|
2017-03-10 17:52:36 +00:00
|
|
|
|
|
|
|
namespace Regexps
|
|
|
|
{
|
2022-05-25 18:33:13 +00:00
|
|
|
using Regexp = OptimizedRegularExpressionSingleThreaded;
|
Measure and rework internal re2 caching
This commit is based on local benchmarks of ClickHouse's re2 caching.
Question 1: -----------------------------------------------------------
Is pattern caching useful for queries with const LIKE/REGEX
patterns? E.g. SELECT LIKE(col_haystack, '%HelloWorld') FROM T;
The short answer is: no. Runtime is (unsurprisingly) dominated by
pattern evaluation + other stuff going on in queries, but definitely not
pattern compilation. For space reasons, I omit details of the local
experiments.
(Side note: the current caching scheme is unbounded in size which poses
a DoS risk (think of multi-tenancy). This risk is more pronounced when
unbounded caching is used with non-const patterns ..., see next
question)
Question 2: -----------------------------------------------------------
Is pattern caching useful for queries with non-const LIKE/REGEX
patterns? E.g. SELECT LIKE(col_haystack, col_needle) FROM T;
I benchmarked five caching strategies:
1. no caching as a baseline (= recompile for each row)
2. unbounded cache (= threadsafe global hash-map)
3. LRU cache (= threadsafe global hash-map + LRU queue)
4. lightweight local cache 1 (= not threadsafe local hashmap with
collision list which grows to a certain size (here: 10 elements) and
afterwards never changes)
5. lightweight local cache 2 (not threadsafe local hashmap without
collision list in which a collision replaces the stored element, idea
by Alexey)
... using a haystack of 2 mio strings and
A). 2 mio distinct simple patterns
B). 10 simple patterns
C) 2 mio distinct complex patterns
D) 10 complex patterns
For A) and C), caching does not help but these queries still allow us to
judge the static overhead of caching on query runtimes.
B) and D) are extreme but common cases in practice. They include
queries like "SELECT ... WHERE LIKE (col_haystack, flag ? '%pattern1%' :
'%pattern2%'). Caching should help significantly.
Because LIKE patterns are internally translated to re2 expressions, I
show only measurements for MATCH queries.
Results in sec, averaged over multiple measurements:
1.A): 2.12
B): 1.68
C): 9.75
D): 9.45
2.A): 2.17
B): 1.73
C): 9.78
D): 9.47
3.A): 9.8
B): 0.63
C): 31.8
D): 0.98
4.A): 2.14
B): 0.29
C): 9.82
D): 0.41
5.A) 2.12 / 2.15 / 2.26
B) 1.51 / 0.43 / 0.30
C) 9.97 / 9.88 / 10.13
D) 5.70 / 0.42 / 0.43
(10/100/1000 buckets, resp. 10/1/0.1% collision rate)
Evaluation:
1. This is the baseline. I was surprised that complex patterns (C, D)
slow down the queries so badly compared to simple patterns (A, B).
The runtime includes evaluation costs, but as caching only helps with
compilation, and looking at 4.D and 5.D, compilation makes up over 90%
of the runtime!
2. No speedup compared to 1, probably due to locking overhead. The cache
is unbounded, and in experiments with data sets > 2 mio rows, 2. is
the only scheme to throw OOM exceptions which is not acceptable.
3. Unique patterns (A and C) lead to thrashing of the LRU cache and very
bad runtimes due to LRU queue maintenance and locking. Works pretty
well however with few distinct patterns (B and D).
4. This scheme is tailored to queries B and D where it performs pretty
good. More importantly, the caching is lightweight enough to not
deteriorate performance on datasets A and C.
5. After some tuning of the hash map size, 100 buckets seem optimal to
be in the same ballpark with 10 distinct patterns as 4. Performance
also does not deteriorate on A and C compared to the baseline.
Unlike 4., this scheme behaves LRU-like and can adjust to changing
pattern distributions.
As a conclusion, this commit implements two things:
1. Based on Q1, pattern search with const needle no longer uses
caching. This applies to LIKE and MATCH + a few (exotic) other SQL
functions. The code for the unbounded caching was removed.
2. Based on Q2, pattern search with non-const needles now use method 5.
2022-05-27 10:40:53 +00:00
|
|
|
using RegexpPtr = std::shared_ptr<Regexp>;
|
2017-04-01 07:20:54 +00:00
|
|
|
|
Measure and rework internal re2 caching
This commit is based on local benchmarks of ClickHouse's re2 caching.
Question 1: -----------------------------------------------------------
Is pattern caching useful for queries with const LIKE/REGEX
patterns? E.g. SELECT LIKE(col_haystack, '%HelloWorld') FROM T;
The short answer is: no. Runtime is (unsurprisingly) dominated by
pattern evaluation + other stuff going on in queries, but definitely not
pattern compilation. For space reasons, I omit details of the local
experiments.
(Side note: the current caching scheme is unbounded in size which poses
a DoS risk (think of multi-tenancy). This risk is more pronounced when
unbounded caching is used with non-const patterns ..., see next
question)
Question 2: -----------------------------------------------------------
Is pattern caching useful for queries with non-const LIKE/REGEX
patterns? E.g. SELECT LIKE(col_haystack, col_needle) FROM T;
I benchmarked five caching strategies:
1. no caching as a baseline (= recompile for each row)
2. unbounded cache (= threadsafe global hash-map)
3. LRU cache (= threadsafe global hash-map + LRU queue)
4. lightweight local cache 1 (= not threadsafe local hashmap with
collision list which grows to a certain size (here: 10 elements) and
afterwards never changes)
5. lightweight local cache 2 (not threadsafe local hashmap without
collision list in which a collision replaces the stored element, idea
by Alexey)
... using a haystack of 2 mio strings and
A). 2 mio distinct simple patterns
B). 10 simple patterns
C) 2 mio distinct complex patterns
D) 10 complex patterns
For A) and C), caching does not help but these queries still allow us to
judge the static overhead of caching on query runtimes.
B) and D) are extreme but common cases in practice. They include
queries like "SELECT ... WHERE LIKE (col_haystack, flag ? '%pattern1%' :
'%pattern2%'). Caching should help significantly.
Because LIKE patterns are internally translated to re2 expressions, I
show only measurements for MATCH queries.
Results in sec, averaged over multiple measurements:
1.A): 2.12
B): 1.68
C): 9.75
D): 9.45
2.A): 2.17
B): 1.73
C): 9.78
D): 9.47
3.A): 9.8
B): 0.63
C): 31.8
D): 0.98
4.A): 2.14
B): 0.29
C): 9.82
D): 0.41
5.A) 2.12 / 2.15 / 2.26
B) 1.51 / 0.43 / 0.30
C) 9.97 / 9.88 / 10.13
D) 5.70 / 0.42 / 0.43
(10/100/1000 buckets, resp. 10/1/0.1% collision rate)
Evaluation:
1. This is the baseline. I was surprised that complex patterns (C, D)
slow down the queries so badly compared to simple patterns (A, B).
The runtime includes evaluation costs, but as caching only helps with
compilation, and looking at 4.D and 5.D, compilation makes up over 90%
of the runtime!
2. No speedup compared to 1, probably due to locking overhead. The cache
is unbounded, and in experiments with data sets > 2 mio rows, 2. is
the only scheme to throw OOM exceptions which is not acceptable.
3. Unique patterns (A and C) lead to thrashing of the LRU cache and very
bad runtimes due to LRU queue maintenance and locking. Works pretty
well however with few distinct patterns (B and D).
4. This scheme is tailored to queries B and D where it performs pretty
good. More importantly, the caching is lightweight enough to not
deteriorate performance on datasets A and C.
5. After some tuning of the hash map size, 100 buckets seem optimal to
be in the same ballpark with 10 distinct patterns as 4. Performance
also does not deteriorate on A and C compared to the baseline.
Unlike 4., this scheme behaves LRU-like and can adjust to changing
pattern distributions.
As a conclusion, this commit implements two things:
1. Based on Q1, pattern search with const needle no longer uses
caching. This applies to LIKE and MATCH + a few (exotic) other SQL
functions. The code for the unbounded caching was removed.
2. Based on Q2, pattern search with non-const needles now use method 5.
2022-05-27 10:40:53 +00:00
|
|
|
/// Builds a Regexp object from the given pattern string.
///
/// Template parameters:
///   like             - if true, `pattern` is first converted via likePatternToRegexp(); otherwise it is used unchanged.
///   no_capture       - if true, RE_NO_CAPTURE is added to the flags.
///   case_insensitive - if true, RE_CASELESS is added to the flags.
///
/// RE_DOT_NL is always part of the flags. The Regexp is returned by value, constructed from (pattern string, flags).
template <bool like, bool no_capture, bool case_insensitive>
|
2022-08-15 18:58:46 +00:00
|
|
|
inline Regexp createRegexp(const String & pattern)
|
2022-05-25 18:33:13 +00:00
|
|
|
{
|
|
|
|
/// Base flag set; the optional flags below are selected at compile time from the template parameters.
int flags = OptimizedRegularExpression::RE_DOT_NL;
|
|
|
|
if constexpr (no_capture)
|
|
|
|
flags |= OptimizedRegularExpression::RE_NO_CAPTURE;
|
|
|
|
if constexpr (case_insensitive)
|
|
|
|
flags |= OptimizedRegularExpression::RE_CASELESS;
|
Measure and rework internal re2 caching
This commit is based on local benchmarks of ClickHouse's re2 caching.
Question 1: -----------------------------------------------------------
Is pattern caching useful for queries with const LIKE/REGEX
patterns? E.g. SELECT LIKE(col_haystack, '%HelloWorld') FROM T;
The short answer is: no. Runtime is (unsurprisingly) dominated by
pattern evaluation + other stuff going on in queries, but definitely not
pattern compilation. For space reasons, I omit details of the local
experiments.
(Side note: the current caching scheme is unbounded in size which poses
a DoS risk (think of multi-tenancy). This risk is more pronounced when
unbounded caching is used with non-const patterns ..., see next
question)
Question 2: -----------------------------------------------------------
Is pattern caching useful for queries with non-const LIKE/REGEX
patterns? E.g. SELECT LIKE(col_haystack, col_needle) FROM T;
I benchmarked five caching strategies:
1. no caching as a baseline (= recompile for each row)
2. unbounded cache (= threadsafe global hash-map)
3. LRU cache (= threadsafe global hash-map + LRU queue)
4. lightweight local cache 1 (= not threadsafe local hashmap with
collision list which grows to a certain size (here: 10 elements) and
afterwards never changes)
5. lightweight local cache 2 (not threadsafe local hashmap without
collision list in which a collision replaces the stored element, idea
by Alexey)
... using a haystack of 2 mio strings and
A). 2 mio distinct simple patterns
B). 10 simple patterns
C) 2 mio distinct complex patterns
D) 10 complex patterns
Fo A) and C), caching does not help but these queries still allow to
judge the static overhead of caching on query runtimes.
B) and D) are extreme but common cases in practice. They include
queries like "SELECT ... WHERE LIKE (col_haystack, flag ? '%pattern1%' :
'%pattern2%'). Caching should help significantly.
Because LIKE patterns are internally translated to re2 expressions, I
show only measurements for MATCH queries.
Results in sec, averaged over on multiple measurements;
1.A): 2.12
B): 1.68
C): 9.75
D): 9.45
2.A): 2.17
B): 1.73
C): 9.78
D): 9.47
3.A): 9.8
B): 0.63
C): 31.8
D): 0.98
4.A): 2.14
B): 0.29
C): 9.82
D): 0.41
5.A) 2.12 / 2.15 / 2.26
B) 1.51 / 0.43 / 0.30
C) 9.97 / 9.88 / 10.13
D) 5.70 / 0.42 / 0.43
(10/100/1000 buckets, resp. 10/1/0.1% collision rate)
Evaluation:
1. This is the baseline. It was surprised that complex patterns (C, D)
slow down the queries so badly compared to simple patterns (A, B).
The runtime includes evaluation costs, but as caching only helps with
compilation, and looking at 4.D and 5.D, compilation makes up over 90%
of the runtime!
2. No speedup compared to 1, probably due to locking overhead. The cache
is unbounded, and in experiments with data sets > 2 mio rows, 2. is
the only scheme to throw OOM exceptions which is not acceptable.
3. Unique patterns (A and C) lead to thrashing of the LRU cache and very
bad runtimes due to LRU queue maintenance and locking. Works pretty
well however with few distinct patterns (B and D).
4. This scheme is tailored to queries B and D where it performs pretty
good. More importantly, the caching is lightweight enough to not
deteriorate performance on datasets A and C.
5. After some tuning of the hash map size, 100 buckets seem optimal to
be in the same ballpark with 10 distinct patterns as 4. Performance
also does not deteriorate on A and C compared to the baseline.
Unlike 4., this scheme behaves LRU-like and can adjust to changing
pattern distributions.
As a conclusion, this commit implementes two things:
1. Based on Q1, pattern search with const needle no longer uses
caching. This applies to LIKE and MATCH + a few (exotic) other SQL
functions. The code for the unbounded caching was removed.
2. Based on Q2, pattern search with non-const needles now use method 5.
2022-05-27 10:40:53 +00:00
|
|
|
|
|
|
|
/// LIKE-style patterns are converted by likePatternToRegexp() before construction; other patterns are passed through as-is.
if constexpr (like)
|
|
|
|
return {likePatternToRegexp(pattern), flags};
|
|
|
|
else
|
|
|
|
return {pattern, flags};
|
2022-05-25 18:33:13 +00:00
|
|
|
}
|
|
|
|
|
Measure and rework internal re2 caching
This commit is based on local benchmarks of ClickHouse's re2 caching.
Question 1: -----------------------------------------------------------
Is pattern caching useful for queries with const LIKE/REGEX
patterns? E.g. SELECT LIKE(col_haystack, '%HelloWorld') FROM T;
The short answer is: no. Runtime is (unsurprisingly) dominated by
pattern evaluation + other stuff going on in queries, but definitely not
pattern compilation. For space reasons, I omit details of the local
experiments.
(Side note: the current caching scheme is unbounded in size which poses
a DoS risk (think of multi-tenancy). This risk is more pronounced when
unbounded caching is used with non-const patterns ..., see next
question)
Question 2: -----------------------------------------------------------
Is pattern caching useful for queries with non-const LIKE/REGEX
patterns? E.g. SELECT LIKE(col_haystack, col_needle) FROM T;
I benchmarked five caching strategies:
1. no caching as a baseline (= recompile for each row)
2. unbounded cache (= threadsafe global hash-map)
3. LRU cache (= threadsafe global hash-map + LRU queue)
4. lightweight local cache 1 (= not threadsafe local hashmap with
collision list which grows to a certain size (here: 10 elements) and
afterwards never changes)
5. lightweight local cache 2 (not threadsafe local hashmap without
collision list in which a collision replaces the stored element, idea
by Alexey)
... using a haystack of 2 mio strings and
A). 2 mio distinct simple patterns
B). 10 simple patterns
C) 2 mio distinct complex patterns
D) 10 complex patterns
For A) and C), caching does not help but these queries still allow us to
judge the static overhead of caching on query runtimes.
B) and D) are extreme but common cases in practice. They include
queries like "SELECT ... WHERE LIKE (col_haystack, flag ? '%pattern1%' :
'%pattern2%'). Caching should help significantly.
Because LIKE patterns are internally translated to re2 expressions, I
show only measurements for MATCH queries.
Results in sec, averaged over multiple measurements:
1.A): 2.12
B): 1.68
C): 9.75
D): 9.45
2.A): 2.17
B): 1.73
C): 9.78
D): 9.47
3.A): 9.8
B): 0.63
C): 31.8
D): 0.98
4.A): 2.14
B): 0.29
C): 9.82
D): 0.41
5.A) 2.12 / 2.15 / 2.26
B) 1.51 / 0.43 / 0.30
C) 9.97 / 9.88 / 10.13
D) 5.70 / 0.42 / 0.43
(10/100/1000 buckets, resp. 10/1/0.1% collision rate)
Evaluation:
1. This is the baseline. I was surprised that complex patterns (C, D)
slow down the queries so badly compared to simple patterns (A, B).
The runtime includes evaluation costs, but as caching only helps with
compilation, and looking at 4.D and 5.D, compilation makes up over 90%
of the runtime!
2. No speedup compared to 1, probably due to locking overhead. The cache
is unbounded, and in experiments with data sets > 2 mio rows, 2. is
the only scheme to throw OOM exceptions which is not acceptable.
3. Unique patterns (A and C) lead to thrashing of the LRU cache and very
bad runtimes due to LRU queue maintenance and locking. Works pretty
well however with few distinct patterns (B and D).
4. This scheme is tailored to queries B and D where it performs pretty
good. More importantly, the caching is lightweight enough to not
deteriorate performance on datasets A and C.
5. After some tuning of the hash map size, 100 buckets seem optimal to
be in the same ballpark with 10 distinct patterns as 4. Performance
also does not deteriorate on A and C compared to the baseline.
Unlike 4., this scheme behaves LRU-like and can adjust to changing
pattern distributions.
As a conclusion, this commit implements two things:
1. Based on Q1, pattern search with const needle no longer uses
caching. This applies to LIKE and MATCH + a few (exotic) other SQL
functions. The code for the unbounded caching was removed.
2. Based on Q2, pattern search with non-const needles now use method 5.
2022-05-27 10:40:53 +00:00
|
|
|
/// Caches compiled re2 objects for given string patterns. Intended to support the common situation of a small set of patterns which are
|
|
|
|
/// evaluated over and over within the same query. In these situations, usage of the cache will save unnecessary pattern re-compilation.
|
|
|
|
/// However, we must be careful that caching does not add too much static overhead to overall pattern evaluation. Therefore, the cache is
|
|
|
|
/// intentionally very lightweight: a) no thread-safety/mutexes, b) small & fixed capacity, c) no collision list, d) but also no open
|
|
|
|
/// addressing, instead collisions simply replace the existing element.
|
|
|
|
class LocalCacheTable
|
2022-05-25 18:33:13 +00:00
|
|
|
{
|
Measure and rework internal re2 caching
This commit is based on local benchmarks of ClickHouse's re2 caching.
Question 1: -----------------------------------------------------------
Is pattern caching useful for queries with const LIKE/REGEX
patterns? E.g. SELECT LIKE(col_haystack, '%HelloWorld') FROM T;
The short answer is: no. Runtime is (unsurprisingly) dominated by
pattern evaluation + other stuff going on in queries, but definitely not
pattern compilation. For space reasons, I omit details of the local
experiments.
(Side note: the current caching scheme is unbounded in size which poses
a DoS risk (think of multi-tenancy). This risk is more pronounced when
unbounded caching is used with non-const patterns ..., see next
question)
Question 2: -----------------------------------------------------------
Is pattern caching useful for queries with non-const LIKE/REGEX
patterns? E.g. SELECT LIKE(col_haystack, col_needle) FROM T;
I benchmarked five caching strategies:
1. no caching as a baseline (= recompile for each row)
2. unbounded cache (= threadsafe global hash-map)
3. LRU cache (= threadsafe global hash-map + LRU queue)
4. lightweight local cache 1 (= not threadsafe local hashmap with
collision list which grows to a certain size (here: 10 elements) and
afterwards never changes)
5. lightweight local cache 2 (not threadsafe local hashmap without
collision list in which a collision replaces the stored element, idea
by Alexey)
... using a haystack of 2 mio strings and
A). 2 mio distinct simple patterns
B). 10 simple patterns
C) 2 mio distinct complex patterns
D) 10 complex patterns
For A) and C), caching does not help but these queries still allow us to
judge the static overhead of caching on query runtimes.
B) and D) are extreme but common cases in practice. They include
queries like "SELECT ... WHERE LIKE (col_haystack, flag ? '%pattern1%' :
'%pattern2%'). Caching should help significantly.
Because LIKE patterns are internally translated to re2 expressions, I
show only measurements for MATCH queries.
Results in sec, averaged over multiple measurements:
1.A): 2.12
B): 1.68
C): 9.75
D): 9.45
2.A): 2.17
B): 1.73
C): 9.78
D): 9.47
3.A): 9.8
B): 0.63
C): 31.8
D): 0.98
4.A): 2.14
B): 0.29
C): 9.82
D): 0.41
5.A) 2.12 / 2.15 / 2.26
B) 1.51 / 0.43 / 0.30
C) 9.97 / 9.88 / 10.13
D) 5.70 / 0.42 / 0.43
(10/100/1000 buckets, resp. 10/1/0.1% collision rate)
Evaluation:
1. This is the baseline. I was surprised that complex patterns (C, D)
slow down the queries so badly compared to simple patterns (A, B).
The runtime includes evaluation costs, but as caching only helps with
compilation, and looking at 4.D and 5.D, compilation makes up over 90%
of the runtime!
2. No speedup compared to 1, probably due to locking overhead. The cache
is unbounded, and in experiments with data sets > 2 mio rows, 2. is
the only scheme to throw OOM exceptions which is not acceptable.
3. Unique patterns (A and C) lead to thrashing of the LRU cache and very
bad runtimes due to LRU queue maintenance and locking. Works pretty
well however with few distinct patterns (B and D).
4. This scheme is tailored to queries B and D where it performs pretty
good. More importantly, the caching is lightweight enough to not
deteriorate performance on datasets A and C.
5. After some tuning of the hash map size, 100 buckets seem optimal to
be in the same ballpark with 10 distinct patterns as 4. Performance
also does not deteriorate on A and C compared to the baseline.
Unlike 4., this scheme behaves LRU-like and can adjust to changing
pattern distributions.
As a conclusion, this commit implements two things:
1. Based on Q1, pattern search with const needle no longer uses
caching. This applies to LIKE and MATCH + a few (exotic) other SQL
functions. The code for the unbounded caching was removed.
2. Based on Q2, pattern search with non-const needles now use method 5.
2022-05-27 10:40:53 +00:00
|
|
|
public:
|
|
|
|
using RegexpPtr = std::shared_ptr<Regexp>;
|
Implement SQL functions (NOT) (I)LIKE() + MATCH() with non-const needles
With this commit, SQL functions LIKE and MATCH and their variants can
work with non-const needle arguments. E.g.
create table tab
(id UInt32, haystack String, needle String)
engine = MergeTree()
order by id;
insert into tab values
(1, 'Hello', '%ell%')
(2, 'World', '%orl%')
select id, haystack, needle, like(haystack, needle)
from tab;
For that, methods vectorVector() and vectorFixedVector() were added to
MatchImpl. The existing code for const needles has an optimization where
the compiled regexp is cached. The new code expects a different needle
per row and consequently does not cache the regexp.
2022-05-16 20:37:31 +00:00
|
|
|
|
Measure and rework internal re2 caching
This commit is based on local benchmarks of ClickHouse's re2 caching.
Question 1: -----------------------------------------------------------
Is pattern caching useful for queries with const LIKE/REGEX
patterns? E.g. SELECT LIKE(col_haystack, '%HelloWorld') FROM T;
The short answer is: no. Runtime is (unsurprisingly) dominated by
pattern evaluation + other stuff going on in queries, but definitely not
pattern compilation. For space reasons, I omit details of the local
experiments.
(Side note: the current caching scheme is unbounded in size which poses
a DoS risk (think of multi-tenancy). This risk is more pronounced when
unbounded caching is used with non-const patterns ..., see next
question)
Question 2: -----------------------------------------------------------
Is pattern caching useful for queries with non-const LIKE/REGEX
patterns? E.g. SELECT LIKE(col_haystack, col_needle) FROM T;
I benchmarked five caching strategies:
1. no caching as a baseline (= recompile for each row)
2. unbounded cache (= threadsafe global hash-map)
3. LRU cache (= threadsafe global hash-map + LRU queue)
4. lightweight local cache 1 (= not threadsafe local hashmap with
collision list which grows to a certain size (here: 10 elements) and
afterwards never changes)
5. lightweight local cache 2 (not threadsafe local hashmap without
collision list in which a collision replaces the stored element, idea
by Alexey)
... using a haystack of 2 mio strings and
A). 2 mio distinct simple patterns
B). 10 simple patterns
C) 2 mio distinct complex patterns
D) 10 complex patterns
For A) and C), caching does not help but these queries still allow us to
judge the static overhead of caching on query runtimes.
B) and D) are extreme but common cases in practice. They include
queries like "SELECT ... WHERE LIKE (col_haystack, flag ? '%pattern1%' :
'%pattern2%'). Caching should help significantly.
Because LIKE patterns are internally translated to re2 expressions, I
show only measurements for MATCH queries.
Results in sec, averaged over multiple measurements:
1.A): 2.12
B): 1.68
C): 9.75
D): 9.45
2.A): 2.17
B): 1.73
C): 9.78
D): 9.47
3.A): 9.8
B): 0.63
C): 31.8
D): 0.98
4.A): 2.14
B): 0.29
C): 9.82
D): 0.41
5.A) 2.12 / 2.15 / 2.26
B) 1.51 / 0.43 / 0.30
C) 9.97 / 9.88 / 10.13
D) 5.70 / 0.42 / 0.43
(10/100/1000 buckets, resp. 10/1/0.1% collision rate)
Evaluation:
1. This is the baseline. I was surprised that complex patterns (C, D)
slow down the queries so badly compared to simple patterns (A, B).
The runtime includes evaluation costs, but as caching only helps with
compilation, and looking at 4.D and 5.D, compilation makes up over 90%
of the runtime!
2. No speedup compared to 1, probably due to locking overhead. The cache
is unbounded, and in experiments with data sets > 2 mio rows, 2. is
the only scheme to throw OOM exceptions which is not acceptable.
3. Unique patterns (A and C) lead to thrashing of the LRU cache and very
bad runtimes due to LRU queue maintenance and locking. Works pretty
well however with few distinct patterns (B and D).
4. This scheme is tailored to queries B and D where it performs pretty
good. More importantly, the caching is lightweight enough to not
deteriorate performance on datasets A and C.
5. After some tuning of the hash map size, 100 buckets seem optimal to
be in the same ballpark with 10 distinct patterns as 4. Performance
also does not deteriorate on A and C compared to the baseline.
Unlike 4., this scheme behaves LRU-like and can adjust to changing
pattern distributions.
As a conclusion, this commit implements two things:
1. Based on Q1, pattern search with const needle no longer uses
caching. This applies to LIKE and MATCH + a few (exotic) other SQL
functions. The code for the unbounded caching was removed.
2. Based on Q2, pattern search with non-const needles now use method 5.
2022-05-27 10:40:53 +00:00
|
|
|
template <bool like, bool no_capture, bool case_insensitive>
|
2022-08-15 18:58:46 +00:00
|
|
|
RegexpPtr getOrSet(const String & pattern)
|
Measure and rework internal re2 caching
This commit is based on local benchmarks of ClickHouse's re2 caching.
Question 1: -----------------------------------------------------------
Is pattern caching useful for queries with const LIKE/REGEX
patterns? E.g. SELECT LIKE(col_haystack, '%HelloWorld') FROM T;
The short answer is: no. Runtime is (unsurprisingly) dominated by
pattern evaluation + other stuff going on in queries, but definitely not
pattern compilation. For space reasons, I omit details of the local
experiments.
(Side note: the current caching scheme is unbounded in size which poses
a DoS risk (think of multi-tenancy). This risk is more pronounced when
unbounded caching is used with non-const patterns ..., see next
question)
Question 2: -----------------------------------------------------------
Is pattern caching useful for queries with non-const LIKE/REGEX
patterns? E.g. SELECT LIKE(col_haystack, col_needle) FROM T;
I benchmarked five caching strategies:
1. no caching as a baseline (= recompile for each row)
2. unbounded cache (= threadsafe global hash-map)
3. LRU cache (= threadsafe global hash-map + LRU queue)
4. lightweight local cache 1 (= not threadsafe local hashmap with
collision list which grows to a certain size (here: 10 elements) and
afterwards never changes)
5. lightweight local cache 2 (not threadsafe local hashmap without
collision list in which a collision replaces the stored element, idea
by Alexey)
... using a haystack of 2 mio strings and
A). 2 mio distinct simple patterns
B). 10 simple patterns
C) 2 mio distinct complex patterns
D) 10 complex patterns
For A) and C), caching does not help but these queries still allow us to
judge the static overhead of caching on query runtimes.
B) and D) are extreme but common cases in practice. They include
queries like "SELECT ... WHERE LIKE (col_haystack, flag ? '%pattern1%' :
'%pattern2%'). Caching should help significantly.
Because LIKE patterns are internally translated to re2 expressions, I
show only measurements for MATCH queries.
Results in sec, averaged over multiple measurements:
1.A): 2.12
B): 1.68
C): 9.75
D): 9.45
2.A): 2.17
B): 1.73
C): 9.78
D): 9.47
3.A): 9.8
B): 0.63
C): 31.8
D): 0.98
4.A): 2.14
B): 0.29
C): 9.82
D): 0.41
5.A) 2.12 / 2.15 / 2.26
B) 1.51 / 0.43 / 0.30
C) 9.97 / 9.88 / 10.13
D) 5.70 / 0.42 / 0.43
(10/100/1000 buckets, resp. 10/1/0.1% collision rate)
Evaluation:
1. This is the baseline. I was surprised that complex patterns (C, D)
slow down the queries so badly compared to simple patterns (A, B).
The runtime includes evaluation costs, but as caching only helps with
compilation, and looking at 4.D and 5.D, compilation makes up over 90%
of the runtime!
2. No speedup compared to 1, probably due to locking overhead. The cache
is unbounded, and in experiments with data sets > 2 mio rows, 2. is
the only scheme to throw OOM exceptions which is not acceptable.
3. Unique patterns (A and C) lead to thrashing of the LRU cache and very
bad runtimes due to LRU queue maintenance and locking. Works pretty
well however with few distinct patterns (B and D).
4. This scheme is tailored to queries B and D where it performs pretty
good. More importantly, the caching is lightweight enough to not
deteriorate performance on datasets A and C.
5. After some tuning of the hash map size, 100 buckets seem optimal to
be in the same ballpark with 10 distinct patterns as 4. Performance
also does not deteriorate on A and C compared to the baseline.
Unlike 4., this scheme behaves LRU-like and can adjust to changing
pattern distributions.
As a conclusion, this commit implements two things:
1. Based on Q1, pattern search with const needle no longer uses
caching. This applies to LIKE and MATCH + a few (exotic) other SQL
functions. The code for the unbounded caching was removed.
2. Based on Q2, pattern search with non-const needles now use method 5.
2022-05-27 10:40:53 +00:00
|
|
|
{
|
2022-08-15 18:58:46 +00:00
|
|
|
Bucket & bucket = known_regexps[hasher(pattern) % CACHE_SIZE];
|
Measure and rework internal re2 caching
This commit is based on local benchmarks of ClickHouse's re2 caching.
Question 1: -----------------------------------------------------------
Is pattern caching useful for queries with const LIKE/REGEX
patterns? E.g. SELECT LIKE(col_haystack, '%HelloWorld') FROM T;
The short answer is: no. Runtime is (unsurprisingly) dominated by
pattern evaluation + other stuff going on in queries, but definitely not
pattern compilation. For space reasons, I omit details of the local
experiments.
(Side note: the current caching scheme is unbounded in size which poses
a DoS risk (think of multi-tenancy). This risk is more pronounced when
unbounded caching is used with non-const patterns ..., see next
question)
Question 2: -----------------------------------------------------------
Is pattern caching useful for queries with non-const LIKE/REGEX
patterns? E.g. SELECT LIKE(col_haystack, col_needle) FROM T;
I benchmarked five caching strategies:
1. no caching as a baseline (= recompile for each row)
2. unbounded cache (= threadsafe global hash-map)
3. LRU cache (= threadsafe global hash-map + LRU queue)
4. lightweight local cache 1 (= not threadsafe local hashmap with
collision list which grows to a certain size (here: 10 elements) and
afterwards never changes)
5. lightweight local cache 2 (not threadsafe local hashmap without
collision list in which a collision replaces the stored element, idea
by Alexey)
... using a haystack of 2 mio strings and
A). 2 mio distinct simple patterns
B). 10 simple patterns
C) 2 mio distinct complex patterns
D) 10 complex patterns
For A) and C), caching does not help but these queries still allow us to
judge the static overhead of caching on query runtimes.
B) and D) are extreme but common cases in practice. They include
queries like "SELECT ... WHERE LIKE (col_haystack, flag ? '%pattern1%' :
'%pattern2%'). Caching should help significantly.
Because LIKE patterns are internally translated to re2 expressions, I
show only measurements for MATCH queries.
Results in sec, averaged over multiple measurements:
1.A): 2.12
B): 1.68
C): 9.75
D): 9.45
2.A): 2.17
B): 1.73
C): 9.78
D): 9.47
3.A): 9.8
B): 0.63
C): 31.8
D): 0.98
4.A): 2.14
B): 0.29
C): 9.82
D): 0.41
5.A) 2.12 / 2.15 / 2.26
B) 1.51 / 0.43 / 0.30
C) 9.97 / 9.88 / 10.13
D) 5.70 / 0.42 / 0.43
(10/100/1000 buckets, resp. 10/1/0.1% collision rate)
Evaluation:
1. This is the baseline. I was surprised that complex patterns (C, D)
slow down the queries so badly compared to simple patterns (A, B).
The runtime includes evaluation costs, but as caching only helps with
compilation, and looking at 4.D and 5.D, compilation makes up over 90%
of the runtime!
2. No speedup compared to 1, probably due to locking overhead. The cache
is unbounded, and in experiments with data sets > 2 mio rows, 2. is
the only scheme to throw OOM exceptions which is not acceptable.
3. Unique patterns (A and C) lead to thrashing of the LRU cache and very
bad runtimes due to LRU queue maintenance and locking. Works pretty
well however with few distinct patterns (B and D).
4. This scheme is tailored to queries B and D where it performs pretty
good. More importantly, the caching is lightweight enough to not
deteriorate performance on datasets A and C.
5. After some tuning of the hash map size, 100 buckets seem optimal to
be in the same ballpark with 10 distinct patterns as 4. Performance
also does not deteriorate on A and C compared to the baseline.
Unlike 4., this scheme behaves LRU-like and can adjust to changing
pattern distributions.
As a conclusion, this commit implements two things:
1. Based on Q1, pattern search with const needle no longer uses
caching. This applies to LIKE and MATCH + a few (exotic) other SQL
functions. The code for the unbounded caching was removed.
2. Based on Q2, pattern search with non-const needles now use method 5.
2022-05-27 10:40:53 +00:00
|
|
|
|
2022-08-15 18:58:46 +00:00
|
|
|
if (bucket.regexp == nullptr) [[unlikely]]
|
|
|
|
/// insert new entry
|
|
|
|
bucket = {pattern, std::make_shared<Regexp>(createRegexp<like, no_capture, case_insensitive>(pattern))};
|
Cache compiled regexps when evaluating non-const needles
Needles in a (non-const) needle column may repeat and this commit allows
to skip compilation for known needles. Out of the different design
alternatives (see below, if someone is interested), we now maintain
- one global pattern cache,
- with a fixed size of 42k elements currently,
- and use LRU as eviction strategy.
------------------------------------------------------------------------
(sorry for the wall of text, dumping it here not for reading but just
for reference)
Write-up about considered design alternatives:
1. Keep the current global cache of const needles. For non-const
needles, probe the cache but don't store values in it.
Pros: need to maintain just a single cache, no problem with cache
pollution assuming there are few distinct constant needles
Cons: only useful if a non-const needle already occurred as a
const needle
--> overall too simplistic
2. Keep the current global cache for const needles. For non-const
needles, create a local (e.g. per-query) cache
Pros: unlike (1.), non-const needles can be skipped even if they
did not occur yet, no pollution of the const pattern cache when
there are very many non-const needles (e.g. large / highly
distinct needle columns).
Cons: caches may explode "horizontally", i.e. we'll end up with the
const cache + caches for Q1, Q2, ... QN, this makes it harder
to control the overall space consumption, also patterns
residing in different caches cannot be reused between queries,
another difficulty is that the concept of "query" does not
really exist at matching level - there are only column chunks
and we'd potentially end up with 1 cache / chunk
3. Queries with const and non-const needles insert into the same global
cache.
Pros: the advantages of (2.) + allows to reuse compiled patterns
across parallel queries
Cons: needs an eviction strategy to control cache size and pollution
(and btw. (2.) also needs eviction strategies for the
individual caches)
4. Queries with const needle use global cache, queries with non-const
needle use a different global cache
--> Overall similar to (3) but ignores the (likely) edge case that
const and non-const needles overlap.
In sum, (3.) seems the simplest and most beneficial approach.
Eviction strategies:
0. Don't ever evict --> cache may grow infinitely and eventually make
the system unusable (may even pose a DoS risk)
1. Flush the cache after a certain threshold is exceeded --> very
simple but may lead to periodic performance drops
2. Use LRU --> more graceful performance degradation at threshold but
comes with a (constant) performance overhead to maintain the LRU
queue
In sum, given that the pattern compilation in RE2 should be quite costly
(pattern-to-DFA/NFA), LRU may be acceptable.
2022-05-25 19:22:45 +00:00
|
|
|
else
|
2022-08-15 18:58:46 +00:00
|
|
|
if (pattern != bucket.pattern)
|
|
|
|
/// replace existing entry
|
|
|
|
bucket = {pattern, std::make_shared<Regexp>(createRegexp<like, no_capture, case_insensitive>(pattern))};
|
|
|
|
|
|
|
|
return bucket.regexp;
|
Measure and rework internal re2 caching
This commit is based on local benchmarks of ClickHouse's re2 caching.
Question 1: -----------------------------------------------------------
Is pattern caching useful for queries with const LIKE/REGEX
patterns? E.g. SELECT LIKE(col_haystack, '%HelloWorld') FROM T;
The short answer is: no. Runtime is (unsurprisingly) dominated by
pattern evaluation + other stuff going on in queries, but definitely not
pattern compilation. For space reasons, I omit details of the local
experiments.
(Side note: the current caching scheme is unbounded in size which poses
a DoS risk (think of multi-tenancy). This risk is more pronounced when
unbounded caching is used with non-const patterns ..., see next
question)
Question 2: -----------------------------------------------------------
Is pattern caching useful for queries with non-const LIKE/REGEX
patterns? E.g. SELECT LIKE(col_haystack, col_needle) FROM T;
I benchmarked five caching strategies:
1. no caching as a baseline (= recompile for each row)
2. unbounded cache (= threadsafe global hash-map)
3. LRU cache (= threadsafe global hash-map + LRU queue)
4. lightweight local cache 1 (= not threadsafe local hashmap with
collision list which grows to a certain size (here: 10 elements) and
afterwards never changes)
5. lightweight local cache 2 (not threadsafe local hashmap without
collision list in which a collision replaces the stored element, idea
by Alexey)
... using a haystack of 2 mio strings and
A). 2 mio distinct simple patterns
B). 10 simple patterns
C) 2 mio distinct complex patterns
D) 10 complex patterns
For A) and C), caching does not help but these queries still allow us to
judge the static overhead of caching on query runtimes.
B) and D) are extreme but common cases in practice. They include
queries like "SELECT ... WHERE LIKE (col_haystack, flag ? '%pattern1%' :
'%pattern2%')". Caching should help significantly.
Because LIKE patterns are internally translated to re2 expressions, I
show only measurements for MATCH queries.
Results in sec, averaged over on multiple measurements;
1.A): 2.12
B): 1.68
C): 9.75
D): 9.45
2.A): 2.17
B): 1.73
C): 9.78
D): 9.47
3.A): 9.8
B): 0.63
C): 31.8
D): 0.98
4.A): 2.14
B): 0.29
C): 9.82
D): 0.41
5.A) 2.12 / 2.15 / 2.26
B) 1.51 / 0.43 / 0.30
C) 9.97 / 9.88 / 10.13
D) 5.70 / 0.42 / 0.43
(10/100/1000 buckets, resp. 10/1/0.1% collision rate)
Evaluation:
1. This is the baseline. It was surprised that complex patterns (C, D)
slow down the queries so badly compared to simple patterns (A, B).
The runtime includes evaluation costs, but as caching only helps with
compilation, and looking at 4.D and 5.D, compilation makes up over 90%
of the runtime!
2. No speedup compared to 1, probably due to locking overhead. The cache
is unbounded, and in experiments with data sets > 2 mio rows, 2. is
the only scheme to throw OOM exceptions which is not acceptable.
3. Unique patterns (A and C) lead to thrashing of the LRU cache and very
bad runtimes due to LRU queue maintenance and locking. Works pretty
well however with few distinct patterns (B and D).
4. This scheme is tailored to queries B and D where it performs pretty
good. More importantly, the caching is lightweight enough to not
deteriorate performance on datasets A and C.
5. After some tuning of the hash map size, 100 buckets seem optimal to
be in the same ballpark with 10 distinct patterns as 4. Performance
also does not deteriorate on A and C compared to the baseline.
Unlike 4., this scheme behaves LRU-like and can adjust to changing
pattern distributions.
As a conclusion, this commit implementes two things:
1. Based on Q1, pattern search with const needle no longer uses
caching. This applies to LIKE and MATCH + a few (exotic) other SQL
functions. The code for the unbounded caching was removed.
2. Based on Q2, pattern search with non-const needles now use method 5.
2022-05-27 10:40:53 +00:00
|
|
|
}
|
|
|
|
|
|
|
|
private:
|
2022-08-16 09:56:53 +00:00
|
|
|
constexpr static size_t CACHE_SIZE = 100; /// collision probability
|
2022-06-02 18:18:10 +00:00
|
|
|
|
2022-08-15 18:58:46 +00:00
|
|
|
std::hash<String> hasher;
|
|
|
|
struct Bucket
|
Measure and rework internal re2 caching
This commit is based on local benchmarks of ClickHouse's re2 caching.
Question 1: -----------------------------------------------------------
Is pattern caching useful for queries with const LIKE/REGEX
patterns? E.g. SELECT LIKE(col_haystack, '%HelloWorld') FROM T;
The short answer is: no. Runtime is (unsurprisingly) dominated by
pattern evaluation + other stuff going on in queries, but definitely not
pattern compilation. For space reasons, I omit details of the local
experiments.
(Side note: the current caching scheme is unbounded in size which poses
a DoS risk (think of multi-tenancy). This risk is more pronounced when
unbounded caching is used with non-const patterns ..., see next
question)
Question 2: -----------------------------------------------------------
Is pattern caching useful for queries with non-const LIKE/REGEX
patterns? E.g. SELECT LIKE(col_haystack, col_needle) FROM T;
I benchmarked five caching strategies:
1. no caching as a baseline (= recompile for each row)
2. unbounded cache (= threadsafe global hash-map)
3. LRU cache (= threadsafe global hash-map + LRU queue)
4. lightweight local cache 1 (= not threadsafe local hashmap with
collision list which grows to a certain size (here: 10 elements) and
afterwards never changes)
5. lightweight local cache 2 (not threadsafe local hashmap without
collision list in which a collision replaces the stored element, idea
by Alexey)
... using a haystack of 2 mio strings and
A). 2 mio distinct simple patterns
B). 10 simple patterns
C) 2 mio distinct complex patterns
D) 10 complex patterns
Fo A) and C), caching does not help but these queries still allow to
judge the static overhead of caching on query runtimes.
B) and D) are extreme but common cases in practice. They include
queries like "SELECT ... WHERE LIKE (col_haystack, flag ? '%pattern1%' :
'%pattern2%'). Caching should help significantly.
Because LIKE patterns are internally translated to re2 expressions, I
show only measurements for MATCH queries.
Results in sec, averaged over on multiple measurements;
1.A): 2.12
B): 1.68
C): 9.75
D): 9.45
2.A): 2.17
B): 1.73
C): 9.78
D): 9.47
3.A): 9.8
B): 0.63
C): 31.8
D): 0.98
4.A): 2.14
B): 0.29
C): 9.82
D): 0.41
5.A) 2.12 / 2.15 / 2.26
B) 1.51 / 0.43 / 0.30
C) 9.97 / 9.88 / 10.13
D) 5.70 / 0.42 / 0.43
(10/100/1000 buckets, resp. 10/1/0.1% collision rate)
Evaluation:
1. This is the baseline. It was surprised that complex patterns (C, D)
slow down the queries so badly compared to simple patterns (A, B).
The runtime includes evaluation costs, but as caching only helps with
compilation, and looking at 4.D and 5.D, compilation makes up over 90%
of the runtime!
2. No speedup compared to 1, probably due to locking overhead. The cache
is unbounded, and in experiments with data sets > 2 mio rows, 2. is
the only scheme to throw OOM exceptions which is not acceptable.
3. Unique patterns (A and C) lead to thrashing of the LRU cache and very
bad runtimes due to LRU queue maintenance and locking. Works pretty
well however with few distinct patterns (B and D).
4. This scheme is tailored to queries B and D where it performs pretty
good. More importantly, the caching is lightweight enough to not
deteriorate performance on datasets A and C.
5. After some tuning of the hash map size, 100 buckets seem optimal to
be in the same ballpark with 10 distinct patterns as 4. Performance
also does not deteriorate on A and C compared to the baseline.
Unlike 4., this scheme behaves LRU-like and can adjust to changing
pattern distributions.
As a conclusion, this commit implementes two things:
1. Based on Q1, pattern search with const needle no longer uses
caching. This applies to LIKE and MATCH + a few (exotic) other SQL
functions. The code for the unbounded caching was removed.
2. Based on Q2, pattern search with non-const needles now use method 5.
2022-05-27 10:40:53 +00:00
|
|
|
{
|
2022-08-15 18:58:46 +00:00
|
|
|
String pattern; /// key
|
|
|
|
RegexpPtr regexp; /// value
|
Measure and rework internal re2 caching
This commit is based on local benchmarks of ClickHouse's re2 caching.
Question 1: -----------------------------------------------------------
Is pattern caching useful for queries with const LIKE/REGEX
patterns? E.g. SELECT LIKE(col_haystack, '%HelloWorld') FROM T;
The short answer is: no. Runtime is (unsurprisingly) dominated by
pattern evaluation + other stuff going on in queries, but definitely not
pattern compilation. For space reasons, I omit details of the local
experiments.
(Side note: the current caching scheme is unbounded in size which poses
a DoS risk (think of multi-tenancy). This risk is more pronounced when
unbounded caching is used with non-const patterns ..., see next
question)
Question 2: -----------------------------------------------------------
Is pattern caching useful for queries with non-const LIKE/REGEX
patterns? E.g. SELECT LIKE(col_haystack, col_needle) FROM T;
I benchmarked five caching strategies:
1. no caching as a baseline (= recompile for each row)
2. unbounded cache (= threadsafe global hash-map)
3. LRU cache (= threadsafe global hash-map + LRU queue)
4. lightweight local cache 1 (= not threadsafe local hashmap with
collision list which grows to a certain size (here: 10 elements) and
afterwards never changes)
5. lightweight local cache 2 (not threadsafe local hashmap without
collision list in which a collision replaces the stored element, idea
by Alexey)
... using a haystack of 2 mio strings and
A). 2 mio distinct simple patterns
B). 10 simple patterns
C) 2 mio distinct complex patterns
D) 10 complex patterns
Fo A) and C), caching does not help but these queries still allow to
judge the static overhead of caching on query runtimes.
B) and D) are extreme but common cases in practice. They include
queries like "SELECT ... WHERE LIKE (col_haystack, flag ? '%pattern1%' :
'%pattern2%'). Caching should help significantly.
Because LIKE patterns are internally translated to re2 expressions, I
show only measurements for MATCH queries.
Results in sec, averaged over on multiple measurements;
1.A): 2.12
B): 1.68
C): 9.75
D): 9.45
2.A): 2.17
B): 1.73
C): 9.78
D): 9.47
3.A): 9.8
B): 0.63
C): 31.8
D): 0.98
4.A): 2.14
B): 0.29
C): 9.82
D): 0.41
5.A) 2.12 / 2.15 / 2.26
B) 1.51 / 0.43 / 0.30
C) 9.97 / 9.88 / 10.13
D) 5.70 / 0.42 / 0.43
(10/100/1000 buckets, resp. 10/1/0.1% collision rate)
Evaluation:
1. This is the baseline. It was surprised that complex patterns (C, D)
slow down the queries so badly compared to simple patterns (A, B).
The runtime includes evaluation costs, but as caching only helps with
compilation, and looking at 4.D and 5.D, compilation makes up over 90%
of the runtime!
2. No speedup compared to 1, probably due to locking overhead. The cache
is unbounded, and in experiments with data sets > 2 mio rows, 2. is
the only scheme to throw OOM exceptions which is not acceptable.
3. Unique patterns (A and C) lead to thrashing of the LRU cache and very
bad runtimes due to LRU queue maintenance and locking. Works pretty
well however with few distinct patterns (B and D).
4. This scheme is tailored to queries B and D where it performs pretty
good. More importantly, the caching is lightweight enough to not
deteriorate performance on datasets A and C.
5. After some tuning of the hash map size, 100 buckets seem optimal to
be in the same ballpark with 10 distinct patterns as 4. Performance
also does not deteriorate on A and C compared to the baseline.
Unlike 4., this scheme behaves LRU-like and can adjust to changing
pattern distributions.
As a conclusion, this commit implementes two things:
1. Based on Q1, pattern search with const needle no longer uses
caching. This applies to LIKE and MATCH + a few (exotic) other SQL
functions. The code for the unbounded caching was removed.
2. Based on Q2, pattern search with non-const needles now use method 5.
2022-05-27 10:40:53 +00:00
|
|
|
};
|
2022-08-15 18:58:46 +00:00
|
|
|
using CacheTable = std::array<Bucket, CACHE_SIZE>;
|
Measure and rework internal re2 caching
This commit is based on local benchmarks of ClickHouse's re2 caching.
Question 1: -----------------------------------------------------------
Is pattern caching useful for queries with const LIKE/REGEX
patterns? E.g. SELECT LIKE(col_haystack, '%HelloWorld') FROM T;
The short answer is: no. Runtime is (unsurprisingly) dominated by
pattern evaluation + other stuff going on in queries, but definitely not
pattern compilation. For space reasons, I omit details of the local
experiments.
(Side note: the current caching scheme is unbounded in size which poses
a DoS risk (think of multi-tenancy). This risk is more pronounced when
unbounded caching is used with non-const patterns ..., see next
question)
Question 2: -----------------------------------------------------------
Is pattern caching useful for queries with non-const LIKE/REGEX
patterns? E.g. SELECT LIKE(col_haystack, col_needle) FROM T;
I benchmarked five caching strategies:
1. no caching as a baseline (= recompile for each row)
2. unbounded cache (= threadsafe global hash-map)
3. LRU cache (= threadsafe global hash-map + LRU queue)
4. lightweight local cache 1 (= not threadsafe local hashmap with
collision list which grows to a certain size (here: 10 elements) and
afterwards never changes)
5. lightweight local cache 2 (not threadsafe local hashmap without
collision list in which a collision replaces the stored element, idea
by Alexey)
... using a haystack of 2 mio strings and
A). 2 mio distinct simple patterns
B). 10 simple patterns
C) 2 mio distinct complex patterns
D) 10 complex patterns
Fo A) and C), caching does not help but these queries still allow to
judge the static overhead of caching on query runtimes.
B) and D) are extreme but common cases in practice. They include
queries like "SELECT ... WHERE LIKE (col_haystack, flag ? '%pattern1%' :
'%pattern2%'). Caching should help significantly.
Because LIKE patterns are internally translated to re2 expressions, I
show only measurements for MATCH queries.
Results in sec, averaged over on multiple measurements;
1.A): 2.12
B): 1.68
C): 9.75
D): 9.45
2.A): 2.17
B): 1.73
C): 9.78
D): 9.47
3.A): 9.8
B): 0.63
C): 31.8
D): 0.98
4.A): 2.14
B): 0.29
C): 9.82
D): 0.41
5.A) 2.12 / 2.15 / 2.26
B) 1.51 / 0.43 / 0.30
C) 9.97 / 9.88 / 10.13
D) 5.70 / 0.42 / 0.43
(10/100/1000 buckets, resp. 10/1/0.1% collision rate)
Evaluation:
1. This is the baseline. It was surprised that complex patterns (C, D)
slow down the queries so badly compared to simple patterns (A, B).
The runtime includes evaluation costs, but as caching only helps with
compilation, and looking at 4.D and 5.D, compilation makes up over 90%
of the runtime!
2. No speedup compared to 1, probably due to locking overhead. The cache
is unbounded, and in experiments with data sets > 2 mio rows, 2. is
the only scheme to throw OOM exceptions which is not acceptable.
3. Unique patterns (A and C) lead to thrashing of the LRU cache and very
bad runtimes due to LRU queue maintenance and locking. Works pretty
well however with few distinct patterns (B and D).
4. This scheme is tailored to queries B and D where it performs pretty
good. More importantly, the caching is lightweight enough to not
deteriorate performance on datasets A and C.
5. After some tuning of the hash map size, 100 buckets seem optimal to
be in the same ballpark with 10 distinct patterns as 4. Performance
also does not deteriorate on A and C compared to the baseline.
Unlike 4., this scheme behaves LRU-like and can adjust to changing
pattern distributions.
As a conclusion, this commit implementes two things:
1. Based on Q1, pattern search with const needle no longer uses
caching. This applies to LIKE and MATCH + a few (exotic) other SQL
functions. The code for the unbounded caching was removed.
2. Based on Q2, pattern search with non-const needles now use method 5.
2022-05-27 10:40:53 +00:00
|
|
|
CacheTable known_regexps;
|
|
|
|
};
|
2017-04-01 07:20:54 +00:00
|
|
|
|
2017-03-10 17:52:36 +00:00
|
|
|
}
|
|
|
|
|
2022-06-17 10:15:19 +00:00
|
|
|
#if USE_VECTORSCAN
|
2019-03-23 19:40:16 +00:00
|
|
|
|
|
|
|
namespace MultiRegexps
|
|
|
|
{
|
2022-05-25 18:33:13 +00:00
|
|
|
/// Stateless functor that forwards the pointer it receives to the function bound at compile time through
/// the `deleter` template argument. Suitable as the deleter type of a std::unique_ptr.
template <typename Deleter, Deleter deleter>
struct HyperscanDeleter
{
    /// Invokes the bound `deleter` on `resource`; any value it returns is discarded.
    template <typename Resource>
    void operator()(Resource * resource) const { deleter(resource); }
};
|
2019-03-23 19:40:16 +00:00
|
|
|
|
2022-05-25 18:33:13 +00:00
|
|
|
/// Helper unique pointers to correctly delete the allocated space when hyperscan cannot compile something and we throw an exception.
|
|
|
|
using CompilerError = std::unique_ptr<hs_compile_error_t, HyperscanDeleter<decltype(&hs_free_compile_error), &hs_free_compile_error>>;
|
|
|
|
using ScratchPtr = std::unique_ptr<hs_scratch_t, HyperscanDeleter<decltype(&hs_free_scratch), &hs_free_scratch>>;
|
|
|
|
using DataBasePtr = std::unique_ptr<hs_database_t, HyperscanDeleter<decltype(&hs_free_database), &hs_free_database>>;
|
2019-03-31 20:19:22 +00:00
|
|
|
|
2022-05-25 18:33:13 +00:00
|
|
|
/// Database is thread safe across multiple threads and Scratch is not but we can copy it whenever we use it in the searcher.
|
|
|
|
class Regexps
|
|
|
|
{
|
|
|
|
public:
|
|
|
|
Regexps(hs_database_t * db_, hs_scratch_t * scratch_) : db{db_}, scratch{scratch_} { }
|
2019-03-31 20:19:22 +00:00
|
|
|
|
2022-05-25 18:33:13 +00:00
|
|
|
hs_database_t * getDB() const { return db.get(); }
|
|
|
|
hs_scratch_t * getScratch() const { return scratch.get(); }
|
2019-10-13 13:22:09 +00:00
|
|
|
|
2022-05-25 18:33:13 +00:00
|
|
|
private:
|
|
|
|
DataBasePtr db;
|
|
|
|
ScratchPtr scratch;
|
|
|
|
};
|
2019-03-23 19:40:16 +00:00
|
|
|
|
2022-08-15 18:58:46 +00:00
|
|
|
class DeferredConstructedRegexps
|
2022-05-25 18:33:13 +00:00
|
|
|
{
|
|
|
|
public:
|
2022-08-16 09:56:53 +00:00
|
|
|
explicit DeferredConstructedRegexps(std::function<Regexps()> constructor_)
|
|
|
|
: constructor(std::move(constructor_))
|
|
|
|
{}
|
2021-08-05 02:28:11 +00:00
|
|
|
|
2022-08-15 18:58:46 +00:00
|
|
|
Regexps * get()
|
2022-05-25 18:33:13 +00:00
|
|
|
{
|
2022-06-24 13:42:42 +00:00
|
|
|
std::lock_guard lock(mutex);
|
2022-08-15 18:58:46 +00:00
|
|
|
if (regexps)
|
|
|
|
return &*regexps;
|
|
|
|
regexps = constructor();
|
|
|
|
return &*regexps;
|
2022-05-25 18:33:13 +00:00
|
|
|
}
|
2021-08-05 02:28:11 +00:00
|
|
|
|
2022-05-25 18:33:13 +00:00
|
|
|
private:
|
2022-08-16 09:56:53 +00:00
|
|
|
std::function<Regexps()> constructor TSA_GUARDED_BY(mutex);
|
2022-08-15 18:58:46 +00:00
|
|
|
std::optional<Regexps> regexps TSA_GUARDED_BY(mutex);
|
2022-05-25 18:33:13 +00:00
|
|
|
std::mutex mutex;
|
|
|
|
};
|
2021-08-05 02:28:11 +00:00
|
|
|
|
2022-08-16 09:56:53 +00:00
|
|
|
using DeferredConstructedRegexpsPtr = std::shared_ptr<DeferredConstructedRegexps>;
|
2022-05-25 18:33:13 +00:00
|
|
|
|
2022-06-25 15:28:15 +00:00
|
|
|
template <bool save_indices, bool WithEditDistance>
|
2022-06-24 13:34:40 +00:00
|
|
|
inline Regexps constructRegexps(const std::vector<String> & str_patterns, [[maybe_unused]] std::optional<UInt32> edit_distance)
|
2022-05-25 18:33:13 +00:00
|
|
|
{
|
|
|
|
/// Common pointers
|
|
|
|
std::vector<const char *> patterns;
|
|
|
|
std::vector<unsigned int> flags;
|
2019-03-23 19:40:16 +00:00
|
|
|
|
2022-05-25 18:33:13 +00:00
|
|
|
/// Pointer for external edit distance compilation
|
|
|
|
std::vector<hs_expr_ext> ext_exprs;
|
|
|
|
std::vector<const hs_expr_ext *> ext_exprs_ptrs;
|
2019-03-29 01:02:05 +00:00
|
|
|
|
2022-05-25 18:33:13 +00:00
|
|
|
patterns.reserve(str_patterns.size());
|
|
|
|
flags.reserve(str_patterns.size());
|
|
|
|
|
2022-06-25 15:28:15 +00:00
|
|
|
if constexpr (WithEditDistance)
|
2022-05-25 18:33:13 +00:00
|
|
|
{
|
|
|
|
ext_exprs.reserve(str_patterns.size());
|
|
|
|
ext_exprs_ptrs.reserve(str_patterns.size());
|
|
|
|
}
|
2019-03-29 01:02:05 +00:00
|
|
|
|
2022-08-15 18:58:46 +00:00
|
|
|
for (std::string_view ref : str_patterns)
|
2022-05-25 18:33:13 +00:00
|
|
|
{
|
2022-08-15 18:58:46 +00:00
|
|
|
patterns.push_back(ref.data());
|
2022-05-25 18:33:13 +00:00
|
|
|
/* Flags below are the pattern matching flags.
|
|
|
|
* HS_FLAG_DOTALL is a compile flag where matching a . will not exclude newlines. This is a good
|
|
|
|
* performance practice according to Hyperscan API. https://intel.github.io/hyperscan/dev-reference/performance.html#dot-all-mode
|
|
|
|
* HS_FLAG_ALLOWEMPTY is a compile flag where empty strings are allowed to match.
|
|
|
|
* HS_FLAG_UTF8 is a flag where UTF8 literals are matched.
|
|
|
|
* HS_FLAG_SINGLEMATCH is a compile flag where each pattern match will be returned only once. it is a good performance practice
|
|
|
|
* as it is said in the Hyperscan documentation. https://intel.github.io/hyperscan/dev-reference/performance.html#single-match-flag
|
|
|
|
*/
|
|
|
|
flags.push_back(HS_FLAG_DOTALL | HS_FLAG_SINGLEMATCH | HS_FLAG_ALLOWEMPTY | HS_FLAG_UTF8);
|
2022-06-25 15:28:15 +00:00
|
|
|
if constexpr (WithEditDistance)
|
2019-03-31 20:55:36 +00:00
|
|
|
{
|
2022-05-25 18:33:13 +00:00
|
|
|
/// Hyperscan currently does not support UTF8 matching with edit distance.
|
|
|
|
flags.back() &= ~HS_FLAG_UTF8;
|
|
|
|
ext_exprs.emplace_back();
|
|
|
|
/// HS_EXT_FLAG_EDIT_DISTANCE is a compile flag responsible for Levenstein distance.
|
|
|
|
ext_exprs.back().flags = HS_EXT_FLAG_EDIT_DISTANCE;
|
|
|
|
ext_exprs.back().edit_distance = edit_distance.value();
|
|
|
|
ext_exprs_ptrs.push_back(&ext_exprs.back());
|
2019-03-31 20:55:36 +00:00
|
|
|
}
|
2022-05-25 18:33:13 +00:00
|
|
|
}
|
|
|
|
hs_database_t * db = nullptr;
|
|
|
|
hs_compile_error_t * compile_error;
|
2019-03-29 01:02:05 +00:00
|
|
|
|
2022-05-25 18:33:13 +00:00
|
|
|
std::unique_ptr<unsigned int[]> ids;
|
2019-03-29 01:02:05 +00:00
|
|
|
|
2022-05-25 18:33:13 +00:00
|
|
|
/// We mark the patterns to provide the callback results.
|
|
|
|
if constexpr (save_indices)
|
|
|
|
{
|
|
|
|
ids.reset(new unsigned int[patterns.size()]);
|
|
|
|
for (size_t i = 0; i < patterns.size(); ++i)
|
|
|
|
ids[i] = i + 1;
|
|
|
|
}
|
2019-03-23 22:49:38 +00:00
|
|
|
|
2022-05-25 18:33:13 +00:00
|
|
|
hs_error_t err;
|
2022-06-25 15:28:15 +00:00
|
|
|
if constexpr (!WithEditDistance)
|
2022-05-25 18:33:13 +00:00
|
|
|
err = hs_compile_multi(
|
|
|
|
patterns.data(),
|
|
|
|
flags.data(),
|
|
|
|
ids.get(),
|
|
|
|
patterns.size(),
|
|
|
|
HS_MODE_BLOCK,
|
|
|
|
nullptr,
|
|
|
|
&db,
|
|
|
|
&compile_error);
|
|
|
|
else
|
|
|
|
err = hs_compile_ext_multi(
|
|
|
|
patterns.data(),
|
|
|
|
flags.data(),
|
|
|
|
ids.get(),
|
|
|
|
ext_exprs_ptrs.data(),
|
|
|
|
patterns.size(),
|
|
|
|
HS_MODE_BLOCK,
|
|
|
|
nullptr,
|
|
|
|
&db,
|
|
|
|
&compile_error);
|
|
|
|
|
|
|
|
if (err != HS_SUCCESS)
|
|
|
|
{
|
|
|
|
/// CompilerError is a unique_ptr, so correct memory free after the exception is thrown.
|
|
|
|
CompilerError error(compile_error);
|
2019-03-23 22:49:38 +00:00
|
|
|
|
2022-05-25 18:33:13 +00:00
|
|
|
if (error->expression < 0)
|
2022-08-15 18:58:46 +00:00
|
|
|
throw Exception(ErrorCodes::LOGICAL_ERROR, String(error->message));
|
2019-03-31 20:55:36 +00:00
|
|
|
else
|
2022-08-15 18:58:46 +00:00
|
|
|
throw Exception(ErrorCodes::BAD_ARGUMENTS, "Pattern '{}' failed with error '{}'", str_patterns[error->expression], String(error->message));
|
2022-05-25 18:33:13 +00:00
|
|
|
}
|
2019-03-23 19:40:16 +00:00
|
|
|
|
2022-05-25 18:33:13 +00:00
|
|
|
ProfileEvents::increment(ProfileEvents::RegexpCreated);
|
2019-03-23 19:40:16 +00:00
|
|
|
|
2022-05-25 18:33:13 +00:00
|
|
|
/// We allocate the scratch space only once, then copy it across multiple threads with hs_clone_scratch
|
|
|
|
/// function which is faster than allocating scratch space each time in each thread.
|
|
|
|
hs_scratch_t * scratch = nullptr;
|
|
|
|
err = hs_alloc_scratch(db, &scratch);
|
2019-03-31 20:19:22 +00:00
|
|
|
|
2022-05-25 18:33:13 +00:00
|
|
|
/// If not HS_SUCCESS, it is guaranteed that the memory would not be allocated for scratch.
|
|
|
|
if (err != HS_SUCCESS)
|
2022-08-15 18:58:46 +00:00
|
|
|
throw Exception(ErrorCodes::CANNOT_ALLOCATE_MEMORY, "Could not allocate scratch space for hyperscan");
|
2019-03-31 20:19:22 +00:00
|
|
|
|
2022-06-24 13:34:40 +00:00
|
|
|
return {db, scratch};
|
2022-05-25 18:33:13 +00:00
|
|
|
}
|
2019-03-31 20:55:36 +00:00
|
|
|
|
2022-08-16 09:56:53 +00:00
|
|
|
/// Maps string pattern vectors + edit distance to compiled vectorscan regexps. Uses the same eviction mechanism as the LocalCacheTable for
|
|
|
|
/// re2 patterns. Because vectorscan regexes are overall more heavy-weight (more expensive compilation, regexes can grow up to multiple
|
|
|
|
/// MBs, usage of scratch space), 1. GlobalCacheTable is a global singleton and, as a result, needs locking 2. the pattern compilation is
|
|
|
|
/// done outside GlobalCacheTable's lock, at the cost of another level of locking.
|
|
|
|
struct GlobalCacheTable
|
|
|
|
{
|
|
|
|
constexpr static size_t CACHE_SIZE = 500; /// collision probability
|
|
|
|
|
|
|
|
struct Bucket
|
|
|
|
{
|
|
|
|
std::vector<String> patterns; /// key
|
|
|
|
std::optional<UInt32> edit_distance; /// key
|
|
|
|
/// The compiled patterns and their state (vectorscan 'database' + scratch space) are wrapped in a shared_ptr. Refcounting guarantees
|
|
|
|
/// that eviction of a pattern does not affect parallel threads still using the pattern.
|
|
|
|
DeferredConstructedRegexpsPtr regexps; /// value
|
|
|
|
};
|
|
|
|
|
|
|
|
std::array<Bucket, CACHE_SIZE> known_regexps TSA_GUARDED_BY(mutex);
|
|
|
|
std::mutex mutex;
|
|
|
|
|
|
|
|
static size_t getBucketIndexFor(const std::vector<String> patterns, std::optional<UInt32> edit_distance)
|
|
|
|
{
|
|
|
|
size_t hash = 0;
|
|
|
|
for (const auto & pattern : patterns)
|
|
|
|
boost::hash_combine(hash, pattern);
|
|
|
|
boost::hash_combine(hash, edit_distance);
|
|
|
|
return hash % CACHE_SIZE;
|
|
|
|
}
|
|
|
|
};
|
|
|
|
|
|
|
|
/// If WithEditDistance is False, edit_distance must be nullopt. Also, we use templates here because each instantiation of function template
|
|
|
|
/// has its own copy of local static variables which must not be the same for different hyperscan compilations.
|
2022-06-25 15:28:15 +00:00
|
|
|
template <bool save_indices, bool WithEditDistance>
|
2022-08-16 09:56:53 +00:00
|
|
|
inline DeferredConstructedRegexpsPtr getOrSet(const std::vector<std::string_view> & patterns, std::optional<UInt32> edit_distance)
|
2022-05-25 18:33:13 +00:00
|
|
|
{
|
2022-08-16 09:56:53 +00:00
|
|
|
static GlobalCacheTable pool; /// Different variables for different pattern parameters, thread-safe in C++11
|
2019-03-31 20:55:36 +00:00
|
|
|
|
2022-05-25 18:33:13 +00:00
|
|
|
std::vector<String> str_patterns;
|
|
|
|
str_patterns.reserve(patterns.size());
|
2022-06-24 13:16:57 +00:00
|
|
|
for (const auto & pattern : patterns)
|
2022-08-15 18:58:46 +00:00
|
|
|
str_patterns.emplace_back(String(pattern));
|
2019-03-31 20:55:36 +00:00
|
|
|
|
2022-08-16 09:56:53 +00:00
|
|
|
size_t bucket_idx = GlobalCacheTable::getBucketIndexFor(str_patterns, edit_distance);
|
2019-03-31 20:55:36 +00:00
|
|
|
|
2022-08-16 09:56:53 +00:00
|
|
|
/// Lock cache to find compiled regexp for given pattern vector + edit distance.
|
|
|
|
std::lock_guard lock(pool.mutex);
|
2019-03-31 21:04:45 +00:00
|
|
|
|
2022-08-16 09:56:53 +00:00
|
|
|
GlobalCacheTable::Bucket & bucket = pool.known_regexps[bucket_idx];
|
|
|
|
|
|
|
|
/// Pattern compilation is expensive and we don't want to block other threads reading from / inserting into the cache while we hold the
|
|
|
|
/// cache lock during pattern compilation. Therefore, when a cache entry is created or replaced, only set the regexp constructor method
|
|
|
|
/// and compile outside the cache lock.
|
|
|
|
/// Note that the string patterns and the edit distance is passed into the constructor lambda by value, i.e. copied - it is not an
|
|
|
|
/// option to reference the corresponding string patterns / edit distance key in the cache table bucket because the cache entry may
|
|
|
|
/// already be evicted at the time the compilation starts.
|
|
|
|
|
|
|
|
if (bucket.regexps == nullptr) [[unlikely]]
|
2022-05-25 18:33:13 +00:00
|
|
|
{
|
2022-08-16 09:56:53 +00:00
|
|
|
/// insert new entry
|
|
|
|
auto deferred_constructed_regexps = std::make_shared<DeferredConstructedRegexps>(
|
|
|
|
[str_patterns, edit_distance]()
|
|
|
|
{
|
|
|
|
return constructRegexps<save_indices, WithEditDistance>(str_patterns, edit_distance);
|
|
|
|
});
|
|
|
|
bucket = {std::move(str_patterns), edit_distance, deferred_constructed_regexps};
|
2019-03-23 19:40:16 +00:00
|
|
|
}
|
2022-08-16 09:56:53 +00:00
|
|
|
else
|
|
|
|
if (bucket.patterns != str_patterns || bucket.edit_distance != edit_distance)
|
|
|
|
{
|
|
|
|
/// replace existing entry
|
|
|
|
auto deferred_constructed_regexps = std::make_shared<DeferredConstructedRegexps>(
|
|
|
|
[str_patterns, edit_distance]()
|
|
|
|
{
|
|
|
|
return constructRegexps<save_indices, WithEditDistance>(str_patterns, edit_distance);
|
|
|
|
});
|
|
|
|
bucket = {std::move(str_patterns), edit_distance, deferred_constructed_regexps};
|
|
|
|
}
|
2022-05-25 18:33:13 +00:00
|
|
|
|
2022-08-16 09:56:53 +00:00
|
|
|
return bucket.regexps;
|
2022-05-25 18:33:13 +00:00
|
|
|
}
|
|
|
|
|
2019-03-23 19:40:16 +00:00
|
|
|
}
|
|
|
|
|
2022-06-17 10:15:19 +00:00
|
|
|
#endif // USE_VECTORSCAN
|
2019-03-23 19:40:16 +00:00
|
|
|
|
2017-03-10 17:52:36 +00:00
|
|
|
}
|