This commit is contained in:
Nikita Vasilev 2019-02-23 16:06:23 +03:00
parent 970f93d3c4
commit ca4d1e137c
4 changed files with 62 additions and 16 deletions

View File

@ -27,7 +27,7 @@ namespace ErrorCodes
/// Adds all tokens from string to bloom filter.
static void stringToBloomFilter(
const char * data, size_t size, const std::unique_ptr<TokenExtractor> & token_extractor, StringBloomFilter & bloom_filter)
const char * data, size_t size, const std::unique_ptr<ITokenExtractor> & token_extractor, StringBloomFilter & bloom_filter)
{
size_t cur = 0;
size_t token_start = 0;
@ -38,7 +38,7 @@ static void stringToBloomFilter(
/// Adds all tokens from like pattern string to bloom filter. (Because like pattern can contain `\%` and `\_`.)
static void likeStringToBloomFilter(
const String & data, const std::unique_ptr<TokenExtractor> & token_extractor, StringBloomFilter & bloom_filter)
const String & data, const std::unique_ptr<ITokenExtractor> & token_extractor, StringBloomFilter & bloom_filter)
{
size_t cur = 0;
String token;

View File

@ -96,17 +96,22 @@ private:
PreparedSets prepared_sets;
};
struct TokenExtractor
/// Interface for string parsers.
struct ITokenExtractor
{
virtual ~TokenExtractor() = default;
virtual ~ITokenExtractor() = default;
/// Fast inplace implementation for regular use.
/// Gets string (data ptr and len) and start position for extracting next token (state of extractor).
/// Returns false if parsing is finished, otherwise returns true.
virtual bool next(const char * data, size_t len, size_t * pos, size_t * token_start, size_t * token_len) const = 0;
/// Special implementation for creating bloom filter for LIKE function.
/// It skips unescaped `%` and `_` and handles escaped symbols, but it is less lightweight than `next`.
virtual bool nextLike(const String & str, size_t * pos, String & out) const = 0;
};
struct NgramTokenExtractor : public TokenExtractor
/// Parser extracting all ngrams from string.
struct NgramTokenExtractor : public ITokenExtractor
{
NgramTokenExtractor(size_t n_) : n(n_) {}
@ -143,7 +148,7 @@ public:
size_t bloom_filter_size_,
size_t bloom_filter_hashes_,
size_t seed_,
std::unique_ptr<TokenExtractor> && token_extractor_func_)
std::unique_ptr<ITokenExtractor> && token_extractor_func_)
: IMergeTreeIndex(name_, expr_, columns_, data_types_, header_, granularity_)
, bloom_filter_size(bloom_filter_size_)
, bloom_filter_hashes(bloom_filter_hashes_)
@ -164,7 +169,7 @@ public:
/// Bloom filter seed.
size_t seed;
/// Function for selecting next token.
std::unique_ptr<TokenExtractor> token_extractor_func;
std::unique_ptr<ITokenExtractor> token_extractor_func;
};
}

View File

@ -0,0 +1,23 @@
8 aбвгдеёж
"rows_read": 2,
13 abc
"rows_read": 1,
1 ClickHouse is a column-oriented database management system (DBMS) for online analytical processing of queries (OLAP).
"rows_read": 2,
1 ClickHouse is a column-oriented database management system (DBMS) for online analytical processing of queries (OLAP).
"rows_read": 2,
0 ClickHouse - столбцовая система управления базами данных (СУБД) для онлайн обработки аналитических запросов (OLAP).
"rows_read": 2,
0 ClickHouse - столбцовая система управления базами данных (СУБД) для онлайн обработки аналитических запросов (OLAP).
5 еще строка
"rows_read": 4,
12 <div> странный <strong>html</strong> </div>
"rows_read": 2,
9 2_2%2_2\\
"rows_read": 2,
9 2_2%2_2\\
"rows_read": 2,
9 2_2%2_2\\
"rows_read": 2,
9 2_2%2_2\\
"rows_read": 2,

View File

@ -12,7 +12,7 @@ CREATE TABLE test.bloom_filter_idx
(
k UInt64,
s String,
INDEX bf (s) TYPE ngrambf(3, 16, 0) GRANULARITY 1,
INDEX bf (s, lower(s)) TYPE ngrambf(3, 512, 0) GRANULARITY 1
) ENGINE = MergeTree()
ORDER BY k
SETTINGS index_granularity = 2;"
@ -28,27 +28,45 @@ $CLICKHOUSE_CLIENT --query="INSERT INTO test.bloom_filter_idx VALUES
(6, 'some string'),
(7, 'another string'),
(8, 'aбвгдеёж'),
(9, '2_2%2_2\\'),
(11, '!@#$%^&*()1234567890<>?:|{}'),
(9, '2_2%2_2\\\\'),
(11, '!@#$%^&*0123456789'),
(12, '<div> странный <strong>html</strong> </div>'),
(13, 'abc')"
$CLICKHOUSE_CLIENT --query="SELECT * FROM test.bloom_filter_idx WHERE s = 'aбвгдеёж' ORDER BY k"
$CLICKHOUSE_CLIENT --query="SELECT * FROM test.bloom_filter_idx WHERE s = 'aбвгдеёж' ORDER BY k FORMAT JSON" | grep "rows_read"
$CLICKHOUSE_CLIENT --query="SELECT * FROM test.bloom_filter_idx WHERE s = 'abc' ORDER BY k"
$CLICKHOUSE_CLIENT --query="SELECT * FROM test.bloom_filter_idx WHERE s = 'abc' ORDER BY k FORMAT JSON" | grep "rows_read"
$CLICKHOUSE_CLIENT --query="SELECT * FROM test.bloom_filter_idx WHERE s LIKE '%database%' AND s LIKE '%ClickHouse%' ORDER BY k"
$CLICKHOUSE_CLIENT --query="SELECT * FROM test.bloom_filter_idx" | grep "rows_read"
$CLICKHOUSE_CLIENT --query="SELECT * FROM test.bloom_filter_idx WHERE s LIKE '%database%' AND s LIKE '%ClickHouse%' ORDER BY k FORMAT JSON" | grep "rows_read"
$CLICKHOUSE_CLIENT --query="SELECT * FROM test.bloom_filter_idx WHERE s LIKE '%database%' AND lower(s) LIKE '%clickhouse%' ORDER BY k"
$CLICKHOUSE_CLIENT --query="SELECT * FROM test.bloom_filter_idx WHERE s LIKE '%database%' AND lower(s) LIKE '%clickhouse%' ORDER BY k FORMAT JSON" | grep "rows_read"
$CLICKHOUSE_CLIENT --query="SELECT * FROM test.bloom_filter_idx WHERE s LIKE '%базами данных%' AND s LIKE '%ClickHouse%' ORDER BY k"
$CLICKHOUSE_CLIENT --query="SELECT * FROM test.bloom_filter_idx WHERE s LIKE '%базами данных%' AND s LIKE '%ClickHouse%' ORDER BY k FORMAT JSON" | grep "rows_read"
$CLICKHOUSE_CLIENT --query="SELECT * FROM test.bloom_filter_idx WHERE (s LIKE '%базами данных%' AND s LIKE '%ClickHouse%') OR s LIKE '____строка' ORDER BY k"
$CLICKHOUSE_CLIENT --query="SELECT * FROM test.bloom_filter_idx WHERE (s LIKE '%базами данных%' AND s LIKE '%ClickHouse%') OR s LIKE '____строка' ORDER BY k FORMAT JSON" | grep "rows_read"
$CLICKHOUSE_CLIENT --query="SELECT * FROM test.bloom_filter_idx WHERE s LIKE '%%<div>_%_%_</div>%%' ORDER BY k"
$CLICKHOUSE_CLIENT --query="SELECT * FROM test.bloom_filter_idx WHERE s LIKE '%%<div>_%_%_</div>%%' ORDER BY k FORMAT JSON" | grep "rows_read"
$CLICKHOUSE_CLIENT --query="SELECT * FROM test.bloom_filter_idx WHERE s LIKE '%2\\\\%2%' ORDER BY k"
$CLICKHOUSE_CLIENT --query="SELECT * FROM test.bloom_filter_idx WHERE s LIKE '%2\\\\%2%' ORDER BY k FORMAT JSON" | grep "rows_read"
$CLICKHOUSE_CLIENT --query="SELECT * FROM test.bloom_filter_idx WHERE s LIKE '%_\\\\%2\\\\__\\\\' ORDER BY k"
$CLICKHOUSE_CLIENT --query="SELECT * FROM test.bloom_filter_idx WHERE s LIKE '%_\\\\%2\\\\__\\\\' ORDER BY k FORMAT JSON" | grep "rows_read"
$CLICKHOUSE_CLIENT --query="SELECT * FROM test.bloom_filter_idx WHERE s LIKE '2\\\\_2\\\\%2_2\\\\' ORDER BY k"
$CLICKHOUSE_CLIENT --query="SELECT * FROM test.bloom_filter_idx WHERE s LIKE '2\\\\_2\\\\%2_2\\\\' ORDER BY k FORMAT JSON" | grep "rows_read"
$CLICKHOUSE_CLIENT --query="SELECT * FROM test.bloom_filter_idx WHERE s LIKE '2\\\\_2\\\\%2_2_' ORDER BY k"
$CLICKHOUSE_CLIENT --query="SELECT * FROM test.bloom_filter_idx WHERE s LIKE '2\\\\_2\\\\%2_2_' ORDER BY k FORMAT JSON" | grep "rows_read"
$CLICKHOUSE_CLIENT --query="SELECT * FROM test.bloom_filter_idx WHERE s LIKE '%2\\%2%' ORDER BY k"
$CLICKHOUSE_CLIENT --query="SELECT * FROM test.bloom_filter_idx WHERE s LIKE '%2\\_2%' ORDER BY k"
$CLICKHOUSE_CLIENT --query="SELECT * FROM test.bloom_filter_idx WHERE s LIKE '%2\\\\' ORDER BY k"
$CLICKHOUSE_CLIENT --query="SELECT * FROM test.bloom_filter_idx WHERE s LIKE '2\\_2\\%2_2\\\\' ORDER BY k"
$CLICKHOUSE_CLIENT --query="SELECT * FROM test.bloom_filter_idx WHERE s LIKE '2\\_2\\%2_2_' ORDER BY k"
$CLICKHOUSE_CLIENT --query="DROP TABLE test.bloom_filter_idx"