mirror of
https://github.com/ClickHouse/ClickHouse.git
synced 2024-12-01 20:12:02 +00:00
like
This commit is contained in:
parent
f66cef68ff
commit
34e2e2623e
@ -12,6 +12,8 @@
|
|||||||
|
|
||||||
#include <Poco/Logger.h>
|
#include <Poco/Logger.h>
|
||||||
|
|
||||||
|
#include <boost/algorithm/string.hpp>
|
||||||
|
|
||||||
|
|
||||||
namespace DB
|
namespace DB
|
||||||
{
|
{
|
||||||
@ -23,16 +25,27 @@ namespace ErrorCodes
|
|||||||
}
|
}
|
||||||
|
|
||||||
|
|
||||||
|
/// Adds all tokens from string to bloom filter.
|
||||||
static void stringToBloomFilter(
|
static void stringToBloomFilter(
|
||||||
const char * data, size_t size, TokenExtractor tokenExtractor, StringBloomFilter & bloom_filter)
|
const char * data, size_t size, TokenExtractor token_extractor, StringBloomFilter & bloom_filter)
|
||||||
{
|
{
|
||||||
size_t cur = 0;
|
size_t cur = 0;
|
||||||
size_t token_start = 0;
|
size_t token_start = 0;
|
||||||
size_t token_len = 0;
|
size_t token_len = 0;
|
||||||
while (cur < size && tokenExtractor(data, size, &cur, &token_start, &token_len))
|
while (cur < size && token_extractor.next(data, size, &cur, &token_start, &token_len))
|
||||||
bloom_filter.add(data + token_start, token_len);
|
bloom_filter.add(data + token_start, token_len);
|
||||||
}
|
}
|
||||||
|
|
||||||
|
/// Adds all tokens from like pattern string to bloom filter. (Because like pattern can contain `\%` and `\_`.)
|
||||||
|
static void likeStringToBloomFilter(
|
||||||
|
const String & data, TokenExtractor token_extractor, StringBloomFilter & bloom_filter)
|
||||||
|
{
|
||||||
|
size_t cur = 0;
|
||||||
|
String token;
|
||||||
|
while (cur < data.size() && token_extractor.nextLike(data, &cur, token))
|
||||||
|
bloom_filter.add(token.c_str(), token.size());
|
||||||
|
}
|
||||||
|
|
||||||
|
|
||||||
MergeTreeBloomFilterIndexGranule::MergeTreeBloomFilterIndexGranule(const MergeTreeBloomFilterIndex & index)
|
MergeTreeBloomFilterIndexGranule::MergeTreeBloomFilterIndexGranule(const MergeTreeBloomFilterIndex & index)
|
||||||
: IMergeTreeIndexGranule()
|
: IMergeTreeIndexGranule()
|
||||||
@ -113,12 +126,16 @@ const BloomFilterCondition::AtomMap BloomFilterCondition::atom_map
|
|||||||
return true;
|
return true;
|
||||||
}
|
}
|
||||||
},
|
},
|
||||||
/*{
|
{
|
||||||
"like",
|
"like",
|
||||||
[] (RPNElement & out, const Field & value, const MergeTreeBloomFilterIndex & idx)
|
[] (RPNElement & out, const Field & value, const MergeTreeBloomFilterIndex & idx)
|
||||||
{
|
{
|
||||||
out.function = RPNElement::FUNCTION_LIKE;
|
out.function = RPNElement::FUNCTION_EQUALS;
|
||||||
out.bloom_filter = std::move(bf);
|
out.bloom_filter = std::make_unique<StringBloomFilter>(
|
||||||
|
idx.bloom_filter_size, idx.bloom_filter_hashes, idx.seed);
|
||||||
|
|
||||||
|
String str = value.get<String>();
|
||||||
|
likeStringToBloomFilter(str, idx.tokenExtractorFunc, *out.bloom_filter);
|
||||||
return true;
|
return true;
|
||||||
}
|
}
|
||||||
},
|
},
|
||||||
@ -126,11 +143,15 @@ const BloomFilterCondition::AtomMap BloomFilterCondition::atom_map
|
|||||||
"notLike",
|
"notLike",
|
||||||
[] (RPNElement & out, const Field & value, const MergeTreeBloomFilterIndex & idx)
|
[] (RPNElement & out, const Field & value, const MergeTreeBloomFilterIndex & idx)
|
||||||
{
|
{
|
||||||
out.function = RPNElement::FUNCTION_NOT_LIKE;
|
out.function = RPNElement::FUNCTION_EQUALS;
|
||||||
out.bloom_filter = std::move(bf);
|
out.bloom_filter = std::make_unique<StringBloomFilter>(
|
||||||
|
idx.bloom_filter_size, idx.bloom_filter_hashes, idx.seed);
|
||||||
|
|
||||||
|
String str = value.get<String>();
|
||||||
|
likeStringToBloomFilter(str, idx.tokenExtractorFunc, *out.bloom_filter);
|
||||||
return true;
|
return true;
|
||||||
}
|
}
|
||||||
}*/
|
}
|
||||||
};
|
};
|
||||||
|
|
||||||
BloomFilterCondition::BloomFilterCondition(
|
BloomFilterCondition::BloomFilterCondition(
|
||||||
@ -360,7 +381,6 @@ bool BloomFilterCondition::atomFromAST(
|
|||||||
return false;
|
return false;
|
||||||
|
|
||||||
out.key_column = key_column_num;
|
out.key_column = key_column_num;
|
||||||
|
|
||||||
return atom_it->second(out, const_value, index);
|
return atom_it->second(out, const_value, index);
|
||||||
}
|
}
|
||||||
else if (KeyCondition::getConstant(node, block_with_constants, const_value, const_type))
|
else if (KeyCondition::getConstant(node, block_with_constants, const_value, const_type))
|
||||||
@ -421,7 +441,18 @@ IndexConditionPtr MergeTreeBloomFilterIndex::createIndexCondition(
|
|||||||
};
|
};
|
||||||
|
|
||||||
|
|
||||||
struct NgramTokenExtractor
|
/// Base-class fallback for in-place token extraction.
/// Ignores all of its arguments and always returns false, i.e. it never reports a token.
bool TokenExtractor::next(const char *, size_t, size_t *, size_t *, size_t *)
|
||||||
|
{
|
||||||
|
return false;
|
||||||
|
}
|
||||||
|
|
||||||
|
/// Base-class fallback for LIKE-pattern token extraction.
/// Ignores all of its arguments and always returns false, i.e. it never reports a token.
bool TokenExtractor::nextLike(const String &, size_t *, String &)
|
||||||
|
{
|
||||||
|
return false;
|
||||||
|
}
|
||||||
|
|
||||||
|
|
||||||
|
struct NgramTokenExtractor : TokenExtractor
|
||||||
{
|
{
|
||||||
NgramTokenExtractor(size_t n_) : n(n_) {}
|
NgramTokenExtractor(size_t n_) : n(n_) {}
|
||||||
|
|
||||||
@ -430,7 +461,7 @@ struct NgramTokenExtractor
|
|||||||
return name;
|
return name;
|
||||||
}
|
}
|
||||||
|
|
||||||
bool operator() (const char * data, size_t len, size_t * pos, size_t * token_start, size_t * token_len)
|
bool next(const char * data, size_t len, size_t * pos, size_t * token_start, size_t * token_len) override
|
||||||
{
|
{
|
||||||
*token_start = *pos;
|
*token_start = *pos;
|
||||||
*token_len = 0;
|
*token_len = 0;
|
||||||
@ -446,6 +477,45 @@ struct NgramTokenExtractor
|
|||||||
return true;
|
return true;
|
||||||
}
|
}
|
||||||
|
|
||||||
|
bool nextLike(const String & str, size_t * pos, String & token) override
|
||||||
|
{
|
||||||
|
token.clear();
|
||||||
|
|
||||||
|
bool escaped = false;
|
||||||
|
for (size_t i = *pos; i < str.size(); ++i)
|
||||||
|
{
|
||||||
|
if (escaped && (str[*pos] == '%' || str[*pos] == '_' || str[*pos] == '\\'))
|
||||||
|
{
|
||||||
|
token += str[*pos];
|
||||||
|
escaped = false;
|
||||||
|
}
|
||||||
|
else if (!escaped && (str[*pos] == '%' || str[*pos] == '_'))
|
||||||
|
{
|
||||||
|
/// This token is too small, go to the next.
|
||||||
|
token.clear();
|
||||||
|
escaped = false;
|
||||||
|
*pos = i;
|
||||||
|
}
|
||||||
|
else if (!escaped && str[*pos] == '\\')
|
||||||
|
{
|
||||||
|
token += str[*pos];
|
||||||
|
escaped = true;
|
||||||
|
}
|
||||||
|
else
|
||||||
|
{
|
||||||
|
token += str[*pos];
|
||||||
|
escaped = false;
|
||||||
|
}
|
||||||
|
|
||||||
|
if (token.size() == n) {
|
||||||
|
++*pos;
|
||||||
|
return true;
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
return false;
|
||||||
|
}
|
||||||
|
|
||||||
size_t n;
|
size_t n;
|
||||||
};
|
};
|
||||||
|
|
||||||
@ -487,6 +557,7 @@ std::unique_ptr<IMergeTreeIndex> bloomFilterIndexCreator(
|
|||||||
throw Exception("Bloom filter index can be used only with `String` and `FixedString` column.", ErrorCodes::INCORRECT_QUERY);
|
throw Exception("Bloom filter index can be used only with `String` and `FixedString` column.", ErrorCodes::INCORRECT_QUERY);
|
||||||
}
|
}
|
||||||
|
|
||||||
|
boost::algorithm::to_lower(node->type->name);
|
||||||
if (node->type->name == NgramTokenExtractor::getName()) {
|
if (node->type->name == NgramTokenExtractor::getName()) {
|
||||||
if (!node->type->arguments || node->type->arguments->children.size() != 3)
|
if (!node->type->arguments || node->type->arguments->children.size() != 3)
|
||||||
throw Exception("`ngrambf` index must have exactly 3 arguments.", ErrorCodes::INCORRECT_QUERY);
|
throw Exception("`ngrambf` index must have exactly 3 arguments.", ErrorCodes::INCORRECT_QUERY);
|
||||||
|
@ -56,8 +56,8 @@ private:
|
|||||||
FUNCTION_NOT_EQUALS,
|
FUNCTION_NOT_EQUALS,
|
||||||
FUNCTION_LIKE,
|
FUNCTION_LIKE,
|
||||||
FUNCTION_NOT_LIKE,
|
FUNCTION_NOT_LIKE,
|
||||||
FUNCTION_IN,
|
/*FUNCTION_IN,
|
||||||
FUNCTION_NOT_IN,
|
FUNCTION_NOT_IN,*/
|
||||||
FUNCTION_UNKNOWN, /// Can take any value.
|
FUNCTION_UNKNOWN, /// Can take any value.
|
||||||
/// Operators of the logical expression.
|
/// Operators of the logical expression.
|
||||||
FUNCTION_NOT,
|
FUNCTION_NOT,
|
||||||
@ -93,8 +93,15 @@ private:
|
|||||||
RPN rpn;
|
RPN rpn;
|
||||||
};
|
};
|
||||||
|
|
||||||
using TokenExtractor = std::function<
|
struct TokenExtractor
|
||||||
bool(const char * data, size_t len, size_t * pos, size_t * token_start, size_t * token_len)>;
|
{
|
||||||
|
virtual ~TokenExtractor() = default;
|
||||||
|
/// Fast inplace implementation for regular use.
|
||||||
|
virtual bool next(const char * data, size_t len, size_t * pos, size_t * token_start, size_t * token_len);
|
||||||
|
/// Special implementation for creating bloom filter for LIKE function.
|
||||||
|
/// It skips unescaped `%` and `_` and supports escaping symbols, but it is less lightweight.
|
||||||
|
virtual bool nextLike(const String & str, size_t * pos, String & token);
|
||||||
|
};
|
||||||
|
|
||||||
class MergeTreeBloomFilterIndex : public IMergeTreeIndex
|
class MergeTreeBloomFilterIndex : public IMergeTreeIndex
|
||||||
{
|
{
|
||||||
|
Loading…
Reference in New Issue
Block a user