This commit is contained in:
Nikita Vasilev 2019-02-21 23:32:36 +03:00
parent f66cef68ff
commit 34e2e2623e
2 changed files with 93 additions and 15 deletions

View File

@ -12,6 +12,8 @@
#include <Poco/Logger.h> #include <Poco/Logger.h>
#include <boost/algorithm/string.hpp>
namespace DB namespace DB
{ {
@ -23,16 +25,27 @@ namespace ErrorCodes
} }
/// Adds all tokens from string to bloom filter.
static void stringToBloomFilter( static void stringToBloomFilter(
const char * data, size_t size, TokenExtractor tokenExtractor, StringBloomFilter & bloom_filter) const char * data, size_t size, TokenExtractor token_extractor, StringBloomFilter & bloom_filter)
{ {
size_t cur = 0; size_t cur = 0;
size_t token_start = 0; size_t token_start = 0;
size_t token_len = 0; size_t token_len = 0;
while (cur < size && tokenExtractor(data, size, &cur, &token_start, &token_len)) while (cur < size && token_extractor.next(data, size, &cur, &token_start, &token_len))
bloom_filter.add(data + token_start, token_len); bloom_filter.add(data + token_start, token_len);
} }
/// Adds all tokens from like pattern string to bloom filter. (Because like pattern can contain `\%` and `\_`.)
static void likeStringToBloomFilter(
const String & data, TokenExtractor token_extractor, StringBloomFilter & bloom_filter)
{
size_t cur = 0;
String token;
while (cur < data.size() && token_extractor.nextLike(data, &cur, token))
bloom_filter.add(token.c_str(), token.size());
}
MergeTreeBloomFilterIndexGranule::MergeTreeBloomFilterIndexGranule(const MergeTreeBloomFilterIndex & index) MergeTreeBloomFilterIndexGranule::MergeTreeBloomFilterIndexGranule(const MergeTreeBloomFilterIndex & index)
: IMergeTreeIndexGranule() : IMergeTreeIndexGranule()
@ -113,12 +126,16 @@ const BloomFilterCondition::AtomMap BloomFilterCondition::atom_map
return true; return true;
} }
}, },
/*{ {
"like", "like",
[] (RPNElement & out, const Field & value, const MergeTreeBloomFilterIndex & idx) [] (RPNElement & out, const Field & value, const MergeTreeBloomFilterIndex & idx)
{ {
out.function = RPNElement::FUNCTION_LIKE; out.function = RPNElement::FUNCTION_EQUALS;
out.bloom_filter = std::move(bf); out.bloom_filter = std::make_unique<StringBloomFilter>(
idx.bloom_filter_size, idx.bloom_filter_hashes, idx.seed);
String str = value.get<String>();
likeStringToBloomFilter(str, idx.tokenExtractorFunc, *out.bloom_filter);
return true; return true;
} }
}, },
@ -126,11 +143,15 @@ const BloomFilterCondition::AtomMap BloomFilterCondition::atom_map
"notLike", "notLike",
[] (RPNElement & out, const Field & value, const MergeTreeBloomFilterIndex & idx) [] (RPNElement & out, const Field & value, const MergeTreeBloomFilterIndex & idx)
{ {
out.function = RPNElement::FUNCTION_NOT_LIKE; out.function = RPNElement::FUNCTION_EQUALS;
out.bloom_filter = std::move(bf); out.bloom_filter = std::make_unique<StringBloomFilter>(
idx.bloom_filter_size, idx.bloom_filter_hashes, idx.seed);
String str = value.get<String>();
likeStringToBloomFilter(str, idx.tokenExtractorFunc, *out.bloom_filter);
return true; return true;
} }
}*/ }
}; };
BloomFilterCondition::BloomFilterCondition( BloomFilterCondition::BloomFilterCondition(
@ -360,7 +381,6 @@ bool BloomFilterCondition::atomFromAST(
return false; return false;
out.key_column = key_column_num; out.key_column = key_column_num;
return atom_it->second(out, const_value, index); return atom_it->second(out, const_value, index);
} }
else if (KeyCondition::getConstant(node, block_with_constants, const_value, const_type)) else if (KeyCondition::getConstant(node, block_with_constants, const_value, const_type))
@ -421,7 +441,18 @@ IndexConditionPtr MergeTreeBloomFilterIndex::createIndexCondition(
}; };
struct NgramTokenExtractor bool TokenExtractor::next(const char *, size_t, size_t *, size_t *, size_t *)
{
return false;
}
/// Default implementation of the LIKE-pattern tokenizer for the base class:
/// it never produces a token. All three parameters are unnamed and therefore
/// left untouched; the function unconditionally returns false.
bool TokenExtractor::nextLike(const String &, size_t *, String &)
{
    /// Nothing to extract at this level.
    return false;
}
struct NgramTokenExtractor : TokenExtractor
{ {
NgramTokenExtractor(size_t n_) : n(n_) {} NgramTokenExtractor(size_t n_) : n(n_) {}
@ -430,7 +461,7 @@ struct NgramTokenExtractor
return name; return name;
} }
bool operator() (const char * data, size_t len, size_t * pos, size_t * token_start, size_t * token_len) bool next(const char * data, size_t len, size_t * pos, size_t * token_start, size_t * token_len) override
{ {
*token_start = *pos; *token_start = *pos;
*token_len = 0; *token_len = 0;
@ -446,6 +477,45 @@ struct NgramTokenExtractor
return true; return true;
} }
bool nextLike(const String & str, size_t * pos, String & token) override
{
token.clear();
bool escaped = false;
for (size_t i = *pos; i < str.size(); ++i)
{
if (escaped && (str[*pos] == '%' || str[*pos] == '_' || str[*pos] == '\\'))
{
token += str[*pos];
escaped = false;
}
else if (!escaped && (str[*pos] == '%' || str[*pos] == '_'))
{
/// This token is too small, go to the next.
token.clear();
escaped = false;
*pos = i;
}
else if (!escaped && str[*pos] == '\\')
{
token += str[*pos];
escaped = true;
}
else
{
token += str[*pos];
escaped = false;
}
if (token.size() == n) {
++*pos;
return true;
}
}
return false;
}
size_t n; size_t n;
}; };
@ -487,6 +557,7 @@ std::unique_ptr<IMergeTreeIndex> bloomFilterIndexCreator(
throw Exception("Bloom filter index can be used only with `String` and `FixedString` column.", ErrorCodes::INCORRECT_QUERY); throw Exception("Bloom filter index can be used only with `String` and `FixedString` column.", ErrorCodes::INCORRECT_QUERY);
} }
boost::algorithm::to_lower(node->type->name);
if (node->type->name == NgramTokenExtractor::getName()) { if (node->type->name == NgramTokenExtractor::getName()) {
if (!node->type->arguments || node->type->arguments->children.size() != 3) if (!node->type->arguments || node->type->arguments->children.size() != 3)
throw Exception("`ngrambf` index must have exactly 3 arguments.", ErrorCodes::INCORRECT_QUERY); throw Exception("`ngrambf` index must have exactly 3 arguments.", ErrorCodes::INCORRECT_QUERY);

View File

@ -56,8 +56,8 @@ private:
FUNCTION_NOT_EQUALS, FUNCTION_NOT_EQUALS,
FUNCTION_LIKE, FUNCTION_LIKE,
FUNCTION_NOT_LIKE, FUNCTION_NOT_LIKE,
FUNCTION_IN, /*FUNCTION_IN,
FUNCTION_NOT_IN, FUNCTION_NOT_IN,*/
FUNCTION_UNKNOWN, /// Can take any value. FUNCTION_UNKNOWN, /// Can take any value.
/// Operators of the logical expression. /// Operators of the logical expression.
FUNCTION_NOT, FUNCTION_NOT,
@ -93,8 +93,15 @@ private:
RPN rpn; RPN rpn;
}; };
using TokenExtractor = std::function< struct TokenExtractor
bool(const char * data, size_t len, size_t * pos, size_t * token_start, size_t * token_len)>; {
virtual ~TokenExtractor() = default;
/// Fast inplace implementation for regular use.
virtual bool next(const char * data, size_t len, size_t * pos, size_t * token_start, size_t * token_len);
/// Special implementation for creating bloom filter for LIKE function.
/// It skips unescaped `%` and `_` and supports escaping symbols, but it is less lightweight.
virtual bool nextLike(const String & str, size_t * pos, String & token);
};
class MergeTreeBloomFilterIndex : public IMergeTreeIndex class MergeTreeBloomFilterIndex : public IMergeTreeIndex
{ {