multiSearchAny fixed, tests added, minor syle changes

This commit is contained in:
dimarub2000 2019-07-12 14:35:17 +03:00
parent e5d4f11c73
commit 4005987fc4
5 changed files with 176 additions and 28 deletions

View File

@ -194,17 +194,20 @@ const MergeTreeConditionFullText::AtomMap MergeTreeConditionFullText::atom_map
"multiSearchAny",
[] (RPNElement & out, const Field & value, const MergeTreeIndexFullText & idx)
{
out.function = RPNElement::FUNCTION_EQUALS;
out.bloom_filter = std::make_unique<BloomFilter>(
idx.bloom_filter_size, idx.bloom_filter_hashes, idx.seed);
out.function = RPNElement::FUNCTION_MULTI_SEARCH;
std::vector<std::vector<BloomFilter>> bloom_filters;
bloom_filters.emplace_back();
for (const auto & element : value.get<Array>())
{
if (element.getType() != Field::Types::String)
return false;
bloom_filters.back().emplace_back(idx.bloom_filter_size, idx.bloom_filter_hashes, idx.seed);
const auto & str = element.get<String>();
stringToBloomFilter(str.c_str(), str.size(), idx.token_extractor_func, *out.bloom_filter);
stringToBloomFilter(str.c_str(), str.size(), idx.token_extractor_func, bloom_filters.back().back());
}
out.set_bloom_filters = std::move(bloom_filters);
return true;
}
},
@ -256,6 +259,7 @@ bool MergeTreeConditionFullText::alwaysUnknownOrTrue() const
|| element.function == RPNElement::FUNCTION_NOT_EQUALS
|| element.function == RPNElement::FUNCTION_IN
|| element.function == RPNElement::FUNCTION_NOT_IN
|| element.function == RPNElement::FUNCTION_MULTI_SEARCH
|| element.function == RPNElement::ALWAYS_FALSE)
{
rpn_stack.push_back(false);
@ -311,7 +315,7 @@ bool MergeTreeConditionFullText::mayBeTrueOnGranule(MergeTreeIndexGranulePtr idx
rpn_stack.back() = !rpn_stack.back();
}
else if (element.function == RPNElement::FUNCTION_IN
|| element.function == RPNElement::FUNCTION_NOT_IN)
|| element.function == RPNElement::FUNCTION_NOT_IN)
{
std::vector<bool> result(element.set_bloom_filters.back().size(), true);
@ -329,6 +333,18 @@ bool MergeTreeConditionFullText::mayBeTrueOnGranule(MergeTreeIndexGranulePtr idx
if (element.function == RPNElement::FUNCTION_NOT_IN)
rpn_stack.back() = !rpn_stack.back();
}
else if (element.function == RPNElement::FUNCTION_MULTI_SEARCH)
{
std::vector<bool> result(element.set_bloom_filters.back().size(), true);
const auto & bloom_filters = element.set_bloom_filters[0];
for (size_t row = 0; row < bloom_filters.size(); ++row)
result[row] = result[row] && granule->bloom_filters[element.key_column].contains(bloom_filters[row]);
rpn_stack.emplace_back(
std::find(std::cbegin(result), std::cend(result), true) != std::end(result), true);
}
else if (element.function == RPNElement::FUNCTION_NOT)
{
rpn_stack.back() = !rpn_stack.back();
@ -376,7 +392,7 @@ bool MergeTreeConditionFullText::getKey(const ASTPtr & node, size_t & key_column
}
bool MergeTreeConditionFullText::atomFromAST(
const ASTPtr & node, Block & block_with_constants, RPNElement & out)
const ASTPtr & node, Block & block_with_constants, RPNElement & out)
{
Field const_value;
DataTypePtr const_type;
@ -389,8 +405,9 @@ bool MergeTreeConditionFullText::atomFromAST(
size_t key_arg_pos; /// Position of argument with key column (non-const argument)
size_t key_column_num = -1; /// Number of a key column (inside key_column_names array)
std::string func_name = func->name;
if (functionIsInOrGlobalInOperator(func->name) && tryPrepareSetBloomFilter(args, out))
if (functionIsInOrGlobalInOperator(func_name) && tryPrepareSetBloomFilter(args, out))
{
key_arg_pos = 0;
}
@ -410,12 +427,12 @@ bool MergeTreeConditionFullText::atomFromAST(
&& const_type->getTypeId() != TypeIndex::Array)
return false;
if (key_arg_pos == 1 && (func->name != "equals" || func->name != "notEquals"))
if (key_arg_pos == 1 && (func_name != "equals" || func_name != "notEquals"))
return false;
else if (!index.token_extractor_func->supportLike() && (func->name == "like" || func->name == "notLike"))
else if (!index.token_extractor_func->supportLike() && (func_name == "like" || func_name == "notLike"))
return false;
const auto atom_it = atom_map.find(func->name);
const auto atom_it = atom_map.find(func_name);
if (atom_it == std::end(atom_map))
return false;
@ -521,7 +538,6 @@ bool MergeTreeConditionFullText::tryPrepareSetBloomFilter(
return true;
}
MergeTreeIndexGranulePtr MergeTreeIndexFullText::createIndexGranule() const
{
return std::make_shared<MergeTreeIndexGranuleFullText>(*this);

View File

@ -80,6 +80,7 @@ private:
FUNCTION_NOT_EQUALS,
FUNCTION_IN,
FUNCTION_NOT_IN,
FUNCTION_MULTI_SEARCH,
FUNCTION_UNKNOWN, /// Can take any value.
/// Operators of the logical expression.
FUNCTION_NOT,
@ -91,15 +92,20 @@ private:
};
RPNElement(
Function function_ = FUNCTION_UNKNOWN, size_t key_column_ = 0, std::unique_ptr<BloomFilter> && const_bloom_filter_ = nullptr)
: function(function_), key_column(key_column_), bloom_filter(std::move(const_bloom_filter_)) {}
Function function_ = FUNCTION_UNKNOWN, size_t key_column_ = 0, std::unique_ptr<BloomFilter> && const_bloom_filter_ = nullptr)
: function(function_), key_column(key_column_), bloom_filter(std::move(const_bloom_filter_)) {}
Function function = FUNCTION_UNKNOWN;
/// For FUNCTION_EQUALS, FUNCTION_NOT_EQUALS
/// For FUNCTION_EQUALS, FUNCTION_NOT_EQUALS and FUNCTION_MULTI_SEARCH
size_t key_column;
/// For FUNCTION_EQUALS, FUNCTION_NOT_EQUALS
std::unique_ptr<BloomFilter> bloom_filter;
/// For FUNCTION_IN and FUNCTION_NOT_IN
/// For FUNCTION_IN, FUNCTION_NOT_IN and FUNCTION_MULTI_SEARCH
std::vector<std::vector<BloomFilter>> set_bloom_filters;
/// For FUNCTION_IN and FUNCTION_NOT_IN
std::vector<size_t> set_key_position;
};

View File

@ -0,0 +1,53 @@
9 abra
14 abracadabra
"rows_read": 6,
8 computer science
"rows_read": 2,
9 abra
10 cadabra
11 crabacadabra
14 abracadabra
15 cadabraabra
"rows_read": 6,
6 some string
7 another string
"rows_read": 2,
9 abra
14 abracadabra
"rows_read": 6,
8 computer science
"rows_read": 2,
1 ClickHouse is a column-oriented database management system (DBMS)
2 column-oriented database management system
13 basement
"rows_read": 6,
6 some string
7 another string
"rows_read": 2,
6 some string
7 another string
8 computer science
"rows_read": 4,
1 ClickHouse is a column-oriented database management system (DBMS)
2 column-oriented database management system
13 basement
"rows_read": 6,
9 abra
10 cadabra
11 crabacadabra
14 abracadabra
15 cadabraabra
"rows_read": 6,
4 какая-то строка
5 еще строка
6 some string
7 another string
"rows_read": 4,
14 abracadabra
"rows_read": 4,
1 ClickHouse is a column-oriented database management system (DBMS)
2 column-oriented database management system
10 cadabra
11 crabacadabra
15 cadabraabra
"rows_read": 8,

View File

@ -0,0 +1,86 @@
#!/usr/bin/env bash
CURDIR=$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd)
. $CURDIR/../shell_config.sh
$CLICKHOUSE_CLIENT --query="DROP TABLE IF EXISTS bloom_filter_idx;"
# NGRAM BF
$CLICKHOUSE_CLIENT -n --query="
SET allow_experimental_data_skipping_indices = 1;
CREATE TABLE bloom_filter_idx
(
k UInt64,
s String,
INDEX bf (s, lower(s)) TYPE ngrambf_v1(3, 512, 2, 0) GRANULARITY 1
) ENGINE = MergeTree()
ORDER BY k
SETTINGS index_granularity = 2;"
$CLICKHOUSE_CLIENT --query="INSERT INTO bloom_filter_idx VALUES
(0, 'ClickHouse - столбцовая система управления базами данных (СУБД)'),
(1, 'ClickHouse is a column-oriented database management system (DBMS)'),
(2, 'column-oriented database management system'),
(3, 'columns'),
(4, 'какая-то строка'),
(5, 'еще строка'),
(6, 'some string'),
(7, 'another string'),
(8, 'computer science'),
(9, 'abra'),
(10, 'cadabra'),
(11, 'crabacadabra'),
(12, 'crab'),
(13, 'basement'),
(14, 'abracadabra'),
(15, 'cadabraabra')"
# STARTS_WITH
$CLICKHOUSE_CLIENT --query="SELECT * FROM bloom_filter_idx WHERE startsWith(s, 'abra') ORDER BY k"
$CLICKHOUSE_CLIENT --query="SELECT * FROM bloom_filter_idx WHERE startsWith(s, 'abra') ORDER BY k FORMAT JSON" | grep "rows_read"
$CLICKHOUSE_CLIENT --query="SELECT * FROM bloom_filter_idx WHERE startsWith(s, 'computer') ORDER BY k"
$CLICKHOUSE_CLIENT --query="SELECT * FROM bloom_filter_idx WHERE startsWith(s, 'computer') ORDER BY k FORMAT JSON" | grep "rows_read"
# ENDS_WITH
$CLICKHOUSE_CLIENT --query="SELECT * FROM bloom_filter_idx WHERE endsWith(s, 'abra') ORDER BY k"
$CLICKHOUSE_CLIENT --query="SELECT * FROM bloom_filter_idx WHERE endsWith(s, 'abra') ORDER BY k FORMAT JSON" | grep "rows_read"
$CLICKHOUSE_CLIENT --query="SELECT * FROM bloom_filter_idx WHERE endsWith(s, 'ring') ORDER BY k"
$CLICKHOUSE_CLIENT --query="SELECT * FROM bloom_filter_idx WHERE endsWith(s, 'ring') ORDER BY k FORMAT JSON" | grep "rows_read"
# COMBINED
$CLICKHOUSE_CLIENT --query="SELECT * FROM bloom_filter_idx WHERE startsWith(s, 'abra') AND endsWith(s, 'abra')"
$CLICKHOUSE_CLIENT --query="SELECT * FROM bloom_filter_idx WHERE startsWith(s, 'abra') AND endsWith(s, 'abra') FORMAT JSON" | grep "rows_read"
$CLICKHOUSE_CLIENT --query="SELECT * FROM bloom_filter_idx WHERE startsWith(s, 'c') AND endsWith(s, 'science')"
$CLICKHOUSE_CLIENT --query="SELECT * FROM bloom_filter_idx WHERE startsWith(s, 'c') AND endsWith(s, 'science') FORMAT JSON" | grep "rows_read"
# MULTY_SEARCH_ANY
$CLICKHOUSE_CLIENT --query="SELECT * FROM bloom_filter_idx WHERE multiSearchAny(s, ['data', 'base'])"
$CLICKHOUSE_CLIENT --query="SELECT * FROM bloom_filter_idx WHERE multiSearchAny(s, ['data', 'base']) FORMAT JSON" | grep "rows_read"
$CLICKHOUSE_CLIENT --query="SELECT * FROM bloom_filter_idx WHERE multiSearchAny(s, ['string'])"
$CLICKHOUSE_CLIENT --query="SELECT * FROM bloom_filter_idx WHERE multiSearchAny(s, ['string']) FORMAT JSON" | grep "rows_read"
$CLICKHOUSE_CLIENT --query="SELECT * FROM bloom_filter_idx WHERE multiSearchAny(s, ['string', 'computer'])"
$CLICKHOUSE_CLIENT --query="SELECT * FROM bloom_filter_idx WHERE multiSearchAny(s, ['string', 'computer']) FORMAT JSON" | grep "rows_read"
$CLICKHOUSE_CLIENT --query="SELECT * FROM bloom_filter_idx WHERE multiSearchAny(s, ['base', 'seme', 'gement'])"
$CLICKHOUSE_CLIENT --query="SELECT * FROM bloom_filter_idx WHERE multiSearchAny(s, ['base', 'seme', 'gement']) FORMAT JSON" | grep "rows_read"
$CLICKHOUSE_CLIENT --query="SELECT * FROM bloom_filter_idx WHERE multiSearchAny(s, ['abra', 'cadabra', 'cab', 'extra'])"
$CLICKHOUSE_CLIENT --query="SELECT * FROM bloom_filter_idx WHERE multiSearchAny(s, ['abra', 'cadabra', 'cab', 'extra']) FORMAT JSON" | grep "rows_read"
$CLICKHOUSE_CLIENT --query="SELECT * FROM bloom_filter_idx WHERE multiSearchAny(s, ['строка', 'string'])"
$CLICKHOUSE_CLIENT --query="SELECT * FROM bloom_filter_idx WHERE multiSearchAny(s, ['строка', 'string']) FORMAT JSON" | grep "rows_read"
# MULTY_SEARCH_ANY + OTHER
$CLICKHOUSE_CLIENT --query="SELECT * FROM bloom_filter_idx WHERE multiSearchAny(s, ['adab', 'cad', 'aba']) AND startsWith(s, 'abra')"
$CLICKHOUSE_CLIENT --query="SELECT * FROM bloom_filter_idx WHERE multiSearchAny(s, ['adab', 'cad', 'aba']) AND startsWith(s, 'abra') FORMAT JSON" | grep "rows_read"
$CLICKHOUSE_CLIENT --query="SELECT * FROM bloom_filter_idx WHERE multiSearchAny(s, ['adab', 'cad', 'aba']) AND (startsWith(s, 'c') OR startsWith(s, 'C'))"
$CLICKHOUSE_CLIENT --query="SELECT * FROM bloom_filter_idx WHERE multiSearchAny(s, ['adab', 'cad', 'aba']) AND (startsWith(s, 'c') OR startsWith(s, 'C')) FORMAT JSON" | grep "rows_read"
$CLICKHOUSE_CLIENT --query="DROP TABLE bloom_filter_idx;"

View File

@ -51,16 +51,3 @@ $CLICKHOUSE_CLIENT --query="SELECT * FROM set_idx WHERE multiSearchAny(s, ['data
$CLICKHOUSE_CLIENT --query="SELECT * FROM set_idx WHERE multiSearchAny(s, ['data', 'base']) FORMAT JSON" | grep "rows_read"
$CLICKHOUSE_CLIENT --query="DROP TABLE set_idx;"