mirror of
https://github.com/ClickHouse/ClickHouse.git
synced 2024-11-21 23:21:59 +00:00
Merge pull request #27984 from BraulioVM/optimize/has-all-bloom-filter
Make `hasAll` filter condition leverage bloom filters
This commit is contained in:
commit
0db8b524f0
@ -51,7 +51,18 @@ ColumnWithTypeAndName getPreparedSetInfo(const SetPtr & prepared_set)
|
||||
return {ColumnTuple::create(set_elements), std::make_shared<DataTypeTuple>(prepared_set->getElementsTypes()), "dummy"};
|
||||
}
|
||||
|
||||
bool maybeTrueOnBloomFilter(const IColumn * hash_column, const BloomFilterPtr & bloom_filter, size_t hash_functions)
|
||||
bool hashMatchesFilter(const BloomFilterPtr& bloom_filter, UInt64 hash, size_t hash_functions)
|
||||
{
|
||||
return std::all_of(BloomFilterHash::bf_hash_seed,
|
||||
BloomFilterHash::bf_hash_seed + hash_functions,
|
||||
[&](const auto &hash_seed)
|
||||
{
|
||||
return bloom_filter->findHashWithSeed(hash,
|
||||
hash_seed);
|
||||
});
|
||||
}
|
||||
|
||||
bool maybeTrueOnBloomFilter(const IColumn * hash_column, const BloomFilterPtr & bloom_filter, size_t hash_functions, bool match_all)
|
||||
{
|
||||
const auto * const_column = typeid_cast<const ColumnConst *>(hash_column);
|
||||
const auto * non_const_column = typeid_cast<const ColumnUInt64 *>(hash_column);
|
||||
@ -61,26 +72,34 @@ bool maybeTrueOnBloomFilter(const IColumn * hash_column, const BloomFilterPtr &
|
||||
|
||||
if (const_column)
|
||||
{
|
||||
for (size_t index = 0; index < hash_functions; ++index)
|
||||
if (!bloom_filter->findHashWithSeed(const_column->getValue<UInt64>(), BloomFilterHash::bf_hash_seed[index]))
|
||||
return false;
|
||||
return true;
|
||||
return hashMatchesFilter(bloom_filter,
|
||||
const_column->getValue<UInt64>(),
|
||||
hash_functions);
|
||||
}
|
||||
|
||||
const ColumnUInt64::Container & hashes = non_const_column->getData();
|
||||
|
||||
if (match_all)
|
||||
{
|
||||
return std::all_of(hashes.begin(),
|
||||
hashes.end(),
|
||||
[&](const auto& hash_row)
|
||||
{
|
||||
return hashMatchesFilter(bloom_filter,
|
||||
hash_row,
|
||||
hash_functions);
|
||||
});
|
||||
}
|
||||
else
|
||||
{
|
||||
bool missing_rows = true;
|
||||
const ColumnUInt64::Container & data = non_const_column->getData();
|
||||
|
||||
for (size_t index = 0, size = data.size(); missing_rows && index < size; ++index)
|
||||
{
|
||||
bool match_row = true;
|
||||
for (size_t hash_index = 0; match_row && hash_index < hash_functions; ++hash_index)
|
||||
match_row = bloom_filter->findHashWithSeed(data[index], BloomFilterHash::bf_hash_seed[hash_index]);
|
||||
|
||||
missing_rows = !match_row;
|
||||
}
|
||||
|
||||
return !missing_rows;
|
||||
return std::any_of(hashes.begin(),
|
||||
hashes.end(),
|
||||
[&](const auto& hash_row)
|
||||
{
|
||||
return hashMatchesFilter(bloom_filter,
|
||||
hash_row,
|
||||
hash_functions);
|
||||
});
|
||||
}
|
||||
}
|
||||
|
||||
@ -109,6 +128,7 @@ bool MergeTreeIndexConditionBloomFilter::alwaysUnknownOrTrue() const
|
||||
|| element.function == RPNElement::FUNCTION_NOT_EQUALS
|
||||
|| element.function == RPNElement::FUNCTION_HAS
|
||||
|| element.function == RPNElement::FUNCTION_HAS_ANY
|
||||
|| element.function == RPNElement::FUNCTION_HAS_ALL
|
||||
|| element.function == RPNElement::FUNCTION_IN
|
||||
|| element.function == RPNElement::FUNCTION_NOT_IN
|
||||
|| element.function == RPNElement::ALWAYS_FALSE)
|
||||
@ -156,16 +176,23 @@ bool MergeTreeIndexConditionBloomFilter::mayBeTrueOnGranule(const MergeTreeIndex
|
||||
|| element.function == RPNElement::FUNCTION_EQUALS
|
||||
|| element.function == RPNElement::FUNCTION_NOT_EQUALS
|
||||
|| element.function == RPNElement::FUNCTION_HAS
|
||||
|| element.function == RPNElement::FUNCTION_HAS_ANY)
|
||||
|| element.function == RPNElement::FUNCTION_HAS_ANY
|
||||
|| element.function == RPNElement::FUNCTION_HAS_ALL)
|
||||
{
|
||||
bool match_rows = true;
|
||||
bool match_all = element.function == RPNElement::FUNCTION_HAS_ALL;
|
||||
const auto & predicate = element.predicate;
|
||||
for (size_t index = 0; match_rows && index < predicate.size(); ++index)
|
||||
{
|
||||
const auto & query_index_hash = predicate[index];
|
||||
const auto & filter = filters[query_index_hash.first];
|
||||
const ColumnPtr & hash_column = query_index_hash.second;
|
||||
match_rows = maybeTrueOnBloomFilter(&*hash_column, filter, hash_functions);
|
||||
|
||||
|
||||
match_rows = maybeTrueOnBloomFilter(&*hash_column,
|
||||
filter,
|
||||
hash_functions,
|
||||
match_all);
|
||||
}
|
||||
|
||||
rpn_stack.emplace_back(match_rows, true);
|
||||
@ -255,7 +282,12 @@ bool MergeTreeIndexConditionBloomFilter::traverseFunction(const ASTPtr & node, B
|
||||
maybe_useful = true;
|
||||
}
|
||||
}
|
||||
else if (function->name == "equals" || function->name == "notEquals" || function->name == "has" || function->name == "indexOf" || function->name == "hasAny")
|
||||
else if (function->name == "equals" ||
|
||||
function->name == "notEquals" ||
|
||||
function->name == "has" ||
|
||||
function->name == "indexOf" ||
|
||||
function->name == "hasAny" ||
|
||||
function->name == "hasAll")
|
||||
{
|
||||
Field const_value;
|
||||
DataTypePtr const_type;
|
||||
@ -412,7 +444,7 @@ bool MergeTreeIndexConditionBloomFilter::traverseASTEquals(
|
||||
out.predicate.emplace_back(std::make_pair(position, BloomFilterHash::hashWithField(actual_type.get(), converted_field)));
|
||||
}
|
||||
}
|
||||
else if (function_name == "hasAny")
|
||||
else if (function_name == "hasAny" || function_name == "hasAll")
|
||||
{
|
||||
if (!array_type)
|
||||
throw Exception(ErrorCodes::ILLEGAL_TYPE_OF_ARGUMENT, "First argument for function {} must be an array.", function_name);
|
||||
@ -437,7 +469,9 @@ bool MergeTreeIndexConditionBloomFilter::traverseASTEquals(
|
||||
column = std::move(mutable_column);
|
||||
}
|
||||
|
||||
out.function = RPNElement::FUNCTION_HAS_ANY;
|
||||
out.function = function_name == "hasAny" ?
|
||||
RPNElement::FUNCTION_HAS_ANY :
|
||||
RPNElement::FUNCTION_HAS_ALL;
|
||||
out.predicate.emplace_back(std::make_pair(position, BloomFilterHash::hashWithColumn(actual_type, column, 0, column->size())));
|
||||
}
|
||||
else
|
||||
|
@ -25,6 +25,7 @@ public:
|
||||
FUNCTION_NOT_EQUALS,
|
||||
FUNCTION_HAS,
|
||||
FUNCTION_HAS_ANY,
|
||||
FUNCTION_HAS_ALL,
|
||||
FUNCTION_IN,
|
||||
FUNCTION_NOT_IN,
|
||||
FUNCTION_UNKNOWN, /// Can take any value.
|
||||
|
41
tests/queries/0_stateless/02013_bloom_filter_hasAll.sql
Normal file
41
tests/queries/0_stateless/02013_bloom_filter_hasAll.sql
Normal file
@ -0,0 +1,41 @@
|
||||
-- Exercises `hasAll` against an array column covered by a bloom_filter index (x)
-- and against an array column without an index (y, which defaults to x).
DROP TABLE IF EXISTS bftest;
CREATE TABLE bftest (
    k Int64,
    y Array(Int64) DEFAULT x,
    x Array(Int64),
    index ix1(x) TYPE bloom_filter GRANULARITY 3
)
Engine=MergeTree
ORDER BY k;

INSERT INTO bftest (k, x) SELECT number, arrayMap(i->rand64()%565656, range(10)) FROM numbers(1000);

-- index is not used, but query should still work
SELECT count() FROM bftest WHERE hasAll(x, materialize([1,2,3])) FORMAT Null;

-- verify the expression in WHERE works on non-index col the same way as on index cols
SELECT count() FROM bftest WHERE hasAll(y, [NULL,-42]) FORMAT Null;
SELECT count() FROM bftest WHERE hasAll(y, [0,NULL]) FORMAT Null;
SELECT count() FROM bftest WHERE hasAll(y, [[123], -42]) FORMAT Null; -- { serverError 386 }
SELECT count() FROM bftest WHERE hasAll(y, [toDecimal32(123, 3), 2]) FORMAT Null; -- different, doesn't fail

-- From here on, queries that cannot use ix1 are expected to fail (see the serverError 277 cases below).
SET force_data_skipping_indices='ix1';
SELECT count() FROM bftest WHERE has(x, 42) AND has(x, -42) FORMAT Null;
SELECT count() FROM bftest WHERE hasAll(x, [42,-42]) FORMAT Null;
SELECT count() FROM bftest WHERE hasAll(x, []) FORMAT Null;
SELECT count() FROM bftest WHERE hasAll(x, [1]) FORMAT Null;

-- can't use bloom_filter with `hasAll` on non-constant arguments (just like `has`)
SELECT count() FROM bftest WHERE hasAll(x, materialize([1,2,3])) FORMAT Null; -- { serverError 277 }

-- NULLs are not Ok
SELECT count() FROM bftest WHERE hasAll(x, [NULL,-42]) FORMAT Null; -- { serverError 277 }
SELECT count() FROM bftest WHERE hasAll(x, [0,NULL]) FORMAT Null; -- { serverError 277 }

-- non-compatible types
SELECT count() FROM bftest WHERE hasAll(x, [[123], -42]) FORMAT Null; -- { serverError 386 }
SELECT count() FROM bftest WHERE hasAll(x, [toDecimal32(123, 3), 2]) FORMAT Null; -- { serverError 277 }

-- Bug discovered by AST fuzzer (fixed, shouldn't crash).
SELECT 1 FROM bftest WHERE has(x, -0.) OR 0. FORMAT Null;
SELECT count() FROM bftest WHERE hasAll(x, [0, 1]) OR 0. FORMAT Null;

-- Clean up so the table does not outlive the test.
DROP TABLE bftest;
|
Loading…
Reference in New Issue
Block a user