Merge pull request #67423 from ClickHouse/bff

Fix bloom filter index breaking some queries
This commit is contained in:
Alexey Milovidov 2024-08-06 14:45:44 +00:00 committed by GitHub
commit 2df8d6acde
No known key found for this signature in database
GPG Key ID: B5690EEEBB952194
5 changed files with 83 additions and 52 deletions

View File

@ -371,67 +371,78 @@ bool MergeTreeIndexConditionBloomFilter::extractAtomFromTree(const RPNBuilderTre
bool MergeTreeIndexConditionBloomFilter::traverseFunction(const RPNBuilderTreeNode & node, RPNElement & out, const RPNBuilderTreeNode * parent) bool MergeTreeIndexConditionBloomFilter::traverseFunction(const RPNBuilderTreeNode & node, RPNElement & out, const RPNBuilderTreeNode * parent)
{ {
bool maybe_useful = false; if (!node.isFunction())
return false;
if (node.isFunction()) const auto function = node.toFunctionNode();
auto arguments_size = function.getArgumentsSize();
auto function_name = function.getFunctionName();
if (parent == nullptr)
{ {
const auto function = node.toFunctionNode(); /// Recurse a little bit for indexOf().
auto arguments_size = function.getArgumentsSize();
auto function_name = function.getFunctionName();
for (size_t i = 0; i < arguments_size; ++i) for (size_t i = 0; i < arguments_size; ++i)
{ {
auto argument = function.getArgumentAt(i); auto argument = function.getArgumentAt(i);
if (traverseFunction(argument, out, &node)) if (traverseFunction(argument, out, &node))
maybe_useful = true; return true;
}
if (arguments_size != 2)
return false;
auto lhs_argument = function.getArgumentAt(0);
auto rhs_argument = function.getArgumentAt(1);
if (functionIsInOrGlobalInOperator(function_name))
{
if (auto future_set = rhs_argument.tryGetPreparedSet(); future_set)
{
if (auto prepared_set = future_set->buildOrderedSetInplace(rhs_argument.getTreeContext().getQueryContext()); prepared_set)
{
if (prepared_set->hasExplicitSetElements())
{
const auto prepared_info = getPreparedSetInfo(prepared_set);
if (traverseTreeIn(function_name, lhs_argument, prepared_set, prepared_info.type, prepared_info.column, out))
maybe_useful = true;
}
}
}
}
else if (function_name == "equals" ||
function_name == "notEquals" ||
function_name == "has" ||
function_name == "mapContains" ||
function_name == "indexOf" ||
function_name == "hasAny" ||
function_name == "hasAll")
{
Field const_value;
DataTypePtr const_type;
if (rhs_argument.tryGetConstant(const_value, const_type))
{
if (traverseTreeEquals(function_name, lhs_argument, const_type, const_value, out, parent))
maybe_useful = true;
}
else if (lhs_argument.tryGetConstant(const_value, const_type))
{
if (traverseTreeEquals(function_name, rhs_argument, const_type, const_value, out, parent))
maybe_useful = true;
}
} }
} }
return maybe_useful; if (arguments_size != 2)
return false;
/// indexOf() should be inside comparison function, e.g. greater(indexOf(key, 42), 0).
/// Other conditions should be at top level, e.g. equals(key, 42), not equals(equals(key, 42), 1).
if ((function_name == "indexOf") != (parent != nullptr))
return false;
auto lhs_argument = function.getArgumentAt(0);
auto rhs_argument = function.getArgumentAt(1);
if (functionIsInOrGlobalInOperator(function_name))
{
if (auto future_set = rhs_argument.tryGetPreparedSet(); future_set)
{
if (auto prepared_set = future_set->buildOrderedSetInplace(rhs_argument.getTreeContext().getQueryContext()); prepared_set)
{
if (prepared_set->hasExplicitSetElements())
{
const auto prepared_info = getPreparedSetInfo(prepared_set);
if (traverseTreeIn(function_name, lhs_argument, prepared_set, prepared_info.type, prepared_info.column, out))
return true;
}
}
}
return false;
}
if (function_name == "equals" ||
function_name == "notEquals" ||
function_name == "has" ||
function_name == "mapContains" ||
function_name == "indexOf" ||
function_name == "hasAny" ||
function_name == "hasAll")
{
Field const_value;
DataTypePtr const_type;
if (rhs_argument.tryGetConstant(const_value, const_type))
{
if (traverseTreeEquals(function_name, lhs_argument, const_type, const_value, out, parent))
return true;
}
else if (lhs_argument.tryGetConstant(const_value, const_type) && (function_name == "equals" || function_name == "notEquals"))
{
if (traverseTreeEquals(function_name, rhs_argument, const_type, const_value, out, parent))
return true;
}
return false;
}
return false;
} }
bool MergeTreeIndexConditionBloomFilter::traverseTreeIn( bool MergeTreeIndexConditionBloomFilter::traverseTreeIn(

View File

@ -28,6 +28,8 @@
"rows_read": 3, "rows_read": 3,
8 aбвгдеёж 8 aбвгдеёж
"rows_read": 2, "rows_read": 2,
13
1
1 column-oriented 1 column-oriented
2 column-oriented 2 column-oriented
"rows_read": 4, "rows_read": 4,

View File

@ -103,6 +103,10 @@ $CLICKHOUSE_CLIENT --optimize_or_like_chain 0 --query="SELECT * FROM bloom_filte
$CLICKHOUSE_CLIENT --optimize_or_like_chain 0 --query="SELECT * FROM bloom_filter_idx WHERE (s, lower(s)) IN (('aбвгдеёж', 'aбвгдеёж'), ('abc', 'cba')) ORDER BY k" $CLICKHOUSE_CLIENT --optimize_or_like_chain 0 --query="SELECT * FROM bloom_filter_idx WHERE (s, lower(s)) IN (('aбвгдеёж', 'aбвгдеёж'), ('abc', 'cba')) ORDER BY k"
$CLICKHOUSE_CLIENT --optimize_or_like_chain 0 --query="SELECT * FROM bloom_filter_idx WHERE (s, lower(s)) IN (('aбвгдеёж', 'aбвгдеёж'), ('abc', 'cba')) ORDER BY k FORMAT JSON" | grep "rows_read" $CLICKHOUSE_CLIENT --optimize_or_like_chain 0 --query="SELECT * FROM bloom_filter_idx WHERE (s, lower(s)) IN (('aбвгдеёж', 'aбвгдеёж'), ('abc', 'cba')) ORDER BY k FORMAT JSON" | grep "rows_read"
# Weird conditions not supported by the index.
$CLICKHOUSE_CLIENT --optimize_or_like_chain 0 --query="SELECT count() FROM bloom_filter_idx WHERE (s = 'asd') = (s = 'asd')"
$CLICKHOUSE_CLIENT --optimize_or_like_chain 0 --query="SELECT count() FROM bloom_filter_idx WHERE has(['asd', 'some string'], s)"
# TOKEN BF # TOKEN BF
$CLICKHOUSE_CLIENT -n --query=" $CLICKHOUSE_CLIENT -n --query="

View File

@ -14,6 +14,11 @@
0 0
2 2
2 2
18
100
100
3
100
1 1
1 1
1 1

View File

@ -25,6 +25,15 @@ WITH ((1, 2), (2, 3)) AS liter_prepared_set SELECT COUNT() FROM single_column_bl
WITH ((1, 1), (2, 2)) AS liter_prepared_set SELECT COUNT() FROM single_column_bloom_filter WHERE (i32, i64) IN liter_prepared_set SETTINGS max_rows_to_read = 6; WITH ((1, 1), (2, 2)) AS liter_prepared_set SELECT COUNT() FROM single_column_bloom_filter WHERE (i32, i64) IN liter_prepared_set SETTINGS max_rows_to_read = 6;
WITH ((1, (1, 1)), (2, (2, 2))) AS liter_prepared_set SELECT COUNT() FROM single_column_bloom_filter WHERE (i64, (i64, i32)) IN liter_prepared_set SETTINGS max_rows_to_read = 6; WITH ((1, (1, 1)), (2, (2, 2))) AS liter_prepared_set SELECT COUNT() FROM single_column_bloom_filter WHERE (i64, (i64, i32)) IN liter_prepared_set SETTINGS max_rows_to_read = 6;
-- Check that indexHint() works (but it doesn't work with COUNT()).
SELECT SUM(ignore(*) + 1) FROM single_column_bloom_filter WHERE indexHint(i32 in (3, 15, 50));
-- The index doesn't understand expressions like these, but it shouldn't break the query.
SELECT COUNT() FROM single_column_bloom_filter WHERE (i32 = 200) = (i32 = 200);
SELECT SUM(ignore(*) + 1) FROM single_column_bloom_filter WHERE indexHint((i32 = 200) != (i32 = 200));
SELECT COUNT() FROM single_column_bloom_filter WHERE indexOf([10, 20, 30], i32) != 0;
SELECT COUNT() FROM single_column_bloom_filter WHERE has([100, 200, 300], 200);
DROP TABLE IF EXISTS single_column_bloom_filter; DROP TABLE IF EXISTS single_column_bloom_filter;