From 1b6c602c3fb96ec840e9aadf686dd96c76bf6c45 Mon Sep 17 00:00:00 2001 From: Arthur Passos Date: Fri, 1 Nov 2024 13:12:48 -0300 Subject: [PATCH 1/8] draft / poc --- .../Impl/Parquet/ParquetFilterCondition.cpp | 799 ++++++++++++++++++ .../Impl/Parquet/ParquetFilterCondition.h | 44 + .../Formats/Impl/ParquetBlockInputFormat.cpp | 55 +- src/Storages/MergeTree/KeyCondition.cpp | 4 + src/Storages/MergeTree/KeyCondition.h | 39 +- 5 files changed, 910 insertions(+), 31 deletions(-) create mode 100644 src/Processors/Formats/Impl/Parquet/ParquetFilterCondition.cpp create mode 100644 src/Processors/Formats/Impl/Parquet/ParquetFilterCondition.h diff --git a/src/Processors/Formats/Impl/Parquet/ParquetFilterCondition.cpp b/src/Processors/Formats/Impl/Parquet/ParquetFilterCondition.cpp new file mode 100644 index 00000000000..7a43128e7ae --- /dev/null +++ b/src/Processors/Formats/Impl/Parquet/ParquetFilterCondition.cpp @@ -0,0 +1,799 @@ +#include + +#if USE_PARQUET + +#include +#include +#include +#include +#include +#include +#include + +namespace DB +{ + +static Field applyFunctionForField( + const FunctionBasePtr & func, + const DataTypePtr & arg_type, + const Field & arg_value) +{ + ColumnsWithTypeAndName columns + { + { arg_type->createColumnConst(1, arg_value), arg_type, "x" }, + }; + + auto col = func->execute(columns, func->getResultType(), 1); + return (*col)[0]; +} + +/// applyFunction will execute the function with one `field` or the column which `field` refers to. +static FieldRef applyFunction(const FunctionBasePtr & func, const DataTypePtr & current_type, const FieldRef & field) +{ + chassert(func != nullptr); + /// Fallback for fields without block reference. + if (field.isExplicit()) + return applyFunctionForField(func, current_type, field); + + /// We will cache the function result inside `field.columns`, because this function will call many times + /// from many fields from same column. 
When the column is huge, for example there are thousands of marks, we need a cache. + /// The cache key is like `_[function_pointer]_[param_column_id]` to identify a unique pair. + WriteBufferFromOwnString buf; + writeText("_", buf); + writePointerHex(func.get(), buf); + writeText("_" + toString(field.column_idx), buf); + String result_name = buf.str(); + const auto & columns = field.columns; + size_t result_idx = columns->size(); + + for (size_t i = 0; i < result_idx; ++i) + { + if ((*columns)[i].name == result_name) + result_idx = i; + } + + if (result_idx == columns->size()) + { + /// When cache is missed, we calculate the whole column where the field comes from. This will avoid repeated calculation. + ColumnsWithTypeAndName args{(*columns)[field.column_idx]}; + field.columns->emplace_back(ColumnWithTypeAndName {nullptr, func->getResultType(), result_name}); + (*columns)[result_idx].column = func->execute(args, (*columns)[result_idx].type, columns->front().column->size()); + } + + return {field.columns, field.row_idx, result_idx}; +} + +std::optional applyMonotonicFunctionsChainToRange( + Range key_range, + const KeyCondition::MonotonicFunctionsChain & functions, + DataTypePtr current_type, + bool single_point) +{ + for (const auto & func : functions) + { + /// We check the monotonicity of each function on a specific range. + /// If we know the given range only contains one value, then we treat all functions as positive monotonic. + IFunction::Monotonicity monotonicity = single_point + ? IFunction::Monotonicity{true} + : func->getMonotonicityForRange(*current_type.get(), key_range.left, key_range.right); + + if (!monotonicity.is_monotonic) + { + return {}; + } + + /// If we apply function to open interval, we can get empty intervals in result. + /// E.g. for ('2020-01-03', '2020-01-20') after applying 'toYYYYMM' we will get ('202001', '202001'). + /// To avoid this we make range left and right included. 
+ /// Any function that treats NULL specially is not monotonic. + /// Thus we can safely use isNull() as an -Inf/+Inf indicator here. + if (!key_range.left.isNull()) + { + key_range.left = applyFunction(func, current_type, key_range.left); + key_range.left_included = true; + } + + if (!key_range.right.isNull()) + { + key_range.right = applyFunction(func, current_type, key_range.right); + key_range.right_included = true; + } + + current_type = func->getResultType(); + + if (!monotonicity.is_positive) + key_range.invert(); + } + return key_range; +} + +const parquet::ColumnDescriptor * getColumnDescriptorIfBloomFilterIsPresent( + const std::unique_ptr & parquet_rg_metadata, + const std::vector & clickhouse_column_index_to_parquet_index, + std::size_t clickhouse_column_index) +{ + if (clickhouse_column_index_to_parquet_index.size() <= clickhouse_column_index) + { + return nullptr; + } + + const auto & parquet_indexes = clickhouse_column_index_to_parquet_index[clickhouse_column_index].parquet_indexes; + + // complex types like structs, tuples and maps will have more than one index. 
+ // we don't support those for now + if (parquet_indexes.size() > 1) + { + return nullptr; + } + + if (parquet_indexes.empty()) + { + throw Exception(ErrorCodes::LOGICAL_ERROR, "Something bad happened, raise an issue and try the query with `input_format_parquet_bloom_filter_push_down=false`"); + } + + auto parquet_column_index = parquet_indexes[0]; + + const auto * parquet_column_descriptor = parquet_rg_metadata->schema()->Column(parquet_column_index); + + bool column_has_bloom_filter = parquet_rg_metadata->ColumnChunk(parquet_column_index)->bloom_filter_offset().has_value(); + if (!column_has_bloom_filter) + { + return nullptr; + } + + return parquet_column_descriptor; +} + + +bool isParquetStringTypeSupportedForBloomFilters( + const std::shared_ptr & logical_type, + parquet::ConvertedType::type converted_type) +{ + if (logical_type && + !logical_type->is_none() + && !(logical_type->is_string() || logical_type->is_BSON() || logical_type->is_JSON())) + { + return false; + } + + if (parquet::ConvertedType::type::NONE != converted_type && + !(converted_type == parquet::ConvertedType::JSON || converted_type == parquet::ConvertedType::UTF8 + || converted_type == parquet::ConvertedType::BSON)) + { + return false; + } + + return true; +} + +bool isParquetIntegerTypeSupportedForBloomFilters(const std::shared_ptr & logical_type, parquet::ConvertedType::type converted_type) +{ + if (logical_type && !logical_type->is_none() && !logical_type->is_int()) + { + return false; + } + + if (parquet::ConvertedType::type::NONE != converted_type && !(converted_type == parquet::ConvertedType::INT_8 || converted_type == parquet::ConvertedType::INT_16 + || converted_type == parquet::ConvertedType::INT_32 || converted_type == parquet::ConvertedType::INT_64 + || converted_type == parquet::ConvertedType::UINT_8 || converted_type == parquet::ConvertedType::UINT_16 + || converted_type == parquet::ConvertedType::UINT_32 || converted_type == parquet::ConvertedType::UINT_64)) + { + return false; 
+ } + + return true; +} + +template +uint64_t hashSpecialFLBATypes(const Field & field) +{ + const T & value = field.safeGet(); + + parquet::FLBA flba(reinterpret_cast(&value)); + + parquet::XxHasher hasher; + + return hasher.Hash(&flba, sizeof(T)); +}; + +std::optional tryHashStringWithoutCompatibilityCheck(const Field & field) +{ + const auto field_type = field.getType(); + + if (field_type != Field::Types::Which::String) + { + return std::nullopt; + } + + parquet::XxHasher hasher; + parquet::ByteArray ba { field.safeGet() }; + + return hasher.Hash(&ba); +} + +std::optional tryHashString( + const Field & field, + const std::shared_ptr & logical_type, + parquet::ConvertedType::type converted_type) +{ + if (!isParquetStringTypeSupportedForBloomFilters(logical_type, converted_type)) + { + return std::nullopt; + } + + return tryHashStringWithoutCompatibilityCheck(field); +} + +std::optional tryHashFLBA( + const Field & field, + const std::shared_ptr & logical_type, + parquet::ConvertedType::type converted_type, + std::size_t parquet_column_length) +{ + if (!isParquetStringTypeSupportedForBloomFilters(logical_type, converted_type)) + { + return std::nullopt; + } + + const auto field_type = field.getType(); + + if (field_type == Field::Types::Which::IPv6 && parquet_column_length == sizeof(IPv6)) + { + return hashSpecialFLBATypes(field); + } + + return tryHashStringWithoutCompatibilityCheck(field); +} + +template +std::optional tryHashInt(const Field & field, const std::shared_ptr & logical_type, parquet::ConvertedType::type converted_type) +{ + if (!isParquetIntegerTypeSupportedForBloomFilters(logical_type, converted_type)) + { + return std::nullopt; + } + + parquet::XxHasher hasher; + + if (field.getType() == Field::Types::Which::Int64) + { + return hasher.Hash(static_cast(field.safeGet())); + } + else if (field.getType() == Field::Types::Which::UInt64) + { + return hasher.Hash(static_cast(field.safeGet())); + } + else if (field.getType() == Field::Types::IPv4) + { + 
/* + * In theory, we could accept IPv4 over 64 bits variables. It would only be a problem in case it was hashed using the byte array api + * with a zero-ed buffer that had a 32 bits variable copied into it. + * + * To be on the safe side, accept only in case physical type is 32 bits. + * */ + if constexpr (std::is_same_v) + { + return hasher.Hash(static_cast(field.safeGet())); + } + } + + return std::nullopt; +} + +std::optional tryHash(const Field & field, const parquet::ColumnDescriptor * parquet_column_descriptor) +{ + const auto physical_type = parquet_column_descriptor->physical_type(); + const auto & logical_type = parquet_column_descriptor->logical_type(); + const auto converted_type = parquet_column_descriptor->converted_type(); + + switch (physical_type) + { + case parquet::Type::type::INT32: + return tryHashInt(field, logical_type, converted_type); + case parquet::Type::type::INT64: + return tryHashInt(field, logical_type, converted_type); + case parquet::Type::type::BYTE_ARRAY: + return tryHashString(field, logical_type, converted_type); + case parquet::Type::type::FIXED_LEN_BYTE_ARRAY: + return tryHashFLBA(field, logical_type, converted_type, parquet_column_descriptor->type_length()); + default: + return std::nullopt; + } +} + +std::optional> hash(const IColumn * data_column, const parquet::ColumnDescriptor * parquet_column_descriptor) +{ + std::vector hashes; + + for (size_t i = 0u; i < data_column->size(); i++) + { + Field f; + data_column->get(i, f); + + auto hashed_value = tryHash(f, parquet_column_descriptor); + + if (!hashed_value) + { + return std::nullopt; + } + + hashes.emplace_back(*hashed_value); + } + + return hashes; +} + +bool maybeTrueOnBloomFilter(const std::vector & hashes, const std::unique_ptr & bloom_filter) +{ + for (const auto hash : hashes) + { + if (bloom_filter->FindHash(hash)) + { + return true; + } + } + + return false; +} + +bool mayBeTrueOnParquetRowGroup(const ParquetFilterCondition::BloomFilterData & 
condition_bloom_filter_data, + const ParquetBloomFilterCondition::ColumnIndexToBF & column_index_to_column_bf) +{ + bool maybe_true = true; + for (auto column_index = 0u; column_index < condition_bloom_filter_data.hashes_per_column.size(); column_index++) + { + // in case bloom filter is not present for this row group + // https://github.com/ClickHouse/ClickHouse/pull/62966#discussion_r1722361237 + if (!column_index_to_column_bf.contains(condition_bloom_filter_data.key_columns[column_index])) + { + continue; + } + + bool column_maybe_contains = maybeTrueOnBloomFilter( + condition_bloom_filter_data.hashes_per_column[column_index], + column_index_to_column_bf.at(condition_bloom_filter_data.key_columns[column_index])); + + if (!column_maybe_contains) + { + maybe_true = false; + break; + } + } + + return maybe_true; +} + +std::vector abcdefgh( + const std::vector & rpn, + const std::vector & clickhouse_column_index_to_parquet_index, + const std::unique_ptr & parquet_rg_metadata) +{ + std::vector condition_elements; + + using RPNElement = KeyCondition::RPNElement; + + for (const auto & rpn_element : rpn) + { + condition_elements.emplace_back(rpn_element); + // this would be a problem for `where negate(x) = -58`. + // It would perform a bf search on `-58`, and possibly miss row groups containing this data. 
+ if (!rpn_element.monotonic_functions_chain.empty()) + { + continue; + } + + ParquetBloomFilterCondition::ConditionElement::HashesForColumns hashes; + + if (rpn_element.function == RPNElement::FUNCTION_IN_RANGE + || rpn_element.function == RPNElement::FUNCTION_NOT_IN_RANGE) + { + // Only FUNCTION_EQUALS is supported and for that extremes need to be the same + if (rpn_element.range.left != rpn_element.range.right) + { + continue; + } + + const auto * parquet_column_descriptor = + getColumnDescriptorIfBloomFilterIsPresent(parquet_rg_metadata, clickhouse_column_index_to_parquet_index, rpn_element.key_column); + + if (!parquet_column_descriptor) + { + continue; + } + + auto hashed_value = tryHash(rpn_element.range.left, parquet_column_descriptor); + + if (!hashed_value) + { + continue; + } + + std::vector hashes_for_column; + hashes_for_column.emplace_back(*hashed_value); + + hashes.emplace_back(std::move(hashes_for_column)); + + std::vector key_columns; + key_columns.emplace_back(rpn_element.key_column); + + condition_elements.back().bloom_filter_data = ParquetFilterCondition::BloomFilterData {std::move(hashes), std::move(key_columns)}; + } + else if (rpn_element.function == RPNElement::FUNCTION_IN_SET + || rpn_element.function == RPNElement::FUNCTION_NOT_IN_SET) + { + const auto & set_index = rpn_element.set_index; + const auto & ordered_set = set_index->getOrderedSet(); + const auto & indexes_mapping = set_index->getIndexesMapping(); + bool found_empty_column = false; + + std::vector key_columns; + + for (auto i = 0u; i < ordered_set.size(); i++) + { + const auto & set_column = ordered_set[i]; + + const auto * parquet_column_descriptor = getColumnDescriptorIfBloomFilterIsPresent( + parquet_rg_metadata, + clickhouse_column_index_to_parquet_index, + indexes_mapping[i].key_index); + + if (!parquet_column_descriptor) + { + continue; + } + + auto column = set_column; + + if (column->empty()) + { + found_empty_column = true; + break; + } + + if (const auto & 
nullable_column = checkAndGetColumn(set_column.get())) + { + column = nullable_column->getNestedColumnPtr(); + } + + auto hashes_for_column_opt = hash(column.get(), parquet_column_descriptor); + + if (!hashes_for_column_opt) + { + continue; + } + + auto & hashes_for_column = *hashes_for_column_opt; + + if (hashes_for_column.empty()) + { + continue; + } + + hashes.emplace_back(hashes_for_column); + + key_columns.push_back(indexes_mapping[i].key_index); + } + + if (found_empty_column) + { + // todo arthur + continue; + } + + if (hashes.empty()) + { + continue; + } + + condition_elements.back().bloom_filter_data = {std::move(hashes), std::move(key_columns)}; + } + } + + return condition_elements; +} + +BoolMask ParquetFilterCondition::check(const std::vector & rpn, + const Hyperrectangle & hyperrectangle, + const KeyCondition::SpaceFillingCurveDescriptions & key_space_filling_curves, + const DataTypes & data_types, + const ParquetBloomFilterCondition::ColumnIndexToBF & column_index_to_column_bf, + bool single_point) +{ + std::vector rpn_stack; + + auto curve_type = [&](size_t key_column_pos) + { + for (const auto & curve : key_space_filling_curves) + if (curve.key_column_pos == key_column_pos) + return curve.type; + return KeyCondition::SpaceFillingCurveType::Unknown; + }; + + for (const auto & element : rpn) + { + if (element.argument_num_of_space_filling_curve.has_value()) + { + // todo arthur, not sure what to do here yet + /// If a condition on argument of a space filling curve wasn't collapsed into FUNCTION_ARGS_IN_HYPERRECTANGLE, + /// we cannot process it. 
+ rpn_stack.emplace_back(true, true); + } + else if (element.function == ConditionElement::FUNCTION_UNKNOWN) + { + rpn_stack.emplace_back(true, true); + } + else if (element.function == ConditionElement::FUNCTION_IN_RANGE + || element.function == ConditionElement::FUNCTION_NOT_IN_RANGE) + { + if (element.key_column >= hyperrectangle.size()) + { + throw Exception(ErrorCodes::LOGICAL_ERROR, + "Hyperrectangle size is {}, but requested element at posittion {} ({})", + hyperrectangle.size(), element.key_column, element.toString()); + } + + const Range * key_range = &hyperrectangle[element.key_column]; + + /// The case when the column is wrapped in a chain of possibly monotonic functions. + Range transformed_range = Range::createWholeUniverse(); + if (!element.monotonic_functions_chain.empty()) + { + std::optional new_range = applyMonotonicFunctionsChainToRange( + *key_range, + element.monotonic_functions_chain, + data_types[element.key_column], + single_point + ); + + if (!new_range) + { + rpn_stack.emplace_back(true, true); + + if (element.bloom_filter_data) + { + rpn_stack.back().can_be_true = mayBeTrueOnParquetRowGroup(*element.bloom_filter_data, column_index_to_column_bf); + } + + continue; + } + transformed_range = *new_range; + key_range = &transformed_range; + } + + bool intersects = element.range.intersectsRange(*key_range); + bool contains = element.range.containsRange(*key_range); + + rpn_stack.emplace_back(intersects, !contains); + + if (rpn_stack.back().can_be_true && element.bloom_filter_data) + { + rpn_stack.back().can_be_true = mayBeTrueOnParquetRowGroup(*element.bloom_filter_data, column_index_to_column_bf); + } + + if (element.function == ConditionElement::FUNCTION_NOT_IN_RANGE) + rpn_stack.back() = !rpn_stack.back(); + } + else if (element.function == ConditionElement::FUNCTION_ARGS_IN_HYPERRECTANGLE) + { + /** The case of space-filling curves. 
+ * We unpack the range of a space filling curve into hyperrectangles of their arguments, + * and then check the intersection of them with the given hyperrectangle from the key condition. + * + * Note: you might find this code hard to understand, + * because there are three different hyperrectangles involved: + * + * 1. A hyperrectangle derived from the range of the table's sparse index (marks granule): `hyperrectangle` + * We analyze its dimension `key_range`, corresponding to the `key_column`. + * For example, the table's key is a single column `mortonEncode(x, y)`, + * the current granule is [500, 600], and it means that + * mortonEncode(x, y) in [500, 600] + * + * 2. A hyperrectangle derived from the key condition, e.g. + * `x >= 10 AND x <= 20 AND y >= 20 AND y <= 30` defines: (x, y) in [10, 20] × [20, 30] + * + * 3. A set of hyperrectangles that we obtain by inverting the space-filling curve on the range: + * From mortonEncode(x, y) in [500, 600] + * We get (x, y) in [30, 31] × [12, 13] + * or (x, y) in [28, 31] × [14, 15]; + * or (x, y) in [0, 7] × [16, 23]; + * or (x, y) in [8, 11] × [16, 19]; + * or (x, y) in [12, 15] × [16, 17]; + * or (x, y) in [12, 12] × [18, 18]; + * + * And we analyze the intersection of (2) and (3). + */ + + Range key_range = hyperrectangle[element.key_column]; + + /// The only possible result type of a space filling curve is UInt64. + /// We also only check bounded ranges. + if (key_range.left.getType() == Field::Types::UInt64 + && key_range.right.getType() == Field::Types::UInt64) + { + key_range.shrinkToIncludedIfPossible(); + + size_t num_dimensions = element.space_filling_curve_args_hyperrectangle.size(); + + /// Let's support only the case of 2d, because I'm not confident in other cases. 
+ if (num_dimensions == 2) + { + UInt64 left = key_range.left.safeGet(); + UInt64 right = key_range.right.safeGet(); + + BoolMask mask(false, true); + auto hyperrectangle_intersection_callback = [&](std::array, 2> curve_hyperrectangle) + { + BoolMask current_intersection(true, false); + for (size_t dim = 0; dim < num_dimensions; ++dim) + { + const Range & condition_arg_range = element.space_filling_curve_args_hyperrectangle[dim]; + + const Range curve_arg_range( + curve_hyperrectangle[dim].first, true, + curve_hyperrectangle[dim].second, true); + + bool intersects = condition_arg_range.intersectsRange(curve_arg_range); + bool contains = condition_arg_range.containsRange(curve_arg_range); + + current_intersection = current_intersection & BoolMask(intersects, !contains); + } + + mask = mask | current_intersection; + }; + + switch (curve_type(element.key_column)) + { + case KeyCondition::SpaceFillingCurveType::Hilbert: + { + hilbertIntervalToHyperrectangles2D(left, right, hyperrectangle_intersection_callback); + break; + } + case KeyCondition::SpaceFillingCurveType::Morton: + { + mortonIntervalToHyperrectangles<2>(left, right, hyperrectangle_intersection_callback); + break; + } + case KeyCondition::SpaceFillingCurveType::Unknown: + { + throw Exception(ErrorCodes::LOGICAL_ERROR, "curve_type is `Unknown`. It is a bug."); + } + } + + rpn_stack.emplace_back(mask); + } + else + rpn_stack.emplace_back(true, true); + } + else + rpn_stack.emplace_back(true, true); + + /** Note: we can consider implementing a simpler solution, based on "hidden keys". 
+ * It means, when we have a table's key like (a, b, mortonCurve(x, y)) + * we extract the arguments from the curves, and append them to the key, + * imagining that we have the key (a, b, mortonCurve(x, y), x, y) + * + * Then while we analyze the granule's range between (a, b, mortonCurve(x, y)) + * and decompose it to the series of hyperrectangles, + * we can construct a series of hyperrectangles of the extended key (a, b, mortonCurve(x, y), x, y), + * and then do everything as usual. + * + * This approach is generalizable to any functions, that have preimage of interval + * represented by a set of hyperrectangles. + */ + } + else if (element.function == ConditionElement::FUNCTION_POINT_IN_POLYGON) + { + /** There are 2 kinds of polygons: + * 1. Polygon by minmax index + * 2. Polygon provided by the user + * + * Polygon by minmax index: + * For hyperrectangle [1, 2] × [3, 4] we can create a polygon with 4 points: (1, 3), (1, 4), (2, 4), (2, 3) + * + * Algorithm: + * Check whether there is any intersection of the 2 polygons. If true return {true, true}, else return {false, true}. 
+ */ + const auto & key_column_positions = element.point_in_polygon_column_description->key_column_positions; + + Float64 x_min = applyVisitor(FieldVisitorConvertToNumber(), hyperrectangle[key_column_positions[0]].left); + Float64 x_max = applyVisitor(FieldVisitorConvertToNumber(), hyperrectangle[key_column_positions[0]].right); + Float64 y_min = applyVisitor(FieldVisitorConvertToNumber(), hyperrectangle[key_column_positions[1]].left); + Float64 y_max = applyVisitor(FieldVisitorConvertToNumber(), hyperrectangle[key_column_positions[1]].right); + + if (unlikely(isNaN(x_min) || isNaN(x_max) || isNaN(y_min) || isNaN(y_max))) + { + rpn_stack.emplace_back(true, true); + continue; + } + + using Point = boost::geometry::model::d2::point_xy; + using Polygon = boost::geometry::model::polygon; + Polygon polygon_by_minmax_index; + polygon_by_minmax_index.outer().emplace_back(x_min, y_min); + polygon_by_minmax_index.outer().emplace_back(x_min, y_max); + polygon_by_minmax_index.outer().emplace_back(x_max, y_max); + polygon_by_minmax_index.outer().emplace_back(x_max, y_min); + + /// Close ring + boost::geometry::correct(polygon_by_minmax_index); + + /// Because the polygon may have a hole so the "can_be_false" should always be true. + rpn_stack.emplace_back( + boost::geometry::intersects(polygon_by_minmax_index, element.polygon), true); + } + else if ( + element.function == ConditionElement::FUNCTION_IS_NULL + || element.function == ConditionElement::FUNCTION_IS_NOT_NULL) + { + const Range * key_range = &hyperrectangle[element.key_column]; + + /// No need to apply monotonic functions as nulls are kept. 
+ bool intersects = element.range.intersectsRange(*key_range); + bool contains = element.range.containsRange(*key_range); + + rpn_stack.emplace_back(intersects, !contains); + if (element.function == ConditionElement::FUNCTION_IS_NULL) + rpn_stack.back() = !rpn_stack.back(); + } + else if ( + element.function == ConditionElement::FUNCTION_IN_SET + || element.function == ConditionElement::FUNCTION_NOT_IN_SET) + { + if (!element.set_index) + throw Exception(ErrorCodes::LOGICAL_ERROR, "Set for IN is not created yet"); + + rpn_stack.emplace_back(element.set_index->checkInRange(hyperrectangle, data_types, single_point)); + + if (rpn_stack.back().can_be_true && element.bloom_filter_data) + { + rpn_stack.back().can_be_true = mayBeTrueOnParquetRowGroup(*element.bloom_filter_data, column_index_to_column_bf); + } + + if (element.function == ConditionElement::FUNCTION_NOT_IN_SET) + rpn_stack.back() = !rpn_stack.back(); + } + else if (element.function == ConditionElement::FUNCTION_NOT) + { + assert(!rpn_stack.empty()); + + rpn_stack.back() = !rpn_stack.back(); + } + else if (element.function == ConditionElement::FUNCTION_AND) + { + assert(!rpn_stack.empty()); + + auto arg1 = rpn_stack.back(); + rpn_stack.pop_back(); + auto arg2 = rpn_stack.back(); + rpn_stack.back() = arg1 & arg2; + } + else if (element.function == ConditionElement::FUNCTION_OR) + { + assert(!rpn_stack.empty()); + + auto arg1 = rpn_stack.back(); + rpn_stack.pop_back(); + auto arg2 = rpn_stack.back(); + rpn_stack.back() = arg1 | arg2; + } + else if (element.function == ConditionElement::ALWAYS_FALSE) + { + rpn_stack.emplace_back(false, true); + } + else if (element.function == ConditionElement::ALWAYS_TRUE) + { + rpn_stack.emplace_back(true, false); + } + else + throw Exception(ErrorCodes::LOGICAL_ERROR, "Unexpected function type in KeyCondition::ConditionElement"); + } + + if (rpn_stack.size() != 1) + throw Exception(ErrorCodes::LOGICAL_ERROR, "Unexpected stack size in KeyCondition::checkInHyperrectangle"); + + 
return rpn_stack[0]; +} + +} + +#endif diff --git a/src/Processors/Formats/Impl/Parquet/ParquetFilterCondition.h b/src/Processors/Formats/Impl/Parquet/ParquetFilterCondition.h new file mode 100644 index 00000000000..0edb48e0f03 --- /dev/null +++ b/src/Processors/Formats/Impl/Parquet/ParquetFilterCondition.h @@ -0,0 +1,44 @@ +#pragma once + +#include + +#if USE_PARQUET + +#include +#include + +namespace DB +{ + +class ParquetFilterCondition +{ +public: + + struct BloomFilterData + { + using HashesForColumns = std::vector>; + HashesForColumns hashes_per_column; + std::vector key_columns; + }; + + struct ConditionElement : public KeyCondition::RPNElement + { + std::optional bloom_filter_data; + }; + + static BoolMask check(const std::vector & RPN, + const Hyperrectangle & hyperrectangle, + const KeyCondition::SpaceFillingCurveDescriptions & key_space_filling_curves, + const DataTypes & data_types, + const ParquetBloomFilterCondition::ColumnIndexToBF & column_index_to_column_bf, + bool single_point); +}; + +std::vector abcdefgh( + const std::vector & rpn, + const std::vector & clickhouse_column_index_to_parquet_index, + const std::unique_ptr & parquet_rg_metadata); + +} + +#endif diff --git a/src/Processors/Formats/Impl/ParquetBlockInputFormat.cpp b/src/Processors/Formats/Impl/ParquetBlockInputFormat.cpp index f9567ec90f0..92bec6c4aca 100644 --- a/src/Processors/Formats/Impl/ParquetBlockInputFormat.cpp +++ b/src/Processors/Formats/Impl/ParquetBlockInputFormat.cpp @@ -28,6 +28,7 @@ #include #include #include +#include #include namespace ProfileEvents @@ -599,24 +600,52 @@ void ParquetBlockInputFormat::initializeIfNeeded() if (skip_row_groups.contains(row_group)) continue; - if (parquet_bloom_filter_condition) + if (key_condition) { - const auto column_index_to_bf = buildColumnIndexToBF(*bf_reader, row_group, index_mapping, filtering_columns); - - if (!parquet_bloom_filter_condition->mayBeTrueOnRowGroup(column_index_to_bf)) + if (format_settings.parquet.filter_push_down 
&& format_settings.parquet.bloom_filter_push_down) { - continue; + auto parquet_rpn = abcdefgh(key_condition->getRPN(), + index_mapping, + metadata->RowGroup(row_group)); + auto hyperrectangle = getHyperrectangleForRowGroup(*metadata, row_group, getPort().getHeader(), format_settings); + + const auto column_index_to_bf = buildColumnIndexToBF(*bf_reader, row_group, index_mapping, filtering_columns); + + bool maybe_exists = ParquetFilterCondition::check( + parquet_rpn, + hyperrectangle, + key_condition->key_space_filling_curves, + getPort().getHeader().getDataTypes(), + column_index_to_bf, + key_condition->isSinglePoint()).can_be_true; + + if (!maybe_exists) + { + continue; + } + } + else if (format_settings.parquet.filter_push_down) + { + if (!key_condition + ->checkInHyperrectangle( + getHyperrectangleForRowGroup(*metadata, row_group, getPort().getHeader(), format_settings), + getPort().getHeader().getDataTypes()) + .can_be_true) + { + continue; + } + } + else if (format_settings.parquet.bloom_filter_push_down) + { + const auto column_index_to_bf = buildColumnIndexToBF(*bf_reader, row_group, index_mapping, filtering_columns); + + if (!parquet_bloom_filter_condition->mayBeTrueOnRowGroup(column_index_to_bf)) + { + continue; + } } } - if (format_settings.parquet.filter_push_down && key_condition - && !key_condition - ->checkInHyperrectangle( - getHyperrectangleForRowGroup(*metadata, row_group, getPort().getHeader(), format_settings), - getPort().getHeader().getDataTypes()) - .can_be_true) - continue; - // When single-threaded parsing, can prefetch row groups, so need to put all row groups in the same row_group_batch if (row_group_batches.empty() || (!prefetch_group && row_group_batches.back().total_bytes_compressed >= min_bytes_for_seek)) row_group_batches.emplace_back(); diff --git a/src/Storages/MergeTree/KeyCondition.cpp b/src/Storages/MergeTree/KeyCondition.cpp index 17723d341fb..2363161fbd9 100644 --- a/src/Storages/MergeTree/KeyCondition.cpp +++ 
b/src/Storages/MergeTree/KeyCondition.cpp @@ -3032,6 +3032,7 @@ BoolMask KeyCondition::checkInHyperrectangle( if (!new_range) { rpn_stack.emplace_back(true, true); + // this is where the bloom filter is consulted continue; } transformed_range = *new_range; @@ -3041,6 +3042,8 @@ BoolMask KeyCondition::checkInHyperrectangle( bool intersects = element.range.intersectsRange(*key_range); bool contains = element.range.containsRange(*key_range); + // this is where the bloom filter is consulted + rpn_stack.emplace_back(intersects, !contains); if (element.function == RPNElement::FUNCTION_NOT_IN_RANGE) rpn_stack.back() = !rpn_stack.back(); @@ -3214,6 +3217,7 @@ BoolMask KeyCondition::checkInHyperrectangle( if (!element.set_index) throw Exception(ErrorCodes::LOGICAL_ERROR, "Set for IN is not created yet"); + // this is where the bloom filter is consulted rpn_stack.emplace_back(element.set_index->checkInRange(hyperrectangle, data_types, single_point)); if (element.function == RPNElement::FUNCTION_NOT_IN_SET) rpn_stack.back() = !rpn_stack.back(); diff --git a/src/Storages/MergeTree/KeyCondition.h b/src/Storages/MergeTree/KeyCondition.h index 8c946bd3bbd..00e741f549c 100644 --- a/src/Storages/MergeTree/KeyCondition.h +++ b/src/Storages/MergeTree/KeyCondition.h @@ -237,6 +237,27 @@ public: bool isRelaxed() const { return relaxed; } + /// Space-filling curves in the key + enum class SpaceFillingCurveType + { + Unknown = 0, + Morton, + Hilbert + }; + static const std::unordered_map space_filling_curve_name_to_type; + + struct SpaceFillingCurveDescription + { + size_t key_column_pos; + String function_name; + std::vector arguments; + SpaceFillingCurveType type; + }; + using SpaceFillingCurveDescriptions = std::vector; + SpaceFillingCurveDescriptions key_space_filling_curves; + + bool isSinglePoint() const { return single_point; } + private: BoolMask checkInRange( size_t used_key_size, @@ -357,24 +378,6 @@ private: /// All intermediate columns are used to calculate key_expr. 
const NameSet key_subexpr_names; - /// Space-filling curves in the key - enum class SpaceFillingCurveType - { - Unknown = 0, - Morton, - Hilbert - }; - static const std::unordered_map space_filling_curve_name_to_type; - - struct SpaceFillingCurveDescription - { - size_t key_column_pos; - String function_name; - std::vector arguments; - SpaceFillingCurveType type; - }; - using SpaceFillingCurveDescriptions = std::vector; - SpaceFillingCurveDescriptions key_space_filling_curves; void getAllSpaceFillingCurves(); /// Array joined column names From 55e387da9056d94cc3ec0ec4580864a8c5c744e5 Mon Sep 17 00:00:00 2001 From: Arthur Passos Date: Fri, 1 Nov 2024 14:17:33 -0300 Subject: [PATCH 2/8] add a test --- ...arquet_bloom_filter_minmax_stats.reference | 23 ++++++++++++++ ...merge_parquet_bloom_filter_minmax_stats.sh | 29 ++++++++++++++++++ .../integers_1_5_no_3_bf_minmax.parquet | Bin 0 -> 1048958 bytes 3 files changed, 52 insertions(+) create mode 100644 tests/queries/0_stateless/03261_test_merge_parquet_bloom_filter_minmax_stats.reference create mode 100755 tests/queries/0_stateless/03261_test_merge_parquet_bloom_filter_minmax_stats.sh create mode 100644 tests/queries/0_stateless/data_parquet/integers_1_5_no_3_bf_minmax.parquet diff --git a/tests/queries/0_stateless/03261_test_merge_parquet_bloom_filter_minmax_stats.reference b/tests/queries/0_stateless/03261_test_merge_parquet_bloom_filter_minmax_stats.reference new file mode 100644 index 00000000000..9d5dea4cc09 --- /dev/null +++ b/tests/queries/0_stateless/03261_test_merge_parquet_bloom_filter_minmax_stats.reference @@ -0,0 +1,23 @@ +{ + "meta": + [ + { + "name": "int8", + "type": "Nullable(Int8)" + } + ], + + "data": + [ + + ], + + "rows": 0, + + "statistics": + { + "elapsed": 0.05269874, + "rows_read": 0, + "bytes_read": 0 + } +} diff --git a/tests/queries/0_stateless/03261_test_merge_parquet_bloom_filter_minmax_stats.sh b/tests/queries/0_stateless/03261_test_merge_parquet_bloom_filter_minmax_stats.sh new file mode 
100755 index 00000000000..db58f0b69e5 --- /dev/null +++ b/tests/queries/0_stateless/03261_test_merge_parquet_bloom_filter_minmax_stats.sh @@ -0,0 +1,29 @@ +#!/usr/bin/env bash +# Tags: no-ubsan, no-fasttest + +CUR_DIR=$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd) +# shellcheck source=../shell_config.sh +. "$CUR_DIR"/../shell_config.sh + + +USER_FILES_PATH=$($CLICKHOUSE_CLIENT_BINARY --query "select _path,_file from file('nonexist.txt', 'CSV', 'val1 char')" 2>&1 | grep Exception | awk '{gsub("/nonexist.txt","",$9); print $9}') + +WORKING_DIR="${USER_FILES_PATH}/${CLICKHOUSE_TEST_UNIQUE_NAME}" + +mkdir -p "${WORKING_DIR}" + +DATA_FILE="${CUR_DIR}/data_parquet/integers_1_5_no_3_bf_minmax.parquet" + +DATA_FILE_USER_PATH="${WORKING_DIR}/integers_1to5_no_3_bf_minmax.parquet" + +cp ${DATA_FILE} ${DATA_FILE_USER_PATH} + +# Prior to this PR, bloom filter and minmax were evaluated separately. +# This was sub-optimal for conditions like `x = 3 or x > 5` where data is [1, 2, 4, 5]. +# Bloom filter is not able to handle greater than operations. Therefore, it can't evaluate x > 5. Even though it can tell +# `3` is not in the set by evaluating `x = 3`, it can't discard the row group because of the `or` condition. +# On the other hand, min max can handle both. It'll evaluate x = 3 to true (because it is within the range) and the latter to false +# Therefore, bloom filter would determine `false or true` and minmax would determine `true or false`. Resulting in true. 
+ +# Since both structures are now evaluated together, the row group should be skipped +${CLICKHOUSE_CLIENT} --query="select * from file('${DATA_FILE_USER_PATH}', Parquet) WHERE int8 = 3 or int8 > 5 FORMAT Json SETTINGS input_format_parquet_filter_push_down=true, input_format_parquet_bloom_filter_push_down=true;" diff --git a/tests/queries/0_stateless/data_parquet/integers_1_5_no_3_bf_minmax.parquet b/tests/queries/0_stateless/data_parquet/integers_1_5_no_3_bf_minmax.parquet new file mode 100644 index 0000000000000000000000000000000000000000..1790322cdf6915fa4572bf9ef0f82692644b5db3 GIT binary patch literal 1048958 zcmeIuL2nyH6ae72D<_>}BDyQDw1T}PY9)wjTqQN)Ksg}gN}~{(h&eQr@O2 zMZETH4#U{~w%u;OIo{vjue(q8F5-vp&c8p%_2#f_#-n_;&1o+esY=*XY>Kvx_;co(L4W`O0t5&UAV7cs0RjXF5FkK+009C72oNAZfB*pk1PBlyK!5-N z0t5&UAV7cs0RjXF5FkK+009C72oNAZfB*pk1PBlyK!5-N0t5&UAV7cs0RjXF5FkK+ z009C72oNAZfB*pk1PBlyK!5-N0t5&UAV7cs0RjXF5FkK+009C72oNAZfB*pk1PBly zK!5-N0t5&UAV7cs0RjXF5FkK+009C72oNAZfB*pk1PBlyK!5-N0t5&UAV7cs0RjXF z5FkK+009C72oNAZfB*pk1PBlyK!5-N0t5&UAV7cs0RjXF5FkK+009C72oNAZfB*pk z1PBlyK!5-N0t5&UAV7cs0RjXF5FkK+009C72oNAZfB*pk1PBlyK!5-N0t5&UAV7cs z0RjXF5FkK+009C72oNAZfB*pk1PBlyK!5-N0t5&UAV7cs0RjXF5FkK+009C72oNAZ zfB*pk1PBlyK!5-N0t5&UAV7cs0RjXF5FkK+009C72oNAZfB*pk1PBlyK!5-N0t5&U zAV7cs0RjXF5FkK+009C72oNAZfB*pk1PBlyK!5-N0t5&UAV7cs0RjXF5FkK+009C7 z2oNAZfB*pk1PBlyK!5-N0t5&UAV7cs0RjXF5FkK+009C72oNAZfB*pk1PBlyK!5-N z0t5&UAV7cs0RjXF5FkK+009C72oNAZfB*pk1PBlyK!5-N0t5&UAV7cs0RjXF5FkK+ z009C72oNAZfB*pk1PBlyK!5-N0t5&UAV7cs0RjXF5FkK+009C72oNAZfB*pk1PBly zK!5-N0t5&UAV7cs0RjXF5FkK+009C72oNAZfB*pk1PBlyK!5-N0t5&UAV7cs0RjXF z5FkK+009C72oNAZfB*pk1PBlyK!5-N0t5&UAV7cs0RjXF5FkL{|12;XUOq)^qS$@L zCc0s95WBo9iUbG{AV7cs0RjXF5FkK+009C72oNAZfB*pk1PBlyK!5-N0t5&UAV7cs z0RjXF5FkK+009C72oNAZfB*pk1PBlyK!5-N0t5&UAV7cs0RjXF5FkK+009C72oNAZ zfB*pk1PBlyK!5-N0t5&UAV7cs0RjXF5FkK+009C72oNAZfB*pk1PBlyK!5-N0t5&U zAV7cs0RjXF5FkK+009C72oNAZfB*pk1PBlyK!5-N0t5&UAV7cs0RjXF5FkK+009C7 
z2oNAZfB*pk1PBlyK!5-N0t5&UAV7cs0RjXF5FkK+009C72oNAZfB*pk1PBlyK!5-N z0t5&UAV7cs0RjXF5FkK+009C72oNAZfB*pk1PBlyK!5-N0t5&UAV7csfqzdR8m-DKOsyvGE z=<@pJ(b20P{P;>b%x&t_v`u@Q+I;&cMYB7vMs=z`s#nM9?eS+Ay%o7jI!^WKZshCZ!>`ttH~so#`E=1=om|hC z{rFD*e7;;=^)Y^bHD7=JtiO3BW4y=%=$c^X_7PziHe2(nS$*{`kq{4@dh~r~m)} literal 0 HcmV?d00001 From 679cb6e4e1223e484e67bbf3f901cbb58935eb81 Mon Sep 17 00:00:00 2001 From: Arthur Passos Date: Tue, 5 Nov 2024 11:48:25 -0300 Subject: [PATCH 3/8] merge minmax and bf eval --- .../Parquet/ParquetBloomFilterCondition.h | 73 -- .../Impl/Parquet/ParquetFilterCondition.cpp | 799 ------------------ .../Impl/Parquet/ParquetFilterCondition.h | 44 - ...on.cpp => keyConditionRPNToParquetRPN.cpp} | 265 ++---- .../Parquet/keyConditionRPNToParquetRPN.h | 24 + .../Formats/Impl/ParquetBlockInputFormat.cpp | 136 +-- src/Storages/MergeTree/KeyCondition.cpp | 68 +- src/Storages/MergeTree/KeyCondition.h | 26 + 8 files changed, 244 insertions(+), 1191 deletions(-) delete mode 100644 src/Processors/Formats/Impl/Parquet/ParquetBloomFilterCondition.h delete mode 100644 src/Processors/Formats/Impl/Parquet/ParquetFilterCondition.cpp delete mode 100644 src/Processors/Formats/Impl/Parquet/ParquetFilterCondition.h rename src/Processors/Formats/Impl/Parquet/{ParquetBloomFilterCondition.cpp => keyConditionRPNToParquetRPN.cpp} (58%) create mode 100644 src/Processors/Formats/Impl/Parquet/keyConditionRPNToParquetRPN.h diff --git a/src/Processors/Formats/Impl/Parquet/ParquetBloomFilterCondition.h b/src/Processors/Formats/Impl/Parquet/ParquetBloomFilterCondition.h deleted file mode 100644 index 6de6030b23c..00000000000 --- a/src/Processors/Formats/Impl/Parquet/ParquetBloomFilterCondition.h +++ /dev/null @@ -1,73 +0,0 @@ -#pragma once - -#include - -#if USE_PARQUET - -#include -#include -#include - -namespace parquet -{ -class BloomFilter; -} - -namespace DB -{ - -class ParquetBloomFilterCondition -{ -public: - - struct ConditionElement - { - enum Function - { - /// Atoms of a Boolean 
expression. - FUNCTION_IN, - FUNCTION_NOT_IN, - /// Can take any value. - FUNCTION_UNKNOWN, - /// Operators of the logical expression. - FUNCTION_NOT, - FUNCTION_AND, - FUNCTION_OR, - /// Constants - ALWAYS_FALSE, - ALWAYS_TRUE, - }; - - using ColumnPtr = IColumn::Ptr; - using HashesForColumns = std::vector>; - using KeyColumns = std::vector; - - Function function; - // each entry represents a list of hashes per column - // suppose there are three columns with 2 rows each - // hashes_per_column.size() == 3 and hashes_per_column[0].size() == 2 - HashesForColumns hashes_per_column; - KeyColumns key_columns; - }; - - using RPNElement = KeyCondition::RPNElement; - using ColumnIndexToBF = std::unordered_map>; - - explicit ParquetBloomFilterCondition(const std::vector & condition_, const Block & header_); - - bool mayBeTrueOnRowGroup(const ColumnIndexToBF & column_index_to_column_bf) const; - std::unordered_set getFilteringColumnKeys() const; - -private: - std::vector condition; - Block header; -}; - -std::vector keyConditionRPNToParquetBloomFilterCondition( - const std::vector & rpn, - const std::vector & clickhouse_column_index_to_parquet_index, - const std::unique_ptr & parquet_rg_metadata); - -} - -#endif diff --git a/src/Processors/Formats/Impl/Parquet/ParquetFilterCondition.cpp b/src/Processors/Formats/Impl/Parquet/ParquetFilterCondition.cpp deleted file mode 100644 index 7a43128e7ae..00000000000 --- a/src/Processors/Formats/Impl/Parquet/ParquetFilterCondition.cpp +++ /dev/null @@ -1,799 +0,0 @@ -#include - -#if USE_PARQUET - -#include -#include -#include -#include -#include -#include -#include - -namespace DB -{ - -static Field applyFunctionForField( - const FunctionBasePtr & func, - const DataTypePtr & arg_type, - const Field & arg_value) -{ - ColumnsWithTypeAndName columns - { - { arg_type->createColumnConst(1, arg_value), arg_type, "x" }, - }; - - auto col = func->execute(columns, func->getResultType(), 1); - return (*col)[0]; -} - -/// applyFunction will 
execute the function with one `field` or the column which `field` refers to. -static FieldRef applyFunction(const FunctionBasePtr & func, const DataTypePtr & current_type, const FieldRef & field) -{ - chassert(func != nullptr); - /// Fallback for fields without block reference. - if (field.isExplicit()) - return applyFunctionForField(func, current_type, field); - - /// We will cache the function result inside `field.columns`, because this function will call many times - /// from many fields from same column. When the column is huge, for example there are thousands of marks, we need a cache. - /// The cache key is like `_[function_pointer]_[param_column_id]` to identify a unique pair. - WriteBufferFromOwnString buf; - writeText("_", buf); - writePointerHex(func.get(), buf); - writeText("_" + toString(field.column_idx), buf); - String result_name = buf.str(); - const auto & columns = field.columns; - size_t result_idx = columns->size(); - - for (size_t i = 0; i < result_idx; ++i) - { - if ((*columns)[i].name == result_name) - result_idx = i; - } - - if (result_idx == columns->size()) - { - /// When cache is missed, we calculate the whole column where the field comes from. This will avoid repeated calculation. - ColumnsWithTypeAndName args{(*columns)[field.column_idx]}; - field.columns->emplace_back(ColumnWithTypeAndName {nullptr, func->getResultType(), result_name}); - (*columns)[result_idx].column = func->execute(args, (*columns)[result_idx].type, columns->front().column->size()); - } - - return {field.columns, field.row_idx, result_idx}; -} - -std::optional applyMonotonicFunctionsChainToRange( - Range key_range, - const KeyCondition::MonotonicFunctionsChain & functions, - DataTypePtr current_type, - bool single_point) -{ - for (const auto & func : functions) - { - /// We check the monotonicity of each function on a specific range. - /// If we know the given range only contains one value, then we treat all functions as positive monotonic. 
- IFunction::Monotonicity monotonicity = single_point - ? IFunction::Monotonicity{true} - : func->getMonotonicityForRange(*current_type.get(), key_range.left, key_range.right); - - if (!monotonicity.is_monotonic) - { - return {}; - } - - /// If we apply function to open interval, we can get empty intervals in result. - /// E.g. for ('2020-01-03', '2020-01-20') after applying 'toYYYYMM' we will get ('202001', '202001'). - /// To avoid this we make range left and right included. - /// Any function that treats NULL specially is not monotonic. - /// Thus we can safely use isNull() as an -Inf/+Inf indicator here. - if (!key_range.left.isNull()) - { - key_range.left = applyFunction(func, current_type, key_range.left); - key_range.left_included = true; - } - - if (!key_range.right.isNull()) - { - key_range.right = applyFunction(func, current_type, key_range.right); - key_range.right_included = true; - } - - current_type = func->getResultType(); - - if (!monotonicity.is_positive) - key_range.invert(); - } - return key_range; -} - -const parquet::ColumnDescriptor * getColumnDescriptorIfBloomFilterIsPresent( - const std::unique_ptr & parquet_rg_metadata, - const std::vector & clickhouse_column_index_to_parquet_index, - std::size_t clickhouse_column_index) -{ - if (clickhouse_column_index_to_parquet_index.size() <= clickhouse_column_index) - { - return nullptr; - } - - const auto & parquet_indexes = clickhouse_column_index_to_parquet_index[clickhouse_column_index].parquet_indexes; - - // complex types like structs, tuples and maps will have more than one index. 
- // we don't support those for now - if (parquet_indexes.size() > 1) - { - return nullptr; - } - - if (parquet_indexes.empty()) - { - throw Exception(ErrorCodes::LOGICAL_ERROR, "Something bad happened, raise an issue and try the query with `input_format_parquet_bloom_filter_push_down=false`"); - } - - auto parquet_column_index = parquet_indexes[0]; - - const auto * parquet_column_descriptor = parquet_rg_metadata->schema()->Column(parquet_column_index); - - bool column_has_bloom_filter = parquet_rg_metadata->ColumnChunk(parquet_column_index)->bloom_filter_offset().has_value(); - if (!column_has_bloom_filter) - { - return nullptr; - } - - return parquet_column_descriptor; -} - - -bool isParquetStringTypeSupportedForBloomFilters( - const std::shared_ptr & logical_type, - parquet::ConvertedType::type converted_type) -{ - if (logical_type && - !logical_type->is_none() - && !(logical_type->is_string() || logical_type->is_BSON() || logical_type->is_JSON())) - { - return false; - } - - if (parquet::ConvertedType::type::NONE != converted_type && - !(converted_type == parquet::ConvertedType::JSON || converted_type == parquet::ConvertedType::UTF8 - || converted_type == parquet::ConvertedType::BSON)) - { - return false; - } - - return true; -} - -bool isParquetIntegerTypeSupportedForBloomFilters(const std::shared_ptr & logical_type, parquet::ConvertedType::type converted_type) -{ - if (logical_type && !logical_type->is_none() && !logical_type->is_int()) - { - return false; - } - - if (parquet::ConvertedType::type::NONE != converted_type && !(converted_type == parquet::ConvertedType::INT_8 || converted_type == parquet::ConvertedType::INT_16 - || converted_type == parquet::ConvertedType::INT_32 || converted_type == parquet::ConvertedType::INT_64 - || converted_type == parquet::ConvertedType::UINT_8 || converted_type == parquet::ConvertedType::UINT_16 - || converted_type == parquet::ConvertedType::UINT_32 || converted_type == parquet::ConvertedType::UINT_64)) - { - return false; 
- } - - return true; -} - -template -uint64_t hashSpecialFLBATypes(const Field & field) -{ - const T & value = field.safeGet(); - - parquet::FLBA flba(reinterpret_cast(&value)); - - parquet::XxHasher hasher; - - return hasher.Hash(&flba, sizeof(T)); -}; - -std::optional tryHashStringWithoutCompatibilityCheck(const Field & field) -{ - const auto field_type = field.getType(); - - if (field_type != Field::Types::Which::String) - { - return std::nullopt; - } - - parquet::XxHasher hasher; - parquet::ByteArray ba { field.safeGet() }; - - return hasher.Hash(&ba); -} - -std::optional tryHashString( - const Field & field, - const std::shared_ptr & logical_type, - parquet::ConvertedType::type converted_type) -{ - if (!isParquetStringTypeSupportedForBloomFilters(logical_type, converted_type)) - { - return std::nullopt; - } - - return tryHashStringWithoutCompatibilityCheck(field); -} - -std::optional tryHashFLBA( - const Field & field, - const std::shared_ptr & logical_type, - parquet::ConvertedType::type converted_type, - std::size_t parquet_column_length) -{ - if (!isParquetStringTypeSupportedForBloomFilters(logical_type, converted_type)) - { - return std::nullopt; - } - - const auto field_type = field.getType(); - - if (field_type == Field::Types::Which::IPv6 && parquet_column_length == sizeof(IPv6)) - { - return hashSpecialFLBATypes(field); - } - - return tryHashStringWithoutCompatibilityCheck(field); -} - -template -std::optional tryHashInt(const Field & field, const std::shared_ptr & logical_type, parquet::ConvertedType::type converted_type) -{ - if (!isParquetIntegerTypeSupportedForBloomFilters(logical_type, converted_type)) - { - return std::nullopt; - } - - parquet::XxHasher hasher; - - if (field.getType() == Field::Types::Which::Int64) - { - return hasher.Hash(static_cast(field.safeGet())); - } - else if (field.getType() == Field::Types::Which::UInt64) - { - return hasher.Hash(static_cast(field.safeGet())); - } - else if (field.getType() == Field::Types::IPv4) - { - 
/* - * In theory, we could accept IPv4 over 64 bits variables. It would only be a problem in case it was hashed using the byte array api - * with a zero-ed buffer that had a 32 bits variable copied into it. - * - * To be on the safe side, accept only in case physical type is 32 bits. - * */ - if constexpr (std::is_same_v) - { - return hasher.Hash(static_cast(field.safeGet())); - } - } - - return std::nullopt; -} - -std::optional tryHash(const Field & field, const parquet::ColumnDescriptor * parquet_column_descriptor) -{ - const auto physical_type = parquet_column_descriptor->physical_type(); - const auto & logical_type = parquet_column_descriptor->logical_type(); - const auto converted_type = parquet_column_descriptor->converted_type(); - - switch (physical_type) - { - case parquet::Type::type::INT32: - return tryHashInt(field, logical_type, converted_type); - case parquet::Type::type::INT64: - return tryHashInt(field, logical_type, converted_type); - case parquet::Type::type::BYTE_ARRAY: - return tryHashString(field, logical_type, converted_type); - case parquet::Type::type::FIXED_LEN_BYTE_ARRAY: - return tryHashFLBA(field, logical_type, converted_type, parquet_column_descriptor->type_length()); - default: - return std::nullopt; - } -} - -std::optional> hash(const IColumn * data_column, const parquet::ColumnDescriptor * parquet_column_descriptor) -{ - std::vector hashes; - - for (size_t i = 0u; i < data_column->size(); i++) - { - Field f; - data_column->get(i, f); - - auto hashed_value = tryHash(f, parquet_column_descriptor); - - if (!hashed_value) - { - return std::nullopt; - } - - hashes.emplace_back(*hashed_value); - } - - return hashes; -} - -bool maybeTrueOnBloomFilter(const std::vector & hashes, const std::unique_ptr & bloom_filter) -{ - for (const auto hash : hashes) - { - if (bloom_filter->FindHash(hash)) - { - return true; - } - } - - return false; -} - -bool mayBeTrueOnParquetRowGroup(const ParquetFilterCondition::BloomFilterData & 
condition_bloom_filter_data, - const ParquetBloomFilterCondition::ColumnIndexToBF & column_index_to_column_bf) -{ - bool maybe_true = true; - for (auto column_index = 0u; column_index < condition_bloom_filter_data.hashes_per_column.size(); column_index++) - { - // in case bloom filter is not present for this row group - // https://github.com/ClickHouse/ClickHouse/pull/62966#discussion_r1722361237 - if (!column_index_to_column_bf.contains(condition_bloom_filter_data.key_columns[column_index])) - { - continue; - } - - bool column_maybe_contains = maybeTrueOnBloomFilter( - condition_bloom_filter_data.hashes_per_column[column_index], - column_index_to_column_bf.at(condition_bloom_filter_data.key_columns[column_index])); - - if (!column_maybe_contains) - { - maybe_true = false; - break; - } - } - - return maybe_true; -} - -std::vector abcdefgh( - const std::vector & rpn, - const std::vector & clickhouse_column_index_to_parquet_index, - const std::unique_ptr & parquet_rg_metadata) -{ - std::vector condition_elements; - - using RPNElement = KeyCondition::RPNElement; - - for (const auto & rpn_element : rpn) - { - condition_elements.emplace_back(rpn_element); - // this would be a problem for `where negate(x) = -58`. - // It would perform a bf search on `-58`, and possibly miss row groups containing this data. 
- if (!rpn_element.monotonic_functions_chain.empty()) - { - continue; - } - - ParquetBloomFilterCondition::ConditionElement::HashesForColumns hashes; - - if (rpn_element.function == RPNElement::FUNCTION_IN_RANGE - || rpn_element.function == RPNElement::FUNCTION_NOT_IN_RANGE) - { - // Only FUNCTION_EQUALS is supported and for that extremes need to be the same - if (rpn_element.range.left != rpn_element.range.right) - { - continue; - } - - const auto * parquet_column_descriptor = - getColumnDescriptorIfBloomFilterIsPresent(parquet_rg_metadata, clickhouse_column_index_to_parquet_index, rpn_element.key_column); - - if (!parquet_column_descriptor) - { - continue; - } - - auto hashed_value = tryHash(rpn_element.range.left, parquet_column_descriptor); - - if (!hashed_value) - { - continue; - } - - std::vector hashes_for_column; - hashes_for_column.emplace_back(*hashed_value); - - hashes.emplace_back(std::move(hashes_for_column)); - - std::vector key_columns; - key_columns.emplace_back(rpn_element.key_column); - - condition_elements.back().bloom_filter_data = ParquetFilterCondition::BloomFilterData {std::move(hashes), std::move(key_columns)}; - } - else if (rpn_element.function == RPNElement::FUNCTION_IN_SET - || rpn_element.function == RPNElement::FUNCTION_NOT_IN_SET) - { - const auto & set_index = rpn_element.set_index; - const auto & ordered_set = set_index->getOrderedSet(); - const auto & indexes_mapping = set_index->getIndexesMapping(); - bool found_empty_column = false; - - std::vector key_columns; - - for (auto i = 0u; i < ordered_set.size(); i++) - { - const auto & set_column = ordered_set[i]; - - const auto * parquet_column_descriptor = getColumnDescriptorIfBloomFilterIsPresent( - parquet_rg_metadata, - clickhouse_column_index_to_parquet_index, - indexes_mapping[i].key_index); - - if (!parquet_column_descriptor) - { - continue; - } - - auto column = set_column; - - if (column->empty()) - { - found_empty_column = true; - break; - } - - if (const auto & 
nullable_column = checkAndGetColumn(set_column.get())) - { - column = nullable_column->getNestedColumnPtr(); - } - - auto hashes_for_column_opt = hash(column.get(), parquet_column_descriptor); - - if (!hashes_for_column_opt) - { - continue; - } - - auto & hashes_for_column = *hashes_for_column_opt; - - if (hashes_for_column.empty()) - { - continue; - } - - hashes.emplace_back(hashes_for_column); - - key_columns.push_back(indexes_mapping[i].key_index); - } - - if (found_empty_column) - { - // todo arthur - continue; - } - - if (hashes.empty()) - { - continue; - } - - condition_elements.back().bloom_filter_data = {std::move(hashes), std::move(key_columns)}; - } - } - - return condition_elements; -} - -BoolMask ParquetFilterCondition::check(const std::vector & rpn, - const Hyperrectangle & hyperrectangle, - const KeyCondition::SpaceFillingCurveDescriptions & key_space_filling_curves, - const DataTypes & data_types, - const ParquetBloomFilterCondition::ColumnIndexToBF & column_index_to_column_bf, - bool single_point) -{ - std::vector rpn_stack; - - auto curve_type = [&](size_t key_column_pos) - { - for (const auto & curve : key_space_filling_curves) - if (curve.key_column_pos == key_column_pos) - return curve.type; - return KeyCondition::SpaceFillingCurveType::Unknown; - }; - - for (const auto & element : rpn) - { - if (element.argument_num_of_space_filling_curve.has_value()) - { - // todo arthur, not sure what to do here yet - /// If a condition on argument of a space filling curve wasn't collapsed into FUNCTION_ARGS_IN_HYPERRECTANGLE, - /// we cannot process it. 
- rpn_stack.emplace_back(true, true); - } - else if (element.function == ConditionElement::FUNCTION_UNKNOWN) - { - rpn_stack.emplace_back(true, true); - } - else if (element.function == ConditionElement::FUNCTION_IN_RANGE - || element.function == ConditionElement::FUNCTION_NOT_IN_RANGE) - { - if (element.key_column >= hyperrectangle.size()) - { - throw Exception(ErrorCodes::LOGICAL_ERROR, - "Hyperrectangle size is {}, but requested element at posittion {} ({})", - hyperrectangle.size(), element.key_column, element.toString()); - } - - const Range * key_range = &hyperrectangle[element.key_column]; - - /// The case when the column is wrapped in a chain of possibly monotonic functions. - Range transformed_range = Range::createWholeUniverse(); - if (!element.monotonic_functions_chain.empty()) - { - std::optional new_range = applyMonotonicFunctionsChainToRange( - *key_range, - element.monotonic_functions_chain, - data_types[element.key_column], - single_point - ); - - if (!new_range) - { - rpn_stack.emplace_back(true, true); - - if (element.bloom_filter_data) - { - rpn_stack.back().can_be_true = mayBeTrueOnParquetRowGroup(*element.bloom_filter_data, column_index_to_column_bf); - } - - continue; - } - transformed_range = *new_range; - key_range = &transformed_range; - } - - bool intersects = element.range.intersectsRange(*key_range); - bool contains = element.range.containsRange(*key_range); - - rpn_stack.emplace_back(intersects, !contains); - - if (rpn_stack.back().can_be_true && element.bloom_filter_data) - { - rpn_stack.back().can_be_true = mayBeTrueOnParquetRowGroup(*element.bloom_filter_data, column_index_to_column_bf); - } - - if (element.function == ConditionElement::FUNCTION_NOT_IN_RANGE) - rpn_stack.back() = !rpn_stack.back(); - } - else if (element.function == ConditionElement::FUNCTION_ARGS_IN_HYPERRECTANGLE) - { - /** The case of space-filling curves. 
- * We unpack the range of a space filling curve into hyperrectangles of their arguments, - * and then check the intersection of them with the given hyperrectangle from the key condition. - * - * Note: you might find this code hard to understand, - * because there are three different hyperrectangles involved: - * - * 1. A hyperrectangle derived from the range of the table's sparse index (marks granule): `hyperrectangle` - * We analyze its dimension `key_range`, corresponding to the `key_column`. - * For example, the table's key is a single column `mortonEncode(x, y)`, - * the current granule is [500, 600], and it means that - * mortonEncode(x, y) in [500, 600] - * - * 2. A hyperrectangle derived from the key condition, e.g. - * `x >= 10 AND x <= 20 AND y >= 20 AND y <= 30` defines: (x, y) in [10, 20] × [20, 30] - * - * 3. A set of hyperrectangles that we obtain by inverting the space-filling curve on the range: - * From mortonEncode(x, y) in [500, 600] - * We get (x, y) in [30, 31] × [12, 13] - * or (x, y) in [28, 31] × [14, 15]; - * or (x, y) in [0, 7] × [16, 23]; - * or (x, y) in [8, 11] × [16, 19]; - * or (x, y) in [12, 15] × [16, 17]; - * or (x, y) in [12, 12] × [18, 18]; - * - * And we analyze the intersection of (2) and (3). - */ - - Range key_range = hyperrectangle[element.key_column]; - - /// The only possible result type of a space filling curve is UInt64. - /// We also only check bounded ranges. - if (key_range.left.getType() == Field::Types::UInt64 - && key_range.right.getType() == Field::Types::UInt64) - { - key_range.shrinkToIncludedIfPossible(); - - size_t num_dimensions = element.space_filling_curve_args_hyperrectangle.size(); - - /// Let's support only the case of 2d, because I'm not confident in other cases. 
- if (num_dimensions == 2) - { - UInt64 left = key_range.left.safeGet(); - UInt64 right = key_range.right.safeGet(); - - BoolMask mask(false, true); - auto hyperrectangle_intersection_callback = [&](std::array, 2> curve_hyperrectangle) - { - BoolMask current_intersection(true, false); - for (size_t dim = 0; dim < num_dimensions; ++dim) - { - const Range & condition_arg_range = element.space_filling_curve_args_hyperrectangle[dim]; - - const Range curve_arg_range( - curve_hyperrectangle[dim].first, true, - curve_hyperrectangle[dim].second, true); - - bool intersects = condition_arg_range.intersectsRange(curve_arg_range); - bool contains = condition_arg_range.containsRange(curve_arg_range); - - current_intersection = current_intersection & BoolMask(intersects, !contains); - } - - mask = mask | current_intersection; - }; - - switch (curve_type(element.key_column)) - { - case KeyCondition::SpaceFillingCurveType::Hilbert: - { - hilbertIntervalToHyperrectangles2D(left, right, hyperrectangle_intersection_callback); - break; - } - case KeyCondition::SpaceFillingCurveType::Morton: - { - mortonIntervalToHyperrectangles<2>(left, right, hyperrectangle_intersection_callback); - break; - } - case KeyCondition::SpaceFillingCurveType::Unknown: - { - throw Exception(ErrorCodes::LOGICAL_ERROR, "curve_type is `Unknown`. It is a bug."); - } - } - - rpn_stack.emplace_back(mask); - } - else - rpn_stack.emplace_back(true, true); - } - else - rpn_stack.emplace_back(true, true); - - /** Note: we can consider implementing a simpler solution, based on "hidden keys". 
- * It means, when we have a table's key like (a, b, mortonCurve(x, y)) - * we extract the arguments from the curves, and append them to the key, - * imagining that we have the key (a, b, mortonCurve(x, y), x, y) - * - * Then while we analyze the granule's range between (a, b, mortonCurve(x, y)) - * and decompose it to the series of hyperrectangles, - * we can construct a series of hyperrectangles of the extended key (a, b, mortonCurve(x, y), x, y), - * and then do everything as usual. - * - * This approach is generalizable to any functions, that have preimage of interval - * represented by a set of hyperrectangles. - */ - } - else if (element.function == ConditionElement::FUNCTION_POINT_IN_POLYGON) - { - /** There are 2 kinds of polygons: - * 1. Polygon by minmax index - * 2. Polygons which is provided by user - * - * Polygon by minmax index: - * For hyperactangle [1, 2] × [3, 4] we can create a polygon with 4 points: (1, 3), (1, 4), (2, 4), (2, 3) - * - * Algorithm: - * Check whether there is any intersection of the 2 polygons. If true return {true, true}, else return {false, true}. 
- */ - const auto & key_column_positions = element.point_in_polygon_column_description->key_column_positions; - - Float64 x_min = applyVisitor(FieldVisitorConvertToNumber(), hyperrectangle[key_column_positions[0]].left); - Float64 x_max = applyVisitor(FieldVisitorConvertToNumber(), hyperrectangle[key_column_positions[0]].right); - Float64 y_min = applyVisitor(FieldVisitorConvertToNumber(), hyperrectangle[key_column_positions[1]].left); - Float64 y_max = applyVisitor(FieldVisitorConvertToNumber(), hyperrectangle[key_column_positions[1]].right); - - if (unlikely(isNaN(x_min) || isNaN(x_max) || isNaN(y_min) || isNaN(y_max))) - { - rpn_stack.emplace_back(true, true); - continue; - } - - using Point = boost::geometry::model::d2::point_xy; - using Polygon = boost::geometry::model::polygon; - Polygon polygon_by_minmax_index; - polygon_by_minmax_index.outer().emplace_back(x_min, y_min); - polygon_by_minmax_index.outer().emplace_back(x_min, y_max); - polygon_by_minmax_index.outer().emplace_back(x_max, y_max); - polygon_by_minmax_index.outer().emplace_back(x_max, y_min); - - /// Close ring - boost::geometry::correct(polygon_by_minmax_index); - - /// Because the polygon may have a hole so the "can_be_false" should always be true. - rpn_stack.emplace_back( - boost::geometry::intersects(polygon_by_minmax_index, element.polygon), true); - } - else if ( - element.function == ConditionElement::FUNCTION_IS_NULL - || element.function == ConditionElement::FUNCTION_IS_NOT_NULL) - { - const Range * key_range = &hyperrectangle[element.key_column]; - - /// No need to apply monotonic functions as nulls are kept. 
- bool intersects = element.range.intersectsRange(*key_range); - bool contains = element.range.containsRange(*key_range); - - rpn_stack.emplace_back(intersects, !contains); - if (element.function == ConditionElement::FUNCTION_IS_NULL) - rpn_stack.back() = !rpn_stack.back(); - } - else if ( - element.function == ConditionElement::FUNCTION_IN_SET - || element.function == ConditionElement::FUNCTION_NOT_IN_SET) - { - if (!element.set_index) - throw Exception(ErrorCodes::LOGICAL_ERROR, "Set for IN is not created yet"); - - rpn_stack.emplace_back(element.set_index->checkInRange(hyperrectangle, data_types, single_point)); - - if (rpn_stack.back().can_be_true && element.bloom_filter_data) - { - rpn_stack.back().can_be_true = mayBeTrueOnParquetRowGroup(*element.bloom_filter_data, column_index_to_column_bf); - } - - if (element.function == ConditionElement::FUNCTION_NOT_IN_SET) - rpn_stack.back() = !rpn_stack.back(); - } - else if (element.function == ConditionElement::FUNCTION_NOT) - { - assert(!rpn_stack.empty()); - - rpn_stack.back() = !rpn_stack.back(); - } - else if (element.function == ConditionElement::FUNCTION_AND) - { - assert(!rpn_stack.empty()); - - auto arg1 = rpn_stack.back(); - rpn_stack.pop_back(); - auto arg2 = rpn_stack.back(); - rpn_stack.back() = arg1 & arg2; - } - else if (element.function == ConditionElement::FUNCTION_OR) - { - assert(!rpn_stack.empty()); - - auto arg1 = rpn_stack.back(); - rpn_stack.pop_back(); - auto arg2 = rpn_stack.back(); - rpn_stack.back() = arg1 | arg2; - } - else if (element.function == ConditionElement::ALWAYS_FALSE) - { - rpn_stack.emplace_back(false, true); - } - else if (element.function == ConditionElement::ALWAYS_TRUE) - { - rpn_stack.emplace_back(true, false); - } - else - throw Exception(ErrorCodes::LOGICAL_ERROR, "Unexpected function type in KeyCondition::ConditionElement"); - } - - if (rpn_stack.size() != 1) - throw Exception(ErrorCodes::LOGICAL_ERROR, "Unexpected stack size in KeyCondition::checkInHyperrectangle"); - - 
return rpn_stack[0]; -} - -} - -#endif diff --git a/src/Processors/Formats/Impl/Parquet/ParquetFilterCondition.h b/src/Processors/Formats/Impl/Parquet/ParquetFilterCondition.h deleted file mode 100644 index 0edb48e0f03..00000000000 --- a/src/Processors/Formats/Impl/Parquet/ParquetFilterCondition.h +++ /dev/null @@ -1,44 +0,0 @@ -#pragma once - -#include - -#if USE_PARQUET - -#include -#include - -namespace DB -{ - -class ParquetFilterCondition -{ -public: - - struct BloomFilterData - { - using HashesForColumns = std::vector>; - HashesForColumns hashes_per_column; - std::vector key_columns; - }; - - struct ConditionElement : public KeyCondition::RPNElement - { - std::optional bloom_filter_data; - }; - - static BoolMask check(const std::vector & RPN, - const Hyperrectangle & hyperrectangle, - const KeyCondition::SpaceFillingCurveDescriptions & key_space_filling_curves, - const DataTypes & data_types, - const ParquetBloomFilterCondition::ColumnIndexToBF & column_index_to_column_bf, - bool single_point); -}; - -std::vector abcdefgh( - const std::vector & rpn, - const std::vector & clickhouse_column_index_to_parquet_index, - const std::unique_ptr & parquet_rg_metadata); - -} - -#endif diff --git a/src/Processors/Formats/Impl/Parquet/ParquetBloomFilterCondition.cpp b/src/Processors/Formats/Impl/Parquet/keyConditionRPNToParquetRPN.cpp similarity index 58% rename from src/Processors/Formats/Impl/Parquet/ParquetBloomFilterCondition.cpp rename to src/Processors/Formats/Impl/Parquet/keyConditionRPNToParquetRPN.cpp index 75eeb15a519..406ac13faf3 100644 --- a/src/Processors/Formats/Impl/Parquet/ParquetBloomFilterCondition.cpp +++ b/src/Processors/Formats/Impl/Parquet/keyConditionRPNToParquetRPN.cpp @@ -1,23 +1,51 @@ -#include -#include +#include #if USE_PARQUET -#include +#include #include -#include -#include + namespace DB { -namespace ErrorCodes +const parquet::ColumnDescriptor * getColumnDescriptorIfBloomFilterIsPresent( + const std::unique_ptr & parquet_rg_metadata, + const 
std::vector & clickhouse_column_index_to_parquet_index, + std::size_t clickhouse_column_index) { - extern const int LOGICAL_ERROR; + if (clickhouse_column_index_to_parquet_index.size() <= clickhouse_column_index) + { + return nullptr; + } + + const auto & parquet_indexes = clickhouse_column_index_to_parquet_index[clickhouse_column_index].parquet_indexes; + + // complex types like structs, tuples and maps will have more than one index. + // we don't support those for now + if (parquet_indexes.size() > 1) + { + return nullptr; + } + + if (parquet_indexes.empty()) + { + throw Exception(ErrorCodes::LOGICAL_ERROR, "Something bad happened, raise an issue and try the query with `input_format_parquet_bloom_filter_push_down=false`"); + } + + auto parquet_column_index = parquet_indexes[0]; + + const auto * parquet_column_descriptor = parquet_rg_metadata->schema()->Column(parquet_column_index); + + bool column_has_bloom_filter = parquet_rg_metadata->ColumnChunk(parquet_column_index)->bloom_filter_offset().has_value(); + if (!column_has_bloom_filter) + { + return nullptr; + } + + return parquet_column_descriptor; } -namespace -{ bool isParquetStringTypeSupportedForBloomFilters( const std::shared_ptr & logical_type, @@ -48,9 +76,9 @@ bool isParquetIntegerTypeSupportedForBloomFilters(const std::shared_ptr> hash(const IColumn * data_column, const par return hashes; } -bool maybeTrueOnBloomFilter(const std::vector & hashes, const std::unique_ptr & bloom_filter) +KeyCondition::RPN keyConditionRPNToParquetRPN(const std::vector & rpn, + const std::vector & clickhouse_column_index_to_parquet_index, + const std::unique_ptr & parquet_rg_metadata) { - for (const auto hash : hashes) - { - if (bloom_filter->FindHash(hash)) - { - return true; - } - } - - return false; -} - -const parquet::ColumnDescriptor * getColumnDescriptorIfBloomFilterIsPresent( - const std::unique_ptr & parquet_rg_metadata, - const std::vector & clickhouse_column_index_to_parquet_index, - std::size_t 
clickhouse_column_index) -{ - if (clickhouse_column_index_to_parquet_index.size() <= clickhouse_column_index) - { - return nullptr; - } - - const auto & parquet_indexes = clickhouse_column_index_to_parquet_index[clickhouse_column_index].parquet_indexes; - - // complex types like structs, tuples and maps will have more than one index. - // we don't support those for now - if (parquet_indexes.size() > 1) - { - return nullptr; - } - - if (parquet_indexes.empty()) - { - throw Exception(ErrorCodes::LOGICAL_ERROR, "Something bad happened, raise an issue and try the query with `input_format_parquet_bloom_filter_push_down=false`"); - } - - auto parquet_column_index = parquet_indexes[0]; - - const auto * parquet_column_descriptor = parquet_rg_metadata->schema()->Column(parquet_column_index); - - bool column_has_bloom_filter = parquet_rg_metadata->ColumnChunk(parquet_column_index)->bloom_filter_offset().has_value(); - if (!column_has_bloom_filter) - { - return nullptr; - } - - return parquet_column_descriptor; -} - -} - -ParquetBloomFilterCondition::ParquetBloomFilterCondition(const std::vector & condition_, const Block & header_) - : condition(condition_), header(header_) -{ -} - -bool ParquetBloomFilterCondition::mayBeTrueOnRowGroup(const ColumnIndexToBF & column_index_to_column_bf) const -{ - using Function = ConditionElement::Function; - std::vector rpn_stack; - - for (const auto & element : condition) - { - if (element.function == Function::FUNCTION_IN - || element.function == Function::FUNCTION_NOT_IN) - { - bool maybe_true = true; - for (auto column_index = 0u; column_index < element.hashes_per_column.size(); column_index++) - { - // in case bloom filter is not present for this row group - // https://github.com/ClickHouse/ClickHouse/pull/62966#discussion_r1722361237 - if (!column_index_to_column_bf.contains(element.key_columns[column_index])) - { - rpn_stack.emplace_back(true, true); - continue; - } - - bool column_maybe_contains = maybeTrueOnBloomFilter( - 
element.hashes_per_column[column_index], - column_index_to_column_bf.at(element.key_columns[column_index])); - - if (!column_maybe_contains) - { - maybe_true = false; - break; - } - } - - rpn_stack.emplace_back(maybe_true, true); - if (element.function == Function::FUNCTION_NOT_IN) - rpn_stack.back() = !rpn_stack.back(); - } - else if (element.function == Function::FUNCTION_NOT) - { - rpn_stack.back() = !rpn_stack.back(); - } - else if (element.function == Function::FUNCTION_OR) - { - auto arg1 = rpn_stack.back(); - rpn_stack.pop_back(); - auto arg2 = rpn_stack.back(); - rpn_stack.back() = arg1 | arg2; - } - else if (element.function == Function::FUNCTION_AND) - { - auto arg1 = rpn_stack.back(); - rpn_stack.pop_back(); - auto arg2 = rpn_stack.back(); - rpn_stack.back() = arg1 & arg2; - } - else if (element.function == Function::ALWAYS_TRUE) - { - rpn_stack.emplace_back(true, false); - } - else if (element.function == Function::ALWAYS_FALSE) - { - rpn_stack.emplace_back(false, true); - } - else - { - rpn_stack.emplace_back(true, true); - } - } - - if (rpn_stack.size() != 1) - throw Exception(ErrorCodes::LOGICAL_ERROR, "Unexpected stack size in KeyCondition::mayBeTrueOnRowGroup"); - - return rpn_stack[0].can_be_true; -} - -std::unordered_set ParquetBloomFilterCondition::getFilteringColumnKeys() const -{ - std::unordered_set column_keys; - - for (const auto & element : condition) - { - for (const auto index : element.key_columns) - { - column_keys.insert(index); - } - } - - return column_keys; -} - -/* - * `KeyCondition::rpn` is overly complex for bloom filters, some operations are not even supported. Not only that, but to avoid hashing each time - * we loop over a rpn element, we need to store hashes instead of where predicate values. To address this, we loop over `KeyCondition::rpn` - * and build a simplified RPN that holds hashes instead of values. 
- * - * `KeyCondition::RPNElement::FUNCTION_IN_RANGE` becomes: - * `FUNCTION_IN` - * `FUNCTION_UNKNOWN` when range limits are different - * `KeyCondition::RPNElement::FUNCTION_IN_SET` becomes - * `FUNCTION_IN` - * - * Complex types and structs are not supported. - * There are two sources of data types being analyzed, and they need to be compatible: DB::Field type and parquet type. - * This is determined by the `isColumnSupported` method. - * - * Some interesting examples: - * 1. file(..., 'str_column UInt64') where str_column = 50; Field.type == UInt64. Parquet type string. Not supported. - * 2. file(...) where str_column = 50; Field.type == String (conversion already taken care by `KeyCondition`). Parquet type string. - * 3. file(...) where uint32_column = toIPv4(5). Field.type == IPv4. Incompatible column types, resolved by `KeyCondition` itself. - * 4. file(...) where toIPv4(uint32_column) = toIPv4(5). Field.type == IPv4. We know it is safe to hash it using an int32 API. - * */ -std::vector keyConditionRPNToParquetBloomFilterCondition( - const std::vector & rpn, - const std::vector & clickhouse_column_index_to_parquet_index, - const std::unique_ptr & parquet_rg_metadata) -{ - std::vector condition_elements; + std::vector condition_elements; using RPNElement = KeyCondition::RPNElement; - using Function = ParquetBloomFilterCondition::ConditionElement::Function; for (const auto & rpn_element : rpn) { + condition_elements.emplace_back(rpn_element); // this would be a problem for `where negate(x) = -58`. // It would perform a bf search on `-58`, and possibly miss row groups containing this data. 
if (!rpn_element.monotonic_functions_chain.empty()) { - condition_elements.emplace_back(Function::FUNCTION_UNKNOWN); continue; } - ParquetBloomFilterCondition::ConditionElement::HashesForColumns hashes; + KeyCondition::BloomFilterData::HashesForColumns hashes; if (rpn_element.function == RPNElement::FUNCTION_IN_RANGE || rpn_element.function == RPNElement::FUNCTION_NOT_IN_RANGE) @@ -392,7 +251,6 @@ std::vector keyConditionRPNToParq // Only FUNCTION_EQUALS is supported and for that extremes need to be the same if (rpn_element.range.left != rpn_element.range.right) { - condition_elements.emplace_back(Function::FUNCTION_UNKNOWN); continue; } @@ -401,7 +259,6 @@ std::vector keyConditionRPNToParq if (!parquet_column_descriptor) { - condition_elements.emplace_back(Function::FUNCTION_UNKNOWN); continue; } @@ -409,7 +266,6 @@ std::vector keyConditionRPNToParq if (!hashed_value) { - condition_elements.emplace_back(Function::FUNCTION_UNKNOWN); continue; } @@ -418,14 +274,10 @@ std::vector keyConditionRPNToParq hashes.emplace_back(std::move(hashes_for_column)); - auto function = rpn_element.function == RPNElement::FUNCTION_IN_RANGE - ? 
ParquetBloomFilterCondition::ConditionElement::Function::FUNCTION_IN - : ParquetBloomFilterCondition::ConditionElement::Function::FUNCTION_NOT_IN; - std::vector key_columns; key_columns.emplace_back(rpn_element.key_column); - condition_elements.emplace_back(function, std::move(hashes), std::move(key_columns)); + condition_elements.back().bloom_filter_data = KeyCondition::BloomFilterData {std::move(hashes), std::move(key_columns)}; } else if (rpn_element.function == RPNElement::FUNCTION_IN_SET || rpn_element.function == RPNElement::FUNCTION_NOT_IN_SET) @@ -485,35 +337,16 @@ std::vector keyConditionRPNToParq if (found_empty_column) { - condition_elements.emplace_back(Function::ALWAYS_FALSE); + // todo arthur continue; } if (hashes.empty()) { - condition_elements.emplace_back(Function::FUNCTION_UNKNOWN); continue; } - auto function = RPNElement::FUNCTION_IN_SET == rpn_element.function ? Function::FUNCTION_IN : Function::FUNCTION_NOT_IN; - - condition_elements.emplace_back(function, hashes, key_columns); - } - else if (rpn_element.function == RPNElement::FUNCTION_NOT) - { - condition_elements.emplace_back(Function::FUNCTION_NOT); - } - else if (rpn_element.function == RPNElement::FUNCTION_OR) - { - condition_elements.emplace_back(Function::FUNCTION_OR); - } - else if (rpn_element.function == RPNElement::FUNCTION_AND) - { - condition_elements.emplace_back(Function::FUNCTION_AND); - } - else - { - condition_elements.emplace_back(Function::ALWAYS_TRUE); + condition_elements.back().bloom_filter_data = {std::move(hashes), std::move(key_columns)}; } } diff --git a/src/Processors/Formats/Impl/Parquet/keyConditionRPNToParquetRPN.h b/src/Processors/Formats/Impl/Parquet/keyConditionRPNToParquetRPN.h new file mode 100644 index 00000000000..3b206a4ff0e --- /dev/null +++ b/src/Processors/Formats/Impl/Parquet/keyConditionRPNToParquetRPN.h @@ -0,0 +1,24 @@ +#pragma once + +#include + +#if USE_PARQUET + +#include +#include + +namespace parquet +{ + class RowGroupMetadata; +} + 
+namespace DB +{ + +KeyCondition::RPN keyConditionRPNToParquetRPN(const std::vector & rpn, + const std::vector & clickhouse_column_index_to_parquet_index, + const std::unique_ptr & parquet_rg_metadata); + +} + +#endif diff --git a/src/Processors/Formats/Impl/ParquetBlockInputFormat.cpp b/src/Processors/Formats/Impl/ParquetBlockInputFormat.cpp index 92bec6c4aca..ee9f2acadfe 100644 --- a/src/Processors/Formats/Impl/ParquetBlockInputFormat.cpp +++ b/src/Processors/Formats/Impl/ParquetBlockInputFormat.cpp @@ -27,8 +27,7 @@ #include #include #include -#include -#include +#include #include namespace ProfileEvents @@ -276,7 +275,21 @@ static Field decodePlainParquetValueSlow(const std::string & data, parquet::Type return field; } -static ParquetBloomFilterCondition::ColumnIndexToBF buildColumnIndexToBF( +struct ParquetBloomFilter : public KeyCondition::BloomFilter +{ + ParquetBloomFilter(std::unique_ptr && parquet_bf_) + : parquet_bf(std::move(parquet_bf_)) {} + + bool findHash(uint64_t hash) override + { + return parquet_bf->FindHash(hash); + } + +private: + std::unique_ptr parquet_bf; +}; + +static KeyCondition::ColumnIndexToBloomFilter buildColumnIndexToBF( parquet::BloomFilterReader & bf_reader, int row_group, const std::vector & clickhouse_column_index_to_parquet_index, @@ -290,7 +303,7 @@ static ParquetBloomFilterCondition::ColumnIndexToBF buildColumnIndexToBF( return {}; } - ParquetBloomFilterCondition::ColumnIndexToBF index_to_column_bf; + KeyCondition::ColumnIndexToBloomFilter index_to_column_bf; for (const auto & [clickhouse_index, parquet_indexes] : clickhouse_column_index_to_parquet_index) { @@ -307,14 +320,14 @@ static ParquetBloomFilterCondition::ColumnIndexToBF buildColumnIndexToBF( auto parquet_index = parquet_indexes[0]; - auto bf = rg_bf->GetColumnBloomFilter(parquet_index); + auto parquet_bf = rg_bf->GetColumnBloomFilter(parquet_index); - if (!bf) + if (!parquet_bf) { continue; } - index_to_column_bf[clickhouse_index] = std::move(bf); + 
index_to_column_bf[clickhouse_index] = std::make_unique(std::move(parquet_bf)); } return index_to_column_bf; @@ -485,6 +498,24 @@ static std::vector getHyperrectangleForRowGroup(const parquet::FileMetaDa return hyperrectangle; } +std::unordered_set getBloomFilterFilteringColumnKeys(const KeyCondition::RPN & rpn) +{ + std::unordered_set column_keys; + + for (const auto & element : rpn) + { + if (auto bf_data = element.bloom_filter_data) + { + for (const auto index : bf_data->key_columns) + { + column_keys.insert(index); + } + } + } + + return column_keys; +} + ParquetBlockInputFormat::ParquetBlockInputFormat( ReadBuffer & buf, const Block & header_, @@ -578,72 +609,67 @@ void ParquetBlockInputFormat::initializeIfNeeded() return std::min(std::max(preferred_num_rows, MIN_ROW_NUM), static_cast(format_settings.parquet.max_block_size)); }; - std::unique_ptr parquet_bloom_filter_condition; - std::unordered_set filtering_columns; if (format_settings.parquet.bloom_filter_push_down && key_condition) { bf_reader = parquet::BloomFilterReader::Make(arrow_file, metadata, bf_reader_properties, nullptr); - const auto parquet_conditions = keyConditionRPNToParquetBloomFilterCondition( + const auto parquet_conditions = keyConditionRPNToParquetRPN( key_condition->getRPN(), index_mapping, metadata->RowGroup(0)); - parquet_bloom_filter_condition = std::make_unique(parquet_conditions, getPort().getHeader()); - filtering_columns = parquet_bloom_filter_condition->getFilteringColumnKeys(); + filtering_columns = getBloomFilterFilteringColumnKeys(parquet_conditions); } + auto skip_row_group_based_on_filters = [&](int row_group) + { + if (!format_settings.parquet.filter_push_down && !format_settings.parquet.bloom_filter_push_down) + { + return false; + } + + KeyCondition::RPN possibly_modified_rpn = key_condition->getRPN(); + KeyCondition::ColumnIndexToBloomFilter column_index_to_bloom_filter; + + const auto & header = getPort().getHeader(); + + std::vector hyperrectangle(header.columns(), 
Range::createWholeUniverse()); + + if (format_settings.parquet.filter_push_down) + { + hyperrectangle = getHyperrectangleForRowGroup(*metadata, row_group, header, format_settings);getHyperrectangleForRowGroup(*metadata, row_group, getPort().getHeader(), format_settings); + } + + if (format_settings.parquet.bloom_filter_push_down) + { + possibly_modified_rpn = keyConditionRPNToParquetRPN(key_condition->getRPN(), + index_mapping, + metadata->RowGroup(row_group)); + + column_index_to_bloom_filter = buildColumnIndexToBF(*bf_reader, row_group, index_mapping, filtering_columns); + } + + bool maybe_exists = KeyCondition::checkRPNAgainstHyperrectangle( + possibly_modified_rpn, + hyperrectangle, + key_condition->key_space_filling_curves, + getPort().getHeader().getDataTypes(), + key_condition->isSinglePoint(), + column_index_to_bloom_filter).can_be_true; + + return !maybe_exists; + }; + for (int row_group = 0; row_group < num_row_groups; ++row_group) { if (skip_row_groups.contains(row_group)) continue; - if (key_condition) + if (key_condition && skip_row_group_based_on_filters(row_group)) { - if (format_settings.parquet.filter_push_down && format_settings.parquet.bloom_filter_push_down) - { - auto parquet_rpn = abcdefgh(key_condition->getRPN(), - index_mapping, - metadata->RowGroup(row_group)); - auto hyperrectangle = getHyperrectangleForRowGroup(*metadata, row_group, getPort().getHeader(), format_settings); - - const auto column_index_to_bf = buildColumnIndexToBF(*bf_reader, row_group, index_mapping, filtering_columns); - - bool maybe_exists = ParquetFilterCondition::check( - parquet_rpn, - hyperrectangle, - key_condition->key_space_filling_curves, - getPort().getHeader().getDataTypes(), - column_index_to_bf, - key_condition->isSinglePoint()).can_be_true; - - if (!maybe_exists) - { - continue; - } - } - else if (format_settings.parquet.filter_push_down) - { - if (!key_condition - ->checkInHyperrectangle( - getHyperrectangleForRowGroup(*metadata, row_group, 
getPort().getHeader(), format_settings), - getPort().getHeader().getDataTypes()) - .can_be_true) - { - continue; - } - } - else if (format_settings.parquet.bloom_filter_push_down) - { - const auto column_index_to_bf = buildColumnIndexToBF(*bf_reader, row_group, index_mapping, filtering_columns); - - if (!parquet_bloom_filter_condition->mayBeTrueOnRowGroup(column_index_to_bf)) - { - continue; - } - } + continue; } // When single-threaded parsing, can prefetch row groups, so need to put all row groups in the same row_group_batch diff --git a/src/Storages/MergeTree/KeyCondition.cpp b/src/Storages/MergeTree/KeyCondition.cpp index 2363161fbd9..2bf18b4bc82 100644 --- a/src/Storages/MergeTree/KeyCondition.cpp +++ b/src/Storages/MergeTree/KeyCondition.cpp @@ -2983,6 +2983,57 @@ bool KeyCondition::extractPlainRanges(Ranges & ranges) const BoolMask KeyCondition::checkInHyperrectangle( const Hyperrectangle & hyperrectangle, const DataTypes & data_types) const +{ + return checkRPNAgainstHyperrectangle(rpn, hyperrectangle, key_space_filling_curves, data_types, single_point); +} + +bool mayExistOnBloomFilter(const std::vector & hashes, const std::unique_ptr & bloom_filter) +{ + for (const auto hash : hashes) + { + if (bloom_filter->findHash(hash)) + { + return true; + } + } + + return false; +} + +bool mayExistOnBloomFilter(const KeyCondition::BloomFilterData & condition_bloom_filter_data, + const KeyCondition::ColumnIndexToBloomFilter & column_index_to_column_bf) +{ + bool maybe_true = true; + for (auto column_index = 0u; column_index < condition_bloom_filter_data.hashes_per_column.size(); column_index++) + { + // in case bloom filter is not present for this row group + // https://github.com/ClickHouse/ClickHouse/pull/62966#discussion_r1722361237 + if (!column_index_to_column_bf.contains(condition_bloom_filter_data.key_columns[column_index])) + { + continue; + } + + bool column_maybe_contains = mayExistOnBloomFilter( + 
condition_bloom_filter_data.hashes_per_column[column_index], + column_index_to_column_bf.at(condition_bloom_filter_data.key_columns[column_index])); + + if (!column_maybe_contains) + { + maybe_true = false; + break; + } + } + + return maybe_true; +} + +BoolMask KeyCondition::checkRPNAgainstHyperrectangle( + const RPN & rpn, + const Hyperrectangle & hyperrectangle, + const KeyCondition::SpaceFillingCurveDescriptions & key_space_filling_curves, + const DataTypes & data_types, + bool single_point, + const ColumnIndexToBloomFilter & column_index_to_column_bf) { std::vector rpn_stack; @@ -2998,6 +3049,7 @@ BoolMask KeyCondition::checkInHyperrectangle( { if (element.argument_num_of_space_filling_curve.has_value()) { + // todo arthur, not sure what to do here yet /// If a condition on argument of a space filling curve wasn't collapsed into FUNCTION_ARGS_IN_HYPERRECTANGLE, /// we cannot process it. rpn_stack.emplace_back(true, true); @@ -3032,7 +3084,6 @@ BoolMask KeyCondition::checkInHyperrectangle( if (!new_range) { rpn_stack.emplace_back(true, true); - // aqui eu pergunto pro bloom filter continue; } transformed_range = *new_range; @@ -3042,9 +3093,13 @@ BoolMask KeyCondition::checkInHyperrectangle( bool intersects = element.range.intersectsRange(*key_range); bool contains = element.range.containsRange(*key_range); - // aqui eu pergunto pro bloom filter - rpn_stack.emplace_back(intersects, !contains); + + if (rpn_stack.back().can_be_true && element.bloom_filter_data) + { + rpn_stack.back().can_be_true = mayExistOnBloomFilter(*element.bloom_filter_data, column_index_to_column_bf); + } + if (element.function == RPNElement::FUNCTION_NOT_IN_RANGE) rpn_stack.back() = !rpn_stack.back(); } @@ -3217,8 +3272,13 @@ BoolMask KeyCondition::checkInHyperrectangle( if (!element.set_index) throw Exception(ErrorCodes::LOGICAL_ERROR, "Set for IN is not created yet"); - // aqui eu pergunto pro bloom filter rpn_stack.emplace_back(element.set_index->checkInRange(hyperrectangle, data_types, 
single_point)); + + if (rpn_stack.back().can_be_true && element.bloom_filter_data) + { + rpn_stack.back().can_be_true = mayExistOnBloomFilter(*element.bloom_filter_data, column_index_to_column_bf); + } + if (element.function == RPNElement::FUNCTION_NOT_IN_SET) rpn_stack.back() = !rpn_stack.back(); } diff --git a/src/Storages/MergeTree/KeyCondition.h b/src/Storages/MergeTree/KeyCondition.h index 00e741f549c..cca57e6dbba 100644 --- a/src/Storages/MergeTree/KeyCondition.h +++ b/src/Storages/MergeTree/KeyCondition.h @@ -148,6 +148,13 @@ public: /// TODO handle the cases when generate RPN. bool extractPlainRanges(Ranges & ranges) const; + struct BloomFilterData + { + using HashesForColumns = std::vector>; + HashesForColumns hashes_per_column; + std::vector key_columns; + }; + /// The expression is stored as Reverse Polish Notation. struct RPNElement { @@ -224,6 +231,8 @@ public: Polygon polygon; MonotonicFunctionsChain monotonic_functions_chain; + + std::optional bloom_filter_data; }; using RPN = std::vector; @@ -256,6 +265,23 @@ public: using SpaceFillingCurveDescriptions = std::vector; SpaceFillingCurveDescriptions key_space_filling_curves; + struct BloomFilter + { + virtual ~BloomFilter() = default; + + virtual bool findHash(uint64_t hash) = 0; + }; + + using ColumnIndexToBloomFilter = std::unordered_map>; + + static BoolMask checkRPNAgainstHyperrectangle( + const RPN & rpn, + const Hyperrectangle & hyperrectangle, + const KeyCondition::SpaceFillingCurveDescriptions & key_space_filling_curves, + const DataTypes & data_types, + bool single_point, + const ColumnIndexToBloomFilter & column_index_to_column_bf = {}); + bool isSinglePoint() const { return single_point; } private: From f66be67d0bbb7d1f49caf2e1a92664aae3255b99 Mon Sep 17 00:00:00 2001 From: Arthur Passos Date: Wed, 6 Nov 2024 09:28:02 -0300 Subject: [PATCH 4/8] trigger ci From a7c78bdf84ed3bf849f2aa249b8f44f8d81bd8ce Mon Sep 17 00:00:00 2001 From: Arthur Passos Date: Wed, 6 Nov 2024 10:40:03 -0300 Subject: 
[PATCH 5/8] extern logical_error --- .../Formats/Impl/Parquet/keyConditionRPNToParquetRPN.cpp | 5 +++++ 1 file changed, 5 insertions(+) diff --git a/src/Processors/Formats/Impl/Parquet/keyConditionRPNToParquetRPN.cpp b/src/Processors/Formats/Impl/Parquet/keyConditionRPNToParquetRPN.cpp index 406ac13faf3..441591896fc 100644 --- a/src/Processors/Formats/Impl/Parquet/keyConditionRPNToParquetRPN.cpp +++ b/src/Processors/Formats/Impl/Parquet/keyConditionRPNToParquetRPN.cpp @@ -9,6 +9,11 @@ namespace DB { +namespace ErrorCodes +{ + extern const int LOGICAL_ERROR; +} + const parquet::ColumnDescriptor * getColumnDescriptorIfBloomFilterIsPresent( const std::unique_ptr & parquet_rg_metadata, const std::vector & clickhouse_column_index_to_parquet_index, From b5d7f7847b75d3ff17ce8dc06857fb1f97a86bf0 Mon Sep 17 00:00:00 2001 From: Arthur Passos Date: Wed, 6 Nov 2024 12:43:49 -0300 Subject: [PATCH 6/8] update test --- .../03261_test_merge_parquet_bloom_filter_minmax_stats.sh | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/tests/queries/0_stateless/03261_test_merge_parquet_bloom_filter_minmax_stats.sh b/tests/queries/0_stateless/03261_test_merge_parquet_bloom_filter_minmax_stats.sh index db58f0b69e5..64082dffaa6 100755 --- a/tests/queries/0_stateless/03261_test_merge_parquet_bloom_filter_minmax_stats.sh +++ b/tests/queries/0_stateless/03261_test_merge_parquet_bloom_filter_minmax_stats.sh @@ -26,4 +26,4 @@ cp ${DATA_FILE} ${DATA_FILE_USER_PATH} # Therefore, bloom filter would determine `false or true` and minmax would determine `true or false`. Resulting in true. 
# Since both structures are now evaluated together, the row group should be skipped -${CLICKHOUSE_CLIENT} --query="select * from file('${DATA_FILE_USER_PATH}', Parquet) WHERE int8 = 3 or int8 > 5 FORMAT Json SETTINGS input_format_parquet_filter_push_down=true, input_format_parquet_bloom_filter_push_down=true;" +${CLICKHOUSE_CLIENT} --query="select * from file('${DATA_FILE_USER_PATH}', Parquet) WHERE int8 = 3 or int8 > 5 FORMAT Json SETTINGS input_format_parquet_filter_push_down=true, input_format_parquet_bloom_filter_push_down=true;" | jq 'del(.meta,.statistics.elapsed)' From 49923bb8456e3f0e7f12bd4d5f5dc3aec3e7e71e Mon Sep 17 00:00:00 2001 From: Arthur Passos Date: Wed, 6 Nov 2024 14:14:57 -0300 Subject: [PATCH 7/8] update test --- ...arquet_bloom_filter_minmax_stats.reference | 27 +++++-------------- 1 file changed, 6 insertions(+), 21 deletions(-) diff --git a/tests/queries/0_stateless/03261_test_merge_parquet_bloom_filter_minmax_stats.reference b/tests/queries/0_stateless/03261_test_merge_parquet_bloom_filter_minmax_stats.reference index 9d5dea4cc09..f501f539a80 100644 --- a/tests/queries/0_stateless/03261_test_merge_parquet_bloom_filter_minmax_stats.reference +++ b/tests/queries/0_stateless/03261_test_merge_parquet_bloom_filter_minmax_stats.reference @@ -1,23 +1,8 @@ { - "meta": - [ - { - "name": "int8", - "type": "Nullable(Int8)" - } - ], - - "data": - [ - - ], - - "rows": 0, - - "statistics": - { - "elapsed": 0.05269874, - "rows_read": 0, - "bytes_read": 0 - } + "data": [], + "rows": 0, + "statistics": { + "rows_read": 0, + "bytes_read": 0 + } } From f559037c4d936cf6ae996e6b07445adbda49fa46 Mon Sep 17 00:00:00 2001 From: Arthur Passos Date: Wed, 6 Nov 2024 16:29:19 -0300 Subject: [PATCH 8/8] explicit in constructor --- src/Processors/Formats/Impl/ParquetBlockInputFormat.cpp | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/Processors/Formats/Impl/ParquetBlockInputFormat.cpp b/src/Processors/Formats/Impl/ParquetBlockInputFormat.cpp index 
ee9f2acadfe..3be525ccb41 100644 --- a/src/Processors/Formats/Impl/ParquetBlockInputFormat.cpp +++ b/src/Processors/Formats/Impl/ParquetBlockInputFormat.cpp @@ -277,7 +277,7 @@ static Field decodePlainParquetValueSlow(const std::string & data, parquet::Type struct ParquetBloomFilter : public KeyCondition::BloomFilter { - ParquetBloomFilter(std::unique_ptr && parquet_bf_) + explicit ParquetBloomFilter(std::unique_ptr && parquet_bf_) : parquet_bf(std::move(parquet_bf_)) {} bool findHash(uint64_t hash) override