From 5c316ffabeed85f5283498fa5fdd37f28c016b4f Mon Sep 17 00:00:00 2001 From: Anton Popov Date: Tue, 15 Feb 2022 14:26:16 +0300 Subject: [PATCH] support filtering by sparse columns without conversion to full --- src/Columns/FilterDescription.cpp | 10 ++++++ src/Columns/FilterDescription.h | 22 +++++++++++-- src/Processors/Transforms/FilterTransform.cpp | 14 +++++--- tests/performance/sparse_column.xml | 7 ++-- .../01780_column_sparse_filter.reference | 13 ++++++++ .../01780_column_sparse_filter.sql | 33 +++++++++++++++++++ 6 files changed, 90 insertions(+), 9 deletions(-) create mode 100644 tests/queries/0_stateless/01780_column_sparse_filter.reference create mode 100644 tests/queries/0_stateless/01780_column_sparse_filter.sql diff --git a/src/Columns/FilterDescription.cpp b/src/Columns/FilterDescription.cpp index 973d5bc4391..f8f4ee365ef 100644 --- a/src/Columns/FilterDescription.cpp +++ b/src/Columns/FilterDescription.cpp @@ -91,4 +91,14 @@ FilterDescription::FilterDescription(const IColumn & column_) ErrorCodes::ILLEGAL_TYPE_OF_COLUMN_FOR_FILTER); } +SparseFilterDescription::SparseFilterDescription(const IColumn & column) +{ + const auto * column_sparse = typeid_cast(&column); + if (!column_sparse || !typeid_cast(&column_sparse->getValuesColumn())) + throw Exception(ErrorCodes::ILLEGAL_TYPE_OF_COLUMN_FOR_FILTER, + "Illegal type {} of column for sparse filter. 
Must be Sparse(UInt8)", column.getName()); + + filter_indices = &column_sparse->getOffsetsColumn(); +} + } diff --git a/src/Columns/FilterDescription.h b/src/Columns/FilterDescription.h index 05812fea283..a63ebeaa3d2 100644 --- a/src/Columns/FilterDescription.h +++ b/src/Columns/FilterDescription.h @@ -1,6 +1,7 @@ #pragma once #include +#include namespace DB @@ -15,20 +16,37 @@ struct ConstantFilterDescription bool always_false = false; bool always_true = false; - ConstantFilterDescription() {} + ConstantFilterDescription() = default; explicit ConstantFilterDescription(const IColumn & column); }; +struct IFilterDescription +{ + virtual ColumnPtr filter(const IColumn & column) const = 0; + virtual size_t countBytesInFilter() const = 0; + virtual ~IFilterDescription() = default; +}; /// Obtain a filter from non constant Column, that may have type: UInt8, Nullable(UInt8). -struct FilterDescription +struct FilterDescription final : public IFilterDescription { const IColumn::Filter * data = nullptr; /// Pointer to filter when it is not always true or always false. ColumnPtr data_holder; /// If new column was generated, it will be owned by holder. 
explicit FilterDescription(const IColumn & column); + + ColumnPtr filter(const IColumn & column) const override { return column.filter(*data, - 1); } + size_t countBytesInFilter() const override { return DB::countBytesInFilter(*data); } }; +struct SparseFilterDescription final : public IFilterDescription +{ + const IColumn * filter_indices = nullptr; + explicit SparseFilterDescription(const IColumn & column); + + ColumnPtr filter(const IColumn & column) const override { return column.index(*filter_indices, 0); } + size_t countBytesInFilter() const override { return filter_indices->size(); } +}; struct ColumnWithTypeAndName; diff --git a/src/Processors/Transforms/FilterTransform.cpp b/src/Processors/Transforms/FilterTransform.cpp index 364fb8e1958..bcaa2853b0c 100644 --- a/src/Processors/Transforms/FilterTransform.cpp +++ b/src/Processors/Transforms/FilterTransform.cpp @@ -138,8 +138,6 @@ void FilterTransform::transform(Chunk & chunk) return; } - FilterDescription filter_and_holder(*filter_column); - /** Let's find out how many rows will be in result. * To do this, we filter out the first non-constant column * or calculate number of set bytes in the filter. 
@@ -154,14 +152,20 @@ void FilterTransform::transform(Chunk & chunk) } } + std::unique_ptr filter_description; + if (filter_column->isSparse()) + filter_description = std::make_unique(*filter_column); + else + filter_description = std::make_unique(*filter_column); + size_t num_filtered_rows = 0; if (first_non_constant_column != num_columns) { - columns[first_non_constant_column] = columns[first_non_constant_column]->filter(*filter_and_holder.data, -1); + columns[first_non_constant_column] = filter_description->filter(*columns[first_non_constant_column]); num_filtered_rows = columns[first_non_constant_column]->size(); } else - num_filtered_rows = countBytesInFilter(*filter_and_holder.data); + num_filtered_rows = filter_description->countBytesInFilter(); /// If the current block is completely filtered out, let's move on to the next one. if (num_filtered_rows == 0) @@ -207,7 +211,7 @@ void FilterTransform::transform(Chunk & chunk) if (isColumnConst(*current_column)) current_column = current_column->cut(0, num_filtered_rows); else - current_column = current_column->filter(*filter_and_holder.data, num_filtered_rows); + current_column = filter_description->filter(*current_column); } chunk.setColumns(std::move(columns), num_filtered_rows); diff --git a/tests/performance/sparse_column.xml b/tests/performance/sparse_column.xml index 6523d37df44..1d270165c68 100644 --- a/tests/performance/sparse_column.xml +++ b/tests/performance/sparse_column.xml @@ -25,7 +25,7 @@ CREATE TABLE test_sparse_{ratio} (id UInt64, u8 UInt8, u64 UInt64, str String) ENGINE = MergeTree ORDER BY id - SETTINGS ratio_of_defaults_for_sparse_serialization = 0.9 + SETTINGS ratio_of_defaults_for_sparse_serialization = 0.8 SYSTEM STOP MERGES test_{serialization}_{ratio} @@ -54,5 +54,8 @@ SELECT sum(u64) FROM test_{serialization}_{ratio} GROUP BY id % 11 FORMAT Null SELECT uniq(str) FROM test_{serialization}_{ratio} GROUP BY id % 11 FORMAT Null - + SELECT count() FROM test_{serialization}_{ratio} WHERE u64 > 
0 + SELECT count() FROM test_{serialization}_{ratio} WHERE notEmpty(str) + + DROP TABLE IF EXISTS test_{serialization}_{ratio} diff --git a/tests/queries/0_stateless/01780_column_sparse_filter.reference b/tests/queries/0_stateless/01780_column_sparse_filter.reference new file mode 100644 index 00000000000..d673acfc89f --- /dev/null +++ b/tests/queries/0_stateless/01780_column_sparse_filter.reference @@ -0,0 +1,13 @@ +id Default +s Sparse +u Sparse +5000 +2000 +id Default +id Default +s Default +s Sparse +u Default +u Sparse +105000 +102000 diff --git a/tests/queries/0_stateless/01780_column_sparse_filter.sql b/tests/queries/0_stateless/01780_column_sparse_filter.sql new file mode 100644 index 00000000000..45958b5c4e0 --- /dev/null +++ b/tests/queries/0_stateless/01780_column_sparse_filter.sql @@ -0,0 +1,33 @@ +DROP TABLE IF EXISTS t_sparse; + +CREATE TABLE t_sparse (id UInt64, u UInt64, s String) +ENGINE = MergeTree ORDER BY id +SETTINGS ratio_of_defaults_for_sparse_serialization = 0.9; + +INSERT INTO t_sparse SELECT + number, + if (number % 20 = 0, number, 0), + if (number % 50 = 0, toString(number), '') +FROM numbers(1, 100000); + +SELECT column, serialization_kind FROM system.parts_columns +WHERE table = 't_sparse' AND database = currentDatabase() +ORDER BY column, serialization_kind; + +SELECT count() FROM t_sparse WHERE u > 0; +SELECT count() FROM t_sparse WHERE notEmpty(s); + +SYSTEM STOP MERGES t_sparse; + +INSERT INTO t_sparse SELECT + number, number, toString(number) +FROM numbers (1, 100000); + +SELECT column, serialization_kind FROM system.parts_columns +WHERE table = 't_sparse' AND database = currentDatabase() +ORDER BY column, serialization_kind; + +SELECT count() FROM t_sparse WHERE u > 0; +SELECT count() FROM t_sparse WHERE notEmpty(s); + +DROP TABLE t_sparse;