Merge pull request #34601 from CurtizJ/filtering-by-sparse-columns

Support filtering by sparse columns without conversion to full
This commit is contained in:
Anton Popov 2022-02-15 23:26:13 +03:00 committed by GitHub
commit 72e75fdaf5
No known key found for this signature in database
GPG Key ID: 4AEE18F83AFDEB23
6 changed files with 90 additions and 9 deletions

View File

@ -91,4 +91,14 @@ FilterDescription::FilterDescription(const IColumn & column_)
ErrorCodes::ILLEGAL_TYPE_OF_COLUMN_FOR_FILTER);
}
/// Builds a filter description from a sparse filter column.
/// Only a ColumnSparse whose values column is a ColumnUInt8 is accepted; any other column
/// is rejected with ILLEGAL_TYPE_OF_COLUMN_FOR_FILTER.
SparseFilterDescription::SparseFilterDescription(const IColumn & column)
{
    const auto * sparse = typeid_cast<const ColumnSparse *>(&column);

    /// The values column is inspected only when the outer cast succeeded.
    const bool has_uint8_values = sparse && typeid_cast<const ColumnUInt8 *>(&sparse->getValuesColumn());

    if (!has_uint8_values)
        throw Exception(ErrorCodes::ILLEGAL_TYPE_OF_COLUMN_FOR_FILTER,
            "Illegal type {} of column for sparse filter. Must be Sparse(UInt8)", column.getName());

    /// Stores the address of the offsets column; nothing is copied here.
    /// NOTE(review): presumably this pointer must not outlive `column` - confirm at call sites.
    filter_indices = &sparse->getOffsetsColumn();
}
}

View File

@ -1,6 +1,7 @@
#pragma once
#include <Columns/IColumn.h>
#include <Columns/ColumnsCommon.h>
namespace DB
@ -15,20 +16,37 @@ struct ConstantFilterDescription
bool always_false = false;
bool always_true = false;
ConstantFilterDescription() {}
ConstantFilterDescription() = default;
explicit ConstantFilterDescription(const IColumn & column);
};
/// Common interface for the different ways a filter column can be applied to other columns.
struct IFilterDescription
{
    virtual ~IFilterDescription() = default;

    /// Applies the filter to `column` and returns the resulting column.
    /// `result_size_hint` is forwarded to the implementation, which may ignore it.
    virtual ColumnPtr filter(const IColumn & column, ssize_t result_size_hint) const = 0;

    /// Returns the filter's row count as computed by the concrete implementation.
    virtual size_t countBytesInFilter() const = 0;
};
/// Obtain a filter from non constant Column, that may have type: UInt8, Nullable(UInt8).
/// Implements IFilterDescription on top of a byte mask (`IColumn::Filter`).
struct FilterDescription
struct FilterDescription final : public IFilterDescription
{
const IColumn::Filter * data = nullptr; /// Pointer to filter when it is not always true or always false.
ColumnPtr data_holder; /// If new column was generated, it will be owned by holder.
explicit FilterDescription(const IColumn & column);
/// Filters `column` with the mask in `data`, forwarding `result_size_hint` to IColumn::filter.
ColumnPtr filter(const IColumn & column, ssize_t result_size_hint) const override { return column.filter(*data, result_size_hint); }
/// Delegates to the free function DB::countBytesInFilter on the mask in `data`.
size_t countBytesInFilter() const override { return DB::countBytesInFilter(*data); }
};
/// Filter description backed by a sparse column.
/// Rows are selected through IColumn::index using `filter_indices` instead of a byte mask.
struct SparseFilterDescription final : public IFilterDescription
{
    /// Non-owning pointer to the column of indices; null until the constructor sets it.
    const IColumn * filter_indices = nullptr;

    explicit SparseFilterDescription(const IColumn & column);

    /// Indexes `column` by `filter_indices`. The size hint is not used.
    /// NOTE(review): 0 is passed as the second argument of IColumn::index, presumably "no limit" - confirm.
    ColumnPtr filter(const IColumn & column, ssize_t /*result_size_hint*/) const override
    {
        return column.index(*filter_indices, 0);
    }

    /// Returns the number of entries in `filter_indices`.
    size_t countBytesInFilter() const override
    {
        return filter_indices->size();
    }
};
struct ColumnWithTypeAndName;

View File

@ -138,8 +138,6 @@ void FilterTransform::transform(Chunk & chunk)
return;
}
FilterDescription filter_and_holder(*filter_column);
/** Let's find out how many rows will be in result.
* To do this, we filter out the first non-constant column
* or calculate number of set bytes in the filter.
@ -154,14 +152,20 @@ void FilterTransform::transform(Chunk & chunk)
}
}
std::unique_ptr<IFilterDescription> filter_description;
if (filter_column->isSparse())
filter_description = std::make_unique<SparseFilterDescription>(*filter_column);
else
filter_description = std::make_unique<FilterDescription>(*filter_column);
size_t num_filtered_rows = 0;
if (first_non_constant_column != num_columns)
{
columns[first_non_constant_column] = columns[first_non_constant_column]->filter(*filter_and_holder.data, -1);
columns[first_non_constant_column] = filter_description->filter(*columns[first_non_constant_column], -1);
num_filtered_rows = columns[first_non_constant_column]->size();
}
else
num_filtered_rows = countBytesInFilter(*filter_and_holder.data);
num_filtered_rows = filter_description->countBytesInFilter();
/// If the current block is completely filtered out, let's move on to the next one.
if (num_filtered_rows == 0)
@ -207,7 +211,7 @@ void FilterTransform::transform(Chunk & chunk)
if (isColumnConst(*current_column))
current_column = current_column->cut(0, num_filtered_rows);
else
current_column = current_column->filter(*filter_and_holder.data, num_filtered_rows);
current_column = filter_description->filter(*current_column, num_filtered_rows);
}
chunk.setColumns(std::move(columns), num_filtered_rows);

View File

@ -25,7 +25,7 @@
<create_query>
CREATE TABLE test_sparse_{ratio} (id UInt64, u8 UInt8, u64 UInt64, str String)
ENGINE = MergeTree ORDER BY id
SETTINGS ratio_of_defaults_for_sparse_serialization = 0.9
SETTINGS ratio_of_defaults_for_sparse_serialization = 0.8
</create_query>
<create_query>SYSTEM STOP MERGES test_{serialization}_{ratio}</create_query>
@ -54,5 +54,8 @@
<query>SELECT sum(u64) FROM test_{serialization}_{ratio} GROUP BY id % 11 FORMAT Null</query>
<query>SELECT uniq(str) FROM test_{serialization}_{ratio} GROUP BY id % 11 FORMAT Null</query>
<!-- <drop_query>DROP TABLE IF EXISTS test_{serialization}_{ratio}</drop_query> -->
<query>SELECT count() FROM test_{serialization}_{ratio} WHERE u64 > 0</query>
<query>SELECT count() FROM test_{serialization}_{ratio} WHERE notEmpty(str)</query>
<drop_query>DROP TABLE IF EXISTS test_{serialization}_{ratio}</drop_query>
</test>

View File

@ -0,0 +1,13 @@
id Default
s Sparse
u Sparse
5000
2000
id Default
id Default
s Default
s Sparse
u Default
u Sparse
105000
102000

View File

@ -0,0 +1,33 @@
-- Checks that WHERE filtering gives correct counts when the filter is computed from
-- sparse-serialized columns, both alone and together with a fully-serialized part.

DROP TABLE IF EXISTS t_sparse;
CREATE TABLE t_sparse (id UInt64, u UInt64, s String)
ENGINE = MergeTree ORDER BY id
SETTINGS ratio_of_defaults_for_sparse_serialization = 0.9;
-- First part: of the 100000 rows (number = 1..100000), `u` is non-zero only for multiples
-- of 20 (5000 rows) and `s` is non-empty only for multiples of 50 (2000 rows), so both
-- columns are mostly default values.
INSERT INTO t_sparse SELECT
number,
if (number % 20 = 0, number, 0),
if (number % 50 = 0, toString(number), '')
FROM numbers(1, 100000);
-- Show which serialization kind each column of each part uses.
SELECT column, serialization_kind FROM system.parts_columns
WHERE table = 't_sparse' AND database = currentDatabase()
ORDER BY column, serialization_kind;
-- Expected counts: 5000 and 2000.
SELECT count() FROM t_sparse WHERE u > 0;
SELECT count() FROM t_sparse WHERE notEmpty(s);
-- Stop merges before the second insert.
-- NOTE(review): presumably so the two parts stay separate and keep their own serialization kinds - confirm.
SYSTEM STOP MERGES t_sparse;
-- Second part: every row has a non-default `u` and `s`.
INSERT INTO t_sparse SELECT
number, number, toString(number)
FROM numbers (1, 100000);
SELECT column, serialization_kind FROM system.parts_columns
WHERE table = 't_sparse' AND database = currentDatabase()
ORDER BY column, serialization_kind;
-- Expected counts over both parts: 5000 + 100000 = 105000 and 2000 + 100000 = 102000.
SELECT count() FROM t_sparse WHERE u > 0;
SELECT count() FROM t_sparse WHERE notEmpty(s);
DROP TABLE t_sparse;