mirror of
https://github.com/ClickHouse/ClickHouse.git
synced 2024-11-10 01:25:21 +00:00
Merge pull request #34601 from CurtizJ/filtering-by-sparse-columns
Support filtering by sparse columns without conversion to full
This commit is contained in:
commit
72e75fdaf5
@ -91,4 +91,14 @@ FilterDescription::FilterDescription(const IColumn & column_)
|
||||
ErrorCodes::ILLEGAL_TYPE_OF_COLUMN_FOR_FILTER);
|
||||
}
|
||||
|
||||
/// Builds a sparse filter description from `column`.
///
/// `column` must be a ColumnSparse whose values column is a ColumnUInt8;
/// anything else is rejected with ILLEGAL_TYPE_OF_COLUMN_FOR_FILTER.
/// On success `filter_indices` stores a raw pointer to the offsets column
/// of the sparse column.
SparseFilterDescription::SparseFilterDescription(const IColumn & column)
{
    const auto * sparse = typeid_cast<const ColumnSparse *>(&column);

    /// The values column is inspected only if the outer cast succeeded.
    const bool is_sparse_uint8 = sparse && typeid_cast<const ColumnUInt8 *>(&sparse->getValuesColumn());

    if (!is_sparse_uint8)
    {
        throw Exception(ErrorCodes::ILLEGAL_TYPE_OF_COLUMN_FOR_FILTER,
            "Illegal type {} of column for sparse filter. Must be Sparse(UInt8)", column.getName());
    }

    filter_indices = &sparse->getOffsetsColumn();
}
|
||||
|
||||
}
|
||||
|
@ -1,6 +1,7 @@
|
||||
#pragma once
|
||||
|
||||
#include <Columns/IColumn.h>
|
||||
#include <Columns/ColumnsCommon.h>
|
||||
|
||||
|
||||
namespace DB
|
||||
@ -15,20 +16,37 @@ struct ConstantFilterDescription
|
||||
bool always_false = false;
|
||||
bool always_true = false;
|
||||
|
||||
ConstantFilterDescription() {}
|
||||
ConstantFilterDescription() = default;
|
||||
explicit ConstantFilterDescription(const IColumn & column);
|
||||
};
|
||||
|
||||
struct IFilterDescription
|
||||
{
|
||||
virtual ColumnPtr filter(const IColumn & column, ssize_t result_size_hint) const = 0;
|
||||
virtual size_t countBytesInFilter() const = 0;
|
||||
virtual ~IFilterDescription() = default;
|
||||
};
|
||||
|
||||
/// Obtain a filter from a non-constant column, which may have type UInt8 or Nullable(UInt8).
|
||||
struct FilterDescription
|
||||
struct FilterDescription final : public IFilterDescription
|
||||
{
|
||||
const IColumn::Filter * data = nullptr; /// Pointer to filter when it is not always true or always false.
|
||||
ColumnPtr data_holder; /// If new column was generated, it will be owned by holder.
|
||||
|
||||
explicit FilterDescription(const IColumn & column);
|
||||
|
||||
ColumnPtr filter(const IColumn & column, ssize_t result_size_hint) const override { return column.filter(*data, result_size_hint); }
|
||||
size_t countBytesInFilter() const override { return DB::countBytesInFilter(*data); }
|
||||
};
|
||||
|
||||
struct SparseFilterDescription final : public IFilterDescription
|
||||
{
|
||||
const IColumn * filter_indices = nullptr;
|
||||
explicit SparseFilterDescription(const IColumn & column);
|
||||
|
||||
ColumnPtr filter(const IColumn & column, ssize_t) const override { return column.index(*filter_indices, 0); }
|
||||
size_t countBytesInFilter() const override { return filter_indices->size(); }
|
||||
};
|
||||
|
||||
struct ColumnWithTypeAndName;
|
||||
|
||||
|
@ -138,8 +138,6 @@ void FilterTransform::transform(Chunk & chunk)
|
||||
return;
|
||||
}
|
||||
|
||||
FilterDescription filter_and_holder(*filter_column);
|
||||
|
||||
/** Let's find out how many rows will be in result.
|
||||
* To do this, we filter out the first non-constant column
|
||||
* or calculate number of set bytes in the filter.
|
||||
@ -154,14 +152,20 @@ void FilterTransform::transform(Chunk & chunk)
|
||||
}
|
||||
}
|
||||
|
||||
std::unique_ptr<IFilterDescription> filter_description;
|
||||
if (filter_column->isSparse())
|
||||
filter_description = std::make_unique<SparseFilterDescription>(*filter_column);
|
||||
else
|
||||
filter_description = std::make_unique<FilterDescription>(*filter_column);
|
||||
|
||||
size_t num_filtered_rows = 0;
|
||||
if (first_non_constant_column != num_columns)
|
||||
{
|
||||
columns[first_non_constant_column] = columns[first_non_constant_column]->filter(*filter_and_holder.data, -1);
|
||||
columns[first_non_constant_column] = filter_description->filter(*columns[first_non_constant_column], -1);
|
||||
num_filtered_rows = columns[first_non_constant_column]->size();
|
||||
}
|
||||
else
|
||||
num_filtered_rows = countBytesInFilter(*filter_and_holder.data);
|
||||
num_filtered_rows = filter_description->countBytesInFilter();
|
||||
|
||||
/// If the current block is completely filtered out, let's move on to the next one.
|
||||
if (num_filtered_rows == 0)
|
||||
@ -207,7 +211,7 @@ void FilterTransform::transform(Chunk & chunk)
|
||||
if (isColumnConst(*current_column))
|
||||
current_column = current_column->cut(0, num_filtered_rows);
|
||||
else
|
||||
current_column = current_column->filter(*filter_and_holder.data, num_filtered_rows);
|
||||
current_column = filter_description->filter(*current_column, num_filtered_rows);
|
||||
}
|
||||
|
||||
chunk.setColumns(std::move(columns), num_filtered_rows);
|
||||
|
@ -25,7 +25,7 @@
|
||||
<create_query>
|
||||
CREATE TABLE test_sparse_{ratio} (id UInt64, u8 UInt8, u64 UInt64, str String)
|
||||
ENGINE = MergeTree ORDER BY id
|
||||
SETTINGS ratio_of_defaults_for_sparse_serialization = 0.9
|
||||
SETTINGS ratio_of_defaults_for_sparse_serialization = 0.8
|
||||
</create_query>
|
||||
|
||||
<create_query>SYSTEM STOP MERGES test_{serialization}_{ratio}</create_query>
|
||||
@ -54,5 +54,8 @@
|
||||
<query>SELECT sum(u64) FROM test_{serialization}_{ratio} GROUP BY id % 11 FORMAT Null</query>
|
||||
<query>SELECT uniq(str) FROM test_{serialization}_{ratio} GROUP BY id % 11 FORMAT Null</query>
|
||||
|
||||
<!-- <drop_query>DROP TABLE IF EXISTS test_{serialization}_{ratio}</drop_query> -->
|
||||
<query>SELECT count() FROM test_{serialization}_{ratio} WHERE u64 > 0</query>
|
||||
<query>SELECT count() FROM test_{serialization}_{ratio} WHERE notEmpty(str)</query>
|
||||
|
||||
<drop_query>DROP TABLE IF EXISTS test_{serialization}_{ratio}</drop_query>
|
||||
</test>
|
||||
|
@ -0,0 +1,13 @@
|
||||
id Default
|
||||
s Sparse
|
||||
u Sparse
|
||||
5000
|
||||
2000
|
||||
id Default
|
||||
id Default
|
||||
s Default
|
||||
s Sparse
|
||||
u Default
|
||||
u Sparse
|
||||
105000
|
||||
102000
|
33
tests/queries/0_stateless/01780_column_sparse_filter.sql
Normal file
33
tests/queries/0_stateless/01780_column_sparse_filter.sql
Normal file
@ -0,0 +1,33 @@
|
||||
-- Checks filtering (WHERE) by columns stored with sparse serialization.

DROP TABLE IF EXISTS t_sparse;

CREATE TABLE t_sparse (id UInt64, u UInt64, s String)
ENGINE = MergeTree
ORDER BY id
SETTINGS ratio_of_defaults_for_sparse_serialization = 0.9;

-- First part: `u` is non-zero only for every 20th row and `s` is non-empty
-- only for every 50th row, so both columns consist mostly of default values.
INSERT INTO t_sparse
SELECT
    number,
    if(number % 20 = 0, number, 0),
    if(number % 50 = 0, toString(number), '')
FROM numbers(1, 100000);

-- Show which serialization kind each column received.
SELECT column, serialization_kind
FROM system.parts_columns
WHERE table = 't_sparse' AND database = currentDatabase()
ORDER BY column, serialization_kind;

-- Filter by conditions on the mostly-default columns.
SELECT count() FROM t_sparse WHERE u > 0;
SELECT count() FROM t_sparse WHERE notEmpty(s);

-- Presumably keeps the next insert as a separate part for the checks below.
SYSTEM STOP MERGES t_sparse;

-- Second part: no default values in `u` and `s` at all.
INSERT INTO t_sparse
SELECT number, number, toString(number)
FROM numbers(1, 100000);

-- Serialization kinds again, now across both parts.
SELECT column, serialization_kind
FROM system.parts_columns
WHERE table = 't_sparse' AND database = currentDatabase()
ORDER BY column, serialization_kind;

-- The same filters, now reading both parts.
SELECT count() FROM t_sparse WHERE u > 0;
SELECT count() FROM t_sparse WHERE notEmpty(s);

DROP TABLE t_sparse;
|
Loading…
Reference in New Issue
Block a user