fix filling of missed subcolumns

This commit is contained in:
Anton Popov 2024-07-08 17:47:56 +00:00
parent 5943d60f26
commit a2b17b01f9
6 changed files with 181 additions and 44 deletions

View File

@ -18,6 +18,7 @@
#include <Storages/ColumnsDescription.h>
#include <DataTypes/NestedUtils.h>
#include <Columns/ColumnArray.h>
#include <Columns/ColumnConst.h>
#include <DataTypes/DataTypeArray.h>
#include <Storages/StorageInMemoryMetadata.h>
@ -35,8 +36,13 @@ namespace
/// Add all required expressions for missing columns calculation
void addDefaultRequiredExpressionsRecursively(
const Block & block, const String & required_column_name, DataTypePtr required_column_type,
const ColumnsDescription & columns, ASTPtr default_expr_list_accum, NameSet & added_columns, bool null_as_default)
const Block & block,
const String & required_column_name,
DataTypePtr required_column_type,
const ColumnsDescription & columns,
ASTPtr default_expr_list_accum,
NameSet & added_columns,
bool null_as_default)
{
checkStackSize();
@ -273,6 +279,20 @@ static std::unordered_map<String, ColumnPtr> collectOffsetsColumns(
return offsets_columns;
}
/// Builds a full (non-constant) column of `num_rows` default values of `data_type`.
/// When `subcolumn_name` is not empty, the result holds the values of that subcolumn
/// taken from the default value instead of the default value itself.
static ColumnPtr createColumnWithDefaultValue(const IDataType & data_type, const String & subcolumn_name, size_t num_rows)
{
    auto const_default = data_type.createColumnConstWithDefaultValue(num_rows);

    if (!subcolumn_name.empty())
    {
        /// Extract the subcolumn from the data held by the constant column first,
        /// and only after that replicate it to the requested number of rows.
        ColumnPtr default_data = assert_cast<const ColumnConst &>(*const_default).getDataColumnPtr();
        ColumnPtr default_subcolumn = data_type.getSubcolumn(subcolumn_name, default_data);
        return ColumnConst::create(std::move(default_subcolumn), num_rows)->convertToFullColumnIfConst();
    }

    return const_default->convertToFullColumnIfConst();
}
void fillMissingColumns(
Columns & res_columns,
size_t num_rows,
@ -298,21 +318,19 @@ void fillMissingColumns(
auto requested_column = requested_columns.begin();
for (size_t i = 0; i < num_columns; ++i, ++requested_column)
{
const auto & [name, type] = *requested_column;
if (res_columns[i] && partially_read_columns.contains(name))
if (res_columns[i] && partially_read_columns.contains(requested_column->name))
res_columns[i] = nullptr;
if (res_columns[i])
continue;
if (metadata_snapshot && metadata_snapshot->getColumns().hasDefault(name))
if (metadata_snapshot && metadata_snapshot->getColumns().hasDefault(requested_column->getNameInStorage()))
continue;
std::vector<ColumnPtr> current_offsets;
size_t num_dimensions = 0;
const auto * array_type = typeid_cast<const DataTypeArray *>(type.get());
const auto * array_type = typeid_cast<const DataTypeArray *>(requested_column->type.get());
if (array_type && !offsets_columns.empty())
{
num_dimensions = getNumberOfDimensions(*array_type);
@ -348,10 +366,10 @@ void fillMissingColumns(
if (!current_offsets.empty())
{
size_t num_empty_dimensions = num_dimensions - current_offsets.size();
auto scalar_type = createArrayOfType(getBaseTypeOfArray(type), num_empty_dimensions);
auto scalar_type = createArrayOfType(getBaseTypeOfArray(requested_column->getTypeInStorage()), num_empty_dimensions);
size_t data_size = assert_cast<const ColumnUInt64 &>(*current_offsets.back()).getData().back();
res_columns[i] = scalar_type->createColumnConstWithDefaultValue(data_size)->convertToFullColumnIfConst();
res_columns[i] = createColumnWithDefaultValue(*scalar_type, requested_column->getSubcolumnName(), data_size);
for (auto it = current_offsets.rbegin(); it != current_offsets.rend(); ++it)
res_columns[i] = ColumnArray::create(res_columns[i], *it);
@ -360,7 +378,7 @@ void fillMissingColumns(
{
/// We must turn a constant column into a full column because the interpreter could infer
/// that it is constant everywhere but in some blocks (from other parts) it can be a full column.
res_columns[i] = type->createColumnConstWithDefaultValue(num_rows)->convertToFullColumnIfConst();
res_columns[i] = createColumnWithDefaultValue(*requested_column->getTypeInStorage(), requested_column->getSubcolumnName(), num_rows);
}
}
}

View File

@ -3,6 +3,7 @@
#include <Storages/MergeTree/MergeTreeVirtualColumns.h>
#include <Storages/MergeTree/LoadedMergeTreeDataPartInfoForReader.h>
#include <DataTypes/NestedUtils.h>
#include <DataTypes/DataTypeNested.h>
#include <Common/escapeForFileName.h>
#include <Compression/CachedCompressedReadBuffer.h>
#include <Columns/ColumnArray.h>
@ -144,19 +145,26 @@ void IMergeTreeReader::evaluateMissingDefaults(Block additional_columns, Columns
throw Exception(ErrorCodes::LOGICAL_ERROR, "invalid number of columns passed to MergeTreeReader::fillMissingColumns. "
"Expected {}, got {}", num_columns, res_columns.size());
/// Convert columns list to block.
/// TODO: rewrite with columns interface. It will be possible after changes in ExpressionActions.
auto name_and_type = requested_columns.begin();
for (size_t pos = 0; pos < num_columns; ++pos, ++name_and_type)
{
if (res_columns[pos] == nullptr)
continue;
NameSet full_requested_columns_set;
NamesAndTypesList full_requested_columns;
additional_columns.insert({res_columns[pos], name_and_type->type, name_and_type->name});
/// Convert columns list to block. And convert subcolumns to full columns.
/// TODO: rewrite with columns interface. It will be possible after changes in ExpressionActions.
auto it = requested_columns.begin();
for (size_t pos = 0; pos < num_columns; ++pos, ++it)
{
auto name_in_storage = it->getNameInStorage();
if (full_requested_columns_set.emplace(name_in_storage).second)
full_requested_columns.emplace_back(name_in_storage, it->getTypeInStorage());
if (res_columns[pos])
additional_columns.insert({res_columns[pos], it->type, it->name});
}
auto dag = DB::evaluateMissingDefaults(
additional_columns, requested_columns,
additional_columns, full_requested_columns,
storage_snapshot->metadata->getColumns(),
data_part_info_for_read->getContext());
@ -170,9 +178,18 @@ void IMergeTreeReader::evaluateMissingDefaults(Block additional_columns, Columns
}
/// Move columns from block.
name_and_type = requested_columns.begin();
for (size_t pos = 0; pos < num_columns; ++pos, ++name_and_type)
res_columns[pos] = std::move(additional_columns.getByName(name_and_type->name).column);
it = requested_columns.begin();
for (size_t pos = 0; pos < num_columns; ++pos, ++it)
{
auto name_in_storage = it->getNameInStorage();
res_columns[pos] = additional_columns.getByName(name_in_storage).column;
if (it->isSubcolumn())
{
const auto & type_in_storage = it->getTypeInStorage();
res_columns[pos] = type_in_storage->getSubcolumn(it->getSubcolumnName(), res_columns[pos]);
}
}
}
catch (Exception & e)
{
@ -192,7 +209,12 @@ bool IMergeTreeReader::isSubcolumnOffsetsOfNested(const String & name_in_storage
if (!data_part_info_for_read->isWidePart() || subcolumn_name != "size0")
return false;
return Nested::isSubcolumnOfNested(name_in_storage, part_columns);
auto split = Nested::splitName(name_in_storage);
if (split.second.empty())
return false;
auto nested_column = part_columns.tryGetColumn(GetColumnsOptions::All, split.first);
return nested_column && isNested(nested_column->type);
}
String IMergeTreeReader::getColumnNameInPart(const NameAndTypePair & required_column) const

View File

@ -60,39 +60,25 @@ void MergeTreeReaderCompact::fillColumnPositions()
for (size_t i = 0; i < columns_num; ++i)
{
const auto & column_to_read = columns_to_read[i];
auto & column_to_read = columns_to_read[i];
auto position = data_part_info_for_read->getColumnPosition(column_to_read.getNameInStorage());
bool is_array = isArray(column_to_read.type);
if (column_to_read.isSubcolumn())
{
auto storage_column_from_part = getColumnInPart(
{column_to_read.getNameInStorage(), column_to_read.getTypeInStorage()});
NameAndTypePair column_in_storage{column_to_read.getNameInStorage(), column_to_read.getTypeInStorage()};
auto storage_column_from_part = getColumnInPart(column_in_storage);
auto subcolumn_name = column_to_read.getSubcolumnName();
if (!storage_column_from_part.type->hasSubcolumn(subcolumn_name))
position.reset();
}
column_positions[i] = std::move(position);
/// If array of Nested column is missing in part,
/// we have to read its offsets if they exist.
if (!position && is_array)
{
auto column_to_read_with_subcolumns = getColumnConvertedToSubcolumnOfNested(column_to_read);
auto name_level_for_offsets = findColumnForOffsets(column_to_read_with_subcolumns);
if (name_level_for_offsets.has_value())
{
column_positions[i] = data_part_info_for_read->getColumnPosition(name_level_for_offsets->first);
columns_for_offsets[i] = name_level_for_offsets;
partially_read_columns.insert(column_to_read.name);
}
}
else
{
column_positions[i] = std::move(position);
}
if (!column_positions[i])
findPositionForMissedNested(i);
}
}
@ -125,6 +111,38 @@ NameAndTypePair MergeTreeReaderCompact::getColumnConvertedToSubcolumnOfNested(co
Nested::concatenateName(name_in_storage, subcolumn_name));
}
/// Handles a column to read (at index `pos`) for which no position was found.
/// If the column is an array, or the "size0" subcolumn of an array, looks for another
/// column that can provide the offsets and points `column_positions[pos]` at it.
/// Does nothing when the column is of neither kind or when no such column is found.
void MergeTreeReaderCompact::findPositionForMissedNested(size_t pos)
{
    auto & missed_column = columns_to_read[pos];

    const bool reads_array = isArray(missed_column.type);
    const bool reads_offsets = isArray(missed_column.getTypeInStorage()) && missed_column.getSubcolumnName() == "size0";
    if (!(reads_array || reads_offsets))
        return;

    /// The lookup is done for the whole column in storage, not for the requested subcolumn.
    NameAndTypePair storage_column{missed_column.getNameInStorage(), missed_column.getTypeInStorage()};
    auto nested_subcolumn = getColumnConvertedToSubcolumnOfNested(storage_column);
    auto offsets_source = findColumnForOffsets(nested_subcolumn);
    if (!offsets_source)
        return;

    column_positions[pos] = data_part_info_for_read->getColumnPosition(offsets_source->first);

    if (!reads_offsets)
    {
        /// Only the offsets come from the found column, so the column is marked as partially read.
        columns_for_offsets[pos] = std::move(offsets_source);
        partially_read_columns.insert(missed_column.name);
        return;
    }

    /// Read offsets from another array from the same Nested column.
    missed_column = {offsets_source->first, missed_column.getSubcolumnName(), missed_column.getTypeInStorage(), missed_column.type};
}
void MergeTreeReaderCompact::readData(
const NameAndTypePair & name_and_type,
ColumnPtr & column,

View File

@ -36,6 +36,7 @@ public:
protected:
void fillColumnPositions();
NameAndTypePair getColumnConvertedToSubcolumnOfNested(const NameAndTypePair & column);
void findPositionForMissedNested(size_t pos);
using InputStreamGetter = ISerialization::InputStreamGetter;

View File

@ -0,0 +1,31 @@
0
2
4
6
8
0
2
4
6
8
1 ['aaa',NULL] [NULL,NULL]
2 ['ccc'] [NULL]
3 [NULL] [NULL]
4 [NULL,'bbb'] ['ddd',NULL]
5 [NULL] [NULL]
1 2 2
2 1 1
3 1 1
4 2 2
5 1 1
1 [0,1] [1,1]
2 [0] [1]
3 [1] [1]
4 [1,0] [0,1]
5 [1] [1]
1 ('foo','bar') [1,NULL,3]
2 ('aaa','bbb') [1,NULL,3]
3 ('ccc','ddd') [4,5,6]
1 foo bar 3 [0,1,0]
2 foo bar 3 [0,1,0]
3 ccc ddd 3 [0,0,0]

View File

@ -0,0 +1,47 @@
-- Test for reading subcolumns of columns that were added by ALTER after some rows
-- had already been inserted, so earlier inserts carry no values for those columns.
DROP TABLE IF EXISTS t_missed_subcolumns;
-- .null subcolumn
-- Rows from the first INSERT were written before `y` existed.
CREATE TABLE t_missed_subcolumns (x UInt32) ENGINE = MergeTree ORDER BY tuple();
INSERT INTO t_missed_subcolumns SELECT * FROM numbers(10);
ALTER TABLE t_missed_subcolumns ADD COLUMN `y` Nullable(UInt32);
INSERT INTO t_missed_subcolumns SELECT number, if(number % 2, NULL, number) FROM numbers(10);
-- The same filter is run with optimize_functions_to_subcolumns switched on and off;
-- both queries are expected to return the same rows.
SELECT x FROM t_missed_subcolumns WHERE y IS NOT NULL SETTINGS optimize_functions_to_subcolumns = 1;
SELECT x FROM t_missed_subcolumns WHERE y IS NOT NULL SETTINGS optimize_functions_to_subcolumns = 0;
DROP TABLE IF EXISTS t_missed_subcolumns;
-- .null and .size0 subcolumn in array
-- `n.b` is added after the first INSERT; `n.a` and `n.b` share the `n.` name prefix.
CREATE TABLE t_missed_subcolumns (id UInt64, `n.a` Array(Nullable(String))) ENGINE = MergeTree ORDER BY id;
INSERT INTO t_missed_subcolumns VALUES (1, ['aaa', NULL]) (2, ['ccc']) (3, [NULL]);
ALTER TABLE t_missed_subcolumns ADD COLUMN `n.b` Array(Nullable(String));
INSERT INTO t_missed_subcolumns VALUES (4, [NULL, 'bbb'], ['ddd', NULL]), (5, [NULL], [NULL]);
-- Read the full arrays first, then their .size0 and .null subcolumns.
SELECT id, n.a, n.b FROM t_missed_subcolumns ORDER BY id;
SELECT id, n.a.size0, n.b.size0 FROM t_missed_subcolumns ORDER BY id;
SELECT id, n.a.null, n.b.null FROM t_missed_subcolumns ORDER BY id;
DROP TABLE IF EXISTS t_missed_subcolumns;
-- subcolumns and custom defaults
-- Columns with DEFAULT expressions are added between INSERTs.
CREATE TABLE t_missed_subcolumns (id UInt64) ENGINE = MergeTree ORDER BY id;
-- NOTE(review): presumably merges are stopped so that rows inserted before each ALTER
-- are not rewritten with the new columns by a background merge - confirm.
SYSTEM STOP MERGES t_missed_subcolumns;
INSERT INTO t_missed_subcolumns VALUES (1);
ALTER TABLE t_missed_subcolumns ADD COLUMN t Tuple(a String, b String) DEFAULT ('foo', 'bar');
INSERT INTO t_missed_subcolumns VALUES (2, ('aaa', 'bbb'));
ALTER TABLE t_missed_subcolumns ADD COLUMN arr Array(Nullable(UInt64)) DEFAULT [1, NULL, 3];
INSERT INTO t_missed_subcolumns VALUES (3, ('ccc', 'ddd'), [4, 5, 6]);
-- Read the full columns first, then the tuple elements and the array subcolumns.
SELECT id, t, arr FROM t_missed_subcolumns ORDER BY id;
SELECT id, t.a, t.b, arr.size0, arr.null FROM t_missed_subcolumns ORDER BY id;
DROP TABLE t_missed_subcolumns;