Merge pull request #37253 from Avogar/fix-defaults

Fix inserting defaults for missing values in columnar formats
This commit is contained in:
Kruglov Pavel 2022-05-23 12:10:14 +02:00 committed by GitHub
commit 754e675ec3
No known key found for this signature in database
GPG Key ID: 4AEE18F83AFDEB23
10 changed files with 61 additions and 20 deletions

View File

@ -196,7 +196,7 @@ static auto getNameRange(const ColumnsDescription::ColumnsContainer & columns, c
return std::make_pair(begin, end);
}
void ColumnsDescription::add(ColumnDescription column, const String & after_column, bool first)
void ColumnsDescription::add(ColumnDescription column, const String & after_column, bool first, bool add_subcolumns)
{
if (has(column.name))
throw Exception("Cannot add column " + column.name + ": column with this name already exists",
@ -222,7 +222,8 @@ void ColumnsDescription::add(ColumnDescription column, const String & after_colu
insert_it = range.second;
}
addSubcolumns(column.name, column.type);
if (add_subcolumns)
addSubcolumns(column.name, column.type);
columns.get<0>().insert(insert_it, std::move(column));
}
@ -572,6 +573,27 @@ std::optional<NameAndTypePair> ColumnsDescription::tryGetColumnOrSubcolumn(GetCo
return tryGetColumn(GetColumnsOptions(kind).withSubcolumns(), column_name);
}
/// Looks up the full description of `column_name`.
///
/// A top-level column is returned only when its default kind, mapped through
/// defaultKindToGetKind(), intersects `options.kind`. Otherwise (column absent
/// or filtered out by kind), and only if `options.with_subcolumns` is set, the
/// name is searched among subcolumns; a match yields a description constructed
/// from the subcolumn's name and type only.
///
/// Returns an empty optional when neither lookup succeeds.
std::optional<const ColumnDescription> ColumnsDescription::tryGetColumnDescription(const GetColumnsOptions & options, const String & column_name) const
{
    const auto & columns_by_name = columns.get<1>();
    if (const auto found = columns_by_name.find(column_name); found != columns_by_name.end())
    {
        /// The column exists, but it is reported only if its kind was requested.
        if (defaultKindToGetKind(found->default_desc.kind) & options.kind)
            return *found;
    }

    if (!options.with_subcolumns)
        return {};

    const auto & subcolumns_by_name = subcolumns.get<0>();
    const auto found_subcolumn = subcolumns_by_name.find(column_name);
    if (found_subcolumn == subcolumns_by_name.end())
        return {};

    return ColumnDescription{found_subcolumn->name, found_subcolumn->type};
}
/// Convenience wrapper around tryGetColumnDescription(): builds options for the
/// requested `kind` with subcolumn lookup enabled and forwards the call.
std::optional<const ColumnDescription> ColumnsDescription::tryGetColumnOrSubcolumnDescription(GetColumnsOptions::Kind kind, const String & column_name) const
{
    const auto options_with_subcolumns = GetColumnsOptions(kind).withSubcolumns();
    return tryGetColumnDescription(options_with_subcolumns, column_name);
}
NameAndTypePair ColumnsDescription::getColumnOrSubcolumn(GetColumnsOptions::Kind kind, const String & column_name) const
{
auto column = tryGetColumnOrSubcolumn(kind, column_name);

View File

@ -100,7 +100,7 @@ public:
explicit ColumnsDescription(NamesAndTypesList ordinary, NamesAndAliases aliases);
/// `after_column` can be a Nested column name;
void add(ColumnDescription column, const String & after_column = String(), bool first = false);
void add(ColumnDescription column, const String & after_column = String(), bool first = false, bool add_subcolumns = true);
/// `column_name` can be a Nested column name;
void remove(const String & column_name);
@ -180,6 +180,9 @@ public:
std::optional<NameAndTypePair> tryGetColumnOrSubcolumn(GetColumnsOptions::Kind kind, const String & column_name) const;
std::optional<NameAndTypePair> tryGetColumn(const GetColumnsOptions & options, const String & column_name) const;
std::optional<const ColumnDescription> tryGetColumnOrSubcolumnDescription(GetColumnsOptions::Kind kind, const String & column_name) const;
std::optional<const ColumnDescription> tryGetColumnDescription(const GetColumnsOptions & options, const String & column_name) const;
ColumnDefaults getDefaults() const; /// TODO: remove
bool hasDefault(const String & column_name) const;
bool hasDefaults() const;

View File

@ -539,8 +539,7 @@ Pipe StorageHDFS::read(
if (fetch_columns.empty())
fetch_columns.push_back(ExpressionActions::getSmallestColumn(storage_snapshot->metadata->getColumns().getAllPhysical()));
columns_description = ColumnsDescription{
storage_snapshot->getSampleBlockForColumns(fetch_columns).getNamesAndTypesList()};
columns_description = storage_snapshot->getDescriptionForColumns(fetch_columns);
block_for_format = storage_snapshot->getSampleBlockForColumns(columns_description.getNamesOfPhysical());
}
else

View File

@ -691,8 +691,7 @@ Pipe StorageFile::read(
const auto get_columns_for_format = [&]() -> ColumnsDescription
{
if (isColumnOriented())
return ColumnsDescription{
storage_snapshot->getSampleBlockForColumns(column_names).getNamesAndTypesList()};
return storage_snapshot->getDescriptionForColumns(column_names);
else
return storage_snapshot->metadata->getColumns();
};

View File

@ -719,8 +719,7 @@ Pipe StorageS3::read(
if (fetch_columns.empty())
fetch_columns.push_back(ExpressionActions::getSmallestColumn(storage_snapshot->metadata->getColumns().getAllPhysical()));
columns_description = ColumnsDescription{
storage_snapshot->getSampleBlockForColumns(fetch_columns).getNamesAndTypesList()};
columns_description = storage_snapshot->getDescriptionForColumns(fetch_columns);
block_for_format = storage_snapshot->getSampleBlockForColumns(columns_description.getNamesOfPhysical());
}
else

View File

@ -92,32 +92,40 @@ NameAndTypePair StorageSnapshot::getColumn(const GetColumnsOptions & options, co
/// Builds a sample Block for the requested column names.
/// The names are resolved via getDescriptionForColumns(); for every resulting
/// description a column created by its type is inserted together with that
/// type and the description's name, in the order the descriptions are iterated.
Block StorageSnapshot::getSampleBlockForColumns(const Names & column_names) const
{
    Block sample_block;
    for (const auto & description : getDescriptionForColumns(column_names))
        sample_block.insert({description.type->createColumn(), description.type, description.name});
    return sample_block;
}
ColumnsDescription StorageSnapshot::getDescriptionForColumns(const Names & column_names) const
{
ColumnsDescription res;
const auto & columns = getMetadataForQuery()->getColumns();
for (const auto & name : column_names)
{
auto column = columns.tryGetColumnOrSubcolumn(GetColumnsOptions::All, name);
auto object_column = object_columns.tryGetColumnOrSubcolumn(GetColumnsOptions::All, name);
auto column = columns.tryGetColumnOrSubcolumnDescription(GetColumnsOptions::All, name);
auto object_column = object_columns.tryGetColumnOrSubcolumnDescription(GetColumnsOptions::All, name);
if (column && !object_column)
{
res.insert({column->type->createColumn(), column->type, column->name});
res.add(*column, "", false, false);
}
else if (object_column)
{
res.insert({object_column->type->createColumn(), object_column->type, object_column->name});
res.add(*object_column, "", false, false);
}
else if (auto it = virtual_columns.find(name); it != virtual_columns.end())
{
/// Virtual columns must be appended after ordinary, because user can
/// override them.
const auto & type = it->second;
res.insert({type->createColumn(), type, name});
res.add({name, type});
}
else
{
throw Exception(ErrorCodes::NOT_FOUND_COLUMN_IN_BLOCK,
"Column {} not found in table {}", backQuote(name), storage.getStorageID().getNameForLogs());
"Column {} not found in table {}", backQuote(name), storage.getStorageID().getNameForLogs());
}
}

View File

@ -68,6 +68,8 @@ struct StorageSnapshot
/// Block with ordinary + materialized + aliases + virtuals + subcolumns.
Block getSampleBlockForColumns(const Names & column_names) const;
ColumnsDescription getDescriptionForColumns(const Names & column_names) const;
/// Verify that all the requested names are in the table and are set correctly:
/// list of names is not empty and the names do not repeat.
void check(const Names & column_names) const;

View File

@ -602,8 +602,7 @@ Pipe IStorageURLBase::read(
Block block_for_format;
if (isColumnOriented())
{
columns_description = ColumnsDescription{
storage_snapshot->getSampleBlockForColumns(column_names).getNamesAndTypesList()};
columns_description = storage_snapshot->getDescriptionForColumns(column_names);
block_for_format = storage_snapshot->getSampleBlockForColumns(columns_description.getNamesOfPhysical());
}
else
@ -690,8 +689,7 @@ Pipe StorageURLWithFailover::read(
Block block_for_format;
if (isColumnOriented())
{
columns_description = ColumnsDescription{
storage_snapshot->getSampleBlockForColumns(column_names).getNamesAndTypesList()};
columns_description = storage_snapshot->getDescriptionForColumns(column_names);
block_for_format = storage_snapshot->getSampleBlockForColumns(columns_description.getNamesOfPhysical());
}
else

View File

@ -0,0 +1,3 @@
1 42 43
1 42 43
1 42 43

View File

@ -0,0 +1,8 @@
-- Tags: no-fasttest, no-parallel
-- Checks that DEFAULT expressions from the declared structure are applied to columns
-- missing from a columnar file. Each case writes a file that contains only column `x`
-- (value 1), then reads it back with a structure that also declares
-- `y default 42` and `z default x + y`, with the format's *_allow_missing_columns
-- setting enabled. Expected row for every select: 1 42 43.
-- Parquet
insert into function file(data_02302.parquet) select 1 as x settings engine_file_truncate_on_insert=1;
select * from file(data_02302.parquet, auto, 'x UInt8, y default 42, z default x + y') settings input_format_parquet_allow_missing_columns=1;
-- ORC
insert into function file(data_02302.orc) select 1 as x settings engine_file_truncate_on_insert=1;
select * from file(data_02302.orc, auto, 'x UInt8, y default 42, z default x + y') settings input_format_orc_allow_missing_columns=1;
-- Arrow
insert into function file(data_02302.arrow) select 1 as x settings engine_file_truncate_on_insert=1;
select * from file(data_02302.arrow, auto, 'x UInt8, y default 42, z default x + y') settings input_format_arrow_allow_missing_columns=1;