fix filling of empty Nested

This commit is contained in:
Anton Popov 2024-07-19 12:36:57 +00:00
parent 2d30524d72
commit 57c1d7a101
7 changed files with 95 additions and 13 deletions

View File

@ -90,7 +90,9 @@ void IDataType::forEachSubcolumn(
{
auto name = ISerialization::getSubcolumnNameForStream(subpath, prefix_len);
auto subdata = ISerialization::createFromPath(subpath, prefix_len);
callback(subpath, name, subdata);
auto path_copy = subpath;
path_copy.resize(prefix_len);
callback(path_copy, name, subdata);
}
subpath[i].visited = true;
}

View File

@ -8,6 +8,7 @@
#include <DataTypes/DataTypeNothing.h>
#include <DataTypes/DataTypeArray.h>
#include <DataTypes/DataTypeMap.h>
#include <DataTypes/DataTypeTuple.h>
#include <DataTypes/DataTypeNullable.h>
#include <DataTypes/DataTypesNumber.h>
#include <DataTypes/DataTypeNested.h>
@ -66,6 +67,36 @@ DataTypePtr getBaseTypeOfArray(const DataTypePtr & type)
return last_array ? last_array->getNestedType() : type;
}
/// Strips Array wrappers from `type` and, whenever a Tuple is met, steps into the
/// element named by the next entry of `tuple_elements` (entries are consumed in order).
/// Stops at the first type that is neither an Array nor a Tuple, at a Tuple when
/// all names are already consumed, or at a Tuple that has no element with the next name.
/// Returns the type at which the traversal stopped.
DataTypePtr getBaseTypeOfArray(DataTypePtr type, const Names & tuple_elements)
{
    auto next_element = tuple_elements.begin();
    const auto elements_end = tuple_elements.end();

    for (;;)
    {
        /// Arrays are always unwrapped, regardless of the remaining element names.
        if (const auto * array_type = typeid_cast<const DataTypeArray *>(type.get()))
        {
            type = array_type->getNestedType();
            continue;
        }

        const auto * tuple_type = typeid_cast<const DataTypeTuple *>(type.get());
        if (!tuple_type || next_element == elements_end)
            return type;

        const auto position = tuple_type->tryGetPositionByName(*next_element);
        if (!position)
            return type;

        ++next_element;
        type = tuple_type->getElement(*position);
    }
}
ColumnPtr getBaseColumnOfArray(const ColumnPtr & column)
{
/// Get raw pointers to avoid extra copying of column pointers.

View File

@ -27,6 +27,9 @@ size_t getNumberOfDimensions(const IColumn & column);
/// Returns type of scalars of Array of arbitrary dimensions.
DataTypePtr getBaseTypeOfArray(const DataTypePtr & type);
/// The same as above, but also descends into Tuple elements listed in `tuple_elements` (in order), to handle Tuples inside Nested.
DataTypePtr getBaseTypeOfArray(DataTypePtr type, const Names & tuple_elements);
/// Returns Array type with requested scalar type and number of dimensions.
DataTypePtr createArrayOfType(DataTypePtr type, size_t num_dimensions);

View File

@ -195,7 +195,7 @@ public:
/// Types of substreams that can have arbitrary name.
static const std::set<Type> named_types;
Type type;
Type type = Type::Regular;
/// The name of a variant element type.
String variant_element_name;
@ -212,6 +212,7 @@ public:
/// Flag, that may help to traverse substream paths.
mutable bool visited = false;
Substream() = default;
Substream(Type type_) : type(type_) {} /// NOLINT
String toString() const;
};

View File

@ -283,6 +283,9 @@ static ColumnPtr createColumnWithDefaultValue(const IDataType & data_type, const
{
auto column = data_type.createColumnConstWithDefaultValue(num_rows);
/// We must turn a constant column into a full column because the interpreter could infer
/// that it is constant everywhere but in some blocks (from other parts) it can be a full column.
if (subcolumn_name.empty())
return column->convertToFullColumnIfConst();
@ -293,6 +296,35 @@ static ColumnPtr createColumnWithDefaultValue(const IDataType & data_type, const
return ColumnConst::create(std::move(column), num_rows)->convertToFullColumnIfConst();
}
/// Checks whether the storage metadata declares a default for `column`.
/// Returns false when there is no metadata snapshot. The lookup uses the column's
/// own name if the columns description has it, and its name in storage otherwise.
static bool hasDefault(const StorageMetadataPtr & metadata_snapshot, const NameAndTypePair & column)
{
    if (!metadata_snapshot)
        return false;

    const auto & storage_columns = metadata_snapshot->getColumns();
    const auto lookup_name = storage_columns.has(column.name) ? column.name : column.getNameInStorage();
    return storage_columns.hasDefault(lookup_name);
}
/// Removes the path components listed in `tuple_elements` from a dot-separated subcolumn name.
/// For every element, the first component equal to it is removed together with its trailing dot,
/// e.g. ("a.size0", {"a"}) -> "size0". Elements that are not found are ignored.
/// Returns the remaining name without a trailing dot (possibly empty).
static String removeTupleElementsFromSubcolumn(String subcolumn_name, const Names & tuple_elements)
{
    /// Append a dot so that every component, including the last one, is terminated by '.'.
    subcolumn_name += ".";

    for (const auto & elem : tuple_elements)
    {
        /// An empty name cannot denote a component; without this check it would match a bare dot.
        if (elem.empty())
            continue;

        const auto component = elem + ".";

        /// Accept only matches that start at a component boundary,
        /// so that "a." is not found inside "ba.".
        auto pos = subcolumn_name.find(component);
        while (pos != std::string::npos && pos != 0 && subcolumn_name[pos - 1] != '.')
            pos = subcolumn_name.find(component, pos + 1);

        /// Erase the name together with its dot, otherwise a stray '.' is left in the result.
        if (pos != std::string::npos)
            subcolumn_name.erase(pos, component.size());
    }

    if (!subcolumn_name.empty() && subcolumn_name.back() == '.')
        subcolumn_name.pop_back();

    return subcolumn_name;
}
void fillMissingColumns(
Columns & res_columns,
size_t num_rows,
@ -321,10 +353,8 @@ void fillMissingColumns(
if (res_columns[i] && partially_read_columns.contains(requested_column->name))
res_columns[i] = nullptr;
if (res_columns[i])
continue;
if (metadata_snapshot && metadata_snapshot->getColumns().hasDefault(requested_column->getNameInStorage()))
/// Nothing to fill or default should be filled in evaluateMissingDefaults
if (res_columns[i] || hasDefault(metadata_snapshot, *requested_column))
continue;
std::vector<ColumnPtr> current_offsets;
@ -365,19 +395,30 @@ void fillMissingColumns(
if (!current_offsets.empty())
{
Names tuple_elements;
auto serialization = IDataType::getSerialization(*requested_column);
IDataType::forEachSubcolumn([&](const auto & path, const auto &, const auto &)
{
if (path.back().type == ISerialization::Substream::TupleElement)
tuple_elements.push_back(path.back().name_of_substream);
}, ISerialization::SubstreamData(serialization));
size_t num_empty_dimensions = num_dimensions - current_offsets.size();
auto scalar_type = createArrayOfType(getBaseTypeOfArray(requested_column->getTypeInStorage()), num_empty_dimensions);
auto base_type = getBaseTypeOfArray(requested_column->getTypeInStorage(), tuple_elements);
auto scalar_type = createArrayOfType(base_type, num_empty_dimensions);
size_t data_size = assert_cast<const ColumnUInt64 &>(*current_offsets.back()).getData().back();
res_columns[i] = createColumnWithDefaultValue(*scalar_type, requested_column->getSubcolumnName(), data_size);
auto subcolumn_name = removeTupleElementsFromSubcolumn(requested_column->getSubcolumnName(), tuple_elements);
res_columns[i] = createColumnWithDefaultValue(*scalar_type, subcolumn_name, data_size);
for (auto it = current_offsets.rbegin(); it != current_offsets.rend(); ++it)
res_columns[i] = ColumnArray::create(res_columns[i], *it);
}
else
{
/// We must turn a constant column into a full column because the interpreter could infer
/// that it is constant everywhere but in some blocks (from other parts) it can be a full column.
res_columns[i] = createColumnWithDefaultValue(*requested_column->getTypeInStorage(), requested_column->getSubcolumnName(), num_rows);
}
}

View File

@ -44,6 +44,7 @@ IMergeTreeReader::IMergeTreeReader(
, alter_conversions(data_part_info_for_read->getAlterConversions())
/// For wide parts, requested_columns holds plain arrays of Nested converted to subcolumns,
/// which allows using the shared offsets column from the cache.
, original_requested_columns(columns_)
, requested_columns(data_part_info_for_read->isWidePart()
? Nested::convertToSubcolumns(columns_)
: columns_)
@ -139,7 +140,7 @@ void IMergeTreeReader::evaluateMissingDefaults(Block additional_columns, Columns
{
try
{
size_t num_columns = requested_columns.size();
size_t num_columns = original_requested_columns.size();
if (res_columns.size() != num_columns)
throw Exception(ErrorCodes::LOGICAL_ERROR, "invalid number of columns passed to MergeTreeReader::fillMissingColumns. "
@ -151,7 +152,7 @@ void IMergeTreeReader::evaluateMissingDefaults(Block additional_columns, Columns
/// Convert columns list to block. And convert subcolumns to full columns.
/// TODO: rewrite with columns interface. It will be possible after changes in ExpressionActions.
auto it = requested_columns.begin();
auto it = original_requested_columns.begin();
for (size_t pos = 0; pos < num_columns; ++pos, ++it)
{
auto name_in_storage = it->getNameInStorage();
@ -178,7 +179,7 @@ void IMergeTreeReader::evaluateMissingDefaults(Block additional_columns, Columns
}
/// Move columns from block.
it = requested_columns.begin();
it = original_requested_columns.begin();
for (size_t pos = 0; pos < num_columns; ++pos, ++it)
{
auto name_in_storage = it->getNameInStorage();

View File

@ -112,6 +112,9 @@ protected:
private:
/// Columns that are requested to read.
NamesAndTypesList original_requested_columns;
/// The same as above, but with Arrays converted to subcolumns of Nested.
NamesAndTypesList requested_columns;
/// Actual columns description in part.