fix filling of missed Nested columns with multiple levels

Anton Popov 2022-05-11 19:33:45 +00:00
parent 3215c98319
commit 0dbbca5b3e
8 changed files with 108 additions and 38 deletions

View File

@@ -132,29 +132,29 @@ namespace
             offset_values.resize(i);
     }
 }
-
-ColumnPtr arraySizesToOffsets(const IColumn & column)
-{
-    const auto & column_sizes = assert_cast<const ColumnArray::ColumnOffsets &>(column);
-    MutableColumnPtr column_offsets = column_sizes.cloneEmpty();
-
-    if (column_sizes.empty())
-        return column_offsets;
-
-    const auto & sizes_data = column_sizes.getData();
-    auto & offsets_data = assert_cast<ColumnArray::ColumnOffsets &>(*column_offsets).getData();
-
-    offsets_data.resize(sizes_data.size());
-
-    IColumn::Offset prev_offset = 0;
-    for (size_t i = 0, size = sizes_data.size(); i < size; ++i)
-    {
-        prev_offset += sizes_data[i];
-        offsets_data[i] = prev_offset;
-    }
-
-    return column_offsets;
-}
 
 }
 
+ColumnPtr arraySizesToOffsets(const IColumn & column)
+{
+    const auto & column_sizes = assert_cast<const ColumnArray::ColumnOffsets &>(column);
+    MutableColumnPtr column_offsets = column_sizes.cloneEmpty();
+
+    if (column_sizes.empty())
+        return column_offsets;
+
+    const auto & sizes_data = column_sizes.getData();
+    auto & offsets_data = assert_cast<ColumnArray::ColumnOffsets &>(*column_offsets).getData();
+
+    offsets_data.resize(sizes_data.size());
+
+    IColumn::Offset prev_offset = 0;
+    for (size_t i = 0, size = sizes_data.size(); i < size; ++i)
+    {
+        prev_offset += sizes_data[i];
+        offsets_data[i] = prev_offset;
+    }
+
+    return column_offsets;
+}
+
 ColumnPtr arrayOffsetsToSizes(const IColumn & column)
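For orientation: sizes are per-row element counts, and offsets are their running sums, which is exactly what the moved function computes. A minimal standalone sketch of the same conversion, using plain std::vector rather than ClickHouse's column classes (function name and values are illustrative, not part of the commit):

#include <cassert>
#include <cstddef>
#include <cstdint>
#include <vector>

std::vector<uint64_t> sizesToOffsets(const std::vector<uint64_t> & sizes)
{
    std::vector<uint64_t> offsets(sizes.size());
    uint64_t prev_offset = 0;
    for (size_t i = 0; i < sizes.size(); ++i)
    {
        prev_offset += sizes[i];   // running sum of per-row sizes
        offsets[i] = prev_offset;  // one past the position of the row's last element
    }
    return offsets;
}

int main()
{
    // Rows with 2, 0 and 3 elements end at positions 2, 2 and 5.
    assert(sizesToOffsets({2, 0, 3}) == (std::vector<uint64_t>{2, 2, 5}));
}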

View File

@@ -80,5 +80,6 @@ private:
 };
 
 ColumnPtr arrayOffsetsToSizes(const IColumn & column);
+ColumnPtr arraySizesToOffsets(const IColumn & column);
 
 }

View File

@@ -12,6 +12,7 @@
 #include <Parsers/ASTFunction.h>
 #include <utility>
 #include <DataTypes/DataTypesNumber.h>
+#include <DataTypes/ObjectUtils.h>
 #include <Interpreters/RequiredSourceColumnsVisitor.h>
 #include <Common/checkStackSize.h>
 #include <Storages/ColumnsDescription.h>
@@ -198,6 +199,9 @@ static bool arrayHasNoElementsRead(const IColumn & column)
     if (!size)
         return false;
 
+    if (const auto * nested_array = typeid_cast<const ColumnArray *>(&column_array->getData()))
+        return arrayHasNoElementsRead(*nested_array);
+
     size_t data_size = column_array->getData().size();
     if (data_size)
         return false;
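Why the recursion matters: for a column like Array(Array(T)) where only offsets were read, every outer level legitimately has non-empty offsets, and only the innermost level reveals that no elements were read. A standalone sketch of the check, with a toy stand-in for ColumnArray (type and field names are illustrative, not ClickHouse API):

#include <cassert>
#include <cstddef>
#include <vector>

// Toy stand-in for ColumnArray: cumulative offsets plus either a nested
// array level or a count of scalar elements at the innermost level.
struct ToyArrayColumn
{
    std::vector<size_t> offsets;
    const ToyArrayColumn * nested = nullptr;
    size_t scalar_data_size = 0;
};

bool hasNoElementsRead(const ToyArrayColumn & column)
{
    if (column.offsets.empty())
        return false;
    if (column.nested)  // descend to the innermost level, as the added lines do
        return hasNoElementsRead(*column.nested);
    if (column.scalar_data_size)
        return false;
    return column.offsets.back() != 0;  // offsets were read, elements were not
}

int main()
{
    // Array(Array(T)): outer offsets describe 2 rows, inner offsets promise
    // 5 elements, but no scalar data was read at all.
    ToyArrayColumn inner{{2, 5}, nullptr, 0};
    ToyArrayColumn outer{{1, 2}, &inner, 0};
    assert(hasNoElementsRead(outer));
}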
@@ -210,6 +214,7 @@ void fillMissingColumns(
     Columns & res_columns,
     size_t num_rows,
     const NamesAndTypesList & requested_columns,
+    const NamesAndTypesList & available_columns,
     StorageMetadataPtr metadata_snapshot)
 {
     size_t num_columns = requested_columns.size();
@@ -224,26 +229,35 @@
     /// First, collect offset columns for all arrays in the block.
 
     std::unordered_map<String, ColumnPtr> offset_columns;
-    auto requested_column = requested_columns.begin();
-    for (size_t i = 0; i < num_columns; ++i, ++requested_column)
+    auto available_column = available_columns.begin();
+    for (size_t i = 0; i < num_columns; ++i, ++available_column)
     {
         if (res_columns[i] == nullptr)
             continue;
 
-        if (const auto * array = typeid_cast<const ColumnArray *>(res_columns[i].get()))
+        auto serialization = IDataType::getSerialization(*available_column);
+        auto name_in_storage = Nested::extractTableName(available_column->name);
+
+        ISerialization::SubstreamPath path;
+        serialization->enumerateStreams(path, [&](const auto & subpath)
         {
-            String offsets_name = Nested::extractTableName(requested_column->name);
-            auto & offsets_column = offset_columns[offsets_name];
+            if (subpath.empty() || subpath.back().type != ISerialization::Substream::ArraySizes)
+                return;
+
+            auto subname = ISerialization::getSubcolumnNameForStream(subpath);
+            auto & offsets_column = offset_columns[Nested::concatenateName(name_in_storage, subname)];
 
             /// If for some reason multiple offsets columns are present for the same nested data structure,
             /// choose the one that is not empty.
             /// TODO: more optimal
             if (!offsets_column || offsets_column->empty())
-                offsets_column = array->getOffsetsPtr();
-        }
+                offsets_column = arraySizesToOffsets(*subpath.back().data.column);
+
+        }, {serialization, available_column->type, res_columns[i], nullptr});
     }
 
     /// insert default values only for columns without default expressions
-    requested_column = requested_columns.begin();
+    auto requested_column = requested_columns.begin();
     for (size_t i = 0; i < num_columns; ++i, ++requested_column)
     {
         const auto & [name, type] = *requested_column;
@@ -256,19 +270,44 @@
             if (metadata_snapshot && metadata_snapshot->getColumns().hasDefault(name))
                 continue;
 
-            String offsets_name = Nested::extractTableName(name);
-            auto offset_it = offset_columns.find(offsets_name);
+            std::vector<ColumnPtr> current_offsets;
+            bool has_all_offsets = true;
 
             const auto * array_type = typeid_cast<const DataTypeArray *>(type.get());
-            if (offset_it != offset_columns.end() && array_type)
+            if (array_type)
             {
-                const auto & nested_type = array_type->getNestedType();
-                ColumnPtr offsets_column = offset_it->second;
-                size_t nested_rows = typeid_cast<const ColumnUInt64 &>(*offsets_column).getData().back();
+                auto serialization = IDataType::getSerialization(*requested_column);
+                auto name_in_storage = Nested::extractTableName(requested_column->name);
 
-                ColumnPtr nested_column =
-                    nested_type->createColumnConstWithDefaultValue(nested_rows)->convertToFullColumnIfConst();
+                ISerialization::SubstreamPath path;
+                serialization->enumerateStreams(path, [&](const auto & subpath)
+                {
+                    if (!has_all_offsets)
+                        return;
 
-                res_columns[i] = ColumnArray::create(nested_column, offsets_column);
+                    if (subpath.empty() || subpath.back().type != ISerialization::Substream::ArraySizes)
+                        return;
+
+                    auto subname = ISerialization::getSubcolumnNameForStream(subpath);
+                    auto it = offset_columns.find(Nested::concatenateName(name_in_storage, subname));
+                    if (it != offset_columns.end())
+                        current_offsets.emplace_back(it->second);
+                    else
+                        has_all_offsets = false;
+
+                }, {serialization, type, nullptr, nullptr});
+            }
+
+            if (array_type && has_all_offsets)
+            {
+                assert(!current_offsets.empty());
+                auto scalar_type = getBaseTypeOfArray(type);
+
+                size_t data_size = assert_cast<const ColumnUInt64 &>(*current_offsets.back()).getData().back();
+                res_columns[i] = scalar_type->createColumnConstWithDefaultValue(data_size)->convertToFullColumnIfConst();
+
+                for (auto it = current_offsets.rbegin(); it != current_offsets.rend(); ++it)
+                    res_columns[i] = ColumnArray::create(res_columns[i], *it);
             }
             else
             {
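The effect of the new branch: the last value of the innermost offsets gives the number of scalar defaults to materialize, and each collected offsets column then wraps one array level, innermost first, mirroring the rbegin/rend loop above. A standalone sketch of that reconstruction for a missed Array(Array(String)) column, using plain vectors instead of ClickHouse columns (the concrete offset values are illustrative):

#include <cassert>
#include <cstddef>
#include <string>
#include <vector>

int main()
{
    // Offsets collected for the missed column, outermost level first
    // (cumulative form, as in ClickHouse offsets): 2 rows with 1 nested
    // array each; each nested array has 2 elements.
    std::vector<size_t> outer_offsets = {1, 2};
    std::vector<size_t> inner_offsets = {2, 4};

    // The innermost offsets' last value = number of scalar defaults to create.
    size_t data_size = inner_offsets.back();
    std::vector<std::string> data(data_size);  // 4 default-constructed strings

    // Apply the inner offsets: 4 strings become 2 arrays of strings.
    std::vector<std::vector<std::string>> inner;
    for (size_t i = 0, prev = 0; i < inner_offsets.size(); ++i)
    {
        inner.emplace_back(data.begin() + prev, data.begin() + inner_offsets[i]);
        prev = inner_offsets[i];
    }

    // Apply the outer offsets: 2 arrays become 2 rows of Array(Array(String)).
    std::vector<std::vector<std::vector<std::string>>> rows;
    for (size_t i = 0, prev = 0; i < outer_offsets.size(); ++i)
    {
        rows.emplace_back(inner.begin() + prev, inner.begin() + outer_offsets[i]);
        prev = outer_offsets[i];
    }

    // Two rows of [['','']]: correct nested lengths, default scalar values.
    assert(rows.size() == 2 && rows[0][0].size() == 2 && rows[0][0][0].empty());
}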

View File

@@ -43,6 +43,7 @@ void fillMissingColumns(
     Columns & res_columns,
     size_t num_rows,
     const NamesAndTypesList & requested_columns,
+    const NamesAndTypesList & available_columns,
     StorageMetadataPtr metadata_snapshot);
 
 }

View File

@@ -66,7 +66,11 @@ void IMergeTreeReader::fillMissingColumns(Columns & res_columns, bool & should_e
 {
     try
     {
-        DB::fillMissingColumns(res_columns, num_rows, columns, metadata_snapshot);
+        NamesAndTypesList available_columns;
+        for (const auto & column : columns)
+            available_columns.push_back(getColumnFromPart(column));
+
+        DB::fillMissingColumns(res_columns, num_rows, columns, available_columns, metadata_snapshot);
         should_evaluate_missing_defaults = std::any_of(
             res_columns.begin(), res_columns.end(), [](const auto & column) { return column == nullptr; });
     }

View File

@@ -91,7 +91,7 @@ protected:
             ++name_and_type;
         }
 
-        fillMissingColumns(columns, src.rows(), column_names_and_types, /*metadata_snapshot=*/ nullptr);
+        fillMissingColumns(columns, src.rows(), column_names_and_types, column_names_and_types, /*metadata_snapshot=*/ nullptr);
         assert(std::all_of(columns.begin(), columns.end(), [](const auto & column) { return column != nullptr; }));
 
         return Chunk(std::move(columns), src.rows());

View File

@@ -0,0 +1,7 @@
+Tuple(arr Nested(k1 Nested(k2 String, k3 String, k4 Int8), k5 Tuple(k6 String)), id Int8)
+{"obj":{"arr":[{"k1":[{"k2":"aaa","k3":"bbb","k4":0},{"k2":"ccc","k3":"","k4":0}],"k5":{"k6":""}}],"id":1}}
+{"obj":{"arr":[{"k1":[{"k2":"","k3":"ddd","k4":10},{"k2":"","k3":"","k4":20}],"k5":{"k6":"foo"}}],"id":2}}
+[['bbb','']] [['aaa','ccc']]
+[['ddd','']] [['','']]
+[[0,0]]
+[[10,20]]

View File

@@ -0,0 +1,18 @@
+-- Tags: no-fasttest
+
+DROP TABLE IF EXISTS t_json_17;
+SET allow_experimental_object_type = 1;
+SET output_format_json_named_tuples_as_objects = 1;
+
+CREATE TABLE t_json_17(obj JSON)
+ENGINE = MergeTree ORDER BY tuple();
+
+INSERT INTO t_json_17 FORMAT JSONAsObject {"id": 1, "arr": [{"k1": [{"k2": "aaa", "k3": "bbb"}, {"k2": "ccc"}]}]}
+INSERT INTO t_json_17 FORMAT JSONAsObject {"id": 2, "arr": [{"k1": [{"k3": "ddd", "k4": 10}, {"k4": 20}], "k5": {"k6": "foo"}}]}
+
+SELECT toTypeName(obj) FROM t_json_17 LIMIT 1;
+SELECT obj FROM t_json_17 ORDER BY obj.id FORMAT JSONEachRow;
+SELECT obj.arr.k1.k3, obj.arr.k1.k2 FROM t_json_17 ORDER BY obj.id;
+SELECT obj.arr.k1.k4 FROM t_json_17 ORDER BY obj.id;
+
+DROP TABLE IF EXISTS t_json_17;