From 0a7895ebb974de4989b46d696b530650350c39c4 Mon Sep 17 00:00:00 2001 From: Anton Popov Date: Thu, 17 Feb 2022 22:00:25 +0300 Subject: [PATCH] add comments and small refactoring --- src/Columns/ColumnObject.cpp | 107 ++++++------- src/Columns/ColumnObject.h | 59 ++++++- src/Common/config.h.in | 1 - src/Core/Field.h | 42 ++--- src/DataTypes/DataTypeObject.cpp | 2 +- src/DataTypes/IDataType.h | 5 +- src/DataTypes/ObjectUtils.cpp | 145 +++++++++--------- src/DataTypes/ObjectUtils.h | 52 ++++++- src/DataTypes/Serializations/ISerialization.h | 1 + .../Serializations/SerializationObject.cpp | 14 +- src/DataTypes/Serializations/SubcolumnsTree.h | 69 ++++----- src/DataTypes/getLeastSupertype.h | 1 - src/Storages/IStorage.cpp | 1 - 13 files changed, 298 insertions(+), 201 deletions(-) diff --git a/src/Columns/ColumnObject.cpp b/src/Columns/ColumnObject.cpp index f653c322abd..faa307e6dfb 100644 --- a/src/Columns/ColumnObject.cpp +++ b/src/Columns/ColumnObject.cpp @@ -1,13 +1,10 @@ #include #include -#include -#include +#include #include #include #include #include -#include -#include #include #include #include @@ -15,8 +12,6 @@ #include #include -#include - namespace DB { @@ -32,6 +27,7 @@ namespace ErrorCodes namespace { +/// Recreates scolumn with default scalar values and keeps sizes of arrays. ColumnPtr recreateColumnWithDefaultValues( const ColumnPtr & column, const DataTypePtr & scalar_type, size_t num_dimensions) { @@ -47,43 +43,44 @@ ColumnPtr recreateColumnWithDefaultValues( return createArrayOfType(scalar_type, num_dimensions)->createColumn()->cloneResized(column->size()); } +/// Replaces NULL fields to given field or empty array. 
class FieldVisitorReplaceNull : public StaticVisitor { public: - [[maybe_unused]] explicit FieldVisitorReplaceNull( + explicit FieldVisitorReplaceNull( const Field & replacement_, size_t num_dimensions_) : replacement(replacement_) , num_dimensions(num_dimensions_) { } - template - Field operator()(const T & x) const + Field operator()(const Null &) const { - if constexpr (std::is_same_v) - { - return num_dimensions - ? createEmptyArrayField(num_dimensions) - : replacement; - } - else if constexpr (std::is_same_v) - { - assert(num_dimensions > 0); - const size_t size = x.size(); - Array res(size); - for (size_t i = 0; i < size; ++i) - res[i] = applyVisitor(FieldVisitorReplaceNull(replacement, num_dimensions - 1), x[i]); - return res; - } - else - return x; + return num_dimensions + ? createEmptyArrayField(num_dimensions) + : replacement; } + Field operator()(const Array & x) const + { + assert(num_dimensions > 0); + const size_t size = x.size(); + Array res(size); + for (size_t i = 0; i < size; ++i) + res[i] = applyVisitor(FieldVisitorReplaceNull(replacement, num_dimensions - 1), x[i]); + return res; + } + + template + Field operator()(const T & x) const { return x; } + private: const Field & replacement; size_t num_dimensions; }; +/// Calculates number of dimensions in array field. +/// Returns 0 for scalar fields. class FieldVisitorToNumberOfDimensions : public StaticVisitor { public: @@ -114,6 +111,9 @@ public: size_t operator()(const T &) const { return 0; } }; +/// Visitor that allows to get type of scalar field +/// or least common type of scalars in array. +/// More optimized version of FieldToDataType. 
class FieldVisitorToScalarType : public StaticVisitor<> { public: @@ -160,8 +160,7 @@ public: template void operator()(const T &) { - auto field_type = Field::TypeToEnum>::value; - field_types.insert(field_type); + field_types.insert(Field::TypeToEnum>::value); type_indexes.insert(TypeToTypeIndex>); } @@ -280,18 +279,10 @@ void ColumnObject::Subcolumn::insert(Field field, FieldInfo info) if (is_nullable) base_type = makeNullable(base_type); - DataTypePtr value_type; if (!is_nullable && info.have_nulls) - { - auto default_value = base_type->getDefault(); - value_type = createArrayOfType(base_type, value_dim); - field = applyVisitor(FieldVisitorReplaceNull(default_value, value_dim), std::move(field)); - } - else - { - value_type = createArrayOfType(base_type, value_dim); - } + field = applyVisitor(FieldVisitorReplaceNull(base_type->getDefault(), value_dim), std::move(field)); + auto value_type = createArrayOfType(base_type, value_dim); bool type_changed = false; if (data.empty()) @@ -311,12 +302,9 @@ void ColumnObject::Subcolumn::insert(Field field, FieldInfo info) } if (type_changed || info.need_convert) - { - auto converted_field = convertFieldToTypeOrThrow(std::move(field), *value_type); - data.back()->insert(std::move(converted_field)); - } - else - data.back()->insert(std::move(field)); + field = convertFieldToTypeOrThrow(std::move(field), *value_type); + + data.back()->insert(std::move(field)); } void ColumnObject::Subcolumn::insertRangeFrom(const Subcolumn & src, size_t start, size_t length) @@ -372,6 +360,10 @@ void ColumnObject::Subcolumn::finalize() auto offsets = ColumnUInt64::create(); auto & offsets_data = offsets->getData(); + /// We need to convert only non-default values and then recreate column + /// with default value of new type, because default values (which represents misses in data) + /// may be inconsistent between types (e.g "0" in UInt64 and empty string in String). 
+ part->getIndicesOfNonDefaultRows(offsets_data, 0, part_size); if (offsets->size() == part_size) @@ -448,16 +440,16 @@ Field ColumnObject::Subcolumn::getLastField() const ColumnObject::Subcolumn ColumnObject::Subcolumn::recreateWithDefaultValues(const FieldInfo & field_info) const { + auto scalar_type = field_info.scalar_type; + if (is_nullable) + scalar_type = makeNullable(scalar_type); + Subcolumn new_subcolumn; - new_subcolumn.least_common_type = createArrayOfType(field_info.scalar_type, field_info.num_dimensions); + new_subcolumn.least_common_type = createArrayOfType(scalar_type, field_info.num_dimensions); new_subcolumn.is_nullable = is_nullable; new_subcolumn.num_of_defaults_in_prefix = num_of_defaults_in_prefix; new_subcolumn.data.reserve(data.size()); - auto scalar_type = field_info.scalar_type; - if (new_subcolumn.is_nullable) - scalar_type = makeNullable(scalar_type); - for (const auto & part : data) new_subcolumn.data.push_back(recreateColumnWithDefaultValues( part, scalar_type, field_info.num_dimensions)); @@ -524,6 +516,7 @@ size_t ColumnObject::size() const MutableColumnPtr ColumnObject::cloneResized(size_t new_size) const { + /// cloneResized with new_size == 0 is used for cloneEmpty(). 
if (new_size != 0) throw Exception(ErrorCodes::NOT_IMPLEMENTED, "ColumnObject doesn't support resize to non-zero length"); @@ -663,7 +656,7 @@ const ColumnObject::Subcolumn & ColumnObject::getSubcolumn(const PathInData & ke ColumnObject::Subcolumn & ColumnObject::getSubcolumn(const PathInData & key) { if (const auto * node = subcolumns.findLeaf(key)) - return const_cast(node)->data; + return const_cast(node)->data; throw Exception(ErrorCodes::ILLEGAL_COLUMN, "There is no subcolumn {} in ColumnObject", key.getPath()); } @@ -702,23 +695,29 @@ void ColumnObject::addNestedSubcolumn(const PathInData & key, const FieldInfo & "Cannot add Nested subcolumn, because path doesn't contain Nested"); bool inserted = false; + /// We find node that represents the same Nested type as @key. const auto * nested_node = subcolumns.findBestMatch(key); if (nested_node) { + /// Find any leaf of Nested subcolumn. const auto * leaf = subcolumns.findLeaf(nested_node, [&](const auto &) { return true; }); assert(leaf); + /// Recreate subcolumn with default values and the same sizes of arrays. auto new_subcolumn = leaf->data.recreateWithDefaultValues(field_info); + + /// It's possible that we have already inserted value from current row + /// to this subcolumn. So, adjust size to expected. if (new_subcolumn.size() > new_size) new_subcolumn.popBack(new_subcolumn.size() - new_size); - else if (new_subcolumn.size() < new_size) - new_subcolumn.insertManyDefaults(new_size - new_subcolumn.size()); + assert(new_subcolumn.size() == new_size); inserted = subcolumns.add(key, new_subcolumn); } else { + /// If node was not found just add subcolumn with empty arrays. inserted = subcolumns.add(key, Subcolumn(new_size, is_nullable)); } @@ -751,6 +750,8 @@ void ColumnObject::finalize() for (auto && entry : subcolumns) { const auto & least_common_type = entry->data.getLeastCommonType(); + + /// Do not add subcolumns, which consists only from NULLs. 
if (isNothing(getBaseTypeOfArray(least_common_type))) continue; @@ -758,6 +759,8 @@ void ColumnObject::finalize() new_subcolumns.add(entry->path, std::move(entry->data)); } + /// If all subcolumns were skipped add a dummy subcolumn, + /// because Tuple type must have at least one element. if (new_subcolumns.empty()) new_subcolumns.add(PathInData{COLUMN_NAME_DUMMY}, Subcolumn{ColumnUInt8::create(old_size), is_nullable}); diff --git a/src/Columns/ColumnObject.h b/src/Columns/ColumnObject.h index 361767d6e20..261df2ef7b4 100644 --- a/src/Columns/ColumnObject.h +++ b/src/Columns/ColumnObject.h @@ -18,19 +18,43 @@ namespace ErrorCodes extern const int LOGICAL_ERROR; } +/// Info that represents a scalar or array field in a decomposed view. +/// It allows to recreate field with different number +/// of dimensions or nullability. struct FieldInfo { + /// The common type of of all scalars in field. DataTypePtr scalar_type; + + /// Do we have NULL scalar in field. bool have_nulls; + + /// If true then we have scalars with different types in array and + /// we need to convert scalars to the common type. bool need_convert; + + /// Number of dimension in array. 0 if field is scalar. size_t num_dimensions; }; FieldInfo getFieldInfo(const Field & field); +/** A column that represents object with dynamic set of subcolumns. + * Subcolumns are identified by paths in document and are stored in + * a trie-like structure. ColumnObject is not suitable for writing into tables + * and it should be converted to Tuple with fixed set of subcolumns before that. + */ class ColumnObject final : public COWHelper { public: + /** Class that represents one subcolumn. + * It stores values in several parts of column + * and keeps current common type of all parts. + * We add a new column part with a new type, when we insert a field, + * which can't be converted to the current common type. + * After insertion of all values subcolumn should be finalized + * for writing and other operations. 
+ */ class Subcolumn { public: @@ -44,8 +68,12 @@ public: bool isFinalized() const { return data.size() == 1 && num_of_defaults_in_prefix == 0; } const DataTypePtr & getLeastCommonType() const { return least_common_type; } + + /// Checks the consistency of column's parts stored in @data. void checkTypes() const; + /// Inserts a field, which scalars can be arbitrary, but number of + /// dimensions should be consistent with current common type. void insert(Field field); void insert(Field field, FieldInfo info); @@ -54,11 +82,19 @@ public: void insertRangeFrom(const Subcolumn & src, size_t start, size_t length); void popBack(size_t n); + /// Converts all column's parts to the common type and + /// creates a single column that stores all values. void finalize(); + /// Returns last inserted field. Field getLastField() const; + + /// Recreates subcolumn with default scalar values and keeps sizes of arrays. + /// Used to create columns of type Nested with consistent array sizes. Subcolumn recreateWithDefaultValues(const FieldInfo & field_info) const; + /// Returns single column if subcolumn in finalizes. + /// Otherwise -- undefined behaviour. IColumn & getFinalizedColumn(); const IColumn & getFinalizedColumn() const; const ColumnPtr & getFinalizedColumnPtr() const; @@ -66,15 +102,28 @@ public: friend class ColumnObject; private: + /// Current least common type of all values inserted to this subcolumn. DataTypePtr least_common_type; + + /// If true then common type type of subcolumn is Nullable + /// and default values are NULLs. bool is_nullable = false; + + /// Parts of column. Parts should be in increasing order in terms of subtypes/supertypes. + /// That means that the least common type for i-th prefix is the type of i-th part + /// and it's the supertype for all type of column from 0 to i-1. 
std::vector data; + + /// Until we insert any non-default field we don't know further + /// least common type and we count number of defaults in prefix, + /// which will be converted to the default type of final common type. size_t num_of_defaults_in_prefix = 0; }; using SubcolumnsTree = SubcolumnsTree; private: + /// If true then all subcolumns are nullable. const bool is_nullable; SubcolumnsTree subcolumns; @@ -86,6 +135,7 @@ public: explicit ColumnObject(bool is_nullable_); ColumnObject(SubcolumnsTree && subcolumns_, bool is_nullable_); + /// Checks that all subcolumns have consistent sizes. void checkConsistency() const; bool hasSubcolumn(const PathInData & key) const; @@ -95,16 +145,23 @@ public: void incrementNumRows() { ++num_rows; } + /// Adds a subcolumn from existing IColumn. void addSubcolumn(const PathInData & key, MutableColumnPtr && subcolumn); + + /// Adds a subcolumn of specific size with default values. void addSubcolumn(const PathInData & key, size_t new_size); + + /// Adds a subcolumn of type Nested of specific size with default values. + /// It cares about consistency of sizes of Nested arrays. void addNestedSubcolumn(const PathInData & key, const FieldInfo & field_info, size_t new_size); const SubcolumnsTree & getSubcolumns() const { return subcolumns; } SubcolumnsTree & getSubcolumns() { return subcolumns; } PathsInData getKeys() const; - bool isFinalized() const; + /// Finalizes all subcolumns. 
void finalize(); + bool isFinalized() const; /// Part of interface diff --git a/src/Common/config.h.in b/src/Common/config.h.in index 945f85970fe..d8d308c59bd 100644 --- a/src/Common/config.h.in +++ b/src/Common/config.h.in @@ -13,7 +13,6 @@ #cmakedefine01 USE_CASSANDRA #cmakedefine01 USE_SENTRY #cmakedefine01 USE_GRPC -#cmakedefine01 USE_STATS #cmakedefine01 USE_SIMDJSON #cmakedefine01 USE_RAPIDJSON diff --git a/src/Core/Field.h b/src/Core/Field.h index a9c067fd487..8b3bd425139 100644 --- a/src/Core/Field.h +++ b/src/Core/Field.h @@ -759,27 +759,27 @@ private: using Row = std::vector; -template <> struct Field::TypeToEnum { static const Types::Which value = Types::Null; }; -template <> struct Field::TypeToEnum { static const Types::Which value = Types::UInt64; }; -template <> struct Field::TypeToEnum { static const Types::Which value = Types::UInt128; }; -template <> struct Field::TypeToEnum { static const Types::Which value = Types::UInt256; }; -template <> struct Field::TypeToEnum { static const Types::Which value = Types::Int64; }; -template <> struct Field::TypeToEnum { static const Types::Which value = Types::Int128; }; -template <> struct Field::TypeToEnum { static const Types::Which value = Types::Int256; }; -template <> struct Field::TypeToEnum { static const Types::Which value = Types::UUID; }; -template <> struct Field::TypeToEnum { static const Types::Which value = Types::Float64; }; -template <> struct Field::TypeToEnum { static const Types::Which value = Types::String; }; -template <> struct Field::TypeToEnum { static const Types::Which value = Types::Array; }; -template <> struct Field::TypeToEnum { static const Types::Which value = Types::Tuple; }; -template <> struct Field::TypeToEnum { static const Types::Which value = Types::Map; }; -template <> struct Field::TypeToEnum { static const Types::Which value = Types::Object; }; -template <> struct Field::TypeToEnum>{ static const Types::Which value = Types::Decimal32; }; -template <> struct 
Field::TypeToEnum>{ static const Types::Which value = Types::Decimal64; }; -template <> struct Field::TypeToEnum>{ static const Types::Which value = Types::Decimal128; }; -template <> struct Field::TypeToEnum>{ static const Types::Which value = Types::Decimal256; }; -template <> struct Field::TypeToEnum>{ static const Types::Which value = Types::Decimal64; }; -template <> struct Field::TypeToEnum{ static const Types::Which value = Types::AggregateFunctionState; }; -template <> struct Field::TypeToEnum{ static const Types::Which value = Types::Bool; }; +template <> struct Field::TypeToEnum { static constexpr Types::Which value = Types::Null; }; +template <> struct Field::TypeToEnum { static constexpr Types::Which value = Types::UInt64; }; +template <> struct Field::TypeToEnum { static constexpr Types::Which value = Types::UInt128; }; +template <> struct Field::TypeToEnum { static constexpr Types::Which value = Types::UInt256; }; +template <> struct Field::TypeToEnum { static constexpr Types::Which value = Types::Int64; }; +template <> struct Field::TypeToEnum { static constexpr Types::Which value = Types::Int128; }; +template <> struct Field::TypeToEnum { static constexpr Types::Which value = Types::Int256; }; +template <> struct Field::TypeToEnum { static constexpr Types::Which value = Types::UUID; }; +template <> struct Field::TypeToEnum { static constexpr Types::Which value = Types::Float64; }; +template <> struct Field::TypeToEnum { static constexpr Types::Which value = Types::String; }; +template <> struct Field::TypeToEnum { static constexpr Types::Which value = Types::Array; }; +template <> struct Field::TypeToEnum { static constexpr Types::Which value = Types::Tuple; }; +template <> struct Field::TypeToEnum { static constexpr Types::Which value = Types::Map; }; +template <> struct Field::TypeToEnum { static constexpr Types::Which value = Types::Object; }; +template <> struct Field::TypeToEnum>{ static constexpr Types::Which value = Types::Decimal32; }; 
+template <> struct Field::TypeToEnum>{ static constexpr Types::Which value = Types::Decimal64; }; +template <> struct Field::TypeToEnum>{ static constexpr Types::Which value = Types::Decimal128; }; +template <> struct Field::TypeToEnum>{ static constexpr Types::Which value = Types::Decimal256; }; +template <> struct Field::TypeToEnum>{ static constexpr Types::Which value = Types::Decimal64; }; +template <> struct Field::TypeToEnum{ static constexpr Types::Which value = Types::AggregateFunctionState; }; +template <> struct Field::TypeToEnum{ static constexpr Types::Which value = Types::Bool; }; template <> struct Field::EnumToType { using Type = Null; }; template <> struct Field::EnumToType { using Type = UInt64; }; diff --git a/src/DataTypes/DataTypeObject.cpp b/src/DataTypes/DataTypeObject.cpp index b706e61b3c1..e05add0b91b 100644 --- a/src/DataTypes/DataTypeObject.cpp +++ b/src/DataTypes/DataTypeObject.cpp @@ -26,7 +26,7 @@ DataTypeObject::DataTypeObject(const String & schema_format_, bool is_nullable_) bool DataTypeObject::equals(const IDataType & rhs) const { if (const auto * object = typeid_cast(&rhs)) - return schema_format == object->schema_format; + return schema_format == object->schema_format && is_nullable == object->is_nullable; return false; } diff --git a/src/DataTypes/IDataType.h b/src/DataTypes/IDataType.h index d8e51eb22d0..ae7d4e99abb 100644 --- a/src/DataTypes/IDataType.h +++ b/src/DataTypes/IDataType.h @@ -408,7 +408,10 @@ inline bool isNothing(const DataTypePtr & data_type) { return WhichDataType(data inline bool isUUID(const DataTypePtr & data_type) { return WhichDataType(data_type).isUUID(); } template -inline bool isObject(const T & data_type) {return WhichDataType(data_type).isObject(); } +inline bool isObject(const T & data_type) +{ + return WhichDataType(data_type).isObject(); +} template inline bool isUInt8(const T & data_type) diff --git a/src/DataTypes/ObjectUtils.cpp b/src/DataTypes/ObjectUtils.cpp index 889091f9e98..0f09729d009 
100644 --- a/src/DataTypes/ObjectUtils.cpp +++ b/src/DataTypes/ObjectUtils.cpp @@ -6,25 +6,18 @@ #include #include #include -#include #include #include #include #include #include #include -#include -#include #include #include #include #include #include -#include -#include -#include - namespace DB { @@ -52,8 +45,9 @@ size_t getNumberOfDimensions(const IColumn & column) DataTypePtr getBaseTypeOfArray(const DataTypePtr & type) { + /// Get raw pointers to avoid extra copying of type pointers. const DataTypeArray * last_array = nullptr; - const IDataType * current_type = type.get(); + const auto * current_type = type.get(); while (const auto * type_array = typeid_cast(current_type)) { current_type = type_array->getNestedType().get(); @@ -65,8 +59,9 @@ DataTypePtr getBaseTypeOfArray(const DataTypePtr & type) ColumnPtr getBaseColumnOfArray(const ColumnPtr & column) { + /// Get raw pointers to avoid extra copying of column pointers. const ColumnArray * last_array = nullptr; - const IColumn * current_column = column.get(); + const auto * current_column = column.get(); while (const auto * column_array = checkAndGetColumn(current_column)) { current_column = &column_array->getData(); @@ -92,6 +87,9 @@ ColumnPtr createArrayOfColumn(ColumnPtr column, size_t num_dimensions) Array createEmptyArrayField(size_t num_dimensions) { + if (num_dimensions == 0) + throw Exception(ErrorCodes::LOGICAL_ERROR, "Cannot create array field with 0 dimensions"); + Array array; Array * current_array = &array; for (size_t i = 1; i < num_dimensions; ++i) @@ -138,53 +136,53 @@ void convertObjectsToTuples(NamesAndTypesList & columns_list, Block & block, con for (auto & name_type : columns_list) { - if (isObject(name_type.type)) + if (!isObject(name_type.type)) + continue; + + auto & column = block.getByName(name_type.name); + if (!isObject(column.type)) + throw Exception(ErrorCodes::TYPE_MISMATCH, + "Type for column '{}' mismatch in columns list and in block. 
In list: {}, in block: {}", + name_type.name, name_type.type->getName(), column.type->getName()); + + const auto & column_object = assert_cast(*column.column); + const auto & subcolumns = column_object.getSubcolumns(); + + if (!column_object.isFinalized()) + throw Exception(ErrorCodes::LOGICAL_ERROR, + "Cannot convert to tuple column '{}' from type {}. Column should be finalized first", + name_type.name, name_type.type->getName()); + + PathsInData tuple_paths; + DataTypes tuple_types; + Columns tuple_columns; + + for (const auto & entry : subcolumns) { - auto & column = block.getByName(name_type.name); - - if (!isObject(column.type)) - throw Exception(ErrorCodes::TYPE_MISMATCH, - "Type for column '{}' mismatch in columns list and in block. In list: {}, in block: {}", - name_type.name, name_type.type->getName(), column.type->getName()); - - const auto & column_object = assert_cast(*column.column); - const auto & subcolumns_map = column_object.getSubcolumns(); - - if (!column_object.isFinalized()) - throw Exception(ErrorCodes::LOGICAL_ERROR, - "Cannot convert to tuple column '{}' from type {}. 
Column should be finalized first", - name_type.name, name_type.type->getName()); - - PathsInData tuple_paths; - DataTypes tuple_types; - Columns tuple_columns; - - for (const auto & entry : subcolumns_map) - { - tuple_paths.emplace_back(entry->path); - tuple_types.emplace_back(entry->data.getLeastCommonType()); - tuple_columns.emplace_back(entry->data.getFinalizedColumnPtr()); - } - - auto it = storage_columns_map.find(name_type.name); - if (it == storage_columns_map.end()) - throw Exception(ErrorCodes::LOGICAL_ERROR, "Column '{}' not found in storage", name_type.name); - - std::tie(column.column, column.type) = unflattenTuple(tuple_paths, tuple_types, tuple_columns); - name_type.type = column.type; - - getLeastCommonTypeForObject({column.type, it->second}, true); + tuple_paths.emplace_back(entry->path); + tuple_types.emplace_back(entry->data.getLeastCommonType()); + tuple_columns.emplace_back(entry->data.getFinalizedColumnPtr()); } + + auto it = storage_columns_map.find(name_type.name); + if (it == storage_columns_map.end()) + throw Exception(ErrorCodes::LOGICAL_ERROR, "Column '{}' not found in storage", name_type.name); + + std::tie(column.column, column.type) = unflattenTuple(tuple_paths, tuple_types, tuple_columns); + name_type.type = column.type; + + /// Check that constructed Tuple type and type in storage are compatible. 
+ getLeastCommonTypeForObject({column.type, it->second}, true); } } -static bool isPrefix(const PathInData::Parts & prefix, const PathInData::Parts & strings) +static bool isPrefix(const PathInData::Parts & prefix, const PathInData::Parts & parts) { - if (prefix.size() > strings.size()) + if (prefix.size() > parts.size()) return false; for (size_t i = 0; i < prefix.size(); ++i) - if (prefix[i].key != strings[i].key) + if (prefix[i].key != parts[i].key) return false; return true; } @@ -192,19 +190,15 @@ static bool isPrefix(const PathInData::Parts & prefix, const PathInData::Parts & void checkObjectHasNoAmbiguosPaths(const PathsInData & paths) { size_t size = paths.size(); - std::vector names_parts(size); - - for (size_t i = 0; i < size; ++i) - names_parts[i] = paths[i].getParts(); - for (size_t i = 0; i < size; ++i) { for (size_t j = 0; j < i; ++j) { - if (isPrefix(names_parts[i], names_parts[j]) || isPrefix(names_parts[j], names_parts[i])) + if (isPrefix(paths[i].getParts(), paths[j].getParts()) + || isPrefix(paths[j].getParts(), paths[i].getParts())) throw Exception(ErrorCodes::DUPLICATE_COLUMN, "Data in Object has ambiguous paths: '{}' and '{}'", - paths[i].getPath(), paths[i].getPath()); + paths[i].getPath(), paths[j].getPath()); } } } @@ -227,8 +221,11 @@ DataTypePtr getLeastCommonTypeForObject(const DataTypes & types, bool check_ambi if (all_equal) return types[0]; + /// Types of subcolumns by path from all tuples. std::unordered_map subcolumns_types; + /// First we flatten tuples, then get common type for paths + /// and finally unflatten paths and create new tuple type. for (const auto & type : types) { const auto * type_tuple = typeid_cast(type.get()); @@ -246,6 +243,7 @@ DataTypePtr getLeastCommonTypeForObject(const DataTypes & types, bool check_ambi PathsInData tuple_paths; DataTypes tuple_types; + /// Get the least common type for all paths. 
for (const auto & [key, subtypes] : subcolumns_types) { assert(!subtypes.empty()); @@ -312,7 +310,7 @@ void updateObjectColumns(ColumnsDescription & object_columns, const NamesAndType { for (const auto & new_column : new_columns) { - auto object_column = object_columns.tryGetPhysical(new_column.name); + auto object_column = object_columns.tryGetColumn(GetColumnsOptions::All, new_column.name); if (object_column && !object_column->type->equals(*new_column.type)) { object_columns.modify(new_column.name, [&](auto & column) @@ -326,10 +324,14 @@ void updateObjectColumns(ColumnsDescription & object_columns, const NamesAndType namespace { -void flattenTupleImpl(PathInDataBuilder & builder, DataTypePtr type, size_t array_level, PathsInData & new_paths, DataTypes & new_types) +void flattenTupleImpl( + PathInDataBuilder & builder, + DataTypePtr type, + size_t array_level, + PathsInData & new_paths, + DataTypes & new_types) { bool is_nested = isNested(type); - if (is_nested) type = assert_cast(*type).getNestedType(); @@ -356,13 +358,14 @@ void flattenTupleImpl(PathInDataBuilder & builder, DataTypePtr type, size_t arra } } +/// @offsets_columns are used as stack of array offsets and allows to recreate Array columns. 
void flattenTupleImpl(const ColumnPtr & column, Columns & new_columns, Columns & offsets_columns) { if (const auto * column_tuple = checkAndGetColumn(column.get())) { const auto & subcolumns = column_tuple->getColumns(); for (const auto & subcolumn : subcolumns) - flattenTupleImpl(subcolumn, new_columns,offsets_columns); + flattenTupleImpl(subcolumn, new_columns, offsets_columns); } else if (const auto * column_array = checkAndGetColumn(column.get())) { @@ -375,8 +378,8 @@ void flattenTupleImpl(const ColumnPtr & column, Columns & new_columns, Columns & if (!offsets_columns.empty()) { auto new_column = ColumnArray::create(column, offsets_columns.back()); - for (ssize_t i = static_cast(offsets_columns.size()) - 2; i >= 0; --i) - new_column = ColumnArray::create(new_column, offsets_columns[i]); + for (auto it = offsets_columns.rbegin() + 1; it != offsets_columns.rend(); ++it) + new_column = ColumnArray::create(new_column, *it); new_columns.push_back(std::move(new_column)); } @@ -422,9 +425,8 @@ struct ColumnWithTypeAndDimensions size_t array_dimensions; }; -using SubcolumnsTreeWithTypes = SubcolumnsTree; +using SubcolumnsTreeWithTypes = SubcolumnsTree; using Node = SubcolumnsTreeWithTypes::Node; -using Leaf = SubcolumnsTreeWithTypes::Leaf; std::pair createTypeFromNode(const Node * node) { @@ -438,6 +440,7 @@ std::pair createTypeFromNode(const Node * node) tuple_elements.emplace_back(name, std::move(column), std::move(type)); } + /// Sort to always create the same type for the same set of subcolumns. 
std::sort(tuple_elements.begin(), tuple_elements.end(), [](const auto & lhs, const auto & rhs) { return std::get<0>(lhs) < std::get<0>(rhs); }); @@ -450,8 +453,7 @@ std::pair createTypeFromNode(const Node * node) if (node->kind == Node::SCALAR) { - const auto * leaf = typeid_cast(node); - return {leaf->data.column, leaf->data.type}; + return {node->data.column, node->data.type}; } else if (node->kind == Node::NESTED) { @@ -474,9 +476,9 @@ std::pair createTypeFromNode(const Node * node) auto result_column = ColumnArray::create(ColumnTuple::create(tuple_columns), offsets_columns.back()); auto result_type = createNested(tuple_types, tuple_names); - for (ssize_t i = static_cast(offsets_columns.size()) - 2; i >= 0; --i) + for (auto it = offsets_columns.rbegin() + 1; it != offsets_columns.rend(); ++it) { - result_column = ColumnArray::create(result_column, offsets_columns[i]); + result_column = ColumnArray::create(result_column, *it); result_type = std::make_shared(result_type); } @@ -533,6 +535,9 @@ std::pair unflattenTuple( assert(paths.size() == tuple_types.size()); assert(paths.size() == tuple_columns.size()); + /// We add all paths to the subcolumn tree and then create a type from it. + /// The tree stores column, type and number of array dimensions + /// for each intermediate node. 
SubcolumnsTreeWithTypes tree; for (size_t i = 0; i < paths.size(); ++i) @@ -562,10 +567,9 @@ std::pair unflattenTuple( ColumnWithTypeAndDimensions current_column; if (kind == Node::NESTED) { - size_t dimensions_to_reduce = array_level - nested_level; assert(parts[pos].is_nested); - ++dimensions_to_reduce; + size_t dimensions_to_reduce = array_level - nested_level + 1; --nested_level; current_column = ColumnWithTypeAndDimensions{column, type, dimensions_to_reduce}; @@ -579,15 +583,16 @@ std::pair unflattenTuple( array_level -= dimensions_to_reduce; } else + { current_column = ColumnWithTypeAndDimensions{column, type, 0}; + } ++pos; - if (exists) return nullptr; return kind == Node::SCALAR - ? std::make_shared(paths[i], current_column) + ? std::make_shared(kind, current_column, paths[i]) : std::make_shared(kind, current_column); }); } diff --git a/src/DataTypes/ObjectUtils.h b/src/DataTypes/ObjectUtils.h index 15c8d8ed794..f0c9a73bf80 100644 --- a/src/DataTypes/ObjectUtils.h +++ b/src/DataTypes/ObjectUtils.h @@ -12,31 +12,64 @@ namespace DB { +/// Returns number of dimensions in Array type. 0 if type is not array. size_t getNumberOfDimensions(const IDataType & type); -size_t getNumberOfDimensions(const IColumn & column); -DataTypePtr getBaseTypeOfArray(const DataTypePtr & type); -DataTypePtr createArrayOfType(DataTypePtr type, size_t num_dimensions); -Array createEmptyArrayField(size_t num_dimensions); +/// Returns number of dimensions in Array column. 0 if column is not array. +size_t getNumberOfDimensions(const IColumn & column); + +/// Returns type of scalars of Array of arbitrary dimensions. +DataTypePtr getBaseTypeOfArray(const DataTypePtr & type); + +/// Returns Array type with requested scalar type and number of dimensions. +DataTypePtr createArrayOfType(DataTypePtr type, size_t num_dimensions); + +/// Returns column of scalars of Array of arbitrary dimensions. 
ColumnPtr getBaseColumnOfArray(const ColumnPtr & column); + +/// Returns empty Array column with requested scalar column and number of dimensions. ColumnPtr createArrayOfColumn(const ColumnPtr & column, size_t num_dimensions); +/// Returns Array with requested number of dimensions and no scalars. +Array createEmptyArrayField(size_t num_dimensions); + +/// Tries to get data type by column. Only limited subset of types is supported DataTypePtr getDataTypeByColumn(const IColumn & column); + +/// Converts Object types and columns to Tuples in @columns_list and @block +/// and checks that types are consistent with types in @extended_storage_columns. void convertObjectsToTuples(NamesAndTypesList & columns_list, Block & block, const NamesAndTypesList & extended_storage_columns); + +/// Checks that each path is not the prefix of any other path. void checkObjectHasNoAmbiguosPaths(const PathsInData & paths); + +/// Receives several Tuple types and deduces the least common type among them. DataTypePtr getLeastCommonTypeForObject(const DataTypes & types, bool check_ambiguos_paths = false); + +/// Converts types of object columns to tuples in @columns_list +/// according to @object_columns and adds all tuple's subcolumns if needed. void extendObjectColumns(NamesAndTypesList & columns_list, const ColumnsDescription & object_columns, bool with_subcolumns); NameSet getNamesOfObjectColumns(const NamesAndTypesList & columns_list); bool hasObjectColumns(const ColumnsDescription & columns); +void finalizeObjectColumns(MutableColumns & columns); +/// Updates types of objects in @object_columns inplace +/// according to types in new_columns. void updateObjectColumns(ColumnsDescription & object_columns, const NamesAndTypesList & new_columns); using DataTypeTuplePtr = std::shared_ptr; +/// Flattens nested Tuple to plain Tuple. I.e extracts all paths and types from tuple. +/// E.g. 
Tuple(t Tuple(c1 UInt32, c2 String), c3 UInt64) -> Tuple(t.c1 UInt32, t.c2 String, c3 UInt64) std::pair flattenTuple(const DataTypePtr & type); + +/// Flattens nested Tuple column to plain Tuple column. ColumnPtr flattenTuple(const ColumnPtr & column); +/// The reverse operation to 'flattenTuple'. +/// Creates nested Tuple from all paths and types. +/// E.g. Tuple(t.c1 UInt32, t.c2 String, c3 UInt64) -> Tuple(t Tuple(c1 UInt32, c2 String), c3 UInt64) DataTypePtr unflattenTuple( const PathsInData & paths, const DataTypes & tuple_types); @@ -46,13 +79,20 @@ std::pair unflattenTuple( const DataTypes & tuple_types, const Columns & tuple_columns); +/// For all columns which exist in @expected_columns and +/// don't exist in @available_columns adds to WITH clause +/// an alias with column name to literal of default value of column type. void replaceMissedSubcolumnsByConstants( const ColumnsDescription & expected_columns, const ColumnsDescription & available_columns, ASTPtr query); -void finalizeObjectColumns(MutableColumns & columns); - +/// Receives range of objects, which contains collections +/// of columns-like objects (e.g. ColumnsDescription or NamesAndTypesList) +/// and deduces the common types of object columns for all entries. +/// @entry_columns_getter should extract reference to collection of +/// columns-like objects from entry to which Iterator points. +/// columns-like object should have fields "name" and "type". template ColumnsDescription getObjectColumns( Iterator begin, Iterator end, diff --git a/src/DataTypes/Serializations/ISerialization.h b/src/DataTypes/Serializations/ISerialization.h index eb8aca0d37f..1d55155c8be 100644 --- a/src/DataTypes/Serializations/ISerialization.h +++ b/src/DataTypes/Serializations/ISerialization.h @@ -136,6 +136,7 @@ public: /// Index of tuple element, starting at 1 or name. String tuple_element_name; + /// Name of subcolumn of object column. 
String object_key_name; /// Do we need to escape a dot in filenames for tuple elements. diff --git a/src/DataTypes/Serializations/SerializationObject.cpp b/src/DataTypes/Serializations/SerializationObject.cpp index e9eb962dfc5..b306a3f6cbd 100644 --- a/src/DataTypes/Serializations/SerializationObject.cpp +++ b/src/DataTypes/Serializations/SerializationObject.cpp @@ -61,19 +61,21 @@ private: size_t num_dimensions_to_keep; }; +using Node = typename ColumnObject::SubcolumnsTree::Node; + bool tryInsertDefaultFromNested( - ColumnObject::SubcolumnsTree::LeafPtr entry, const ColumnObject::SubcolumnsTree & subcolumns) + std::shared_ptr entry, const ColumnObject::SubcolumnsTree & subcolumns) { if (!entry->path.hasNested()) return false; - const ColumnObject::SubcolumnsTree::Node * node = subcolumns.findLeaf(entry->path); - const ColumnObject::SubcolumnsTree::Leaf * leaf = nullptr; + const Node * current_node = subcolumns.findLeaf(entry->path); + const Node * leaf = nullptr; size_t num_skipped_nested = 0; - while (node) + while (current_node) { - const auto * node_nested = subcolumns.findParent(node, + const auto * node_nested = subcolumns.findParent(current_node, [](const auto & candidate) { return candidate.isNested(); }); if (!node_nested) @@ -88,7 +90,7 @@ bool tryInsertDefaultFromNested( if (leaf) break; - node = node_nested->parent; + current_node = node_nested->parent; ++num_skipped_nested; } diff --git a/src/DataTypes/Serializations/SubcolumnsTree.h b/src/DataTypes/Serializations/SubcolumnsTree.h index 94266db39f7..8722f14b4e9 100644 --- a/src/DataTypes/Serializations/SubcolumnsTree.h +++ b/src/DataTypes/Serializations/SubcolumnsTree.h @@ -8,9 +8,7 @@ namespace DB { -struct EmptyNodeData {}; - -template +template class SubcolumnsTree { public: @@ -25,40 +23,31 @@ public: explicit Node(Kind kind_) : kind(kind_) {} Node(Kind kind_, const NodeData & data_) : kind(kind_), data(data_) {} + Node(Kind kind_, const NodeData & data_, const PathInData & path_) + : kind(kind_), 
data(data_), path(path_) {} Kind kind = TUPLE; const Node * parent = nullptr; std::map, std::less<>> children; + NodeData data; + PathInData path; bool isNested() const { return kind == NESTED; } + bool isScalar() const { return kind == SCALAR; } void addChild(const String & key, std::shared_ptr next_node) { next_node->parent = this; children[key] = std::move(next_node); } - - virtual ~Node() = default; - }; - - struct Leaf : public Node - { - Leaf(const PathInData & path_, const LeafData & data_) - : Node(Node::SCALAR), path(path_), data(data_) - { - } - - PathInData path; - LeafData data; }; using NodeKind = typename Node::Kind; using NodePtr = std::shared_ptr; - using LeafPtr = std::shared_ptr; - bool add(const PathInData & path, const LeafData & leaf_data) + bool add(const PathInData & path, const NodeData & leaf_data) { return add(path, [&](NodeKind kind, bool exists) -> NodePtr { @@ -66,7 +55,7 @@ public: return nullptr; if (kind == Node::SCALAR) - return std::make_shared(path, leaf_data); + return std::make_shared(kind, leaf_data, path); return std::make_shared(kind); }); @@ -94,9 +83,8 @@ public: { current_node = it->second.get(); node_creator(current_node->kind, true); - bool current_node_is_nested = current_node->kind == Node::NESTED; - if (current_node_is_nested != parts[i].is_nested) + if (current_node->isNested() != parts[i].is_nested) return false; } else @@ -114,10 +102,7 @@ public: auto next_node = node_creator(Node::SCALAR, false); current_node->addChild(String(parts.back().key), next_node); - - auto leaf = std::dynamic_pointer_cast(next_node); - assert(leaf); - leaves.push_back(std::move(leaf)); + leaves.push_back(std::move(next_node)); return true; } @@ -132,22 +117,28 @@ public: return findImpl(path, true); } - const Leaf * findLeaf(const PathInData & path) const + const Node * findLeaf(const PathInData & path) const { - return typeid_cast(findExact(path)); + const auto * candidate = findExact(path); + if (!candidate || !candidate->isScalar()) + 
return nullptr; + return candidate; } - using LeafPredicate = std::function; + using NodePredicate = std::function; - const Leaf * findLeaf(const LeafPredicate & predicate) + const Node * findLeaf(const NodePredicate & predicate) { return findLeaf(root.get(), predicate); } - static const Leaf * findLeaf(const Node * node, const LeafPredicate & predicate) + static const Node * findLeaf(const Node * node, const NodePredicate & predicate) { - if (const auto * leaf = typeid_cast(node)) - return predicate(*leaf) ? leaf : nullptr; + if (!node) + return nullptr; + + if (node->isScalar()) + return predicate(*node) ? node : nullptr; for (const auto & [_, child] : node->children) if (const auto * leaf = findLeaf(child.get(), predicate)) @@ -156,8 +147,6 @@ public: return nullptr; } - using NodePredicate = std::function; - static const Node * findParent(const Node * node, const NodePredicate & predicate) { while (node && !predicate(*node)) @@ -168,12 +157,13 @@ public: bool empty() const { return root == nullptr; } size_t size() const { return leaves.size(); } - using Leaves = std::vector; - const Leaves & getLeaves() const { return leaves; } + using Nodes = std::vector; + + const Nodes & getLeaves() const { return leaves; } const Node * getRoot() const { return root.get(); } - using iterator = typename Leaves::iterator; - using const_iterator = typename Leaves::const_iterator; + using iterator = typename Nodes::iterator; + using const_iterator = typename Nodes::const_iterator; iterator begin() { return leaves.begin(); } iterator end() { return leaves.end(); } @@ -200,11 +190,10 @@ private: } return current_node; - } NodePtr root; - Leaves leaves; + Nodes leaves; }; } diff --git a/src/DataTypes/getLeastSupertype.h b/src/DataTypes/getLeastSupertype.h index 366fa885a66..5444bb34d06 100644 --- a/src/DataTypes/getLeastSupertype.h +++ b/src/DataTypes/getLeastSupertype.h @@ -14,7 +14,6 @@ namespace DB * Examples: there is no least common supertype for Array(UInt8), Int8. 
*/ DataTypePtr getLeastSupertype(const DataTypes & types, bool allow_conversion_to_string = false); -DataTypePtr getLeastSupertype(const DataTypePtr & lhs, const DataTypePtr & rhs, bool allow_conversion_to_string = false); using TypeIndexSet = std::unordered_set; DataTypePtr getLeastSupertype(const TypeIndexSet & types, bool allow_conversion_to_string = false); diff --git a/src/Storages/IStorage.cpp b/src/Storages/IStorage.cpp index 66f9513c7f7..89403a773b3 100644 --- a/src/Storages/IStorage.cpp +++ b/src/Storages/IStorage.cpp @@ -13,7 +13,6 @@ #include #include #include -#include namespace DB