add comments and small refactoring

This commit is contained in:
Anton Popov 2022-02-17 22:00:25 +03:00
parent edd686e1d9
commit 0a7895ebb9
13 changed files with 298 additions and 201 deletions

View File

@ -1,13 +1,10 @@
#include <Core/Field.h>
#include <Columns/ColumnObject.h>
#include <Columns/ColumnSparse.h>
#include <Columns/ColumnVector.h>
#include <Columns/ColumnsNumber.h>
#include <Columns/ColumnArray.h>
#include <DataTypes/ObjectUtils.h>
#include <DataTypes/getLeastSupertype.h>
#include <DataTypes/DataTypeNothing.h>
#include <DataTypes/DataTypesNumber.h>
#include <DataTypes/FieldToDataType.h>
#include <DataTypes/DataTypeNullable.h>
#include <DataTypes/DataTypeFactory.h>
#include <DataTypes/NestedUtils.h>
@ -15,8 +12,6 @@
#include <Interpreters/convertFieldToType.h>
#include <Common/HashTable/HashSet.h>
#include <Common/FieldVisitorToString.h>
namespace DB
{
@ -32,6 +27,7 @@ namespace ErrorCodes
namespace
{
/// Recreates column with default scalar values and keeps sizes of arrays.
ColumnPtr recreateColumnWithDefaultValues(
const ColumnPtr & column, const DataTypePtr & scalar_type, size_t num_dimensions)
{
@ -47,43 +43,44 @@ ColumnPtr recreateColumnWithDefaultValues(
return createArrayOfType(scalar_type, num_dimensions)->createColumn()->cloneResized(column->size());
}
/// Replaces NULL fields with the given field or an empty array.
class FieldVisitorReplaceNull : public StaticVisitor<Field>
{
public:
[[maybe_unused]] explicit FieldVisitorReplaceNull(
explicit FieldVisitorReplaceNull(
const Field & replacement_, size_t num_dimensions_)
: replacement(replacement_)
, num_dimensions(num_dimensions_)
{
}
template <typename T>
Field operator()(const T & x) const
Field operator()(const Null &) const
{
if constexpr (std::is_same_v<T, Null>)
{
return num_dimensions
? createEmptyArrayField(num_dimensions)
: replacement;
}
else if constexpr (std::is_same_v<T, Array>)
{
assert(num_dimensions > 0);
const size_t size = x.size();
Array res(size);
for (size_t i = 0; i < size; ++i)
res[i] = applyVisitor(FieldVisitorReplaceNull(replacement, num_dimensions - 1), x[i]);
return res;
}
else
return x;
return num_dimensions
? createEmptyArrayField(num_dimensions)
: replacement;
}
Field operator()(const Array & x) const
{
assert(num_dimensions > 0);
const size_t size = x.size();
Array res(size);
for (size_t i = 0; i < size; ++i)
res[i] = applyVisitor(FieldVisitorReplaceNull(replacement, num_dimensions - 1), x[i]);
return res;
}
template <typename T>
Field operator()(const T & x) const { return x; }
private:
const Field & replacement;
size_t num_dimensions;
};
/// Calculates number of dimensions in array field.
/// Returns 0 for scalar fields.
class FieldVisitorToNumberOfDimensions : public StaticVisitor<size_t>
{
public:
@ -114,6 +111,9 @@ public:
size_t operator()(const T &) const { return 0; }
};
/// Visitor that allows to get type of scalar field
/// or least common type of scalars in array.
/// More optimized version of FieldToDataType.
class FieldVisitorToScalarType : public StaticVisitor<>
{
public:
@ -160,8 +160,7 @@ public:
template <typename T>
void operator()(const T &)
{
auto field_type = Field::TypeToEnum<NearestFieldType<T>>::value;
field_types.insert(field_type);
field_types.insert(Field::TypeToEnum<NearestFieldType<T>>::value);
type_indexes.insert(TypeToTypeIndex<NearestFieldType<T>>);
}
@ -280,18 +279,10 @@ void ColumnObject::Subcolumn::insert(Field field, FieldInfo info)
if (is_nullable)
base_type = makeNullable(base_type);
DataTypePtr value_type;
if (!is_nullable && info.have_nulls)
{
auto default_value = base_type->getDefault();
value_type = createArrayOfType(base_type, value_dim);
field = applyVisitor(FieldVisitorReplaceNull(default_value, value_dim), std::move(field));
}
else
{
value_type = createArrayOfType(base_type, value_dim);
}
field = applyVisitor(FieldVisitorReplaceNull(base_type->getDefault(), value_dim), std::move(field));
auto value_type = createArrayOfType(base_type, value_dim);
bool type_changed = false;
if (data.empty())
@ -311,12 +302,9 @@ void ColumnObject::Subcolumn::insert(Field field, FieldInfo info)
}
if (type_changed || info.need_convert)
{
auto converted_field = convertFieldToTypeOrThrow(std::move(field), *value_type);
data.back()->insert(std::move(converted_field));
}
else
data.back()->insert(std::move(field));
field = convertFieldToTypeOrThrow(std::move(field), *value_type);
data.back()->insert(std::move(field));
}
void ColumnObject::Subcolumn::insertRangeFrom(const Subcolumn & src, size_t start, size_t length)
@ -372,6 +360,10 @@ void ColumnObject::Subcolumn::finalize()
auto offsets = ColumnUInt64::create();
auto & offsets_data = offsets->getData();
/// We need to convert only non-default values and then recreate column
/// with default value of new type, because default values (which represent misses in data)
/// may be inconsistent between types (e.g. "0" in UInt64 and empty string in String).
part->getIndicesOfNonDefaultRows(offsets_data, 0, part_size);
if (offsets->size() == part_size)
@ -448,16 +440,16 @@ Field ColumnObject::Subcolumn::getLastField() const
ColumnObject::Subcolumn ColumnObject::Subcolumn::recreateWithDefaultValues(const FieldInfo & field_info) const
{
auto scalar_type = field_info.scalar_type;
if (is_nullable)
scalar_type = makeNullable(scalar_type);
Subcolumn new_subcolumn;
new_subcolumn.least_common_type = createArrayOfType(field_info.scalar_type, field_info.num_dimensions);
new_subcolumn.least_common_type = createArrayOfType(scalar_type, field_info.num_dimensions);
new_subcolumn.is_nullable = is_nullable;
new_subcolumn.num_of_defaults_in_prefix = num_of_defaults_in_prefix;
new_subcolumn.data.reserve(data.size());
auto scalar_type = field_info.scalar_type;
if (new_subcolumn.is_nullable)
scalar_type = makeNullable(scalar_type);
for (const auto & part : data)
new_subcolumn.data.push_back(recreateColumnWithDefaultValues(
part, scalar_type, field_info.num_dimensions));
@ -524,6 +516,7 @@ size_t ColumnObject::size() const
MutableColumnPtr ColumnObject::cloneResized(size_t new_size) const
{
/// cloneResized with new_size == 0 is used for cloneEmpty().
if (new_size != 0)
throw Exception(ErrorCodes::NOT_IMPLEMENTED,
"ColumnObject doesn't support resize to non-zero length");
@ -663,7 +656,7 @@ const ColumnObject::Subcolumn & ColumnObject::getSubcolumn(const PathInData & ke
ColumnObject::Subcolumn & ColumnObject::getSubcolumn(const PathInData & key)
{
if (const auto * node = subcolumns.findLeaf(key))
return const_cast<SubcolumnsTree::Leaf *>(node)->data;
return const_cast<SubcolumnsTree::Node *>(node)->data;
throw Exception(ErrorCodes::ILLEGAL_COLUMN, "There is no subcolumn {} in ColumnObject", key.getPath());
}
@ -702,23 +695,29 @@ void ColumnObject::addNestedSubcolumn(const PathInData & key, const FieldInfo &
"Cannot add Nested subcolumn, because path doesn't contain Nested");
bool inserted = false;
/// We find node that represents the same Nested type as @key.
const auto * nested_node = subcolumns.findBestMatch(key);
if (nested_node)
{
/// Find any leaf of Nested subcolumn.
const auto * leaf = subcolumns.findLeaf(nested_node, [&](const auto &) { return true; });
assert(leaf);
/// Recreate subcolumn with default values and the same sizes of arrays.
auto new_subcolumn = leaf->data.recreateWithDefaultValues(field_info);
/// It's possible that we have already inserted value from current row
/// to this subcolumn. So, adjust size to expected.
if (new_subcolumn.size() > new_size)
new_subcolumn.popBack(new_subcolumn.size() - new_size);
else if (new_subcolumn.size() < new_size)
new_subcolumn.insertManyDefaults(new_size - new_subcolumn.size());
assert(new_subcolumn.size() == new_size);
inserted = subcolumns.add(key, new_subcolumn);
}
else
{
/// If node was not found just add subcolumn with empty arrays.
inserted = subcolumns.add(key, Subcolumn(new_size, is_nullable));
}
@ -751,6 +750,8 @@ void ColumnObject::finalize()
for (auto && entry : subcolumns)
{
const auto & least_common_type = entry->data.getLeastCommonType();
/// Do not add subcolumns that consist only of NULLs.
if (isNothing(getBaseTypeOfArray(least_common_type)))
continue;
@ -758,6 +759,8 @@ void ColumnObject::finalize()
new_subcolumns.add(entry->path, std::move(entry->data));
}
/// If all subcolumns were skipped add a dummy subcolumn,
/// because Tuple type must have at least one element.
if (new_subcolumns.empty())
new_subcolumns.add(PathInData{COLUMN_NAME_DUMMY}, Subcolumn{ColumnUInt8::create(old_size), is_nullable});

View File

@ -18,19 +18,43 @@ namespace ErrorCodes
extern const int LOGICAL_ERROR;
}
/// Info that represents a scalar or array field in a decomposed view.
/// It allows to recreate field with different number
/// of dimensions or nullability.
struct FieldInfo
{
/// The common type of all scalars in field.
DataTypePtr scalar_type;
/// Do we have NULL scalar in field.
bool have_nulls;
/// If true then we have scalars with different types in array and
/// we need to convert scalars to the common type.
bool need_convert;
/// Number of dimensions in array. 0 if field is scalar.
size_t num_dimensions;
};
FieldInfo getFieldInfo(const Field & field);
/** A column that represents object with dynamic set of subcolumns.
* Subcolumns are identified by paths in document and are stored in
* a trie-like structure. ColumnObject is not suitable for writing into tables
* and it should be converted to Tuple with fixed set of subcolumns before that.
*/
class ColumnObject final : public COWHelper<IColumn, ColumnObject>
{
public:
/** Class that represents one subcolumn.
* It stores values in several parts of column
* and keeps current common type of all parts.
* We add a new column part with a new type, when we insert a field,
* which can't be converted to the current common type.
* After insertion of all values subcolumn should be finalized
* for writing and other operations.
*/
class Subcolumn
{
public:
@ -44,8 +68,12 @@ public:
bool isFinalized() const { return data.size() == 1 && num_of_defaults_in_prefix == 0; }
const DataTypePtr & getLeastCommonType() const { return least_common_type; }
/// Checks the consistency of column's parts stored in @data.
void checkTypes() const;
/// Inserts a field, whose scalars can be arbitrary, but number of
/// dimensions should be consistent with current common type.
void insert(Field field);
void insert(Field field, FieldInfo info);
@ -54,11 +82,19 @@ public:
void insertRangeFrom(const Subcolumn & src, size_t start, size_t length);
void popBack(size_t n);
/// Converts all column's parts to the common type and
/// creates a single column that stores all values.
void finalize();
/// Returns last inserted field.
Field getLastField() const;
/// Recreates subcolumn with default scalar values and keeps sizes of arrays.
/// Used to create columns of type Nested with consistent array sizes.
Subcolumn recreateWithDefaultValues(const FieldInfo & field_info) const;
/// Returns single column if subcolumn is finalized.
/// Otherwise -- undefined behaviour.
IColumn & getFinalizedColumn();
const IColumn & getFinalizedColumn() const;
const ColumnPtr & getFinalizedColumnPtr() const;
@ -66,15 +102,28 @@ public:
friend class ColumnObject;
private:
/// Current least common type of all values inserted to this subcolumn.
DataTypePtr least_common_type;
/// If true then common type of subcolumn is Nullable
/// and default values are NULLs.
bool is_nullable = false;
/// Parts of column. Parts should be in increasing order in terms of subtypes/supertypes.
/// That means that the least common type for i-th prefix is the type of i-th part
/// and it's the supertype for the types of all parts from 0 to i-1.
std::vector<WrappedPtr> data;
/// Until we insert any non-default field we don't know further
/// least common type and we count number of defaults in prefix,
/// which will be converted to the default type of final common type.
size_t num_of_defaults_in_prefix = 0;
};
using SubcolumnsTree = SubcolumnsTree<Subcolumn>;
private:
/// If true then all subcolumns are nullable.
const bool is_nullable;
SubcolumnsTree subcolumns;
@ -86,6 +135,7 @@ public:
explicit ColumnObject(bool is_nullable_);
ColumnObject(SubcolumnsTree && subcolumns_, bool is_nullable_);
/// Checks that all subcolumns have consistent sizes.
void checkConsistency() const;
bool hasSubcolumn(const PathInData & key) const;
@ -95,16 +145,23 @@ public:
void incrementNumRows() { ++num_rows; }
/// Adds a subcolumn from existing IColumn.
void addSubcolumn(const PathInData & key, MutableColumnPtr && subcolumn);
/// Adds a subcolumn of specific size with default values.
void addSubcolumn(const PathInData & key, size_t new_size);
/// Adds a subcolumn of type Nested of specific size with default values.
/// It cares about consistency of sizes of Nested arrays.
void addNestedSubcolumn(const PathInData & key, const FieldInfo & field_info, size_t new_size);
const SubcolumnsTree & getSubcolumns() const { return subcolumns; }
SubcolumnsTree & getSubcolumns() { return subcolumns; }
PathsInData getKeys() const;
bool isFinalized() const;
/// Finalizes all subcolumns.
void finalize();
bool isFinalized() const;
/// Part of interface

View File

@ -13,7 +13,6 @@
#cmakedefine01 USE_CASSANDRA
#cmakedefine01 USE_SENTRY
#cmakedefine01 USE_GRPC
#cmakedefine01 USE_STATS
#cmakedefine01 USE_SIMDJSON
#cmakedefine01 USE_RAPIDJSON

View File

@ -759,27 +759,27 @@ private:
using Row = std::vector<Field>;
template <> struct Field::TypeToEnum<Null> { static const Types::Which value = Types::Null; };
template <> struct Field::TypeToEnum<UInt64> { static const Types::Which value = Types::UInt64; };
template <> struct Field::TypeToEnum<UInt128> { static const Types::Which value = Types::UInt128; };
template <> struct Field::TypeToEnum<UInt256> { static const Types::Which value = Types::UInt256; };
template <> struct Field::TypeToEnum<Int64> { static const Types::Which value = Types::Int64; };
template <> struct Field::TypeToEnum<Int128> { static const Types::Which value = Types::Int128; };
template <> struct Field::TypeToEnum<Int256> { static const Types::Which value = Types::Int256; };
template <> struct Field::TypeToEnum<UUID> { static const Types::Which value = Types::UUID; };
template <> struct Field::TypeToEnum<Float64> { static const Types::Which value = Types::Float64; };
template <> struct Field::TypeToEnum<String> { static const Types::Which value = Types::String; };
template <> struct Field::TypeToEnum<Array> { static const Types::Which value = Types::Array; };
template <> struct Field::TypeToEnum<Tuple> { static const Types::Which value = Types::Tuple; };
template <> struct Field::TypeToEnum<Map> { static const Types::Which value = Types::Map; };
template <> struct Field::TypeToEnum<Object> { static const Types::Which value = Types::Object; };
template <> struct Field::TypeToEnum<DecimalField<Decimal32>>{ static const Types::Which value = Types::Decimal32; };
template <> struct Field::TypeToEnum<DecimalField<Decimal64>>{ static const Types::Which value = Types::Decimal64; };
template <> struct Field::TypeToEnum<DecimalField<Decimal128>>{ static const Types::Which value = Types::Decimal128; };
template <> struct Field::TypeToEnum<DecimalField<Decimal256>>{ static const Types::Which value = Types::Decimal256; };
template <> struct Field::TypeToEnum<DecimalField<DateTime64>>{ static const Types::Which value = Types::Decimal64; };
template <> struct Field::TypeToEnum<AggregateFunctionStateData>{ static const Types::Which value = Types::AggregateFunctionState; };
template <> struct Field::TypeToEnum<bool>{ static const Types::Which value = Types::Bool; };
template <> struct Field::TypeToEnum<Null> { static constexpr Types::Which value = Types::Null; };
template <> struct Field::TypeToEnum<UInt64> { static constexpr Types::Which value = Types::UInt64; };
template <> struct Field::TypeToEnum<UInt128> { static constexpr Types::Which value = Types::UInt128; };
template <> struct Field::TypeToEnum<UInt256> { static constexpr Types::Which value = Types::UInt256; };
template <> struct Field::TypeToEnum<Int64> { static constexpr Types::Which value = Types::Int64; };
template <> struct Field::TypeToEnum<Int128> { static constexpr Types::Which value = Types::Int128; };
template <> struct Field::TypeToEnum<Int256> { static constexpr Types::Which value = Types::Int256; };
template <> struct Field::TypeToEnum<UUID> { static constexpr Types::Which value = Types::UUID; };
template <> struct Field::TypeToEnum<Float64> { static constexpr Types::Which value = Types::Float64; };
template <> struct Field::TypeToEnum<String> { static constexpr Types::Which value = Types::String; };
template <> struct Field::TypeToEnum<Array> { static constexpr Types::Which value = Types::Array; };
template <> struct Field::TypeToEnum<Tuple> { static constexpr Types::Which value = Types::Tuple; };
template <> struct Field::TypeToEnum<Map> { static constexpr Types::Which value = Types::Map; };
template <> struct Field::TypeToEnum<Object> { static constexpr Types::Which value = Types::Object; };
template <> struct Field::TypeToEnum<DecimalField<Decimal32>>{ static constexpr Types::Which value = Types::Decimal32; };
template <> struct Field::TypeToEnum<DecimalField<Decimal64>>{ static constexpr Types::Which value = Types::Decimal64; };
template <> struct Field::TypeToEnum<DecimalField<Decimal128>>{ static constexpr Types::Which value = Types::Decimal128; };
template <> struct Field::TypeToEnum<DecimalField<Decimal256>>{ static constexpr Types::Which value = Types::Decimal256; };
template <> struct Field::TypeToEnum<DecimalField<DateTime64>>{ static constexpr Types::Which value = Types::Decimal64; };
template <> struct Field::TypeToEnum<AggregateFunctionStateData>{ static constexpr Types::Which value = Types::AggregateFunctionState; };
template <> struct Field::TypeToEnum<bool>{ static constexpr Types::Which value = Types::Bool; };
template <> struct Field::EnumToType<Field::Types::Null> { using Type = Null; };
template <> struct Field::EnumToType<Field::Types::UInt64> { using Type = UInt64; };

View File

@ -26,7 +26,7 @@ DataTypeObject::DataTypeObject(const String & schema_format_, bool is_nullable_)
bool DataTypeObject::equals(const IDataType & rhs) const
{
if (const auto * object = typeid_cast<const DataTypeObject *>(&rhs))
return schema_format == object->schema_format;
return schema_format == object->schema_format && is_nullable == object->is_nullable;
return false;
}

View File

@ -408,7 +408,10 @@ inline bool isNothing(const DataTypePtr & data_type) { return WhichDataType(data
inline bool isUUID(const DataTypePtr & data_type) { return WhichDataType(data_type).isUUID(); }
template <typename T>
inline bool isObject(const T & data_type) {return WhichDataType(data_type).isObject(); }
inline bool isObject(const T & data_type)
{
return WhichDataType(data_type).isObject();
}
template <typename T>
inline bool isUInt8(const T & data_type)

View File

@ -6,25 +6,18 @@
#include <DataTypes/DataTypesNumber.h>
#include <DataTypes/DataTypeNested.h>
#include <DataTypes/DataTypeFactory.h>
#include <DataTypes/FieldToDataType.h>
#include <DataTypes/getLeastSupertype.h>
#include <DataTypes/NestedUtils.h>
#include <Columns/ColumnObject.h>
#include <Columns/ColumnTuple.h>
#include <Columns/ColumnArray.h>
#include <Columns/ColumnNullable.h>
#include <Common/FieldVisitors.h>
#include <Common/assert_cast.h>
#include <Parsers/ASTSelectQuery.h>
#include <Parsers/ASTExpressionList.h>
#include <Parsers/ASTLiteral.h>
#include <Parsers/ASTFunction.h>
#include <IO/Operators.h>
#include <boost/algorithm/string/split.hpp>
#include <boost/algorithm/string.hpp>
#include <base/EnumReflection.h>
namespace DB
{
@ -52,8 +45,9 @@ size_t getNumberOfDimensions(const IColumn & column)
DataTypePtr getBaseTypeOfArray(const DataTypePtr & type)
{
/// Get raw pointers to avoid extra copying of type pointers.
const DataTypeArray * last_array = nullptr;
const IDataType * current_type = type.get();
const auto * current_type = type.get();
while (const auto * type_array = typeid_cast<const DataTypeArray *>(current_type))
{
current_type = type_array->getNestedType().get();
@ -65,8 +59,9 @@ DataTypePtr getBaseTypeOfArray(const DataTypePtr & type)
ColumnPtr getBaseColumnOfArray(const ColumnPtr & column)
{
/// Get raw pointers to avoid extra copying of column pointers.
const ColumnArray * last_array = nullptr;
const IColumn * current_column = column.get();
const auto * current_column = column.get();
while (const auto * column_array = checkAndGetColumn<ColumnArray>(current_column))
{
current_column = &column_array->getData();
@ -92,6 +87,9 @@ ColumnPtr createArrayOfColumn(ColumnPtr column, size_t num_dimensions)
Array createEmptyArrayField(size_t num_dimensions)
{
if (num_dimensions == 0)
throw Exception(ErrorCodes::LOGICAL_ERROR, "Cannot create array field with 0 dimensions");
Array array;
Array * current_array = &array;
for (size_t i = 1; i < num_dimensions; ++i)
@ -138,53 +136,53 @@ void convertObjectsToTuples(NamesAndTypesList & columns_list, Block & block, con
for (auto & name_type : columns_list)
{
if (isObject(name_type.type))
if (!isObject(name_type.type))
continue;
auto & column = block.getByName(name_type.name);
if (!isObject(column.type))
throw Exception(ErrorCodes::TYPE_MISMATCH,
"Type for column '{}' mismatch in columns list and in block. In list: {}, in block: {}",
name_type.name, name_type.type->getName(), column.type->getName());
const auto & column_object = assert_cast<const ColumnObject &>(*column.column);
const auto & subcolumns = column_object.getSubcolumns();
if (!column_object.isFinalized())
throw Exception(ErrorCodes::LOGICAL_ERROR,
"Cannot convert to tuple column '{}' from type {}. Column should be finalized first",
name_type.name, name_type.type->getName());
PathsInData tuple_paths;
DataTypes tuple_types;
Columns tuple_columns;
for (const auto & entry : subcolumns)
{
auto & column = block.getByName(name_type.name);
if (!isObject(column.type))
throw Exception(ErrorCodes::TYPE_MISMATCH,
"Type for column '{}' mismatch in columns list and in block. In list: {}, in block: {}",
name_type.name, name_type.type->getName(), column.type->getName());
const auto & column_object = assert_cast<const ColumnObject &>(*column.column);
const auto & subcolumns_map = column_object.getSubcolumns();
if (!column_object.isFinalized())
throw Exception(ErrorCodes::LOGICAL_ERROR,
"Cannot convert to tuple column '{}' from type {}. Column should be finalized first",
name_type.name, name_type.type->getName());
PathsInData tuple_paths;
DataTypes tuple_types;
Columns tuple_columns;
for (const auto & entry : subcolumns_map)
{
tuple_paths.emplace_back(entry->path);
tuple_types.emplace_back(entry->data.getLeastCommonType());
tuple_columns.emplace_back(entry->data.getFinalizedColumnPtr());
}
auto it = storage_columns_map.find(name_type.name);
if (it == storage_columns_map.end())
throw Exception(ErrorCodes::LOGICAL_ERROR, "Column '{}' not found in storage", name_type.name);
std::tie(column.column, column.type) = unflattenTuple(tuple_paths, tuple_types, tuple_columns);
name_type.type = column.type;
getLeastCommonTypeForObject({column.type, it->second}, true);
tuple_paths.emplace_back(entry->path);
tuple_types.emplace_back(entry->data.getLeastCommonType());
tuple_columns.emplace_back(entry->data.getFinalizedColumnPtr());
}
auto it = storage_columns_map.find(name_type.name);
if (it == storage_columns_map.end())
throw Exception(ErrorCodes::LOGICAL_ERROR, "Column '{}' not found in storage", name_type.name);
std::tie(column.column, column.type) = unflattenTuple(tuple_paths, tuple_types, tuple_columns);
name_type.type = column.type;
/// Check that constructed Tuple type and type in storage are compatible.
getLeastCommonTypeForObject({column.type, it->second}, true);
}
}
static bool isPrefix(const PathInData::Parts & prefix, const PathInData::Parts & strings)
static bool isPrefix(const PathInData::Parts & prefix, const PathInData::Parts & parts)
{
if (prefix.size() > strings.size())
if (prefix.size() > parts.size())
return false;
for (size_t i = 0; i < prefix.size(); ++i)
if (prefix[i].key != strings[i].key)
if (prefix[i].key != parts[i].key)
return false;
return true;
}
@ -192,19 +190,15 @@ static bool isPrefix(const PathInData::Parts & prefix, const PathInData::Parts &
void checkObjectHasNoAmbiguosPaths(const PathsInData & paths)
{
size_t size = paths.size();
std::vector<PathInData::Parts> names_parts(size);
for (size_t i = 0; i < size; ++i)
names_parts[i] = paths[i].getParts();
for (size_t i = 0; i < size; ++i)
{
for (size_t j = 0; j < i; ++j)
{
if (isPrefix(names_parts[i], names_parts[j]) || isPrefix(names_parts[j], names_parts[i]))
if (isPrefix(paths[i].getParts(), paths[j].getParts())
|| isPrefix(paths[j].getParts(), paths[i].getParts()))
throw Exception(ErrorCodes::DUPLICATE_COLUMN,
"Data in Object has ambiguous paths: '{}' and '{}'",
paths[i].getPath(), paths[i].getPath());
paths[i].getPath(), paths[j].getPath());
}
}
}
@ -227,8 +221,11 @@ DataTypePtr getLeastCommonTypeForObject(const DataTypes & types, bool check_ambi
if (all_equal)
return types[0];
/// Types of subcolumns by path from all tuples.
std::unordered_map<PathInData, DataTypes, PathInData::Hash> subcolumns_types;
/// First we flatten tuples, then get common type for paths
/// and finally unflatten paths and create new tuple type.
for (const auto & type : types)
{
const auto * type_tuple = typeid_cast<const DataTypeTuple *>(type.get());
@ -246,6 +243,7 @@ DataTypePtr getLeastCommonTypeForObject(const DataTypes & types, bool check_ambi
PathsInData tuple_paths;
DataTypes tuple_types;
/// Get the least common type for all paths.
for (const auto & [key, subtypes] : subcolumns_types)
{
assert(!subtypes.empty());
@ -312,7 +310,7 @@ void updateObjectColumns(ColumnsDescription & object_columns, const NamesAndType
{
for (const auto & new_column : new_columns)
{
auto object_column = object_columns.tryGetPhysical(new_column.name);
auto object_column = object_columns.tryGetColumn(GetColumnsOptions::All, new_column.name);
if (object_column && !object_column->type->equals(*new_column.type))
{
object_columns.modify(new_column.name, [&](auto & column)
@ -326,10 +324,14 @@ void updateObjectColumns(ColumnsDescription & object_columns, const NamesAndType
namespace
{
void flattenTupleImpl(PathInDataBuilder & builder, DataTypePtr type, size_t array_level, PathsInData & new_paths, DataTypes & new_types)
void flattenTupleImpl(
PathInDataBuilder & builder,
DataTypePtr type,
size_t array_level,
PathsInData & new_paths,
DataTypes & new_types)
{
bool is_nested = isNested(type);
if (is_nested)
type = assert_cast<const DataTypeArray &>(*type).getNestedType();
@ -356,13 +358,14 @@ void flattenTupleImpl(PathInDataBuilder & builder, DataTypePtr type, size_t arra
}
}
/// @offsets_columns are used as stack of array offsets and allows to recreate Array columns.
void flattenTupleImpl(const ColumnPtr & column, Columns & new_columns, Columns & offsets_columns)
{
if (const auto * column_tuple = checkAndGetColumn<ColumnTuple>(column.get()))
{
const auto & subcolumns = column_tuple->getColumns();
for (const auto & subcolumn : subcolumns)
flattenTupleImpl(subcolumn, new_columns,offsets_columns);
flattenTupleImpl(subcolumn, new_columns, offsets_columns);
}
else if (const auto * column_array = checkAndGetColumn<ColumnArray>(column.get()))
{
@ -375,8 +378,8 @@ void flattenTupleImpl(const ColumnPtr & column, Columns & new_columns, Columns &
if (!offsets_columns.empty())
{
auto new_column = ColumnArray::create(column, offsets_columns.back());
for (ssize_t i = static_cast<ssize_t>(offsets_columns.size()) - 2; i >= 0; --i)
new_column = ColumnArray::create(new_column, offsets_columns[i]);
for (auto it = offsets_columns.rbegin() + 1; it != offsets_columns.rend(); ++it)
new_column = ColumnArray::create(new_column, *it);
new_columns.push_back(std::move(new_column));
}
@ -422,9 +425,8 @@ struct ColumnWithTypeAndDimensions
size_t array_dimensions;
};
using SubcolumnsTreeWithTypes = SubcolumnsTree<ColumnWithTypeAndDimensions, ColumnWithTypeAndDimensions>;
using SubcolumnsTreeWithTypes = SubcolumnsTree<ColumnWithTypeAndDimensions>;
using Node = SubcolumnsTreeWithTypes::Node;
using Leaf = SubcolumnsTreeWithTypes::Leaf;
std::pair<ColumnPtr, DataTypePtr> createTypeFromNode(const Node * node)
{
@ -438,6 +440,7 @@ std::pair<ColumnPtr, DataTypePtr> createTypeFromNode(const Node * node)
tuple_elements.emplace_back(name, std::move(column), std::move(type));
}
/// Sort to always create the same type for the same set of subcolumns.
std::sort(tuple_elements.begin(), tuple_elements.end(),
[](const auto & lhs, const auto & rhs) { return std::get<0>(lhs) < std::get<0>(rhs); });
@ -450,8 +453,7 @@ std::pair<ColumnPtr, DataTypePtr> createTypeFromNode(const Node * node)
if (node->kind == Node::SCALAR)
{
const auto * leaf = typeid_cast<const Leaf *>(node);
return {leaf->data.column, leaf->data.type};
return {node->data.column, node->data.type};
}
else if (node->kind == Node::NESTED)
{
@ -474,9 +476,9 @@ std::pair<ColumnPtr, DataTypePtr> createTypeFromNode(const Node * node)
auto result_column = ColumnArray::create(ColumnTuple::create(tuple_columns), offsets_columns.back());
auto result_type = createNested(tuple_types, tuple_names);
for (ssize_t i = static_cast<ssize_t>(offsets_columns.size()) - 2; i >= 0; --i)
for (auto it = offsets_columns.rbegin() + 1; it != offsets_columns.rend(); ++it)
{
result_column = ColumnArray::create(result_column, offsets_columns[i]);
result_column = ColumnArray::create(result_column, *it);
result_type = std::make_shared<DataTypeArray>(result_type);
}
@ -533,6 +535,9 @@ std::pair<ColumnPtr, DataTypePtr> unflattenTuple(
assert(paths.size() == tuple_types.size());
assert(paths.size() == tuple_columns.size());
/// We add all paths to the subcolumn tree and then create a type from it.
/// The tree stores column, type and number of array dimensions
/// for each intermediate node.
SubcolumnsTreeWithTypes tree;
for (size_t i = 0; i < paths.size(); ++i)
@ -562,10 +567,9 @@ std::pair<ColumnPtr, DataTypePtr> unflattenTuple(
ColumnWithTypeAndDimensions current_column;
if (kind == Node::NESTED)
{
size_t dimensions_to_reduce = array_level - nested_level;
assert(parts[pos].is_nested);
++dimensions_to_reduce;
size_t dimensions_to_reduce = array_level - nested_level + 1;
--nested_level;
current_column = ColumnWithTypeAndDimensions{column, type, dimensions_to_reduce};
@ -579,15 +583,16 @@ std::pair<ColumnPtr, DataTypePtr> unflattenTuple(
array_level -= dimensions_to_reduce;
}
else
{
current_column = ColumnWithTypeAndDimensions{column, type, 0};
}
++pos;
if (exists)
return nullptr;
return kind == Node::SCALAR
? std::make_shared<Leaf>(paths[i], current_column)
? std::make_shared<Node>(kind, current_column, paths[i])
: std::make_shared<Node>(kind, current_column);
});
}

View File

@ -12,31 +12,64 @@
namespace DB
{
/// Returns number of dimensions in Array type. 0 if type is not array.
size_t getNumberOfDimensions(const IDataType & type);
size_t getNumberOfDimensions(const IColumn & column);
DataTypePtr getBaseTypeOfArray(const DataTypePtr & type);
DataTypePtr createArrayOfType(DataTypePtr type, size_t num_dimensions);
Array createEmptyArrayField(size_t num_dimensions);
/// Returns number of dimensions in Array column. 0 if column is not array.
size_t getNumberOfDimensions(const IColumn & column);
/// Returns type of scalars of Array of arbitrary dimensions.
DataTypePtr getBaseTypeOfArray(const DataTypePtr & type);
/// Returns Array type with requested scalar type and number of dimensions.
DataTypePtr createArrayOfType(DataTypePtr type, size_t num_dimensions);
/// Returns column of scalars of Array of arbitrary dimensions.
ColumnPtr getBaseColumnOfArray(const ColumnPtr & column);
/// Returns empty Array column with requested scalar column and number of dimensions.
ColumnPtr createArrayOfColumn(const ColumnPtr & column, size_t num_dimensions);
/// Returns Array with requested number of dimensions and no scalars.
Array createEmptyArrayField(size_t num_dimensions);
/// Tries to get data type by column. Only a limited subset of types is supported.
DataTypePtr getDataTypeByColumn(const IColumn & column);
/// Converts Object types and columns to Tuples in @columns_list and @block
/// and checks that types are consistent with types in @extended_storage_columns.
void convertObjectsToTuples(NamesAndTypesList & columns_list, Block & block, const NamesAndTypesList & extended_storage_columns);
/// Checks that each path is not the prefix of any other path.
void checkObjectHasNoAmbiguosPaths(const PathsInData & paths);
/// Receives several Tuple types and deduces the least common type among them.
DataTypePtr getLeastCommonTypeForObject(const DataTypes & types, bool check_ambiguos_paths = false);
/// Converts types of object columns to tuples in @columns_list
/// according to @object_columns and adds all tuple's subcolumns if needed.
void extendObjectColumns(NamesAndTypesList & columns_list, const ColumnsDescription & object_columns, bool with_subcolumns);
NameSet getNamesOfObjectColumns(const NamesAndTypesList & columns_list);
bool hasObjectColumns(const ColumnsDescription & columns);
void finalizeObjectColumns(MutableColumns & columns);
/// Updates types of objects in @object_columns in place
/// according to types in @new_columns.
void updateObjectColumns(ColumnsDescription & object_columns, const NamesAndTypesList & new_columns);
using DataTypeTuplePtr = std::shared_ptr<DataTypeTuple>;
/// Flattens nested Tuple to plain Tuple, i.e. extracts all paths and types from the tuple.
/// E.g. Tuple(t Tuple(c1 UInt32, c2 String), c3 UInt64) -> Tuple(t.c1 UInt32, t.c2 String, c3 UInt64)
std::pair<PathsInData, DataTypes> flattenTuple(const DataTypePtr & type);
/// Flattens nested Tuple column to plain Tuple column.
ColumnPtr flattenTuple(const ColumnPtr & column);
/// The reverse operation to 'flattenTuple'.
/// Creates nested Tuple from all paths and types.
/// E.g. Tuple(t.c1 UInt32, t.c2 String, c3 UInt64) -> Tuple(t Tuple(c1 UInt32, c2 String), c3 UInt64)
DataTypePtr unflattenTuple(
const PathsInData & paths,
const DataTypes & tuple_types);
@ -46,13 +79,20 @@ std::pair<ColumnPtr, DataTypePtr> unflattenTuple(
const DataTypes & tuple_types,
const Columns & tuple_columns);
/// For every column that exists in @expected_columns but
/// does not exist in @available_columns, adds to the WITH clause
/// an alias from the column name to a literal with the default value of the column type.
void replaceMissedSubcolumnsByConstants(
const ColumnsDescription & expected_columns,
const ColumnsDescription & available_columns,
ASTPtr query);
void finalizeObjectColumns(MutableColumns & columns);
/// Receives range of objects, which contains collections
/// of columns-like objects (e.g. ColumnsDescription or NamesAndTypesList)
/// and deduces the common types of object columns for all entries.
/// @entry_columns_getter should extract reference to collection of
/// columns-like objects from entry to which Iterator points.
/// columns-like object should have fields "name" and "type".
template <typename Iterator, typename EntryColumnsGetter>
ColumnsDescription getObjectColumns(
Iterator begin, Iterator end,

View File

@ -136,6 +136,7 @@ public:
/// Index of tuple element (starting at 1) or its name.
String tuple_element_name;
/// Name of subcolumn of object column.
String object_key_name;
/// Do we need to escape a dot in filenames for tuple elements.

View File

@ -61,19 +61,21 @@ private:
size_t num_dimensions_to_keep;
};
using Node = typename ColumnObject::SubcolumnsTree::Node;
bool tryInsertDefaultFromNested(
ColumnObject::SubcolumnsTree::LeafPtr entry, const ColumnObject::SubcolumnsTree & subcolumns)
std::shared_ptr<Node> entry, const ColumnObject::SubcolumnsTree & subcolumns)
{
if (!entry->path.hasNested())
return false;
const ColumnObject::SubcolumnsTree::Node * node = subcolumns.findLeaf(entry->path);
const ColumnObject::SubcolumnsTree::Leaf * leaf = nullptr;
const Node * current_node = subcolumns.findLeaf(entry->path);
const Node * leaf = nullptr;
size_t num_skipped_nested = 0;
while (node)
while (current_node)
{
const auto * node_nested = subcolumns.findParent(node,
const auto * node_nested = subcolumns.findParent(current_node,
[](const auto & candidate) { return candidate.isNested(); });
if (!node_nested)
@ -88,7 +90,7 @@ bool tryInsertDefaultFromNested(
if (leaf)
break;
node = node_nested->parent;
current_node = node_nested->parent;
++num_skipped_nested;
}

View File

@ -8,9 +8,7 @@
namespace DB
{
struct EmptyNodeData {};
template <typename LeafData, typename NodeData = EmptyNodeData>
template <typename NodeData>
class SubcolumnsTree
{
public:
@ -25,40 +23,31 @@ public:
explicit Node(Kind kind_) : kind(kind_) {}
Node(Kind kind_, const NodeData & data_) : kind(kind_), data(data_) {}
Node(Kind kind_, const NodeData & data_, const PathInData & path_)
: kind(kind_), data(data_), path(path_) {}
Kind kind = TUPLE;
const Node * parent = nullptr;
std::map<String, std::shared_ptr<Node>, std::less<>> children;
NodeData data;
PathInData path;
bool isNested() const { return kind == NESTED; }
bool isScalar() const { return kind == SCALAR; }
/// Attaches @next_node to this node under @key and makes this node its parent.
/// A child already stored under the same key is replaced.
void addChild(const String & key, std::shared_ptr<Node> next_node)
{
/// Set the back-pointer first: after the move below @next_node is no longer usable.
next_node->parent = this;
children[key] = std::move(next_node);
}
virtual ~Node() = default;
};
struct Leaf : public Node
{
Leaf(const PathInData & path_, const LeafData & data_)
: Node(Node::SCALAR), path(path_), data(data_)
{
}
PathInData path;
LeafData data;
};
using NodeKind = typename Node::Kind;
using NodePtr = std::shared_ptr<Node>;
using LeafPtr = std::shared_ptr<Leaf>;
bool add(const PathInData & path, const LeafData & leaf_data)
bool add(const PathInData & path, const NodeData & leaf_data)
{
return add(path, [&](NodeKind kind, bool exists) -> NodePtr
{
@ -66,7 +55,7 @@ public:
return nullptr;
if (kind == Node::SCALAR)
return std::make_shared<Leaf>(path, leaf_data);
return std::make_shared<Node>(kind, leaf_data, path);
return std::make_shared<Node>(kind);
});
@ -94,9 +83,8 @@ public:
{
current_node = it->second.get();
node_creator(current_node->kind, true);
bool current_node_is_nested = current_node->kind == Node::NESTED;
if (current_node_is_nested != parts[i].is_nested)
if (current_node->isNested() != parts[i].is_nested)
return false;
}
else
@ -114,10 +102,7 @@ public:
auto next_node = node_creator(Node::SCALAR, false);
current_node->addChild(String(parts.back().key), next_node);
auto leaf = std::dynamic_pointer_cast<Leaf>(next_node);
assert(leaf);
leaves.push_back(std::move(leaf));
leaves.push_back(std::move(next_node));
return true;
}
@ -132,22 +117,28 @@ public:
return findImpl(path, true);
}
const Leaf * findLeaf(const PathInData & path) const
const Node * findLeaf(const PathInData & path) const
{
return typeid_cast<const Leaf *>(findExact(path));
const auto * candidate = findExact(path);
if (!candidate || !candidate->isScalar())
return nullptr;
return candidate;
}
using LeafPredicate = std::function<bool(const Leaf &)>;
using NodePredicate = std::function<bool(const Node &)>;
const Leaf * findLeaf(const LeafPredicate & predicate)
const Node * findLeaf(const NodePredicate & predicate)
{
return findLeaf(root.get(), predicate);
}
static const Leaf * findLeaf(const Node * node, const LeafPredicate & predicate)
static const Node * findLeaf(const Node * node, const NodePredicate & predicate)
{
if (const auto * leaf = typeid_cast<const Leaf *>(node))
return predicate(*leaf) ? leaf : nullptr;
if (!node)
return nullptr;
if (node->isScalar())
return predicate(*node) ? node : nullptr;
for (const auto & [_, child] : node->children)
if (const auto * leaf = findLeaf(child.get(), predicate))
@ -156,8 +147,6 @@ public:
return nullptr;
}
using NodePredicate = std::function<bool(const Node &)>;
static const Node * findParent(const Node * node, const NodePredicate & predicate)
{
while (node && !predicate(*node))
@ -168,12 +157,13 @@ public:
bool empty() const { return root == nullptr; }
size_t size() const { return leaves.size(); }
using Leaves = std::vector<LeafPtr>;
const Leaves & getLeaves() const { return leaves; }
using Nodes = std::vector<NodePtr>;
const Nodes & getLeaves() const { return leaves; }
const Node * getRoot() const { return root.get(); }
using iterator = typename Leaves::iterator;
using const_iterator = typename Leaves::const_iterator;
using iterator = typename Nodes::iterator;
using const_iterator = typename Nodes::const_iterator;
iterator begin() { return leaves.begin(); }
iterator end() { return leaves.end(); }
@ -200,11 +190,10 @@ private:
}
return current_node;
}
NodePtr root;
Leaves leaves;
Nodes leaves;
};
}

View File

@ -14,7 +14,6 @@ namespace DB
* Examples: there is no least common supertype for Array(UInt8), Int8.
*/
DataTypePtr getLeastSupertype(const DataTypes & types, bool allow_conversion_to_string = false);
DataTypePtr getLeastSupertype(const DataTypePtr & lhs, const DataTypePtr & rhs, bool allow_conversion_to_string = false);
using TypeIndexSet = std::unordered_set<TypeIndex>;
DataTypePtr getLeastSupertype(const TypeIndexSet & types, bool allow_conversion_to_string = false);

View File

@ -13,7 +13,6 @@
#include <Processors/QueryPlan/ReadFromPreparedSource.h>
#include <Processors/QueryPlan/QueryPlan.h>
#include <Storages/AlterCommands.h>
#include <boost/algorithm/string/join.hpp>
namespace DB