#include <DataTypes/ObjectUtils.h>
#include <DataTypes/DataTypeObject.h>
#include <DataTypes/DataTypeNothing.h>
#include <DataTypes/DataTypeArray.h>
#include <DataTypes/DataTypeMap.h>
#include <DataTypes/DataTypeNullable.h>
#include <DataTypes/DataTypesNumber.h>
#include <DataTypes/DataTypeNested.h>
#include <DataTypes/DataTypeFactory.h>
#include <DataTypes/getLeastSupertype.h>
#include <DataTypes/NestedUtils.h>
#include <Storages/StorageSnapshot.h>
#include <Columns/ColumnObject.h>
#include <Columns/ColumnTuple.h>
#include <Columns/ColumnArray.h>
#include <Columns/ColumnMap.h>
#include <Columns/ColumnNullable.h>
#include <Parsers/ASTSelectQuery.h>
#include <Parsers/ASTExpressionList.h>
#include <Parsers/ASTLiteral.h>
#include <Parsers/ASTFunction.h>
#include <IO/Operators.h>

namespace DB
{

namespace ErrorCodes
{
    extern const int TYPE_MISMATCH;
    extern const int LOGICAL_ERROR;
    extern const int INCOMPATIBLE_COLUMNS;
}

size_t getNumberOfDimensions(const IDataType & type)
{
    if (const auto * type_array = typeid_cast<const DataTypeArray *>(&type))
        return type_array->getNumberOfDimensions();
    return 0;
}

size_t getNumberOfDimensions(const IColumn & column)
{
    if (const auto * column_array = checkAndGetColumn<ColumnArray>(column))
        return column_array->getNumberOfDimensions();
    return 0;
}
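/// Returns the innermost element type of a (possibly multidimensional) Array type,
/// or the type itself if it is not an Array; getBaseColumnOfArray below is the
/// analogous helper for columns. Illustrative example: Array(Array(UInt32)) -> UInt32.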
DataTypePtr getBaseTypeOfArray(const DataTypePtr & type)
{
    /// Get raw pointers to avoid extra copying of type pointers.
    const DataTypeArray * last_array = nullptr;
    const auto * current_type = type.get();
    while (const auto * type_array = typeid_cast<const DataTypeArray *>(current_type))
    {
        current_type = type_array->getNestedType().get();
        last_array = type_array;
    }

    return last_array ? last_array->getNestedType() : type;
}

ColumnPtr getBaseColumnOfArray(const ColumnPtr & column)
{
    /// Get raw pointers to avoid extra copying of column pointers.
    const ColumnArray * last_array = nullptr;
    const auto * current_column = column.get();
    while (const auto * column_array = checkAndGetColumn<ColumnArray>(current_column))
    {
        current_column = &column_array->getData();
        last_array = column_array;
    }

    return last_array ? last_array->getDataPtr() : column;
}
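/// Wraps a type (or column) into the requested number of Array levels.
/// Illustrative example: createArrayOfType(UInt64, 2) yields Array(Array(UInt64)).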
DataTypePtr createArrayOfType(DataTypePtr type, size_t num_dimensions)
{
    for (size_t i = 0; i < num_dimensions; ++i)
        type = std::make_shared<DataTypeArray>(std::move(type));
    return type;
}

ColumnPtr createArrayOfColumn(ColumnPtr column, size_t num_dimensions)
{
    for (size_t i = 0; i < num_dimensions; ++i)
        column = ColumnArray::create(column);
    return column;
}
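/// Creates a Field with an empty array nested to the requested depth.
/// Illustrative example: for num_dimensions = 3 the resulting Field is [[[]]].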
Array createEmptyArrayField(size_t num_dimensions)
{
    if (num_dimensions == 0)
        throw Exception(ErrorCodes::LOGICAL_ERROR, "Cannot create array field with 0 dimensions");

    Array array;
    Array * current_array = &array;
    for (size_t i = 1; i < num_dimensions; ++i)
    {
        current_array->push_back(Array());
        current_array = &current_array->back().get<Array &>();
    }

    return array;
}
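/// Deduces a data type from a column. Only simple types, Nothing, Array and Nullable
/// columns are handled here (see the TODO below); other columns throw LOGICAL_ERROR.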
DataTypePtr getDataTypeByColumn(const IColumn & column)
{
    auto idx = column.getDataType();
    WhichDataType which(idx);
    if (which.isSimple())
        return DataTypeFactory::instance().get(String(magic_enum::enum_name(idx)));

    if (which.isNothing())
        return std::make_shared<DataTypeNothing>();

    if (const auto * column_array = checkAndGetColumn<ColumnArray>(&column))
        return std::make_shared<DataTypeArray>(getDataTypeByColumn(column_array->getData()));

    if (const auto * column_nullable = checkAndGetColumn<ColumnNullable>(&column))
        return makeNullable(getDataTypeByColumn(column_nullable->getNestedColumn()));

    /// TODO: add more types.
    throw Exception(ErrorCodes::LOGICAL_ERROR, "Cannot get data type of column {}", column.getFamilyName());
}

template <size_t I, typename Tuple>
static auto extractVector(const std::vector<Tuple> & vec)
{
    static_assert(I < std::tuple_size_v<Tuple>);
    std::vector<std::tuple_element_t<I, Tuple>> res;
    res.reserve(vec.size());
    for (const auto & elem : vec)
        res.emplace_back(std::get<I>(elem));
    return res;
}

static DataTypePtr recreateTupleWithElements(const DataTypeTuple & type_tuple, const DataTypes & elements)
{
    return type_tuple.haveExplicitNames()
        ? std::make_shared<DataTypeTuple>(elements, type_tuple.getElementNames())
        : std::make_shared<DataTypeTuple>(elements);
}
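/// Converts a ColumnObject into a named Tuple column and type by unflattening
/// the paths and least common types of its subcolumns; the column is finalized
/// first if needed.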
static std::pair<ColumnPtr, DataTypePtr> convertObjectColumnToTuple(
    const ColumnObject & column_object, const DataTypeObject & type_object)
{
    if (!column_object.isFinalized())
    {
        auto finalized = column_object.cloneFinalized();
        const auto & finalized_object = assert_cast<const ColumnObject &>(*finalized);
        return convertObjectColumnToTuple(finalized_object, type_object);
    }

    const auto & subcolumns = column_object.getSubcolumns();

    PathsInData tuple_paths;
    DataTypes tuple_types;
    Columns tuple_columns;

    for (const auto & entry : subcolumns)
    {
        tuple_paths.emplace_back(entry->path);
        tuple_types.emplace_back(entry->data.getLeastCommonType());
        tuple_columns.emplace_back(entry->data.getFinalizedColumnPtr());
    }

    return unflattenTuple(tuple_paths, tuple_types, tuple_columns);
}
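/// Recursively descends through Array, Map and Tuple wrappers and converts
/// every nested Object column into its Tuple representation.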
static std::pair<ColumnPtr, DataTypePtr> recursivelyConvertDynamicColumnToTuple(
    const ColumnPtr & column, const DataTypePtr & type)
{
    if (!type->hasDynamicSubcolumns())
        return {column, type};

    if (const auto * type_object = typeid_cast<const DataTypeObject *>(type.get()))
    {
        const auto & column_object = assert_cast<const ColumnObject &>(*column);
        return convertObjectColumnToTuple(column_object, *type_object);
    }

    if (const auto * type_array = typeid_cast<const DataTypeArray *>(type.get()))
    {
        const auto & column_array = assert_cast<const ColumnArray &>(*column);
        auto [new_column, new_type] = recursivelyConvertDynamicColumnToTuple(
            column_array.getDataPtr(), type_array->getNestedType());

        return
        {
            ColumnArray::create(new_column, column_array.getOffsetsPtr()),
            std::make_shared<DataTypeArray>(std::move(new_type)),
        };
    }

    if (const auto * type_map = typeid_cast<const DataTypeMap *>(type.get()))
    {
        const auto & column_map = assert_cast<const ColumnMap &>(*column);
        auto [new_column, new_type] = recursivelyConvertDynamicColumnToTuple(
            column_map.getNestedColumnPtr(), type_map->getNestedType());

        return
        {
            ColumnMap::create(new_column),
            std::make_shared<DataTypeMap>(std::move(new_type)),
        };
    }

    if (const auto * type_tuple = typeid_cast<const DataTypeTuple *>(type.get()))
    {
        const auto & tuple_columns = assert_cast<const ColumnTuple &>(*column).getColumns();
        const auto & tuple_types = type_tuple->getElements();

        assert(tuple_columns.size() == tuple_types.size());
        const size_t tuple_size = tuple_types.size();

        Columns new_tuple_columns(tuple_size);
        DataTypes new_tuple_types(tuple_size);

        for (size_t i = 0; i < tuple_size; ++i)
        {
            std::tie(new_tuple_columns[i], new_tuple_types[i])
                = recursivelyConvertDynamicColumnToTuple(tuple_columns[i], tuple_types[i]);
        }

        return
        {
            ColumnTuple::create(new_tuple_columns),
            recreateTupleWithElements(*type_tuple, new_tuple_types)
        };
    }

    throw Exception(ErrorCodes::LOGICAL_ERROR, "Type {} unexpectedly has dynamic columns", type->getName());
}
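/// Replaces every dynamic (Object) column in the block with its Tuple representation
/// and checks that the constructed type is compatible with the type in storage.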
void convertDynamicColumnsToTuples(Block & block, const StorageSnapshotPtr & storage_snapshot)
{
    for (auto & column : block)
    {
        if (!column.type->hasDynamicSubcolumns())
            continue;

        std::tie(column.column, column.type)
            = recursivelyConvertDynamicColumnToTuple(column.column, column.type);

        GetColumnsOptions options(GetColumnsOptions::AllPhysical);
        auto storage_column = storage_snapshot->tryGetColumn(options, column.name);
        if (!storage_column)
            throw Exception(ErrorCodes::LOGICAL_ERROR, "Column '{}' not found in storage", column.name);

        auto storage_column_concrete = storage_snapshot->getColumn(options.withExtendedObjects(), column.name);

        /// Check that the constructed Tuple type and the type in storage are compatible.
        getLeastCommonTypeForDynamicColumns(
            storage_column->type, {column.type, storage_column_concrete.type}, true);
    }
}
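/// Returns true if `prefix` is a prefix of `parts`, compared by key names only.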
static bool isPrefix(const PathInData::Parts & prefix, const PathInData::Parts & parts)
{
    if (prefix.size() > parts.size())
        return false;

    for (size_t i = 0; i < prefix.size(); ++i)
        if (prefix[i].key != parts[i].key)
            return false;
    return true;
}
/// Returns true if there exists a prefix with matching names
/// but mismatching structure (whether a part is Nested, number of dimensions).
static bool hasDifferentStructureInPrefix(const PathInData::Parts & lhs, const PathInData::Parts & rhs)
{
    for (size_t i = 0; i < std::min(lhs.size(), rhs.size()); ++i)
    {
        if (lhs[i].key != rhs[i].key)
            return false;
        else if (lhs[i] != rhs[i])
            return true;
    }
    return false;
}
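/// Throws INCOMPATIBLE_COLUMNS if one path is a prefix of another
/// or if two paths share a name prefix that differs in structure.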
void checkObjectHasNoAmbiguosPaths(const PathsInData & paths)
{
    size_t size = paths.size();
    for (size_t i = 0; i < size; ++i)
    {
        for (size_t j = 0; j < i; ++j)
        {
            if (isPrefix(paths[i].getParts(), paths[j].getParts())
                || isPrefix(paths[j].getParts(), paths[i].getParts()))
                throw Exception(ErrorCodes::INCOMPATIBLE_COLUMNS,
                    "Data in Object has ambiguous paths: '{}' and '{}'",
                    paths[i].getPath(), paths[j].getPath());

            if (hasDifferentStructureInPrefix(paths[i].getParts(), paths[j].getParts()))
                throw Exception(ErrorCodes::INCOMPATIBLE_COLUMNS,
                    "Data in Object has ambiguous paths: '{}' and '{}'. "
                    "Paths have prefixes matched by names, but different in structure",
                    paths[i].getPath(), paths[j].getPath());
        }
    }
}
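/// Illustrative example: for flattened tuples Tuple(a UInt8, b String) and
/// Tuple(a UInt16, c Array(UInt8)) the result is Tuple(a UInt16, b String, c Array(UInt8)):
/// the union of paths is kept and scalar types are widened via getLeastSupertypeOrString.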
static DataTypePtr getLeastCommonTypeForObject(const DataTypes & types, bool check_ambiguos_paths)
{
    /// Types of subcolumns by path from all tuples.
    std::unordered_map<PathInData, DataTypes, PathInData::Hash> subcolumns_types;

    /// First we flatten the tuples, then get the least common type for each path,
    /// and finally unflatten the paths and create a new tuple type.
    for (const auto & type : types)
    {
        const auto * type_tuple = typeid_cast<const DataTypeTuple *>(type.get());
        if (!type_tuple)
            throw Exception(ErrorCodes::LOGICAL_ERROR,
                "Least common type for object can be deduced only from tuples, but {} given", type->getName());

        auto [tuple_paths, tuple_types] = flattenTuple(type);
        assert(tuple_paths.size() == tuple_types.size());

        for (size_t i = 0; i < tuple_paths.size(); ++i)
            subcolumns_types[tuple_paths[i]].push_back(tuple_types[i]);
    }

    PathsInData tuple_paths;
    DataTypes tuple_types;

    /// Get the least common type for all paths.
    for (const auto & [key, subtypes] : subcolumns_types)
    {
        assert(!subtypes.empty());
        if (key.getPath() == ColumnObject::COLUMN_NAME_DUMMY)
            continue;

        size_t first_dim = getNumberOfDimensions(*subtypes[0]);
        for (size_t i = 1; i < subtypes.size(); ++i)
            if (first_dim != getNumberOfDimensions(*subtypes[i]))
                throw Exception(ErrorCodes::TYPE_MISMATCH,
                    "Incompatible types of subcolumn '{}': {} and {}",
                    key.getPath(), subtypes[0]->getName(), subtypes[i]->getName());

        tuple_paths.emplace_back(key);
        tuple_types.emplace_back(getLeastSupertypeOrString(subtypes));
    }

    if (tuple_paths.empty())
    {
        tuple_paths.emplace_back(ColumnObject::COLUMN_NAME_DUMMY);
        tuple_types.emplace_back(std::make_shared<DataTypeUInt8>());
    }

    if (check_ambiguos_paths)
        checkObjectHasNoAmbiguosPaths(tuple_paths);

    return unflattenTuple(tuple_paths, tuple_types);
}
static DataTypePtr getLeastCommonTypeForDynamicColumnsImpl(
    const DataTypePtr & type_in_storage, const DataTypes & concrete_types, bool check_ambiguos_paths);

template <typename Type>
static DataTypePtr getLeastCommonTypeForColumnWithNestedType(
    const Type & type, const DataTypes & concrete_types, bool check_ambiguos_paths)
{
    DataTypes nested_types;
    nested_types.reserve(concrete_types.size());

    for (const auto & concrete_type : concrete_types)
    {
        const auto * type_with_nested_concrete = typeid_cast<const Type *>(concrete_type.get());
        if (!type_with_nested_concrete)
            throw Exception(ErrorCodes::TYPE_MISMATCH, "Expected {} type, got {}", demangle(typeid(Type).name()), concrete_type->getName());

        nested_types.push_back(type_with_nested_concrete->getNestedType());
    }

    return std::make_shared<Type>(
        getLeastCommonTypeForDynamicColumnsImpl(
            type.getNestedType(), nested_types, check_ambiguos_paths));
}

static DataTypePtr getLeastCommonTypeForTuple(
    const DataTypeTuple & type, const DataTypes & concrete_types, bool check_ambiguos_paths)
{
    const auto & element_types = type.getElements();
    DataTypes new_element_types(element_types.size());

    for (size_t i = 0; i < element_types.size(); ++i)
    {
        DataTypes concrete_element_types;
        concrete_element_types.reserve(concrete_types.size());

        for (const auto & type_concrete : concrete_types)
        {
            const auto * type_tuple_concrete = typeid_cast<const DataTypeTuple *>(type_concrete.get());
            if (!type_tuple_concrete)
                throw Exception(ErrorCodes::TYPE_MISMATCH, "Expected Tuple type, got {}", type_concrete->getName());

            concrete_element_types.push_back(type_tuple_concrete->getElement(i));
        }

        new_element_types[i] = getLeastCommonTypeForDynamicColumnsImpl(
            element_types[i], concrete_element_types, check_ambiguos_paths);
    }

    return recreateTupleWithElements(type, new_element_types);
}
static DataTypePtr getLeastCommonTypeForDynamicColumnsImpl(
    const DataTypePtr & type_in_storage, const DataTypes & concrete_types, bool check_ambiguos_paths)
{
    if (!type_in_storage->hasDynamicSubcolumns())
        return type_in_storage;

    if (isObject(type_in_storage))
        return getLeastCommonTypeForObject(concrete_types, check_ambiguos_paths);

    if (const auto * type_array = typeid_cast<const DataTypeArray *>(type_in_storage.get()))
        return getLeastCommonTypeForColumnWithNestedType(*type_array, concrete_types, check_ambiguos_paths);

    if (const auto * type_map = typeid_cast<const DataTypeMap *>(type_in_storage.get()))
        return getLeastCommonTypeForColumnWithNestedType(*type_map, concrete_types, check_ambiguos_paths);

    if (const auto * type_tuple = typeid_cast<const DataTypeTuple *>(type_in_storage.get()))
        return getLeastCommonTypeForTuple(*type_tuple, concrete_types, check_ambiguos_paths);

    throw Exception(ErrorCodes::LOGICAL_ERROR, "Type {} unexpectedly has dynamic columns", type_in_storage->getName());
}

DataTypePtr getLeastCommonTypeForDynamicColumns(
    const DataTypePtr & type_in_storage, const DataTypes & concrete_types, bool check_ambiguos_paths)
{
    if (concrete_types.empty())
        return nullptr;

    bool all_equal = true;
    for (size_t i = 1; i < concrete_types.size(); ++i)
    {
        if (!concrete_types[i]->equals(*concrete_types[0]))
        {
            all_equal = false;
            break;
        }
    }

    if (all_equal)
        return concrete_types[0];

    return getLeastCommonTypeForDynamicColumnsImpl(type_in_storage, concrete_types, check_ambiguos_paths);
}
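/// Returns the concrete type that an empty dynamic column takes for the given storage type:
/// Object becomes a Tuple with a single dummy UInt8 element (ColumnObject::COLUMN_NAME_DUMMY),
/// and Array/Map/Tuple wrappers are rebuilt around the converted nested types.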
DataTypePtr createConcreteEmptyDynamicColumn(const DataTypePtr & type_in_storage)
{
    if (!type_in_storage->hasDynamicSubcolumns())
        return type_in_storage;

    if (isObject(type_in_storage))
        return std::make_shared<DataTypeTuple>(
            DataTypes{std::make_shared<DataTypeUInt8>()}, Names{ColumnObject::COLUMN_NAME_DUMMY});

    if (const auto * type_array = typeid_cast<const DataTypeArray *>(type_in_storage.get()))
        return std::make_shared<DataTypeArray>(
            createConcreteEmptyDynamicColumn(type_array->getNestedType()));

    if (const auto * type_map = typeid_cast<const DataTypeMap *>(type_in_storage.get()))
        return std::make_shared<DataTypeMap>(
            createConcreteEmptyDynamicColumn(type_map->getNestedType()));

    if (const auto * type_tuple = typeid_cast<const DataTypeTuple *>(type_in_storage.get()))
    {
        const auto & elements = type_tuple->getElements();
        DataTypes new_elements;
        new_elements.reserve(elements.size());

        for (const auto & element : elements)
            new_elements.push_back(createConcreteEmptyDynamicColumn(element));

        return recreateTupleWithElements(*type_tuple, new_elements);
    }

    throw Exception(ErrorCodes::LOGICAL_ERROR, "Type {} unexpectedly has dynamic columns", type_in_storage->getName());
}

bool hasDynamicSubcolumns(const ColumnsDescription & columns)
{
    return std::any_of(columns.begin(), columns.end(),
        [](const auto & column)
        {
            return column.type->hasDynamicSubcolumns();
        });
}
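/// Replaces the types of Object columns in `columns_list` with their concrete (extended)
/// types from `object_columns` and, if requested, appends their subcolumns to the list.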
void extendObjectColumns(NamesAndTypesList & columns_list, const ColumnsDescription & object_columns, bool with_subcolumns)
{
    NamesAndTypesList subcolumns_list;
    for (auto & column : columns_list)
    {
        auto object_column = object_columns.tryGetColumn(GetColumnsOptions::All, column.name);
        if (object_column)
        {
            column.type = object_column->type;

            if (with_subcolumns)
                subcolumns_list.splice(subcolumns_list.end(), object_columns.getSubcolumns(column.name));
        }
    }

    columns_list.splice(columns_list.end(), std::move(subcolumns_list));
}
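/// Updates the descriptions of dynamic columns in `object_columns`: where a newly inserted
/// column has a different concrete type, the stored type is widened to the least common type.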
void updateObjectColumns(
    ColumnsDescription & object_columns,
    const ColumnsDescription & storage_columns,
    const NamesAndTypesList & new_columns)
{
    for (const auto & new_column : new_columns)
    {
        auto object_column = object_columns.tryGetColumn(GetColumnsOptions::All, new_column.name);
        if (object_column && !object_column->type->equals(*new_column.type))
        {
            auto storage_column = storage_columns.getColumn(GetColumnsOptions::All, new_column.name);
            object_columns.modify(new_column.name, [&](auto & column)
            {
                column.type = getLeastCommonTypeForDynamicColumns(storage_column.type, {object_column->type, new_column.type});
            });
        }
    }
}
namespace
{

void flattenTupleImpl(
    PathInDataBuilder & builder,
    DataTypePtr type,
    std::vector<PathInData::Parts> & new_paths,
    DataTypes & new_types)
{
    if (const auto * type_tuple = typeid_cast<const DataTypeTuple *>(type.get()))
    {
        const auto & tuple_names = type_tuple->getElementNames();
        const auto & tuple_types = type_tuple->getElements();

        for (size_t i = 0; i < tuple_names.size(); ++i)
        {
            builder.append(tuple_names[i], false);
            flattenTupleImpl(builder, tuple_types[i], new_paths, new_types);
            builder.popBack();
        }
    }
    else if (const auto * type_array = typeid_cast<const DataTypeArray *>(type.get()))
    {
        PathInDataBuilder element_builder;
        std::vector<PathInData::Parts> element_paths;
        DataTypes element_types;

        flattenTupleImpl(element_builder, type_array->getNestedType(), element_paths, element_types);
        assert(element_paths.size() == element_types.size());

        for (size_t i = 0; i < element_paths.size(); ++i)
        {
            builder.append(element_paths[i], true);
            new_paths.emplace_back(builder.getParts());
            new_types.emplace_back(std::make_shared<DataTypeArray>(element_types[i]));
            builder.popBack(element_paths[i].size());
        }
    }
    else
    {
        new_paths.emplace_back(builder.getParts());
        new_types.emplace_back(type);
    }
}
/// @offsets_columns is used as a stack of array offsets and allows recreating Array columns.
void flattenTupleImpl(const ColumnPtr & column, Columns & new_columns, Columns & offsets_columns)
{
    if (const auto * column_tuple = checkAndGetColumn<ColumnTuple>(column.get()))
    {
        const auto & subcolumns = column_tuple->getColumns();
        for (const auto & subcolumn : subcolumns)
            flattenTupleImpl(subcolumn, new_columns, offsets_columns);
    }
    else if (const auto * column_array = checkAndGetColumn<ColumnArray>(column.get()))
    {
        offsets_columns.push_back(column_array->getOffsetsPtr());
        flattenTupleImpl(column_array->getDataPtr(), new_columns, offsets_columns);
        offsets_columns.pop_back();
    }
    else
    {
        if (!offsets_columns.empty())
        {
            auto new_column = ColumnArray::create(column, offsets_columns.back());
            for (auto it = offsets_columns.rbegin() + 1; it != offsets_columns.rend(); ++it)
                new_column = ColumnArray::create(new_column, *it);

            new_columns.push_back(std::move(new_column));
        }
        else
        {
            new_columns.push_back(column);
        }
    }
}
DataTypePtr reduceNumberOfDimensions(DataTypePtr type, size_t dimensions_to_reduce)
{
    while (dimensions_to_reduce--)
    {
        const auto * type_array = typeid_cast<const DataTypeArray *>(type.get());
        if (!type_array)
            throw Exception(ErrorCodes::LOGICAL_ERROR, "Not enough dimensions to reduce");

        type = type_array->getNestedType();
    }

    return type;
}

ColumnPtr reduceNumberOfDimensions(ColumnPtr column, size_t dimensions_to_reduce)
{
    while (dimensions_to_reduce--)
    {
        const auto * column_array = typeid_cast<const ColumnArray *>(column.get());
        if (!column_array)
            throw Exception(ErrorCodes::LOGICAL_ERROR, "Not enough dimensions to reduce");

        column = column_array->getDataPtr();
    }

    return column;
}
/// We save an intermediate column, type and number of array dimensions
/// for each intermediate node of a path in the subcolumns tree.
struct ColumnWithTypeAndDimensions
{
    ColumnPtr column;
    DataTypePtr type;
    size_t array_dimensions;
};

using SubcolumnsTreeWithColumns = SubcolumnsTree<ColumnWithTypeAndDimensions>;
using Node = SubcolumnsTreeWithColumns::Node;
/// Creates a data type and a column from a tree of subcolumns.
ColumnWithTypeAndDimensions createTypeFromNode(const Node & node)
{
    auto collect_tuple_elements = [](const auto & children)
    {
        if (children.empty())
            throw Exception(ErrorCodes::LOGICAL_ERROR, "Cannot create type from empty Tuple or Nested node");

        std::vector<std::tuple<String, ColumnWithTypeAndDimensions>> tuple_elements;
        tuple_elements.reserve(children.size());
        for (const auto & [name, child] : children)
        {
            assert(child);
            auto column = createTypeFromNode(*child);
            tuple_elements.emplace_back(name, std::move(column));
        }

        /// Sort to always create the same type for the same set of subcolumns.
        ::sort(tuple_elements.begin(), tuple_elements.end(),
            [](const auto & lhs, const auto & rhs) { return std::get<0>(lhs) < std::get<0>(rhs); });

        auto tuple_names = extractVector<0>(tuple_elements);
        auto tuple_columns = extractVector<1>(tuple_elements);

        return std::make_tuple(std::move(tuple_names), std::move(tuple_columns));
    };

    if (node.kind == Node::SCALAR)
    {
        return node.data;
    }
    else if (node.kind == Node::NESTED)
    {
        auto [tuple_names, tuple_columns] = collect_tuple_elements(node.children);

        Columns offsets_columns;
        offsets_columns.reserve(tuple_columns[0].array_dimensions + 1);

        /// If we have a Nested node and a child node with anonymous array levels,
        /// we need to push the Nested type through all array levels.
        /// Example: { "k1": [[{"k2": 1, "k3": 2}]] } should be parsed as
        /// `k1 Array(Nested(k2 Int, k3 Int))`; in that case k1 is marked as Nested
        /// and `k2` and `k3` have anonymous_array_level = 1.

        const auto & current_array = assert_cast<const ColumnArray &>(*node.data.column);
        offsets_columns.push_back(current_array.getOffsetsPtr());

        auto first_column = tuple_columns[0].column;
        for (size_t i = 0; i < tuple_columns[0].array_dimensions; ++i)
        {
            const auto & column_array = assert_cast<const ColumnArray &>(*first_column);
            offsets_columns.push_back(column_array.getOffsetsPtr());
            first_column = column_array.getDataPtr();
        }

        size_t num_elements = tuple_columns.size();
        Columns tuple_elements_columns(num_elements);
        DataTypes tuple_elements_types(num_elements);

        /// Reduce extra array dimensions to get columns and types of Nested elements.
        for (size_t i = 0; i < num_elements; ++i)
        {
            assert(tuple_columns[i].array_dimensions == tuple_columns[0].array_dimensions);
            tuple_elements_columns[i] = reduceNumberOfDimensions(tuple_columns[i].column, tuple_columns[i].array_dimensions);
            tuple_elements_types[i] = reduceNumberOfDimensions(tuple_columns[i].type, tuple_columns[i].array_dimensions);
        }

        auto result_column = ColumnArray::create(ColumnTuple::create(tuple_elements_columns), offsets_columns.back());
        auto result_type = createNested(tuple_elements_types, tuple_names);

        /// Recreate the resulting Array type and Array column.
        for (auto it = offsets_columns.rbegin() + 1; it != offsets_columns.rend(); ++it)
        {
            result_column = ColumnArray::create(result_column, *it);
            result_type = std::make_shared<DataTypeArray>(result_type);
        }

        return {result_column, result_type, tuple_columns[0].array_dimensions};
    }
    else
    {
        auto [tuple_names, tuple_columns] = collect_tuple_elements(node.children);

        size_t num_elements = tuple_columns.size();
        Columns tuple_elements_columns(num_elements);
        DataTypes tuple_elements_types(num_elements);

        for (size_t i = 0; i < tuple_columns.size(); ++i)
        {
            assert(tuple_columns[i].array_dimensions == tuple_columns[0].array_dimensions);
            tuple_elements_columns[i] = tuple_columns[i].column;
            tuple_elements_types[i] = tuple_columns[i].type;
        }

        auto result_column = ColumnTuple::create(tuple_elements_columns);
        auto result_type = std::make_shared<DataTypeTuple>(tuple_elements_types, tuple_names);

        return {result_column, result_type, tuple_columns[0].array_dimensions};
    }
}

}
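/// Illustrative example: Tuple(k1 UInt32, k2 Tuple(k3 String, k4 Array(UInt32)))
/// is flattened into paths [k1, k2.k3, k2.k4] with types [UInt32, String, Array(UInt32)].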
std::pair<PathsInData, DataTypes> flattenTuple(const DataTypePtr & type)
{
    std::vector<PathInData::Parts> new_path_parts;
    DataTypes new_types;
    PathInDataBuilder builder;

    flattenTupleImpl(builder, type, new_path_parts, new_types);

    PathsInData new_paths(new_path_parts.begin(), new_path_parts.end());
    return {new_paths, new_types};
}

ColumnPtr flattenTuple(const ColumnPtr & column)
{
    Columns new_columns;
    Columns offsets_columns;

    flattenTupleImpl(column, new_columns, offsets_columns);
    return ColumnTuple::create(new_columns);
}

DataTypePtr unflattenTuple(const PathsInData & paths, const DataTypes & tuple_types)
{
    assert(paths.size() == tuple_types.size());
    Columns tuple_columns;
    tuple_columns.reserve(tuple_types.size());
    for (const auto & type : tuple_types)
        tuple_columns.emplace_back(type->createColumn());

    return unflattenTuple(paths, tuple_types, tuple_columns).second;
}
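/// Converts the subcolumns of a ColumnObject into a Tuple column and type;
/// an Object without subcolumns becomes a Tuple with a single dummy UInt8 column.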
std::pair<ColumnPtr, DataTypePtr> unflattenObjectToTuple(const ColumnObject & column)
{
    const auto & subcolumns = column.getSubcolumns();

    if (subcolumns.empty())
    {
        auto type = std::make_shared<DataTypeTuple>(
            DataTypes{std::make_shared<DataTypeUInt8>()},
            Names{ColumnObject::COLUMN_NAME_DUMMY});

        return {type->createColumn()->cloneResized(column.size()), type};
    }

    PathsInData paths;
    DataTypes types;
    Columns columns;

    paths.reserve(subcolumns.size());
    types.reserve(subcolumns.size());
    columns.reserve(subcolumns.size());

    for (const auto & entry : subcolumns)
    {
        paths.emplace_back(entry->path);
        types.emplace_back(entry->data.getLeastCommonType());
        columns.emplace_back(entry->data.getFinalizedColumnPtr());
    }

    return unflattenTuple(paths, types, columns);
}
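/// Illustrative example: paths [a.b, a.c] with types [UInt32, String]
/// are unflattened into Tuple(a Tuple(b UInt32, c String)).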
std::pair<ColumnPtr, DataTypePtr> unflattenTuple(
    const PathsInData & paths,
    const DataTypes & tuple_types,
    const Columns & tuple_columns)
{
    assert(paths.size() == tuple_types.size());
    assert(paths.size() == tuple_columns.size());

    if (paths.empty())
        throw Exception(ErrorCodes::LOGICAL_ERROR, "Cannot unflatten empty Tuple");

    /// We add all paths to the subcolumn tree and then create a type from it.
    /// The tree stores column, type and number of array dimensions
    /// for each intermediate node.
    SubcolumnsTreeWithColumns tree;

    for (size_t i = 0; i < paths.size(); ++i)
    {
        auto column = tuple_columns[i];
        auto type = tuple_types[i];

        const auto & parts = paths[i].getParts();
        size_t num_parts = parts.size();

        size_t pos = 0;
        tree.add(paths[i], [&](Node::Kind kind, bool exists) -> std::shared_ptr<Node>
        {
            if (pos >= num_parts)
                throw Exception(ErrorCodes::LOGICAL_ERROR,
                    "Not enough name parts for path {}. Expected at least {}, got {}",
                    paths[i].getPath(), pos + 1, num_parts);

            size_t array_dimensions = kind == Node::NESTED ? 1 : parts[pos].anonymous_array_level;
            ColumnWithTypeAndDimensions current_column{column, type, array_dimensions};

            /// Get type and column for the next node.
            if (array_dimensions)
            {
                type = reduceNumberOfDimensions(type, array_dimensions);
                column = reduceNumberOfDimensions(column, array_dimensions);
            }

            ++pos;
            if (exists)
                return nullptr;

            return kind == Node::SCALAR
                ? std::make_shared<Node>(kind, current_column, paths[i])
                : std::make_shared<Node>(kind, current_column);
        });
    }

    auto [column, type, _] = createTypeFromNode(tree.getRoot());
    return std::make_pair(std::move(column), std::move(type));
}
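/// Adds `materialize(CAST(<default value>, '<type name>')) AS <column_name>`
/// to the WITH clause of the query, creating the WITH clause if it is absent.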
static void addConstantToWithClause(const ASTPtr & query, const String & column_name, const DataTypePtr & data_type)
{
    auto & select = query->as<ASTSelectQuery &>();
    if (!select.with())
        select.setExpression(ASTSelectQuery::Expression::WITH, std::make_shared<ASTExpressionList>());

    /// TODO: avoid materialize
    auto node = makeASTFunction("materialize",
        makeASTFunction("CAST",
            std::make_shared<ASTLiteral>(data_type->getDefault()),
            std::make_shared<ASTLiteral>(data_type->getName())));

    node->alias = column_name;
    node->prefer_alias_to_column_name = true;
    select.with()->children.push_back(std::move(node));
}
/// @expected_columns and @available_columns contain descriptions
/// of extended Object columns.
void replaceMissedSubcolumnsByConstants(
    const ColumnsDescription & expected_columns,
    const ColumnsDescription & available_columns,
    ASTPtr query)
{
    NamesAndTypes missed_names_types;

    /// Find all subcolumns that are in @expected_columns, but not in @available_columns.
    for (const auto & column : available_columns)
    {
        auto expected_column = expected_columns.getColumn(GetColumnsOptions::All, column.name);

        /// Extract all paths from both descriptions to easily check the existence of subcolumns.
        auto [available_paths, available_types] = flattenTuple(column.type);
        auto [expected_paths, expected_types] = flattenTuple(expected_column.type);

        auto extract_names_and_types = [&column](const auto & paths, const auto & types)
        {
            NamesAndTypes res;
            res.reserve(paths.size());
            for (size_t i = 0; i < paths.size(); ++i)
            {
                auto full_name = Nested::concatenateName(column.name, paths[i].getPath());
                res.emplace_back(full_name, types[i]);
            }

            ::sort(res.begin(), res.end());
            return res;
        };

        auto available_names_types = extract_names_and_types(available_paths, available_types);
        auto expected_names_types = extract_names_and_types(expected_paths, expected_types);

        std::set_difference(
            expected_names_types.begin(), expected_names_types.end(),
            available_names_types.begin(), available_names_types.end(),
            std::back_inserter(missed_names_types),
            [](const auto & lhs, const auto & rhs) { return lhs.name < rhs.name; });
    }

    if (missed_names_types.empty())
        return;

    IdentifierNameSet identifiers;
    query->collectIdentifierNames(identifiers);

    /// Replace missed subcolumns with default literals of their types.
    for (const auto & [name, type] : missed_names_types)
        if (identifiers.contains(name))
            addConstantToWithClause(query, name, type);
}
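/// Keeps the outer num_dimensions_to_keep array levels of a Field
/// and replaces everything below them with `replacement`.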
Field FieldVisitorReplaceScalars::operator()(const Array & x) const
{
    if (num_dimensions_to_keep == 0)
        return replacement;

    const size_t size = x.size();
    Array res(size);
    for (size_t i = 0; i < size; ++i)
        res[i] = applyVisitor(FieldVisitorReplaceScalars(replacement, num_dimensions_to_keep - 1), x[i]);
    return res;
}
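/// Computes the number of array dimensions of a Field and sets need_fold_dimension
/// when sibling elements disagree about their depth.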
size_t FieldVisitorToNumberOfDimensions::operator()(const Array & x)
{
    const size_t size = x.size();
    size_t dimensions = 0;

    for (size_t i = 0; i < size; ++i)
    {
        size_t element_dimensions = applyVisitor(*this, x[i]);
        if (i > 0 && element_dimensions != dimensions)
            need_fold_dimension = true;

        dimensions = std::max(dimensions, element_dimensions);
    }

    return 1 + dimensions;
}
Field FieldVisitorFoldDimension::operator()(const Array & x) const
{
    if (num_dimensions_to_fold == 0)
        return x;

    const size_t size = x.size();
    Array res(size);
    for (size_t i = 0; i < size; ++i)
        res[i] = applyVisitor(FieldVisitorFoldDimension(num_dimensions_to_fold - 1), x[i]);

    return res;
}
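/// Replaces every dynamic (Object) column type in the list with its concrete empty
/// representation (see createConcreteEmptyDynamicColumn above).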
void setAllObjectsToDummyTupleType(NamesAndTypesList & columns)
{
    for (auto & column : columns)
        if (column.type->hasDynamicSubcolumns())
            column.type = createConcreteEmptyDynamicColumn(column.type);
}

}