From cbe12a532e222052978d59cf0f9ef141233dc4b1 Mon Sep 17 00:00:00 2001 From: Anton Popov Date: Tue, 6 Oct 2020 15:46:17 +0300 Subject: [PATCH] allow to extract subcolumns from column --- src/DataTypes/DataTypeArray.cpp | 22 ++-- src/DataTypes/DataTypeArray.h | 6 +- src/DataTypes/DataTypeLowCardinality.h | 1 + src/DataTypes/DataTypeNullable.cpp | 11 +- src/DataTypes/DataTypeNullable.h | 2 +- src/DataTypes/DataTypeTuple.cpp | 44 +++++++- src/DataTypes/DataTypeTuple.h | 4 +- src/DataTypes/IDataType.cpp | 104 +++++++++++++----- src/DataTypes/IDataType.h | 14 ++- src/Storages/ColumnsDescription.cpp | 11 +- src/Storages/ColumnsDescription.h | 2 +- src/Storages/MergeTree/IMergeTreeDataPart.cpp | 5 + src/Storages/MergeTree/IMergeTreeDataPart.h | 1 + src/Storages/MergeTree/IMergeTreeReader.cpp | 2 +- .../MergeTree/MergeTreeReaderCompact.cpp | 39 +++++-- .../MergeTree/MergeTreeReaderCompact.h | 2 +- .../0_stateless/001475_read_subcolumns_2.sql | 48 ++++++++ .../001475_read_subcolumns_storages.sh | 23 ++++ 18 files changed, 272 insertions(+), 69 deletions(-) create mode 100644 tests/queries/0_stateless/001475_read_subcolumns_2.sql create mode 100755 tests/queries/0_stateless/001475_read_subcolumns_storages.sh diff --git a/src/DataTypes/DataTypeArray.cpp b/src/DataTypes/DataTypeArray.cpp index 530d02ba68f..0c94aa693be 100644 --- a/src/DataTypes/DataTypeArray.cpp +++ b/src/DataTypes/DataTypeArray.cpp @@ -509,17 +509,25 @@ DataTypePtr DataTypeArray::getSubcolumnType(const String & subcolumn_name) const if (checkString("size", buf) && tryReadIntText(dim, buf) && dim < getNumberOfDimensions()) return std::make_shared<DataTypeUInt64>(); - return nullptr; + return std::make_shared<DataTypeArray>(nested->getSubcolumnType(subcolumn_name)); } -std::vector<String> DataTypeArray::getSubcolumnNames() const +MutableColumnPtr DataTypeArray::getSubcolumn(const String & subcolumn_name, IColumn & column) const { - size_t num_of_dimentions = getNumberOfDimensions(); - std::vector<String> res(num_of_dimentions); - for (size_t i = 0; i < num_of_dimentions; ++i) - res[i] = "size" + std::to_string(i); + return getSubcolumnImpl(subcolumn_name, column, 0); +} - return res; +MutableColumnPtr DataTypeArray::getSubcolumnImpl(const String & subcolumn_name, IColumn & column, size_t level) const +{ + auto & column_array = assert_cast<ColumnArray &>(column); + if (subcolumn_name == "size" + std::to_string(level)) + return column_array.getOffsetsPtr()->assumeMutable(); + + if (const auto * nested_array = typeid_cast<const DataTypeArray *>(nested.get())) + return nested_array->getSubcolumnImpl(subcolumn_name, column, level + 1); + + auto subcolumn = nested->getSubcolumn(subcolumn_name, column_array.getData()); + return ColumnArray::create(std::move(subcolumn), column_array.getOffsetsPtr()->assumeMutable()); } String DataTypeArray::getEscapedFileName(const NameAndTypePair & column) const diff --git a/src/DataTypes/DataTypeArray.h b/src/DataTypes/DataTypeArray.h index 2923b9ac676..b10c8b32b5e 100644 --- a/src/DataTypes/DataTypeArray.h +++ b/src/DataTypes/DataTypeArray.h @@ -112,13 +112,17 @@ public: } DataTypePtr getSubcolumnType(const String & subcolumn_name) const override; - std::vector<String> getSubcolumnNames() const override; + MutableColumnPtr getSubcolumn(const String & subcolumn_name, IColumn & column) const override; + String getEscapedFileName(const NameAndTypePair & column) const override; const DataTypePtr & getNestedType() const { return nested; } /// 1 for plain array, 2 for array of arrays and so on.
size_t getNumberOfDimensions() const; + +private: + MutableColumnPtr getSubcolumnImpl(const String & subcolumn_name, IColumn & column, size_t level) const; }; } diff --git a/src/DataTypes/DataTypeLowCardinality.h b/src/DataTypes/DataTypeLowCardinality.h index f8c314909b8..0c6c1c4b7cb 100644 --- a/src/DataTypes/DataTypeLowCardinality.h +++ b/src/DataTypes/DataTypeLowCardinality.h @@ -23,6 +23,7 @@ public: TypeIndex getTypeId() const override { return TypeIndex::LowCardinality; } void enumerateStreams(const StreamCallback & callback, SubstreamPath & path) const override; + DataTypePtr getSubcolumnType(const String & /* subcolumn_name */) const override { return shared_from_this(); } void serializeBinaryBulkStatePrefix( SerializeBinaryBulkSettings & settings, diff --git a/src/DataTypes/DataTypeNullable.cpp b/src/DataTypes/DataTypeNullable.cpp index 8120fea1106..ffbd918c538 100644 --- a/src/DataTypes/DataTypeNullable.cpp +++ b/src/DataTypes/DataTypeNullable.cpp @@ -529,15 +529,20 @@ bool DataTypeNullable::equals(const IDataType & rhs) const DataTypePtr DataTypeNullable::getSubcolumnType(const String & subcolumn_name) const { + std::cerr << "(DataTypeNullable::getSubcolumnType) subcolumn_name: " << subcolumn_name << "\n"; if (subcolumn_name == "null") return std::make_shared<DataTypeUInt8>(); - return nullptr; + return nested_data_type->getSubcolumnType(subcolumn_name); } -std::vector<String> DataTypeNullable::getSubcolumnNames() const +MutableColumnPtr DataTypeNullable::getSubcolumn(const String & subcolumn_name, IColumn & column) const { - return {"null"}; + auto & column_nullable = assert_cast<ColumnNullable &>(column); + if (subcolumn_name == "null") + return column_nullable.getNullMapColumnPtr()->assumeMutable(); + + return nested_data_type->getSubcolumn(subcolumn_name, column_nullable.getNestedColumn()); } String DataTypeNullable::getEscapedFileName(const NameAndTypePair & column) const diff --git a/src/DataTypes/DataTypeNullable.h b/src/DataTypes/DataTypeNullable.h index c62f9eb64c8..a2d5fcc53bf 100644 --- a/src/DataTypes/DataTypeNullable.h +++ b/src/DataTypes/DataTypeNullable.h @@ -98,7 +98,7 @@ public: bool onlyNull() const override; bool canBeInsideLowCardinality() const override { return nested_data_type->canBeInsideLowCardinality(); } DataTypePtr getSubcolumnType(const String & subcolumn_name) const override; - std::vector<String> getSubcolumnNames() const override; + MutableColumnPtr getSubcolumn(const String & subcolumn_name, IColumn & column) const override; String getEscapedFileName(const NameAndTypePair & column) const override; const DataTypePtr & getNestedType() const { return nested_data_type; } diff --git a/src/DataTypes/DataTypeTuple.cpp b/src/DataTypes/DataTypeTuple.cpp index 4564ea39b54..f7cfdb903ca 100644 --- a/src/DataTypes/DataTypeTuple.cpp +++ b/src/DataTypes/DataTypeTuple.cpp @@ -10,6 +10,7 @@ #include #include #include +#include #include #include #include @@ -31,6 +32,7 @@ namespace ErrorCodes { extern const int DUPLICATE_COLUMN; extern const int BAD_ARGUMENTS; extern const int NOT_FOUND_COLUMN_IN_BLOCK; + extern const int ILLEGAL_COLUMN; } @@ -419,6 +421,14 @@ void DataTypeTuple::deserializeBinaryBulkWithMultipleStreams( settings.path.pop_back(); } +String DataTypeTuple::getEscapedFileName(const NameAndTypePair & column) const +{ + if (column.isSubcolumn()) + return escapeForFileName(column.getStorageName()) + "%2E" + column.getSubcolumnName(); + + return escapeForFileName(column.name); +} + void DataTypeTuple::serializeProtobuf(const IColumn & column, size_t row_num, ProtobufWriter & protobuf, size_t &
value_index) const { for (; value_index < elems.size(); ++value_index) @@ -531,12 +541,40 @@ size_t DataTypeTuple::getSizeOfValueInMemory() const DataTypePtr DataTypeTuple::getSubcolumnType(const String & subcolumn_name) const { - return elems[getPositionByName(subcolumn_name)]; + for (size_t i = 0; i < names.size(); ++i) + { + if (startsWith(subcolumn_name, names[i])) + { + size_t name_length = names[i].size(); + if (subcolumn_name.size() == name_length) + return elems[i]; + + if (subcolumn_name[name_length] == '.') + return elems[i]->getSubcolumnType(subcolumn_name.substr(name_length + 1)); + } + } + + throw Exception(ErrorCodes::ILLEGAL_COLUMN, "There is no subcolumn {} in type {}", subcolumn_name, getName()); } -std::vector<String> DataTypeTuple::getSubcolumnNames() const +MutableColumnPtr DataTypeTuple::getSubcolumn(const String & subcolumn_name, IColumn & column) const { - return names; + for (size_t i = 0; i < names.size(); ++i) + { + if (startsWith(subcolumn_name, names[i])) + { + size_t name_length = names[i].size(); + auto & subcolumn = extractElementColumn(column, i); + + if (subcolumn_name.size() == name_length) + return subcolumn.assumeMutable(); + + if (subcolumn_name[name_length] == '.') + return elems[i]->getSubcolumn(subcolumn_name.substr(name_length + 1), subcolumn); + } + } + + throw Exception(ErrorCodes::ILLEGAL_COLUMN, "There is no subcolumn {} in type {}", subcolumn_name, getName()); } diff --git a/src/DataTypes/DataTypeTuple.h b/src/DataTypes/DataTypeTuple.h index 795eb6cf4a9..6ae3804f2e9 100644 --- a/src/DataTypes/DataTypeTuple.h +++ b/src/DataTypes/DataTypeTuple.h @@ -94,8 +94,10 @@ public: bool haveMaximumSizeOfValue() const override; size_t getMaximumSizeOfValueInMemory() const override; size_t getSizeOfValueInMemory() const override; + DataTypePtr getSubcolumnType(const String & subcolumn_name) const override; - std::vector<String> getSubcolumnNames() const override; + MutableColumnPtr getSubcolumn(const String & subcolumn_name, IColumn & column) const override; + String getEscapedFileName(const NameAndTypePair & column) const override; const DataTypes & getElements() const { return elems; } const Strings & getElementNames() const { return names; } diff --git a/src/DataTypes/IDataType.cpp b/src/DataTypes/IDataType.cpp index 7eb7f3a346e..05936b28f62 100644 --- a/src/DataTypes/IDataType.cpp +++ b/src/DataTypes/IDataType.cpp @@ -18,6 +18,7 @@ namespace ErrorCodes { extern const int MULTIPLE_STREAMS_REQUIRED; extern const int LOGICAL_ERROR; extern const int DATA_TYPE_CANNOT_BE_PROMOTED; + extern const int ILLEGAL_COLUMN; } IDataType::IDataType() : custom_name(nullptr), custom_text_serialization(nullptr) @@ -92,19 +93,83 @@ size_t IDataType::getSizeOfValueInMemory() const throw Exception("Value of type " + getName() + " in memory is not of fixed size.", ErrorCodes::LOGICAL_ERROR); } +DataTypePtr IDataType::getSubcolumnType(const String & subcolumn_name) const +{ + throw Exception(ErrorCodes::ILLEGAL_COLUMN, "There is no subcolumn {} in type {}", subcolumn_name, getName()); +} + +MutableColumnPtr IDataType::getSubcolumn(const String & subcolumn_name, IColumn &) const +{ + throw Exception(ErrorCodes::ILLEGAL_COLUMN, "There is no subcolumn {} in type {}", subcolumn_name, getName()); +} + +std::vector<String> IDataType::getSubcolumnNames() const +{ + std::vector<String> res; + enumerateStreams([&res](const IDataType::SubstreamPath & substream_path, const IDataType & /* substream_type */) + { + auto subcolumn_name = IDataType::getSubcolumnNameForStream("", substream_path); + if (!subcolumn_name.empty()) +
res.push_back(subcolumn_name.substr(1)); // It starts with a dot. + }); + + return res; +} + String IDataType::getEscapedFileName(const NameAndTypePair & column) const { return escapeForFileName(column.name); } +static String getNameForSubstreamPath( + String stream_name, + const IDataType::SubstreamPath & path, + const String & tuple_element_delimeter = ".") +{ + size_t array_level = 0; + for (const auto & elem : path) + { + if (elem.type == IDataType::Substream::NullMap) + stream_name += ".null"; + else if (elem.type == IDataType::Substream::ArraySizes) + stream_name += ".size" + toString(array_level); + else if (elem.type == IDataType::Substream::ArrayElements) + ++array_level; + else if (elem.type == IDataType::Substream::TupleElement) + stream_name += tuple_element_delimeter + escapeForFileName(elem.tuple_element_name); + else if (elem.type == IDataType::Substream::DictionaryKeys) + stream_name += ".dict"; + } + + return stream_name; +} + + +/// FIXME: rewrite it. String IDataType::getFileNameForStream(const NameAndTypePair & column, const IDataType::SubstreamPath & path) { - if (!column.isSubcolumn()) return getFileNameForStream(column.name, path); - auto stream_name = column.getStorageType()->getEscapedFileName(column); - return getFileNameForStreamImpl(std::move(stream_name), path); + String storage_name = column.getStorageName(); + String nested_table_name = Nested::extractTableName(storage_name); + + bool is_sizes_of_nested_type = + (path.size() == 1 && path[0].type == IDataType::Substream::ArraySizes + && nested_table_name != storage_name) || column.getSubcolumnName() == "size0"; + + String stream_name; + if (is_sizes_of_nested_type) + { + if (column.getSubcolumnName() == "size0") + return escapeForFileName(nested_table_name) + ".size0"; + + stream_name = escapeForFileName(Nested::extractTableName(storage_name)); + } + else + stream_name = column.getStorageType()->getEscapedFileName(column); + + return getNameForSubstreamPath(std::move(stream_name), path, "%2E"); } String IDataType::getFileNameForStream(const String & column_name, const IDataType::SubstreamPath & path) @@ -119,36 +184,19 @@ String IDataType::getFileNameForStream(const String & column_name, const IDataTy && nested_table_name != column_name; auto stream_name = escapeForFileName(is_sizes_of_nested_type ? nested_table_name : column_name); - return getFileNameForStreamImpl(std::move(stream_name), path); + + /// For compatibility reasons, we use %2E instead of dot. + /// Because nested data may be represented not by Array of Tuple, + /// but by separate Array columns with names in a form of a.b, + /// and name is encoded as a whole. + return getNameForSubstreamPath(std::move(stream_name), path, "%2E"); } -String IDataType::getFileNameForStreamImpl(String stream_name, const IDataType::SubstreamPath & path) +String IDataType::getSubcolumnNameForStream(String stream_name, const SubstreamPath & path) { - size_t array_level = 0; - for (const Substream & elem : path) - { - if (elem.type == Substream::NullMap) - stream_name += ".null"; - else if (elem.type == Substream::ArraySizes) - stream_name += ".size" + toString(array_level); - else if (elem.type == Substream::ArrayElements) - ++array_level; - else if (elem.type == Substream::TupleElement) - { - /// For compatibility reasons, we use %2E instead of dot. - /// Because nested data may be represented not by Array of Tuple, - /// but by separate Array columns with names in a form of a.b, - /// and name is encoded as a whole. 
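The pieces above establish a single naming scheme for subcolumns: an Array exposes its offsets as "sizeN" (N being the array nesting level), a Nullable exposes its null map as "null", a Tuple prefixes the names of its elements, and getNameForSubstreamPath derives the same names from substream paths (with "%2E" instead of a dot when building file names). A self-contained sketch of that naming scheme, using made-up stand-in types rather than the ClickHouse classes, might look like this; when such names are registered for a storage column t, they are additionally prefixed with "t.":

#include <iostream>
#include <memory>
#include <string>
#include <utility>
#include <vector>

/// Simplified stand-in for the type hierarchy: just enough structure to enumerate
/// subcolumn names the way the getSubcolumnNames()/getSubcolumnType() pair above does.
struct TypeNode
{
    enum Kind { Scalar, Array, Nullable, Tuple } kind = Scalar;
    std::shared_ptr<TypeNode> nested;                                         /// for Array, Nullable
    std::vector<std::pair<std::string, std::shared_ptr<TypeNode>>> elements;  /// for Tuple

    void collect(const std::string & prefix, size_t array_level, std::vector<std::string> & out) const
    {
        auto with_prefix = [&](const std::string & name) { return prefix.empty() ? name : prefix + "." + name; };
        if (kind == Array)
        {
            out.push_back(with_prefix("size" + std::to_string(array_level)));  /// offsets of this level
            nested->collect(prefix, array_level + 1, out);
        }
        else if (kind == Nullable)
        {
            out.push_back(with_prefix("null"));  /// the null map
            nested->collect(prefix, array_level, out);
        }
        else if (kind == Tuple)
        {
            for (const auto & [name, element] : elements)
            {
                out.push_back(with_prefix(name));  /// the element itself is addressable
                element->collect(with_prefix(name), array_level, out);
            }
        }
    }
};

static std::shared_ptr<TypeNode> scalar() { return std::make_shared<TypeNode>(); }
static std::shared_ptr<TypeNode> array(std::shared_ptr<TypeNode> n) { return std::make_shared<TypeNode>(TypeNode{TypeNode::Array, std::move(n), {}}); }
static std::shared_ptr<TypeNode> nullable(std::shared_ptr<TypeNode> n) { return std::make_shared<TypeNode>(TypeNode{TypeNode::Nullable, std::move(n), {}}); }

int main()
{
    /// Shape of t Tuple(a Array(Nullable(UInt32)), s Nullable(String)) from the tests below.
    TypeNode t{TypeNode::Tuple, nullptr, {{"a", array(nullable(scalar()))}, {"s", nullable(scalar())}}};

    std::vector<std::string> names;
    t.collect("", 0, names);
    for (const auto & name : names)
        std::cout << name << '\n';   /// a, a.size0, a.null, s, s.null
}

The order of names here is illustrative only; the real enumeration follows the order of substreams, but the produced set of subcolumn names is the same idea.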
- stream_name += "%2E" + escapeForFileName(elem.tuple_element_name); - } - else if (elem.type == Substream::DictionaryKeys) - stream_name += ".dict"; - } - - return stream_name; + return getNameForSubstreamPath(std::move(stream_name), path); } - bool IDataType::isSpecialCompressionAllowed(const SubstreamPath & path) { for (const Substream & elem : path) diff --git a/src/DataTypes/IDataType.h b/src/DataTypes/IDataType.h index 8c81bf86261..0a88886e348 100644 --- a/src/DataTypes/IDataType.h +++ b/src/DataTypes/IDataType.h @@ -37,7 +37,7 @@ struct NameAndTypePair; * * DataType is totally immutable object. You can always share them. */ -class IDataType : private boost::noncopyable +class IDataType : private boost::noncopyable, public std::enable_shared_from_this<IDataType> { public: IDataType(); @@ -115,6 +115,10 @@ public: void enumerateStreams(const StreamCallback & callback, SubstreamPath && path) const { enumerateStreams(callback, path); } void enumerateStreams(const StreamCallback & callback) const { enumerateStreams(callback, {}); } + virtual DataTypePtr getSubcolumnType(const String & subcolumn_name) const; + virtual MutableColumnPtr getSubcolumn(const String & subcolumn_name, IColumn & column) const; + std::vector<String> getSubcolumnNames() const; + using OutputStreamGetter = std::function<WriteBuffer*(const SubstreamPath &)>; using InputStreamGetter = std::function<ReadBuffer*(const SubstreamPath &)>; @@ -152,6 +156,8 @@ public: bool position_independent_encoding = true; /// If not zero, may be used to avoid reallocations while reading column of String type. double avg_value_size_hint = 0; + + std::vector temporary_column_holders; }; /// Call before serializeBinaryBulkWithMultipleStreams chain to write something before first mark. @@ -230,9 +236,6 @@ public: virtual void serializeProtobuf(const IColumn & column, size_t row_num, ProtobufWriter & protobuf, size_t & value_index) const = 0; virtual void deserializeProtobuf(IColumn & column, ProtobufReader & protobuf, bool allow_add_row, bool & row_added) const = 0; - virtual DataTypePtr getSubcolumnType(const String & /* subcolumn_path */) const { return nullptr; } - virtual std::vector<String> getSubcolumnNames() const { return {}; } - /** Text serialization with escaping but without quoting. */ void serializeAsTextEscaped(const IColumn & column, size_t row_num, WriteBuffer & ostr, const FormatSettings &) const; @@ -450,6 +453,7 @@ public: static String getFileNameForStream(const NameAndTypePair & column, const SubstreamPath & path); static String getFileNameForStream(const String & column_name, const SubstreamPath & path); + static String getSubcolumnNameForStream(String stream_name, const SubstreamPath & path); /// Substream path supports special compression methods like codec Delta. /// For all other substreams (like ArraySizes, NullMasks, etc.)
we use only @@ -464,8 +468,6 @@ private: mutable DataTypeCustomNamePtr custom_name; mutable DataTypeCustomTextSerializationPtr custom_text_serialization; - static String getFileNameForStreamImpl(String stream_name, const SubstreamPath & path); - public: const IDataTypeCustomName * getCustomName() const { return custom_name.get(); } }; diff --git a/src/Storages/ColumnsDescription.cpp b/src/Storages/ColumnsDescription.cpp index a638f15f1ba..bf10dc654dc 100644 --- a/src/Storages/ColumnsDescription.cpp +++ b/src/Storages/ColumnsDescription.cpp @@ -185,7 +185,7 @@ void ColumnsDescription::add(ColumnDescription column, const String & after_colu insert_it = range.second; } - addSubcolumns(NameAndTypePair(column.name, column.type)); + addSubcolumns(column.name, column.type); columns.get<0>().insert(insert_it, std::move(column)); } @@ -519,12 +519,13 @@ ColumnsDescription ColumnsDescription::parse(const String & str) return result; } -void ColumnsDescription::addSubcolumns(NameAndTypePair storage_column) +void ColumnsDescription::addSubcolumns(const String & storage_name, const DataTypePtr & storage_type) { - for (const auto & subcolumn_name : storage_column.type->getSubcolumnNames()) + for (const auto & subcolumn_name : storage_type->getSubcolumnNames()) { - auto subcolumn = NameAndTypePair(storage_column.name, subcolumn_name, - storage_column.type, storage_column.type->getSubcolumnType(subcolumn_name)); + std::cerr << "storage_name: " << storage_name << ", subcolumn_name: " << subcolumn_name << "\n"; + auto subcolumn = NameAndTypePair(storage_name, subcolumn_name, + storage_type, storage_type->getSubcolumnType(subcolumn_name)); if (has(subcolumn.name)) throw Exception(ErrorCodes::ILLEGAL_COLUMN, diff --git a/src/Storages/ColumnsDescription.h b/src/Storages/ColumnsDescription.h index c3988da5ff8..507d234f6bc 100644 --- a/src/Storages/ColumnsDescription.h +++ b/src/Storages/ColumnsDescription.h @@ -147,7 +147,7 @@ private: SubcolumnsContainer subcolumns; void modifyColumnOrder(const String & column_name, const String & after_column, bool first); - void addSubcolumns(NameAndTypePair storage_column); + void addSubcolumns(const String & storage_name, const DataTypePtr & storage_type); }; /// Validate default expressions and corresponding types compatibility, i.e. diff --git a/src/Storages/MergeTree/IMergeTreeDataPart.cpp b/src/Storages/MergeTree/IMergeTreeDataPart.cpp index a60194bb05b..f6db3ac70d3 100644 --- a/src/Storages/MergeTree/IMergeTreeDataPart.cpp +++ b/src/Storages/MergeTree/IMergeTreeDataPart.cpp @@ -191,6 +191,11 @@ std::optional<size_t> IMergeTreeDataPart::getColumnPosition(const String & colum return it->second; } +std::optional<size_t> IMergeTreeDataPart::getColumnPosition(const NameAndTypePair & column) const +{ + return getColumnPosition(column.getStorageName()); +} + DayNum IMergeTreeDataPart::getMinDate() const { if (storage.minmax_idx_date_column_pos != -1 && minmax_idx.initialized) diff --git a/src/Storages/MergeTree/IMergeTreeDataPart.h b/src/Storages/MergeTree/IMergeTreeDataPart.h index c4936bc263a..51c5999f7c4 100644 --- a/src/Storages/MergeTree/IMergeTreeDataPart.h +++ b/src/Storages/MergeTree/IMergeTreeDataPart.h @@ -142,6 +142,7 @@ public: /// take place, you must take original name of column for this part from /// storage and pass it to this method. std::optional<size_t> getColumnPosition(const String & column_name) const; + std::optional<size_t> getColumnPosition(const NameAndTypePair & column) const; /// Returns the name of a column with minimum compressed size (as returned by getColumnSize()).
/// If no checksums are present returns the name of the first physically existing column. diff --git a/src/Storages/MergeTree/IMergeTreeReader.cpp b/src/Storages/MergeTree/IMergeTreeReader.cpp index 5740eef6810..75b216c989f 100644 --- a/src/Storages/MergeTree/IMergeTreeReader.cpp +++ b/src/Storages/MergeTree/IMergeTreeReader.cpp @@ -265,7 +265,7 @@ IMergeTreeReader::ColumnPosition IMergeTreeReader::findColumnForOffsets(const St { if (typeid_cast<const DataTypeArray *>(part_column.type.get())) { - auto position = data_part->getColumnPosition(part_column.name); + auto position = data_part->getColumnPosition(part_column); if (position && Nested::extractTableName(part_column.name) == table_name) return position; } diff --git a/src/Storages/MergeTree/MergeTreeReaderCompact.cpp b/src/Storages/MergeTree/MergeTreeReaderCompact.cpp index 87b3f0a4329..41af0040fc5 100644 --- a/src/Storages/MergeTree/MergeTreeReaderCompact.cpp +++ b/src/Storages/MergeTree/MergeTreeReaderCompact.cpp @@ -53,14 +53,14 @@ MergeTreeReaderCompact::MergeTreeReaderCompact( auto name_and_type = columns.begin(); for (size_t i = 0; i < columns_num; ++i, ++name_and_type) { - const auto & [name, type] = getColumnFromPart(*name_and_type); - auto position = data_part->getColumnPosition(name); + auto column_from_part = getColumnFromPart(*name_and_type); + auto position = data_part->getColumnPosition(column_from_part); - if (!position && typeid_cast<const DataTypeArray *>(type.get())) + if (!position && typeid_cast<const DataTypeArray *>(column_from_part.type.get())) { /// If array of Nested column is missing in part, /// we have to read its offsets if they exist. - position = findColumnForOffsets(name); + position = findColumnForOffsets(column_from_part.name); read_only_offsets[i] = (position != std::nullopt); } @@ -149,14 +149,14 @@ size_t MergeTreeReaderCompact::readRows(size_t from_mark, bool continue_reading, if (!res_columns[pos]) continue; - auto [name, type] = getColumnFromPart(*name_and_type); + auto column_from_part = getColumnFromPart(*name_and_type); auto & column = mutable_columns[pos]; try { size_t column_size_before_reading = column->size(); - readData(name, *column, *type, from_mark, *column_positions[pos], rows_to_read, read_only_offsets[pos]); + readData(column_from_part, *column, from_mark, *column_positions[pos], rows_to_read, read_only_offsets[pos]); size_t read_rows_in_column = column->size() - column_size_before_reading; @@ -170,7 +170,7 @@ size_t MergeTreeReaderCompact::readRows(size_t from_mark, bool continue_reading, storage.reportBrokenPart(data_part->name); /// Better diagnostics. - e.addMessage("(while reading column " + name + ")"); + e.addMessage("(while reading column " + column_from_part.name + ")"); throw; } catch (...)
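Subcolumns are registered in ColumnsDescription under their full dotted name, but physical lookups in a data part still go through the storage column that owns them, which is why the new getColumnPosition overload above resolves through getStorageName(). A minimal model of that resolution step, with hypothetical names rather than the real API, could be:

#include <cstddef>
#include <iostream>
#include <map>
#include <optional>
#include <string>

struct SubcolumnEntry
{
    std::string storage_name;   /// e.g. "n"
    std::string subcolumn_name; /// e.g. "null"
};

int main()
{
    /// Registered as in ColumnsDescription::addSubcolumns: full name -> (storage column, subcolumn).
    std::map<std::string, SubcolumnEntry> subcolumns{
        {"n.null", {"n", "null"}},
        {"a1.size0", {"a1", "size0"}},
    };

    /// Physical positions exist only for storage columns, as in IMergeTreeDataPart.
    std::map<std::string, size_t> column_positions{{"n", 0}, {"a1", 1}};

    auto resolve = [&](const std::string & requested) -> std::optional<size_t>
    {
        auto it = subcolumns.find(requested);
        const std::string & storage = (it == subcolumns.end()) ? requested : it->second.storage_name;
        auto pos = column_positions.find(storage);
        if (pos == column_positions.end())
            return std::nullopt;
        return pos->second;
    };

    std::cout << *resolve("n.null") << ' ' << *resolve("a1") << '\n';   /// 0 1
}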
@@ -199,9 +199,11 @@ size_t MergeTreeReaderCompact::readRows(size_t from_mark, bool continue_reading, } void MergeTreeReaderCompact::readData( - const String & name, IColumn & column, const IDataType & type, + const NameAndTypePair & name_and_type, IColumn & column, size_t from_mark, size_t column_position, size_t rows_to_read, bool only_offsets) { + const auto & [name, type] = name_and_type; + if (!isContinuousReading(from_mark, column_position)) seekToMark(from_mark, column_position); @@ -213,14 +215,29 @@ void MergeTreeReaderCompact::readData( return data_buffer; }; + IDataType::DeserializeBinaryBulkStatePtr state; IDataType::DeserializeBinaryBulkSettings deserialize_settings; deserialize_settings.getter = buffer_getter; deserialize_settings.avg_value_size_hint = avg_value_size_hints[name]; deserialize_settings.position_independent_encoding = true; - IDataType::DeserializeBinaryBulkStatePtr state; - type.deserializeBinaryBulkStatePrefix(deserialize_settings, state); - type.deserializeBinaryBulkWithMultipleStreams(column, rows_to_read, deserialize_settings, state); + if (name_and_type.isSubcolumn()) + { + const auto & storage_type = name_and_type.getStorageType(); + auto temp_column = storage_type->createColumn(); + + storage_type->deserializeBinaryBulkStatePrefix(deserialize_settings, state); + storage_type->deserializeBinaryBulkWithMultipleStreams(*temp_column, rows_to_read, deserialize_settings, state); + + auto subcolumn = storage_type->getSubcolumn(name_and_type.getSubcolumnName(), *temp_column); + column.insertRangeFrom(*subcolumn, 0, subcolumn->size()); + } + else + { + deserialize_settings.position_independent_encoding = true; + type->deserializeBinaryBulkStatePrefix(deserialize_settings, state); + type->deserializeBinaryBulkWithMultipleStreams(column, rows_to_read, deserialize_settings, state); + } /// The buffer is left in inconsistent state after reading single offsets if (only_offsets) diff --git a/src/Storages/MergeTree/MergeTreeReaderCompact.h b/src/Storages/MergeTree/MergeTreeReaderCompact.h index 9ef88716579..27cabaa2a2f 100644 --- a/src/Storages/MergeTree/MergeTreeReaderCompact.h +++ b/src/Storages/MergeTree/MergeTreeReaderCompact.h @@ -56,7 +56,7 @@ private: void seekToMark(size_t row_index, size_t column_index); - void readData(const String & name, IColumn & column, const IDataType & type, + void readData(const NameAndTypePair & name_and_type, IColumn & column, size_t from_mark, size_t column_position, size_t rows_to_read, bool only_offsets = false); /// Returns maximal value of granule size in compressed file from @mark_ranges. 
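In the compact reader above, a requested subcolumn is served by deserializing the whole storage column into a temporary column and then copying the result of getSubcolumn into the output with insertRangeFrom, since a compact part keeps all columns in one data file. A simplified, self-contained illustration of that read-then-project pattern (stand-in types, not the real reader classes):

#include <cstdint>
#include <iostream>
#include <vector>

struct NullableUInt32Column          /// stand-in for a deserialized storage column
{
    std::vector<uint32_t> values;
    std::vector<uint8_t> null_map;   /// the ".null" subcolumn
};

/// Pretend "deserialization" of rows_to_read rows of the whole storage column.
NullableUInt32Column deserializeStorageColumn(size_t rows_to_read)
{
    NullableUInt32Column col;
    for (size_t i = 0; i < rows_to_read; ++i)
    {
        col.values.push_back(static_cast<uint32_t>(i));
        col.null_map.push_back(i % 2);   /// every second row is NULL
    }
    return col;
}

int main()
{
    /// Read the full column once, then hand out only the requested subcolumn,
    /// as readData does when name_and_type.isSubcolumn() is true.
    auto temp_column = deserializeStorageColumn(4);
    const std::vector<uint8_t> & null_subcolumn = temp_column.null_map;

    for (auto v : null_subcolumn)
        std::cout << int(v) << ' ';
    std::cout << '\n';   /// 0 1 0 1
}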
diff --git a/tests/queries/0_stateless/001475_read_subcolumns_2.sql b/tests/queries/0_stateless/001475_read_subcolumns_2.sql new file mode 100644 index 00000000000..18868a3863a --- /dev/null +++ b/tests/queries/0_stateless/001475_read_subcolumns_2.sql @@ -0,0 +1,48 @@ +DROP TABLE IF EXISTS subcolumns; + +CREATE TABLE subcolumns +( + t Tuple + ( + a Array(Nullable(UInt32)), + u UInt32, + s Nullable(String) + ), + arr Array(Nullable(String)), + arr2 Array(Array(LowCardinality(Nullable(String)))), + lc LowCardinality(String), + nested Nested(col1 String, col2 Nullable(UInt32)) +) +ENGINE = MergeTree order by tuple() SETTINGS min_bytes_for_wide_part = '10M'; + +INSERT INTO subcolumns VALUES (([1, NULL], 2, 'a'), ['foo', NULL, 'bar'], [['123'], ['456', '789']], 'qqqq', ['zzz', 'xxx'], [42, 43]); +SELECT * FROM subcolumns; +SELECT t.a, t.u, t.s, nested.col1, nested.col2, lc FROM subcolumns; +SELECT t.a.size0, t.a.null, t.u, t.s, t.s.null FROM subcolumns; +-- SELECT arr2, arr2.size0, arr2.size1, arr2.null FROM subcolumns; +-- SELECT nested.col1, nested.col2, nested.col1.size0, nested.col2.size0, nested.col2.null FROM subcolumns; +SELECT sumArray(arr.null), sum(arr.size0) FROM subcolumns; +DROP TABLE IF EXISTS subcolumns; + +CREATE TABLE subcolumns +( + t Tuple + ( + a Array(Nullable(UInt32)), + u UInt32, + s Nullable(String) + ), + arr Array(Nullable(String)), + arr2 Array(Array(LowCardinality(Nullable(String)))), + lc LowCardinality(String), + nested Nested(col1 String, col2 Nullable(UInt32)) +) +ENGINE = MergeTree order by tuple() SETTINGS min_bytes_for_wide_part = 0; + +INSERT INTO subcolumns VALUES (([1, NULL], 2, 'a'), ['foo', NULL, 'bar'], [['123'], ['456', '789']], 'qqqq', ['zzz', 'xxx'], [42, 43]); +SELECT * FROM subcolumns; +SELECT t.a, t.u, t.s, nested.col1, nested.col2, lc FROM subcolumns; +SELECT t.a.size0, t.a.null, t.u, t.s, t.s.null FROM subcolumns; +-- SELECT arr2, arr2.size0, arr2.size1, arr2.null FROM subcolumns; +-- SELECT nested.col1, nested.col2, nested.col1.size0, nested.col2.size0, nested.col2.null FROM subcolumns; +SELECT sumArray(arr.null), sum(arr.size0) FROM subcolumns; diff --git a/tests/queries/0_stateless/001475_read_subcolumns_storages.sh b/tests/queries/0_stateless/001475_read_subcolumns_storages.sh new file mode 100755 index 00000000000..2304726bf6f --- /dev/null +++ b/tests/queries/0_stateless/001475_read_subcolumns_storages.sh @@ -0,0 +1,23 @@ +#!/usr/bin/env bash + +CURDIR=$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd) +. "$CURDIR"/../shell_config.sh + +set -e + +create_query="CREATE TABLE subcolumns(n Nullable(UInt32), a1 Array(UInt32),\ + a2 Array(Array(Array(UInt32))), a3 Array(Nullable(UInt32)), t Tuple(s String, v UInt32))" + +declare -a ENGINES=("Log" "StripeLog" "TinyLog" "Memory" \ + "MergeTree ORDER BY tuple() SETTINGS min_bytes_for_compact_part='10M'" + "MergeTree ORDER BY tuple() SETTINGS min_bytes_for_wide_part='10M'" + "MergeTree ORDER BY tuple() SETTINGS min_bytes_for_wide_part=0") + +for engine in "${ENGINES[@]}"; do + echo $engine + $CLICKHOUSE_CLIENT --query "DROP TABLE IF EXISTS subcolumns" + $CLICKHOUSE_CLIENT --query "$create_query ENGINE = $engine" + $CLICKHOUSE_CLIENT --query "INSERT INTO subcolumns VALUES (100, [1, 2, 3], [[[1, 2], [], [4]], [[5, 6], [7, 8]], [[]]], [1, NULL, 2], ('foo', 200))" + $CLICKHOUSE_CLIENT --query "SELECT * FROM subcolumns" + $CLICKHOUSE_CLIENT --query "SELECT n, n.null, a1, a1.size0, a2, a2.size0, a2.size1, a2.size2, a3, a3.size0, a3.null, t, t.s, t.v FROM subcolumns" +done
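The queries in these tests depend on what the generated size subcolumns contain: for a2 Array(Array(Array(UInt32))), size0 is the top-level length per row, size1 holds the lengths of the second level and size2 the innermost lengths, matching the "sizeN" mapping in DataTypeArray. A small standalone C++ check of those numbers for the value inserted by the shell test:

#include <cstdint>
#include <iostream>
#include <vector>

using A1 = std::vector<uint32_t>;
using A2 = std::vector<A1>;
using A3 = std::vector<A2>;

int main()
{
    /// Same value as inserted above: [[[1, 2], [], [4]], [[5, 6], [7, 8]], [[]]]
    A3 a2 = {{{1, 2}, {}, {4}}, {{5, 6}, {7, 8}}, {{}}};

    std::cout << "size0: " << a2.size() << '\n';   /// 3

    std::cout << "size1:";
    for (const auto & lvl1 : a2)
        std::cout << ' ' << lvl1.size();           /// 3 2 1
    std::cout << '\n';

    std::cout << "size2:";
    for (const auto & lvl1 : a2)
        for (const auto & lvl2 : lvl1)
            std::cout << ' ' << lvl2.size();       /// 2 0 1 2 2 0
    std::cout << '\n';
}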