From eb7aad00160b1418ed96ecc83770b62ce3bfaaf0 Mon Sep 17 00:00:00 2001 From: kssenii Date: Tue, 14 Nov 2023 11:35:54 +0100 Subject: [PATCH 001/245] Do not consider parts broken if only projections are broken --- src/Interpreters/MutationsInterpreter.cpp | 14 + src/Interpreters/MutationsInterpreter.h | 1 + src/Storages/MergeTree/IMergeTreeDataPart.cpp | 61 +++- src/Storages/MergeTree/IMergeTreeDataPart.h | 19 +- src/Storages/MergeTree/MergeTreeData.cpp | 26 +- src/Storages/MergeTree/MergeTreeData.h | 11 +- .../MergeTree/MergeTreeDataPartChecksum.h | 2 + src/Storages/MergeTree/MutateTask.cpp | 8 +- .../ReplicatedMergeTreePartCheckThread.cpp | 31 +- .../ReplicatedMergeTreePartCheckThread.h | 4 +- src/Storages/MergeTree/checkDataPart.cpp | 80 ++++- src/Storages/MergeTree/checkDataPart.h | 4 +- src/Storages/StorageMergeTree.cpp | 5 +- src/Storages/StorageReplicatedMergeTree.cpp | 3 +- src/Storages/System/StorageSystemDisks.cpp | 2 +- .../System/StorageSystemPartsBase.cpp | 8 +- src/Storages/System/StorageSystemPartsBase.h | 2 +- .../System/StorageSystemProjectionParts.cpp | 48 ++- .../StorageSystemProjectionPartsColumns.cpp | 21 +- .../02916_broken_projection.reference | 224 ++++++++++++++ .../0_stateless/02916_broken_projection.sh | 283 ++++++++++++++++++ 21 files changed, 795 insertions(+), 62 deletions(-) create mode 100644 tests/queries/0_stateless/02916_broken_projection.reference create mode 100755 tests/queries/0_stateless/02916_broken_projection.sh diff --git a/src/Interpreters/MutationsInterpreter.cpp b/src/Interpreters/MutationsInterpreter.cpp index 0ace0a8b79c..a9a5d4f33d0 100644 --- a/src/Interpreters/MutationsInterpreter.cpp +++ b/src/Interpreters/MutationsInterpreter.cpp @@ -305,6 +305,11 @@ bool MutationsInterpreter::Source::hasProjection(const String & name) const return part && part->hasProjection(name); } +bool MutationsInterpreter::Source::hasBrokenProjection(const String & name) const +{ + return part && part->hasBrokenProjection(name); +} + bool MutationsInterpreter::Source::isCompactPart() const { return part && part->getType() == MergeTreeDataPartType::Compact; @@ -922,6 +927,15 @@ void MutationsInterpreter::prepare(bool dry_run) materialized_indices.insert(index.name); } + /// Always rebuild broken projections. 
+    for (const auto & projection : metadata_snapshot->getProjections())
+    {
+        if (!source.hasBrokenProjection(projection.name))
+            continue;
+
+        materialized_projections.insert(projection.name);
+    }
+
     for (const auto & projection : metadata_snapshot->getProjections())
     {
         if (!source.hasProjection(projection.name))
diff --git a/src/Interpreters/MutationsInterpreter.h b/src/Interpreters/MutationsInterpreter.h
index c53b86ddb5e..33b8021a653 100644
--- a/src/Interpreters/MutationsInterpreter.h
+++ b/src/Interpreters/MutationsInterpreter.h
@@ -122,6 +122,7 @@ public:
         bool materializeTTLRecalculateOnly() const;
         bool hasSecondaryIndex(const String & name) const;
         bool hasProjection(const String & name) const;
+        bool hasBrokenProjection(const String & name) const;
         bool isCompactPart() const;
 
         void read(
diff --git a/src/Storages/MergeTree/IMergeTreeDataPart.cpp b/src/Storages/MergeTree/IMergeTreeDataPart.cpp
index 9bc72577b25..bc81758675e 100644
--- a/src/Storages/MergeTree/IMergeTreeDataPart.cpp
+++ b/src/Storages/MergeTree/IMergeTreeDataPart.cpp
@@ -732,7 +732,23 @@ void IMergeTreeDataPart::loadProjections(bool require_columns_checksums, bool ch
         else
         {
             auto part = getProjectionPartBuilder(projection.name).withPartFormatFromDisk().build();
-            part->loadColumnsChecksumsIndexes(require_columns_checksums, check_consistency);
+
+            try
+            {
+                part->loadColumnsChecksumsIndexes(require_columns_checksums, check_consistency);
+            }
+            catch (...)
+            {
+                if (isRetryableException(std::current_exception()))
+                    throw;
+
+                LOG_ERROR(&Poco::Logger::get("IMergeTreeDataPart"),
+                    "Cannot load projection {}, will consider it broken", projection.name);
+
+                addBrokenProjectionPart(projection.name, std::move(part), getCurrentExceptionMessage(false), getCurrentExceptionCode());
+                continue;
+            }
+
             addProjectionPart(projection.name, std::move(part));
         }
     }
@@ -1129,7 +1145,8 @@ void IMergeTreeDataPart::loadChecksums(bool require)
             /// Check the data while we are at it.
             LOG_WARNING(storage.log, "Checksums for part {} not found. Will calculate them from data on disk.", name);
 
-            checksums = checkDataPart(shared_from_this(), false);
+            bool noop;
+            checksums = checkDataPart(shared_from_this(), false, noop, /* is_cancelled */{}, /* throw_on_broken_projection */false);
             writeChecksums(checksums, {});
 
             bytes_on_disk = checksums.getTotalSizeOnDisk();
@@ -2130,6 +2147,46 @@ std::optional<String> IMergeTreeDataPart::getStreamNameForColumn(
     return getStreamNameOrHash(stream_name, extension, storage_);
 }
 
+void IMergeTreeDataPart::addBrokenProjectionPart(
+    const String & projection_name,
+    std::shared_ptr<IMergeTreeDataPart> projection_part,
+    const String & message,
+    int code)
+{
+    projection_part->setBrokenReason(message, code);
+    bool inserted = broken_projection_parts.emplace(projection_name, projection_part).second;
+    if (!inserted)
+        throw Exception(ErrorCodes::LOGICAL_ERROR, "Projection part {} in part {} is already added to a broken projection parts list", projection_name, name);
+}
+
+void IMergeTreeDataPart::markProjectionPartAsBroken(const String & projection_name, const String & message, int code) const
+{
+    std::lock_guard lock(broken_projections_mutex);
+
+    auto it = projection_parts.find(projection_name);
+    if (it == projection_parts.end())
+        throw Exception(ErrorCodes::LOGICAL_ERROR, "There is no projection part '{}'", projection_name);
+
+    it->second->setBrokenReason(message, code);
+
+    broken_projection_parts.emplace(projection_name, it->second);
+    projection_parts.erase(it);
+}
+
+void IMergeTreeDataPart::setBrokenReason(const String & message, int code)
+{
+    std::lock_guard lock(broken_projections_mutex);
+    is_broken = true;
+    exception = message;
+    exception_code = code;
+}
+
+bool IMergeTreeDataPart::hasBrokenProjection(const String & projection_name) const
+{
+    std::lock_guard lock(broken_projections_mutex);
+    return broken_projection_parts.contains(projection_name);
+}
+
 bool isCompactPart(const MergeTreeDataPartPtr & data_part)
 {
     return (data_part && data_part->getType() == MergeTreeDataPartType::Compact);
diff --git a/src/Storages/MergeTree/IMergeTreeDataPart.h b/src/Storages/MergeTree/IMergeTreeDataPart.h
index a9659d2f5f4..52a1541e15f 100644
--- a/src/Storages/MergeTree/IMergeTreeDataPart.h
+++ b/src/Storages/MergeTree/IMergeTreeDataPart.h
@@ -255,6 +255,12 @@ public:
     /// Frozen by ALTER TABLE ... FREEZE ... It is used for information purposes in system.parts table.
     mutable std::atomic<bool> is_frozen {false};
 
+    /// If it is a projection part, it can be broken sometimes.
+    mutable std::atomic<bool> is_broken {false};
+    mutable std::string exception;
+    mutable int exception_code = 0;
+    mutable std::mutex broken_projections_mutex;
+
     /// Indicates that the part was marked Outdated by PartCheckThread because the part was not committed to ZooKeeper
     mutable bool is_unexpected_local_part = false;
 
@@ -405,12 +411,20 @@ public:
 
     const std::map<String, std::shared_ptr<IMergeTreeDataPart>> & getProjectionParts() const { return projection_parts; }
 
+    const std::map<String, std::shared_ptr<IMergeTreeDataPart>> & getBrokenProjectionParts() const { return broken_projection_parts; }
+
     MergeTreeDataPartBuilder getProjectionPartBuilder(const String & projection_name, bool is_temp_projection = false);
 
     void addProjectionPart(const String & projection_name, std::shared_ptr<IMergeTreeDataPart> && projection_part);
 
+    void addBrokenProjectionPart(const String & projection_name, std::shared_ptr<IMergeTreeDataPart> projection_part, const String & message, int code);
+
+    void markProjectionPartAsBroken(const String & projection_name, const String & message, int code) const;
+
     bool hasProjection(const String & projection_name) const { return projection_parts.contains(projection_name); }
 
+    bool hasBrokenProjection(const String & projection_name) const;
+
     void loadProjections(bool require_columns_checksums, bool check_consistency, bool if_not_loaded = false);
 
     /// Return set of metadata file names without checksums. For example,
@@ -564,7 +578,8 @@ protected:
     const IMergeTreeDataPart * parent_part;
     String parent_part_name;
 
-    std::map<String, std::shared_ptr<IMergeTreeDataPart>> projection_parts;
+    mutable std::map<String, std::shared_ptr<IMergeTreeDataPart>> projection_parts;
+    mutable std::map<String, std::shared_ptr<IMergeTreeDataPart>> broken_projection_parts;
 
     mutable PartMetadataManagerPtr metadata_manager;
 
@@ -678,6 +693,8 @@ private:
    void incrementStateMetric(MergeTreeDataPartState state) const;
    void decrementStateMetric(MergeTreeDataPartState state) const;
 
+    void setBrokenReason(const String & message, int code);
+
     /// This ugly flag is needed for debug assertions only
     mutable bool part_is_probably_removed_from_disk = false;
 };
diff --git a/src/Storages/MergeTree/MergeTreeData.cpp b/src/Storages/MergeTree/MergeTreeData.cpp
index 1c0f9208fef..152c386e188 100644
--- a/src/Storages/MergeTree/MergeTreeData.cpp
+++ b/src/Storages/MergeTree/MergeTreeData.cpp
@@ -5737,7 +5737,7 @@ MergeTreeData::getDataPartsVectorForInternalUsage(const DataPartStates & afforda
 }
 
 MergeTreeData::ProjectionPartsVector
-MergeTreeData::getProjectionPartsVectorForInternalUsage(const DataPartStates & affordable_states, DataPartStateVector * out_states) const
+MergeTreeData::getProjectionPartsVectorForInternalUsage(const DataPartStates & affordable_states, bool fill_states) const
 {
     auto lock = lockParts();
     ProjectionPartsVector res;
@@ -5749,14 +5749,20 @@ MergeTreeData::getProjectionPartsVectorForInternalUsage(const DataPartStates & a
             res.data_parts.push_back(part);
             for (const auto & [_, projection_part] : part->getProjectionParts())
                 res.projection_parts.push_back(projection_part);
+            for (const auto & [_, projection_part] : part->getBrokenProjectionParts())
+                res.broken_projection_parts.push_back(projection_part);
         }
     }
 
-    if (out_states != nullptr)
+    if (fill_states)
     {
-        out_states->resize(res.projection_parts.size());
+        res.projection_parts_states.resize(res.projection_parts.size());
         for (size_t i = 0; i < res.projection_parts.size(); ++i)
-            (*out_states)[i] = res.projection_parts[i]->getParentPart()->getState();
+            (res.projection_parts_states)[i] = res.projection_parts[i]->getParentPart()->getState();
+
+        res.broken_projection_parts_states.resize(res.broken_projection_parts.size());
+        for (size_t i = 0; i < res.broken_projection_parts.size(); ++i)
+            (res.broken_projection_parts_states)[i] = res.broken_projection_parts[i]->getParentPart()->getState();
     }
 
     return res;
@@ -5809,7 +5815,7 @@ bool MergeTreeData::supportsLightweightDelete() const
     return true;
 }
 
-MergeTreeData::ProjectionPartsVector MergeTreeData::getAllProjectionPartsVector(MergeTreeData::DataPartStateVector * out_states) const
+MergeTreeData::ProjectionPartsVector MergeTreeData::getAllProjectionPartsVector(bool fill_states) const
 {
     ProjectionPartsVector res;
     auto lock = lockParts();
@@ -5820,11 +5826,15 @@ MergeTreeData::ProjectionPartsVector MergeTreeData::getAllProjectionPartsVector(
             res.projection_parts.push_back(projection_part);
     }
 
-    if (out_states != nullptr)
+    if (fill_states)
     {
-        out_states->resize(res.projection_parts.size());
+        res.projection_parts_states.resize(res.projection_parts.size());
         for (size_t i = 0; i < res.projection_parts.size(); ++i)
-            (*out_states)[i] = res.projection_parts[i]->getParentPart()->getState();
+            (res.projection_parts_states)[i] = res.projection_parts[i]->getParentPart()->getState();
+
+        res.broken_projection_parts_states.resize(res.broken_projection_parts.size());
+        for (size_t i = 0; i < res.broken_projection_parts.size(); ++i)
+            (res.broken_projection_parts_states)[i] = res.broken_projection_parts[i]->getParentPart()->getState();
     }
     return res;
 }
diff --git a/src/Storages/MergeTree/MergeTreeData.h b/src/Storages/MergeTree/MergeTreeData.h
index 54104849fe4..4ef3b75988b 100644
--- a/src/Storages/MergeTree/MergeTreeData.h
+++ b/src/Storages/MergeTree/MergeTreeData.h
@@ -468,8 +468,13 @@ public:
 
     struct ProjectionPartsVector
     {
-        DataPartsVector projection_parts;
         DataPartsVector data_parts;
+
+        DataPartsVector projection_parts;
+        DataPartStateVector projection_parts_states;
+
+        DataPartsVector broken_projection_parts;
+        DataPartStateVector broken_projection_parts_states;
     };
 
     /// Returns a copy of the list so that the caller shouldn't worry about locks.
@@ -484,7 +489,7 @@ public:
         const DataPartStates & affordable_states, DataPartStateVector * out_states = nullptr) const;
     /// Same as above but only returns projection parts
     ProjectionPartsVector getProjectionPartsVectorForInternalUsage(
-        const DataPartStates & affordable_states, DataPartStateVector * out_states = nullptr) const;
+        const DataPartStates & affordable_states, bool fill_states = false) const;
 
 
     /// Returns absolutely all parts (and snapshot of their states)
@@ -496,7 +501,7 @@ public:
     size_t getTotalMarksCount() const;
 
     /// Same as above but only returns projection parts
-    ProjectionPartsVector getAllProjectionPartsVector(MergeTreeData::DataPartStateVector * out_states = nullptr) const;
+    ProjectionPartsVector getAllProjectionPartsVector(bool fill_states = false) const;
 
     /// Returns parts in Active state
     DataParts getDataPartsForInternalUsage() const;
diff --git a/src/Storages/MergeTree/MergeTreeDataPartChecksum.h b/src/Storages/MergeTree/MergeTreeDataPartChecksum.h
index 8e5e8c8c448..3595ce38db5 100644
--- a/src/Storages/MergeTree/MergeTreeDataPartChecksum.h
+++ b/src/Storages/MergeTree/MergeTreeDataPartChecksum.h
@@ -54,6 +54,8 @@ struct MergeTreeDataPartChecksums
 
     bool has(const String & file_name) const { return files.find(file_name) != files.end(); }
 
+    bool remove(const String & file_name) { return files.erase(file_name); }
+
     bool empty() const { return files.empty(); }
 
     /// Checks that the set of columns and their checksums are the same. If not, throws an exception.
diff --git a/src/Storages/MergeTree/MutateTask.cpp b/src/Storages/MergeTree/MutateTask.cpp
index 911b25de2ad..8ef1621b647 100644
--- a/src/Storages/MergeTree/MutateTask.cpp
+++ b/src/Storages/MergeTree/MutateTask.cpp
@@ -513,7 +513,9 @@ static std::set<ProjectionDescriptionRawPtr> getProjectionsToRecalculate(
     {
         bool need_recalculate =
             materialized_projections.contains(projection.name)
-            || (!is_full_part_storage && source_part->hasProjection(projection.name));
+            || (!is_full_part_storage
+                && (source_part->hasProjection(projection.name)
+                    || source_part->hasBrokenProjection(projection.name)));
 
         if (need_recalculate)
             projections_to_recalc.insert(&projection);
@@ -1367,7 +1369,9 @@ private:
 
             bool need_recalculate =
                 ctx->materialized_projections.contains(projection.name)
-                || (!is_full_part_storage && ctx->source_part->hasProjection(projection.name));
+                || (!is_full_part_storage
+                    && (ctx->source_part->hasProjection(projection.name)
+                        || ctx->source_part->hasBrokenProjection(projection.name)));
 
             if (need_recalculate)
             {
diff --git a/src/Storages/MergeTree/ReplicatedMergeTreePartCheckThread.cpp b/src/Storages/MergeTree/ReplicatedMergeTreePartCheckThread.cpp
index b1875464725..4468cf8e3bf 100644
--- a/src/Storages/MergeTree/ReplicatedMergeTreePartCheckThread.cpp
+++ b/src/Storages/MergeTree/ReplicatedMergeTreePartCheckThread.cpp
@@ -274,7 +274,7 @@ std::pair<bool, MergeTreeDataPartPtr> ReplicatedMergeTreePartCheckThread::findLo
     return std::make_pair(exists_in_zookeeper, part);
 }
 
-ReplicatedCheckResult ReplicatedMergeTreePartCheckThread::checkPartImpl(const String & part_name)
+ReplicatedCheckResult ReplicatedMergeTreePartCheckThread::checkPartImpl(const String & part_name, bool throw_on_broken_projection)
 {
     ReplicatedCheckResult result;
     auto [exists_in_zookeeper, part] = findLocalPart(part_name);
@@ -341,6 +341,7 @@ ReplicatedCheckResult ReplicatedMergeTreePartCheckThread::checkPartImpl(const St
     /// before the ReplicatedMergeTreePartHeader was introduced.
     String part_path = storage.replica_path + "/parts/" + part_name;
     String part_znode = zookeeper->get(part_path);
+    bool is_broken_projection = false;
 
     try
     {
@@ -362,8 +363,10 @@ ReplicatedCheckResult ReplicatedMergeTreePartCheckThread::checkPartImpl(const St
 
         checkDataPart(
             part,
-            true,
-            [this] { return need_stop.load(); });
+            /* require_checksums */true,
+            is_broken_projection,
+            [this] { return need_stop.load(); },
+            throw_on_broken_projection);
 
         if (need_stop)
         {
@@ -384,12 +387,22 @@ ReplicatedCheckResult ReplicatedMergeTreePartCheckThread::checkPartImpl(const St
 
         tryLogCurrentException(log, __PRETTY_FUNCTION__);
 
-        auto message = PreformattedMessage::create("Part {} looks broken. Removing it and will try to fetch.", part_name);
-        LOG_ERROR(log, message);
+        PreformattedMessage message;
+        if (is_broken_projection)
+        {
+            message = PreformattedMessage::create("Part {} has a broken projection. It will be ignored.", part_name);
+            LOG_DEBUG(log, message);
+            result.action = ReplicatedCheckResult::DoNothing;
+        }
+        else
+        {
+            message = PreformattedMessage::create("Part {} looks broken. Removing it and will try to fetch.", part_name);
+            LOG_ERROR(log, message);
+            result.action = ReplicatedCheckResult::TryFetchMissing;
+        }
 
         /// Part is broken, let's try to find it and fetch.
         result.status = {part_name, false, message};
-        result.action = ReplicatedCheckResult::TryFetchMissing;
         return result;
     }
 
@@ -419,12 +432,12 @@ ReplicatedCheckResult ReplicatedMergeTreePartCheckThread::checkPartImpl(const St
 }
 
 
-CheckResult ReplicatedMergeTreePartCheckThread::checkPartAndFix(const String & part_name, std::optional<time_t> * recheck_after)
+CheckResult ReplicatedMergeTreePartCheckThread::checkPartAndFix(const String & part_name, std::optional<time_t> * recheck_after, bool throw_on_broken_projection)
 {
     LOG_INFO(log, "Checking part {}", part_name);
     ProfileEvents::increment(ProfileEvents::ReplicatedPartChecks);
 
-    ReplicatedCheckResult result = checkPartImpl(part_name);
+    ReplicatedCheckResult result = checkPartImpl(part_name, throw_on_broken_projection);
     switch (result.action)
     {
         case ReplicatedCheckResult::None: UNREACHABLE();
@@ -577,7 +590,7 @@ void ReplicatedMergeTreePartCheckThread::run()
         }
 
         std::optional<time_t> recheck_after;
-        checkPartAndFix(selected->name, &recheck_after);
+        checkPartAndFix(selected->name, &recheck_after, /* throw_on_broken_projection */false);
 
         if (need_stop)
             return;
diff --git a/src/Storages/MergeTree/ReplicatedMergeTreePartCheckThread.h b/src/Storages/MergeTree/ReplicatedMergeTreePartCheckThread.h
index 68dc6ca3d1d..26c4bfe9384 100644
--- a/src/Storages/MergeTree/ReplicatedMergeTreePartCheckThread.h
+++ b/src/Storages/MergeTree/ReplicatedMergeTreePartCheckThread.h
@@ -65,9 +65,9 @@ public:
     size_t size() const;
 
     /// Check part by name
-    CheckResult checkPartAndFix(const String & part_name, std::optional<time_t> * recheck_after = nullptr);
+    CheckResult checkPartAndFix(const String & part_name, std::optional<time_t> * recheck_after = nullptr, bool throw_on_broken_projection = true);
 
-    ReplicatedCheckResult checkPartImpl(const String & part_name);
+    ReplicatedCheckResult checkPartImpl(const String & part_name, bool throw_on_broken_projection);
 
     std::unique_lock<std::mutex> pausePartsCheck();
 
diff --git a/src/Storages/MergeTree/checkDataPart.cpp b/src/Storages/MergeTree/checkDataPart.cpp
index a75df00e8a7..74af7cbb77c 100644
--- a/src/Storages/MergeTree/checkDataPart.cpp
+++ b/src/Storages/MergeTree/checkDataPart.cpp
@@ -111,7 +111,9 @@ static IMergeTreeDataPart::Checksums checkDataPart(
     const NameSet & files_without_checksums,
     const ReadSettings & read_settings,
     bool require_checksums,
-    std::function<bool()> is_cancelled)
+    std::function<bool()> is_cancelled,
+    bool & is_broken_projection,
+    bool throw_on_broken_projection)
 {
     /** Responsibility:
       * - read list of columns from columns.txt;
      * - read checksums if exist;
      * - validate list of columns and checksums
      */
 
     CurrentMetrics::Increment metric_increment{CurrentMetrics::ReplicatedChecks};
+    Poco::Logger * log = &Poco::Logger::get("checkDataPart");
 
     NamesAndTypesList columns_txt;
 
@@ -269,23 +272,68 @@ static IMergeTreeDataPart::Checksums checkDataPart(
         }
     }
 
-    for (const auto & [name, projection] : data_part->getProjectionParts())
+    auto check_projection = [&](const String & name, std::shared_ptr<IMergeTreeDataPart> projection)
     {
-        if (is_cancelled())
-            return {};
-
         auto projection_file = name + ".proj";
-        auto projection_checksums = checkDataPart(
-            projection, *data_part_storage.getProjection(projection_file),
-            projection->getColumns(), projection->getType(),
-            projection->getFileNamesWithoutChecksums(),
-            read_settings, require_checksums, is_cancelled);
+        if (!throw_on_broken_projection && projection->is_broken)
+        {
+            projections_on_disk.erase(projection_file);
+            checksums_txt.remove(projection_file);
+            return;
+        }
+
+        IMergeTreeDataPart::Checksums projection_checksums;
+        try
+        {
+            bool noop;
+            projection_checksums = checkDataPart(
+                projection, *data_part_storage.getProjection(projection_file),
+                projection->getColumns(), projection->getType(),
+                projection->getFileNamesWithoutChecksums(),
+                read_settings, require_checksums, is_cancelled, noop, /* throw_on_broken_projection */false);
+        }
+        catch (...)
+        {
+            if (isRetryableException(std::current_exception()))
+                throw;
+
+            LOG_TEST(log, "Marking projection {} as broken ({})", name, projection_file);
+
+            if (!data_part->hasBrokenProjection(name))
+                data_part->markProjectionPartAsBroken(name, getCurrentExceptionMessage(false), getCurrentExceptionCode());
+
+            is_broken_projection = true;
+            if (throw_on_broken_projection)
+                throw;
+
+            projections_on_disk.erase(projection_file);
+            checksums_txt.remove(projection_file);
+            return;
+        }
 
         checksums_data.files[projection_file] = IMergeTreeDataPart::Checksums::Checksum(
             projection_checksums.getTotalSizeOnDisk(),
             projection_checksums.getTotalChecksumUInt128());
 
         projections_on_disk.erase(projection_file);
+    };
+
+    auto broken_projection_parts = data_part->getBrokenProjectionParts(); /// Iterate over copy
+    for (const auto & [name, projection] : broken_projection_parts)
+    {
+        if (is_cancelled())
+            return {};
+        else
+            check_projection(name, projection);
+    }
+
+    auto projection_parts = data_part->getProjectionParts(); /// Iterate over copy
+    for (const auto & [name, projection] : projection_parts)
+    {
+        if (is_cancelled())
+            return {};
+        else
+            check_projection(name, projection);
     }
 
     if (require_checksums && !projections_on_disk.empty())
@@ -315,7 +363,9 @@ IMergeTreeDataPart::Checksums checkDataPartInMemory(const DataPartInMemoryPtr &
 IMergeTreeDataPart::Checksums checkDataPart(
     MergeTreeData::DataPartPtr data_part,
     bool require_checksums,
-    std::function<bool()> is_cancelled)
+    bool & is_broken_projection,
+    std::function<bool()> is_cancelled,
+    bool throw_on_broken_projection)
 {
     if (auto part_in_memory = asInMemoryPart(data_part))
         return checkDataPartInMemory(part_in_memory);
@@ -357,7 +407,9 @@ IMergeTreeDataPart::Checksums checkDataPart(
             data_part->getFileNamesWithoutChecksums(),
             read_settings,
             require_checksums,
-            is_cancelled);
+            is_cancelled,
+            is_broken_projection,
+            throw_on_broken_projection);
     };
 
     try
@@ -371,7 +423,9 @@ IMergeTreeDataPart::Checksums checkDataPart(
             data_part->getFileNamesWithoutChecksums(),
             read_settings,
             require_checksums,
-            is_cancelled);
+            is_cancelled,
+            is_broken_projection,
+            throw_on_broken_projection);
     }
     catch (...)
     {
diff --git a/src/Storages/MergeTree/checkDataPart.h b/src/Storages/MergeTree/checkDataPart.h
index d0e48b6f80a..a01978f4efe 100644
--- a/src/Storages/MergeTree/checkDataPart.h
+++ b/src/Storages/MergeTree/checkDataPart.h
@@ -10,7 +10,9 @@ namespace DB
 IMergeTreeDataPart::Checksums checkDataPart(
     MergeTreeData::DataPartPtr data_part,
     bool require_checksums,
-    std::function<bool()> is_cancelled = []{ return false; });
+    bool & is_broken_projection,
+    std::function<bool()> is_cancelled = []{ return false; },
+    bool throw_on_broken_projection = false);
 
 bool isNotEnoughMemoryErrorCode(int code);
 bool isRetryableException(const std::exception_ptr exception_ptr);
diff --git a/src/Storages/StorageMergeTree.cpp b/src/Storages/StorageMergeTree.cpp
index e9a0dd5fbf3..74277616e95 100644
--- a/src/Storages/StorageMergeTree.cpp
+++ b/src/Storages/StorageMergeTree.cpp
@@ -2242,11 +2242,12 @@ std::optional<CheckResult> StorageMergeTree::checkDataNext(DataValidationTasksPt
     {
         /// If the checksums file is not present, calculate the checksums and write them to disk.
         static constexpr auto checksums_path = "checksums.txt";
+        bool noop;
         if (part->isStoredOnDisk() && !part->getDataPartStorage().exists(checksums_path))
         {
             try
             {
-                auto calculated_checksums = checkDataPart(part, false);
+                auto calculated_checksums = checkDataPart(part, false, noop, /* is_cancelled */{}, /* throw_on_broken_projection */true);
                 calculated_checksums.checkEqual(part->checksums, true);
 
                 auto & part_mutable = const_cast<IMergeTreeDataPart &>(*part);
@@ -2267,7 +2268,7 @@ std::optional<CheckResult> StorageMergeTree::checkDataNext(DataValidationTasksPt
         {
             try
             {
-                checkDataPart(part, true);
+                checkDataPart(part, true, noop, /* is_cancelled */{}, /* throw_on_broken_projection */true);
                 return CheckResult(part->name, true, "");
             }
             catch (...)
diff --git a/src/Storages/StorageReplicatedMergeTree.cpp b/src/Storages/StorageReplicatedMergeTree.cpp
index 74821a9186c..1859fa03094 100644
--- a/src/Storages/StorageReplicatedMergeTree.cpp
+++ b/src/Storages/StorageReplicatedMergeTree.cpp
@@ -8690,12 +8690,11 @@ IStorage::DataValidationTasksPtr StorageReplicatedMergeTree::getCheckTaskList(
 
 std::optional<CheckResult> StorageReplicatedMergeTree::checkDataNext(DataValidationTasksPtr & check_task_list)
 {
-
     if (auto part = assert_cast<DataValidationTasks *>(check_task_list.get())->next())
     {
         try
         {
-            return CheckResult(part_check_thread.checkPartAndFix(part->name));
+            return part_check_thread.checkPartAndFix(part->name, /* recheck_after */nullptr, /* throw_on_broken_projection */true);
         }
         catch (const Exception & ex)
         {
diff --git a/src/Storages/System/StorageSystemDisks.cpp b/src/Storages/System/StorageSystemDisks.cpp
index 23a00cc7ae5..250fcdba641 100644
--- a/src/Storages/System/StorageSystemDisks.cpp
+++ b/src/Storages/System/StorageSystemDisks.cpp
@@ -63,7 +63,7 @@ Pipe StorageSystemDisks::read(
     for (const auto & [disk_name, disk_ptr] : context->getDisksMap())
     {
         col_name->insert(disk_name);
-        col_path->insert(disk_ptr->getPath());
+        col_path->insert(fs::absolute(disk_ptr->getPath()).string());
         col_free->insert(disk_ptr->getAvailableSpace().value_or(std::numeric_limits<UInt64>::max()));
         col_total->insert(disk_ptr->getTotalSpace().value_or(std::numeric_limits<UInt64>::max()));
         col_unreserved->insert(disk_ptr->getUnreservedSpace().value_or(std::numeric_limits<UInt64>::max()));
diff --git a/src/Storages/System/StorageSystemPartsBase.cpp b/src/Storages/System/StorageSystemPartsBase.cpp
index 513af6cfc46..e97c13b1fed 100644
--- a/src/Storages/System/StorageSystemPartsBase.cpp
+++ b/src/Storages/System/StorageSystemPartsBase.cpp
@@ -64,7 +64,7 @@ StoragesInfo::getParts(MergeTreeData::DataPartStateVector & state, bool has_stat
 }
 
 MergeTreeData::ProjectionPartsVector
-StoragesInfo::getProjectionParts(MergeTreeData::DataPartStateVector & state, bool has_state_column) const
+StoragesInfo::getProjectionParts(bool fill_states, bool has_state_column) const
 {
     if (data->getInMemoryMetadataPtr()->projections.empty())
         return {};
 
@@ -74,12 +74,12 @@ StoragesInfo::getProjectionParts(MergeTreeData::DataPartStateVector & state, boo
     {
         /// If has_state_column is requested, return all states.
         if (!has_state_column)
-            return data->getProjectionPartsVectorForInternalUsage({State::Active, State::Outdated}, &state);
+            return data->getProjectionPartsVectorForInternalUsage({State::Active, State::Outdated}, fill_states);
 
-        return data->getAllProjectionPartsVector(&state);
+        return data->getAllProjectionPartsVector(fill_states);
     }
 
-    return data->getProjectionPartsVectorForInternalUsage({State::Active}, &state);
+    return data->getProjectionPartsVectorForInternalUsage({State::Active}, fill_states);
 }
 
 StoragesInfoStream::StoragesInfoStream(const SelectQueryInfo & query_info, ContextPtr context)
diff --git a/src/Storages/System/StorageSystemPartsBase.h b/src/Storages/System/StorageSystemPartsBase.h
index c3d2e64b303..e0e81f0d24d 100644
--- a/src/Storages/System/StorageSystemPartsBase.h
+++ b/src/Storages/System/StorageSystemPartsBase.h
@@ -25,7 +25,7 @@ struct StoragesInfo
     explicit operator bool() const { return storage != nullptr; }
 
     MergeTreeData::DataPartsVector getParts(MergeTreeData::DataPartStateVector & state, bool has_state_column) const;
-    MergeTreeData::ProjectionPartsVector getProjectionParts(MergeTreeData::DataPartStateVector & state, bool has_state_column) const;
+    MergeTreeData::ProjectionPartsVector getProjectionParts(bool fill_states, bool has_state_column) const;
 };
 
 /** A helper class that enumerates the storages that match given query. */
diff --git a/src/Storages/System/StorageSystemProjectionParts.cpp b/src/Storages/System/StorageSystemProjectionParts.cpp
index 213865a8d61..44bdb294a2d 100644
--- a/src/Storages/System/StorageSystemProjectionParts.cpp
+++ b/src/Storages/System/StorageSystemProjectionParts.cpp
@@ -83,7 +83,11 @@ StorageSystemProjectionParts::StorageSystemProjectionParts(const StorageID & tab
 
         {"rows_where_ttl_info.expression", std::make_shared<DataTypeArray>(std::make_shared<DataTypeString>())},
         {"rows_where_ttl_info.min", std::make_shared<DataTypeArray>(std::make_shared<DataTypeDateTime>())},
-        {"rows_where_ttl_info.max", std::make_shared<DataTypeArray>(std::make_shared<DataTypeDateTime>())}
+        {"rows_where_ttl_info.max", std::make_shared<DataTypeArray>(std::make_shared<DataTypeDateTime>())},
+
+        {"is_broken", std::make_shared<DataTypeUInt8>()},
+        {"exception_code", std::make_shared<DataTypeInt32>()},
+        {"exception", std::make_shared<DataTypeString>()},
     }
     )
 {
@@ -93,15 +97,14 @@ void StorageSystemProjectionParts::processNextStorage(
     ContextPtr, MutableColumns & columns, std::vector<UInt8> & columns_mask, const StoragesInfo & info, bool has_state_column)
 {
     using State = MergeTreeDataPartState;
-    MergeTreeData::DataPartStateVector all_parts_state;
-    MergeTreeData::ProjectionPartsVector all_parts = info.getProjectionParts(all_parts_state, has_state_column);
-    for (size_t part_number = 0; part_number < all_parts.projection_parts.size(); ++part_number)
+    MergeTreeData::ProjectionPartsVector all_parts = info.getProjectionParts(true, has_state_column);
+    auto fill_part_info = [&](size_t part_number, const MergeTreeData::DataPartsVector & parts, const MergeTreeData::DataPartStateVector & states)
     {
-        const auto & part = all_parts.projection_parts[part_number];
+        const auto & part = parts[part_number];
         const auto * parent_part = part->getParentPart();
         chassert(parent_part);
 
-        auto part_state = all_parts_state[part_number];
+        auto part_state = states[part_number];
 
         ColumnSize columns_size = part->getTotalColumnsSize();
         ColumnSize parent_columns_size = parent_part->getTotalColumnsSize();
@@ -278,10 +281,43 @@ void StorageSystemProjectionParts::processNextStorage(
         add_ttl_info_map(part->ttl_infos.group_by_ttl);
         add_ttl_info_map(part->ttl_infos.rows_where_ttl);
 
+        {
+            if (columns_mask[src_index++])
+                columns[res_index++]->insert(part->is_broken.load(std::memory_order_relaxed));
+
+            if (part->is_broken)
+            {
+                std::lock_guard lock(part->broken_projections_mutex);
+                if (columns_mask[src_index++])
+                    columns[res_index++]->insert(part->exception_code);
+                if (columns_mask[src_index++])
+                    columns[res_index++]->insert(part->exception);
+            }
+            else
+            {
+                if (columns_mask[src_index++])
+                    columns[res_index++]->insertDefault();
+                if (columns_mask[src_index++])
+                    columns[res_index++]->insertDefault();
+            }
+        }
+
         /// _state column should be the latest.
         /// Do not use part->getState*, it can be changed from different thread
         if (has_state_column)
             columns[res_index++]->insert(IMergeTreeDataPart::stateString(part_state));
+    };
+
+    for (size_t part_number = 0; part_number < all_parts.projection_parts.size(); ++part_number)
+    {
+        auto part = all_parts.projection_parts[part_number];
+        fill_part_info(part_number, all_parts.projection_parts, all_parts.projection_parts_states);
+    }
+
+    for (size_t part_number = 0; part_number < all_parts.broken_projection_parts.size(); ++part_number)
+    {
+        auto part = all_parts.broken_projection_parts[part_number];
+        fill_part_info(part_number, all_parts.broken_projection_parts, all_parts.broken_projection_parts_states);
+    }
 }
diff --git a/src/Storages/System/StorageSystemProjectionPartsColumns.cpp b/src/Storages/System/StorageSystemProjectionPartsColumns.cpp
index 06becc6d91c..3f4224e46bb 100644
--- a/src/Storages/System/StorageSystemProjectionPartsColumns.cpp
+++ b/src/Storages/System/StorageSystemProjectionPartsColumns.cpp
@@ -103,15 +103,14 @@ void StorageSystemProjectionPartsColumns::processNextStorage(
     }
 
     /// Go through the list of projection parts.
-    MergeTreeData::DataPartStateVector all_parts_state;
-    MergeTreeData::ProjectionPartsVector all_parts = info.getProjectionParts(all_parts_state, has_state_column);
-    for (size_t part_number = 0; part_number < all_parts.projection_parts.size(); ++part_number)
+    MergeTreeData::ProjectionPartsVector all_parts = info.getProjectionParts(true, has_state_column);
+    auto fill_part_info = [&](size_t part_number, const MergeTreeData::DataPartsVector & parts, const MergeTreeData::DataPartStateVector & states)
     {
-        const auto & part = all_parts.projection_parts[part_number];
+        const auto & part = parts[part_number];
         const auto * parent_part = part->getParentPart();
         chassert(parent_part);
 
-        auto part_state = all_parts_state[part_number];
+        auto part_state = states[part_number];
 
         auto columns_size = part->getTotalColumnsSize();
         auto parent_columns_size = parent_part->getTotalColumnsSize();
@@ -260,6 +259,18 @@ void StorageSystemProjectionPartsColumns::processNextStorage(
             if (has_state_column)
                 columns[res_index++]->insert(part->stateString());
         }
+    };
+
+    for (size_t part_number = 0; part_number < all_parts.projection_parts.size(); ++part_number)
+    {
+        auto part = all_parts.projection_parts[part_number];
+        fill_part_info(part_number, all_parts.projection_parts, all_parts.projection_parts_states);
+    }
+
+    for (size_t part_number = 0; part_number < all_parts.broken_projection_parts.size(); ++part_number)
+    {
+        auto part = all_parts.broken_projection_parts[part_number];
+        fill_part_info(part_number, all_parts.broken_projection_parts, all_parts.broken_projection_parts_states);
     }
 }
diff --git a/tests/queries/0_stateless/02916_broken_projection.reference b/tests/queries/0_stateless/02916_broken_projection.reference
new file mode 100644
index 00000000000..d0b07e081db
--- /dev/null
+++ b/tests/queries/0_stateless/02916_broken_projection.reference
@@ -0,0
+1,224 @@ +insert new part +insert new part +insert new part +insert new part +system.parts +all_0_0_0 1 ['proj','proj_2'] +all_1_1_0 1 ['proj','proj_2'] +all_2_2_0 1 ['proj','proj_2'] +all_3_3_0 1 ['proj','proj_2'] +select from projection 'proj' +16 +12 +used projections +SELECT c FROM test WHERE d == 12 OR d == 16; ['default.test.proj'] +select from projection 'proj_2' +12 +16 +used projections +SELECT d FROM test WHERE c == 12 OR c == 16; ['default.test.proj_2'] +check table +1 +0 +broke metadata of part 'proj' (parent part: all_2_2_0) +system.parts +all_0_0_0 1 ['proj','proj_2'] +all_1_1_0 1 ['proj','proj_2'] +all_2_2_0 1 ['proj','proj_2'] +all_3_3_0 1 ['proj','proj_2'] +select from projection 'proj' +16 +12 +used projections +SELECT c FROM test WHERE d == 12 OR d == 16; ['default.test.proj'] +select from projection 'proj_2' +16 +12 +used projections +SELECT d FROM test WHERE c == 12 OR c == 16; ['default.test.proj_2'] +check table +0 +broken projections info +all_2_2_0 proj FILE_DOESNT_EXIST +check table full +all_0_0_0 1 +all_1_1_0 1 +all_3_3_0 1 +all_2_2_0 0 Part all_2_2_0 has a broken projection. It will be ignored. +0 +broke data of part 'proj_2' (parent part: all_2_2_0) +broken projections info +all_2_2_0 proj FILE_DOESNT_EXIST +system.parts +all_0_0_0 1 ['proj','proj_2'] +all_1_1_0 1 ['proj','proj_2'] +all_2_2_0 1 ['proj_2'] +all_3_3_0 1 ['proj','proj_2'] +select from projection 'proj' +16 +12 +used projections +SELECT c FROM test WHERE d == 12 OR d == 16; ['default.test.proj'] +select from projection 'proj_2' +FILE_DOESNT_EXIST +check table +0 +broken projections info +all_2_2_0 proj FILE_DOESNT_EXIST +all_2_2_0 proj_2 NO_FILE_IN_DATA_PART +system.parts +all_0_0_0 1 ['proj','proj_2'] +all_1_1_0 1 ['proj','proj_2'] +all_2_2_0 1 [] +all_3_3_0 1 ['proj','proj_2'] +select from projection 'proj' +16 +12 +used projections +SELECT c FROM test WHERE d == 12 OR d == 16; ['default.test.proj'] +select from projection 'proj_2' +16 +12 +used projections +SELECT d FROM test WHERE c == 12 OR c == 16; ['default.test.proj_2'] +check table +0 +0 +broke data of part 'proj_2' (parent part: all_3_3_0) +broken projections info +all_2_2_0 proj FILE_DOESNT_EXIST +all_2_2_0 proj_2 NO_FILE_IN_DATA_PART +insert new part +insert new part +optimize +0 +broken projections info +all_2_2_0 proj FILE_DOESNT_EXIST +all_2_2_0 proj_2 NO_FILE_IN_DATA_PART +all_3_3_0 proj_2 NO_FILE_IN_DATA_PART +system.parts +all_0_0_0 1 ['proj','proj_2'] +all_1_1_0 1 ['proj','proj_2'] +all_2_2_0 1 [] +all_3_3_0 0 ['proj'] +all_3_5_1 1 ['proj'] +all_4_4_0 0 ['proj','proj_2'] +all_5_5_0 0 ['proj','proj_2'] +select from projection 'proj' +16 +12 +used projections +SELECT c FROM test WHERE d == 12 OR d == 16; ['default.test.proj'] +select from projection 'proj_2' +16 +12 +used projections +SELECT d FROM test WHERE c == 12 OR c == 16; ['default.test.proj_2'] +check table +0 +0 +broke metadata of part 'proj' (parent part: all_1_1_0) +Detach - Attach +broken projections info +all_1_1_0 proj NO_FILE_IN_DATA_PART +all_2_2_0 proj NO_FILE_IN_DATA_PART +all_2_2_0 proj_2 FILE_DOESNT_EXIST +all_3_3_0 proj_2 FILE_DOESNT_EXIST +0 +broke data of part 'proj_2' (parent part: all_1_1_0) +Detach - Attach +broken projections info +all_1_1_0 proj NO_FILE_IN_DATA_PART +all_1_1_0 proj_2 FILE_DOESNT_EXIST +all_2_2_0 proj NO_FILE_IN_DATA_PART +all_2_2_0 proj_2 FILE_DOESNT_EXIST +all_3_3_0 proj_2 FILE_DOESNT_EXIST +system.parts +all_0_0_0 1 ['proj','proj_2'] +all_1_1_0 1 [] +all_2_2_0 1 [] +all_3_3_0 0 ['proj'] +all_3_5_1 1 ['proj'] +all_4_4_0 0 
['proj','proj_2'] +all_5_5_0 0 ['proj','proj_2'] +select from projection 'proj' +16 +12 +used projections +SELECT c FROM test WHERE d == 12 OR d == 16; ['default.test.proj'] +select from projection 'proj_2' +16 +12 +used projections +SELECT d FROM test WHERE c == 12 OR c == 16; ['default.test.proj_2'] +check table +0 +check table full +all_3_5_1 1 +all_0_0_0 1 +all_1_1_0 0 Part all_1_1_0 has a broken projection. It will be ignored. +all_2_2_0 0 Part all_2_2_0 has a broken projection. It will be ignored. +materialize projection proj +check table full +all_3_5_1_6 1 +all_0_0_0_6 1 +all_2_2_0_6 1 +all_1_1_0_6 1 +system.parts +all_0_0_0 0 ['proj','proj_2'] +all_0_0_0_6 1 ['proj','proj_2'] +all_1_1_0 0 [] +all_1_1_0_6 1 ['proj','proj_2'] +all_2_2_0 0 [] +all_2_2_0_6 1 ['proj','proj_2'] +all_3_3_0 0 ['proj'] +all_3_5_1 0 ['proj'] +all_3_5_1_6 1 ['proj'] +all_4_4_0 0 ['proj','proj_2'] +all_5_5_0 0 ['proj','proj_2'] +select from projection 'proj' +16 +12 +used projections +SELECT c FROM test WHERE d == 12 OR d == 16; ['default.test.proj'] +select from projection 'proj_2' +12 +16 +used projections +SELECT d FROM test WHERE c == 12 OR c == 16; ['default.test.proj_2'] +check table +1 +materialize projection proj_2 +check table full +all_3_5_1_7 1 +all_0_0_0_7 1 +all_2_2_0_7 1 +all_1_1_0_7 1 +system.parts +all_0_0_0 0 ['proj','proj_2'] +all_0_0_0_6 0 ['proj','proj_2'] +all_0_0_0_7 1 ['proj','proj_2'] +all_1_1_0 0 [] +all_1_1_0_6 0 ['proj','proj_2'] +all_1_1_0_7 1 ['proj','proj_2'] +all_2_2_0 0 [] +all_2_2_0_6 0 ['proj','proj_2'] +all_2_2_0_7 1 ['proj','proj_2'] +all_3_3_0 0 ['proj'] +all_3_5_1 0 ['proj'] +all_3_5_1_6 0 ['proj'] +all_3_5_1_7 1 ['proj','proj_2'] +all_4_4_0 0 ['proj','proj_2'] +all_5_5_0 0 ['proj','proj_2'] +select from projection 'proj' +12 +16 +used projections +SELECT c FROM test WHERE d == 12 OR d == 16; ['default.test.proj'] +select from projection 'proj_2' +16 +12 +used projections +SELECT d FROM test WHERE c == 12 OR c == 16; ['default.test.proj_2'] +check table +1 diff --git a/tests/queries/0_stateless/02916_broken_projection.sh b/tests/queries/0_stateless/02916_broken_projection.sh new file mode 100755 index 00000000000..81adfe6e49d --- /dev/null +++ b/tests/queries/0_stateless/02916_broken_projection.sh @@ -0,0 +1,283 @@ +#!/usr/bin/env bash + +CURDIR=$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd) +# shellcheck source=../shell_config.sh +. 
"$CURDIR"/../shell_config.sh + +$CLICKHOUSE_CLIENT -nm -q " +DROP TABLE IF EXISTS test SYNC; +CREATE TABLE test +( + a String, + b String, + c Int32, + d Int32, + e Int32, + + PROJECTION proj + ( + SELECT c ORDER BY d + ), + PROJECTION proj_2 + ( + SELECT d ORDER BY c + ) +) +ENGINE = ReplicatedMergeTree('/test2/$CLICKHOUSE_TEST_ZOOKEEPER_PREFIX/', '1') PRIMARY KEY (a) +SETTINGS min_bytes_for_wide_part = 0, + max_parts_to_merge_at_once=3, + enable_vertical_merge_algorithm=1, + vertical_merge_algorithm_min_rows_to_activate = 1, + vertical_merge_algorithm_min_columns_to_activate = 1, + vertical_merge_algorithm_min_columns_to_activate = 1; +" + +table_uuid=$($CLICKHOUSE_CLIENT -q "SELECT uuid FROM system.tables WHERE table='test' and database=currentDatabase()") + +function random() +{ + cat /dev/urandom | LC_ALL=C tr -dc 'a-zA-Z' | fold -w ${1:-8} | head -n 1 +} + +function insert() +{ + offset=$1 + size=$2 + echo 'insert new part' + $CLICKHOUSE_CLIENT -q "INSERT INTO test SELECT number, number, number, number, number%2 FROM numbers($offset, $size);" +} + +function break_projection() +{ + part_name=$1 + parent_name=$2 + break_type=$3 + + read -r disk_name part_path <<< $($CLICKHOUSE_CLIENT -nm -q " + SELECT disk_name, path + FROM system.projection_parts + WHERE table='test' + AND database=currentDatabase() + AND active=1 + AND part_name='$part_name' + AND parent_name='$parent_name' + LIMIT 1; + ") + + path=$($CLICKHOUSE_CLIENT -q "SELECT path FROM system.disks WHERE name='$disk_name'") + + # make sure path is absolute + $CLICKHOUSE_CLIENT -q "select throwIf(substring('$path', 1, 1) != '/', 'Path is relative: $path')" || exit + + if [ "$break_type" = "data" ] + then + rm "$path/$part_path/d.bin" + rm "$path/$part_path/c.bin" + echo "broke data of part '$part_name' (parent part: $parent_name)" + else + rm "$path/$part_path/columns.txt" + echo "broke metadata of part '$part_name' (parent part: $parent_name)" + fi +} + +function broken_projections_info() +{ + echo 'broken projections info' + $CLICKHOUSE_CLIENT -q " + SELECT parent_name, name, errors.name FROM + ( + SELECT parent_name, name, exception_code + FROM system.projection_parts + WHERE table='test' + AND database=currentDatabase() + AND is_broken = 1 + ) AS parts_info + INNER JOIN system.errors AS errors + ON parts_info.exception_code = errors.code + ORDER BY parent_name, name +" +} + +function check() +{ + expect_broken_part="" + expected_error="" + if [ $# -ne 0 ]; then + expect_broken_part=$1 + expected_error=$2 + fi + + echo 'system.parts' + $CLICKHOUSE_CLIENT -q " + SELECT name, active, projections + FROM system.parts + WHERE table='test' AND database=currentDatabase() + ORDER BY name;" + + echo "select from projection 'proj'" + query_id=$(random 8) + + if [ "$expect_broken_part" = "proj" ] + then + $CLICKHOUSE_CLIENT --optimize_use_projections 1 --query_id $query_id -q "SELECT c FROM test WHERE d == 12;" 2>&1 | grep -o $expected_error + else + $CLICKHOUSE_CLIENT --optimize_use_projections 1 --query_id $query_id -q "SELECT c FROM test WHERE d == 12 OR d == 16;" + echo 'used projections' + $CLICKHOUSE_CLIENT -nm -q " + SYSTEM FLUSH LOGS; + SELECT query, projections FROM system.query_log WHERE query_id='$query_id' and type='QueryFinish' + " + fi + + echo "select from projection 'proj_2'" + query_id=$(random 8) + + if [ "$expect_broken_part" = "proj_2" ] + then + $CLICKHOUSE_CLIENT --optimize_use_projections 1 --query_id $query_id -q "SELECT d FROM test WHERE c == 12;" 2>&1 | grep -o $expected_error + else + $CLICKHOUSE_CLIENT 
--optimize_use_projections 1 --query_id $query_id -q "SELECT d FROM test WHERE c == 12 OR c == 16;"
+        echo 'used projections'
+        $CLICKHOUSE_CLIENT -nm -q "
+        SYSTEM FLUSH LOGS;
+        SELECT query, projections FROM system.query_log WHERE query_id='$query_id' and type='QueryFinish'
+        "
+    fi
+
+    echo 'check table'
+    $CLICKHOUSE_CLIENT -q "CHECK TABLE test"
+}
+
+function optimize_no_wait()
+{
+    echo 'optimize'
+    $CLICKHOUSE_CLIENT -nm -q "OPTIMIZE TABLE test SETTINGS alter_sync=0;"
+}
+
+function reattach()
+{
+    echo 'Detach - Attach'
+    $CLICKHOUSE_CLIENT -nm -q "
+    DETACH TABLE test;
+    ATTACH TABLE test;
+    "
+}
+
+function materialize_projection
+{
+    projection=$1
+    echo "materialize projection $projection"
+    $CLICKHOUSE_CLIENT -q "ALTER TABLE test MATERIALIZE PROJECTION $projection SETTINGS mutations_sync=2"
+}
+
+function check_table_full()
+{
+    echo 'check table full'
+    $CLICKHOUSE_CLIENT -q "CHECK TABLE test SETTINGS check_query_single_value_result = 0"
+}
+
+
+insert 0 5
+
+insert 5 5
+
+insert 10 5
+
+insert 15 5
+
+check
+
+# Break the metadata file of projection 'proj'.
+break_projection proj all_2_2_0 metadata
+
+# Run a select query and after it a "check table" query.
+# The select works because it does not read columns.txt.
+check
+
+# Projection 'proj' from part all_2_2_0 will now appear in broken parts info,
+# because it was marked broken during the "check table" query.
+# TODO: try to mark it during select as well
+broken_projections_info
+
+# The "check table" query will also show a list of parts which have broken projections.
+check_table_full
+
+# Break the data file of projection 'proj_2' for part all_2_2_0.
+break_projection proj_2 all_2_2_0 data
+
+# It will not yet appear in broken projections info.
+broken_projections_info
+
+# The select now fails with the error "File doesn't exist".
+check "proj_2" "FILE_DOESNT_EXIST"
+
+# Projection 'proj_2' from part all_2_2_0 will now appear in broken parts info.
+broken_projections_info
+
+# The second select works because the projection is now marked as broken.
+check
+
+# Break the data file of projection 'proj_2' for part all_3_3_0.
+break_projection proj_2 all_3_3_0 data
+
+# It will not yet appear in broken projections info.
+broken_projections_info
+
+insert 20 5
+
+insert 25 5
+
+# Part all_3_3_0 has projections 'proj' and 'proj_2', but 'proj_2' is broken and the server does NOT know it yet.
+# Parts all_4_4_0 and all_5_5_0 both have non-broken projections.
+# So a merge will be created for the future part all_3_5_1.
+# During the merge it will fail to read from 'proj_2' of part all_3_3_0, and 'proj_2' will be marked as broken.
+# The merge will be retried and will succeed on the second attempt.
+# The result part all_3_5_1 will have only one projection - 'proj', because
+# it will skip 'proj_2' after seeing that one source part no longer has it in the set of valid projections.
+optimize_no_wait
+sleep 2
+
+$CLICKHOUSE_CLIENT -nm -q "
+SYSTEM FLUSH LOGS;
+SELECT count() FROM system.text_log
+WHERE level='Error'
+AND logger_name='MergeTreeBackgroundExecutor'
+AND message like 'Exception while executing background task {$table_uuid:all_3_5_1}%Cannot open file%proj_2.proj/c.bin%'
+"
+
+# Projection 'proj_2' from part all_3_3_0 will now appear in broken parts info.
+broken_projections_info + +check + +break_projection proj all_1_1_0 metadata + +reattach + +broken_projections_info + +break_projection proj_2 all_1_1_0 data + +reattach + +broken_projections_info + +check + +check_table_full + +materialize_projection proj + +check_table_full + +check + +materialize_projection proj_2 + +check_table_full + +check + +$CLICKHOUSE_CLIENT -nm -q " +DROP TABLE test; +" From 6c42a3fad6b58efdf91115c3b80f267f1f604c62 Mon Sep 17 00:00:00 2001 From: kssenii Date: Thu, 16 Nov 2023 16:43:34 +0100 Subject: [PATCH 002/245] Better --- src/Interpreters/MutationsInterpreter.cpp | 18 ++-- .../Optimizations/projectionsCommon.cpp | 2 +- src/Storages/MergeTree/IMergeTreeDataPart.cpp | 45 +++------ src/Storages/MergeTree/IMergeTreeDataPart.h | 11 +-- src/Storages/MergeTree/MergeTask.cpp | 5 +- src/Storages/MergeTree/MergeTreeData.cpp | 57 ++++++----- src/Storages/MergeTree/MergeTreeData.h | 4 +- src/Storages/MergeTree/MutateTask.cpp | 8 +- src/Storages/MergeTree/checkDataPart.cpp | 34 ++----- .../System/StorageSystemPartsBase.cpp | 8 +- src/Storages/System/StorageSystemPartsBase.h | 2 +- .../System/StorageSystemProjectionParts.cpp | 30 +++--- .../StorageSystemProjectionPartsColumns.cpp | 21 +--- .../02916_broken_projection.reference | 95 ++++++++----------- .../0_stateless/02916_broken_projection.sh | 12 +-- 15 files changed, 146 insertions(+), 206 deletions(-) diff --git a/src/Interpreters/MutationsInterpreter.cpp b/src/Interpreters/MutationsInterpreter.cpp index a9a5d4f33d0..237bffe4a67 100644 --- a/src/Interpreters/MutationsInterpreter.cpp +++ b/src/Interpreters/MutationsInterpreter.cpp @@ -753,7 +753,7 @@ void MutationsInterpreter::prepare(bool dry_run) { mutation_kind.set(MutationKind::MUTATE_INDEX_PROJECTION); const auto & projection = projections_desc.get(command.projection_name); - if (!source.hasProjection(projection.name)) + if (!source.hasProjection(projection.name) || source.hasBrokenProjection(projection.name)) { for (const auto & column : projection.required_columns) dependencies.emplace(column, ColumnDependency::PROJECTION); @@ -927,20 +927,18 @@ void MutationsInterpreter::prepare(bool dry_run) materialized_indices.insert(index.name); } - /// Always rebuild broken projections. - for (const auto & projection : metadata_snapshot->getProjections()) - { - if (!source.hasBrokenProjection(projection.name)) - continue; - - materialized_projections.insert(projection.name); - } - for (const auto & projection : metadata_snapshot->getProjections()) { if (!source.hasProjection(projection.name)) continue; + /// Always rebuild broken projections. 
+ if (source.hasBrokenProjection(projection.name)) + { + materialized_projections.insert(projection.name); + continue; + } + if (need_rebuild_projections) { materialized_projections.insert(projection.name); diff --git a/src/Processors/QueryPlan/Optimizations/projectionsCommon.cpp b/src/Processors/QueryPlan/Optimizations/projectionsCommon.cpp index c3b3449857b..9ebd5aaa32f 100644 --- a/src/Processors/QueryPlan/Optimizations/projectionsCommon.cpp +++ b/src/Processors/QueryPlan/Optimizations/projectionsCommon.cpp @@ -224,7 +224,7 @@ bool analyzeProjectionCandidate( { const auto & created_projections = part_with_ranges.data_part->getProjectionParts(); auto it = created_projections.find(candidate.projection->name); - if (it != created_projections.end()) + if (it != created_projections.end() && !it->second->is_broken) { projection_parts.push_back(it->second); } diff --git a/src/Storages/MergeTree/IMergeTreeDataPart.cpp b/src/Storages/MergeTree/IMergeTreeDataPart.cpp index bc81758675e..85ce112d9a1 100644 --- a/src/Storages/MergeTree/IMergeTreeDataPart.cpp +++ b/src/Storages/MergeTree/IMergeTreeDataPart.cpp @@ -745,8 +745,7 @@ void IMergeTreeDataPart::loadProjections(bool require_columns_checksums, bool ch LOG_ERROR(&Poco::Logger::get("IMergeTreeDataPart"), "Cannot load projection {}, will consider it broken", projection.name); - addBrokenProjectionPart(projection.name, std::move(part), getCurrentExceptionMessage(false), getCurrentExceptionCode()); - continue; + part->setBrokenReason(getCurrentExceptionMessage(false), getCurrentExceptionCode()); } addProjectionPart(projection.name, std::move(part)); @@ -2147,44 +2146,30 @@ std::optional IMergeTreeDataPart::getStreamNameForColumn( return getStreamNameOrHash(stream_name, extension, storage_); } -void IMergeTreeDataPart::addBrokenProjectionPart( - const String & projection_name, - std::shared_ptr projection_part, - const String & message, - int code) -{ - projection_part->setBrokenReason(message, code); - bool inserted = broken_projection_parts.emplace(projection_name, projection_part).second; - if (!inserted) - throw Exception(ErrorCodes::LOGICAL_ERROR, "Projection part {} in part {} is already added to a broken projection parts list", projection_name, name); -} - void IMergeTreeDataPart::markProjectionPartAsBroken(const String & projection_name, const String & message, int code) const { - std::lock_guard lock(broken_projections_mutex); - auto it = projection_parts.find(projection_name); if (it == projection_parts.end()) throw Exception(ErrorCodes::LOGICAL_ERROR, "There is no projection part '{}'", projection_name); - it->second->setBrokenReason(message, code); - - broken_projection_parts.emplace(projection_name, it->second); - projection_parts.erase(it); -} - -void IMergeTreeDataPart::setBrokenReason(const String & message, int code) -{ - std::lock_guard lock(broken_projections_mutex); - is_broken = true; - exception = message; - exception_code = code; } bool IMergeTreeDataPart::hasBrokenProjection(const String & projection_name) const { - std::lock_guard lock(broken_projections_mutex); - return broken_projection_parts.contains(projection_name); + auto it = projection_parts.find(projection_name); + if (it == projection_parts.end()) + return false; + return it->second->is_broken; +} + +void IMergeTreeDataPart::setBrokenReason(const String & message, int code) const +{ + std::lock_guard lock(broken_reason_mutex); + if (is_broken) + return; + is_broken = true; + exception = message; + exception_code = code; } bool isCompactPart(const 
MergeTreeDataPartPtr & data_part) diff --git a/src/Storages/MergeTree/IMergeTreeDataPart.h b/src/Storages/MergeTree/IMergeTreeDataPart.h index 52a1541e15f..9af2c16f1e8 100644 --- a/src/Storages/MergeTree/IMergeTreeDataPart.h +++ b/src/Storages/MergeTree/IMergeTreeDataPart.h @@ -259,7 +259,7 @@ public: mutable std::atomic is_broken {false}; mutable std::string exception; mutable int exception_code = 0; - mutable std::mutex broken_projections_mutex; + mutable std::mutex broken_reason_mutex; /// Indicates that the part was marked Outdated by PartCheckThread because the part was not committed to ZooKeeper mutable bool is_unexpected_local_part = false; @@ -411,14 +411,10 @@ public: const std::map> & getProjectionParts() const { return projection_parts; } - const std::map> & getBrokenProjectionParts() const { return broken_projection_parts; } - MergeTreeDataPartBuilder getProjectionPartBuilder(const String & projection_name, bool is_temp_projection = false); void addProjectionPart(const String & projection_name, std::shared_ptr && projection_part); - void addBrokenProjectionPart(const String & projection_name, std::shared_ptr projection_part, const String & message, int code); - void markProjectionPartAsBroken(const String & projection_name, const String & message, int code) const; bool hasProjection(const String & projection_name) const { return projection_parts.contains(projection_name); } @@ -427,6 +423,8 @@ public: void loadProjections(bool require_columns_checksums, bool check_consistency, bool if_not_loaded = false); + void setBrokenReason(const String & message, int code) const; + /// Return set of metadata file names without checksums. For example, /// columns.txt or checksums.txt itself. NameSet getFileNamesWithoutChecksums() const; @@ -579,7 +577,6 @@ protected: String parent_part_name; mutable std::map> projection_parts; - mutable std::map> broken_projection_parts; mutable PartMetadataManagerPtr metadata_manager; @@ -693,8 +690,6 @@ private: void incrementStateMetric(MergeTreeDataPartState state) const; void decrementStateMetric(MergeTreeDataPartState state) const; - void setBrokenReason(const String & message, int code); - /// This ugly flag is needed for debug assertions only mutable bool part_is_probably_removed_from_disk = false; }; diff --git a/src/Storages/MergeTree/MergeTask.cpp b/src/Storages/MergeTree/MergeTask.cpp index adb1ca72e46..53ba1a57b27 100644 --- a/src/Storages/MergeTree/MergeTask.cpp +++ b/src/Storages/MergeTree/MergeTask.cpp @@ -696,8 +696,9 @@ bool MergeTask::MergeProjectionsStage::mergeMinMaxIndexAndPrepareProjections() c MergeTreeData::DataPartsVector projection_parts; for (const auto & part : global_ctx->future_part->parts) { - auto it = part->getProjectionParts().find(projection.name); - if (it != part->getProjectionParts().end()) + auto actual_projection_parts = part->getProjectionParts(); + auto it = actual_projection_parts.find(projection.name); + if (it != actual_projection_parts.end() && !it->second->is_broken) projection_parts.push_back(it->second); } if (projection_parts.size() < global_ctx->future_part->parts.size()) diff --git a/src/Storages/MergeTree/MergeTreeData.cpp b/src/Storages/MergeTree/MergeTreeData.cpp index 152c386e188..0725c3cbf32 100644 --- a/src/Storages/MergeTree/MergeTreeData.cpp +++ b/src/Storages/MergeTree/MergeTreeData.cpp @@ -5737,7 +5737,7 @@ MergeTreeData::getDataPartsVectorForInternalUsage(const DataPartStates & afforda } MergeTreeData::ProjectionPartsVector -MergeTreeData::getProjectionPartsVectorForInternalUsage(const 
DataPartStates & affordable_states, bool fill_states) const +MergeTreeData::getProjectionPartsVectorForInternalUsage(const DataPartStates & affordable_states, DataPartStateVector * out_states) const { auto lock = lockParts(); ProjectionPartsVector res; @@ -5749,20 +5749,14 @@ MergeTreeData::getProjectionPartsVectorForInternalUsage(const DataPartStates & a res.data_parts.push_back(part); for (const auto & [_, projection_part] : part->getProjectionParts()) res.projection_parts.push_back(projection_part); - for (const auto & [_, projection_part] : part->getBrokenProjectionParts()) - res.broken_projection_parts.push_back(projection_part); } } - if (fill_states) + if (out_states != nullptr) { - res.projection_parts_states.resize(res.projection_parts.size()); + out_states->resize(res.projection_parts.size()); for (size_t i = 0; i < res.projection_parts.size(); ++i) - (res.projection_parts_states)[i] = res.projection_parts[i]->getParentPart()->getState(); - - res.broken_projection_parts_states.resize(res.broken_projection_parts.size()); - for (size_t i = 0; i < res.broken_projection_parts.size(); ++i) - (res.broken_projection_parts_states)[i] = res.broken_projection_parts[i]->getParentPart()->getState(); + (*out_states)[i] = res.projection_parts[i]->getParentPart()->getState(); } return res; @@ -5815,7 +5809,7 @@ bool MergeTreeData::supportsLightweightDelete() const return true; } -MergeTreeData::ProjectionPartsVector MergeTreeData::getAllProjectionPartsVector(bool fill_states) const +MergeTreeData::ProjectionPartsVector MergeTreeData::getAllProjectionPartsVector(MergeTreeData::DataPartStateVector * out_states) const { ProjectionPartsVector res; auto lock = lockParts(); @@ -5826,15 +5820,11 @@ MergeTreeData::ProjectionPartsVector MergeTreeData::getAllProjectionPartsVector( res.projection_parts.push_back(projection_part); } - if (fill_states) + if (out_states != nullptr) { - res.projection_parts_states.resize(res.projection_parts.size()); + out_states->resize(res.projection_parts.size()); for (size_t i = 0; i < res.projection_parts.size(); ++i) - (res.projection_parts_states)[i] = res.projection_parts[i]->getParentPart()->getState(); - - res.broken_projection_parts_states.resize(res.broken_projection_parts.size()); - for (size_t i = 0; i < res.broken_projection_parts.size(); ++i) - (res.broken_projection_parts_states)[i] = res.broken_projection_parts[i]->getParentPart()->getState(); + (*out_states)[i] = res.projection_parts[i]->getParentPart()->getState(); } return res; } @@ -7050,8 +7040,7 @@ std::pair MergeTreeData::cloneAn } } - auto projections = src_part->getProjectionParts(); - for (const auto & [name, projection_part] : projections) + for (const auto & [name, projection_part] : src_part->getProjectionParts()) { const auto & projection_storage = projection_part->getDataPartStorage(); for (auto it = projection_storage.iterate(); it->isValid(); it->next()) @@ -7654,21 +7643,39 @@ MovePartsOutcome MergeTreeData::moveParts(const CurrentlyMovingPartsTaggerPtr & bool MergeTreeData::partsContainSameProjections(const DataPartPtr & left, const DataPartPtr & right, String & out_reason) { - if (left->getProjectionParts().size() != right->getProjectionParts().size()) + auto remove_broken_parts = [](auto & parts) + { + std::set broken_projection_parts; + for (const auto & [name, part] : parts) + { + if (part->is_broken) + broken_projection_parts.emplace(name); + } + for (const auto & name : broken_projection_parts) + parts.erase(name); + }; + + auto left_projection_parts = left->getProjectionParts(); + 
auto right_projection_parts = right->getProjectionParts(); + + remove_broken_parts(left_projection_parts); + remove_broken_parts(right_projection_parts); + + if (left_projection_parts.size() != right_projection_parts.size()) { out_reason = fmt::format( "Parts have different number of projections: {} in part '{}' and {} in part '{}'", - left->getProjectionParts().size(), + left_projection_parts.size(), left->name, - right->getProjectionParts().size(), + right_projection_parts.size(), right->name ); return false; } - for (const auto & [name, _] : left->getProjectionParts()) + for (const auto & [name, _] : left_projection_parts) { - if (!right->hasProjection(name)) + if (!right_projection_parts.contains(name)) { out_reason = fmt::format( "The part '{}' doesn't have projection '{}' while part '{}' does", right->name, name, left->name
diff --git a/src/Storages/MergeTree/MergeTreeData.h b/src/Storages/MergeTree/MergeTreeData.h
index 4ef3b75988b..18087c6b059 100644
--- a/src/Storages/MergeTree/MergeTreeData.h
+++ b/src/Storages/MergeTree/MergeTreeData.h
@@ -489,7 +489,7 @@ public: const DataPartStates & affordable_states, DataPartStateVector * out_states = nullptr) const; /// Same as above but only returns projection parts ProjectionPartsVector getProjectionPartsVectorForInternalUsage( - const DataPartStates & affordable_states, bool fill_states = false) const; + const DataPartStates & affordable_states, MergeTreeData::DataPartStateVector * out_states) const; /// Returns absolutely all parts (and snapshot of their states)
@@ -501,7 +501,7 @@ public: size_t getTotalMarksCount() const; /// Same as above but only returns projection parts - ProjectionPartsVector getAllProjectionPartsVector(bool fill_states = false) const; + ProjectionPartsVector getAllProjectionPartsVector(MergeTreeData::DataPartStateVector * out_states = nullptr) const; /// Returns parts in Active state DataParts getDataPartsForInternalUsage() const;
diff --git a/src/Storages/MergeTree/MutateTask.cpp b/src/Storages/MergeTree/MutateTask.cpp
index 8ef1621b647..6a1ceec1cd3 100644
--- a/src/Storages/MergeTree/MutateTask.cpp
+++ b/src/Storages/MergeTree/MutateTask.cpp
@@ -514,8 +514,8 @@ static std::set<ProjectionDescriptionRawPtr> getProjectionsToRecalculate( bool need_recalculate = materialized_projections.contains(projection.name) || (!is_full_part_storage - && (source_part->hasProjection(projection.name) - || source_part->hasBrokenProjection(projection.name))); + && source_part->hasProjection(projection.name) + && !source_part->hasBrokenProjection(projection.name)); if (need_recalculate) projections_to_recalc.insert(&projection);
@@ -1370,8 +1370,8 @@ private: bool need_recalculate = ctx->materialized_projections.contains(projection.name) || (!is_full_part_storage - && (ctx->source_part->hasProjection(projection.name) - || ctx->source_part->hasBrokenProjection(projection.name))); + && ctx->source_part->hasProjection(projection.name) + && !ctx->source_part->hasBrokenProjection(projection.name)); if (need_recalculate) {
diff --git a/src/Storages/MergeTree/checkDataPart.cpp b/src/Storages/MergeTree/checkDataPart.cpp
index 74af7cbb77c..8feabf344b5 100644
--- a/src/Storages/MergeTree/checkDataPart.cpp
+++ b/src/Storages/MergeTree/checkDataPart.cpp
@@ -272,14 +272,16 @@ static IMergeTreeDataPart::Checksums checkDataPart( } } - auto check_projection = [&](const String & name, std::shared_ptr<IMergeTreeDataPart> projection) + for (const auto & [name, projection] : data_part->getProjectionParts()) { + if (is_cancelled()) + return {}; + auto projection_file = name + ".proj"; if
(!throw_on_broken_projection && projection->is_broken) { projections_on_disk.erase(projection_file); checksums_txt.remove(projection_file); - return; } IMergeTreeDataPart::Checksums projection_checksums; @@ -297,10 +299,11 @@ static IMergeTreeDataPart::Checksums checkDataPart( if (isRetryableException(std::current_exception())) throw; - LOG_TEST(log, "Marking projection {} as broken ({})", name, projection_file); - - if (!data_part->hasBrokenProjection(name)) - data_part->markProjectionPartAsBroken(name, getCurrentExceptionMessage(false), getCurrentExceptionCode()); + if (!projection->is_broken) + { + LOG_TEST(log, "Marking projection {} as broken ({})", name, projection_file); + projection->setBrokenReason(getCurrentExceptionMessage(false), getCurrentExceptionCode()); + } is_broken_projection = true; if (throw_on_broken_projection) @@ -308,7 +311,6 @@ static IMergeTreeDataPart::Checksums checkDataPart( projections_on_disk.erase(projection_file); checksums_txt.remove(projection_file); - return; } checksums_data.files[projection_file] = IMergeTreeDataPart::Checksums::Checksum( @@ -316,24 +318,6 @@ static IMergeTreeDataPart::Checksums checkDataPart( projection_checksums.getTotalChecksumUInt128()); projections_on_disk.erase(projection_file); - }; - - auto broken_projection_parts = data_part->getBrokenProjectionParts(); /// Iterate over copy - for (const auto & [name, projection] : broken_projection_parts) - { - if (is_cancelled()) - return {}; - else - check_projection(name, projection); - } - - auto projection_parts = data_part->getProjectionParts(); /// Iterate over copy - for (const auto & [name, projection] : projection_parts) - { - if (is_cancelled()) - return {}; - else - check_projection(name, projection); } if (require_checksums && !projections_on_disk.empty()) diff --git a/src/Storages/System/StorageSystemPartsBase.cpp b/src/Storages/System/StorageSystemPartsBase.cpp index e97c13b1fed..513af6cfc46 100644 --- a/src/Storages/System/StorageSystemPartsBase.cpp +++ b/src/Storages/System/StorageSystemPartsBase.cpp @@ -64,7 +64,7 @@ StoragesInfo::getParts(MergeTreeData::DataPartStateVector & state, bool has_stat } MergeTreeData::ProjectionPartsVector -StoragesInfo::getProjectionParts(bool fill_states, bool has_state_column) const +StoragesInfo::getProjectionParts(MergeTreeData::DataPartStateVector & state, bool has_state_column) const { if (data->getInMemoryMetadataPtr()->projections.empty()) return {}; @@ -74,12 +74,12 @@ StoragesInfo::getProjectionParts(bool fill_states, bool has_state_column) const { /// If has_state_column is requested, return all states. 
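/// (An illustrative sketch rather than code from this patch: has_state_column is set
/// when a query touches the virtual `_state` column of the system.parts family, e.g.
///     SELECT name, _state FROM system.projection_parts
/// and in that case projection parts in all states are returned; otherwise only
/// Active, or Active plus Outdated, parts are visible.)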
if (!has_state_column) - return data->getProjectionPartsVectorForInternalUsage({State::Active, State::Outdated}, fill_states); + return data->getProjectionPartsVectorForInternalUsage({State::Active, State::Outdated}, &state); - return data->getAllProjectionPartsVector(fill_states); + return data->getAllProjectionPartsVector(&state); } - return data->getProjectionPartsVectorForInternalUsage({State::Active}, fill_states); + return data->getProjectionPartsVectorForInternalUsage({State::Active}, &state); } StoragesInfoStream::StoragesInfoStream(const SelectQueryInfo & query_info, ContextPtr context) diff --git a/src/Storages/System/StorageSystemPartsBase.h b/src/Storages/System/StorageSystemPartsBase.h index e0e81f0d24d..c3d2e64b303 100644 --- a/src/Storages/System/StorageSystemPartsBase.h +++ b/src/Storages/System/StorageSystemPartsBase.h @@ -25,7 +25,7 @@ struct StoragesInfo explicit operator bool() const { return storage != nullptr; } MergeTreeData::DataPartsVector getParts(MergeTreeData::DataPartStateVector & state, bool has_state_column) const; - MergeTreeData::ProjectionPartsVector getProjectionParts(bool fill_states, bool has_state_column) const; + MergeTreeData::ProjectionPartsVector getProjectionParts(MergeTreeData::DataPartStateVector & state, bool has_state_column) const; }; /** A helper class that enumerates the storages that match given query. */ diff --git a/src/Storages/System/StorageSystemProjectionParts.cpp b/src/Storages/System/StorageSystemProjectionParts.cpp index 44bdb294a2d..3dbe6823dac 100644 --- a/src/Storages/System/StorageSystemProjectionParts.cpp +++ b/src/Storages/System/StorageSystemProjectionParts.cpp @@ -97,14 +97,15 @@ void StorageSystemProjectionParts::processNextStorage( ContextPtr, MutableColumns & columns, std::vector & columns_mask, const StoragesInfo & info, bool has_state_column) { using State = MergeTreeDataPartState; - MergeTreeData::ProjectionPartsVector all_parts = info.getProjectionParts(true, has_state_column); - auto fill_part_info = [&](size_t part_number, const MergeTreeData::DataPartsVector & parts, const MergeTreeData::DataPartStateVector & states) + MergeTreeData::DataPartStateVector all_parts_state; + MergeTreeData::ProjectionPartsVector all_parts = info.getProjectionParts(all_parts_state, has_state_column); + for (size_t part_number = 0; part_number < all_parts.projection_parts.size(); ++part_number) { - const auto & part = parts[part_number]; + const auto & part = all_parts.projection_parts[part_number]; const auto * parent_part = part->getParentPart(); chassert(parent_part); - auto part_state = states[part_number]; + auto part_state = all_parts_state[part_number]; ColumnSize columns_size = part->getTotalColumnsSize(); ColumnSize parent_columns_size = parent_part->getTotalColumnsSize(); @@ -275,7 +276,12 @@ void StorageSystemProjectionParts::processNextStorage( add_ttl_info_map(part->ttl_infos.moves_ttl); if (columns_mask[src_index++]) - columns[res_index++]->insert(queryToString(part->default_codec->getCodecDesc())); + { + if (part->default_codec) + columns[res_index++]->insert(queryToString(part->default_codec->getCodecDesc())); + else + columns[res_index++]->insertDefault(); + } add_ttl_info_map(part->ttl_infos.recompression_ttl); add_ttl_info_map(part->ttl_infos.group_by_ttl); @@ -287,7 +293,7 @@ void StorageSystemProjectionParts::processNextStorage( if (part->is_broken) { - std::lock_guard lock(part->broken_projections_mutex); + std::lock_guard lock(part->broken_reason_mutex); if (columns_mask[src_index++]) 
columns[res_index++]->insert(part->exception_code); if (columns_mask[src_index++]) @@ -306,18 +312,6 @@ void StorageSystemProjectionParts::processNextStorage( /// Do not use part->getState*, it can be changed from different thread if (has_state_column) columns[res_index++]->insert(IMergeTreeDataPart::stateString(part_state)); - }; - - for (size_t part_number = 0; part_number < all_parts.projection_parts.size(); ++part_number) - { - auto part = all_parts.projection_parts[part_number]; - fill_part_info(part_number, all_parts.projection_parts, all_parts.projection_parts_states); - } - - for (size_t part_number = 0; part_number < all_parts.broken_projection_parts.size(); ++part_number) - { - auto part = all_parts.broken_projection_parts[part_number]; - fill_part_info(part_number, all_parts.broken_projection_parts, all_parts.broken_projection_parts_states); } } diff --git a/src/Storages/System/StorageSystemProjectionPartsColumns.cpp b/src/Storages/System/StorageSystemProjectionPartsColumns.cpp index 3f4224e46bb..06becc6d91c 100644 --- a/src/Storages/System/StorageSystemProjectionPartsColumns.cpp +++ b/src/Storages/System/StorageSystemProjectionPartsColumns.cpp @@ -103,14 +103,15 @@ void StorageSystemProjectionPartsColumns::processNextStorage( } /// Go through the list of projection parts. - MergeTreeData::ProjectionPartsVector all_parts = info.getProjectionParts(true, has_state_column); - auto fill_part_info = [&](size_t part_number, const MergeTreeData::DataPartsVector & parts, const MergeTreeData::DataPartStateVector & states) + MergeTreeData::DataPartStateVector all_parts_state; + MergeTreeData::ProjectionPartsVector all_parts = info.getProjectionParts(all_parts_state, has_state_column); + for (size_t part_number = 0; part_number < all_parts.projection_parts.size(); ++part_number) { - const auto & part = parts[part_number]; + const auto & part = all_parts.projection_parts[part_number]; const auto * parent_part = part->getParentPart(); chassert(parent_part); - auto part_state = states[part_number]; + auto part_state = all_parts_state[part_number]; auto columns_size = part->getTotalColumnsSize(); auto parent_columns_size = parent_part->getTotalColumnsSize(); @@ -259,18 +260,6 @@ void StorageSystemProjectionPartsColumns::processNextStorage( if (has_state_column) columns[res_index++]->insert(part->stateString()); } - }; - - for (size_t part_number = 0; part_number < all_parts.projection_parts.size(); ++part_number) - { - auto part = all_parts.projection_parts[part_number]; - fill_part_info(part_number, all_parts.projection_parts, all_parts.projection_parts_states); - } - - for (size_t part_number = 0; part_number < all_parts.broken_projection_parts.size(); ++part_number) - { - auto part = all_parts.broken_projection_parts[part_number]; - fill_part_info(part_number, all_parts.broken_projection_parts, all_parts.broken_projection_parts_states); } } diff --git a/tests/queries/0_stateless/02916_broken_projection.reference b/tests/queries/0_stateless/02916_broken_projection.reference index d0b07e081db..62966036eed 100644 --- a/tests/queries/0_stateless/02916_broken_projection.reference +++ b/tests/queries/0_stateless/02916_broken_projection.reference @@ -8,15 +8,15 @@ all_1_1_0 1 ['proj','proj_2'] all_2_2_0 1 ['proj','proj_2'] all_3_3_0 1 ['proj','proj_2'] select from projection 'proj' -16 12 +16 used projections -SELECT c FROM test WHERE d == 12 OR d == 16; ['default.test.proj'] +SELECT c FROM test WHERE d == 12 OR d == 16 ORDER BY c; ['default.test.proj'] select from projection 'proj_2' 12 16 used 
projections -SELECT d FROM test WHERE c == 12 OR c == 16; ['default.test.proj_2'] +SELECT d FROM test WHERE c == 12 OR c == 16 ORDER BY d; ['default.test.proj_2'] check table 1 0 @@ -27,23 +27,20 @@ all_1_1_0 1 ['proj','proj_2'] all_2_2_0 1 ['proj','proj_2'] all_3_3_0 1 ['proj','proj_2'] select from projection 'proj' -16 12 +16 used projections -SELECT c FROM test WHERE d == 12 OR d == 16; ['default.test.proj'] +SELECT c FROM test WHERE d == 12 OR d == 16 ORDER BY c; ['default.test.proj'] select from projection 'proj_2' -16 12 +16 used projections -SELECT d FROM test WHERE c == 12 OR c == 16; ['default.test.proj_2'] +SELECT d FROM test WHERE c == 12 OR c == 16 ORDER BY d; ['default.test.proj_2'] check table 0 broken projections info all_2_2_0 proj FILE_DOESNT_EXIST check table full -all_0_0_0 1 -all_1_1_0 1 -all_3_3_0 1 all_2_2_0 0 Part all_2_2_0 has a broken projection. It will be ignored. 0 broke data of part 'proj_2' (parent part: all_2_2_0) @@ -52,13 +49,13 @@ all_2_2_0 proj FILE_DOESNT_EXIST system.parts all_0_0_0 1 ['proj','proj_2'] all_1_1_0 1 ['proj','proj_2'] -all_2_2_0 1 ['proj_2'] +all_2_2_0 1 ['proj','proj_2'] all_3_3_0 1 ['proj','proj_2'] select from projection 'proj' -16 12 +16 used projections -SELECT c FROM test WHERE d == 12 OR d == 16; ['default.test.proj'] +SELECT c FROM test WHERE d == 12 OR d == 16 ORDER BY c; ['default.test.proj'] select from projection 'proj_2' FILE_DOESNT_EXIST check table @@ -69,18 +66,18 @@ all_2_2_0 proj_2 NO_FILE_IN_DATA_PART system.parts all_0_0_0 1 ['proj','proj_2'] all_1_1_0 1 ['proj','proj_2'] -all_2_2_0 1 [] +all_2_2_0 1 ['proj','proj_2'] all_3_3_0 1 ['proj','proj_2'] select from projection 'proj' -16 12 +16 used projections -SELECT c FROM test WHERE d == 12 OR d == 16; ['default.test.proj'] +SELECT c FROM test WHERE d == 12 OR d == 16 ORDER BY c; ['default.test.proj'] select from projection 'proj_2' -16 12 +16 used projections -SELECT d FROM test WHERE c == 12 OR c == 16; ['default.test.proj_2'] +SELECT d FROM test WHERE c == 12 OR c == 16 ORDER BY d; ['default.test.proj_2'] check table 0 0 @@ -99,21 +96,21 @@ all_3_3_0 proj_2 NO_FILE_IN_DATA_PART system.parts all_0_0_0 1 ['proj','proj_2'] all_1_1_0 1 ['proj','proj_2'] -all_2_2_0 1 [] -all_3_3_0 0 ['proj'] +all_2_2_0 1 ['proj','proj_2'] +all_3_3_0 0 ['proj','proj_2'] all_3_5_1 1 ['proj'] all_4_4_0 0 ['proj','proj_2'] all_5_5_0 0 ['proj','proj_2'] select from projection 'proj' -16 12 +16 used projections -SELECT c FROM test WHERE d == 12 OR d == 16; ['default.test.proj'] +SELECT c FROM test WHERE d == 12 OR d == 16 ORDER BY c; ['default.test.proj'] select from projection 'proj_2' -16 12 +16 used projections -SELECT d FROM test WHERE c == 12 OR c == 16; ['default.test.proj_2'] +SELECT d FROM test WHERE c == 12 OR c == 16 ORDER BY d; ['default.test.proj_2'] check table 0 0 @@ -135,76 +132,66 @@ all_2_2_0 proj_2 FILE_DOESNT_EXIST all_3_3_0 proj_2 FILE_DOESNT_EXIST system.parts all_0_0_0 1 ['proj','proj_2'] -all_1_1_0 1 [] -all_2_2_0 1 [] -all_3_3_0 0 ['proj'] +all_1_1_0 1 ['proj','proj_2'] +all_2_2_0 1 ['proj','proj_2'] +all_3_3_0 0 ['proj','proj_2'] all_3_5_1 1 ['proj'] all_4_4_0 0 ['proj','proj_2'] all_5_5_0 0 ['proj','proj_2'] select from projection 'proj' -16 12 +16 used projections -SELECT c FROM test WHERE d == 12 OR d == 16; ['default.test.proj'] +SELECT c FROM test WHERE d == 12 OR d == 16 ORDER BY c; ['default.test.proj'] select from projection 'proj_2' -16 12 +16 used projections -SELECT d FROM test WHERE c == 12 OR c == 16; ['default.test.proj_2'] +SELECT d FROM test WHERE c == 12 
OR c == 16 ORDER BY d; ['default.test.proj_2'] check table 0 check table full -all_3_5_1 1 -all_0_0_0 1 -all_1_1_0 0 Part all_1_1_0 has a broken projection. It will be ignored. all_2_2_0 0 Part all_2_2_0 has a broken projection. It will be ignored. +all_1_1_0 0 Part all_1_1_0 has a broken projection. It will be ignored. materialize projection proj check table full -all_3_5_1_6 1 -all_0_0_0_6 1 -all_2_2_0_6 1 -all_1_1_0_6 1 system.parts all_0_0_0 0 ['proj','proj_2'] all_0_0_0_6 1 ['proj','proj_2'] -all_1_1_0 0 [] +all_1_1_0 0 ['proj','proj_2'] all_1_1_0_6 1 ['proj','proj_2'] -all_2_2_0 0 [] +all_2_2_0 0 ['proj','proj_2'] all_2_2_0_6 1 ['proj','proj_2'] -all_3_3_0 0 ['proj'] +all_3_3_0 0 ['proj','proj_2'] all_3_5_1 0 ['proj'] all_3_5_1_6 1 ['proj'] all_4_4_0 0 ['proj','proj_2'] all_5_5_0 0 ['proj','proj_2'] select from projection 'proj' -16 12 +16 used projections -SELECT c FROM test WHERE d == 12 OR d == 16; ['default.test.proj'] +SELECT c FROM test WHERE d == 12 OR d == 16 ORDER BY c; ['default.test.proj'] select from projection 'proj_2' 12 16 used projections -SELECT d FROM test WHERE c == 12 OR c == 16; ['default.test.proj_2'] +SELECT d FROM test WHERE c == 12 OR c == 16 ORDER BY d; ['default.test.proj_2'] check table 1 materialize projection proj_2 check table full -all_3_5_1_7 1 -all_0_0_0_7 1 -all_2_2_0_7 1 -all_1_1_0_7 1 system.parts all_0_0_0 0 ['proj','proj_2'] all_0_0_0_6 0 ['proj','proj_2'] all_0_0_0_7 1 ['proj','proj_2'] -all_1_1_0 0 [] +all_1_1_0 0 ['proj','proj_2'] all_1_1_0_6 0 ['proj','proj_2'] all_1_1_0_7 1 ['proj','proj_2'] -all_2_2_0 0 [] +all_2_2_0 0 ['proj','proj_2'] all_2_2_0_6 0 ['proj','proj_2'] all_2_2_0_7 1 ['proj','proj_2'] -all_3_3_0 0 ['proj'] +all_3_3_0 0 ['proj','proj_2'] all_3_5_1 0 ['proj'] all_3_5_1_6 0 ['proj'] all_3_5_1_7 1 ['proj','proj_2'] @@ -214,11 +201,11 @@ select from projection 'proj' 12 16 used projections -SELECT c FROM test WHERE d == 12 OR d == 16; ['default.test.proj'] +SELECT c FROM test WHERE d == 12 OR d == 16 ORDER BY c; ['default.test.proj'] select from projection 'proj_2' -16 12 +16 used projections -SELECT d FROM test WHERE c == 12 OR c == 16; ['default.test.proj_2'] +SELECT d FROM test WHERE c == 12 OR c == 16 ORDER BY d; ['default.test.proj_2'] check table 1 diff --git a/tests/queries/0_stateless/02916_broken_projection.sh b/tests/queries/0_stateless/02916_broken_projection.sh index 81adfe6e49d..4748506d9cf 100755 --- a/tests/queries/0_stateless/02916_broken_projection.sh +++ b/tests/queries/0_stateless/02916_broken_projection.sh @@ -23,7 +23,7 @@ CREATE TABLE test SELECT d ORDER BY c ) ) -ENGINE = ReplicatedMergeTree('/test2/$CLICKHOUSE_TEST_ZOOKEEPER_PREFIX/', '1') PRIMARY KEY (a) +ENGINE = ReplicatedMergeTree('/test3/$CLICKHOUSE_TEST_ZOOKEEPER_PREFIX/', '1') PRIMARY KEY (a) SETTINGS min_bytes_for_wide_part = 0, max_parts_to_merge_at_once=3, enable_vertical_merge_algorithm=1, @@ -119,9 +119,9 @@ function check() if [ "$expect_broken_part" = "proj" ] then - $CLICKHOUSE_CLIENT --optimize_use_projections 1 --query_id $query_id -q "SELECT c FROM test WHERE d == 12;" 2>&1 | grep -o $expected_error + $CLICKHOUSE_CLIENT --optimize_use_projections 1 --query_id $query_id -q "SELECT c FROM test WHERE d == 12 ORDER BY c;" 2>&1 | grep -o $expected_error else - $CLICKHOUSE_CLIENT --optimize_use_projections 1 --query_id $query_id -q "SELECT c FROM test WHERE d == 12 OR d == 16;" + $CLICKHOUSE_CLIENT --optimize_use_projections 1 --query_id $query_id -q "SELECT c FROM test WHERE d == 12 OR d == 16 ORDER BY c;" echo 'used projections' 
$CLICKHOUSE_CLIENT -nm -q " SYSTEM FLUSH LOGS; @@ -134,9 +134,9 @@ function check() if [ "$expect_broken_part" = "proj_2" ] then - $CLICKHOUSE_CLIENT --optimize_use_projections 1 --query_id $query_id -q "SELECT d FROM test WHERE c == 12;" 2>&1 | grep -o $expected_error + $CLICKHOUSE_CLIENT --optimize_use_projections 1 --query_id $query_id -q "SELECT d FROM test WHERE c == 12 ORDER BY d;" 2>&1 | grep -o $expected_error else - $CLICKHOUSE_CLIENT --optimize_use_projections 1 --query_id $query_id -q "SELECT d FROM test WHERE c == 12 OR c == 16;" + $CLICKHOUSE_CLIENT --optimize_use_projections 1 --query_id $query_id -q "SELECT d FROM test WHERE c == 12 OR c == 16 ORDER BY d;" echo 'used projections' $CLICKHOUSE_CLIENT -nm -q " SYSTEM FLUSH LOGS; @@ -173,7 +173,7 @@ function materialize_projection function check_table_full() { echo 'check table full' - $CLICKHOUSE_CLIENT -q "CHECK TABLE test SETTINGS check_query_single_value_result = 0" + $CLICKHOUSE_CLIENT -q "CHECK TABLE test SETTINGS check_query_single_value_result = 0" | grep "broken" } From 8ea4e302a50db872a798c6cd39c6f5edb255ec49 Mon Sep 17 00:00:00 2001 From: kssenii Date: Thu, 16 Nov 2023 19:43:32 +0100 Subject: [PATCH 003/245] Fix style check --- .../0_stateless/02916_broken_projection.sh | 31 ++++++++++++++++--- 1 file changed, 26 insertions(+), 5 deletions(-) diff --git a/tests/queries/0_stateless/02916_broken_projection.sh b/tests/queries/0_stateless/02916_broken_projection.sh index 4748506d9cf..bf0ec61fd76 100755 --- a/tests/queries/0_stateless/02916_broken_projection.sh +++ b/tests/queries/0_stateless/02916_broken_projection.sh @@ -1,4 +1,5 @@ #!/usr/bin/env bash +# shellcheck disable=SC2046 CURDIR=$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd) # shellcheck source=../shell_config.sh @@ -125,7 +126,7 @@ function check() echo 'used projections' $CLICKHOUSE_CLIENT -nm -q " SYSTEM FLUSH LOGS; - SELECT query, projections FROM system.query_log WHERE query_id='$query_id' and type='QueryFinish' + SELECT query, projections FROM system.query_log WHERE current_database=currentDatabase() AND query_id='$query_id' AND type='QueryFinish' " fi @@ -140,7 +141,7 @@ function check() echo 'used projections' $CLICKHOUSE_CLIENT -nm -q " SYSTEM FLUSH LOGS; - SELECT query, projections FROM system.query_log WHERE query_id='$query_id' and type='QueryFinish' + SELECT query, projections FROM system.query_log WHERE current_database=currentDatabase() AND query_id='$query_id' AND type='QueryFinish' " fi @@ -148,10 +149,20 @@ function check() $CLICKHOUSE_CLIENT -q "CHECK TABLE test" } -function optimize_no_wait() +function optimize() { + final=$1 + no_wait=$2 + echo 'optimize' - $CLICKHOUSE_CLIENT -nm -q "OPTIMIZE TABLE test SETTINGS alter_sync=0;" + query="OPTIMIZE TABLE test" + + if [ $final -eq 1 ]; then + query="$query FINAL" + if [ $no_wait -eq 1 ]; then + query="$query SETTINGS alter_sync=0" + + $CLICKHOUSE_CLIENT -nm -q $query } function reattach() @@ -234,7 +245,7 @@ insert 25 5 # Merge will be retried and on second attempt it will succeed. # The result part all_3_5_1 will have only 1 projection - 'proj', because # it will skip 'proj_2' as it will see that one part does not have it anymore in the set of valid projections. 
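# A minimal sketch (assuming the is_broken / exception_code columns that this
# patch adds to system.projection_parts) of how the skipped broken projection
# parts can be inspected directly:
#   $CLICKHOUSE_CLIENT -q "SELECT parent_name, name, exception_code FROM system.projection_parts WHERE table = 'test' AND database = currentDatabase() AND is_broken"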
-optimize_no_wait +optimize 0 1 sleep 2 $CLICKHOUSE_CLIENT -nm -q " @@ -276,6 +287,16 @@ materialize_projection proj_2 check_table_full +break_projection proj all_3_5_1_7 data + +insert 30 5 + +optimize 1 0 + +insert 35 5 + +optimize 1 0 + check $CLICKHOUSE_CLIENT -nm -q " From 42b2fe9adcf4596e8e36231068911c5dbdc4948f Mon Sep 17 00:00:00 2001 From: kssenii Date: Tue, 21 Nov 2023 13:21:35 +0100 Subject: [PATCH 004/245] Fxi --- src/Common/ErrorCodes.cpp | 1 + .../ReplicatedMergeTreePartCheckThread.cpp | 10 +- src/Storages/MergeTree/checkDataPart.cpp | 17 +++- .../02916_broken_projection.reference | 93 ++++++++++--------- .../0_stateless/02916_broken_projection.sh | 39 ++++---- 5 files changed, 94 insertions(+), 66 deletions(-) diff --git a/src/Common/ErrorCodes.cpp b/src/Common/ErrorCodes.cpp index 28f8e6c6021..9c3aab5ad01 100644 --- a/src/Common/ErrorCodes.cpp +++ b/src/Common/ErrorCodes.cpp @@ -586,6 +586,7 @@ M(704, CANNOT_USE_QUERY_CACHE_WITH_NONDETERMINISTIC_FUNCTIONS) \ M(705, TABLE_NOT_EMPTY) \ M(706, LIBSSH_ERROR) \ + M(707, BROKEN_PROJECTION) \ M(999, KEEPER_EXCEPTION) \ M(1000, POCO_EXCEPTION) \ M(1001, STD_EXCEPTION) \ diff --git a/src/Storages/MergeTree/ReplicatedMergeTreePartCheckThread.cpp b/src/Storages/MergeTree/ReplicatedMergeTreePartCheckThread.cpp index 4468cf8e3bf..ba4d4869025 100644 --- a/src/Storages/MergeTree/ReplicatedMergeTreePartCheckThread.cpp +++ b/src/Storages/MergeTree/ReplicatedMergeTreePartCheckThread.cpp @@ -63,7 +63,7 @@ void ReplicatedMergeTreePartCheckThread::enqueuePart(const String & name, time_t if (parts_set.contains(name)) return; - LOG_TRACE(log, "Enqueueing {} for check after after {}s", name, delay_to_check_seconds); + LOG_TRACE(log, "Enqueueing {} for check after {}s", name, delay_to_check_seconds); parts_queue.emplace_back(name, std::chrono::steady_clock::now() + std::chrono::seconds(delay_to_check_seconds)); parts_set.insert(name); task->schedule(); @@ -385,17 +385,19 @@ ReplicatedCheckResult ReplicatedMergeTreePartCheckThread::checkPartImpl(const St if (isRetryableException(std::current_exception())) throw; - tryLogCurrentException(log, __PRETTY_FUNCTION__); - PreformattedMessage message; if (is_broken_projection) { - message = PreformattedMessage::create("Part {} has a broken projection. It will be ignored.", part_name); + message = PreformattedMessage::create( + "Part {} has a broken projections. It will be ignored. Broken projections info: \n{}", + part_name, getCurrentExceptionMessage(false)); LOG_DEBUG(log, message); result.action = ReplicatedCheckResult::DoNothing; } else { + tryLogCurrentException(log, __PRETTY_FUNCTION__); + message = PreformattedMessage::create("Part {} looks broken. 
Removing it and will try to fetch.", part_name); LOG_ERROR(log, message); result.action = ReplicatedCheckResult::TryFetchMissing; diff --git a/src/Storages/MergeTree/checkDataPart.cpp b/src/Storages/MergeTree/checkDataPart.cpp index 8feabf344b5..3bb6f763c8b 100644 --- a/src/Storages/MergeTree/checkDataPart.cpp +++ b/src/Storages/MergeTree/checkDataPart.cpp @@ -42,6 +42,7 @@ namespace ErrorCodes extern const int NO_FILE_IN_DATA_PART; extern const int NETWORK_ERROR; extern const int SOCKET_TIMEOUT; + extern const int BROKEN_PROJECTION; } @@ -272,6 +273,7 @@ static IMergeTreeDataPart::Checksums checkDataPart( } } + std::string broken_projections_message; for (const auto & [name, projection] : data_part->getProjectionParts()) { if (is_cancelled()) @@ -307,7 +309,15 @@ static IMergeTreeDataPart::Checksums checkDataPart( is_broken_projection = true; if (throw_on_broken_projection) - throw; + { + if (!broken_projections_message.empty()) + broken_projections_message += "\n"; + + broken_projections_message += fmt::format( + "Part {} has a broken projection {} (error: {})", + data_part->name, name, getCurrentExceptionMessage(false)); + continue; + } projections_on_disk.erase(projection_file); checksums_txt.remove(projection_file); @@ -320,6 +330,11 @@ static IMergeTreeDataPart::Checksums checkDataPart( projections_on_disk.erase(projection_file); } + if (throw_on_broken_projection && !broken_projections_message.empty()) + { + throw Exception(ErrorCodes::BROKEN_PROJECTION, broken_projections_message.data()); + } + if (require_checksums && !projections_on_disk.empty()) { throw Exception(ErrorCodes::UNEXPECTED_FILE_IN_DATA_PART, diff --git a/tests/queries/0_stateless/02916_broken_projection.reference b/tests/queries/0_stateless/02916_broken_projection.reference index 62966036eed..aee18a21fb8 100644 --- a/tests/queries/0_stateless/02916_broken_projection.reference +++ b/tests/queries/0_stateless/02916_broken_projection.reference @@ -7,42 +7,40 @@ all_0_0_0 1 ['proj','proj_2'] all_1_1_0 1 ['proj','proj_2'] all_2_2_0 1 ['proj','proj_2'] all_3_3_0 1 ['proj','proj_2'] -select from projection 'proj' +select from projection 'proj', expect error: 12 16 used projections -SELECT c FROM test WHERE d == 12 OR d == 16 ORDER BY c; ['default.test.proj'] -select from projection 'proj_2' +SELECT c FROM test WHERE d == 12 OR d == 16 ORDER BY c; ['test.test.proj'] +select from projection 'proj_2', expect error: 12 16 used projections -SELECT d FROM test WHERE c == 12 OR c == 16 ORDER BY d; ['default.test.proj_2'] +SELECT d FROM test WHERE c == 12 OR c == 16 ORDER BY d; ['test.test.proj_2'] check table 1 -0 broke metadata of part 'proj' (parent part: all_2_2_0) system.parts all_0_0_0 1 ['proj','proj_2'] all_1_1_0 1 ['proj','proj_2'] all_2_2_0 1 ['proj','proj_2'] all_3_3_0 1 ['proj','proj_2'] -select from projection 'proj' +select from projection 'proj', expect error: 12 16 used projections -SELECT c FROM test WHERE d == 12 OR d == 16 ORDER BY c; ['default.test.proj'] -select from projection 'proj_2' +SELECT c FROM test WHERE d == 12 OR d == 16 ORDER BY c; ['test.test.proj'] +select from projection 'proj_2', expect error: 12 16 used projections -SELECT d FROM test WHERE c == 12 OR c == 16 ORDER BY d; ['default.test.proj_2'] +SELECT d FROM test WHERE c == 12 OR c == 16 ORDER BY d; ['test.test.proj_2'] check table 0 broken projections info all_2_2_0 proj FILE_DOESNT_EXIST check table full -all_2_2_0 0 Part all_2_2_0 has a broken projection. It will be ignored. -0 +all_2_2_0 0 Part all_2_2_0 has a broken projections. 
It will be ignored. Broken projections info: \nCode: 707. DB::Exception: Part all_2_2_0 has a broken projection proj (error: Code: 107. DB::ErrnoException: Cannot open file /var/lib/clickhouse/store/521/521986ec-2fef-42c8-a402-83f937689286/all_2_2_0/proj.proj/columns.txt, errno: 2, strerror: No such file or directory. (FILE_DOESNT_EXIST) (version 23.11.1.1)). (BROKEN_PROJECTION) (version 23.11.1.1) broke data of part 'proj_2' (parent part: all_2_2_0) broken projections info all_2_2_0 proj FILE_DOESNT_EXIST @@ -51,13 +49,12 @@ all_0_0_0 1 ['proj','proj_2'] all_1_1_0 1 ['proj','proj_2'] all_2_2_0 1 ['proj','proj_2'] all_3_3_0 1 ['proj','proj_2'] -select from projection 'proj' +select from projection 'proj', expect error: proj_2 12 16 used projections -SELECT c FROM test WHERE d == 12 OR d == 16 ORDER BY c; ['default.test.proj'] -select from projection 'proj_2' -FILE_DOESNT_EXIST +SELECT c FROM test WHERE d == 12 OR d == 16 ORDER BY c; ['test.test.proj'] +select from projection 'proj_2', expect error: proj_2 check table 0 broken projections info @@ -68,19 +65,18 @@ all_0_0_0 1 ['proj','proj_2'] all_1_1_0 1 ['proj','proj_2'] all_2_2_0 1 ['proj','proj_2'] all_3_3_0 1 ['proj','proj_2'] -select from projection 'proj' +select from projection 'proj', expect error: 12 16 used projections -SELECT c FROM test WHERE d == 12 OR d == 16 ORDER BY c; ['default.test.proj'] -select from projection 'proj_2' +SELECT c FROM test WHERE d == 12 OR d == 16 ORDER BY c; ['test.test.proj'] +select from projection 'proj_2', expect error: 12 16 used projections -SELECT d FROM test WHERE c == 12 OR c == 16 ORDER BY d; ['default.test.proj_2'] +SELECT d FROM test WHERE c == 12 OR c == 16 ORDER BY d; ['test.test.proj_2'] check table 0 -0 broke data of part 'proj_2' (parent part: all_3_3_0) broken projections info all_2_2_0 proj FILE_DOESNT_EXIST @@ -88,6 +84,7 @@ all_2_2_0 proj_2 NO_FILE_IN_DATA_PART insert new part insert new part optimize +OPTIMIZE TABLE test SETTINGS alter_sync=0 0 broken projections info all_2_2_0 proj FILE_DOESNT_EXIST @@ -101,19 +98,18 @@ all_3_3_0 0 ['proj','proj_2'] all_3_5_1 1 ['proj'] all_4_4_0 0 ['proj','proj_2'] all_5_5_0 0 ['proj','proj_2'] -select from projection 'proj' +select from projection 'proj', expect error: 12 16 used projections -SELECT c FROM test WHERE d == 12 OR d == 16 ORDER BY c; ['default.test.proj'] -select from projection 'proj_2' +SELECT c FROM test WHERE d == 12 OR d == 16 ORDER BY c; ['test.test.proj'] +select from projection 'proj_2', expect error: 12 16 used projections -SELECT d FROM test WHERE c == 12 OR c == 16 ORDER BY d; ['default.test.proj_2'] +SELECT d FROM test WHERE c == 12 OR c == 16 ORDER BY d; ['test.test.proj_2'] check table 0 -0 broke metadata of part 'proj' (parent part: all_1_1_0) Detach - Attach broken projections info @@ -121,7 +117,6 @@ all_1_1_0 proj NO_FILE_IN_DATA_PART all_2_2_0 proj NO_FILE_IN_DATA_PART all_2_2_0 proj_2 FILE_DOESNT_EXIST all_3_3_0 proj_2 FILE_DOESNT_EXIST -0 broke data of part 'proj_2' (parent part: all_1_1_0) Detach - Attach broken projections info @@ -138,21 +133,21 @@ all_3_3_0 0 ['proj','proj_2'] all_3_5_1 1 ['proj'] all_4_4_0 0 ['proj','proj_2'] all_5_5_0 0 ['proj','proj_2'] -select from projection 'proj' +select from projection 'proj', expect error: 12 16 used projections -SELECT c FROM test WHERE d == 12 OR d == 16 ORDER BY c; ['default.test.proj'] -select from projection 'proj_2' +SELECT c FROM test WHERE d == 12 OR d == 16 ORDER BY c; ['test.test.proj'] +select from projection 'proj_2', expect error: 12 16 used projections 
-SELECT d FROM test WHERE c == 12 OR c == 16 ORDER BY d; ['default.test.proj_2'] +SELECT d FROM test WHERE c == 12 OR c == 16 ORDER BY d; ['test.test.proj_2'] check table 0 check table full -all_2_2_0 0 Part all_2_2_0 has a broken projection. It will be ignored. -all_1_1_0 0 Part all_1_1_0 has a broken projection. It will be ignored. +all_2_2_0 0 Part all_2_2_0 has a broken projections. It will be ignored. Broken projections info: \nCode: 707. DB::Exception: Part all_2_2_0 has a broken projection proj (error: Code: 107. DB::ErrnoException: Cannot open file /var/lib/clickhouse/store/521/521986ec-2fef-42c8-a402-83f937689286/all_2_2_0/proj.proj/columns.txt, errno: 2, strerror: No such file or directory. (FILE_DOESNT_EXIST) (version 23.11.1.1))\nPart all_2_2_0 has a broken projection proj_2 (error: Code: 226. DB::Exception: There is no file for column \'c\' in data part \'proj_2\'. (NO_FILE_IN_DATA_PART) (version 23.11.1.1)). (BROKEN_PROJECTION) (version 23.11.1.1) +all_1_1_0 0 Part all_1_1_0 has a broken projections. It will be ignored. Broken projections info: \nCode: 707. DB::Exception: Part all_1_1_0 has a broken projection proj (error: Code: 107. DB::ErrnoException: Cannot open file /var/lib/clickhouse/store/521/521986ec-2fef-42c8-a402-83f937689286/all_1_1_0/proj.proj/columns.txt, errno: 2, strerror: No such file or directory. (FILE_DOESNT_EXIST) (version 23.11.1.1))\nPart all_1_1_0 has a broken projection proj_2 (error: Code: 226. DB::Exception: There is no file for column \'c\' in data part \'proj_2\'. (NO_FILE_IN_DATA_PART) (version 23.11.1.1)). (BROKEN_PROJECTION) (version 23.11.1.1) materialize projection proj check table full system.parts @@ -167,45 +162,55 @@ all_3_5_1 0 ['proj'] all_3_5_1_6 1 ['proj'] all_4_4_0 0 ['proj','proj_2'] all_5_5_0 0 ['proj','proj_2'] -select from projection 'proj' +select from projection 'proj', expect error: 12 16 used projections -SELECT c FROM test WHERE d == 12 OR d == 16 ORDER BY c; ['default.test.proj'] -select from projection 'proj_2' +SELECT c FROM test WHERE d == 12 OR d == 16 ORDER BY c; ['test.test.proj'] +select from projection 'proj_2', expect error: 12 16 used projections -SELECT d FROM test WHERE c == 12 OR c == 16 ORDER BY d; ['default.test.proj_2'] +SELECT d FROM test WHERE c == 12 OR c == 16 ORDER BY d; ['test.test.proj_2'] check table 1 materialize projection proj_2 check table full +broke data of part 'proj' (parent part: all_3_5_1_7) +insert new part +optimize +OPTIMIZE TABLE test FINAL +insert new part +optimize +OPTIMIZE TABLE test FINAL system.parts all_0_0_0 0 ['proj','proj_2'] all_0_0_0_6 0 ['proj','proj_2'] -all_0_0_0_7 1 ['proj','proj_2'] +all_0_0_0_7 0 ['proj','proj_2'] +all_0_8_2_7 1 ['proj_2'] all_1_1_0 0 ['proj','proj_2'] all_1_1_0_6 0 ['proj','proj_2'] -all_1_1_0_7 1 ['proj','proj_2'] +all_1_1_0_7 0 ['proj','proj_2'] all_2_2_0 0 ['proj','proj_2'] all_2_2_0_6 0 ['proj','proj_2'] -all_2_2_0_7 1 ['proj','proj_2'] +all_2_2_0_7 0 ['proj','proj_2'] all_3_3_0 0 ['proj','proj_2'] all_3_5_1 0 ['proj'] all_3_5_1_6 0 ['proj'] -all_3_5_1_7 1 ['proj','proj_2'] +all_3_5_1_7 0 ['proj','proj_2'] all_4_4_0 0 ['proj','proj_2'] all_5_5_0 0 ['proj','proj_2'] -select from projection 'proj' +all_8_8_0 0 ['proj','proj_2'] +all_9_9_0 1 ['proj','proj_2'] +select from projection 'proj', expect error: 12 16 used projections -SELECT c FROM test WHERE d == 12 OR d == 16 ORDER BY c; ['default.test.proj'] -select from projection 'proj_2' +SELECT c FROM test WHERE d == 12 OR d == 16 ORDER BY c; ['test.test.proj_2'] +select from projection 'proj_2', 
expect error: 12 16 used projections -SELECT d FROM test WHERE c == 12 OR c == 16 ORDER BY d; ['default.test.proj_2'] +SELECT d FROM test WHERE c == 12 OR c == 16 ORDER BY d; ['test.test.proj_2'] check table 1 diff --git a/tests/queries/0_stateless/02916_broken_projection.sh b/tests/queries/0_stateless/02916_broken_projection.sh index bf0ec61fd76..bf382624787 100755 --- a/tests/queries/0_stateless/02916_broken_projection.sh +++ b/tests/queries/0_stateless/02916_broken_projection.sh @@ -24,7 +24,7 @@ CREATE TABLE test SELECT d ORDER BY c ) ) -ENGINE = ReplicatedMergeTree('/test3/$CLICKHOUSE_TEST_ZOOKEEPER_PREFIX/', '1') PRIMARY KEY (a) +ENGINE = ReplicatedMergeTree('/test4/$CLICKHOUSE_TEST_ZOOKEEPER_PREFIX/', '1') PRIMARY KEY (a) SETTINGS min_bytes_for_wide_part = 0, max_parts_to_merge_at_once=3, enable_vertical_merge_algorithm=1, @@ -65,18 +65,13 @@ function break_projection() LIMIT 1; ") - path=$($CLICKHOUSE_CLIENT -q "SELECT path FROM system.disks WHERE name='$disk_name'") - - # make sure path is absolute - $CLICKHOUSE_CLIENT -q "select throwIf(substring('$path', 1, 1) != '/', 'Path is relative: $path')" || exit - if [ "$break_type" = "data" ] then - rm "$path/$part_path/d.bin" - rm "$path/$part_path/c.bin" + rm "$part_path/d.bin" + rm "$part_path/c.bin" echo "broke data of part '$part_name' (parent part: $parent_name)" else - rm "$path/$part_path/columns.txt" + rm "$part_path/columns.txt" echo "broke metadata of part '$part_name' (parent part: $parent_name)" fi } @@ -115,12 +110,12 @@ function check() WHERE table='test' AND database=currentDatabase() ORDER BY name;" - echo "select from projection 'proj'" + echo "select from projection 'proj', expect error: $expect_broken_part" query_id=$(random 8) if [ "$expect_broken_part" = "proj" ] then - $CLICKHOUSE_CLIENT --optimize_use_projections 1 --query_id $query_id -q "SELECT c FROM test WHERE d == 12 ORDER BY c;" 2>&1 | grep -o $expected_error + $CLICKHOUSE_CLIENT --optimize_use_projections 1 --send_logs_level 'fatal' --query_id $query_id -q "SELECT c FROM test WHERE d == 12 ORDER BY c;" 2>&1 | grep -o $expected_error else $CLICKHOUSE_CLIENT --optimize_use_projections 1 --query_id $query_id -q "SELECT c FROM test WHERE d == 12 OR d == 16 ORDER BY c;" echo 'used projections' @@ -130,12 +125,12 @@ function check() " fi - echo "select from projection 'proj_2'" + echo "select from projection 'proj_2', expect error: $expect_broken_part" query_id=$(random 8) if [ "$expect_broken_part" = "proj_2" ] then - $CLICKHOUSE_CLIENT --optimize_use_projections 1 --query_id $query_id -q "SELECT d FROM test WHERE c == 12 ORDER BY d;" 2>&1 | grep -o $expected_error + $CLICKHOUSE_CLIENT --optimize_use_projections 1 --send_logs_level 'fatal' --query_id $query_id -q "SELECT d FROM test WHERE c == 12 ORDER BY d;" 2>&1 | grep -o $expected_error else $CLICKHOUSE_CLIENT --optimize_use_projections 1 --query_id $query_id -q "SELECT d FROM test WHERE c == 12 OR c == 16 ORDER BY d;" echo 'used projections' @@ -146,7 +141,9 @@ function check() fi echo 'check table' - $CLICKHOUSE_CLIENT -q "CHECK TABLE test" + $CLICKHOUSE_CLIENT -nm -q " + SET send_logs_level='fatal'; + CHECK TABLE test;" } function optimize() @@ -159,16 +156,21 @@ function optimize() if [ $final -eq 1 ]; then query="$query FINAL" + fi if [ $no_wait -eq 1 ]; then query="$query SETTINGS alter_sync=0" + fi - $CLICKHOUSE_CLIENT -nm -q $query + echo $query + + $CLICKHOUSE_CLIENT -q "$query" } function reattach() { echo 'Detach - Attach' $CLICKHOUSE_CLIENT -nm -q " + SET send_logs_level='fatal'; DETACH TABLE 
test; ATTACH TABLE test; " @@ -184,7 +186,10 @@ function materialize_projection function check_table_full() { echo 'check table full' - $CLICKHOUSE_CLIENT -q "CHECK TABLE test SETTINGS check_query_single_value_result = 0" | grep "broken" + $CLICKHOUSE_CLIENT -nm -q " + SET send_logs_level='fatal'; + CHECK TABLE test SETTINGS check_query_single_value_result = 0; +" | grep "broken" } @@ -300,5 +305,5 @@ optimize 1 0 check $CLICKHOUSE_CLIENT -nm -q " -DROP TABLE test; +DROP TABLE test SYNC; " From bcc87c01771414806fca705b5c9b5e0e925dea5f Mon Sep 17 00:00:00 2001 From: kssenii Date: Tue, 21 Nov 2023 17:17:36 +0100 Subject: [PATCH 005/245] Better test --- .../ReplicatedMergeTreePartCheckThread.cpp | 3 +- src/Storages/System/StorageSystemDisks.cpp | 2 +- .../02916_broken_projection.reference | 43 +++++++++---------- .../0_stateless/02916_broken_projection.sh | 27 ++++++++---- 4 files changed, 42 insertions(+), 33 deletions(-) diff --git a/src/Storages/MergeTree/ReplicatedMergeTreePartCheckThread.cpp b/src/Storages/MergeTree/ReplicatedMergeTreePartCheckThread.cpp index ba4d4869025..d058113e134 100644 --- a/src/Storages/MergeTree/ReplicatedMergeTreePartCheckThread.cpp +++ b/src/Storages/MergeTree/ReplicatedMergeTreePartCheckThread.cpp @@ -388,8 +388,9 @@ ReplicatedCheckResult ReplicatedMergeTreePartCheckThread::checkPartImpl(const St PreformattedMessage message; if (is_broken_projection) { + WriteBufferFromOwnString wb; message = PreformattedMessage::create( - "Part {} has a broken projections. It will be ignored. Broken projections info: \n{}", + "Part {} has a broken projections. It will be ignored. Broken projections info: {}", part_name, getCurrentExceptionMessage(false)); LOG_DEBUG(log, message); result.action = ReplicatedCheckResult::DoNothing; diff --git a/src/Storages/System/StorageSystemDisks.cpp b/src/Storages/System/StorageSystemDisks.cpp index 250fcdba641..23a00cc7ae5 100644 --- a/src/Storages/System/StorageSystemDisks.cpp +++ b/src/Storages/System/StorageSystemDisks.cpp @@ -63,7 +63,7 @@ Pipe StorageSystemDisks::read( for (const auto & [disk_name, disk_ptr] : context->getDisksMap()) { col_name->insert(disk_name); - col_path->insert(fs::absolute(disk_ptr->getPath()).string()); + col_path->insert(disk_ptr->getPath()); col_free->insert(disk_ptr->getAvailableSpace().value_or(std::numeric_limits::max())); col_total->insert(disk_ptr->getTotalSpace().value_or(std::numeric_limits::max())); col_unreserved->insert(disk_ptr->getUnreservedSpace().value_or(std::numeric_limits::max())); diff --git a/tests/queries/0_stateless/02916_broken_projection.reference b/tests/queries/0_stateless/02916_broken_projection.reference index aee18a21fb8..1b84ca96840 100644 --- a/tests/queries/0_stateless/02916_broken_projection.reference +++ b/tests/queries/0_stateless/02916_broken_projection.reference @@ -11,12 +11,12 @@ select from projection 'proj', expect error: 12 16 used projections -SELECT c FROM test WHERE d == 12 OR d == 16 ORDER BY c; ['test.test.proj'] +SELECT c FROM test WHERE d == 12 OR d == 16 ORDER BY c; proj select from projection 'proj_2', expect error: 12 16 used projections -SELECT d FROM test WHERE c == 12 OR c == 16 ORDER BY d; ['test.test.proj_2'] +SELECT d FROM test WHERE c == 12 OR c == 16 ORDER BY d; proj_2 check table 1 broke metadata of part 'proj' (parent part: all_2_2_0) @@ -29,18 +29,18 @@ select from projection 'proj', expect error: 12 16 used projections -SELECT c FROM test WHERE d == 12 OR d == 16 ORDER BY c; ['test.test.proj'] +SELECT c FROM test WHERE d == 12 OR d == 16 ORDER BY 
c; proj select from projection 'proj_2', expect error: 12 16 used projections -SELECT d FROM test WHERE c == 12 OR c == 16 ORDER BY d; ['test.test.proj_2'] +SELECT d FROM test WHERE c == 12 OR c == 16 ORDER BY d; proj_2 check table 0 broken projections info all_2_2_0 proj FILE_DOESNT_EXIST -check table full -all_2_2_0 0 Part all_2_2_0 has a broken projections. It will be ignored. Broken projections info: \nCode: 707. DB::Exception: Part all_2_2_0 has a broken projection proj (error: Code: 107. DB::ErrnoException: Cannot open file /var/lib/clickhouse/store/521/521986ec-2fef-42c8-a402-83f937689286/all_2_2_0/proj.proj/columns.txt, errno: 2, strerror: No such file or directory. (FILE_DOESNT_EXIST) (version 23.11.1.1)). (BROKEN_PROJECTION) (version 23.11.1.1) +check table full (all_2_2_0) +all_2_2_0 broke data of part 'proj_2' (parent part: all_2_2_0) broken projections info all_2_2_0 proj FILE_DOESNT_EXIST @@ -53,7 +53,7 @@ select from projection 'proj', expect error: proj_2 12 16 used projections -SELECT c FROM test WHERE d == 12 OR d == 16 ORDER BY c; ['test.test.proj'] +SELECT c FROM test WHERE d == 12 OR d == 16 ORDER BY c; proj select from projection 'proj_2', expect error: proj_2 check table 0 @@ -69,12 +69,12 @@ select from projection 'proj', expect error: 12 16 used projections -SELECT c FROM test WHERE d == 12 OR d == 16 ORDER BY c; ['test.test.proj'] +SELECT c FROM test WHERE d == 12 OR d == 16 ORDER BY c; proj select from projection 'proj_2', expect error: 12 16 used projections -SELECT d FROM test WHERE c == 12 OR c == 16 ORDER BY d; ['test.test.proj_2'] +SELECT d FROM test WHERE c == 12 OR c == 16 ORDER BY d; proj_2 check table 0 broke data of part 'proj_2' (parent part: all_3_3_0) @@ -102,12 +102,12 @@ select from projection 'proj', expect error: 12 16 used projections -SELECT c FROM test WHERE d == 12 OR d == 16 ORDER BY c; ['test.test.proj'] +SELECT c FROM test WHERE d == 12 OR d == 16 ORDER BY c; proj select from projection 'proj_2', expect error: 12 16 used projections -SELECT d FROM test WHERE c == 12 OR c == 16 ORDER BY d; ['test.test.proj_2'] +SELECT d FROM test WHERE c == 12 OR c == 16 ORDER BY d; proj_2 check table 0 broke metadata of part 'proj' (parent part: all_1_1_0) @@ -137,19 +137,18 @@ select from projection 'proj', expect error: 12 16 used projections -SELECT c FROM test WHERE d == 12 OR d == 16 ORDER BY c; ['test.test.proj'] +SELECT c FROM test WHERE d == 12 OR d == 16 ORDER BY c; proj select from projection 'proj_2', expect error: 12 16 used projections -SELECT d FROM test WHERE c == 12 OR c == 16 ORDER BY d; ['test.test.proj_2'] +SELECT d FROM test WHERE c == 12 OR c == 16 ORDER BY d; proj_2 check table 0 -check table full -all_2_2_0 0 Part all_2_2_0 has a broken projections. It will be ignored. Broken projections info: \nCode: 707. DB::Exception: Part all_2_2_0 has a broken projection proj (error: Code: 107. DB::ErrnoException: Cannot open file /var/lib/clickhouse/store/521/521986ec-2fef-42c8-a402-83f937689286/all_2_2_0/proj.proj/columns.txt, errno: 2, strerror: No such file or directory. (FILE_DOESNT_EXIST) (version 23.11.1.1))\nPart all_2_2_0 has a broken projection proj_2 (error: Code: 226. DB::Exception: There is no file for column \'c\' in data part \'proj_2\'. (NO_FILE_IN_DATA_PART) (version 23.11.1.1)). (BROKEN_PROJECTION) (version 23.11.1.1) -all_1_1_0 0 Part all_1_1_0 has a broken projections. It will be ignored. Broken projections info: \nCode: 707. DB::Exception: Part all_1_1_0 has a broken projection proj (error: Code: 107. 
DB::ErrnoException: Cannot open file /var/lib/clickhouse/store/521/521986ec-2fef-42c8-a402-83f937689286/all_1_1_0/proj.proj/columns.txt, errno: 2, strerror: No such file or directory. (FILE_DOESNT_EXIST) (version 23.11.1.1))\nPart all_1_1_0 has a broken projection proj_2 (error: Code: 226. DB::Exception: There is no file for column \'c\' in data part \'proj_2\'. (NO_FILE_IN_DATA_PART) (version 23.11.1.1)). (BROKEN_PROJECTION) (version 23.11.1.1) +check table full (all_1_1_0) +all_1_1_0 materialize projection proj -check table full +check table full () system.parts all_0_0_0 0 ['proj','proj_2'] all_0_0_0_6 1 ['proj','proj_2'] @@ -166,16 +165,16 @@ select from projection 'proj', expect error: 12 16 used projections -SELECT c FROM test WHERE d == 12 OR d == 16 ORDER BY c; ['test.test.proj'] +SELECT c FROM test WHERE d == 12 OR d == 16 ORDER BY c; proj select from projection 'proj_2', expect error: 12 16 used projections -SELECT d FROM test WHERE c == 12 OR c == 16 ORDER BY d; ['test.test.proj_2'] +SELECT d FROM test WHERE c == 12 OR c == 16 ORDER BY d; proj_2 check table 1 materialize projection proj_2 -check table full +check table full () broke data of part 'proj' (parent part: all_3_5_1_7) insert new part optimize @@ -206,11 +205,11 @@ select from projection 'proj', expect error: 12 16 used projections -SELECT c FROM test WHERE d == 12 OR d == 16 ORDER BY c; ['test.test.proj_2'] +SELECT c FROM test WHERE d == 12 OR d == 16 ORDER BY c; proj_2 select from projection 'proj_2', expect error: 12 16 used projections -SELECT d FROM test WHERE c == 12 OR c == 16 ORDER BY d; ['test.test.proj_2'] +SELECT d FROM test WHERE c == 12 OR c == 16 ORDER BY d; proj_2 check table 1 diff --git a/tests/queries/0_stateless/02916_broken_projection.sh b/tests/queries/0_stateless/02916_broken_projection.sh index bf382624787..a522de42c89 100755 --- a/tests/queries/0_stateless/02916_broken_projection.sh +++ b/tests/queries/0_stateless/02916_broken_projection.sh @@ -121,7 +121,7 @@ function check() echo 'used projections' $CLICKHOUSE_CLIENT -nm -q " SYSTEM FLUSH LOGS; - SELECT query, projections FROM system.query_log WHERE current_database=currentDatabase() AND query_id='$query_id' AND type='QueryFinish' + SELECT query, splitByChar('.', arrayJoin(projections))[-1] FROM system.query_log WHERE current_database=currentDatabase() AND query_id='$query_id' AND type='QueryFinish' " fi @@ -136,7 +136,7 @@ function check() echo 'used projections' $CLICKHOUSE_CLIENT -nm -q " SYSTEM FLUSH LOGS; - SELECT query, projections FROM system.query_log WHERE current_database=currentDatabase() AND query_id='$query_id' AND type='QueryFinish' + SELECT query, splitByChar('.', arrayJoin(projections))[-1] FROM system.query_log WHERE current_database=currentDatabase() AND query_id='$query_id' AND type='QueryFinish' " fi @@ -185,11 +185,20 @@ function materialize_projection function check_table_full() { - echo 'check table full' - $CLICKHOUSE_CLIENT -nm -q " - SET send_logs_level='fatal'; - CHECK TABLE test SETTINGS check_query_single_value_result = 0; -" | grep "broken" + echo "check table full ($1)" + expect_broken_part=$1 + if [ "$expect_broken_part" = "" ] + then + $CLICKHOUSE_CLIENT -nm -q " + SET send_logs_level='fatal'; + CHECK TABLE test SETTINGS check_query_single_value_result = 0; + " | grep "broken" + else + $CLICKHOUSE_CLIENT -nm -q " + SET send_logs_level='fatal'; + CHECK TABLE test SETTINGS check_query_single_value_result = 0; + " | grep "broken" | grep -o $expect_broken_part | head -n 1 + fi } @@ -216,7 +225,7 @@ check 
broken_projections_info # Check table query will also show a list of parts which have broken projections. -check_table_full +check_table_full "all_2_2_0" # Break data file of projection 'proj_2' for part all_2_2_0 break_projection proj_2 all_2_2_0 data @@ -280,7 +289,7 @@ broken_projections_info check -check_table_full +check_table_full all_1_1_0 materialize_projection proj From e8d99cb29654645c5a89d6cb15856b48a55d7bdf Mon Sep 17 00:00:00 2001 From: Kseniia Sumarokova <54203879+kssenii@users.noreply.github.com> Date: Wed, 22 Nov 2023 12:34:31 +0100 Subject: [PATCH 006/245] Fix style check --- tests/queries/0_stateless/02916_broken_projection.sh | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/tests/queries/0_stateless/02916_broken_projection.sh b/tests/queries/0_stateless/02916_broken_projection.sh index a522de42c89..6ed92e2e06e 100755 --- a/tests/queries/0_stateless/02916_broken_projection.sh +++ b/tests/queries/0_stateless/02916_broken_projection.sh @@ -54,8 +54,8 @@ function break_projection() parent_name=$2 break_type=$3 - read -r disk_name part_path <<< $($CLICKHOUSE_CLIENT -nm -q " - SELECT disk_name, path + read -r part_path <<< $($CLICKHOUSE_CLIENT -nm -q " + SELECT path FROM system.projection_parts WHERE table='test' AND database=currentDatabase() From a57e612cf2ef657801cdeefb8410caf5cab804a2 Mon Sep 17 00:00:00 2001 From: kssenii Date: Fri, 24 Nov 2023 16:08:49 +0100 Subject: [PATCH 007/245] Fxi tests --- src/Storages/StorageMergeTree.cpp | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/src/Storages/StorageMergeTree.cpp b/src/Storages/StorageMergeTree.cpp index 74277616e95..84b48bb650b 100644 --- a/src/Storages/StorageMergeTree.cpp +++ b/src/Storages/StorageMergeTree.cpp @@ -2247,7 +2247,7 @@ std::optional StorageMergeTree::checkDataNext(DataValidationTasksPt { try { - auto calculated_checksums = checkDataPart(part, false, noop, /* is_cancelled */{}, /* throw_on_broken_projection */true); + auto calculated_checksums = checkDataPart(part, false, noop, /* is_cancelled */[]{ return false; }, /* throw_on_broken_projection */true); calculated_checksums.checkEqual(part->checksums, true); auto & part_mutable = const_cast(*part); @@ -2268,7 +2268,7 @@ std::optional StorageMergeTree::checkDataNext(DataValidationTasksPt { try { - checkDataPart(part, true, noop, /* is_cancelled */{}, /* throw_on_broken_projection */true); + checkDataPart(part, true, noop, /* is_cancelled */[]{ return false; }, /* throw_on_broken_projection */true); return CheckResult(part->name, true, ""); } catch (...) 
From 8ebbc8d85dc3f1e37d109ddb1ad1a05a55283a79 Mon Sep 17 00:00:00 2001 From: kssenii Date: Fri, 24 Nov 2023 18:37:40 +0100 Subject: [PATCH 008/245] Update 02117_show_create_table_system.reference --- .../0_stateless/02117_show_create_table_system.reference | 3 +++ 1 file changed, 3 insertions(+) diff --git a/tests/queries/0_stateless/02117_show_create_table_system.reference b/tests/queries/0_stateless/02117_show_create_table_system.reference index 9ed905a0df8..e122de8ef6c 100644 --- a/tests/queries/0_stateless/02117_show_create_table_system.reference +++ b/tests/queries/0_stateless/02117_show_create_table_system.reference @@ -685,6 +685,9 @@ CREATE TABLE system.projection_parts `rows_where_ttl_info.expression` Array(String), `rows_where_ttl_info.min` Array(DateTime), `rows_where_ttl_info.max` Array(DateTime), + `is_broken` UInt8, + `exception_code` Int32, + `exception` String, `bytes` UInt64 ALIAS bytes_on_disk, `marks_size` UInt64 ALIAS marks_bytes, `part_name` String ALIAS name From b4dab194954845b76d1ce9a6bf8b18dded059d74 Mon Sep 17 00:00:00 2001 From: kssenii Date: Mon, 27 Nov 2023 12:42:09 +0100 Subject: [PATCH 009/245] Fix test --- .../0_stateless/02916_broken_projection.reference | 1 + tests/queries/0_stateless/02916_broken_projection.sh | 12 +++++++++--- 2 files changed, 10 insertions(+), 3 deletions(-) diff --git a/tests/queries/0_stateless/02916_broken_projection.reference b/tests/queries/0_stateless/02916_broken_projection.reference index 1b84ca96840..1f072e207a7 100644 --- a/tests/queries/0_stateless/02916_broken_projection.reference +++ b/tests/queries/0_stateless/02916_broken_projection.reference @@ -55,6 +55,7 @@ select from projection 'proj', expect error: proj_2 used projections SELECT c FROM test WHERE d == 12 OR d == 16 ORDER BY c; proj select from projection 'proj_2', expect error: proj_2 +FILE_DOESNT_EXIST check table 0 broken projections info diff --git a/tests/queries/0_stateless/02916_broken_projection.sh b/tests/queries/0_stateless/02916_broken_projection.sh index 6ed92e2e06e..80805330577 100755 --- a/tests/queries/0_stateless/02916_broken_projection.sh +++ b/tests/queries/0_stateless/02916_broken_projection.sh @@ -115,7 +115,10 @@ function check() if [ "$expect_broken_part" = "proj" ] then - $CLICKHOUSE_CLIENT --optimize_use_projections 1 --send_logs_level 'fatal' --query_id $query_id -q "SELECT c FROM test WHERE d == 12 ORDER BY c;" 2>&1 | grep -o $expected_error + $CLICKHOUSE_CLIENT --optimize_use_projections 1 --query_id $query_id -nm -q " +SET send_logs_level='fatal'; +SELECT c FROM test WHERE d == 12 ORDER BY c; +" 2>&1 | grep -oF "$expected_error" else $CLICKHOUSE_CLIENT --optimize_use_projections 1 --query_id $query_id -q "SELECT c FROM test WHERE d == 12 OR d == 16 ORDER BY c;" echo 'used projections' @@ -130,7 +133,10 @@ function check() if [ "$expect_broken_part" = "proj_2" ] then - $CLICKHOUSE_CLIENT --optimize_use_projections 1 --send_logs_level 'fatal' --query_id $query_id -q "SELECT d FROM test WHERE c == 12 ORDER BY d;" 2>&1 | grep -o $expected_error + $CLICKHOUSE_CLIENT --optimize_use_projections 1 --query_id $query_id -nm -q " +SET send_logs_level='fatal'; +SELECT d FROM test WHERE c == 12 ORDER BY d; +" 2>&1 | grep -oF "$expected_error" else $CLICKHOUSE_CLIENT --optimize_use_projections 1 --query_id $query_id -q "SELECT d FROM test WHERE c == 12 OR c == 16 ORDER BY d;" echo 'used projections' @@ -234,7 +240,7 @@ break_projection proj_2 all_2_2_0 data broken_projections_info # Select now fails with error "File doesn't exist" -check "proj_2" 
"FILE_DOESNT_EXIST" +check "proj_2" FILE_DOESNT_EXIST # Projection 'proj_2' from part all_2_2_0 will now appear in broken parts info. broken_projections_info From a6972e7c90fd8ff775855cac13f47f9cd46b2da1 Mon Sep 17 00:00:00 2001 From: kssenii Date: Tue, 28 Nov 2023 10:22:10 +0100 Subject: [PATCH 010/245] Fxi --- src/Storages/MergeTree/IMergeTreeDataPart.cpp | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/Storages/MergeTree/IMergeTreeDataPart.cpp b/src/Storages/MergeTree/IMergeTreeDataPart.cpp index 85ce112d9a1..be665a64f1c 100644 --- a/src/Storages/MergeTree/IMergeTreeDataPart.cpp +++ b/src/Storages/MergeTree/IMergeTreeDataPart.cpp @@ -1145,7 +1145,7 @@ void IMergeTreeDataPart::loadChecksums(bool require) LOG_WARNING(storage.log, "Checksums for part {} not found. Will calculate them from data on disk.", name); bool noop; - checksums = checkDataPart(shared_from_this(), false, noop, /* is_cancelled */{}, /* throw_on_broken_projection */false); + checksums = checkDataPart(shared_from_this(), false, noop, /* is_cancelled */[]{ return false; }, /* throw_on_broken_projection */false); writeChecksums(checksums, {}); bytes_on_disk = checksums.getTotalSizeOnDisk(); From 0e11eeaea546dd41231a4f180b877ada1291a23d Mon Sep 17 00:00:00 2001 From: kssenii Date: Thu, 30 Nov 2023 13:52:08 +0100 Subject: [PATCH 011/245] Allow to backup and restore parts with broken projections --- src/Backups/BackupSettings.cpp | 2 + src/Backups/BackupSettings.h | 6 + .../MergeTree/DataPartStorageOnDiskBase.cpp | 35 +- .../MergeTree/DataPartStorageOnDiskBase.h | 4 +- src/Storages/MergeTree/IDataPartStorage.h | 4 +- src/Storages/MergeTree/IMergeTreeDataPart.cpp | 15 +- src/Storages/MergeTree/IMergeTreeDataPart.h | 3 +- src/Storages/MergeTree/MergeTreeData.cpp | 9 +- .../02916_broken_projection.reference | 226 +++++++++- .../0_stateless/02916_broken_projection.sh | 426 +++++++++++++----- 10 files changed, 588 insertions(+), 142 deletions(-) diff --git a/src/Backups/BackupSettings.cpp b/src/Backups/BackupSettings.cpp index 68d825e9468..51d713f03e1 100644 --- a/src/Backups/BackupSettings.cpp +++ b/src/Backups/BackupSettings.cpp @@ -32,6 +32,8 @@ namespace ErrorCodes M(UInt64, shard_num) \ M(UInt64, replica_num) \ M(Bool, check_parts) \ + M(Bool, check_projection_parts) \ + M(Bool, allow_backup_broken_projections) \ M(Bool, internal) \ M(String, host_id) \ M(OptionalUUID, backup_uuid) diff --git a/src/Backups/BackupSettings.h b/src/Backups/BackupSettings.h index f26b992b348..ec430905f51 100644 --- a/src/Backups/BackupSettings.h +++ b/src/Backups/BackupSettings.h @@ -62,6 +62,12 @@ struct BackupSettings /// Check checksums of the data parts before writing them to a backup. bool check_parts = true; + /// Check checksums of the projection data parts before writing them to a backup. + bool check_projection_parts = true; + + /// Allow to create backup with broken projections. + bool allow_backup_broken_projections = false; + /// Internal, should not be specified by user. /// Whether this backup is a part of a distributed backup created by BACKUP ON CLUSTER. 
bool internal = false; diff --git a/src/Storages/MergeTree/DataPartStorageOnDiskBase.cpp b/src/Storages/MergeTree/DataPartStorageOnDiskBase.cpp index 7fc8187aee5..6e5cbdde355 100644 --- a/src/Storages/MergeTree/DataPartStorageOnDiskBase.cpp +++ b/src/Storages/MergeTree/DataPartStorageOnDiskBase.cpp @@ -334,7 +334,9 @@ void DataPartStorageOnDiskBase::backup( const ReadSettings & read_settings, bool make_temporary_hard_links, BackupEntries & backup_entries, - TemporaryFilesOnDisks * temp_dirs) const + TemporaryFilesOnDisks * temp_dirs, + bool is_projection_part, + bool allow_backup_broken_projection) const { fs::path part_path_on_disk = fs::path{root_path} / part_dir; fs::path part_path_in_backup = fs::path{path_in_backup} / part_dir; @@ -376,7 +378,7 @@ void DataPartStorageOnDiskBase::backup( bool copy_encrypted = !backup_settings.decrypt_files_from_encrypted_disks; - for (const auto & filepath : files_to_backup) + auto backup_file = [&](const String & filepath) { auto filepath_on_disk = part_path_on_disk / filepath; auto filepath_in_backup = part_path_in_backup / filepath; @@ -384,8 +386,10 @@ void DataPartStorageOnDiskBase::backup( if (files_without_checksums.contains(filepath)) { backup_entries.emplace_back(filepath_in_backup, std::make_unique(disk, filepath_on_disk, read_settings, copy_encrypted)); - continue; + return; } + else if (is_projection_part && allow_backup_broken_projection && !disk->exists(filepath_on_disk)) + return; if (make_temporary_hard_links) { @@ -410,6 +414,31 @@ void DataPartStorageOnDiskBase::backup( backup_entry = wrapBackupEntryWith(std::move(backup_entry), temp_dir_owner); backup_entries.emplace_back(filepath_in_backup, std::move(backup_entry)); + }; + + auto * log = &Poco::Logger::get("DataPartStorageOnDiskBase::backup"); + + for (const auto & filepath : files_to_backup) + { + if (is_projection_part && allow_backup_broken_projection) + { + try + { + backup_file(filepath); + } + catch (Exception & e) + { + if (e.code() != ErrorCodes::FILE_DOESNT_EXIST) + throw; + + LOG_ERROR(log, "Cannot backup file {} of projection part {}. Will try to ignore it", filepath, part_dir); + continue; + } + } + else + { + backup_file(filepath); + } } } diff --git a/src/Storages/MergeTree/DataPartStorageOnDiskBase.h b/src/Storages/MergeTree/DataPartStorageOnDiskBase.h index 1826e84c28d..6176a13c27b 100644 --- a/src/Storages/MergeTree/DataPartStorageOnDiskBase.h +++ b/src/Storages/MergeTree/DataPartStorageOnDiskBase.h @@ -58,7 +58,9 @@ public: const ReadSettings & read_settings, bool make_temporary_hard_links, BackupEntries & backup_entries, - TemporaryFilesOnDisks * temp_dirs) const override; + TemporaryFilesOnDisks * temp_dirs, + bool is_projection_part, + bool allow_backup_broken_projection) const override; MutableDataPartStoragePtr freeze( const std::string & to, diff --git a/src/Storages/MergeTree/IDataPartStorage.h b/src/Storages/MergeTree/IDataPartStorage.h index 072cb29626e..b3a6ab203d5 100644 --- a/src/Storages/MergeTree/IDataPartStorage.h +++ b/src/Storages/MergeTree/IDataPartStorage.h @@ -222,7 +222,9 @@ public: const ReadSettings & read_settings, bool make_temporary_hard_links, BackupEntries & backup_entries, - TemporaryFilesOnDisks * temp_dirs) const = 0; + TemporaryFilesOnDisks * temp_dirs, + bool is_projection_part, + bool allow_backup_broken_projection) const = 0; /// Creates hardlinks into 'to/dir_path' for every file in data part. /// Callback is called after hardlinks are created, but before 'delete-on-destroy.txt' marker is removed. 
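Before the loading-side changes below, the control flow of the new backup_file wrapper in DataPartStorageOnDiskBase::backup is worth spelling out: a missing file is tolerated only when the part being backed up is a projection part and allow_backup_broken_projections was set; for ordinary parts the exception still propagates. A simplified sketch of that logic, where the types and names are illustrative stand-ins rather than the real storage interface:

    #include <iostream>
    #include <stdexcept>
    #include <string>
    #include <utility>
    #include <vector>

    struct FileDoesntExist : std::runtime_error
    {
        using std::runtime_error::runtime_error;
    };

    // Illustrative stand-in for backing up a single file of a part.
    void backupOneFile(const std::string & path, bool exists)
    {
        if (!exists)
            throw FileDoesntExist("No such file: " + path);
        std::cout << "backed up " << path << '\n';
    }

    void backupPart(const std::vector<std::pair<std::string, bool>> & files,
                    bool is_projection_part, bool allow_backup_broken_projection)
    {
        for (const auto & [path, exists] : files)
        {
            if (is_projection_part && allow_backup_broken_projection)
            {
                try
                {
                    backupOneFile(path, exists);
                }
                catch (const FileDoesntExist &)
                {
                    continue;  // data files lost together with a broken projection are skipped
                }
            }
            else
            {
                backupOneFile(path, exists);  // for normal parts a missing file stays a hard error
            }
        }
    }

    int main()
    {
        // 'c.bin' was deleted when the projection broke; the backup still succeeds.
        backupPart({{"c.bin", false}, {"d.bin", true}}, true, true);
        return 0;
    }

The asymmetry is deliberate: skipping missing files of a healthy part would silently produce an incomplete backup.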
diff --git a/src/Storages/MergeTree/IMergeTreeDataPart.cpp b/src/Storages/MergeTree/IMergeTreeDataPart.cpp index be665a64f1c..940b3991067 100644 --- a/src/Storages/MergeTree/IMergeTreeDataPart.cpp +++ b/src/Storages/MergeTree/IMergeTreeDataPart.cpp @@ -647,13 +647,14 @@ void IMergeTreeDataPart::loadColumnsChecksumsIndexes(bool require_columns_checks loadIndex(); /// Must be called after loadIndexGranularity as it uses the value of `index_granularity` loadRowsCount(); /// Must be called after loadIndexGranularity() as it uses the value of `index_granularity`. loadPartitionAndMinMaxIndex(); + bool has_broken_projections = false; if (!parent_part) { loadTTLInfos(); - loadProjections(require_columns_checksums, check_consistency, false /* if_not_loaded */); + has_broken_projections = !loadProjections(require_columns_checksums, check_consistency, false /* if_not_loaded */); } - if (check_consistency) + if (check_consistency && !has_broken_projections) checkConsistency(require_columns_checksums); loadDefaultCompressionCodec(); @@ -715,9 +716,10 @@ void IMergeTreeDataPart::addProjectionPart( projection_parts[projection_name] = std::move(projection_part); } -void IMergeTreeDataPart::loadProjections(bool require_columns_checksums, bool check_consistency, bool if_not_loaded) +bool IMergeTreeDataPart::loadProjections(bool require_columns_checksums, bool check_consistency, bool if_not_loaded) { auto metadata_snapshot = storage.getInMemoryMetadataPtr(); + bool has_broken_projection = false; for (const auto & projection : metadata_snapshot->projections) { auto path = projection.name + ".proj"; @@ -742,16 +744,19 @@ void IMergeTreeDataPart::loadProjections(bool require_columns_checksums, bool ch if (isRetryableException(std::current_exception())) throw; + auto message = getCurrentExceptionMessage(true); LOG_ERROR(&Poco::Logger::get("IMergeTreeDataPart"), - "Cannot load projection {}, will consider it broken", projection.name); + "Cannot load projection {}, will consider it broken. Reason: {}", projection.name, message); - part->setBrokenReason(getCurrentExceptionMessage(false), getCurrentExceptionCode()); + has_broken_projection = true; + part->setBrokenReason(message, getCurrentExceptionCode()); } addProjectionPart(projection.name, std::move(part)); } } } + return has_broken_projection; } void IMergeTreeDataPart::loadIndexGranularity() diff --git a/src/Storages/MergeTree/IMergeTreeDataPart.h b/src/Storages/MergeTree/IMergeTreeDataPart.h index 9af2c16f1e8..6e276284f4c 100644 --- a/src/Storages/MergeTree/IMergeTreeDataPart.h +++ b/src/Storages/MergeTree/IMergeTreeDataPart.h @@ -421,7 +421,8 @@ public: bool hasBrokenProjection(const String & projection_name) const; - void loadProjections(bool require_columns_checksums, bool check_consistency, bool if_not_loaded = false); + /// Return true, if all projections were loaded successfully and none was marked as broken. 
+ bool loadProjections(bool require_columns_checksums, bool check_consistency, bool if_not_loaded = false); void setBrokenReason(const String & message, int code) const; diff --git a/src/Storages/MergeTree/MergeTreeData.cpp b/src/Storages/MergeTree/MergeTreeData.cpp index 8683e4293e9..c95aee88aee 100644 --- a/src/Storages/MergeTree/MergeTreeData.cpp +++ b/src/Storages/MergeTree/MergeTreeData.cpp @@ -5113,7 +5113,7 @@ MergeTreeData::PartsBackupEntries MergeTreeData::backupParts( if (hold_table_lock && !table_lock) table_lock = lockForShare(local_context->getCurrentQueryId(), local_context->getSettingsRef().lock_acquire_timeout); - if (backup_settings.check_parts) + if (backup_settings.check_projection_parts) part->checkConsistencyWithProjections(/* require_part_metadata= */ true); BackupEntries backup_entries_from_part; @@ -5125,7 +5125,8 @@ MergeTreeData::PartsBackupEntries MergeTreeData::backupParts( read_settings, make_temporary_hard_links, backup_entries_from_part, - &temp_dirs); + &temp_dirs, + false, false); auto projection_parts = part->getProjectionParts(); for (const auto & [projection_name, projection_part] : projection_parts) @@ -5138,7 +5139,9 @@ MergeTreeData::PartsBackupEntries MergeTreeData::backupParts( read_settings, make_temporary_hard_links, backup_entries_from_part, - &temp_dirs); + &temp_dirs, + projection_part->is_broken, + backup_settings.allow_backup_broken_projections); } if (hold_storage_and_part_ptrs) diff --git a/tests/queries/0_stateless/02916_broken_projection.reference b/tests/queries/0_stateless/02916_broken_projection.reference index 1f072e207a7..4c4901ae99f 100644 --- a/tests/queries/0_stateless/02916_broken_projection.reference +++ b/tests/queries/0_stateless/02916_broken_projection.reference @@ -19,6 +19,7 @@ used projections SELECT d FROM test WHERE c == 12 OR c == 16 ORDER BY d; proj_2 check table 1 +0 broke metadata of part 'proj' (parent part: all_2_2_0) system.parts all_0_0_0 1 ['proj','proj_2'] @@ -39,8 +40,9 @@ check table 0 broken projections info all_2_2_0 proj FILE_DOESNT_EXIST -check table full (all_2_2_0) +check table full (test - all_2_2_0) all_2_2_0 +0 broke data of part 'proj_2' (parent part: all_2_2_0) broken projections info all_2_2_0 proj FILE_DOESNT_EXIST @@ -78,6 +80,7 @@ used projections SELECT d FROM test WHERE c == 12 OR c == 16 ORDER BY d; proj_2 check table 0 +0 broke data of part 'proj_2' (parent part: all_3_3_0) broken projections info all_2_2_0 proj FILE_DOESNT_EXIST @@ -111,6 +114,7 @@ used projections SELECT d FROM test WHERE c == 12 OR c == 16 ORDER BY d; proj_2 check table 0 +0 broke metadata of part 'proj' (parent part: all_1_1_0) Detach - Attach broken projections info @@ -118,6 +122,7 @@ all_1_1_0 proj NO_FILE_IN_DATA_PART all_2_2_0 proj NO_FILE_IN_DATA_PART all_2_2_0 proj_2 FILE_DOESNT_EXIST all_3_3_0 proj_2 FILE_DOESNT_EXIST +0 broke data of part 'proj_2' (parent part: all_1_1_0) Detach - Attach broken projections info @@ -146,10 +151,10 @@ used projections SELECT d FROM test WHERE c == 12 OR c == 16 ORDER BY d; proj_2 check table 0 -check table full (all_1_1_0) +check table full (test - all_1_1_0) all_1_1_0 materialize projection proj -check table full () +check table full (test - ) system.parts all_0_0_0 0 ['proj','proj_2'] all_0_0_0_6 1 ['proj','proj_2'] @@ -175,7 +180,8 @@ SELECT d FROM test WHERE c == 12 OR c == 16 ORDER BY d; proj_2 check table 1 materialize projection proj_2 -check table full () +check table full (test - ) +0 broke data of part 'proj' (parent part: all_3_5_1_7) insert new part optimize @@ -214,3 
+220,215 @@ used projections SELECT d FROM test WHERE c == 12 OR c == 16 ORDER BY d; proj_2 check table 1 +insert new part +insert new part +insert new part +insert new part +system.parts +all_0_0_0 1 ['proj','proj_2'] +all_1_1_0 1 ['proj','proj_2'] +select from projection 'proj', expect error: +used projections +SELECT c FROM test2 WHERE d == 12 OR d == 16 ORDER BY c; proj +select from projection 'proj_2', expect error: +used projections +SELECT d FROM test2 WHERE c == 12 OR c == 16 ORDER BY d; proj_2 +check table +1 +system.parts +all_1_1_0 1 ['proj','proj_2'] +select from projection 'proj', expect error: +used projections +SELECT c FROM test2_replica WHERE d == 12 OR d == 16 ORDER BY c; proj +select from projection 'proj_2', expect error: +used projections +SELECT d FROM test2_replica WHERE c == 12 OR c == 16 ORDER BY d; proj_2 +check table +1 +0 +broke data of part 'proj' (parent part: all_0_0_0) +check table full (test2 - all_0_0_0) +all_0_0_0 +system.parts +all_0_0_0 1 ['proj','proj_2'] +all_1_1_0 1 ['proj','proj_2'] +select from projection 'proj', expect error: +used projections +SELECT c FROM test2 WHERE d == 12 OR d == 16 ORDER BY c; proj +select from projection 'proj_2', expect error: +used projections +SELECT d FROM test2 WHERE c == 12 OR c == 16 ORDER BY d; proj_2 +check table +0 +broke data of part 'all_0_0_0' +check table full (test2 - all_0_0_0) +all_0_0_0 +system.parts +all_0_0_0 0 ['proj','proj_2'] +all_1_1_0 1 ['proj','proj_2'] +select from projection 'proj', expect error: +used projections +SELECT c FROM test2 WHERE d == 12 OR d == 16 ORDER BY c; proj +select from projection 'proj_2', expect error: +used projections +SELECT d FROM test2 WHERE c == 12 OR c == 16 ORDER BY d; proj_2 +check table +1 +system.parts +all_0_0_0 1 ['proj','proj_2'] +all_1_1_0 1 ['proj','proj_2'] +select from projection 'proj', expect error: +used projections +SELECT c FROM test2 WHERE d == 12 OR d == 16 ORDER BY c; proj +select from projection 'proj_2', expect error: +used projections +SELECT d FROM test2 WHERE c == 12 OR c == 16 ORDER BY d; proj_2 +check table +1 +insert new part +insert new part +insert new part +insert new part +system.parts +all_0_0_0 1 ['proj','proj_2'] +all_1_1_0 1 ['proj','proj_2'] +all_2_2_0 1 ['proj','proj_2'] +all_3_3_0 1 ['proj','proj_2'] +select from projection 'proj', expect error: +12 +16 +used projections +SELECT c FROM test WHERE d == 12 OR d == 16 ORDER BY c; proj +select from projection 'proj_2', expect error: +12 +16 +used projections +SELECT d FROM test WHERE c == 12 OR c == 16 ORDER BY d; proj_2 +check table +1 +0 +broke data of part 'proj' (parent part: all_2_2_0) +system.parts +all_0_0_0 1 ['proj','proj_2'] +all_1_1_0 1 ['proj','proj_2'] +all_2_2_0 1 ['proj','proj_2'] +all_3_3_0 1 ['proj','proj_2'] +select from projection 'proj', expect error: proj +FILE_DOESNT_EXIST +select from projection 'proj_2', expect error: proj +12 +16 +used projections +SELECT d FROM test WHERE c == 12 OR c == 16 ORDER BY d; proj_2 +check table +0 +broken projections info +all_2_2_0 proj NO_FILE_IN_DATA_PART +BACKUP_CREATED +RESTORED +system.parts +all_0_0_0 1 ['proj','proj_2'] +all_1_1_0 1 ['proj','proj_2'] +all_2_2_0 1 ['proj','proj_2'] +all_3_3_0 1 ['proj','proj_2'] +select from projection 'proj', expect error: proj +FILE_DOESNT_EXIST +select from projection 'proj_2', expect error: proj +12 +16 +used projections +SELECT d FROM test WHERE c == 12 OR c == 16 ORDER BY d; proj_2 +check table +0 +broken projections info +all_2_2_0 proj NO_FILE_IN_DATA_PART +0 +broke all data of part 
'proj' (parent part: all_2_2_0) +system.parts +all_0_0_0 1 ['proj','proj_2'] +all_1_1_0 1 ['proj','proj_2'] +all_2_2_0 1 ['proj','proj_2'] +all_3_3_0 1 ['proj','proj_2'] +select from projection 'proj', expect error: proj +select from projection 'proj_2', expect error: proj +12 +16 +used projections +SELECT d FROM test WHERE c == 12 OR c == 16 ORDER BY d; proj_2 +check table +0 +broken projections info +all_2_2_0 proj NO_FILE_IN_DATA_PART +FILE_DOESNT_EXIST +materialize projection proj +system.parts +all_0_0_0 0 ['proj','proj_2'] +all_0_0_0_4 1 ['proj','proj_2'] +all_1_1_0 0 ['proj','proj_2'] +all_1_1_0_4 1 ['proj','proj_2'] +all_2_2_0 0 ['proj','proj_2'] +all_2_2_0_4 1 ['proj','proj_2'] +all_3_3_0 0 ['proj','proj_2'] +all_3_3_0_4 1 ['proj','proj_2'] +select from projection 'proj', expect error: proj +select from projection 'proj_2', expect error: proj +12 +16 +used projections +SELECT d FROM test WHERE c == 12 OR c == 16 ORDER BY d; proj_2 +check table +1 +broken projections info +all_2_2_0 proj NO_FILE_IN_DATA_PART +BACKUP_CREATED +RESTORED +system.parts +all_0_0_0 1 ['proj','proj_2'] +all_1_1_0 1 ['proj','proj_2'] +all_2_2_0 1 ['proj','proj_2'] +all_3_3_0 1 ['proj','proj_2'] +select from projection 'proj', expect error: proj +select from projection 'proj_2', expect error: proj +12 +16 +used projections +SELECT d FROM test WHERE c == 12 OR c == 16 ORDER BY d; proj_2 +check table +1 +0 +broke all data of part 'proj' (parent part: all_2_2_0) +system.parts +all_0_0_0 1 ['proj','proj_2'] +all_1_1_0 1 ['proj','proj_2'] +all_2_2_0 1 ['proj','proj_2'] +all_3_3_0 1 ['proj','proj_2'] +select from projection 'proj', expect error: proj +select from projection 'proj_2', expect error: proj +12 +16 +used projections +SELECT d FROM test WHERE c == 12 OR c == 16 ORDER BY d; proj_2 +check table +0 +broken projections info +all_2_2_0 proj FILE_DOESNT_EXIST +BACKUP_CREATED +RESTORED +system.parts +all_0_0_0 1 ['proj','proj_2'] +all_1_1_0 1 ['proj','proj_2'] +all_2_2_0 1 ['proj_2'] +all_3_3_0 1 ['proj','proj_2'] +select from projection 'proj', expect error: +12 +16 +used projections +SELECT c FROM test WHERE d == 12 OR d == 16 ORDER BY c; proj +select from projection 'proj_2', expect error: +12 +16 +used projections +SELECT d FROM test WHERE c == 12 OR c == 16 ORDER BY d; proj_2 +check table +0 +broken projections info diff --git a/tests/queries/0_stateless/02916_broken_projection.sh b/tests/queries/0_stateless/02916_broken_projection.sh index 80805330577..1555139e16f 100755 --- a/tests/queries/0_stateless/02916_broken_projection.sh +++ b/tests/queries/0_stateless/02916_broken_projection.sh @@ -5,35 +5,40 @@ CURDIR=$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd) # shellcheck source=../shell_config.sh . 
"$CURDIR"/../shell_config.sh -$CLICKHOUSE_CLIENT -nm -q " -DROP TABLE IF EXISTS test SYNC; -CREATE TABLE test -( - a String, - b String, - c Int32, - d Int32, - e Int32, +function create_table() +{ + test_id=$1 + name=$2 + replica=$3 + $CLICKHOUSE_CLIENT -nm -q " + DROP TABLE IF EXISTS $name SYNC; + CREATE TABLE $name + ( + a String, + b String, + c Int64, + d Int64, + e Int64, - PROJECTION proj - ( - SELECT c ORDER BY d - ), - PROJECTION proj_2 - ( - SELECT d ORDER BY c + PROJECTION proj + ( + SELECT c ORDER BY d + ), + PROJECTION proj_2 + ( + SELECT d ORDER BY c + ) ) -) -ENGINE = ReplicatedMergeTree('/test4/$CLICKHOUSE_TEST_ZOOKEEPER_PREFIX/', '1') PRIMARY KEY (a) -SETTINGS min_bytes_for_wide_part = 0, - max_parts_to_merge_at_once=3, - enable_vertical_merge_algorithm=1, - vertical_merge_algorithm_min_rows_to_activate = 1, - vertical_merge_algorithm_min_columns_to_activate = 1, - vertical_merge_algorithm_min_columns_to_activate = 1; -" - -table_uuid=$($CLICKHOUSE_CLIENT -q "SELECT uuid FROM system.tables WHERE table='test' and database=currentDatabase()") + ENGINE = ReplicatedMergeTree('/test_broken_projection_24_$test_id/$CLICKHOUSE_TEST_ZOOKEEPER_PREFIX/', '$replica') ORDER BY a + SETTINGS min_bytes_for_wide_part = 0, + max_parts_to_merge_at_once=3, + enable_vertical_merge_algorithm=1, + vertical_merge_algorithm_min_rows_to_activate = 1, + vertical_merge_algorithm_min_columns_to_activate = 1, + vertical_merge_algorithm_min_columns_to_activate = 1, + compress_primary_key=0; + " +} function random() { @@ -42,49 +47,88 @@ function random() function insert() { - offset=$1 - size=$2 + table=$1 + offset=$2 + size=$3 echo 'insert new part' - $CLICKHOUSE_CLIENT -q "INSERT INTO test SELECT number, number, number, number, number%2 FROM numbers($offset, $size);" + $CLICKHOUSE_CLIENT -q "INSERT INTO $table SELECT number, number, number, number, number%2 FROM numbers($offset, $size);" } function break_projection() { - part_name=$1 - parent_name=$2 - break_type=$3 + table=$1 + part_name=$2 + parent_name=$3 + break_type=$4 read -r part_path <<< $($CLICKHOUSE_CLIENT -nm -q " SELECT path FROM system.projection_parts - WHERE table='test' + WHERE table='$table' AND database=currentDatabase() AND active=1 AND part_name='$part_name' AND parent_name='$parent_name' + ORDER BY modification_time DESC LIMIT 1; ") + $CLICKHOUSE_CLIENT -q "select throwIf(substring('$part_path', 1, 1) != '/', 'Path is relative: $part_path')" || exit + if [ "$break_type" = "data" ] then rm "$part_path/d.bin" rm "$part_path/c.bin" echo "broke data of part '$part_name' (parent part: $parent_name)" - else + fi + if [ "$break_type" = "metadata" ] + then rm "$part_path/columns.txt" echo "broke metadata of part '$part_name' (parent part: $parent_name)" fi + if [ "$break_type" = "part" ] + then + rm -r "$part_path" + echo "broke all data of part '$part_name' (parent part: $parent_name)" + fi +} + +function break_part() +{ + table=$1 + part_name=$2 + + read -r part_path <<< $($CLICKHOUSE_CLIENT -nm -q " + SELECT path + FROM system.parts + WHERE table='$table' + AND database=currentDatabase() + AND active=1 + AND part_name='$part_name' + ORDER BY modification_time DESC + LIMIT 1; + ") + + if [ "$part_path" = "" ] + then + echo "Part path is empty" + exit + fi + + rm $part_path/columns.txt + echo "broke data of part '$part_name'" } function broken_projections_info() { + table=$1 echo 'broken projections info' $CLICKHOUSE_CLIENT -q " SELECT parent_name, name, errors.name FROM ( SELECT parent_name, name, exception_code FROM 
system.projection_parts - WHERE table='test' + WHERE table='$table' AND database=currentDatabase() AND is_broken = 1 ) AS parts_info @@ -96,18 +140,19 @@ function broken_projections_info() function check() { + table=$1 expect_broken_part="" expected_error="" - if [ $# -ne 0 ]; then - expect_broken_part=$1 - expected_error=$2 + if [ $# -gt 1 ]; then + expect_broken_part=$2 + expected_error=$3 fi echo 'system.parts' $CLICKHOUSE_CLIENT -q " SELECT name, active, projections FROM system.parts - WHERE table='test' AND database=currentDatabase() + WHERE table='$table' AND database=currentDatabase() ORDER BY name;" echo "select from projection 'proj', expect error: $expect_broken_part" @@ -117,10 +162,10 @@ function check() then $CLICKHOUSE_CLIENT --optimize_use_projections 1 --query_id $query_id -nm -q " SET send_logs_level='fatal'; -SELECT c FROM test WHERE d == 12 ORDER BY c; +SELECT c FROM $table WHERE d == 12 ORDER BY c; " 2>&1 | grep -oF "$expected_error" else - $CLICKHOUSE_CLIENT --optimize_use_projections 1 --query_id $query_id -q "SELECT c FROM test WHERE d == 12 OR d == 16 ORDER BY c;" + $CLICKHOUSE_CLIENT --optimize_use_projections 1 --query_id $query_id -q "SELECT c FROM $table WHERE d == 12 OR d == 16 ORDER BY c;" echo 'used projections' $CLICKHOUSE_CLIENT -nm -q " SYSTEM FLUSH LOGS; @@ -135,10 +180,10 @@ SELECT c FROM test WHERE d == 12 ORDER BY c; then $CLICKHOUSE_CLIENT --optimize_use_projections 1 --query_id $query_id -nm -q " SET send_logs_level='fatal'; -SELECT d FROM test WHERE c == 12 ORDER BY d; +SELECT d FROM $table WHERE c == 12 ORDER BY d; " 2>&1 | grep -oF "$expected_error" else - $CLICKHOUSE_CLIENT --optimize_use_projections 1 --query_id $query_id -q "SELECT d FROM test WHERE c == 12 OR c == 16 ORDER BY d;" + $CLICKHOUSE_CLIENT --optimize_use_projections 1 --query_id $query_id -q "SELECT d FROM $table WHERE c == 12 OR c == 16 ORDER BY d;" echo 'used projections' $CLICKHOUSE_CLIENT -nm -q " SYSTEM FLUSH LOGS; @@ -149,7 +194,7 @@ SELECT d FROM test WHERE c == 12 ORDER BY d; echo 'check table' $CLICKHOUSE_CLIENT -nm -q " SET send_logs_level='fatal'; - CHECK TABLE test;" + CHECK TABLE $table;" } function optimize() @@ -184,141 +229,274 @@ function reattach() function materialize_projection { - projection=$1 + table=$1 + projection=$2 echo "materialize projection $projection" - $CLICKHOUSE_CLIENT -q "ALTER TABLE test MATERIALIZE PROJECTION $projection SETTINGS mutations_sync=2" + $CLICKHOUSE_CLIENT -q "ALTER TABLE $table MATERIALIZE PROJECTION $projection SETTINGS mutations_sync=2" } function check_table_full() { - echo "check table full ($1)" - expect_broken_part=$1 + table=$1 + expect_broken_part=$2 + echo "check table full ($1 - $2)" if [ "$expect_broken_part" = "" ] then $CLICKHOUSE_CLIENT -nm -q " SET send_logs_level='fatal'; - CHECK TABLE test SETTINGS check_query_single_value_result = 0; + CHECK TABLE $table SETTINGS check_query_single_value_result = 0; " | grep "broken" else $CLICKHOUSE_CLIENT -nm -q " SET send_logs_level='fatal'; - CHECK TABLE test SETTINGS check_query_single_value_result = 0; + CHECK TABLE $table SETTINGS check_query_single_value_result = 0; " | grep "broken" | grep -o $expect_broken_part | head -n 1 fi } +function test1() +{ + create_table test1 test 1 -insert 0 5 + table_uuid=$($CLICKHOUSE_CLIENT -q "SELECT uuid FROM system.tables WHERE table='test' and database=currentDatabase()") -insert 5 5 + insert test 0 5 -insert 10 5 + insert test 5 5 -insert 15 5 + insert test 10 5 -check + insert test 15 5 -# Break metadata file of projection 'proj' 
-break_projection proj all_2_2_0 metadata

-# Do select and after "check table" query.
-# Select works because it does not read columns.txt.
-check
+    check test

-# Projection 'proj' from part all_2_2_0 will now appear in broken parts info
-# because it was marked broken during "check table" query.
-# TODO: try to mark it during select as well
-broken_projections_info
+    # Break metadata file of projection 'proj'
+    break_projection test proj all_2_2_0 metadata

-# Check table query will also show a list of parts which have broken projections.
-check_table_full "all_2_2_0"
+    # Do select and after "check table" query.
+    # Select works because it does not read columns.txt.
+    check test

-# Break data file of projection 'proj_2' for part all_2_2_0
-break_projection proj_2 all_2_2_0 data
+    # Projection 'proj' from part all_2_2_0 will now appear in broken parts info
+    # because it was marked broken during "check table" query.
+    # TODO: try to mark it during select as well
+    broken_projections_info test

-# It will not yet appear in broken projections info.
-broken_projections_info
+    # Check table query will also show a list of parts which have broken projections.
+    check_table_full test "all_2_2_0"

-# Select now fails with error "File doesn't exist"
-check "proj_2" FILE_DOESNT_EXIST
+    # Break data file of projection 'proj_2' for part all_2_2_0
+    break_projection test proj_2 all_2_2_0 data

-# Projection 'proj_2' from part all_2_2_0 will now appear in broken parts info.
-broken_projections_info
+    # It will not yet appear in broken projections info.
+    broken_projections_info test

-# Second select works, because projection is now marked as broken.
-check
+    # Select now fails with error "File doesn't exist"
+    check test "proj_2" FILE_DOESNT_EXIST

-# Break data file of projection 'proj_2' for part all_3_3_0
-break_projection proj_2 all_3_3_0 data
+    # Projection 'proj_2' from part all_2_2_0 will now appear in broken parts info.
+    broken_projections_info test

-# It will not yet appear in broken projections info.
-broken_projections_info
+    # Second select works, because projection is now marked as broken.
+    check test

-insert 20 5
+    # Break data file of projection 'proj_2' for part all_3_3_0
+    break_projection test proj_2 all_3_3_0 data

-insert 25 5
+    # It will not yet appear in broken projections info.
+    broken_projections_info test

-# Part all_3_3_0 has 'proj' and 'proj_2' projections, but 'proj_2' is broken and server does NOT know it yet.
-# Parts all_4_4_0 and all_5_5_0 have both non-broken projections.
-# So a merge will be create for future part all_3_5_1.
-# During merge it will fail to read from 'proj_2' of part all_3_3_0 and proj_2 will be marked broken.
-# Merge will be retried and on second attempt it will succeed.
-# The result part all_3_5_1 will have only 1 projection - 'proj', because
-# it will skip 'proj_2' as it will see that one part does not have it anymore in the set of valid projections.
-optimize 0 1
-sleep 2
+    insert test 20 5

-$CLICKHOUSE_CLIENT -nm -q "
-SYSTEM FLUSH LOGS;
-SELECT count() FROM system.text_log
-WHERE level='Error'
-AND logger_name='MergeTreeBackgroundExecutor'
-AND message like 'Exception while executing background task {$table_uuid:all_3_5_1}%Cannot open file%proj_2.proj/c.bin%'
-"
+    insert test 25 5

-# Projection 'proj_2' from part all_2_2_0 will now appear in broken parts info.
-broken_projections_info
+    # Part all_3_3_0 has 'proj' and 'proj_2' projections, but 'proj_2' is broken and server does NOT know it yet.
+    # Parts all_4_4_0 and all_5_5_0 have both non-broken projections.
+    # So a merge will be created for future part all_3_5_1.
+ # During merge it will fail to read from 'proj_2' of part all_3_3_0 and proj_2 will be marked broken. + # Merge will be retried and on second attempt it will succeed. + # The result part all_3_5_1 will have only 1 projection - 'proj', because + # it will skip 'proj_2' as it will see that one part does not have it anymore in the set of valid projections. + optimize 0 1 + sleep 2 -# Projection 'proj_2' from part all_2_2_0 will now appear in broken parts info. -broken_projections_info + $CLICKHOUSE_CLIENT -nm -q " + SYSTEM FLUSH LOGS; + SELECT count() FROM system.text_log + WHERE level='Error' + AND logger_name='MergeTreeBackgroundExecutor' + AND message like 'Exception while executing background task {$table_uuid:all_3_5_1}%Cannot open file%proj_2.proj/c.bin%' + " -check + # Projection 'proj_2' from part all_2_2_0 will now appear in broken parts info. + broken_projections_info test -break_projection proj all_1_1_0 metadata + check test -reattach + break_projection test proj all_1_1_0 metadata -broken_projections_info + reattach -break_projection proj_2 all_1_1_0 data + broken_projections_info test -reattach + break_projection test proj_2 all_1_1_0 data -broken_projections_info + reattach -check + broken_projections_info test -check_table_full all_1_1_0 + check test -materialize_projection proj + check_table_full test all_1_1_0 -check_table_full + materialize_projection test proj -check + check_table_full test -materialize_projection proj_2 + check test -check_table_full + materialize_projection test proj_2 -break_projection proj all_3_5_1_7 data + check_table_full test -insert 30 5 + break_projection test proj all_3_5_1_7 data -optimize 1 0 + insert test 30 5 -insert 35 5 + optimize 1 0 -optimize 1 0 + insert test 35 5 -check + optimize 1 0 -$CLICKHOUSE_CLIENT -nm -q " -DROP TABLE test SYNC; -" + check test +} + +function test2() +{ + create_table test2 test2 1 + + insert test2 0 5 + + insert test2 5 5 + + insert test 10 5 + + insert test 15 5 + + check test2 + + create_table test2 test2_replica 2 + + check test2_replica + + break_projection test2 proj all_0_0_0 data + + check_table_full test2 all_0_0_0 + + check test2 + + break_part test2 all_0_0_0 + + check_table_full test2 all_0_0_0 + + check test2 + + $CLICKHOUSE_CLIENT -q "SYSTEM SYNC REPLICA test2;" + + check test2 +} + +CLICKHOUSE_TEST_UNIQUE_NAME="gghhhhhhhhhhhhhhhhhhh" +function test3() +{ + create_table test3 test 1 + + insert test 0 5 + + insert test 5 5 + + insert test 10 5 + + insert test 15 5 + + check test + + break_projection test proj all_2_2_0 data + + check test proj FILE_DOESNT_EXIST + + broken_projections_info test + + ${CLICKHOUSE_CLIENT} -nm --query " + backup table ${CLICKHOUSE_DATABASE}.test to Disk('backups', '${CLICKHOUSE_TEST_UNIQUE_NAME}') settings check_projection_parts=false; + " | grep -o "BACKUP_CREATED" + + ${CLICKHOUSE_CLIENT} -nm --stacktrace --query " + drop table test sync; + restore table ${CLICKHOUSE_DATABASE}.test from Disk('backups', '${CLICKHOUSE_TEST_UNIQUE_NAME}'); + " | grep -o "RESTORED" + + check test proj FILE_DOESNT_EXIST + + broken_projections_info test + + break_projection test proj all_2_2_0 part + + check test proj + + broken_projections_info test + + ${CLICKHOUSE_CLIENT} -nm --query " + set send_logs_level='fatal'; + backup table ${CLICKHOUSE_DATABASE}.test to Disk('backups', '${CLICKHOUSE_TEST_UNIQUE_NAME}_2') + " 2>&1 | grep -o "FILE_DOESNT_EXIST" + + materialize_projection test proj + + check test proj + + broken_projections_info test + + ${CLICKHOUSE_CLIENT} -nm --query " + backup 
table ${CLICKHOUSE_DATABASE}.test to Disk('backups', '${CLICKHOUSE_TEST_UNIQUE_NAME}_3') + " | grep -o "BACKUP_CREATED" + + ${CLICKHOUSE_CLIENT} -nm --stacktrace --query " + drop table test sync; + restore table ${CLICKHOUSE_DATABASE}.test from Disk('backups', '${CLICKHOUSE_TEST_UNIQUE_NAME}_3'); + " | grep -o "RESTORED" + + check test proj + + break_projection test proj all_2_2_0 part + + check test proj FILE_DOESNT_EXIST + + broken_projections_info test + + ${CLICKHOUSE_CLIENT} -nm --query " + backup table ${CLICKHOUSE_DATABASE}.test to Disk('backups', '${CLICKHOUSE_TEST_UNIQUE_NAME}_2') + settings check_projection_parts=false, allow_backup_broken_projections=true; + " | grep -o "BACKUP_CREATED" + + ${CLICKHOUSE_CLIENT} -nm --stacktrace --query " + drop table test sync; + restore table ${CLICKHOUSE_DATABASE}.test from Disk('backups', '${CLICKHOUSE_TEST_UNIQUE_NAME}_2'); + " | grep -o "RESTORED" + + check test + + broken_projections_info test +} + +test1 +test2 +test3 + + +#$CLICKHOUSE_CLIENT -nm -q " +#DROP TABLE test SYNC; +#DROP TABLE test2 SYNC; +#DROP TABLE test2_replica SYNC; +#" From 6632589d72ed270626e012c86a78a8f0c8411fb3 Mon Sep 17 00:00:00 2001 From: kssenii Date: Thu, 30 Nov 2023 13:54:22 +0100 Subject: [PATCH 012/245] Review fix --- src/Storages/MergeTree/MergeTreeData.cpp | 6 +++--- tests/queries/0_stateless/02916_broken_projection.sh | 10 +++++----- 2 files changed, 8 insertions(+), 8 deletions(-) diff --git a/src/Storages/MergeTree/MergeTreeData.cpp b/src/Storages/MergeTree/MergeTreeData.cpp index c95aee88aee..1ba4153bc3e 100644 --- a/src/Storages/MergeTree/MergeTreeData.cpp +++ b/src/Storages/MergeTree/MergeTreeData.cpp @@ -7669,7 +7669,7 @@ MovePartsOutcome MergeTreeData::moveParts(const CurrentlyMovingPartsTaggerPtr & bool MergeTreeData::partsContainSameProjections(const DataPartPtr & left, const DataPartPtr & right, String & out_reason) { - auto remove_broken_parts = [](auto & parts) + auto remove_broken_parts_from_consideration = [](auto & parts) { std::set broken_projection_parts; for (const auto & [name, part] : parts) @@ -7684,8 +7684,8 @@ bool MergeTreeData::partsContainSameProjections(const DataPartPtr & left, const auto left_projection_parts = left->getProjectionParts(); auto right_projection_parts = right->getProjectionParts(); - remove_broken_parts(left_projection_parts); - remove_broken_parts(right_projection_parts); + remove_broken_parts_from_consideration(left_projection_parts); + remove_broken_parts_from_consideration(right_projection_parts); if (left_projection_parts.size() != right_projection_parts.size()) { diff --git a/tests/queries/0_stateless/02916_broken_projection.sh b/tests/queries/0_stateless/02916_broken_projection.sh index 1555139e16f..60b21216d1a 100755 --- a/tests/queries/0_stateless/02916_broken_projection.sh +++ b/tests/queries/0_stateless/02916_broken_projection.sh @@ -495,8 +495,8 @@ test2 test3 -#$CLICKHOUSE_CLIENT -nm -q " -#DROP TABLE test SYNC; -#DROP TABLE test2 SYNC; -#DROP TABLE test2_replica SYNC; -#" +$CLICKHOUSE_CLIENT -nm -q " +DROP TABLE IF EXISTS test SYNC; +DROP TABLE IF EXISTS test2 SYNC; +DROP TABLE IF EXISTS test2_replica SYNC; +" From caf4dc7e14e594da3c254822b345b79c57e76d19 Mon Sep 17 00:00:00 2001 From: Kseniia Sumarokova <54203879+kssenii@users.noreply.github.com> Date: Fri, 1 Dec 2023 12:21:47 +0100 Subject: [PATCH 013/245] Fix style check --- src/Common/ErrorCodes.cpp | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/Common/ErrorCodes.cpp b/src/Common/ErrorCodes.cpp index b0ed754536d..57aa82f3639 
100644 --- a/src/Common/ErrorCodes.cpp +++ b/src/Common/ErrorCodes.cpp @@ -587,7 +587,7 @@ M(705, TABLE_NOT_EMPTY) \ M(706, LIBSSH_ERROR) \ M(707, GCP_ERROR) \ - M(708, ILLEGAL_STATISTIC) \ + M(708, ILLEGAL_STATISTIC) \ M(709, BROKEN_PROJECTION) \ \ M(999, KEEPER_EXCEPTION) \ From f609c44eb83fc769ba9e8fc5875bbc10e3e17b9b Mon Sep 17 00:00:00 2001 From: Kseniia Sumarokova <54203879+kssenii@users.noreply.github.com> Date: Fri, 1 Dec 2023 13:38:28 +0100 Subject: [PATCH 014/245] Update 02916_broken_projection.sh --- tests/queries/0_stateless/02916_broken_projection.sh | 4 ++++ 1 file changed, 4 insertions(+) diff --git a/tests/queries/0_stateless/02916_broken_projection.sh b/tests/queries/0_stateless/02916_broken_projection.sh index 60b21216d1a..bd141d1a122 100755 --- a/tests/queries/0_stateless/02916_broken_projection.sh +++ b/tests/queries/0_stateless/02916_broken_projection.sh @@ -459,11 +459,13 @@ function test3() broken_projections_info test ${CLICKHOUSE_CLIENT} -nm --query " + set send_logs_level='fatal'; backup table ${CLICKHOUSE_DATABASE}.test to Disk('backups', '${CLICKHOUSE_TEST_UNIQUE_NAME}_3') " | grep -o "BACKUP_CREATED" ${CLICKHOUSE_CLIENT} -nm --stacktrace --query " drop table test sync; + set send_logs_level='fatal'; restore table ${CLICKHOUSE_DATABASE}.test from Disk('backups', '${CLICKHOUSE_TEST_UNIQUE_NAME}_3'); " | grep -o "RESTORED" @@ -476,12 +478,14 @@ function test3() broken_projections_info test ${CLICKHOUSE_CLIENT} -nm --query " + set send_logs_level='fatal'; backup table ${CLICKHOUSE_DATABASE}.test to Disk('backups', '${CLICKHOUSE_TEST_UNIQUE_NAME}_2') settings check_projection_parts=false, allow_backup_broken_projections=true; " | grep -o "BACKUP_CREATED" ${CLICKHOUSE_CLIENT} -nm --stacktrace --query " drop table test sync; + set send_logs_level='fatal'; restore table ${CLICKHOUSE_DATABASE}.test from Disk('backups', '${CLICKHOUSE_TEST_UNIQUE_NAME}_2'); " | grep -o "RESTORED" From 2b903003b4795eb3768fec3f84ec8321fa5485f6 Mon Sep 17 00:00:00 2001 From: kssenii Date: Mon, 4 Dec 2023 13:21:18 +0100 Subject: [PATCH 015/245] Update reference --- .../0_stateless/02916_broken_projection.reference | 8 +++----- tests/queries/0_stateless/02916_broken_projection.sh | 9 ++++----- 2 files changed, 7 insertions(+), 10 deletions(-) diff --git a/tests/queries/0_stateless/02916_broken_projection.reference b/tests/queries/0_stateless/02916_broken_projection.reference index 4c4901ae99f..acd1b87eb30 100644 --- a/tests/queries/0_stateless/02916_broken_projection.reference +++ b/tests/queries/0_stateless/02916_broken_projection.reference @@ -332,16 +332,14 @@ all_1_1_0 1 ['proj','proj_2'] all_2_2_0 1 ['proj','proj_2'] all_3_3_0 1 ['proj','proj_2'] select from projection 'proj', expect error: proj -FILE_DOESNT_EXIST select from projection 'proj_2', expect error: proj 12 16 used projections SELECT d FROM test WHERE c == 12 OR c == 16 ORDER BY d; proj_2 check table -0 +1 broken projections info -all_2_2_0 proj NO_FILE_IN_DATA_PART 0 broke all data of part 'proj' (parent part: all_2_2_0) system.parts @@ -358,7 +356,7 @@ SELECT d FROM test WHERE c == 12 OR c == 16 ORDER BY d; proj_2 check table 0 broken projections info -all_2_2_0 proj NO_FILE_IN_DATA_PART +all_2_2_0 proj FILE_DOESNT_EXIST FILE_DOESNT_EXIST materialize projection proj system.parts @@ -379,7 +377,7 @@ SELECT d FROM test WHERE c == 12 OR c == 16 ORDER BY d; proj_2 check table 1 broken projections info -all_2_2_0 proj NO_FILE_IN_DATA_PART +all_2_2_0 proj FILE_DOESNT_EXIST BACKUP_CREATED RESTORED system.parts diff --git 
a/tests/queries/0_stateless/02916_broken_projection.sh b/tests/queries/0_stateless/02916_broken_projection.sh index bd141d1a122..7315cf5ce61 100755 --- a/tests/queries/0_stateless/02916_broken_projection.sh +++ b/tests/queries/0_stateless/02916_broken_projection.sh @@ -29,7 +29,7 @@ function create_table() SELECT d ORDER BY c ) ) - ENGINE = ReplicatedMergeTree('/test_broken_projection_24_$test_id/$CLICKHOUSE_TEST_ZOOKEEPER_PREFIX/', '$replica') ORDER BY a + ENGINE = ReplicatedMergeTree('/test_broken_projection_$test_id/$CLICKHOUSE_TEST_ZOOKEEPER_PREFIX/', '$replica') ORDER BY a SETTINGS min_bytes_for_wide_part = 0, max_parts_to_merge_at_once=3, enable_vertical_merge_algorithm=1, @@ -407,7 +407,6 @@ function test2() check test2 } -CLICKHOUSE_TEST_UNIQUE_NAME="gghhhhhhhhhhhhhhhhhhh" function test3() { create_table test3 test 1 @@ -437,7 +436,7 @@ function test3() restore table ${CLICKHOUSE_DATABASE}.test from Disk('backups', '${CLICKHOUSE_TEST_UNIQUE_NAME}'); " | grep -o "RESTORED" - check test proj FILE_DOESNT_EXIST + check test proj broken_projections_info test @@ -479,14 +478,14 @@ function test3() ${CLICKHOUSE_CLIENT} -nm --query " set send_logs_level='fatal'; - backup table ${CLICKHOUSE_DATABASE}.test to Disk('backups', '${CLICKHOUSE_TEST_UNIQUE_NAME}_2') + backup table ${CLICKHOUSE_DATABASE}.test to Disk('backups', '${CLICKHOUSE_TEST_UNIQUE_NAME}_4') settings check_projection_parts=false, allow_backup_broken_projections=true; " | grep -o "BACKUP_CREATED" ${CLICKHOUSE_CLIENT} -nm --stacktrace --query " drop table test sync; set send_logs_level='fatal'; - restore table ${CLICKHOUSE_DATABASE}.test from Disk('backups', '${CLICKHOUSE_TEST_UNIQUE_NAME}_2'); + restore table ${CLICKHOUSE_DATABASE}.test from Disk('backups', '${CLICKHOUSE_TEST_UNIQUE_NAME}_4'); " | grep -o "RESTORED" check test From b77a6073aea98c7c5f5fcc28492a34e801d11b6b Mon Sep 17 00:00:00 2001 From: kssenii Date: Mon, 4 Dec 2023 16:57:09 +0100 Subject: [PATCH 016/245] Fix test --- .../02916_broken_projection.reference | 90 +++++++++++-------- .../0_stateless/02916_broken_projection.sh | 36 ++++---- 2 files changed, 70 insertions(+), 56 deletions(-) diff --git a/tests/queries/0_stateless/02916_broken_projection.reference b/tests/queries/0_stateless/02916_broken_projection.reference index acd1b87eb30..b7764a6434e 100644 --- a/tests/queries/0_stateless/02916_broken_projection.reference +++ b/tests/queries/0_stateless/02916_broken_projection.reference @@ -7,12 +7,12 @@ all_0_0_0 1 ['proj','proj_2'] all_1_1_0 1 ['proj','proj_2'] all_2_2_0 1 ['proj','proj_2'] all_3_3_0 1 ['proj','proj_2'] -select from projection 'proj', expect error: +select from projection 'proj' 12 16 used projections SELECT c FROM test WHERE d == 12 OR d == 16 ORDER BY c; proj -select from projection 'proj_2', expect error: +select from projection 'proj_2' 12 16 used projections @@ -26,12 +26,12 @@ all_0_0_0 1 ['proj','proj_2'] all_1_1_0 1 ['proj','proj_2'] all_2_2_0 1 ['proj','proj_2'] all_3_3_0 1 ['proj','proj_2'] -select from projection 'proj', expect error: +select from projection 'proj' 12 16 used projections SELECT c FROM test WHERE d == 12 OR d == 16 ORDER BY c; proj -select from projection 'proj_2', expect error: +select from projection 'proj_2' 12 16 used projections @@ -51,7 +51,7 @@ all_0_0_0 1 ['proj','proj_2'] all_1_1_0 1 ['proj','proj_2'] all_2_2_0 1 ['proj','proj_2'] all_3_3_0 1 ['proj','proj_2'] -select from projection 'proj', expect error: proj_2 +select from projection 'proj' 12 16 used projections @@ -68,12 +68,12 @@ all_0_0_0 1 
['proj','proj_2'] all_1_1_0 1 ['proj','proj_2'] all_2_2_0 1 ['proj','proj_2'] all_3_3_0 1 ['proj','proj_2'] -select from projection 'proj', expect error: +select from projection 'proj' 12 16 used projections SELECT c FROM test WHERE d == 12 OR d == 16 ORDER BY c; proj -select from projection 'proj_2', expect error: +select from projection 'proj_2' 12 16 used projections @@ -102,12 +102,12 @@ all_3_3_0 0 ['proj','proj_2'] all_3_5_1 1 ['proj'] all_4_4_0 0 ['proj','proj_2'] all_5_5_0 0 ['proj','proj_2'] -select from projection 'proj', expect error: +select from projection 'proj' 12 16 used projections SELECT c FROM test WHERE d == 12 OR d == 16 ORDER BY c; proj -select from projection 'proj_2', expect error: +select from projection 'proj_2' 12 16 used projections @@ -139,12 +139,12 @@ all_3_3_0 0 ['proj','proj_2'] all_3_5_1 1 ['proj'] all_4_4_0 0 ['proj','proj_2'] all_5_5_0 0 ['proj','proj_2'] -select from projection 'proj', expect error: +select from projection 'proj' 12 16 used projections SELECT c FROM test WHERE d == 12 OR d == 16 ORDER BY c; proj -select from projection 'proj_2', expect error: +select from projection 'proj_2' 12 16 used projections @@ -167,12 +167,12 @@ all_3_5_1 0 ['proj'] all_3_5_1_6 1 ['proj'] all_4_4_0 0 ['proj','proj_2'] all_5_5_0 0 ['proj','proj_2'] -select from projection 'proj', expect error: +select from projection 'proj' 12 16 used projections SELECT c FROM test WHERE d == 12 OR d == 16 ORDER BY c; proj -select from projection 'proj_2', expect error: +select from projection 'proj_2' 12 16 used projections @@ -208,12 +208,12 @@ all_4_4_0 0 ['proj','proj_2'] all_5_5_0 0 ['proj','proj_2'] all_8_8_0 0 ['proj','proj_2'] all_9_9_0 1 ['proj','proj_2'] -select from projection 'proj', expect error: +select from projection 'proj' 12 16 used projections SELECT c FROM test WHERE d == 12 OR d == 16 ORDER BY c; proj_2 -select from projection 'proj_2', expect error: +select from projection 'proj_2' 12 16 used projections @@ -227,20 +227,19 @@ insert new part system.parts all_0_0_0 1 ['proj','proj_2'] all_1_1_0 1 ['proj','proj_2'] -select from projection 'proj', expect error: +select from projection 'proj' used projections SELECT c FROM test2 WHERE d == 12 OR d == 16 ORDER BY c; proj -select from projection 'proj_2', expect error: +select from projection 'proj_2' used projections SELECT d FROM test2 WHERE c == 12 OR c == 16 ORDER BY d; proj_2 check table 1 system.parts -all_1_1_0 1 ['proj','proj_2'] -select from projection 'proj', expect error: +select from projection 'proj' used projections SELECT c FROM test2_replica WHERE d == 12 OR d == 16 ORDER BY c; proj -select from projection 'proj_2', expect error: +select from projection 'proj_2' used projections SELECT d FROM test2_replica WHERE c == 12 OR c == 16 ORDER BY d; proj_2 check table @@ -252,10 +251,10 @@ all_0_0_0 system.parts all_0_0_0 1 ['proj','proj_2'] all_1_1_0 1 ['proj','proj_2'] -select from projection 'proj', expect error: +select from projection 'proj' used projections SELECT c FROM test2 WHERE d == 12 OR d == 16 ORDER BY c; proj -select from projection 'proj_2', expect error: +select from projection 'proj_2' used projections SELECT d FROM test2 WHERE c == 12 OR c == 16 ORDER BY d; proj_2 check table @@ -266,10 +265,10 @@ all_0_0_0 system.parts all_0_0_0 0 ['proj','proj_2'] all_1_1_0 1 ['proj','proj_2'] -select from projection 'proj', expect error: +select from projection 'proj' used projections SELECT c FROM test2 WHERE d == 12 OR d == 16 ORDER BY c; proj -select from projection 'proj_2', expect error: +select 
from projection 'proj_2' used projections SELECT d FROM test2 WHERE c == 12 OR c == 16 ORDER BY d; proj_2 check table @@ -277,10 +276,10 @@ check table system.parts all_0_0_0 1 ['proj','proj_2'] all_1_1_0 1 ['proj','proj_2'] -select from projection 'proj', expect error: +select from projection 'proj' used projections SELECT c FROM test2 WHERE d == 12 OR d == 16 ORDER BY c; proj -select from projection 'proj_2', expect error: +select from projection 'proj_2' used projections SELECT d FROM test2 WHERE c == 12 OR c == 16 ORDER BY d; proj_2 check table @@ -294,12 +293,12 @@ all_0_0_0 1 ['proj','proj_2'] all_1_1_0 1 ['proj','proj_2'] all_2_2_0 1 ['proj','proj_2'] all_3_3_0 1 ['proj','proj_2'] -select from projection 'proj', expect error: +select from projection 'proj' 12 16 used projections SELECT c FROM test WHERE d == 12 OR d == 16 ORDER BY c; proj -select from projection 'proj_2', expect error: +select from projection 'proj_2' 12 16 used projections @@ -315,7 +314,7 @@ all_2_2_0 1 ['proj','proj_2'] all_3_3_0 1 ['proj','proj_2'] select from projection 'proj', expect error: proj FILE_DOESNT_EXIST -select from projection 'proj_2', expect error: proj +select from projection 'proj_2' 12 16 used projections @@ -331,8 +330,12 @@ all_0_0_0 1 ['proj','proj_2'] all_1_1_0 1 ['proj','proj_2'] all_2_2_0 1 ['proj','proj_2'] all_3_3_0 1 ['proj','proj_2'] -select from projection 'proj', expect error: proj -select from projection 'proj_2', expect error: proj +select from projection 'proj' +12 +16 +used projections +SELECT c FROM test WHERE d == 12 OR d == 16 ORDER BY c; proj +select from projection 'proj_2' 12 16 used projections @@ -348,7 +351,8 @@ all_1_1_0 1 ['proj','proj_2'] all_2_2_0 1 ['proj','proj_2'] all_3_3_0 1 ['proj','proj_2'] select from projection 'proj', expect error: proj -select from projection 'proj_2', expect error: proj +STD_EXCEPTION +select from projection 'proj_2' 12 16 used projections @@ -368,8 +372,12 @@ all_2_2_0 0 ['proj','proj_2'] all_2_2_0_4 1 ['proj','proj_2'] all_3_3_0 0 ['proj','proj_2'] all_3_3_0_4 1 ['proj','proj_2'] -select from projection 'proj', expect error: proj -select from projection 'proj_2', expect error: proj +select from projection 'proj' +12 +16 +used projections +SELECT c FROM test WHERE d == 12 OR d == 16 ORDER BY c; proj +select from projection 'proj_2' 12 16 used projections @@ -385,8 +393,12 @@ all_0_0_0 1 ['proj','proj_2'] all_1_1_0 1 ['proj','proj_2'] all_2_2_0 1 ['proj','proj_2'] all_3_3_0 1 ['proj','proj_2'] -select from projection 'proj', expect error: proj -select from projection 'proj_2', expect error: proj +select from projection 'proj' +12 +16 +used projections +SELECT c FROM test WHERE d == 12 OR d == 16 ORDER BY c; proj +select from projection 'proj_2' 12 16 used projections @@ -401,7 +413,7 @@ all_1_1_0 1 ['proj','proj_2'] all_2_2_0 1 ['proj','proj_2'] all_3_3_0 1 ['proj','proj_2'] select from projection 'proj', expect error: proj -select from projection 'proj_2', expect error: proj +select from projection 'proj_2' 12 16 used projections @@ -417,12 +429,12 @@ all_0_0_0 1 ['proj','proj_2'] all_1_1_0 1 ['proj','proj_2'] all_2_2_0 1 ['proj_2'] all_3_3_0 1 ['proj','proj_2'] -select from projection 'proj', expect error: +select from projection 'proj' 12 16 used projections SELECT c FROM test WHERE d == 12 OR d == 16 ORDER BY c; proj -select from projection 'proj_2', expect error: +select from projection 'proj_2' 12 16 used projections diff --git a/tests/queries/0_stateless/02916_broken_projection.sh 
b/tests/queries/0_stateless/02916_broken_projection.sh index 7315cf5ce61..eeea512f14a 100755 --- a/tests/queries/0_stateless/02916_broken_projection.sh +++ b/tests/queries/0_stateless/02916_broken_projection.sh @@ -29,7 +29,7 @@ function create_table() SELECT d ORDER BY c ) ) - ENGINE = ReplicatedMergeTree('/test_broken_projection_$test_id/$CLICKHOUSE_TEST_ZOOKEEPER_PREFIX/', '$replica') ORDER BY a + ENGINE = ReplicatedMergeTree('/test_broken_projection_32_$test_id/$CLICKHOUSE_TEST_ZOOKEEPER_PREFIX/', '$replica') ORDER BY a SETTINGS min_bytes_for_wide_part = 0, max_parts_to_merge_at_once=3, enable_vertical_merge_algorithm=1, @@ -155,39 +155,41 @@ function check() WHERE table='$table' AND database=currentDatabase() ORDER BY name;" - echo "select from projection 'proj', expect error: $expect_broken_part" query_id=$(random 8) if [ "$expect_broken_part" = "proj" ] then + echo "select from projection 'proj', expect error: $expect_broken_part" $CLICKHOUSE_CLIENT --optimize_use_projections 1 --query_id $query_id -nm -q " -SET send_logs_level='fatal'; -SELECT c FROM $table WHERE d == 12 ORDER BY c; -" 2>&1 | grep -oF "$expected_error" + SET send_logs_level='fatal'; + SELECT c FROM $table WHERE d == 12 ORDER BY c; + " 2>&1 | grep -oF "$expected_error" else + echo "select from projection 'proj'" $CLICKHOUSE_CLIENT --optimize_use_projections 1 --query_id $query_id -q "SELECT c FROM $table WHERE d == 12 OR d == 16 ORDER BY c;" echo 'used projections' $CLICKHOUSE_CLIENT -nm -q " - SYSTEM FLUSH LOGS; - SELECT query, splitByChar('.', arrayJoin(projections))[-1] FROM system.query_log WHERE current_database=currentDatabase() AND query_id='$query_id' AND type='QueryFinish' + SYSTEM FLUSH LOGS; + SELECT query, splitByChar('.', arrayJoin(projections))[-1] FROM system.query_log WHERE current_database=currentDatabase() AND query_id='$query_id' AND type='QueryFinish' " fi - echo "select from projection 'proj_2', expect error: $expect_broken_part" query_id=$(random 8) if [ "$expect_broken_part" = "proj_2" ] then + echo "select from projection 'proj_2', expect error: $expect_broken_part" $CLICKHOUSE_CLIENT --optimize_use_projections 1 --query_id $query_id -nm -q " -SET send_logs_level='fatal'; -SELECT d FROM $table WHERE c == 12 ORDER BY d; -" 2>&1 | grep -oF "$expected_error" + SET send_logs_level='fatal'; + SELECT d FROM $table WHERE c == 12 ORDER BY d; + " 2>&1 | grep -oF "$expected_error" else + echo "select from projection 'proj_2'" $CLICKHOUSE_CLIENT --optimize_use_projections 1 --query_id $query_id -q "SELECT d FROM $table WHERE c == 12 OR c == 16 ORDER BY d;" echo 'used projections' $CLICKHOUSE_CLIENT -nm -q " - SYSTEM FLUSH LOGS; - SELECT query, splitByChar('.', arrayJoin(projections))[-1] FROM system.query_log WHERE current_database=currentDatabase() AND query_id='$query_id' AND type='QueryFinish' + SYSTEM FLUSH LOGS; + SELECT query, splitByChar('.', arrayJoin(projections))[-1] FROM system.query_log WHERE current_database=currentDatabase() AND query_id='$query_id' AND type='QueryFinish' " fi @@ -436,13 +438,13 @@ function test3() restore table ${CLICKHOUSE_DATABASE}.test from Disk('backups', '${CLICKHOUSE_TEST_UNIQUE_NAME}'); " | grep -o "RESTORED" - check test proj + check test broken_projections_info test break_projection test proj all_2_2_0 part - check test proj + check test proj STD_EXCEPTION broken_projections_info test @@ -453,7 +455,7 @@ function test3() materialize_projection test proj - check test proj + check test broken_projections_info test @@ -468,7 +470,7 @@ function test3() restore table 
${CLICKHOUSE_DATABASE}.test from Disk('backups', '${CLICKHOUSE_TEST_UNIQUE_NAME}_3');
     " | grep -o "RESTORED"

-    check test proj
+    check test

     break_projection test proj all_2_2_0 part

From 4de048904a3cbb6ff30e20b5a8defd1564f2e722 Mon Sep 17 00:00:00 2001
From: Kseniia Sumarokova <54203879+kssenii@users.noreply.github.com>
Date: Mon, 4 Dec 2023 19:14:06 +0100
Subject: [PATCH 017/245] Update 02916_broken_projection.sh

---
 tests/queries/0_stateless/02916_broken_projection.sh | 1 +
 1 file changed, 1 insertion(+)

diff --git a/tests/queries/0_stateless/02916_broken_projection.sh b/tests/queries/0_stateless/02916_broken_projection.sh
index eeea512f14a..261342da103 100755
--- a/tests/queries/0_stateless/02916_broken_projection.sh
+++ b/tests/queries/0_stateless/02916_broken_projection.sh
@@ -1,4 +1,5 @@
 #!/usr/bin/env bash
+# Tags: no-random-merge-tree-settings
 # shellcheck disable=SC2046

 CURDIR=$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd)

From d3b80ac60cdb1fa17fb8907a7a6f11afde759bab Mon Sep 17 00:00:00 2001
From: Kseniia Sumarokova <54203879+kssenii@users.noreply.github.com>
Date: Mon, 4 Dec 2023 19:14:55 +0100
Subject: [PATCH 018/245] Update 02916_broken_projection.sh

---
 tests/queries/0_stateless/02916_broken_projection.sh | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/tests/queries/0_stateless/02916_broken_projection.sh b/tests/queries/0_stateless/02916_broken_projection.sh
index 261342da103..0910ba177fb 100755
--- a/tests/queries/0_stateless/02916_broken_projection.sh
+++ b/tests/queries/0_stateless/02916_broken_projection.sh
@@ -1,5 +1,5 @@
 #!/usr/bin/env bash
-# Tags: no-random-merge-tree-settings
+# Tags: long, no-random-merge-tree-settings
 # shellcheck disable=SC2046

 CURDIR=$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd)

From 23bde28ac4fc18e296daf6b04283ab50ee58d025 Mon Sep 17 00:00:00 2001
From: kssenii
Date: Fri, 8 Dec 2023 19:11:47 +0100
Subject: [PATCH 019/245] Fix

---
 src/Storages/MergeTree/IMergeTreeDataPart.cpp | 6 ++----
 src/Storages/MergeTree/IMergeTreeDataPart.h   | 2 +-
 src/Storages/MergeTree/MutateTask.cpp         | 6 ++++--
 3 files changed, 7 insertions(+), 7 deletions(-)

diff --git a/src/Storages/MergeTree/IMergeTreeDataPart.cpp b/src/Storages/MergeTree/IMergeTreeDataPart.cpp
index 52310802c9d..5418bcd83f3 100644
--- a/src/Storages/MergeTree/IMergeTreeDataPart.cpp
+++ b/src/Storages/MergeTree/IMergeTreeDataPart.cpp
@@ -677,7 +677,7 @@ void IMergeTreeDataPart::loadColumnsChecksumsIndexes(bool require_columns_checks
     if (!parent_part)
     {
         loadTTLInfos();
-        has_broken_projections = !loadProjections(require_columns_checksums, check_consistency, false /* if_not_loaded */);
+        loadProjections(require_columns_checksums, check_consistency, has_broken_projections, false /* if_not_loaded */);
     }

     if (check_consistency && !has_broken_projections)
@@ -742,10 +742,9 @@ void IMergeTreeDataPart::addProjectionPart(
     projection_parts[projection_name] = std::move(projection_part);
 }

-bool IMergeTreeDataPart::loadProjections(bool require_columns_checksums, bool check_consistency, bool if_not_loaded)
+void IMergeTreeDataPart::loadProjections(bool require_columns_checksums, bool check_consistency, bool & has_broken_projection, bool if_not_loaded)
 {
     auto metadata_snapshot = storage.getInMemoryMetadataPtr();
-    bool has_broken_projection = false;
     for (const auto & projection : metadata_snapshot->projections)
     {
         auto path = projection.name + ".proj";
@@ -782,7 +781,6 @@ bool IMergeTreeDataPart::loadProjections(bool require_columns_checksums, bool ch
         }
     }
-    return has_broken_projection;
 }

 void IMergeTreeDataPart::loadIndexGranularity()

diff --git a/src/Storages/MergeTree/IMergeTreeDataPart.h b/src/Storages/MergeTree/IMergeTreeDataPart.h
index 050bd76121c..9812529086b 100644
--- a/src/Storages/MergeTree/IMergeTreeDataPart.h
+++ b/src/Storages/MergeTree/IMergeTreeDataPart.h
@@ -434,7 +434,7 @@ public:
     bool hasBrokenProjection(const String & projection_name) const;

-    /// Return true, if all projections were loaded successfully and none was marked as broken.
-    bool loadProjections(bool require_columns_checksums, bool check_consistency, bool if_not_loaded = false);
+    /// Load projections and report broken ones through `has_broken_projection` instead of a return value.
+    void loadProjections(bool require_columns_checksums, bool check_consistency, bool & has_broken_projection, bool if_not_loaded = false);

     void setBrokenReason(const String & message, int code) const;

diff --git a/src/Storages/MergeTree/MutateTask.cpp b/src/Storages/MergeTree/MutateTask.cpp
index 93b241deae7..2b0cf60a7f1 100644
--- a/src/Storages/MergeTree/MutateTask.cpp
+++ b/src/Storages/MergeTree/MutateTask.cpp
@@ -870,7 +870,8 @@ void finalizeMutatedPart(
     new_data_part->modification_time = time(nullptr);

     /// Load rest projections which are hardlinked
-    new_data_part->loadProjections(false, false, true /* if_not_loaded */);
+    bool noop;
+    new_data_part->loadProjections(false, false, noop, true /* if_not_loaded */);

     /// All information about sizes is stored in checksums.
     /// It doesn't make sense to touch filesystem for sizes.
@@ -1570,8 +1571,9 @@ private:

     void finalize()
     {
+        bool noop;
         ctx->new_data_part->minmax_idx = std::move(ctx->minmax_idx);
-        ctx->new_data_part->loadProjections(false, false, true /* if_not_loaded */);
+        ctx->new_data_part->loadProjections(false, false, noop, true /* if_not_loaded */);

         ctx->mutating_executor.reset();
         ctx->mutating_pipeline.reset();

From c8c4db5984bf9101478e0d1f33c3432c257ea7a0 Mon Sep 17 00:00:00 2001
From: kssenii
Date: Mon, 11 Dec 2023 13:24:31 +0100
Subject: [PATCH 020/245] Fix test

---
 src/Storages/MergeTree/IMergeTreeDataPart.cpp              | 7 +++++++
 .../queries/0_stateless/02916_broken_projection.reference  | 7 ++++---
 tests/queries/0_stateless/02916_broken_projection.sh       | 4 +++-
 3 files changed, 14 insertions(+), 4 deletions(-)

diff --git a/src/Storages/MergeTree/IMergeTreeDataPart.cpp b/src/Storages/MergeTree/IMergeTreeDataPart.cpp
index 5418bcd83f3..7af49edf788 100644
--- a/src/Storages/MergeTree/IMergeTreeDataPart.cpp
+++ b/src/Storages/MergeTree/IMergeTreeDataPart.cpp
@@ -780,6 +780,13 @@ void IMergeTreeDataPart::loadProjections(bool require_columns_checksums, bool ch
                 addProjectionPart(projection.name, std::move(part));
             }
         }
+        else if (checksums.has(path))
+        {
+            auto part = getProjectionPartBuilder(projection.name).withPartFormatFromDisk().build();
+            part->setBrokenReason("Projection directory " + path + " does not exist while loading projections", ErrorCodes::NO_FILE_IN_DATA_PART);
+            addProjectionPart(projection.name, std::move(part));
+            has_broken_projection = true;
+        }
     }
 }

diff --git a/tests/queries/0_stateless/02916_broken_projection.reference b/tests/queries/0_stateless/02916_broken_projection.reference
index b7764a6434e..358304de74a 100644
--- a/tests/queries/0_stateless/02916_broken_projection.reference
+++ b/tests/queries/0_stateless/02916_broken_projection.reference
@@ -406,7 +406,7 @@ SELECT d FROM test WHERE c == 12 OR c == 16 ORDER BY d; proj_2
 check table
 1
 0
-broke all data of part 'proj' (parent part: all_2_2_0)
+broke all data of part 'proj' (parent part: all_1_1_0)
 system.parts
 all_0_0_0 1 ['proj','proj_2']
 all_1_1_0 1 ['proj','proj_2']
@@ -421,13 +421,13 @@ SELECT d
FROM test WHERE c == 12 OR c == 16 ORDER BY d; proj_2 check table 0 broken projections info -all_2_2_0 proj FILE_DOESNT_EXIST +all_1_1_0 proj FILE_DOESNT_EXIST BACKUP_CREATED RESTORED system.parts all_0_0_0 1 ['proj','proj_2'] all_1_1_0 1 ['proj','proj_2'] -all_2_2_0 1 ['proj_2'] +all_2_2_0 1 ['proj','proj_2'] all_3_3_0 1 ['proj','proj_2'] select from projection 'proj' 12 @@ -442,3 +442,4 @@ SELECT d FROM test WHERE c == 12 OR c == 16 ORDER BY d; proj_2 check table 0 broken projections info +all_1_1_0 proj NO_FILE_IN_DATA_PART diff --git a/tests/queries/0_stateless/02916_broken_projection.sh b/tests/queries/0_stateless/02916_broken_projection.sh index 0910ba177fb..eb68f8621a2 100755 --- a/tests/queries/0_stateless/02916_broken_projection.sh +++ b/tests/queries/0_stateless/02916_broken_projection.sh @@ -410,6 +410,8 @@ function test2() check test2 } +CLICKHOUSE_DATABASE="default" +CLICKHOUSE_TEST_UNIQUE_NAME="test123456" function test3() { create_table test3 test 1 @@ -473,7 +475,7 @@ function test3() check test - break_projection test proj all_2_2_0 part + break_projection test proj all_1_1_0 part check test proj FILE_DOESNT_EXIST From cd41802d7e5b056e0114c8ad7523f00828ad5940 Mon Sep 17 00:00:00 2001 From: Kseniia Sumarokova <54203879+kssenii@users.noreply.github.com> Date: Mon, 11 Dec 2023 17:37:44 +0100 Subject: [PATCH 021/245] Update 02916_broken_projection.sh --- tests/queries/0_stateless/02916_broken_projection.sh | 2 -- 1 file changed, 2 deletions(-) diff --git a/tests/queries/0_stateless/02916_broken_projection.sh b/tests/queries/0_stateless/02916_broken_projection.sh index eb68f8621a2..a52570f3d52 100755 --- a/tests/queries/0_stateless/02916_broken_projection.sh +++ b/tests/queries/0_stateless/02916_broken_projection.sh @@ -410,8 +410,6 @@ function test2() check test2 } -CLICKHOUSE_DATABASE="default" -CLICKHOUSE_TEST_UNIQUE_NAME="test123456" function test3() { create_table test3 test 1 From 457032d2998a085fb9c10c0b9d536e79dbcc5dab Mon Sep 17 00:00:00 2001 From: kssenii Date: Mon, 11 Dec 2023 20:40:25 +0100 Subject: [PATCH 022/245] Disable fault injection because it breaks .reference --- tests/queries/0_stateless/02916_broken_projection.sh | 10 ++++++++-- 1 file changed, 8 insertions(+), 2 deletions(-) diff --git a/tests/queries/0_stateless/02916_broken_projection.sh b/tests/queries/0_stateless/02916_broken_projection.sh index a52570f3d52..2049610e45b 100755 --- a/tests/queries/0_stateless/02916_broken_projection.sh +++ b/tests/queries/0_stateless/02916_broken_projection.sh @@ -52,7 +52,7 @@ function insert() offset=$2 size=$3 echo 'insert new part' - $CLICKHOUSE_CLIENT -q "INSERT INTO $table SELECT number, number, number, number, number%2 FROM numbers($offset, $size);" + $CLICKHOUSE_CLIENT -q "INSERT INTO $table SELECT number, number, number, number, number%2 FROM numbers($offset, $size) SETTINGS insert_keeper_fault_injection_probability;" } function break_projection() @@ -431,11 +431,12 @@ function test3() broken_projections_info test ${CLICKHOUSE_CLIENT} -nm --query " - backup table ${CLICKHOUSE_DATABASE}.test to Disk('backups', '${CLICKHOUSE_TEST_UNIQUE_NAME}') settings check_projection_parts=false; + backup table ${CLICKHOUSE_DATABASE}.test to Disk('backups', '${CLICKHOUSE_TEST_UNIQUE_NAME}') settings check_projection_parts=false, backup_restore_keeper_fault_injection_probability=0.0; " | grep -o "BACKUP_CREATED" ${CLICKHOUSE_CLIENT} -nm --stacktrace --query " drop table test sync; + set backup_restore_keeper_fault_injection_probability=0.0; restore table 
${CLICKHOUSE_DATABASE}.test from Disk('backups', '${CLICKHOUSE_TEST_UNIQUE_NAME}'); " | grep -o "RESTORED" @@ -451,6 +452,7 @@ function test3() ${CLICKHOUSE_CLIENT} -nm --query " set send_logs_level='fatal'; + set backup_restore_keeper_fault_injection_probability=0.0; backup table ${CLICKHOUSE_DATABASE}.test to Disk('backups', '${CLICKHOUSE_TEST_UNIQUE_NAME}_2') " 2>&1 | grep -o "FILE_DOESNT_EXIST" @@ -462,12 +464,14 @@ function test3() ${CLICKHOUSE_CLIENT} -nm --query " set send_logs_level='fatal'; + set backup_restore_keeper_fault_injection_probability=0.0; backup table ${CLICKHOUSE_DATABASE}.test to Disk('backups', '${CLICKHOUSE_TEST_UNIQUE_NAME}_3') " | grep -o "BACKUP_CREATED" ${CLICKHOUSE_CLIENT} -nm --stacktrace --query " drop table test sync; set send_logs_level='fatal'; + set backup_restore_keeper_fault_injection_probability=0.0; restore table ${CLICKHOUSE_DATABASE}.test from Disk('backups', '${CLICKHOUSE_TEST_UNIQUE_NAME}_3'); " | grep -o "RESTORED" @@ -481,6 +485,7 @@ function test3() ${CLICKHOUSE_CLIENT} -nm --query " set send_logs_level='fatal'; + set backup_restore_keeper_fault_injection_probability=0.0; backup table ${CLICKHOUSE_DATABASE}.test to Disk('backups', '${CLICKHOUSE_TEST_UNIQUE_NAME}_4') settings check_projection_parts=false, allow_backup_broken_projections=true; " | grep -o "BACKUP_CREATED" @@ -488,6 +493,7 @@ function test3() ${CLICKHOUSE_CLIENT} -nm --stacktrace --query " drop table test sync; set send_logs_level='fatal'; + set backup_restore_keeper_fault_injection_probability=0.0; restore table ${CLICKHOUSE_DATABASE}.test from Disk('backups', '${CLICKHOUSE_TEST_UNIQUE_NAME}_4'); " | grep -o "RESTORED" From 8ef2638cfce90031213bbbd595a50d584406a916 Mon Sep 17 00:00:00 2001 From: Kseniia Sumarokova <54203879+kssenii@users.noreply.github.com> Date: Tue, 12 Dec 2023 14:22:14 +0100 Subject: [PATCH 023/245] Update 02916_broken_projection.sh --- tests/queries/0_stateless/02916_broken_projection.sh | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/tests/queries/0_stateless/02916_broken_projection.sh b/tests/queries/0_stateless/02916_broken_projection.sh index 2049610e45b..0418759eb26 100755 --- a/tests/queries/0_stateless/02916_broken_projection.sh +++ b/tests/queries/0_stateless/02916_broken_projection.sh @@ -52,7 +52,7 @@ function insert() offset=$2 size=$3 echo 'insert new part' - $CLICKHOUSE_CLIENT -q "INSERT INTO $table SELECT number, number, number, number, number%2 FROM numbers($offset, $size) SETTINGS insert_keeper_fault_injection_probability;" + $CLICKHOUSE_CLIENT -q "INSERT INTO $table SELECT number, number, number, number, number%2 FROM numbers($offset, $size) SETTINGS insert_keeper_fault_injection_probability=0.0;" } function break_projection() From d81edb4adf65c8c3724ec27fc83b65d5d1b3ebad Mon Sep 17 00:00:00 2001 From: Kseniia Sumarokova <54203879+kssenii@users.noreply.github.com> Date: Wed, 13 Dec 2023 12:29:28 +0100 Subject: [PATCH 024/245] Update 02916_broken_projection.sh --- tests/queries/0_stateless/02916_broken_projection.sh | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/tests/queries/0_stateless/02916_broken_projection.sh b/tests/queries/0_stateless/02916_broken_projection.sh index 0418759eb26..07495c45214 100755 --- a/tests/queries/0_stateless/02916_broken_projection.sh +++ b/tests/queries/0_stateless/02916_broken_projection.sh @@ -431,7 +431,8 @@ function test3() broken_projections_info test ${CLICKHOUSE_CLIENT} -nm --query " - backup table ${CLICKHOUSE_DATABASE}.test to Disk('backups', 
'${CLICKHOUSE_TEST_UNIQUE_NAME}') settings check_projection_parts=false, backup_restore_keeper_fault_injection_probability=0.0; + set backup_restore_keeper_fault_injection_probability=0.0; + backup table ${CLICKHOUSE_DATABASE}.test to Disk('backups', '${CLICKHOUSE_TEST_UNIQUE_NAME}') settings check_projection_parts=false; " | grep -o "BACKUP_CREATED" ${CLICKHOUSE_CLIENT} -nm --stacktrace --query " From 79432255df02f696962858347c2207dbdbf2b69f Mon Sep 17 00:00:00 2001 From: Kseniia Sumarokova <54203879+kssenii@users.noreply.github.com> Date: Mon, 18 Dec 2023 12:45:24 +0100 Subject: [PATCH 025/245] Update 02916_broken_projection.sh --- tests/queries/0_stateless/02916_broken_projection.sh | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/tests/queries/0_stateless/02916_broken_projection.sh b/tests/queries/0_stateless/02916_broken_projection.sh index 07495c45214..55e613b8f3a 100755 --- a/tests/queries/0_stateless/02916_broken_projection.sh +++ b/tests/queries/0_stateless/02916_broken_projection.sh @@ -1,5 +1,5 @@ #!/usr/bin/env bash -# Tags: long, no-random-merge-tree-settings +# Tags: long, no-random-merge-tree-settings, no-random-settings # shellcheck disable=SC2046 CURDIR=$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd) From 319ae440b6ba09b1dc21b355fab22a99d073592c Mon Sep 17 00:00:00 2001 From: avogar Date: Tue, 19 Dec 2023 16:43:30 +0000 Subject: [PATCH 026/245] Implement Variant data type --- docs/en/operations/settings/settings.md | 52 + docs/en/sql-reference/data-types/variant.md | 217 ++ .../functions/other-functions.md | 36 + src/Columns/ColumnNullable.cpp | 22 +- src/Columns/ColumnNullable.h | 1 + src/Columns/ColumnVariant.cpp | 1360 +++++++++ src/Columns/ColumnVariant.h | 306 ++ src/Columns/IColumn.cpp | 6 + src/Columns/IColumn.h | 14 + src/Columns/MaskOperations.cpp | 6 +- src/Columns/MaskOperations.h | 2 +- src/Columns/tests/gtest_column_variant.cpp | 688 +++++ src/Core/Settings.h | 2 + src/Core/TypeId.h | 1 + src/DataTypes/DataTypeFactory.cpp | 1 + src/DataTypes/DataTypeFactory.h | 1 + src/DataTypes/DataTypeNullable.cpp | 28 + src/DataTypes/DataTypeNullable.h | 3 + src/DataTypes/DataTypeTuple.cpp | 9 +- src/DataTypes/DataTypeVariant.cpp | 197 ++ src/DataTypes/DataTypeVariant.h | 64 + src/DataTypes/EnumValues.cpp | 21 + src/DataTypes/EnumValues.h | 8 +- src/DataTypes/IDataType.cpp | 23 +- src/DataTypes/IDataType.h | 3 + .../Serializations/ISerialization.cpp | 61 +- src/DataTypes/Serializations/ISerialization.h | 13 + .../Serializations/SerializationArray.cpp | 138 +- .../Serializations/SerializationArray.h | 3 + .../Serializations/SerializationBool.cpp | 179 +- .../Serializations/SerializationBool.h | 8 +- .../SerializationCustomSimpleText.cpp | 56 + .../SerializationCustomSimpleText.h | 6 + .../Serializations/SerializationDate.cpp | 46 + .../Serializations/SerializationDate.h | 5 + .../Serializations/SerializationDate32.cpp | 45 + .../Serializations/SerializationDate32.h | 5 + .../Serializations/SerializationDateTime.cpp | 157 +- .../Serializations/SerializationDateTime.h | 5 + .../SerializationDateTime64.cpp | 112 + .../Serializations/SerializationDateTime64.h | 6 + .../Serializations/SerializationDecimal.cpp | 46 +- .../Serializations/SerializationDecimal.h | 6 +- .../Serializations/SerializationEnum.cpp | 97 + .../Serializations/SerializationEnum.h | 13 + .../SerializationFixedString.cpp | 56 + .../Serializations/SerializationFixedString.h | 6 + .../SerializationIPv4andIPv6.cpp | 188 ++ .../Serializations/SerializationIPv4andIPv6.h | 129 +- 
.../SerializationLowCardinality.cpp | 47 +- .../SerializationLowCardinality.h | 12 + .../Serializations/SerializationMap.cpp | 108 +- .../Serializations/SerializationMap.h | 7 +- .../Serializations/SerializationNamed.cpp | 1 + .../Serializations/SerializationNothing.h | 1 + .../Serializations/SerializationNullable.cpp | 532 +++- .../Serializations/SerializationNullable.h | 53 +- .../Serializations/SerializationNumber.cpp | 80 +- .../Serializations/SerializationNumber.h | 3 + .../Serializations/SerializationString.cpp | 101 +- .../Serializations/SerializationString.h | 5 + .../Serializations/SerializationTuple.cpp | 318 ++- .../Serializations/SerializationTuple.h | 12 + .../Serializations/SerializationUUID.cpp | 41 +- .../Serializations/SerializationUUID.h | 6 +- .../Serializations/SerializationVariant.cpp | 828 ++++++ .../Serializations/SerializationVariant.h | 116 + .../SerializationVariantElement.cpp | 241 ++ .../SerializationVariantElement.h | 87 + .../Serializations/SerializationWrapper.cpp | 25 + .../Serializations/SerializationWrapper.h | 5 + .../Serializations/SimpleTextSerialization.h | 38 + src/DataTypes/Utils.cpp | 1 + src/Databases/DatabaseReplicated.cpp | 1 + src/Formats/EscapingRuleUtils.cpp | 10 +- src/Formats/JSONUtils.cpp | 4 +- src/Formats/SchemaInferenceUtils.cpp | 2 +- src/Functions/FunctionsConversion.h | 262 +- src/Functions/if.cpp | 50 +- src/Functions/isNotNull.cpp | 13 + src/Functions/isNull.cpp | 13 + src/Functions/multiIf.cpp | 10 + src/Functions/variantElement.cpp | 238 ++ src/IO/ReadHelpers.cpp | 298 +- src/IO/ReadHelpers.h | 198 +- src/IO/readDecimalText.h | 20 + src/Interpreters/InterpreterCreateQuery.cpp | 14 + src/Interpreters/InterpreterInsertQuery.cpp | 2 +- src/Interpreters/inplaceBlockConversions.cpp | 35 +- .../parseColumnsListForTableFunction.cpp | 11 + .../parseColumnsListForTableFunction.h | 2 + src/Parsers/ExpressionElementParsers.cpp | 2 +- .../Formats/Impl/CSVRowInputFormat.cpp | 2 +- .../Formats/Impl/MySQLDumpRowInputFormat.cpp | 2 +- .../Formats/Impl/TSKVRowInputFormat.cpp | 2 +- .../Impl/TabSeparatedRowInputFormat.cpp | 4 +- .../Formats/Impl/ValuesBlockInputFormat.cpp | 2 +- ...940_variant_text_deserialization.reference | 516 ++++ .../02940_variant_text_deserialization.sql | 266 ++ .../02941_variant_type_1.reference | 2472 +++++++++++++++++ .../0_stateless/02941_variant_type_1.sh | 124 + .../02941_variant_type_2.reference | 51 + .../0_stateless/02941_variant_type_2.sh | 71 + .../02941_variant_type_3.reference | 51 + .../0_stateless/02941_variant_type_3.sh | 71 + .../02941_variant_type_4.reference | 56 + .../0_stateless/02941_variant_type_4.sh | 66 + .../0_stateless/02942_variant_cast.reference | 25 + .../0_stateless/02942_variant_cast.sql | 23 + .../02943_variant_element.reference | 44 + .../0_stateless/02943_variant_element.sql | 16 + ...44_variant_as_if_multi_if_result.reference | 96 + .../02944_variant_as_if_multi_if_result.sql | 64 + 113 files changed, 11750 insertions(+), 584 deletions(-) create mode 100644 docs/en/sql-reference/data-types/variant.md create mode 100644 src/Columns/ColumnVariant.cpp create mode 100644 src/Columns/ColumnVariant.h create mode 100644 src/Columns/tests/gtest_column_variant.cpp create mode 100644 src/DataTypes/DataTypeVariant.cpp create mode 100644 src/DataTypes/DataTypeVariant.h create mode 100644 src/DataTypes/Serializations/SerializationIPv4andIPv6.cpp create mode 100644 src/DataTypes/Serializations/SerializationVariant.cpp create mode 100644 src/DataTypes/Serializations/SerializationVariant.h create mode 
100644 src/DataTypes/Serializations/SerializationVariantElement.cpp
 create mode 100644 src/DataTypes/Serializations/SerializationVariantElement.h
 create mode 100644 src/Functions/variantElement.cpp
 create mode 100644 tests/queries/0_stateless/02940_variant_text_deserialization.reference
 create mode 100644 tests/queries/0_stateless/02940_variant_text_deserialization.sql
 create mode 100644 tests/queries/0_stateless/02941_variant_type_1.reference
 create mode 100755 tests/queries/0_stateless/02941_variant_type_1.sh
 create mode 100644 tests/queries/0_stateless/02941_variant_type_2.reference
 create mode 100755 tests/queries/0_stateless/02941_variant_type_2.sh
 create mode 100644 tests/queries/0_stateless/02941_variant_type_3.reference
 create mode 100755 tests/queries/0_stateless/02941_variant_type_3.sh
 create mode 100644 tests/queries/0_stateless/02941_variant_type_4.reference
 create mode 100755 tests/queries/0_stateless/02941_variant_type_4.sh
 create mode 100644 tests/queries/0_stateless/02942_variant_cast.reference
 create mode 100644 tests/queries/0_stateless/02942_variant_cast.sql
 create mode 100644 tests/queries/0_stateless/02943_variant_element.reference
 create mode 100644 tests/queries/0_stateless/02943_variant_element.sql
 create mode 100644 tests/queries/0_stateless/02944_variant_as_if_multi_if_result.reference
 create mode 100644 tests/queries/0_stateless/02944_variant_as_if_multi_if_result.sql

diff --git a/docs/en/operations/settings/settings.md b/docs/en/operations/settings/settings.md
index dc46a3f0dcd..dbf5bc341cc 100644
--- a/docs/en/operations/settings/settings.md
+++ b/docs/en/operations/settings/settings.md
@@ -5134,3 +5134,55 @@ When set to `true` than for all s3 requests first two attempts are made with low
 send and receive timeouts.
 When set to `false` than all attempts are made with identical timeouts.
 
 Default value: `true`.
+
+## allow_experimental_variant_type {#allow_experimental_variant_type}
+
+Allows creation of the experimental [Variant](../../sql-reference/data-types/variant.md) data type.
+
+Default value: `false`.
+
+## use_variant_when_no_common_type_in_if {#use_variant_when_no_common_type_in_if}
+
+Allows using the `Variant` type as a result type for [if](../../sql-reference/functions/conditional-functions.md/#if)/[multiIf](../../sql-reference/functions/conditional-functions.md/#multiif) functions when there is no common type for the argument types.
+
+Example:
+
+```sql
+SET use_variant_when_no_common_type_in_if = 1;
+SELECT toTypeName(if(number % 2, number, range(number))) as variant_type FROM numbers(1);
+SELECT if(number % 2, number, range(number)) as variant FROM numbers(5);
+```
+
+```text
+┌─variant_type───────────────────┐
+│ Variant(Array(UInt64), UInt64) │
+└────────────────────────────────┘
+┌─variant───┐
+│ [] │
+│ 1 │
+│ [0,1] │
+│ 3 │
+│ [0,1,2,3] │
+└───────────┘
+```
+
+```sql
+SET use_variant_when_no_common_type_in_if = 1;
+SELECT toTypeName(multiIf((number % 4) = 0, 42, (number % 4) = 1, [1, 2, 3], (number % 4) = 2, 'Hello, World!', NULL)) AS variant_type FROM numbers(1);
+SELECT multiIf((number % 4) = 0, 42, (number % 4) = 1, [1, 2, 3], (number % 4) = 2, 'Hello, World!', NULL) AS variant FROM numbers(4);
+```
+
+```text
+┌─variant_type─────────────────────────┐
+│ Variant(Array(UInt8), String, UInt8) │
+└──────────────────────────────────────┘
+
+┌─variant───────┐
+│ 42 │
+│ [1,2,3] │
+│ Hello, World! │
+│ ᴺᵁᴸᴸ │
+└───────────────┘
+```
+
+Default value: `false`.
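+
+A minimal end-to-end sketch combining the two settings above (illustrative only; the table name `t` is arbitrary):
+
+```sql
+SET allow_experimental_variant_type = 1, use_variant_when_no_common_type_in_if = 1;
+CREATE TABLE t (v Variant(Array(UInt64), UInt64)) ENGINE = Memory;
+INSERT INTO t SELECT if(number % 2, number, range(number)) FROM numbers(3);
+SELECT v FROM t;
+```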
diff --git a/docs/en/sql-reference/data-types/variant.md b/docs/en/sql-reference/data-types/variant.md
new file mode 100644
index 00000000000..34966d79079
--- /dev/null
+++ b/docs/en/sql-reference/data-types/variant.md
@@ -0,0 +1,217 @@
+---
+slug: /en/sql-reference/data-types/variant
+sidebar_position: 55
+sidebar_label: Variant
+---
+
+# Variant(T1, T2, T3, ...)
+
+This type represents a union of other data types. Type `Variant(T1, T2, ..., TN)` means that each row of this type
+has a value of either type `T1` or `T2` or ... or `TN` or none of them (`NULL` value).
+
+The order of nested types doesn't matter: Variant(T1, T2) = Variant(T2, T1).
+Nested types can be arbitrary types except Nullable(...), LowCardinality(Nullable(...)) and Variant(...) types.
+
+:::note
+The Variant data type is an experimental feature. To use it, set `allow_experimental_variant_type = 1`.
+:::
+
+## Creating Variant
+
+Using the `Variant` type in a table column definition:
+
+```sql
+CREATE TABLE test (v Variant(UInt64, String, Array(UInt64))) ENGINE = Memory;
+INSERT INTO test VALUES (NULL), (42), ('Hello, World!'), ([1, 2, 3]);
+SELECT v FROM test;
+```
+
+```text
+┌─v─────────────┐
+│ ᴺᵁᴸᴸ │
+│ 42 │
+│ Hello, World! │
+│ [1,2,3] │
+└───────────────┘
+```
+
+Using CAST from an ordinary column:
+
+```sql
+SELECT toTypeName(variant) as type_name, 'Hello, World!'::Variant(UInt64, String, Array(UInt64)) as variant;
+```
+
+```text
+┌─type_name──────────────────────────────┬─variant───────┐
+│ Variant(Array(UInt64), String, UInt64) │ Hello, World! │
+└────────────────────────────────────────┴───────────────┘
+```
+
+Using the functions `if/multiIf` when the arguments don't have a common type (the setting `use_variant_when_no_common_type_in_if` must be enabled for this):
+
+```sql
+SET use_variant_when_no_common_type_in_if = 1;
+SELECT if(number % 2, number, range(number)) as variant FROM numbers(5);
+```
+
+```text
+┌─variant───┐
+│ [] │
+│ 1 │
+│ [0,1] │
+│ 3 │
+│ [0,1,2,3] │
+└───────────┘
+```
+
+```sql
+SET use_variant_when_no_common_type_in_if = 1;
+SELECT multiIf((number % 4) = 0, 42, (number % 4) = 1, [1, 2, 3], (number % 4) = 2, 'Hello, World!', NULL) AS variant FROM numbers(4);
+```
+
+```text
+┌─variant───────┐
+│ 42 │
+│ [1,2,3] │
+│ Hello, World! │
+│ ᴺᵁᴸᴸ │
+└───────────────┘
+```
+
+## Reading Variant nested types as subcolumns
+
+The Variant type supports reading a single nested type from a Variant column using the type name as a subcolumn.
+So, if you have a column `variant Variant(T1, T2, T3)`, you can read a subcolumn of type `T2` using the syntax `variant.T2`.
+This subcolumn will have type `Nullable(T2)` if `T2` can be inside `Nullable`, and `T2` otherwise. It will
+be the same size as the original `Variant` column and will contain `NULL` values (or empty values if `T2` cannot be inside `Nullable`)
+in all rows in which the original `Variant` column doesn't have type `T2`.
+
+Variant subcolumns can also be read using the function `variantElement(variant_column, type_name)`.
+
+Examples:
+
+```sql
+CREATE TABLE test (v Variant(UInt64, String, Array(UInt64))) ENGINE = Memory;
+INSERT INTO test VALUES (NULL), (42), ('Hello, World!'), ([1, 2, 3]);
+SELECT v, v.String, v.UInt64, v.`Array(UInt64)` FROM test;
+```
+
+```text
+┌─v─────────────┬─v.String──────┬─v.UInt64─┬─v.Array(UInt64)─┐
+│ ᴺᵁᴸᴸ │ ᴺᵁᴸᴸ │ ᴺᵁᴸᴸ │ [] │
+│ 42 │ ᴺᵁᴸᴸ │ 42 │ [] │
+│ Hello, World! │ Hello, World! │ ᴺᵁᴸᴸ │ [] │
+│ [1,2,3] │ ᴺᵁᴸᴸ │ ᴺᵁᴸᴸ │ [1,2,3] │
+└───────────────┴───────────────┴──────────┴─────────────────┘
+```
+
+```sql
+SELECT toTypeName(v.String), toTypeName(v.UInt64), toTypeName(v.`Array(UInt64)`) FROM test LIMIT 1;
+```
+
+```text
+┌─toTypeName(v.String)─┬─toTypeName(v.UInt64)─┬─toTypeName(v.Array(UInt64))─┐
+│ Nullable(String) │ Nullable(UInt64) │ Array(UInt64) │
+└──────────────────────┴──────────────────────┴─────────────────────────────┘
+```
+
+```sql
+SELECT v, variantElement(v, 'String'), variantElement(v, 'UInt64'), variantElement(v, 'Array(UInt64)') FROM test;
+```
+
+```text
+┌─v─────────────┬─variantElement(v, 'String')─┬─variantElement(v, 'UInt64')─┬─variantElement(v, 'Array(UInt64)')─┐
+│ ᴺᵁᴸᴸ │ ᴺᵁᴸᴸ │ ᴺᵁᴸᴸ │ [] │
+│ 42 │ ᴺᵁᴸᴸ │ 42 │ [] │
+│ Hello, World! │ Hello, World! │ ᴺᵁᴸᴸ │ [] │
+│ [1,2,3] │ ᴺᵁᴸᴸ │ ᴺᵁᴸᴸ │ [1,2,3] │
+└───────────────┴─────────────────────────────┴─────────────────────────────┴────────────────────────────────────┘
+```
+
+## Conversion between Variant column and other columns
+
+There are 3 possible conversions that can be performed with a Variant column.
+
+### Converting an ordinary column to a Variant column
+
+It is possible to convert an ordinary column with type `T` to a `Variant` column containing this type:
+
+```sql
+SELECT toTypeName(variant) as type_name, 'Hello, World!'::Variant(UInt64, String, Array(UInt64)) as variant;
+```
+
+```text
+┌─type_name──────────────────────────────┬─variant───────┐
+│ Variant(Array(UInt64), String, UInt64) │ Hello, World! │
+└────────────────────────────────────────┴───────────────┘
+```
+
+### Converting a Variant column to an ordinary column
+
+It is possible to convert a `Variant` column to an ordinary column. In this case all nested variants are converted to the destination type:
+
+```sql
+CREATE TABLE test (v Variant(UInt64, String)) ENGINE = Memory;
+INSERT INTO test VALUES (NULL), (42), ('42.42');
+SELECT v::Nullable(Float64) FROM test;
+```
+
+```text
+┌─CAST(v, 'Nullable(Float64)')─┐
+│ ᴺᵁᴸᴸ │
+│ 42 │
+│ 42.42 │
+└──────────────────────────────┘
+```
+
+### Converting a Variant to another Variant
+
+It is possible to convert a `Variant` column to another `Variant` column, but only if the destination `Variant` column contains all nested types from the original `Variant`:
+
+```sql
+CREATE TABLE test (v Variant(UInt64, String)) ENGINE = Memory;
+INSERT INTO test VALUES (NULL), (42), ('String');
+SELECT v::Variant(UInt64, String, Array(UInt64)) FROM test;
+```
+
+```text
+┌─CAST(v, 'Variant(UInt64, String, Array(UInt64))')─┐
+│ ᴺᵁᴸᴸ │
+│ 42 │
+│ String │
+└───────────────────────────────────────────────────┘
+```
+
+
+## Reading Variant type from the data
+
+All text formats (TSV, CSV, CustomSeparated, Values, JSONEachRow, etc.) support reading the `Variant` type. During data parsing, ClickHouse tries to insert the value into the most appropriate variant type.
+
+Example:
+
+```sql
+SELECT
+    v,
+    variantElement(v, 'String') AS str,
+    variantElement(v, 'UInt64') AS num,
+    variantElement(v, 'Float64') AS float,
+    variantElement(v, 'DateTime') AS date,
+    variantElement(v, 'Array(UInt64)') AS arr
+FROM format(JSONEachRow, 'v Variant(String, UInt64, Float64, DateTime, Array(UInt64))', $$
+{"v" : "Hello, World!"},
+{"v" : 42},
+{"v" : 42.42},
+{"v" : "2020-01-01 00:00:00"},
+{"v" : [1, 2, 3]}
+$$)
+```
+
+```text
+┌─v───────────────────┬─str───────────┬──num─┬─float─┬────────────────date─┬─arr─────┐
+│ Hello, World! │ Hello, World! │ ᴺᵁᴸᴸ │ ᴺᵁᴸᴸ │ ᴺᵁᴸᴸ │ [] │
+│ 42 │ ᴺᵁᴸᴸ │ 42 │ ᴺᵁᴸᴸ │ ᴺᵁᴸᴸ │ [] │
+│ 42.42 │ ᴺᵁᴸᴸ │ ᴺᵁᴸᴸ │ 42.42 │ ᴺᵁᴸᴸ │ [] │
+│ 2020-01-01 00:00:00 │ ᴺᵁᴸᴸ │ ᴺᵁᴸᴸ │ ᴺᵁᴸᴸ │ 2020-01-01 00:00:00 │ [] │
+│ [1,2,3] │ ᴺᵁᴸᴸ │ ᴺᵁᴸᴸ │ ᴺᵁᴸᴸ │ ᴺᵁᴸᴸ │ [1,2,3] │
+└─────────────────────┴───────────────┴──────┴───────┴─────────────────────┴─────────┘
+```
diff --git a/docs/en/sql-reference/functions/other-functions.md b/docs/en/sql-reference/functions/other-functions.md
index 5b9d01985dd..47b5ac7b724 100644
--- a/docs/en/sql-reference/functions/other-functions.md
+++ b/docs/en/sql-reference/functions/other-functions.md
@@ -2831,3 +2831,39 @@ Result:
 │ SELECT a, b FROM tab WHERE (a > 3) AND (b < 3) │
 └─────────────────────────────────────────────────────────────────────────┘
 ```
+
+## variantElement
+
+Extracts a column with the specified type from a `Variant` column.
+
+**Syntax**
+
+``` sql
+variantElement(variant, type_name[, default_value])
+```
+
+- `variant` — Variant column. [Variant](../../sql-reference/data-types/variant.md).
+- `type_name` — The name of the variant type to extract. [String](../../sql-reference/data-types/string.md).
+- `default_value` — The default value that will be used if the variant doesn't contain a variant with the specified type. Can be any type. Optional.
+
+**Returned value**
+
+- Subcolumn of a `Variant` column with the specified type.
+
+**Example**
+
+```sql
+CREATE TABLE test (v Variant(UInt64, String, Array(UInt64))) ENGINE = Memory;
+INSERT INTO test VALUES (NULL), (42), ('Hello, World!'), ([1, 2, 3]);
+SELECT v, variantElement(v, 'String'), variantElement(v, 'UInt64'), variantElement(v, 'Array(UInt64)') FROM test;
+```
+
+```text
+┌─v─────────────┬─variantElement(v, 'String')─┬─variantElement(v, 'UInt64')─┬─variantElement(v, 'Array(UInt64)')─┐
+│ ᴺᵁᴸᴸ │ ᴺᵁᴸᴸ │ ᴺᵁᴸᴸ │ [] │
+│ 42 │ ᴺᵁᴸᴸ │ 42 │ [] │
+│ Hello, World! │ Hello, World!
│ ᴺᵁᴸᴸ │ [] │
+│ [1,2,3] │ ᴺᵁᴸᴸ │ ᴺᵁᴸᴸ │ [1,2,3] │
+└───────────────┴─────────────────────────────┴─────────────────────────────┴────────────────────────────────────┘
+```
+
diff --git a/src/Columns/ColumnNullable.cpp b/src/Columns/ColumnNullable.cpp
index 4ee6bb3d586..d2a579d6800 100644
--- a/src/Columns/ColumnNullable.cpp
+++ b/src/Columns/ColumnNullable.cpp
@@ -890,10 +890,7 @@ ColumnPtr makeNullable(const ColumnPtr & column)
 
 ColumnPtr makeNullableOrLowCardinalityNullable(const ColumnPtr & column)
 {
-    if (isColumnNullable(*column))
-        return column;
-
-    if (isColumnLowCardinalityNullable(*column))
+    if (isColumnNullableOrLowCardinalityNullable(*column))
         return column;
 
     if (isColumnConst(*column))
@@ -919,4 +916,21 @@ ColumnPtr makeNullableSafe(const ColumnPtr & column)
     return column;
 }
 
+ColumnPtr makeNullableOrLowCardinalityNullableSafe(const ColumnPtr & column)
+{
+    if (isColumnNullableOrLowCardinalityNullable(*column))
+        return column;
+
+    if (isColumnConst(*column))
+        return ColumnConst::create(makeNullableOrLowCardinalityNullableSafe(assert_cast<const ColumnConst &>(*column).getDataColumnPtr()), column->size());
+
+    if (column->lowCardinality())
+        return assert_cast<const ColumnLowCardinality &>(*column).cloneNullable();
+
+    if (column->canBeInsideNullable())
+        return makeNullableSafe(column);
+
+    return column;
+}
+
 }
diff --git a/src/Columns/ColumnNullable.h b/src/Columns/ColumnNullable.h
index b57fdf3064d..60c7750f8fc 100644
--- a/src/Columns/ColumnNullable.h
+++ b/src/Columns/ColumnNullable.h
@@ -231,5 +231,6 @@ private:
 ColumnPtr makeNullable(const ColumnPtr & column);
 ColumnPtr makeNullableSafe(const ColumnPtr & column);
 ColumnPtr makeNullableOrLowCardinalityNullable(const ColumnPtr & column);
+ColumnPtr makeNullableOrLowCardinalityNullableSafe(const ColumnPtr & column);
 
 }
diff --git a/src/Columns/ColumnVariant.cpp b/src/Columns/ColumnVariant.cpp
new file mode 100644
index 00000000000..67754e77992
--- /dev/null
+++ b/src/Columns/ColumnVariant.cpp
@@ -0,0 +1,1360 @@
+#include
+
+#include
+#include
+#include
+#include
+#include
+#include
+#include
+#include
+#include
+#include
+#include
+#include
+#include
+
+
+namespace DB
+{
+
+namespace ErrorCodes
+{
+    extern const int BAD_ARGUMENTS;
+    extern const int ILLEGAL_COLUMN;
+    extern const int NOT_IMPLEMENTED;
+    extern const int LOGICAL_ERROR;
+    extern const int PARAMETER_OUT_OF_BOUND;
+    extern const int SIZES_OF_NESTED_COLUMNS_ARE_INCONSISTENT;
+    extern const int SIZES_OF_COLUMNS_DOESNT_MATCH;
+}
+
+std::string ColumnVariant::getName() const
+{
+    WriteBufferFromOwnString res;
+    res << "Variant(";
+    bool is_first = true;
+    for (const auto & local_variant : global_to_local_discriminators)
+    {
+        if (!is_first)
+            res << ", ";
+        is_first = false;
+        res << variants[local_variant]->getName();
+    }
+    res << ")";
+    return res.str();
+}
+
+
+void ColumnVariant::initIdentityGlobalToLocalDiscriminatorsMapping()
+{
+    local_to_global_discriminators.reserve(variants.size());
+    global_to_local_discriminators.reserve(variants.size());
+    for (size_t i = 0; i != variants.size(); ++i)
+    {
+        local_to_global_discriminators.push_back(i);
+        global_to_local_discriminators.push_back(i);
+    }
+}
+
+ColumnVariant::ColumnVariant(MutableColumns && variants_) : ColumnVariant(std::move(variants_), {})
+{
+}
+
+ColumnVariant::ColumnVariant(MutableColumns && variants_, const std::vector<Discriminator> & local_to_global_discriminators_)
+{
+    /// Empty local_to_global_discriminators mapping means that variants are already in the global order.
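+    /// (Global discriminators follow the canonical variant order fixed by the Variant data type itself,
+    /// while a particular column instance may store its nested columns in a different, local, order.)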
+    if (!local_to_global_discriminators_.empty() && local_to_global_discriminators_.size() != variants_.size())
+        throw Exception(
+            ErrorCodes::LOGICAL_ERROR,
+            "The number of values in local_to_global_discriminators mapping ({}) doesn't match the number of variants ({})",
+            local_to_global_discriminators_.size(),
+            variants_.size());
+
+    /// As variants are empty, column with local discriminators will be also empty and we can reorder variants according to global discriminators.
+    variants.resize(variants_.size());
+    for (size_t i = 0; i != variants_.size(); ++i)
+    {
+        if (isColumnConst(*variants_[i]))
+            throw Exception(ErrorCodes::ILLEGAL_COLUMN, "ColumnVariant cannot have ColumnConst as its element");
+
+        if (!variants_[i]->empty())
+            throw Exception(ErrorCodes::LOGICAL_ERROR, "Not empty column passed to ColumnVariant, but no local_discriminators passed");
+
+        if (!local_to_global_discriminators_.empty() && local_to_global_discriminators_[i] > variants_.size())
+            throw Exception(ErrorCodes::LOGICAL_ERROR, "Invalid global discriminator {}. The number of variants: {}", UInt64(local_to_global_discriminators_[i]), variants_.size());
+
+        if (local_to_global_discriminators_.empty())
+            variants[i] = std::move(variants_[i]);
+        else
+            variants[local_to_global_discriminators_[i]] = std::move(variants_[i]);
+    }
+
+    local_discriminators = ColumnDiscriminators::create();
+    offsets = ColumnOffsets::create();
+
+    /// Now global and local discriminators are the same.
+    initIdentityGlobalToLocalDiscriminatorsMapping();
+}
+
+ColumnVariant::ColumnVariant(MutableColumnPtr local_discriminators_, MutableColumns && variants_) : ColumnVariant(std::move(local_discriminators_), nullptr, std::move(variants_), {})
+{
+}
+
+ColumnVariant::ColumnVariant(MutableColumnPtr local_discriminators_, MutableColumns && variants_, const std::vector<Discriminator> & global_discriminators) : ColumnVariant(std::move(local_discriminators_), nullptr, std::move(variants_), global_discriminators)
+{
+}
+
+ColumnVariant::ColumnVariant(DB::MutableColumnPtr local_discriminators_, DB::MutableColumnPtr offsets_, DB::MutableColumns && variants_) : ColumnVariant(std::move(local_discriminators_), std::move(offsets_), std::move(variants_), {})
+{
+}
+
+ColumnVariant::ColumnVariant(DB::MutableColumnPtr local_discriminators_, DB::MutableColumnPtr offsets_, DB::MutableColumns && variants_, const std::vector<Discriminator> & local_to_global_discriminators_)
+{
+    if (variants_.size() > MAX_NESTED_COLUMNS)
+        throw Exception(ErrorCodes::BAD_ARGUMENTS, "Variant type with more than {} nested types is not allowed", ColumnVariant::MAX_NESTED_COLUMNS);
+
+    local_discriminators = std::move(local_discriminators_);
+    const ColumnDiscriminators * discriminators_concrete = typeid_cast<const ColumnDiscriminators *>(local_discriminators.get());
+    if (!discriminators_concrete)
+        throw Exception(ErrorCodes::LOGICAL_ERROR, "discriminator column must be a ColumnUInt8");
+
+    variants.reserve(variants_.size());
+    size_t total_size = 0;
+    for (auto & variant : variants_)
+    {
+        if (isColumnConst(*variant))
+            throw Exception(ErrorCodes::ILLEGAL_COLUMN, "ColumnVariant cannot have ColumnConst as its element");
+
+        total_size += variant->size();
+        variants.push_back(std::move(variant));
+    }
+
+    /// We can have more discriminators than values in columns
+    /// (because of NULL discriminators), but not less.
+    if (total_size > local_discriminators->size())
+        throw Exception(ErrorCodes::LOGICAL_ERROR, "Nested columns sizes are inconsistent with local_discriminators column size.
Total column sizes: {}, local_discriminators size: {}", total_size, local_discriminators->size());
+
+    if (offsets_)
+    {
+        if (!typeid_cast<const ColumnOffsets *>(offsets_.get()))
+            throw Exception(ErrorCodes::LOGICAL_ERROR, "offsets column must be a ColumnUInt64");
+
+        offsets = std::move(offsets_);
+    }
+    else
+    {
+        /// If no offsets column was provided, construct offsets based on discriminators.
+        offsets = ColumnOffsets::create();
+        Offsets & offsets_data = typeid_cast<ColumnOffsets *>(offsets.get())->getData();
+        offsets_data.reserve(discriminators_concrete->size());
+        /// If we have only NULLs, offsets column will not contain any real offsets.
+        if (hasOnlyNulls())
+        {
+            offsets_data.resize(discriminators_concrete->size());
+        }
+        /// If we have only one non-empty variant and no NULLs,
+        /// offsets column will contain just sequential offsets 0, 1, 2, ...
+        else if (getLocalDiscriminatorOfOneNoneEmptyVariantNoNulls())
+        {
+            for (size_t i = 0; i != discriminators_concrete->size(); ++i)
+                offsets_data.push_back(i);
+        }
+        /// Otherwise we should iterate through discriminators and
+        /// remember current offset for each variant column.
+        else
+        {
+            std::vector<Offset> nested_offsets;
+            nested_offsets.resize(variants.size());
+            for (Discriminator discr : discriminators_concrete->getData())
+            {
+                if (discr == NULL_DISCRIMINATOR)
+                    offsets_data.emplace_back();
+                else
+                    offsets_data.push_back(nested_offsets[discr]++);
+            }
+        }
+    }
+
+    /// Empty global_discriminators means that variants are already in global order.
+    if (local_to_global_discriminators_.empty())
+    {
+        initIdentityGlobalToLocalDiscriminatorsMapping();
+    }
+    else
+    {
+        if (local_to_global_discriminators_.size() != variants.size())
+            throw Exception(
+                ErrorCodes::LOGICAL_ERROR,
+                "The number of values in local_to_global_discriminators mapping ({}) doesn't match the number of variants ({})",
+                local_to_global_discriminators_.size(),
+                variants.size());
+
+        local_to_global_discriminators = local_to_global_discriminators_;
+        global_to_local_discriminators.resize(local_to_global_discriminators.size());
+        /// Create mapping global discriminator -> local discriminator
+        for (size_t i = 0; i != local_to_global_discriminators.size(); ++i)
+        {
+            if (local_to_global_discriminators[i] > variants.size())
+                throw Exception(ErrorCodes::LOGICAL_ERROR, "Invalid global discriminator {}.
The number of variants: {}", UInt64(local_to_global_discriminators[i]), variants_.size());
+
+            global_to_local_discriminators[local_to_global_discriminators[i]] = i;
+        }
+    }
+}
+
+ColumnVariant::Ptr ColumnVariant::create(const Columns & variants, const std::vector<Discriminator> & local_to_global_discriminators)
+{
+    MutableColumns mutable_variants;
+    mutable_variants.reserve(variants.size());
+    for (const auto & variant : variants)
+    {
+        if (isColumnConst(*variant))
+            throw Exception(ErrorCodes::ILLEGAL_COLUMN, "ColumnVariant cannot have ColumnConst as its element");
+        mutable_variants.emplace_back(variant->assumeMutable());
+    }
+
+    return ColumnVariant::create(std::move(mutable_variants), local_to_global_discriminators);
+}
+
+ColumnVariant::Ptr ColumnVariant::create(const DB::ColumnPtr & local_discriminators, const DB::Columns & variants, const std::vector<Discriminator> & local_to_global_discriminators)
+{
+    MutableColumns mutable_variants;
+    mutable_variants.reserve(variants.size());
+    for (const auto & variant : variants)
+    {
+        if (isColumnConst(*variant))
+            throw Exception(ErrorCodes::ILLEGAL_COLUMN, "ColumnVariant cannot have ColumnConst as its element");
+        mutable_variants.emplace_back(variant->assumeMutable());
+    }
+
+    return ColumnVariant::create(local_discriminators->assumeMutable(), std::move(mutable_variants), local_to_global_discriminators);
+}
+
+ColumnVariant::Ptr ColumnVariant::create(const DB::ColumnPtr & local_discriminators, const DB::ColumnPtr & offsets, const DB::Columns & variants, const std::vector<Discriminator> & local_to_global_discriminators)
+{
+    MutableColumns mutable_variants;
+    mutable_variants.reserve(variants.size());
+    for (const auto & variant : variants)
+    {
+        if (isColumnConst(*variant))
+            throw Exception(ErrorCodes::ILLEGAL_COLUMN, "ColumnVariant cannot have ColumnConst as its element");
+        mutable_variants.emplace_back(variant->assumeMutable());
+    }
+
+    return ColumnVariant::create(local_discriminators->assumeMutable(), offsets->assumeMutable(), std::move(mutable_variants), local_to_global_discriminators);
+}
+
+MutableColumnPtr ColumnVariant::cloneEmpty() const
+{
+    MutableColumns new_variants;
+    new_variants.reserve(variants.size());
+    for (const auto & variant : variants)
+        new_variants.emplace_back(variant->cloneEmpty());
+
+    return ColumnVariant::create(std::move(new_variants), local_to_global_discriminators);
+}
+
+MutableColumnPtr ColumnVariant::cloneResized(size_t new_size) const
+{
+    if (new_size == 0)
+        return cloneEmpty();
+
+    const size_t num_variants = variants.size();
+    size_t size = local_discriminators->size();
+    /// If new size is bigger than the old one, just clone column and append default values.
+    if (new_size >= size)
+    {
+        MutableColumns new_variants;
+        new_variants.reserve(num_variants);
+        for (const auto & variant : variants)
+            new_variants.emplace_back(IColumn::mutate(variant));
+
+        auto res = ColumnVariant::create(IColumn::mutate(local_discriminators), IColumn::mutate(offsets), std::move(new_variants), local_to_global_discriminators);
+        res->insertManyDefaults(new_size - size);
+        return res;
+    }
+
+    /// If new size is less than current size, we should find the new size for all variants.
+
+    /// Optimization for case when we have only NULLs. In this case we should just resize discriminators and offsets.
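+    /// (In the only-NULLs case every variant is empty, so just the discriminators and offsets shrink.)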
+    if (hasOnlyNulls())
+    {
+        MutableColumns new_variants;
+        new_variants.reserve(num_variants);
+        for (const auto & variant : variants)
+            new_variants.emplace_back(IColumn::mutate(variant));
+
+        return ColumnVariant::create(local_discriminators->cloneResized(new_size), offsets->cloneResized(new_size), std::move(new_variants), local_to_global_discriminators);
+    }
+
+    /// Optimization for case when there is only 1 non-empty variant and no NULLs.
+    /// In this case we can simply call cloneResized on this single variant, discriminators and offsets.
+    if (auto non_empty_local_discr = getLocalDiscriminatorOfOneNoneEmptyVariantNoNulls())
+    {
+        MutableColumns new_variants;
+        new_variants.reserve(num_variants);
+        for (size_t i = 0; i != variants.size(); ++i)
+        {
+            if (i == *non_empty_local_discr)
+                new_variants.emplace_back(variants[i]->cloneResized(new_size));
+            else
+                new_variants.emplace_back(variants[i]->cloneEmpty());
+        }
+
+        return ColumnVariant::create(local_discriminators->cloneResized(new_size), offsets->cloneResized(new_size), std::move(new_variants), local_to_global_discriminators);
+    }
+
+    const auto & local_discriminators_data = getLocalDiscriminators();
+    const auto & offsets_data = getOffsets();
+
+    /// We can find all variants sizes by scanning all new_size local_discriminators and calculating
+    /// sizes for all new variants. This code is below and commented.
+
+//    std::vector<size_t> new_nested_sizes(num_variants, 0);
+//    for (size_t i = 0; i != new_size; ++i)
+//    {
+//        Discriminator discr = local_discriminators_data[i];
+//        if (discr != NULL_DISCRIMINATOR)
+//            ++new_nested_sizes[discr];
+//    }
+//
+//    MutableColumns new_variants;
+//    new_variants.reserve(num_variants);
+//    for (size_t i = 0; i != num_variants; ++i)
+//    {
+//        if (new_nested_sizes[i])
+//            new_variants.emplace_back(variants[i]->cloneResized(new_nested_sizes[i]));
+//        else
+//            new_variants.emplace_back(variants[i]->cloneEmpty());
+//    }
+//
+//    return ColumnVariant::create(local_discriminators->cloneResized(new_size), offsets->cloneResized(new_size), std::move(new_variants), local_to_global_discriminators);
+
+    /// But instead we are trying to optimize it using offsets column:
+    /// For all non-empty variants we are trying to find last occurrence of its discriminator in local_discriminators[:new_size] or
+    /// first occurrence in local_discriminators[new_size:]. The same row in offsets column will contain the desired size (or size - 1) of variant.
+    /// All empty variants will remain empty.
+    /// Not sure how good this optimization is, feel free to remove it and use simpler version above.
+
+    MutableColumns new_variants(num_variants);
+    std::unordered_set<Discriminator> seen_variants;
+    /// First, check which variants are empty. They will remain empty.
+    for (Discriminator i = 0; i != num_variants; ++i)
+    {
+        if (variants[i]->empty())
+        {
+            seen_variants.insert(i);
+            new_variants[i] = variants[i]->cloneEmpty();
+        }
+    }
+
+    /// Now, iterate through local discriminators using two pointers.
+    /// First will go from new_size - 1 to 0, second from new_size to size.
+    /// Finish when we find all variants or hit lower or upper bound.
+    ssize_t i = new_size - 1;
+    size_t j = new_size;
+    while (i != -1 && j != size)
+    {
+        Discriminator i_discr = local_discriminators_data[i];
+        if (i_discr != NULL_DISCRIMINATOR)
+        {
+            auto [_, inserted] = seen_variants.insert(i_discr);
+            /// If this is the first occurrence of this discriminator,
+            /// we can get new size for this variant.
+            if (inserted)
+            {
+                new_variants[i_discr] = variants[i_discr]->cloneResized(offsets_data[i] + 1);
+                if (seen_variants.size() == num_variants)
+                    break;
+            }
+        }
+
+        Discriminator j_discr = local_discriminators_data[j];
+        if (j_discr != NULL_DISCRIMINATOR)
+        {
+            auto [_, inserted] = seen_variants.insert(j_discr);
+            /// If this is the first occurrence of this discriminator,
+            /// we can get new size for this variant.
+            if (inserted)
+            {
+                new_variants[j_discr] = variants[j_discr]->cloneResized(offsets_data[j]);
+                if (seen_variants.size() == num_variants)
+                    break;
+            }
+        }
+
+        --i;
+        ++j;
+    }
+
+    /// We can finish in 3 cases:
+    /// 1) seen_variants.size() == num_variants - we found local_discriminators of all variants, nothing to do.
+    /// 2) i == -1 - we scanned all values in local_discriminators[:new_size]. Variants that were not found have no
+    ///    values in local_discriminators[:new_size], so they should be empty in the resized version.
+    /// 3) j == size - we scanned all values in local_discriminators[new_size:]. Variants that were not found have no
+    ///    values in local_discriminators[new_size:], so we should use the full variant in the resized version.
+    if (seen_variants.size() != num_variants)
+    {
+        for (size_t discr = 0; discr != num_variants; ++discr)
+        {
+            if (!seen_variants.contains(discr))
+            {
+                if (i == -1)
+                    new_variants[discr] = variants[discr]->cloneEmpty();
+                else
+                    new_variants[discr] = IColumn::mutate(variants[discr]);
+            }
+        }
+    }
+
+    return ColumnVariant::create(local_discriminators->cloneResized(new_size), offsets->cloneResized(new_size), std::move(new_variants), local_to_global_discriminators);
+}
+
+Field ColumnVariant::operator[](size_t n) const
+{
+    Discriminator discr = localDiscriminatorAt(n);
+    if (discr == NULL_DISCRIMINATOR)
+        return Null();
+    return (*variants[discr])[offsetAt(n)];
+}
+
+void ColumnVariant::get(size_t n, Field & res) const
+{
+    Discriminator discr = localDiscriminatorAt(n);
+    if (discr == NULL_DISCRIMINATOR)
+        res = Null();
+    else
+        variants[discr]->get(offsetAt(n), res);
+}
+
+bool ColumnVariant::isDefaultAt(size_t n) const
+{
+    return localDiscriminatorAt(n) == NULL_DISCRIMINATOR;
+}
+
+bool ColumnVariant::isNullAt(size_t n) const
+{
+    return localDiscriminatorAt(n) == NULL_DISCRIMINATOR;
+}
+
+StringRef ColumnVariant::getDataAt(size_t) const
+{
+    throw Exception(ErrorCodes::NOT_IMPLEMENTED, "Method getDataAt is not supported for {}", getName());
+}
+
+void ColumnVariant::insertData(const char *, size_t)
+{
+    throw Exception(ErrorCodes::NOT_IMPLEMENTED, "Method insertData is not supported for {}", getName());
+}
+
+void ColumnVariant::insert(const Field & field)
+{
+    if (field.isNull())
+        insertDefault();
+    else
+        throw Exception(ErrorCodes::NOT_IMPLEMENTED, "Cannot insert field {} to column {}", toString(field), getName());
+}
+
+void ColumnVariant::insertFrom(const IColumn & src_, size_t n)
+{
+    const ColumnVariant & src = assert_cast<const ColumnVariant &>(src_);
+
+    const size_t num_variants = variants.size();
+    if (src.variants.size() != num_variants)
+        throw Exception(ErrorCodes::ILLEGAL_COLUMN, "Cannot insert value of Variant type with different number of types");
+
+    /// Remember that src column can have different local variants order.
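+    /// Two columns of the same Variant type may order their nested columns differently, so the row's
+    /// discriminator is translated src-local -> global -> our-local before use.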
+    Discriminator global_discr = src.globalDiscriminatorAt(n);
+    Discriminator local_discr = localDiscriminatorByGlobal(global_discr);
+    getLocalDiscriminators().push_back(local_discr);
+    if (local_discr == NULL_DISCRIMINATOR)
+    {
+        getOffsets().emplace_back();
+    }
+    else
+    {
+        getOffsets().push_back(variants[local_discr]->size());
+        variants[local_discr]->insertFrom(src.getVariantByGlobalDiscriminator(global_discr), src.offsetAt(n));
+    }
+}
+
+void ColumnVariant::insertRangeFrom(const IColumn & src_, size_t start, size_t length)
+{
+    const size_t num_variants = variants.size();
+    const auto & src = assert_cast<const ColumnVariant &>(src_);
+    if (src.variants.size() != num_variants)
+        throw Exception(ErrorCodes::ILLEGAL_COLUMN, "Cannot insert value of Variant type with different number of types");
+
+    if (start + length > src.getLocalDiscriminators().size())
+        throw Exception(ErrorCodes::PARAMETER_OUT_OF_BOUND, "Parameter out of bound in ColumnVariant::insertRangeFrom method. "
+                        "[start({}) + length({}) > local_discriminators.size({})]", start, length, src.getLocalDiscriminators().size());
+
+    /// If src column contains only NULLs, just insert NULLs.
+    if (src.hasOnlyNulls())
+    {
+        insertManyDefaults(length);
+        return;
+    }
+
+    /// Optimization for case when there is only 1 non-empty variant and no NULLs in src column.
+    /// In this case we can simply call insertRangeFrom on this single variant.
+    if (auto non_empty_src_local_discr = src.getLocalDiscriminatorOfOneNoneEmptyVariantNoNulls())
+    {
+        auto local_discr = localDiscriminatorByGlobal(src.globalDiscriminatorByLocal(*non_empty_src_local_discr));
+        size_t offset = variants[local_discr]->size();
+        variants[local_discr]->insertRangeFrom(*src.variants[*non_empty_src_local_discr], start, length);
+        getLocalDiscriminators().resize_fill(local_discriminators->size() + length, local_discr);
+        auto & offsets_data = getOffsets();
+        offsets_data.reserve(offsets_data.size() + length);
+        for (size_t i = 0; i != length; ++i)
+            offsets_data.push_back(offset++);
+        return;
+    }
+
+    /// Iterate through src local_discriminators in range [start, start + length],
+    /// collect ranges we need to insert for all variants and update offsets.
+    /// nested_ranges[i].first - offset in src.variants[i]
+    /// nested_ranges[i].second - length in src.variants[i]
+    std::vector<std::pair<size_t, size_t>> nested_ranges(num_variants, {0, 0});
+    auto & offsets_data = getOffsets();
+    offsets_data.reserve(offsets_data.size() + length);
+    auto & local_discriminators_data = getLocalDiscriminators();
+    local_discriminators_data.reserve(local_discriminators_data.size() + length);
+    const auto & src_offsets_data = src.getOffsets();
+    const auto & src_local_discriminators_data = src.getLocalDiscriminators();
+    for (size_t i = start; i != start + length; ++i)
+    {
+        /// We insert from src.variants[src_local_discr] to variants[local_discr]
+        Discriminator src_local_discr = src_local_discriminators_data[i];
+        Discriminator local_discr = localDiscriminatorByGlobal(src.globalDiscriminatorByLocal(src_local_discr));
+        local_discriminators_data.push_back(local_discr);
+        if (local_discr == NULL_DISCRIMINATOR)
+        {
+            offsets_data.emplace_back();
+        }
+        else
+        {
+            /// If we see this discriminator for the first time, set its range start.
+            if (!nested_ranges[src_local_discr].second)
+                nested_ranges[src_local_discr].first = src_offsets_data[i];
+            /// Update offsets column with correct offset.
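+            /// The offset is the destination variant's current size plus the number of rows already
+            /// collected (but not yet inserted) for this variant from the source range.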
+            offsets_data.push_back(variants[local_discr]->size() + nested_ranges[src_local_discr].second);
+            ++nested_ranges[src_local_discr].second;
+        }
+    }
+
+    for (size_t src_local_discr = 0; src_local_discr != nested_ranges.size(); ++src_local_discr)
+    {
+        auto [nested_start, nested_length] = nested_ranges[src_local_discr];
+        auto local_discr = localDiscriminatorByGlobal(src.globalDiscriminatorByLocal(src_local_discr));
+        if (nested_length)
+            variants[local_discr]->insertRangeFrom(*src.variants[src_local_discr], nested_start, nested_length);
+    }
+}
+
+void ColumnVariant::insertManyFrom(const DB::IColumn & src_, size_t position, size_t length)
+{
+    const size_t num_variants = variants.size();
+    const auto & src = assert_cast<const ColumnVariant &>(src_);
+    if (src.variants.size() != num_variants)
+        throw Exception(ErrorCodes::ILLEGAL_COLUMN, "Cannot insert value of Variant type with different number of types");
+
+    /// Remember that src column can have different local variants order.
+    Discriminator src_local_discr = src.localDiscriminatorAt(position);
+    Discriminator local_discr = localDiscriminatorByGlobal(src.globalDiscriminatorByLocal(src_local_discr));
+    auto & local_discriminators_data = getLocalDiscriminators();
+    local_discriminators_data.resize_fill(local_discriminators_data.size() + length, local_discr);
+
+    auto & offsets_data = getOffsets();
+    if (local_discr == NULL_DISCRIMINATOR)
+    {
+        offsets_data.resize_fill(offsets_data.size() + length);
+    }
+    else
+    {
+        size_t prev_offset = variants[local_discr]->size();
+        offsets_data.reserve(offsets_data.size() + length);
+        for (size_t i = 0; i != length; ++i)
+            offsets_data.push_back(prev_offset + i);
+
+        variants[local_discr]->insertManyFrom(*src.variants[src_local_discr], src.offsetAt(position), length);
+    }
+}
+
+void ColumnVariant::insertDefault()
+{
+    getLocalDiscriminators().push_back(NULL_DISCRIMINATOR);
+    getOffsets().emplace_back();
+}
+
+void ColumnVariant::insertManyDefaults(size_t length)
+{
+    size_t size = local_discriminators->size();
+    getLocalDiscriminators().resize_fill(size + length, NULL_DISCRIMINATOR);
+    getOffsets().resize_fill(size + length);
+}
+
+void ColumnVariant::popBack(size_t n)
+{
+    /// If we have only NULLs, just pop back from local_discriminators and offsets.
+    if (hasOnlyNulls())
+    {
+        local_discriminators->popBack(n);
+        offsets->popBack(n);
+        return;
+    }
+
+    /// Optimization for case when there is only 1 non-empty variant and no NULLs.
+    /// In this case we can just popBack n elements from this variant.
+    if (auto non_empty_local_discr = getLocalDiscriminatorOfOneNoneEmptyVariantNoNulls())
+    {
+        variants[*non_empty_local_discr]->popBack(n);
+        local_discriminators->popBack(n);
+        offsets->popBack(n);
+        return;
+    }
+
+    /// Calculate how many rows we need to pop from each variant
+    auto & local_discriminators_data = getLocalDiscriminators();
+    size_t size = local_discriminators_data.size();
+    const size_t num_variants = variants.size();
+    std::vector<size_t> nested_n(num_variants, 0);
+    for (size_t i = 0; i != n; ++i)
+    {
+        Discriminator discr = local_discriminators_data[size - i - 1];
+        if (discr != NULL_DISCRIMINATOR)
+            ++nested_n[discr];
+    }
+
+    for (size_t i = 0; i != num_variants; ++i)
+    {
+        if (nested_n[i])
+            variants[i]->popBack(nested_n[i]);
+    }
+
+    local_discriminators->popBack(n);
+    offsets->popBack(n);
+}
+
+StringRef ColumnVariant::serializeValueIntoArena(size_t n, Arena & arena, char const *& begin, const UInt8 *) const
+{
+    /// During any serialization/deserialization we should always use global discriminators.
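+    /// Local discriminators are specific to this column instance, while global ones are fixed by the
+    /// Variant type, so serialized rows stay interpretable when read back into another column.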
+    Discriminator global_discr = globalDiscriminatorAt(n);
+    char * pos = arena.allocContinue(sizeof(global_discr), begin);
+    memcpy(pos, &global_discr, sizeof(global_discr));
+    StringRef res(pos, sizeof(global_discr));
+
+    if (global_discr == NULL_DISCRIMINATOR)
+        return res;
+
+    auto value_ref = variants[localDiscriminatorByGlobal(global_discr)]->serializeValueIntoArena(offsetAt(n), arena, begin);
+    res.data = value_ref.data - res.size;
+    res.size += value_ref.size;
+
+    return res;
+}
+
+const char * ColumnVariant::deserializeAndInsertFromArena(const char * pos)
+{
+    /// During any serialization/deserialization we should always use global discriminators.
+    Discriminator global_discr = unalignedLoad<Discriminator>(pos);
+    pos += sizeof(global_discr);
+    Discriminator local_discr = localDiscriminatorByGlobal(global_discr);
+    getLocalDiscriminators().push_back(local_discr);
+    if (local_discr == NULL_DISCRIMINATOR)
+    {
+        getOffsets().emplace_back();
+        return pos;
+    }
+
+    getOffsets().push_back(variants[local_discr]->size());
+    return variants[local_discr]->deserializeAndInsertFromArena(pos);
+}
+
+const char * ColumnVariant::skipSerializedInArena(const char * pos) const
+{
+    Discriminator global_discr = unalignedLoad<Discriminator>(pos);
+    pos += sizeof(global_discr);
+    if (global_discr == NULL_DISCRIMINATOR)
+        return pos;
+
+    return variants[localDiscriminatorByGlobal(global_discr)]->skipSerializedInArena(pos);
+}
+
+void ColumnVariant::updateHashWithValue(size_t n, SipHash & hash) const
+{
+    Discriminator global_discr = globalDiscriminatorAt(n);
+    hash.update(global_discr);
+    if (global_discr != NULL_DISCRIMINATOR)
+        variants[localDiscriminatorByGlobal(global_discr)]->updateHashWithValue(offsetAt(n), hash);
+}
+
+void ColumnVariant::updateWeakHash32(WeakHash32 & hash) const
+{
+    auto s = size();
+
+    if (hash.getData().size() != s)
+        throw Exception(ErrorCodes::LOGICAL_ERROR, "Size of WeakHash32 does not match size of column: "
+                        "column size is {}, hash size is {}", std::to_string(s), std::to_string(hash.getData().size()));
+
+    /// If we have only NULLs, keep hash unchanged.
+    if (hasOnlyNulls())
+        return;
+
+    /// Optimization for case when there is only 1 non-empty variant and no NULLs.
+    /// In this case we can just calculate weak hash for this variant.
+    if (auto non_empty_local_discr = getLocalDiscriminatorOfOneNoneEmptyVariantNoNulls())
+    {
+        variants[*non_empty_local_discr]->updateWeakHash32(hash);
+        return;
+    }
+
+    /// Calculate weak hash for all variants.
+    std::vector<WeakHash32> nested_hashes;
+    for (const auto & variant : variants)
+    {
+        WeakHash32 nested_hash(variant->size());
+        variant->updateWeakHash32(nested_hash);
+        nested_hashes.emplace_back(std::move(nested_hash));
+    }
+
+    /// For each row hash is a hash of corresponding row from corresponding variant.
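+    /// Rows with NULL_DISCRIMINATOR are skipped below, so their previous hash values stay unchanged.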
+ auto & hash_data = hash.getData(); + const auto & local_discriminators_data = getLocalDiscriminators(); + const auto & offsets_data = getOffsets(); + for (size_t i = 0; i != local_discriminators_data.size(); ++i) + { + Discriminator discr = local_discriminators_data[i]; + /// Update hash only for non-NULL values + if (discr != NULL_DISCRIMINATOR) + { + auto nested_hash = nested_hashes[local_discriminators_data[i]].getData()[offsets_data[i]]; + hash_data[i] = static_cast(hashCRC32(nested_hash, hash_data[i])); + } + } +} + +void ColumnVariant::updateHashFast(SipHash & hash) const +{ + local_discriminators->updateHashFast(hash); + for (const auto & variant : variants) + variant->updateHashFast(hash); +} + +ColumnPtr ColumnVariant::filter(const Filter & filt, ssize_t result_size_hint) const +{ + if (size() != filt.size()) + throw Exception(ErrorCodes::SIZES_OF_COLUMNS_DOESNT_MATCH, "Size of filter ({}) doesn't match size of column ({})", filt.size(), size()); + + /// If we have only NULLs, just filter local_discriminators column. + if (hasOnlyNulls()) + { + Columns new_variants(variants.begin(), variants.end()); + auto new_discriminators = local_discriminators->filter(filt, result_size_hint); + /// In case of all NULL values offsets doesn't contain any useful values, just resize it. + ColumnPtr new_offsets = offsets->cloneResized(new_discriminators->size()); + return ColumnVariant::create(new_discriminators, new_offsets, new_variants, local_to_global_discriminators); + } + + /// Optimization for case when there is only 1 non-empty variant and no NULLs. + /// In this case we can just filter this variant and resize discriminators/offsets. + if (auto non_empty_discr = getLocalDiscriminatorOfOneNoneEmptyVariantNoNulls()) + { + Columns new_variants(variants.begin(), variants.end()); + new_variants[*non_empty_discr] = variants[*non_empty_discr]->filter(filt, result_size_hint); + size_t new_size = new_variants[*non_empty_discr]->size(); + ColumnPtr new_discriminators = local_discriminators->cloneResized(new_size); + ColumnPtr new_offsets = offsets->cloneResized(new_size); + return ColumnVariant::create(new_discriminators, new_offsets, new_variants, local_to_global_discriminators); + } + + /// We should create filter for each variant + /// according to local_discriminators and given filter. + const size_t num_variants = variants.size(); + std::vector nested_filters(num_variants); + for (size_t i = 0; i != num_variants; ++i) + nested_filters[i].reserve(variants[i]->size()); + + /// As we will iterate through local_discriminators anyway, we can count + /// result size for each variant. + std::vector variant_result_size_hints(num_variants); + + const auto & local_discriminators_data = getLocalDiscriminators(); + for (size_t i = 0; i != local_discriminators_data.size(); ++i) + { + Discriminator discr = local_discriminators_data[i]; + if (discr != NULL_DISCRIMINATOR) + { + nested_filters[local_discriminators_data[i]].push_back(filt[i]); + variant_result_size_hints[local_discriminators_data[i]] += !!(filt[i]); + } + } + + Columns new_variants; + new_variants.reserve(num_variants); + for (size_t i = 0; i != num_variants; ++i) + { + /// It make sense to call filter() on variant only if the result size is not 0. + if (variant_result_size_hints[i]) + new_variants.emplace_back(variants[i]->filter(nested_filters[i], variant_result_size_hints[i])); + else + new_variants.emplace_back(variants[i]->cloneEmpty()); + } + + /// We cannot use filtered offsets column, as it will be incorrect. 
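+ /// (After filtering, each surviving row's offset must point into the filtered variant,
+ /// so the old offset values are stale.)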
+ /// It will be reconstructed on ColumnVariant creation according to new local_discriminators. + return ColumnVariant::create(local_discriminators->filter(filt, result_size_hint), new_variants, local_to_global_discriminators); +} + +void ColumnVariant::expand(const Filter & mask, bool inverted) +{ + /// Expand local_discriminators using NULL_DISCRIMINATOR for 0-rows. + expandDataByMask(getLocalDiscriminators(), mask, inverted, NULL_DISCRIMINATOR); + expandDataByMask(getOffsets(), mask, inverted); +} + +ColumnPtr ColumnVariant::permute(const Permutation & perm, size_t limit) const +{ + /// If we have only NULLs, permutation will take no effect, just return resized column. + if (hasOnlyNulls()) + return cloneResized(limit); + + /// Optimization when we have only one non empty variant and no NULLs. + /// In this case local_discriminators column is filled with identical values and offsets column + /// filled with sequential numbers. In this case we can just apply permutation to this + /// single non-empty variant and cut local_discriminators and offsets columns to the result size. + if (auto non_empty_local_discr = getLocalDiscriminatorOfOneNoneEmptyVariantNoNulls()) + { + MutableColumns new_variants; + const size_t num_variants = variants.size(); + new_variants.reserve(num_variants); + for (size_t i = 0; i != num_variants; ++i) + { + if (i == *non_empty_local_discr) + new_variants.emplace_back(variants[*non_empty_local_discr]->permute(perm, limit)->assumeMutable()); + else + new_variants.emplace_back(variants[i]->assumeMutable()); + } + + size_t new_size = new_variants[*non_empty_local_discr]->size(); + return ColumnVariant::create(local_discriminators->cloneResized(new_size), offsets->cloneResized(new_size), std::move(new_variants), local_to_global_discriminators); + } + + return permuteImpl(*this, perm, limit); +} + +ColumnPtr ColumnVariant::index(const IColumn & indexes, size_t limit) const +{ + /// If we have only NULLs, index will take no effect, just return resized column. + if (hasOnlyNulls()) + return cloneResized(limit); + + /// Optimization when we have only one non empty variant and no NULLs. + /// In this case local_discriminators column is filled with identical values and offsets column + /// filled with sequential numbers. So we can just apply indexes to this + /// single non-empty variant and cut local_discriminators and offsets columns to the result size. + if (auto non_empty_local_discr = getLocalDiscriminatorOfOneNoneEmptyVariantNoNulls()) + { + MutableColumns new_variants; + const size_t num_variants = variants.size(); + new_variants.reserve(num_variants); + for (size_t i = 0; i != num_variants; ++i) + { + if (i == *non_empty_local_discr) + new_variants.emplace_back(variants[*non_empty_local_discr]->index(indexes, limit)->assumeMutable()); + else + new_variants.emplace_back(variants[i]->assumeMutable()); + } + + size_t new_size = new_variants[*non_empty_local_discr]->size(); + return ColumnVariant::create(local_discriminators->cloneResized(new_size), offsets->cloneResized(new_size), std::move(new_variants), local_to_global_discriminators); + } + + return selectIndexImpl(*this, indexes, limit); +} + +template +ColumnPtr ColumnVariant::indexImpl(const PaddedPODArray & indexes, size_t limit) const +{ + /// First, apply indexes for local_discriminators and offsets. 
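+ /// The indexed offsets still point into the unpermuted variants, so below they are
+ /// reused as per-variant permutations: for each selected row, its old offset says
+ /// which row to take from the corresponding variant.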
+ ColumnPtr new_local_discriminators = assert_cast(*local_discriminators).indexImpl(indexes, limit); + ColumnPtr new_offsets = assert_cast(*offsets).indexImpl(indexes, limit); + const auto & new_local_discriminators_data = assert_cast(*new_local_discriminators).getData(); + const auto & new_offsets_data = assert_cast(*new_offsets).getData(); + /// Then, create permutation for each variant. + const size_t num_variants = variants.size(); + std::vector nested_perms(num_variants); + /// If there is no limit, we know the size of each permutation + /// in advance and can use reserve. + if (limit == 0) + { + for (size_t i = 0; i != num_variants; ++i) + nested_perms[i].reserve(variants[i]->size()); + } + + for (size_t i = 0; i != new_local_discriminators_data.size(); ++i) + { + Discriminator discr = new_local_discriminators_data[i]; + if (discr != NULL_DISCRIMINATOR) + nested_perms[discr].push_back(new_offsets_data[i]); + } + + Columns new_variants; + new_variants.reserve(num_variants); + for (size_t i = 0; i != num_variants; ++i) + { + size_t nested_limit = nested_perms[i].size() == variants[i]->size() ? 0 : nested_perms[i].size(); + new_variants.emplace_back(variants[i]->permute(nested_perms[i], nested_limit)); + } + + /// We cannot use new_offsets column as an offset column, because it became invalid after variants permutation. + /// New offsets column will be created in constructor. + return ColumnVariant::create(new_local_discriminators, new_variants, local_to_global_discriminators); +} + +ColumnPtr ColumnVariant::replicate(const Offsets & replicate_offsets) const +{ + if (size() != replicate_offsets.size()) + throw Exception(ErrorCodes::SIZES_OF_COLUMNS_DOESNT_MATCH, "Size of offsets {} doesn't match size of column {}", replicate_offsets.size(), size()); + + if (empty()) + return cloneEmpty(); + + /// If we have only NULLs, just resize column to the new size. + if (hasOnlyNulls()) + return cloneResized(replicate_offsets.back()); + + const size_t num_variants = variants.size(); + + /// Optimization when we have only one non empty variant and no NULLs. + /// In this case local_discriminators column is filled with identical values and offsets column + /// filled with sequential numbers. So we can just replicate this one non empty variant, + /// then resize local_discriminators to the result size and fill offsets column. + if (auto non_empty_local_discr = getLocalDiscriminatorOfOneNoneEmptyVariantNoNulls()) + { + MutableColumns new_variants; + new_variants.reserve(num_variants); + for (size_t i = 0; i != num_variants; ++i) + { + if (i == *non_empty_local_discr) + new_variants.emplace_back(variants[*non_empty_local_discr]->replicate(replicate_offsets)->assumeMutable()); + else + new_variants.emplace_back(variants[i]->cloneEmpty()); + } + + size_t new_size = new_variants[*non_empty_local_discr]->size(); + /// Create and fill new local_discriminators column with non_empty_index discriminator. + auto new_local_discriminators = IColumn::mutate(local_discriminators); + assert_cast(*new_local_discriminators).getData().resize_fill(new_size, *non_empty_local_discr); + /// Create and fill new offsets column with sequential indexes. 
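+ /// E.g. if the old offsets were [0, 1, 2] and the single variant was replicated
+ /// to 5 rows, the offsets column becomes [0, 1, 2, 3, 4].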
+ auto new_offsets = IColumn::mutate(offsets); + auto & new_offsets_data = assert_cast(*new_offsets).getData(); + size_t old_size = offsets->size(); + if (new_size > old_size) + { + new_offsets_data.reserve(new_size); + for (size_t i = old_size; i < new_size; ++i) + new_offsets_data.push_back(new_offsets_data[i - 1] + 1); + } + else + { + new_offsets_data.resize(new_size); + } + + return ColumnVariant::create(std::move(new_local_discriminators), std::move(new_offsets), std::move(new_variants), local_to_global_discriminators); + } + + /// Create replicate offsets for each variant according to + /// local_discriminators column. + std::vector nested_replicated_offsets(num_variants); + for (size_t i = 0; i != num_variants; ++i) + nested_replicated_offsets[i].reserve(variants[i]->size()); + + const auto & local_discriminators_data = getLocalDiscriminators(); + for (size_t i = 0; i != local_discriminators_data.size(); ++i) + { + Discriminator discr = local_discriminators_data[i]; + if (discr != NULL_DISCRIMINATOR) + { + size_t repeat_count = replicate_offsets[i] - replicate_offsets[i - 1]; + nested_replicated_offsets[discr].push_back(nested_replicated_offsets[discr].back() + repeat_count); + } + } + + auto new_local_discriminators = local_discriminators->replicate(replicate_offsets); + Columns new_variants; + new_variants.reserve(num_variants); + for (size_t i = 0; i != num_variants; ++i) + new_variants.emplace_back(variants[i]->replicate(nested_replicated_offsets[i])); + + /// New offsets column will be created in constructor. + return ColumnVariant::create(new_local_discriminators, new_variants, local_to_global_discriminators); +} + +MutableColumns ColumnVariant::scatter(ColumnIndex num_columns, const Selector & selector) const +{ + const size_t num_variants = variants.size(); + + /// If we have only NULLs, we need to scatter only local_discriminators. + if (hasOnlyNulls()) + { + auto scattered_local_discriminators = local_discriminators->scatter(num_columns, selector); + MutableColumns result; + result.reserve(num_columns); + for (size_t i = 0; i != num_columns; ++i) + { + MutableColumns new_variants; + new_variants.reserve(num_variants); + for (const auto & variant : variants) + new_variants.emplace_back(IColumn::mutate(variant)); + + result.emplace_back(ColumnVariant::create(std::move(scattered_local_discriminators[i]), std::move(new_variants), local_to_global_discriminators)); + } + + return result; + } + + /// Optimization when we have only one non empty variant and no NULLs. + /// In this case we can just scatter local_discriminators and this non empty variant. 
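+ /// (scatter() distributes rows into num_columns new columns according to selector[row]:
+ /// e.g. selector [0, 1, 0] sends rows 0 and 2 to result column 0 and row 1 to result column 1.)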
+ if (auto non_empty_local_discr = getLocalDiscriminatorOfOneNoneEmptyVariantNoNulls()) + { + auto scattered_local_discriminators = local_discriminators->scatter(num_columns, selector); + auto scattered_non_empty_variant = variants[*non_empty_local_discr]->scatter(num_columns, selector); + MutableColumns result; + result.reserve(num_columns); + for (size_t i = 0; i != num_columns; ++i) + { + MutableColumns scattered_nested_variants(num_variants); + for (size_t j = 0; j != num_variants; ++j) + { + if (j == *non_empty_local_discr) + scattered_nested_variants[j] = std::move(scattered_non_empty_variant[i]); + else + scattered_nested_variants[j] = IColumn::mutate(variants[j]); + } + + result.emplace_back(ColumnVariant::create(std::move(scattered_local_discriminators[i]), std::move(scattered_nested_variants), local_to_global_discriminators)); + } + + return result; + } + + /// Create selector for each variant according to local_discriminators. + std::vector nested_selectors(num_variants); + for (size_t i = 0; i != num_variants; ++i) + nested_selectors[i].reserve(variants[i]->size()); + + const auto & local_discriminators_data = getLocalDiscriminators(); + for (size_t i = 0; i != local_discriminators_data.size(); ++i) + { + Discriminator discr = local_discriminators_data[i]; + if (discr != NULL_DISCRIMINATOR) + nested_selectors[discr].push_back(selector[i]); + } + + auto scattered_local_discriminators = local_discriminators->scatter(num_columns, selector); + std::vector nested_scattered_variants; + nested_scattered_variants.reserve(num_variants); + for (size_t i = 0; i != num_variants; ++i) + nested_scattered_variants.emplace_back(variants[i]->scatter(num_columns, nested_selectors[i])); + + MutableColumns result; + result.reserve(num_columns); + for (size_t i = 0; i != num_columns; ++i) + { + MutableColumns new_variants; + new_variants.reserve(num_variants); + for (size_t j = 0; j != num_variants; ++j) + new_variants.emplace_back(std::move(nested_scattered_variants[j][i])); + result.emplace_back(ColumnVariant::create(std::move(scattered_local_discriminators[i]), std::move(new_variants), local_to_global_discriminators)); + } + + return result; +} + +void ColumnVariant::gather(ColumnGathererStream & gatherer) +{ + gatherer.gather(*this); +} + +bool ColumnVariant::hasEqualValues() const +{ + if (local_discriminators->empty() || hasOnlyNulls()) + return true; + + return local_discriminators->hasEqualValues() && variants[localDiscriminatorAt(0)]->hasEqualValues(); +} + +void ColumnVariant::getPermutation(IColumn::PermutationSortDirection, IColumn::PermutationSortStability, size_t, int, IColumn::Permutation & res) const +{ + size_t s = local_discriminators->size(); + res.resize(s); + for (size_t i = 0; i < s; ++i) + res[i] = i; +} + +void ColumnVariant::updatePermutation(IColumn::PermutationSortDirection, IColumn::PermutationSortStability, size_t, int, IColumn::Permutation &, DB::EqualRanges &) const +{ +} + +void ColumnVariant::reserve(size_t n) +{ + local_discriminators->reserve(n); + offsets->reserve(n); +} + +void ColumnVariant::ensureOwnership() +{ + const size_t num_variants = variants.size(); + for (size_t i = 0; i < num_variants; ++i) + getVariantByLocalDiscriminator(i).ensureOwnership(); +} + +size_t ColumnVariant::byteSize() const +{ + size_t res = local_discriminators->byteSize() + offsets->byteSize(); + for (const auto & variant : variants) + res += variant->byteSize(); + return res; +} + +size_t ColumnVariant::byteSizeAt(size_t n) const +{ + size_t res = sizeof(Offset) + 
sizeof(Discriminator); + Discriminator discr = localDiscriminatorAt(n); + if (discr == NULL_DISCRIMINATOR) + return res; + + return res + variants[discr]->byteSizeAt(offsetAt(n)); +} + +size_t ColumnVariant::allocatedBytes() const +{ + size_t res = local_discriminators->allocatedBytes() + offsets->allocatedBytes(); + for (const auto & variant : variants) + res += variant->allocatedBytes(); + return res; +} + +void ColumnVariant::protect() +{ + local_discriminators->protect(); + offsets->protect(); + for (auto & variant : variants) + variant->protect(); +} + +void ColumnVariant::getExtremes(Field & min, Field & max) const +{ + min = Null(); + max = Null(); +} + +void ColumnVariant::forEachSubcolumn(MutableColumnCallback callback) +{ + callback(local_discriminators); + callback(offsets); + for (auto & variant : variants) + callback(variant); +} + +void ColumnVariant::forEachSubcolumnRecursively(RecursiveMutableColumnCallback callback) +{ + callback(*local_discriminators); + local_discriminators->forEachSubcolumnRecursively(callback); + callback(*offsets); + offsets->forEachSubcolumnRecursively(callback); + + for (auto & variant : variants) + { + callback(*variant); + variant->forEachSubcolumnRecursively(callback); + } +} + +bool ColumnVariant::structureEquals(const IColumn & rhs) const +{ + const auto * rhs_variant = typeid_cast(&rhs); + if (!rhs_variant) + return false; + + const size_t num_variants = variants.size(); + if (num_variants != rhs_variant->variants.size()) + return false; + + for (size_t i = 0; i < num_variants; ++i) + if (!variants[i]->structureEquals(rhs_variant->getVariantByGlobalDiscriminator(globalDiscriminatorByLocal(i)))) + return false; + + return true; +} + +ColumnPtr ColumnVariant::compress() const +{ + ColumnPtr local_discriminators_compressed = local_discriminators->compress(); + ColumnPtr offsets_compressed = offsets->compress(); + size_t byte_size = local_discriminators_compressed->byteSize() + offsets_compressed->byteSize(); + Columns compressed; + compressed.reserve(variants.size()); + for (const auto & variant : variants) + { + auto compressed_variant = variant->compress(); + byte_size += compressed_variant->byteSize(); + compressed.emplace_back(std::move(compressed_variant)); + } + + return ColumnCompressed::create(size(), byte_size, + [my_local_discriminators_compressed = std::move(local_discriminators_compressed), my_offsets_compressed = std::move(offsets_compressed), my_compressed = std::move(compressed), my_local_to_global_discriminators = this->local_to_global_discriminators]() mutable + { + for (auto & variant : my_compressed) + variant = variant->decompress(); + return ColumnVariant::create(my_local_discriminators_compressed->decompress(), my_offsets_compressed->decompress(), my_compressed, my_local_to_global_discriminators); + }); +} + +double ColumnVariant::getRatioOfDefaultRows(double) const +{ + UInt64 num_defaults = getNumberOfDefaultRows(); + return static_cast(num_defaults) / local_discriminators->size(); +} + +UInt64 ColumnVariant::getNumberOfDefaultRows() const +{ + size_t total_variant_sizes = 0; + for (const auto & variant : variants) + total_variant_sizes += variant->size(); + return local_discriminators->size() - total_variant_sizes; +} + +void ColumnVariant::getIndicesOfNonDefaultRows(Offsets & indices, size_t from, size_t limit) const +{ + return getIndicesOfNonDefaultRowsImpl(indices, from, limit); +} + +void ColumnVariant::finalize() +{ + for (auto & variant : variants) + variant->finalize(); +} + +bool ColumnVariant::isFinalized() 
const +{ + return std::all_of(variants.begin(), variants.end(), [](const auto & variant) { return variant->isFinalized(); }); +} + +std::optional ColumnVariant::getLocalDiscriminatorOfOneNoneEmptyVariantNoNulls() const +{ + for (size_t i = 0; i != variants.size(); ++i) + { + if (variants[i]->size() == local_discriminators->size()) + return i; + } + + return std::nullopt; +} + +void ColumnVariant::applyNullMap(const ColumnVector::Container & null_map) +{ + applyNullMapImpl(null_map); +} + +void ColumnVariant::applyNegatedNullMap(const ColumnVector::Container & null_map) +{ + applyNullMapImpl(null_map); +} + +template +void ColumnVariant::applyNullMapImpl(const ColumnVector::Container & null_map) +{ + if (null_map.size() != local_discriminators->size()) + throw Exception(ErrorCodes::SIZES_OF_NESTED_COLUMNS_ARE_INCONSISTENT, + "Logical error: Sizes of discriminators column and null map data are not equal"); + + /// If we have only NULLs, nothing to do. + if (hasOnlyNulls()) + { + return; + } + + /// If we have only 1 non empty column and no NULLs, we can just filter that + /// variant according to the null_map. + if (auto non_empty_local_discr = getLocalDiscriminatorOfOneNoneEmptyVariantNoNulls()) + { + auto & local_discriminators_data = getLocalDiscriminators(); + auto & offsets_data = getOffsets(); + size_t size_hint = 0; + + if constexpr (inverted) + { + for (size_t i = 0; i != local_discriminators_data.size(); ++i) + { + if (null_map[i]) + offsets_data[i] = size_hint++; + else + local_discriminators_data[i] = NULL_DISCRIMINATOR; + } + variants[*non_empty_local_discr] = variants[*non_empty_local_discr]->filter(null_map, size_hint); + } + else + { + ColumnVector::Container filter; + filter.reserve(null_map.size()); + for (size_t i = 0; i != local_discriminators_data.size(); ++i) + { + if (null_map[i]) + { + filter.push_back(0); + local_discriminators_data[i] = NULL_DISCRIMINATOR; + } + else + { + filter.push_back(1); + offsets_data[i] = size_hint++; + } + } + variants[*non_empty_local_discr] = variants[*non_empty_local_discr]->filter(filter, size_hint); + } + + return; + } + + /// In general case we should iterate through null_map + discriminators, + /// create filter for each variant and update offsets column. + std::vector variant_filters; + variant_filters.resize(variants.size()); + std::vector variant_new_sizes; + variant_new_sizes.resize(variants.size(), 0); + + auto & local_discriminators_data = getLocalDiscriminators(); + auto & offsets_data = getOffsets(); + for (size_t i = 0; i != local_discriminators_data.size(); ++i) + { + auto & discr = local_discriminators_data[i]; + if (discr != NULL_DISCRIMINATOR) + { + if (null_map[i] ^ inverted) + { + auto & variant_filter = variant_filters[discr]; + /// We create filters lazily. + if (variant_filter.empty()) + variant_filter.resize_fill(variants[discr]->size(), 1); + variant_filter[offsets_data[i]] = 0; + discr = NULL_DISCRIMINATOR; + } + else + { + offsets_data[i] = variant_new_sizes[discr]++; + } + } + } + + for (size_t i = 0; i != variants.size(); ++i) + { + if (!variant_filters[i].empty()) + variants[i] = variants[i]->filter(variant_filters[i], variant_new_sizes[i]); + } +} + +} diff --git a/src/Columns/ColumnVariant.h b/src/Columns/ColumnVariant.h new file mode 100644 index 00000000000..692fdd1709e --- /dev/null +++ b/src/Columns/ColumnVariant.h @@ -0,0 +1,306 @@ +#pragma once + +#include +#include + + +namespace DB +{ + +namespace ErrorCodes +{ + extern const int LOGICAL_ERROR; +} + +/** + * Column for storing Variant(...) 
type values.
+ * Variant type represents a union of other data types.
+ * For example, type Variant(T1, T2, ..., TN) means that each row of this type
+ * has a value of either type T1 or T2 or ... or TN or none of them (NULL value).
+ *
+ * ColumnVariant stores:
+ * - The discriminators column, which determines which variant is stored in each row.
+ * - The offsets column, which determines the offset in the corresponding variant column in each row.
+ * - The list of variant columns with only real values (so the sizes of variant columns can be different).
+ * Discriminator is an index of a variant in the variants list; it also has a special value called NULL_DISCRIMINATOR
+ * that indicates that the value in the row is NULL.
+ *
+ * We want to be able to extend Variant column for free without rewriting the data, but as we don't care about the
+ * order of variants during Variant creation (we want Variant(T1, T2) to be the same as Variant(T2, T1)), we support
+ * some global order of nested types inside Variant during type creation, so after extension the order of variant types
+ * (and so their discriminators) can change. For example: Variant(T1, T3) -> Variant(T1, T2, T3).
+ * To avoid a full rewrite of the discriminators column on Variant extension, we differentiate the local order of variants
+ * inside a column and the global order of variants created during type creation. So, ColumnVariant stores only local
+ * discriminators and additionally stores the mapping between global and local discriminators.
+ * So, when we need to extend a Variant column with a new variant, we can just append it to the list of variant columns
+ * with a new local discriminator and update the mapping between global and local orders.
+ *
+ * Note that two instances of ColumnVariant can have different local orders, so we should always use global
+ * discriminators during inter-column interactions.
+ *
+ * Let's take an example with type Variant(UInt32, String, Array(UInt32)):
+ * During type creation we will sort types by their names and get the global order: Array(UInt32), String, UInt32.
+ * So, type Array(UInt32) will have global discriminator 0, String - 1 and UInt32 - 2.
+ * Let's say we have a column with local order (String, UInt32, Array(UInt32)) and values:
+ * 'Hello', 42, NULL, 'World', 43, [1, 2, 3], NULL, 44
+ *
+ * Let's see how these values will be stored in ColumnVariant:
+ *
+ * local_to_global_discriminators: {0 : 1, 1 : 2, 2 : 0}
+ * global_to_local_discriminators: {0 : 2, 1 : 0, 2 : 1}
+ *
+ * local_discriminators    offsets    String     UInt32    Array(UInt32)
+ * 0                       0          'Hello'    42        [1, 2, 3]
+ * 1                       0          'World'    43
+ * NULL_DISCRIMINATOR      0                     44
+ * 0                       1
+ * 1                       1
+ * 2                       0
+ * NULL_DISCRIMINATOR      0
+ * 1                       2
+ *
+ */
+class ColumnVariant final : public COWHelper<IColumn, ColumnVariant>
+{
+public:
+ using Discriminator = UInt8;
+ using Discriminators = PaddedPODArray<Discriminator>;
+ using ColumnDiscriminators = ColumnVector<Discriminator>;
+ using ColumnOffsets = ColumnVector<Offset>;
+
+ static constexpr UInt8 NULL_DISCRIMINATOR = std::numeric_limits<Discriminator>::max(); /// 255
+ static constexpr size_t MAX_NESTED_COLUMNS = std::numeric_limits<Discriminator>::max(); /// 255
+
+private:
+ friend class COWHelper<IColumn, ColumnVariant>;
+
+ using NestedColumns = std::vector<WrappedPtr>;
+
+ /// Create an empty column with provided variants.
+ /// Variants are in global order.
+ explicit ColumnVariant(MutableColumns && variants_);
+ /// Variants are in local order according to provided mapping.
+ explicit ColumnVariant(MutableColumns && variants_, const std::vector<Discriminator> & local_to_global_discriminators_);
+
+ /// Create column from discriminators column and list of variant columns.
+ /// Offsets column should be constructed according to the discriminators.
+ /// Variants are in global order.
+ ColumnVariant(MutableColumnPtr local_discriminators_, MutableColumns && variants_);
+ /// Variants are in local order according to provided mapping.
+ ColumnVariant(MutableColumnPtr local_discriminators_, MutableColumns && variants_, const std::vector<Discriminator> & local_to_global_discriminators_);
+
+ /// Create column from discriminators column, offsets column and list of variant columns.
+ /// Variants are in global order.
+ ColumnVariant(MutableColumnPtr local_discriminators_, MutableColumnPtr offsets_, MutableColumns && variants_);
+ /// Variants are in local order according to provided mapping.
+ ColumnVariant(MutableColumnPtr local_discriminators_, MutableColumnPtr offsets_, MutableColumns && variants_, const std::vector<Discriminator> & local_to_global_discriminators_);
+
+ ColumnVariant(const ColumnVariant &) = default;
+
+public:
+ /** Create immutable column using immutable arguments. These arguments may be shared with other variants.
+ * Use IColumn::mutate in order to make mutable column and mutate shared nested variants.
+ */
+ using Base = COWHelper<IColumn, ColumnVariant>;
+ static Ptr create(const Columns & variants_) { return create(variants_, {}); }
+ static Ptr create(const Columns & variants_, const std::vector<Discriminator> & local_to_global_discriminators_);
+ static Ptr create(const ColumnPtr & local_discriminators_, const Columns & variants_) { return create(local_discriminators_, variants_, {}); }
+ static Ptr create(const ColumnPtr & local_discriminators_, const Columns & variants_, const std::vector<Discriminator> & local_to_global_discriminators_);
+ static Ptr create(const ColumnPtr & local_discriminators_, const DB::ColumnPtr & offsets_, const Columns & variants_) { return create(local_discriminators_, offsets_, variants_, {}); }
+ static Ptr create(const ColumnPtr & local_discriminators_, const DB::ColumnPtr & offsets_, const Columns & variants_, const std::vector<Discriminator> & local_to_global_discriminators_);
+
+ static MutablePtr create(MutableColumns && variants_)
+ {
+ return Base::create(std::move(variants_));
+ }
+
+ static MutablePtr create(MutableColumns && variants_, const std::vector<Discriminator> & local_to_global_discriminators_)
+ {
+ return Base::create(std::move(variants_), local_to_global_discriminators_);
+ }
+
+ static MutablePtr create(MutableColumnPtr local_discriminators_, MutableColumns && variants_)
+ {
+ return Base::create(std::move(local_discriminators_), std::move(variants_));
+ }
+
+ static MutablePtr create(MutableColumnPtr local_discriminators_, MutableColumns && variants_, const std::vector<Discriminator> & local_to_global_discriminators_)
+ {
+ return Base::create(std::move(local_discriminators_), std::move(variants_), local_to_global_discriminators_);
+ }
+
+ static MutablePtr create(MutableColumnPtr local_discriminators_, MutableColumnPtr offsets_, MutableColumns && variants_)
+ {
+ return Base::create(std::move(local_discriminators_), std::move(offsets_), std::move(variants_));
+ }
+
+ static MutablePtr create(MutableColumnPtr local_discriminators_, MutableColumnPtr offsets_, MutableColumns && variants_, const std::vector<Discriminator> & local_to_global_discriminators_)
+ {
+ return Base::create(std::move(local_discriminators_), std::move(offsets_), std::move(variants_), local_to_global_discriminators_);
+ }
+
+ std::string getName() const override;
+ const char * getFamilyName() const override { return "Variant"; }
+ TypeIndex getDataType() const override { return TypeIndex::Variant; }
+
+ MutableColumnPtr cloneEmpty() const override;
+ 
MutableColumnPtr cloneResized(size_t size) const override;
+
+ size_t ALWAYS_INLINE offsetAt(size_t i) const { return getOffsets()[i]; }
+ Discriminator ALWAYS_INLINE localDiscriminatorAt(size_t i) const { return getLocalDiscriminators()[i]; }
+ Discriminator ALWAYS_INLINE globalDiscriminatorAt(size_t i) const { return globalDiscriminatorByLocal(getLocalDiscriminators()[i]); }
+
+ Discriminator ALWAYS_INLINE globalDiscriminatorByLocal(Discriminator local_discr) const
+ {
+ /// NULL_DISCRIMINATOR is always the same in local and global orders.
+ return local_discr == NULL_DISCRIMINATOR ? NULL_DISCRIMINATOR : local_to_global_discriminators[local_discr];
+ }
+
+ Discriminator ALWAYS_INLINE localDiscriminatorByGlobal(Discriminator global_discr) const
+ {
+ /// NULL_DISCRIMINATOR is always the same in local and global orders.
+ return global_discr == NULL_DISCRIMINATOR ? NULL_DISCRIMINATOR : global_to_local_discriminators[global_discr];
+ }
+
+ size_t size() const override
+ {
+ return local_discriminators->size();
+ }
+
+ Field operator[](size_t n) const override;
+ void get(size_t n, Field & res) const override;
+
+ bool isDefaultAt(size_t n) const override;
+ bool isNullAt(size_t n) const override;
+ StringRef getDataAt(size_t n) const override;
+ void insertData(const char * pos, size_t length) override;
+ void insert(const Field & x) override;
+ void insertFrom(const IColumn & src_, size_t n) override;
+ void insertRangeFrom(const IColumn & src, size_t start, size_t length) override;
+ void insertManyFrom(const IColumn & src, size_t position, size_t length) override;
+ void insertDefault() override;
+ void insertManyDefaults(size_t length) override;
+ void popBack(size_t n) override;
+ StringRef serializeValueIntoArena(size_t n, Arena & arena, char const *& begin, const UInt8 *) const override;
+ const char * deserializeAndInsertFromArena(const char * pos) override;
+ const char * skipSerializedInArena(const char * pos) const override;
+ void updateHashWithValue(size_t n, SipHash & hash) const override;
+ void updateWeakHash32(WeakHash32 & hash) const override;
+ void updateHashFast(SipHash & hash) const override;
+ ColumnPtr filter(const Filter & filt, ssize_t result_size_hint) const override;
+ void expand(const Filter & mask, bool inverted) override;
+ ColumnPtr permute(const Permutation & perm, size_t limit) const override;
+ ColumnPtr index(const IColumn & indexes, size_t limit) const override;
+ template <typename Type>
+ ColumnPtr indexImpl(const PaddedPODArray<Type> & indexes, size_t limit) const;
+ ColumnPtr replicate(const Offsets & replicate_offsets) const override;
+ MutableColumns scatter(ColumnIndex num_columns, const Selector & selector) const override;
+ void gather(ColumnGathererStream & gatherer_stream) override;
+
+ /// Variant type is not comparable.
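+ /// compareAt deliberately reports all rows as equal; getPermutation correspondingly
+ /// degenerates to the identity permutation.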
+ int compareAt(size_t, size_t, const IColumn &, int) const override
+ {
+ return 0;
+ }
+
+ void compareColumn(const IColumn &, size_t, PaddedPODArray<UInt64> *, PaddedPODArray<Int8> &, int, int) const override
+ {
+ throw Exception(ErrorCodes::NOT_IMPLEMENTED, "Method compareColumn is not supported for ColumnVariant");
+ }
+
+ bool hasEqualValues() const override;
+ void getExtremes(Field & min, Field & max) const override;
+ void getPermutation(IColumn::PermutationSortDirection direction, IColumn::PermutationSortStability stability,
+ size_t limit, int nan_direction_hint, IColumn::Permutation & res) const override;
+ void updatePermutation(IColumn::PermutationSortDirection direction, IColumn::PermutationSortStability stability,
+ size_t limit, int nan_direction_hint, IColumn::Permutation & res, EqualRanges & equal_ranges) const override;
+
+ void reserve(size_t n) override;
+ void ensureOwnership() override;
+ size_t byteSize() const override;
+ size_t byteSizeAt(size_t n) const override;
+ size_t allocatedBytes() const override;
+ void protect() override;
+ void forEachSubcolumn(MutableColumnCallback callback) override;
+ void forEachSubcolumnRecursively(RecursiveMutableColumnCallback callback) override;
+ bool structureEquals(const IColumn & rhs) const override;
+ ColumnPtr compress() const override;
+ double getRatioOfDefaultRows(double sample_ratio) const override;
+ UInt64 getNumberOfDefaultRows() const override;
+ void getIndicesOfNonDefaultRows(Offsets & indices, size_t from, size_t limit) const override;
+ void finalize() override;
+ bool isFinalized() const override;
+
+ const IColumn & getVariantByLocalDiscriminator(size_t discr) const { return *variants[discr]; }
+ const IColumn & getVariantByGlobalDiscriminator(size_t discr) const { return *variants[global_to_local_discriminators.at(discr)]; }
+ IColumn & getVariantByLocalDiscriminator(size_t discr) { return *variants[discr]; }
+ IColumn & getVariantByGlobalDiscriminator(size_t discr) { return *variants[global_to_local_discriminators.at(discr)]; }
+
+ const ColumnPtr & getVariantPtrByLocalDiscriminator(size_t discr) const { return variants[discr]; }
+ const ColumnPtr & getVariantPtrByGlobalDiscriminator(size_t discr) const { return variants[global_to_local_discriminators.at(discr)]; }
+ ColumnPtr & getVariantPtrByLocalDiscriminator(size_t discr) { return variants[discr]; }
+ ColumnPtr & getVariantPtrByGlobalDiscriminator(size_t discr) { return variants[global_to_local_discriminators.at(discr)]; }
+
+ const IColumn & getLocalDiscriminatorsColumn() const { return *local_discriminators; }
+ IColumn & getLocalDiscriminatorsColumn() { return *local_discriminators; }
+
+ const ColumnPtr & getLocalDiscriminatorsPtr() const { return local_discriminators; }
+ ColumnPtr & getLocalDiscriminatorsPtr() { return local_discriminators; }
+
+ const Discriminators & ALWAYS_INLINE getLocalDiscriminators() const { return assert_cast<const ColumnDiscriminators &>(*local_discriminators).getData(); }
+ Discriminators & ALWAYS_INLINE getLocalDiscriminators() { return assert_cast<ColumnDiscriminators &>(*local_discriminators).getData(); }
+
+ const IColumn & getOffsetsColumn() const { return *offsets; }
+ IColumn & getOffsetsColumn() { return *offsets; }
+
+ const ColumnPtr & getOffsetsPtr() const { return offsets; }
+ ColumnPtr & getOffsetsPtr() { return offsets; }
+
+ const Offsets & ALWAYS_INLINE getOffsets() const { return assert_cast<const ColumnOffsets &>(*offsets).getData(); }
+ Offsets & ALWAYS_INLINE getOffsets() { return assert_cast<ColumnOffsets &>(*offsets).getData(); }
+
+ size_t getNumVariants() const { return variants.size(); }
+
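+ /// A typical read of row n through the global-discriminator API (an illustrative sketch):
+ ///
+ /// Discriminator global_discr = column.globalDiscriminatorAt(n);
+ /// if (global_discr != ColumnVariant::NULL_DISCRIMINATOR)
+ /// Field value = column.getVariantByGlobalDiscriminator(global_discr)[column.offsetAt(n)];
+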
+ bool hasOnlyNulls() const + { + /// If all variants are empty, we have only NULL values. + return std::all_of(variants.begin(), variants.end(), [](const auto & v){ return v->empty(); } ); + } + + /// Check if local and global order is the same. + bool hasGlobalVariantsOrder() const + { + for (size_t i = 0; i != local_to_global_discriminators.size(); ++i) + { + if (local_to_global_discriminators[i] != i) + return false; + } + + return true; + } + + /// Check if we have only 1 non-empty variant and no NULL values, + /// and if so, return the discriminator of this non-empty column. + std::optional getLocalDiscriminatorOfOneNoneEmptyVariantNoNulls() const; + + /// Apply null map to a Variant column. + /// Replace corresponding discriminators with NULL_DISCRIMINATOR + /// and filter out rows in variants if needed. + void applyNullMap(const ColumnVector::Container & null_map); + void applyNegatedNullMap(const ColumnVector::Container & null_map); + +private: + void initIdentityGlobalToLocalDiscriminatorsMapping(); + + template + void applyNullMapImpl(const ColumnVector::Container & null_map); + + WrappedPtr local_discriminators; + WrappedPtr offsets; + NestedColumns variants; + + std::vector global_to_local_discriminators; + std::vector local_to_global_discriminators; +}; + + +} diff --git a/src/Columns/IColumn.cpp b/src/Columns/IColumn.cpp index 7923bca6354..82dc82e0bd9 100644 --- a/src/Columns/IColumn.cpp +++ b/src/Columns/IColumn.cpp @@ -2,6 +2,7 @@ #include #include #include +#include #include #include #include @@ -83,6 +84,11 @@ bool isColumnNullable(const IColumn & column) return checkColumn(column); } +bool isColumnNullableOrLowCardinalityNullable(const IColumn & column) +{ + return isColumnNullable(column) || isColumnLowCardinalityNullable(column); +} + bool isColumnConst(const IColumn & column) { return checkColumn(column); diff --git a/src/Columns/IColumn.h b/src/Columns/IColumn.h index f012eeca61f..0dcba5b310c 100644 --- a/src/Columns/IColumn.h +++ b/src/Columns/IColumn.h @@ -631,6 +631,17 @@ struct IsMutableColumns template <> struct IsMutableColumns<> { static const bool value = true; }; +template +struct IsMutableColumnsOrRvalueReferences; + +template +struct IsMutableColumnsOrRvalueReferences +{ + static const bool value = (std::is_assignable::value || std::is_rvalue_reference_v) && IsMutableColumnsOrRvalueReferences::value; +}; + +template <> +struct IsMutableColumnsOrRvalueReferences<> { static const bool value = true; }; template const Type * checkAndGetColumn(const IColumn & column) @@ -662,4 +673,7 @@ bool isColumnConst(const IColumn & column); /// True if column's an ColumnNullable instance. It's just a syntax sugar for type check. bool isColumnNullable(const IColumn & column); +/// True if column's is ColumnNullable or ColumnLowCardinality with nullable nested column. 
+bool isColumnNullableOrLowCardinalityNullable(const IColumn & column); + } diff --git a/src/Columns/MaskOperations.cpp b/src/Columns/MaskOperations.cpp index b84268356a7..518269e1728 100644 --- a/src/Columns/MaskOperations.cpp +++ b/src/Columns/MaskOperations.cpp @@ -17,7 +17,7 @@ namespace ErrorCodes } template -void expandDataByMask(PaddedPODArray & data, const PaddedPODArray & mask, bool inverted) +void expandDataByMask(PaddedPODArray & data, const PaddedPODArray & mask, bool inverted, T default_value) { if (mask.size() < data.size()) throw Exception(ErrorCodes::LOGICAL_ERROR, "Mask size should be no less than data size."); @@ -38,7 +38,7 @@ void expandDataByMask(PaddedPODArray & data, const PaddedPODArray & ma --from; } else - data[index] = T(); + data[index] = default_value; --index; } @@ -49,7 +49,7 @@ void expandDataByMask(PaddedPODArray & data, const PaddedPODArray & ma /// Explicit instantiations - not to place the implementation of the function above in the header file. #define INSTANTIATE(TYPE) \ -template void expandDataByMask(PaddedPODArray &, const PaddedPODArray &, bool); +template void expandDataByMask(PaddedPODArray &, const PaddedPODArray &, bool, TYPE); INSTANTIATE(UInt8) INSTANTIATE(UInt16) diff --git a/src/Columns/MaskOperations.h b/src/Columns/MaskOperations.h index e43b4588258..cc5226bf0c1 100644 --- a/src/Columns/MaskOperations.h +++ b/src/Columns/MaskOperations.h @@ -13,7 +13,7 @@ namespace DB /// If inverted is true, we will work with inverted mask. This function is used in implementations of /// expand() method in IColumn interface. template -void expandDataByMask(PaddedPODArray & data, const PaddedPODArray & mask, bool inverted); +void expandDataByMask(PaddedPODArray & data, const PaddedPODArray & mask, bool inverted, T default_value = T()); struct MaskInfo { diff --git a/src/Columns/tests/gtest_column_variant.cpp b/src/Columns/tests/gtest_column_variant.cpp new file mode 100644 index 00000000000..b701e2d3183 --- /dev/null +++ b/src/Columns/tests/gtest_column_variant.cpp @@ -0,0 +1,688 @@ +#include +#include +#include +#include +#include + +using namespace DB; + +TEST(ColumnVariant, CreateFromEmptyColumns) +{ + MutableColumns columns; + columns.push_back(ColumnUInt32::create()); + columns.push_back(ColumnString::create()); + auto column = ColumnVariant::create(std::move(columns)); + ASSERT_TRUE(column->empty() && column->getLocalDiscriminators().empty() && column->getOffsets().empty()); +} + +TEST(ColumnVariant, CreateFromEmptyColumnsWithLocalOrder) +{ + MutableColumns columns; + columns.push_back(ColumnUInt32::create()); + columns.push_back(ColumnString::create()); + std::vector local_to_global_discriminators; + local_to_global_discriminators.push_back(1); + local_to_global_discriminators.push_back(0); + auto column = ColumnVariant::create(std::move(columns), local_to_global_discriminators); + ASSERT_TRUE(column->empty() && column->getLocalDiscriminators().empty() && column->getOffsets().empty()); + ASSERT_EQ(column->localDiscriminatorByGlobal(0), 0); + ASSERT_EQ(column->localDiscriminatorByGlobal(1), 1); + ASSERT_EQ(column->globalDiscriminatorByLocal(0), 0); + ASSERT_EQ(column->globalDiscriminatorByLocal(1), 1); +} + +MutableColumns createColumns1() +{ + MutableColumns columns; + auto column1 = ColumnUInt64::create(); + column1->insertValue(42); + columns.push_back(std::move(column1)); + auto column2 = ColumnString::create(); + column2->insertData("Hello", 5); + column2->insertData("World", 5); + columns.push_back(std::move(column2)); + auto column3 = 
ColumnUInt32::create(); + columns.push_back(std::move(column3)); + return columns; +} + +MutableColumnPtr createDiscriminators1() +{ + auto discriminators_column = ColumnVariant::ColumnDiscriminators::create(); + discriminators_column->insertValue(0); + discriminators_column->insertValue(1); + discriminators_column->insertValue(ColumnVariant::NULL_DISCRIMINATOR); + discriminators_column->insertValue(1); + discriminators_column->insertValue(ColumnVariant::NULL_DISCRIMINATOR); + return discriminators_column; +} + +void reorderColumns(const std::vector & local_to_global_order, MutableColumns & columns) +{ + MutableColumns res; + for (auto global_discr : local_to_global_order) + res.push_back(std::move(columns[global_discr])); + columns = std::move(res); +} + +template +void reorderDiscriminators(const std::vector & local_to_global_order, Ptr & discriminators) +{ + std::vector global_to_local_order(local_to_global_order.size()); + for (size_t i = 0; i != local_to_global_order.size(); ++i) + global_to_local_order[local_to_global_order[i]] = i; + + auto & discriminators_data = assert_cast(discriminators.get())->getData(); + for (auto & discr : discriminators_data) + { + if (discr != ColumnVariant::NULL_DISCRIMINATOR) + discr = global_to_local_order[discr]; + } +} + +MutableColumnPtr createOffsets1() +{ + auto offsets = ColumnVariant::ColumnOffsets::create(); + offsets->insertValue(0); + offsets->insertValue(0); + offsets->insertValue(0); + offsets->insertValue(1); + offsets->insertValue(0); + return offsets; +} + +std::vector createLocalToGlobalOrder1() +{ + std::vector local_to_global_discriminators; + local_to_global_discriminators.push_back(1); + local_to_global_discriminators.push_back(2); + local_to_global_discriminators.push_back(0); + return local_to_global_discriminators; +} + +void checkColumnVariant1(ColumnVariant * column) +{ + const auto & offsets = column->getOffsets(); + ASSERT_EQ(column->size(), 5); + ASSERT_EQ(offsets[0], 0); + ASSERT_EQ(offsets[1], 0); + ASSERT_EQ(offsets[3], 1); + ASSERT_TRUE(column->isDefaultAt(2) && column->isDefaultAt(4)); + ASSERT_EQ((*column)[0].get(), 42); + ASSERT_EQ((*column)[1].get(), "Hello"); + ASSERT_TRUE((*column)[2].isNull()); + ASSERT_EQ((*column)[3].get(), "World"); + ASSERT_TRUE((*column)[4].isNull()); +} + +void checkColumnVariant1Order(ColumnVariant * column) +{ + ASSERT_EQ(column->localDiscriminatorByGlobal(0), 2); + ASSERT_EQ(column->localDiscriminatorByGlobal(1), 0); + ASSERT_EQ(column->localDiscriminatorByGlobal(2), 1); + ASSERT_EQ(column->globalDiscriminatorByLocal(0), 1); + ASSERT_EQ(column->globalDiscriminatorByLocal(1), 2); + ASSERT_EQ(column->globalDiscriminatorByLocal(2), 0); + ASSERT_EQ(column->localDiscriminatorAt(0), 2); + ASSERT_EQ(column->localDiscriminatorAt(1), 0); + ASSERT_EQ(column->localDiscriminatorAt(2), ColumnVariant::NULL_DISCRIMINATOR); + ASSERT_EQ(column->localDiscriminatorAt(3), 0); + ASSERT_EQ(column->localDiscriminatorAt(4), ColumnVariant::NULL_DISCRIMINATOR); + ASSERT_EQ(column->globalDiscriminatorAt(0), 0); + ASSERT_EQ(column->globalDiscriminatorAt(1), 1); + ASSERT_EQ(column->globalDiscriminatorAt(2), ColumnVariant::NULL_DISCRIMINATOR); + ASSERT_EQ(column->globalDiscriminatorAt(3), 1); + ASSERT_EQ(column->globalDiscriminatorAt(4), ColumnVariant::NULL_DISCRIMINATOR); +} + +TEST(ColumnVariant, CreateFromDiscriminatorsAndColumns) +{ + auto columns = createColumns1(); + auto discriminators = createDiscriminators1(); + auto column = ColumnVariant::create(std::move(discriminators), std::move(columns)); + 
checkColumnVariant1(column.get()); +} + +TEST(ColumnVariant, CreateFromDiscriminatorsAndColumnsWithLocalOrder) +{ + auto local_to_global_order = createLocalToGlobalOrder1(); + auto columns = createColumns1(); + reorderColumns(local_to_global_order, columns); + auto discriminators = createDiscriminators1(); + reorderDiscriminators(local_to_global_order, discriminators); + auto column = ColumnVariant::create(std::move(discriminators), std::move(columns), createLocalToGlobalOrder1()); + checkColumnVariant1(column.get()); + checkColumnVariant1Order(column.get()); +} + +TEST(ColumnVariant, CreateFromDiscriminatorsOffsetsAndColumns) +{ + auto columns = createColumns1(); + auto discriminators = createDiscriminators1(); + auto offsets = createOffsets1(); + auto column = ColumnVariant::create(std::move(discriminators), std::move(offsets), std::move(columns)); + checkColumnVariant1(column.get()); +} + +TEST(ColumnVariant, CreateFromDiscriminatorsOffsetsAndColumnsWithLocalOrder) +{ + auto local_to_global_order = createLocalToGlobalOrder1(); + auto columns = createColumns1(); + reorderColumns(local_to_global_order, columns); + auto discriminators = createDiscriminators1(); + reorderDiscriminators(local_to_global_order, discriminators); + auto offsets = createOffsets1(); + auto column = ColumnVariant::create(std::move(discriminators), std::move(offsets), std::move(columns), createLocalToGlobalOrder1()); + checkColumnVariant1(column.get()); + checkColumnVariant1Order(column.get()); +} + +ColumnVariant::MutablePtr createVariantWithOneFullColumNoNulls(size_t size, bool change_order) +{ + MutableColumns columns; + auto column1 = ColumnUInt64::create(); + for (size_t i = 0; i != size; ++i) + column1->insertValue(i); + columns.push_back(std::move(column1)); + auto column2 = ColumnString::create(); + columns.push_back(std::move(column2)); + auto column3 = ColumnUInt32::create(); + columns.push_back(std::move(column3)); + auto discriminators_column = ColumnVariant::ColumnDiscriminators::create(); + for (size_t i = 0; i != size; ++i) + discriminators_column->insertValue(0); + if (change_order) + { + auto local_to_global_order = createLocalToGlobalOrder1(); + reorderColumns(local_to_global_order, columns); + reorderDiscriminators(local_to_global_order, discriminators_column); + return ColumnVariant::create(std::move(discriminators_column), std::move(columns), createLocalToGlobalOrder1()); + } + return ColumnVariant::create(std::move(discriminators_column), std::move(columns)); +} + +TEST(ColumnVariant, CreateFromDiscriminatorsAndOneFullColumnNoNulls) +{ + auto column = createVariantWithOneFullColumNoNulls(3, false); + const auto & offsets = column->getOffsets(); + ASSERT_EQ(column->size(), 3); + ASSERT_EQ(offsets[0], 0); + ASSERT_EQ(offsets[1], 1); + ASSERT_EQ(offsets[2], 2); + ASSERT_EQ((*column)[0].get(), 0); + ASSERT_EQ((*column)[1].get(), 1); + ASSERT_EQ((*column)[2].get(), 2); +} + +TEST(ColumnVariant, CreateFromDiscriminatorsAndOneFullColumnNoNullsWithLocalOrder) +{ + auto column = createVariantWithOneFullColumNoNulls(3, true); + const auto & offsets = column->getOffsets(); + ASSERT_EQ(column->size(), 3); + ASSERT_EQ(offsets[0], 0); + ASSERT_EQ(offsets[1], 1); + ASSERT_EQ(offsets[2], 2); + ASSERT_EQ((*column)[0].get(), 0); + ASSERT_EQ((*column)[1].get(), 1); + ASSERT_EQ((*column)[2].get(), 2); + ASSERT_EQ(column->localDiscriminatorAt(0), 2); + ASSERT_EQ(column->localDiscriminatorAt(1), 2); + ASSERT_EQ(column->localDiscriminatorAt(2), 2); + ASSERT_EQ(column->globalDiscriminatorAt(0), 0); + 
ASSERT_EQ(column->globalDiscriminatorAt(1), 0);
+ ASSERT_EQ(column->globalDiscriminatorAt(2), 0);
+}
+
+TEST(ColumnVariant, CloneResizedToEmpty)
+{
+ auto column = ColumnVariant::create(createDiscriminators1(), createOffsets1(), createColumns1());
+ auto resized_column = column->cloneResized(0);
+ ASSERT_TRUE(resized_column->empty());
+}
+
+TEST(ColumnVariant, CloneResizedToLarge)
+{
+ auto column = ColumnVariant::create(createDiscriminators1(), createOffsets1(), createColumns1());
+ auto resized_column = column->cloneResized(7);
+ const auto * resized_column_variant = assert_cast<const ColumnVariant *>(resized_column.get());
+ ASSERT_EQ(resized_column_variant->size(), 7);
+ const auto & offsets = resized_column_variant->getOffsets();
+ for (size_t i = 0; i != 7; ++i)
+ {
+ if (i == 3)
+ ASSERT_EQ(offsets[i], 1);
+ else
+ ASSERT_EQ(offsets[i], 0);
+ }
+
+ const auto & discriminators = resized_column_variant->getLocalDiscriminators();
+ std::vector<size_t> null_indexes = {2, 4, 5, 6};
+ for (size_t i : null_indexes)
+ ASSERT_EQ(discriminators[i], ColumnVariant::NULL_DISCRIMINATOR);
+ ASSERT_EQ(resized_column_variant->getVariantByLocalDiscriminator(0).size(), 1);
+ ASSERT_EQ(resized_column_variant->getVariantByLocalDiscriminator(1).size(), 2);
+}
+
+TEST(ColumnVariant, CloneResizedWithOneFullColumnNoNulls)
+{
+ auto column = createVariantWithOneFullColumNoNulls(5, false);
+ auto resized_column = column->cloneResized(3);
+ const auto * resized_column_variant = assert_cast<const ColumnVariant *>(resized_column.get());
+ ASSERT_EQ(resized_column_variant->size(), 3);
+ const auto & offsets = resized_column_variant->getOffsets();
+ for (size_t i = 0; i != 3; ++i)
+ ASSERT_EQ(offsets[i], i);
+ const auto & discriminators = resized_column_variant->getLocalDiscriminators();
+ for (size_t i = 0; i != 3; ++i)
+ ASSERT_EQ(discriminators[i], 0);
+ ASSERT_EQ(resized_column_variant->getVariantByLocalDiscriminator(0).size(), 3);
+ ASSERT_EQ(resized_column_variant->getVariantByLocalDiscriminator(1).size(), 0);
+ ASSERT_EQ(resized_column_variant->getVariantByLocalDiscriminator(2).size(), 0);
+}
+
+MutableColumns createColumns2()
+{
+ MutableColumns columns;
+ auto column1 = ColumnUInt64::create();
+ column1->insertValue(42);
+ column1->insertValue(43);
+ column1->insertValue(44);
+ columns.push_back(std::move(column1));
+ auto column2 = ColumnString::create();
+ column2->insertData("Hello", 5);
+ column2->insertData("World", 5);
+ columns.push_back(std::move(column2));
+ auto column3 = ColumnUInt8::create();
+ columns.push_back(std::move(column3));
+ return columns;
+}
+
+TEST(ColumnVariant, CloneResizedGeneral1)
+{
+ /// D      c1    c2      c3
+ /// 0      42    Hello
+ /// 1      43    World
+ /// NULL   44
+ /// 0
+ /// 1
+ /// NULL
+ /// 0
+ auto discriminators_column = ColumnVariant::ColumnDiscriminators::create();
+ discriminators_column->insertValue(0);
+ discriminators_column->insertValue(1);
+ discriminators_column->insertValue(ColumnVariant::NULL_DISCRIMINATOR);
+ discriminators_column->insertValue(0);
+ discriminators_column->insertValue(1);
+ discriminators_column->insertValue(ColumnVariant::NULL_DISCRIMINATOR);
+ discriminators_column->insertValue(0);
+ auto column = ColumnVariant::create(std::move(discriminators_column), createColumns2());
+ auto resized_column = column->cloneResized(4);
+ const auto * resized_column_variant = assert_cast<const ColumnVariant *>(resized_column.get());
+ ASSERT_EQ(resized_column_variant->size(), 4);
+ ASSERT_EQ(resized_column_variant->getVariantByLocalDiscriminator(0).size(), 2);
+ ASSERT_EQ(resized_column_variant->getVariantByLocalDiscriminator(1).size(), 1);
+ 
ASSERT_EQ(resized_column_variant->getVariantByLocalDiscriminator(2).size(), 0); + const auto & discriminators = resized_column_variant->getLocalDiscriminators(); + ASSERT_EQ(discriminators[0], 0); + ASSERT_EQ(discriminators[1], 1); + ASSERT_EQ(discriminators[2], ColumnVariant::NULL_DISCRIMINATOR); + ASSERT_EQ(discriminators[3], 0); + const auto & offsets = resized_column_variant->getOffsets(); + ASSERT_EQ(offsets[0], 0); + ASSERT_EQ(offsets[1], 0); + ASSERT_EQ(offsets[3], 1); + ASSERT_EQ((*resized_column_variant)[0].get(), 42); + ASSERT_EQ((*resized_column_variant)[1].get(), "Hello"); + ASSERT_EQ((*resized_column_variant)[3].get(), 43); +} + +TEST(ColumnVariant, CloneResizedGeneral2) +{ + /// D c1 c2 c3 + /// 0 42 Hello + /// NULL 43 World + /// NULL 44 + /// 0 + /// 1 + /// 1 + /// 0 + auto discriminators_column = ColumnVariant::ColumnDiscriminators::create(); + discriminators_column->insertValue(0); + discriminators_column->insertValue(ColumnVariant::NULL_DISCRIMINATOR); + discriminators_column->insertValue(ColumnVariant::NULL_DISCRIMINATOR); + discriminators_column->insertValue(0); + discriminators_column->insertValue(1); + discriminators_column->insertValue(1); + discriminators_column->insertValue(0); + auto column = ColumnVariant::create(std::move(discriminators_column), createColumns2()); + auto resized_column = column->cloneResized(3); + const auto * resized_column_variant = assert_cast(resized_column.get()); + ASSERT_EQ(resized_column_variant->size(), 3); + ASSERT_EQ(resized_column_variant->getVariantByLocalDiscriminator(0).size(), 1); + ASSERT_EQ(resized_column_variant->getVariantByLocalDiscriminator(1).size(), 0); + ASSERT_EQ(resized_column_variant->getVariantByLocalDiscriminator(2).size(), 0); + const auto & discriminators = resized_column_variant->getLocalDiscriminators(); + ASSERT_EQ(discriminators[0], 0); + ASSERT_EQ(discriminators[1], ColumnVariant::NULL_DISCRIMINATOR); + ASSERT_EQ(discriminators[2], ColumnVariant::NULL_DISCRIMINATOR); + const auto & offsets = resized_column_variant->getOffsets(); + ASSERT_EQ(offsets[0], 0); + ASSERT_EQ((*resized_column_variant)[0].get(), 42); +} + +TEST(ColumnVariant, CloneResizedGeneral3) +{ + /// D c1 c2 c3 + /// 0 42 Hello + /// 1 43 World + /// 1 44 + /// 0 + /// NULL + /// NULL + /// 0 + auto discriminators_column = ColumnVariant::ColumnDiscriminators::create(); + discriminators_column->insertValue(0); + discriminators_column->insertValue(1); + discriminators_column->insertValue(1); + discriminators_column->insertValue(0); + discriminators_column->insertValue(ColumnVariant::NULL_DISCRIMINATOR); + discriminators_column->insertValue(ColumnVariant::NULL_DISCRIMINATOR); + discriminators_column->insertValue(0); + auto column = ColumnVariant::create(std::move(discriminators_column), createColumns2()); + auto resized_column = column->cloneResized(5); + const auto * resized_column_variant = assert_cast(resized_column.get()); + ASSERT_EQ(resized_column_variant->size(), 5); + ASSERT_EQ(resized_column_variant->getVariantByLocalDiscriminator(0).size(), 2); + ASSERT_EQ(resized_column_variant->getVariantByLocalDiscriminator(1).size(), 2); + ASSERT_EQ(resized_column_variant->getVariantByLocalDiscriminator(2).size(), 0); + const auto & discriminators = resized_column_variant->getLocalDiscriminators(); + ASSERT_EQ(discriminators[0], 0); + ASSERT_EQ(discriminators[1], 1); + ASSERT_EQ(discriminators[2], 1); + ASSERT_EQ(discriminators[3], 0); + const auto & offsets = resized_column_variant->getOffsets(); + ASSERT_EQ(offsets[0], 0); + ASSERT_EQ(offsets[1], 
0); + ASSERT_EQ(offsets[2], 1); + ASSERT_EQ(offsets[3], 1); + ASSERT_EQ((*resized_column_variant)[0].get<UInt64>(), 42); + ASSERT_EQ((*resized_column_variant)[1].get<String>(), "Hello"); + ASSERT_EQ((*resized_column_variant)[2].get<String>(), "World"); + ASSERT_EQ((*resized_column_variant)[3].get<UInt64>(), 43); +} + +MutableColumnPtr createDiscriminators2() +{ + auto discriminators_column = ColumnVariant::ColumnDiscriminators::create(); + discriminators_column->insertValue(0); + discriminators_column->insertValue(1); + discriminators_column->insertValue(ColumnVariant::NULL_DISCRIMINATOR); + discriminators_column->insertValue(0); + discriminators_column->insertValue(1); + discriminators_column->insertValue(ColumnVariant::NULL_DISCRIMINATOR); + discriminators_column->insertValue(0); + return discriminators_column; +} + +std::vector<ColumnVariant::Discriminator> createLocalToGlobalOrder2() +{ + std::vector<ColumnVariant::Discriminator> local_to_global_discriminators; + local_to_global_discriminators.push_back(2); + local_to_global_discriminators.push_back(0); + local_to_global_discriminators.push_back(1); + return local_to_global_discriminators; +} + +ColumnVariant::MutablePtr createVariantColumn1(bool reorder) +{ + auto columns = createColumns1(); + auto discriminators = createDiscriminators1(); + if (!reorder) + return ColumnVariant::create(std::move(discriminators), std::move(columns)); + auto local_to_global_order = createLocalToGlobalOrder1(); + reorderColumns(local_to_global_order, columns); + reorderDiscriminators(local_to_global_order, discriminators); + return ColumnVariant::create(std::move(discriminators), std::move(columns), local_to_global_order); +} + +ColumnVariant::MutablePtr createVariantColumn2(bool reorder) +{ + auto columns = createColumns2(); + auto discriminators = createDiscriminators2(); + if (!reorder) + return ColumnVariant::create(std::move(discriminators), std::move(columns)); + auto local_to_global_order = createLocalToGlobalOrder2(); + reorderColumns(local_to_global_order, columns); + reorderDiscriminators(local_to_global_order, discriminators); + return ColumnVariant::create(std::move(discriminators), std::move(columns), local_to_global_order); +} + +TEST(ColumnVariant, InsertFrom) +{ + for (bool change_order : {false, true}) + { + auto column_to = createVariantColumn1(change_order); + auto column_from = createVariantColumn2(change_order); + column_to->insertFrom(*column_from, 3); + ASSERT_EQ(column_to->globalDiscriminatorAt(5), 0); + ASSERT_EQ((*column_to)[5].get<UInt64>(), 43); + } +} + +TEST(ColumnVariant, InsertRangeFromOneColumnNoNulls) +{ + for (bool change_order : {false, true}) + { + auto column_to = createVariantColumn2(change_order); + auto column_from = createVariantWithOneFullColumNoNulls(5, change_order); + column_to->insertRangeFrom(*column_from, 2, 2); + ASSERT_EQ(column_to->globalDiscriminatorAt(7), 0); + ASSERT_EQ(column_to->globalDiscriminatorAt(8), 0); + ASSERT_EQ((*column_to)[7].get<UInt64>(), 2); + ASSERT_EQ((*column_to)[8].get<UInt64>(), 3); + } +} + +TEST(ColumnVariant, InsertRangeFromGeneral) +{ + for (bool change_order : {false, true}) + { + auto column_to = createVariantColumn1(change_order); + auto column_from = createVariantColumn2(change_order); + column_to->insertRangeFrom(*column_from, 1, 4); + ASSERT_EQ(column_to->globalDiscriminatorAt(5), 1); + ASSERT_EQ(column_to->globalDiscriminatorAt(6), ColumnVariant::NULL_DISCRIMINATOR); + ASSERT_EQ(column_to->globalDiscriminatorAt(7), 0); + ASSERT_EQ(column_to->globalDiscriminatorAt(8), 1); + ASSERT_EQ((*column_to)[5].get<String>(), "Hello"); + ASSERT_EQ((*column_to)[7].get<UInt64>(), 43); +
ASSERT_EQ((*column_to)[8].get<String>(), "World"); + } +} + +TEST(ColumnVariant, InsertManyFrom) +{ + for (bool change_order : {false, true}) + { + auto column_to = createVariantColumn1(change_order); + auto column_from = createVariantColumn2(change_order); + column_to->insertManyFrom(*column_from, 3, 2); + ASSERT_EQ(column_to->globalDiscriminatorAt(5), 0); + ASSERT_EQ(column_to->globalDiscriminatorAt(6), 0); + ASSERT_EQ((*column_to)[5].get<UInt64>(), 43); + ASSERT_EQ((*column_to)[6].get<UInt64>(), 43); + } +} + +TEST(ColumnVariant, PopBackOneColumnNoNulls) +{ + auto column = createVariantWithOneFullColumNoNulls(5, false); + column->popBack(3); + ASSERT_EQ(column->size(), 2); + ASSERT_EQ(column->getVariantByLocalDiscriminator(0).size(), 2); + ASSERT_EQ((*column)[0].get<UInt64>(), 0); + ASSERT_EQ((*column)[1].get<UInt64>(), 1); +} + +TEST(ColumnVariant, PopBackGeneral) +{ + auto column = ColumnVariant::create(createDiscriminators2(), createColumns2()); + column->popBack(4); + ASSERT_EQ(column->size(), 3); + ASSERT_EQ(column->getVariantByLocalDiscriminator(0).size(), 1); + ASSERT_EQ(column->getVariantByLocalDiscriminator(1).size(), 1); + ASSERT_EQ((*column)[0].get<UInt64>(), 42); + ASSERT_EQ((*column)[1].get<String>(), "Hello"); + ASSERT_TRUE((*column)[2].isNull()); +} + +TEST(ColumnVariant, FilterOneColumnNoNulls) +{ + auto column = createVariantWithOneFullColumNoNulls(3, false); + IColumn::Filter filter; + filter.push_back(1); + filter.push_back(0); + filter.push_back(1); + auto filtered_column = column->filter(filter, -1); + ASSERT_EQ(filtered_column->size(), 2); + ASSERT_EQ((*filtered_column)[0].get<UInt64>(), 0); + ASSERT_EQ((*filtered_column)[1].get<UInt64>(), 2); +} + +TEST(ColumnVariant, FilterGeneral) +{ + auto column = ColumnVariant::create(createDiscriminators2(), createColumns2()); + IColumn::Filter filter; + filter.push_back(0); + filter.push_back(1); + filter.push_back(1); + filter.push_back(0); + filter.push_back(0); + filter.push_back(1); + filter.push_back(0); + auto filtered_column = column->filter(filter, -1); + ASSERT_EQ(filtered_column->size(), 3); + ASSERT_EQ((*filtered_column)[0].get<String>(), "Hello"); + ASSERT_TRUE((*filtered_column)[1].isNull()); + ASSERT_TRUE((*filtered_column)[2].isNull()); +} + +TEST(ColumnVariant, PermuteAndIndexOneColumnNoNulls) +{ + auto column = createVariantWithOneFullColumNoNulls(4, false); + IColumn::Permutation permutation; + permutation.push_back(1); + permutation.push_back(3); + permutation.push_back(2); + permutation.push_back(0); + auto permuted_column = column->permute(permutation, 3); + ASSERT_EQ(permuted_column->size(), 3); + ASSERT_EQ((*permuted_column)[0].get<UInt64>(), 1); + ASSERT_EQ((*permuted_column)[1].get<UInt64>(), 3); + ASSERT_EQ((*permuted_column)[2].get<UInt64>(), 2); + + auto index = ColumnUInt64::create(); + index->getData() = std::move(permutation); + auto indexed_column = column->index(*index, 3); + ASSERT_EQ(indexed_column->size(), 3); + ASSERT_EQ((*indexed_column)[0].get<UInt64>(), 1); + ASSERT_EQ((*indexed_column)[1].get<UInt64>(), 3); + ASSERT_EQ((*indexed_column)[2].get<UInt64>(), 2); +} + +TEST(ColumnVariant, PermuteGeneral) +{ + auto column = ColumnVariant::create(createDiscriminators2(), createColumns2()); + IColumn::Permutation permutation; + permutation.push_back(3); + permutation.push_back(4); + permutation.push_back(1); + permutation.push_back(5); + auto permuted_column = column->permute(permutation, 4); + ASSERT_EQ(permuted_column->size(), 4); + ASSERT_EQ((*permuted_column)[0].get<UInt64>(), 43); + ASSERT_EQ((*permuted_column)[1].get<String>(), "World"); + ASSERT_EQ((*permuted_column)[2].get<String>(), "Hello"); +
ASSERT_TRUE((*permuted_column)[3].isNull()); +} + +TEST(ColumnVariant, ReplicateOneColumnNoNull) +{ + auto column = createVariantWithOneFullColumNoNulls(3, false); + IColumn::Offsets offsets; + offsets.push_back(0); + offsets.push_back(3); + offsets.push_back(6); + auto replicated_column = column->replicate(offsets); + ASSERT_EQ(replicated_column->size(), 6); + ASSERT_EQ((*replicated_column)[0].get<UInt64>(), 1); + ASSERT_EQ((*replicated_column)[1].get<UInt64>(), 1); + ASSERT_EQ((*replicated_column)[2].get<UInt64>(), 1); + ASSERT_EQ((*replicated_column)[3].get<UInt64>(), 2); + ASSERT_EQ((*replicated_column)[4].get<UInt64>(), 2); + ASSERT_EQ((*replicated_column)[5].get<UInt64>(), 2); +} + +TEST(ColumnVariant, ReplicateGeneral) +{ + auto column = ColumnVariant::create(createDiscriminators1(), createColumns1()); + IColumn::Offsets offsets; + offsets.push_back(1); + offsets.push_back(3); + offsets.push_back(5); + offsets.push_back(5); + offsets.push_back(7); + auto replicated_column = column->replicate(offsets); + ASSERT_EQ(replicated_column->size(), 7); + ASSERT_EQ((*replicated_column)[0].get<UInt64>(), 42); + ASSERT_EQ((*replicated_column)[1].get<String>(), "Hello"); + ASSERT_EQ((*replicated_column)[2].get<String>(), "Hello"); + ASSERT_TRUE((*replicated_column)[3].isNull()); + ASSERT_TRUE((*replicated_column)[4].isNull()); + ASSERT_TRUE((*replicated_column)[5].isNull()); + ASSERT_TRUE((*replicated_column)[6].isNull()); +} + +TEST(ColumnVariant, ScatterOneColumnNoNulls) +{ + auto column = createVariantWithOneFullColumNoNulls(5, false); + IColumn::Selector selector; + selector.push_back(0); + selector.push_back(1); + selector.push_back(2); + selector.push_back(0); + selector.push_back(1); + auto columns = column->scatter(3, selector); + ASSERT_EQ(columns[0]->size(), 2); + ASSERT_EQ((*columns[0])[0].get<UInt64>(), 0); + ASSERT_EQ((*columns[0])[1].get<UInt64>(), 3); + ASSERT_EQ(columns[1]->size(), 2); + ASSERT_EQ((*columns[1])[0].get<UInt64>(), 1); + ASSERT_EQ((*columns[1])[1].get<UInt64>(), 4); + ASSERT_EQ(columns[2]->size(), 1); + ASSERT_EQ((*columns[2])[0].get<UInt64>(), 2); +} + +TEST(ColumnVariant, ScatterGeneral) +{ + auto column = ColumnVariant::create(createDiscriminators2(), createColumns2()); + IColumn::Selector selector; + selector.push_back(0); + selector.push_back(0); + selector.push_back(2); + selector.push_back(0); + selector.push_back(1); + selector.push_back(2); + selector.push_back(1); + auto columns = column->scatter(3, selector); + ASSERT_EQ(columns[0]->size(), 3); + ASSERT_EQ((*columns[0])[0].get<UInt64>(), 42); + ASSERT_EQ((*columns[0])[1].get<String>(), "Hello"); + ASSERT_EQ((*columns[0])[2].get<UInt64>(), 43); + ASSERT_EQ(columns[1]->size(), 2); + ASSERT_EQ((*columns[1])[0].get<String>(), "World"); + ASSERT_EQ((*columns[1])[1].get<UInt64>(), 44); + ASSERT_EQ(columns[2]->size(), 2); + ASSERT_TRUE((*columns[2])[0].isNull()); + ASSERT_TRUE((*columns[2])[1].isNull()); +} diff --git a/src/Core/Settings.h b/src/Core/Settings.h index 7e50a81ada8..0151dcb982d 100644 --- a/src/Core/Settings.h +++ b/src/Core/Settings.h @@ -816,6 +816,7 @@ class IColumn; M(Bool, function_json_value_return_type_allow_complex, false, "Allow function JSON_VALUE to return complex type, such as: struct, array, map.", 0) \ M(Bool, use_with_fill_by_sorting_prefix, true, "Columns preceding WITH FILL columns in ORDER BY clause form sorting prefix.
Rows with different values in sorting prefix are filled independently", 0) \ M(Bool, optimize_uniq_to_count, true, "Rewrite uniq and its variants(except uniqUpTo) to count if subquery has distinct or group by clause.", 0) \ + M(Bool, use_variant_when_no_common_type_in_if, false, "Use Variant as a result type for if/multiIf in case when there is no common type for arguments", 0) \ \ /** Experimental functions */ \ M(Bool, allow_experimental_materialized_postgresql_table, false, "Allows to use the MaterializedPostgreSQL table engine. Disabled by default, because this feature is experimental", 0) \ @@ -823,6 +824,7 @@ class IColumn; M(Bool, allow_experimental_nlp_functions, false, "Enable experimental functions for natural language processing.", 0) \ M(Bool, allow_experimental_hash_functions, false, "Enable experimental hash functions", 0) \ M(Bool, allow_experimental_object_type, false, "Allow Object and JSON data types", 0) \ + M(Bool, allow_experimental_variant_type, false, "Allow Variant data type", 0) \ M(Bool, allow_experimental_annoy_index, false, "Allows to use Annoy index. Disabled by default because this feature is experimental", 0) \ M(Bool, allow_experimental_usearch_index, false, "Allows to use USearch index. Disabled by default because this feature is experimental", 0) \ M(UInt64, max_limit_for_ann_queries, 1'000'000, "SELECT queries with LIMIT bigger than this setting cannot use ANN indexes. Helps to prevent memory overflows in ANN search indexes.", 0) \ diff --git a/src/Core/TypeId.h b/src/Core/TypeId.h index 9c634d2321c..7003e880cd5 100644 --- a/src/Core/TypeId.h +++ b/src/Core/TypeId.h @@ -49,6 +49,7 @@ enum class TypeIndex IPv4, IPv6, JSONPaths, + Variant, }; /** diff --git a/src/DataTypes/DataTypeFactory.cpp b/src/DataTypes/DataTypeFactory.cpp index 415f24d8151..d154b386ace 100644 --- a/src/DataTypes/DataTypeFactory.cpp +++ b/src/DataTypes/DataTypeFactory.cpp @@ -290,6 +290,7 @@ DataTypeFactory::DataTypeFactory() registerDataTypeDomainGeo(*this); registerDataTypeMap(*this); registerDataTypeObject(*this); + registerDataTypeVariant(*this); } DataTypeFactory & DataTypeFactory::instance() diff --git a/src/DataTypes/DataTypeFactory.h b/src/DataTypes/DataTypeFactory.h index ba7c1a3d7fe..a2aeb6f3646 100644 --- a/src/DataTypes/DataTypeFactory.h +++ b/src/DataTypes/DataTypeFactory.h @@ -100,5 +100,6 @@ void registerDataTypeDomainBool(DataTypeFactory & factory); void registerDataTypeDomainSimpleAggregateFunction(DataTypeFactory & factory); void registerDataTypeDomainGeo(DataTypeFactory & factory); void registerDataTypeObject(DataTypeFactory & factory); +void registerDataTypeVariant(DataTypeFactory & factory); } diff --git a/src/DataTypes/DataTypeNullable.cpp b/src/DataTypes/DataTypeNullable.cpp index 41a9a1de543..484d779551f 100644 --- a/src/DataTypes/DataTypeNullable.cpp +++ b/src/DataTypes/DataTypeNullable.cpp @@ -114,5 +114,33 @@ DataTypePtr makeNullableOrLowCardinalityNullable(const DataTypePtr & type) return std::make_shared<DataTypeNullable>(type); } +DataTypePtr makeNullableOrLowCardinalityNullableSafe(const DataTypePtr & type) +{ + if (isNullableOrLowCardinalityNullable(type)) + return type; + + if (type->lowCardinality()) + { + const auto & dictionary_type = assert_cast<const DataTypeLowCardinality &>(*type).getDictionaryType(); + return std::make_shared<DataTypeLowCardinality>(makeNullable(dictionary_type)); + } + + return makeNullableSafe(type); +} + +DataTypePtr removeNullableOrLowCardinalityNullable(const DataTypePtr & type) +{ + if (type->isNullable()) + return static_cast<const DataTypeNullable &>(*type).getNestedType(); + + if (type->isLowCardinalityNullable()) + {
+ auto dict_type = removeNullable(static_cast<const DataTypeLowCardinality &>(*type).getDictionaryType()); + return std::make_shared<DataTypeLowCardinality>(dict_type); + } + + return type; +} } diff --git a/src/DataTypes/DataTypeNullable.h b/src/DataTypes/DataTypeNullable.h index 06d46fb15ed..7ad0e1ba5f1 100644 --- a/src/DataTypes/DataTypeNullable.h +++ b/src/DataTypes/DataTypeNullable.h @@ -54,5 +54,8 @@ DataTypePtr makeNullable(const DataTypePtr & type); DataTypePtr makeNullableSafe(const DataTypePtr & type); DataTypePtr removeNullable(const DataTypePtr & type); DataTypePtr makeNullableOrLowCardinalityNullable(const DataTypePtr & type); +DataTypePtr makeNullableOrLowCardinalityNullableSafe(const DataTypePtr & type); +/// Nullable(T) -> T, LowCardinality(Nullable(T)) -> T +DataTypePtr removeNullableOrLowCardinalityNullable(const DataTypePtr & type); } diff --git a/src/DataTypes/DataTypeTuple.cpp b/src/DataTypes/DataTypeTuple.cpp index fd2e5e6a784..df9af203618 100644 --- a/src/DataTypes/DataTypeTuple.cpp +++ b/src/DataTypes/DataTypeTuple.cpp @@ -11,6 +11,7 @@ #include #include #include +#include #include #include #include @@ -172,11 +173,15 @@ MutableColumnPtr DataTypeTuple::createColumn() const MutableColumnPtr DataTypeTuple::createColumn(const ISerialization & serialization) const { + /// If we read Tuple as Variant subcolumn, it may be wrapped to SerializationVariantElement. + /// Here we don't need it, so we drop this wrapper. + const auto * current_serialization = &serialization; + while (const auto * serialization_variant_element = typeid_cast<const SerializationVariantElement *>(current_serialization)) + current_serialization = serialization_variant_element->getNested().get(); + /// If we read subcolumn of nested Tuple, it may be wrapped to SerializationNamed /// several times to allow to reconstruct the substream path name. /// Here we don't need substream path name, so we drop first several wrapper serializations. - - const auto * current_serialization = &serialization; while (const auto * serialization_named = typeid_cast<const SerializationNamed *>(current_serialization)) current_serialization = serialization_named->getNested().get(); diff --git a/src/DataTypes/DataTypeVariant.cpp b/src/DataTypes/DataTypeVariant.cpp new file mode 100644 index 00000000000..77e1c504cf8 --- /dev/null +++ b/src/DataTypes/DataTypeVariant.cpp @@ -0,0 +1,197 @@ +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +namespace DB +{ + +namespace ErrorCodes +{ + extern const int BAD_ARGUMENTS; + extern const int EMPTY_DATA_PASSED; +} + + +DataTypeVariant::DataTypeVariant(const DataTypes & variants_) +{ + /// Sort nested types by their full names and squash identical types. + std::map<String, DataTypePtr> name_to_type; + for (const auto & type : variants_) + { + /// Nullable(...), LowCardinality(Nullable(...)) and Variant(...) types are not allowed inside Variant type. + if (isNullableOrLowCardinalityNullable(type)) + throw Exception(ErrorCodes::BAD_ARGUMENTS, "Nullable/LowCardinality(Nullable) types are not allowed inside Variant type"); + if (type->getTypeId() == TypeIndex::Variant) + throw Exception(ErrorCodes::BAD_ARGUMENTS, "Nested Variant types are not allowed"); + /// Don't use Nothing type as a variant.
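+ /// (Nothing can appear here e.g. for a NULL literal; it carries no values, so such a variant is simply skipped.)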
+ if (!isNothing(type)) + name_to_type[type->getName()] = type; + } + + variants.reserve(name_to_type.size()); + for (const auto & [_, type] : name_to_type) + variants.push_back(type); + + if (variants.empty()) + throw Exception(ErrorCodes::EMPTY_DATA_PASSED, "Variant cannot be empty"); + + if (variants.size() > ColumnVariant::MAX_NESTED_COLUMNS) + throw Exception(ErrorCodes::BAD_ARGUMENTS, "Variant type with more than {} nested types is not allowed", ColumnVariant::MAX_NESTED_COLUMNS); +} + +std::string DataTypeVariant::doGetName() const +{ + size_t size = variants.size(); + WriteBufferFromOwnString s; + + s << "Variant("; + for (size_t i = 0; i < size; ++i) + { + if (i != 0) + s << ", "; + + s << variants[i]->getName(); + } + s << ")"; + + return s.str(); +} + +std::string DataTypeVariant::doGetPrettyName(size_t indent) const +{ + size_t size = variants.size(); + WriteBufferFromOwnString s; + s << "Variant(\n"; + + for (size_t i = 0; i != size; ++i) + { + if (i != 0) + s << ",\n"; + + s << fourSpaceIndent(indent + 1) << variants[i]->getPrettyName(indent + 1); + } + + s << '\n' << fourSpaceIndent(indent) << ')'; + return s.str(); +} + +MutableColumnPtr DataTypeVariant::createColumn() const +{ + size_t size = variants.size(); + MutableColumns nested_columns; + nested_columns.reserve(size); + for (size_t i = 0; i < size; ++i) + nested_columns.push_back(variants[i]->createColumn()); + + return ColumnVariant::create(std::move(nested_columns)); +} + + +Field DataTypeVariant::getDefault() const +{ + return Null(); +} + +bool DataTypeVariant::equals(const IDataType & rhs) const +{ + if (typeid(rhs) != typeid(*this)) + return false; + + const DataTypeVariant & rhs_variant = static_cast<const DataTypeVariant &>(rhs); + + size_t size = variants.size(); + if (size != rhs_variant.variants.size()) + return false; + + for (size_t i = 0; i < size; ++i) + if (!variants[i]->equals(*rhs_variant.variants[i])) + return false; + + return true; +} + +bool DataTypeVariant::textCanContainOnlyValidUTF8() const +{ + return std::all_of(variants.begin(), variants.end(), [](auto && elem) { return elem->textCanContainOnlyValidUTF8(); }); +} + +bool DataTypeVariant::haveMaximumSizeOfValue() const +{ + return std::all_of(variants.begin(), variants.end(), [](auto && elem) { return elem->haveMaximumSizeOfValue(); }); +} + +bool DataTypeVariant::hasDynamicSubcolumns() const +{ + return std::any_of(variants.begin(), variants.end(), [](auto && elem) { return elem->hasDynamicSubcolumns(); }); +} + +std::optional<ColumnVariant::Discriminator> DataTypeVariant::tryGetVariantDiscriminator(const DataTypePtr & type) const +{ + String type_name = type->getName(); + for (size_t i = 0; i != variants.size(); ++i) + { + /// We don't use equals here, because it doesn't respect custom type names.
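+ /// (e.g. Bool is UInt8 with a custom name: equals() would consider them identical, while their names differ).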
+ if (variants[i]->getName() == type_name) + return i; + } + + return std::nullopt; +} + +size_t DataTypeVariant::getMaximumSizeOfValueInMemory() const +{ + size_t max_size = 0; + for (const auto & elem : variants) + { + size_t elem_max_size = elem->getMaximumSizeOfValueInMemory(); + if (elem_max_size > max_size) + max_size = elem_max_size; + } + return max_size; +} + +SerializationPtr DataTypeVariant::doGetDefaultSerialization() const +{ + SerializationVariant::VariantSerializations serializations; + serializations.reserve(variants.size()); + Names variant_names; + variant_names.reserve(variants.size()); + + for (const auto & variant : variants) + { + serializations.push_back(variant->getDefaultSerialization()); + variant_names.push_back(variant->getName()); + } + + return std::make_shared<SerializationVariant>(std::move(serializations), std::move(variant_names), SerializationVariant::getVariantsDeserializeTextOrder(variants), getName()); +} + +static DataTypePtr create(const ASTPtr & arguments) +{ + if (!arguments || arguments->children.empty()) + throw Exception(ErrorCodes::EMPTY_DATA_PASSED, "Variant cannot be empty"); + + DataTypes nested_types; + nested_types.reserve(arguments->children.size()); + + for (const ASTPtr & child : arguments->children) + nested_types.emplace_back(DataTypeFactory::instance().get(child)); + + return std::make_shared<DataTypeVariant>(nested_types); +} + + +void registerDataTypeVariant(DataTypeFactory & factory) +{ + factory.registerDataType("Variant", create); +} + +} diff --git a/src/DataTypes/DataTypeVariant.h b/src/DataTypes/DataTypeVariant.h new file mode 100644 index 00000000000..60113a188b0 --- /dev/null +++ b/src/DataTypes/DataTypeVariant.h @@ -0,0 +1,64 @@ +#pragma once + +#include +#include +#include + + +namespace DB +{ + +/** Variant data type. + * This type represents a union of other data types. + * For example, type Variant(T1, T2, ..., TN) means that each row of this type + * has a value of either type T1 or T2 or ... or TN or none of them (NULL value). + * Nullable(...), LowCardinality(Nullable(...)) and Variant(...) types are not allowed + * inside Variant type. + * The order of nested types doesn't matter: Variant(T1, T2) = Variant(T2, T1). + * To have global order of nested types we sort variants by type names on Variant creation. + * The index of a variant in a sorted list is called global variant discriminator.
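+ * For example, in Variant(UInt64, String) the variants are sorted by name, so String gets global discriminator 0 and UInt64 gets 1; + * rows that hold no value use a separate NULL_DISCRIMINATOR.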
+ */ +class DataTypeVariant final : public IDataType +{ +private: + DataTypes variants; + +public: + static constexpr bool is_parametric = true; + + explicit DataTypeVariant(const DataTypes & variants_); + + TypeIndex getTypeId() const override { return TypeIndex::Variant; } + const char * getFamilyName() const override { return "Variant"; } + + bool canBeInsideNullable() const override { return false; } + bool supportsSparseSerialization() const override { return false; } + bool canBeInsideSparseColumns() const override { return false; } + + MutableColumnPtr createColumn() const override; + + Field getDefault() const override; + + bool equals(const IDataType & rhs) const override; + + bool isParametric() const override { return true; } + bool haveSubtypes() const override { return true; } + bool textCanContainOnlyValidUTF8() const override; + bool haveMaximumSizeOfValue() const override; + bool hasDynamicSubcolumns() const override; + size_t getMaximumSizeOfValueInMemory() const override; + + const DataTypePtr & getVariant(size_t i) const { return variants[i]; } + const DataTypes & getVariants() const { return variants; } + + /// Check if Variant has provided type in the list of variants and return its discriminator. + std::optional<ColumnVariant::Discriminator> tryGetVariantDiscriminator(const DataTypePtr & type) const; + +private: + std::string doGetName() const override; + std::string doGetPrettyName(size_t indent) const override; + SerializationPtr doGetDefaultSerialization() const override; +}; + +} + diff --git a/src/DataTypes/EnumValues.cpp b/src/DataTypes/EnumValues.cpp index 9df49e765a7..8a4b1304d5e 100644 --- a/src/DataTypes/EnumValues.cpp +++ b/src/DataTypes/EnumValues.cpp @@ -74,6 +74,27 @@ T EnumValues<T>::getValue(StringRef field_name, bool try_treat_as_id) const return it->getMapped(); } +template <typename T> +bool EnumValues<T>::tryGetValue(T & x, StringRef field_name, bool try_treat_as_id) const +{ + const auto it = name_to_value_map.find(field_name); + if (!it) + { + /// It is used in CSV and TSV input formats. If we fail to find given string in + /// enum names, we will try to treat it as enum id.
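+ /// (e.g. for Enum8('a' = 1) the input "1" is not a known name, but it parses as the valid id 1).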
+ if (try_treat_as_id) + { + ReadBufferFromMemory tmp_buf(field_name.data, field_name.size); + if (!tryReadText(x, tmp_buf) || !tmp_buf.eof() || !value_to_name_map.contains(x)) + return false; + return true; + } + return false; + } + x = it->getMapped(); + return true; +} + template <typename T> Names EnumValues<T>::getAllRegisteredNames() const { diff --git a/src/DataTypes/EnumValues.h b/src/DataTypes/EnumValues.h index 5189f7a56f5..889878bc60f 100644 --- a/src/DataTypes/EnumValues.h +++ b/src/DataTypes/EnumValues.h @@ -7,7 +7,7 @@ namespace DB { namespace ErrorCodes { extern const int BAD_ARGUMENTS; } @@ -42,6 +42,11 @@ public: return it; } + bool hasValue(const T & value) const + { + return value_to_name_map.contains(value); + } + /// throws exception if value is not valid const StringRef & getNameForValue(const T & value) const { @@ -60,6 +65,7 @@ public: } T getValue(StringRef field_name, bool try_treat_as_id = false) const; + bool tryGetValue(T & x, StringRef field_name, bool try_treat_as_id = false) const; template <typename TValues> bool containsAll(const TValues & rhs_values) const diff --git a/src/DataTypes/IDataType.cpp b/src/DataTypes/IDataType.cpp index 2a7e0f246de..392c56343e3 100644 --- a/src/DataTypes/IDataType.cpp +++ b/src/DataTypes/IDataType.cpp @@ -109,11 +109,26 @@ Ptr IDataType::getForSubcolumn( bool throw_if_null) const { Ptr res; - forEachSubcolumn([&](const auto &, const auto & name, const auto & subdata) + + ISerialization::StreamCallback callback_with_data = [&](const auto & subpath) { - if (name == subcolumn_name) - res = subdata.*member; - }, data); + for (size_t i = 0; i < subpath.size(); ++i) + { + size_t prefix_len = i + 1; + if (!subpath[i].visited && ISerialization::hasSubcolumnForPath(subpath, prefix_len)) + { + auto name = ISerialization::getSubcolumnNameForStream(subpath, prefix_len); + /// Create data from path only if it's requested subcolumn.
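+ /// (other paths are only marked as visited below, no SubstreamData is built for them).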
+ if (name == subcolumn_name) + res = ISerialization::createFromPath(subpath, prefix_len).*member; + } + subpath[i].visited = true; + } + }; + + ISerialization::EnumerateStreamsSettings settings; + settings.position_independent_encoding = false; + data.serialization->enumerateStreams(settings, callback_with_data, data); if (!res && throw_if_null) throw Exception(ErrorCodes::ILLEGAL_COLUMN, "There is no subcolumn {} in type {}", subcolumn_name, getName()); diff --git a/src/DataTypes/IDataType.h b/src/DataTypes/IDataType.h index eabf066bc3d..ccdf54f57c3 100644 --- a/src/DataTypes/IDataType.h +++ b/src/DataTypes/IDataType.h @@ -412,6 +412,8 @@ struct WhichDataType constexpr bool isSimple() const { return isInt() || isUInt() || isFloat() || isString(); } constexpr bool isLowCardinality() const { return idx == TypeIndex::LowCardinality; } + + constexpr bool isVariant() const { return idx == TypeIndex::Variant; } }; /// IDataType helpers (alternative for IDataType virtual methods with single point of truth) @@ -464,6 +466,7 @@ template inline bool isTuple(const T & data_type) { return WhichDat template inline bool isMap(const T & data_type) {return WhichDataType(data_type).isMap(); } template inline bool isInterval(const T & data_type) {return WhichDataType(data_type).isInterval(); } template inline bool isObject(const T & data_type) { return WhichDataType(data_type).isObject(); } +template inline bool isVariant(const T & data_type) { return WhichDataType(data_type).isVariant(); } template inline bool isNothing(const T & data_type) { return WhichDataType(data_type).isNothing(); } diff --git a/src/DataTypes/Serializations/ISerialization.cpp b/src/DataTypes/Serializations/ISerialization.cpp index e70dc6a2380..86a37949dc8 100644 --- a/src/DataTypes/Serializations/ISerialization.cpp +++ b/src/DataTypes/Serializations/ISerialization.cpp @@ -55,6 +55,9 @@ String ISerialization::Substream::toString() const return fmt::format("TupleElement({}, escape_tuple_delimiter = {})", tuple_element_name, escape_tuple_delimiter ? "true" : "false"); + if (type == VariantElement) + return fmt::format("VariantElement({})", variant_element_name); + return String(magic_enum::enum_name(type)); } @@ -172,6 +175,10 @@ String getNameForSubstreamPath( else stream_name += "." + it->tuple_element_name; } + else if (it->type == Substream::VariantDiscriminators) + stream_name += ".discr"; + else if (it->type == Substream::VariantElement) + stream_name += "." + it->variant_element_name; } return stream_name; @@ -252,6 +259,45 @@ bool ISerialization::isSpecialCompressionAllowed(const SubstreamPath & path) return true; } +#define TRY_DESERIALIZE_TEXT(deserialize) \ + size_t prev_size = column.size(); \ + try \ + { \ + deserialize(column, istr, settings); \ + return true; \ + } \ + catch (...) 
\ + { \ + if (column.size() > prev_size) \ + column.popBack(column.size() - prev_size); \ + return false; \ + } \ + +bool ISerialization::tryDeserializeTextCSV(DB::IColumn & column, DB::ReadBuffer & istr, const DB::FormatSettings & settings) const +{ + TRY_DESERIALIZE_TEXT(deserializeTextCSV) +} + +bool ISerialization::tryDeserializeTextEscaped(DB::IColumn & column, DB::ReadBuffer & istr, const DB::FormatSettings & settings) const +{ + TRY_DESERIALIZE_TEXT(deserializeTextEscaped) +} + +bool ISerialization::tryDeserializeTextJSON(DB::IColumn & column, DB::ReadBuffer & istr, const DB::FormatSettings & settings) const +{ + TRY_DESERIALIZE_TEXT(deserializeTextJSON) +} + +bool ISerialization::tryDeserializeTextQuoted(DB::IColumn & column, DB::ReadBuffer & istr, const DB::FormatSettings & settings) const +{ + TRY_DESERIALIZE_TEXT(deserializeTextQuoted) +} + +bool ISerialization::tryDeserializeWholeText(DB::IColumn & column, DB::ReadBuffer & istr, const DB::FormatSettings & settings) const +{ + TRY_DESERIALIZE_TEXT(deserializeWholeText) +} + void ISerialization::deserializeTextRaw(IColumn & column, ReadBuffer & istr, const FormatSettings & settings) const { String field; @@ -261,6 +307,15 @@ void ISerialization::deserializeTextRaw(IColumn & column, ReadBuffer & istr, con deserializeWholeText(column, buf, settings); } +bool ISerialization::tryDeserializeTextRaw(IColumn & column, ReadBuffer & istr, const FormatSettings & settings) const +{ + String field; + /// Read until \t or \n. + readString(field, istr); + ReadBufferFromString buf(field); + return tryDeserializeWholeText(column, buf, settings); +} + void ISerialization::serializeTextMarkdown( const DB::IColumn & column, size_t row_num, DB::WriteBuffer & ostr, const DB::FormatSettings & settings) const { @@ -288,7 +343,9 @@ bool ISerialization::hasSubcolumnForPath(const SubstreamPath & path, size_t pref size_t last_elem = prefix_len - 1; return path[last_elem].type == Substream::NullMap || path[last_elem].type == Substream::TupleElement - || path[last_elem].type == Substream::ArraySizes; + || path[last_elem].type == Substream::ArraySizes + || path[last_elem].type == Substream::VariantDiscriminators + || path[last_elem].type == Substream::VariantElement; } ISerialization::SubstreamData ISerialization::createFromPath(const SubstreamPath & path, size_t prefix_len) @@ -317,6 +374,8 @@ void ISerialization::throwUnexpectedDataAfterParsedValue(IColumn & column, ReadB { WriteBufferFromOwnString ostr; serializeText(column, column.size() - 1, ostr, settings); + /// Restore correct column size. + column.popBack(1); throw Exception( ErrorCodes::UNEXPECTED_DATA_AFTER_PARSED_VALUE, "Unexpected data '{}' after parsed {} value '{}'", diff --git a/src/DataTypes/Serializations/ISerialization.h b/src/DataTypes/Serializations/ISerialization.h index 030c3c6d81e..f0273f59d1f 100644 --- a/src/DataTypes/Serializations/ISerialization.h +++ b/src/DataTypes/Serializations/ISerialization.h @@ -152,6 +152,10 @@ public: ObjectStructure, ObjectData, + VariantDiscriminators, + VariantElements, + VariantElement, + Regular, }; @@ -160,6 +164,9 @@ public: /// Index of tuple element, starting at 1 or name. String tuple_element_name; + /// The name of a variant element type. + String variant_element_name; + /// Do we need to escape a dot in filenames for tuple elements. 
bool escape_tuple_delimiter = true; @@ -320,17 +327,20 @@ public: virtual void serializeTextEscaped(const IColumn & column, size_t row_num, WriteBuffer & ostr, const FormatSettings &) const = 0; virtual void deserializeTextEscaped(IColumn & column, ReadBuffer & istr, const FormatSettings &) const = 0; + virtual bool tryDeserializeTextEscaped(IColumn & column, ReadBuffer & istr, const FormatSettings &) const; /** Text serialization as a literal that may be inserted into a query. */ virtual void serializeTextQuoted(const IColumn & column, size_t row_num, WriteBuffer & ostr, const FormatSettings &) const = 0; virtual void deserializeTextQuoted(IColumn & column, ReadBuffer & istr, const FormatSettings &) const = 0; + virtual bool tryDeserializeTextQuoted(IColumn & column, ReadBuffer & istr, const FormatSettings &) const; /** Text serialization for the CSV format. */ virtual void serializeTextCSV(const IColumn & column, size_t row_num, WriteBuffer & ostr, const FormatSettings &) const = 0; virtual void deserializeTextCSV(IColumn & column, ReadBuffer & istr, const FormatSettings &) const = 0; + virtual bool tryDeserializeTextCSV(IColumn & column, ReadBuffer & istr, const FormatSettings &) const; /** Text serialization for displaying on a terminal or saving into a text file, and the like. * Without escaping or quoting. @@ -340,11 +350,13 @@ public: /** Text deserialization in case when buffer contains only one value, without any escaping and delimiters. */ virtual void deserializeWholeText(IColumn & column, ReadBuffer & istr, const FormatSettings &) const = 0; + virtual bool tryDeserializeWholeText(IColumn & column, ReadBuffer & istr, const FormatSettings &) const; /** Text serialization intended for using in JSON format. */ virtual void serializeTextJSON(const IColumn & column, size_t row_num, WriteBuffer & ostr, const FormatSettings &) const = 0; virtual void deserializeTextJSON(IColumn & column, ReadBuffer & istr, const FormatSettings &) const = 0; + virtual bool tryDeserializeTextJSON(IColumn & column, ReadBuffer & istr, const FormatSettings &) const; virtual void serializeTextJSONPretty(const IColumn & column, size_t row_num, WriteBuffer & ostr, const FormatSettings & settings, size_t /*indent*/) const { serializeTextJSON(column, row_num, ostr, settings); @@ -364,6 +376,7 @@ public: * additional code in data types serialization and ReadHelpers. 
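+ * The tryDeserializeTextRaw counterpart added below returns false instead of throwing and leaves the column unchanged on failure.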
*/ virtual void deserializeTextRaw(IColumn & column, ReadBuffer & istr, const FormatSettings & settings) const; + virtual bool tryDeserializeTextRaw(IColumn & column, ReadBuffer & istr, const FormatSettings & settings) const; virtual void serializeTextRaw(const IColumn & column, size_t row_num, WriteBuffer & ostr, const FormatSettings & settings) const; virtual void serializeTextMarkdown(const IColumn & column, size_t row_num, WriteBuffer & ostr, const FormatSettings & settings) const; diff --git a/src/DataTypes/Serializations/SerializationArray.cpp b/src/DataTypes/Serializations/SerializationArray.cpp index c804f58c567..be23278ef25 100644 --- a/src/DataTypes/Serializations/SerializationArray.cpp +++ b/src/DataTypes/Serializations/SerializationArray.cpp @@ -417,9 +417,11 @@ static void serializeTextImpl(const IColumn & column, size_t row_num, WriteBuffe } -template <typename Reader> -static void deserializeTextImpl(IColumn & column, ReadBuffer & istr, Reader && read_nested, bool allow_unenclosed) +template <typename ReturnType = void, typename Reader> +static ReturnType deserializeTextImpl(IColumn & column, ReadBuffer & istr, Reader && read_nested, bool allow_unenclosed) { + static constexpr bool throw_exception = std::is_same_v<ReturnType, void>; + ColumnArray & column_array = assert_cast<ColumnArray &>(column); ColumnArray::Offsets & offsets = column_array.getOffsets(); @@ -431,7 +433,18 @@ static void deserializeTextImpl(IColumn & column, ReadBuffer & istr, Reader && r if (checkChar('[', istr)) has_braces = true; else if (!allow_unenclosed) - throw Exception(ErrorCodes::CANNOT_READ_ARRAY_FROM_TEXT, "Array does not start with '[' character"); + { + if constexpr (throw_exception) + throw Exception(ErrorCodes::CANNOT_READ_ARRAY_FROM_TEXT, "Array does not start with '[' character"); + return ReturnType(false); + } + + auto on_error_no_throw = [&]() + { + if (size) + nested_column.popBack(size); + return ReturnType(false); + }; try { @@ -441,11 +454,17 @@ static void deserializeTextImpl(IColumn & column, ReadBuffer & istr, Reader && r if (!first) { if (*istr.position() == ',') + { ++istr.position(); + } else - throw ParsingException(ErrorCodes::CANNOT_READ_ARRAY_FROM_TEXT, - "Cannot read array from text, expected comma or end of array, found '{}'", - *istr.position()); + { + if constexpr (throw_exception) + throw ParsingException(ErrorCodes::CANNOT_READ_ARRAY_FROM_TEXT, + "Cannot read array from text, expected comma or end of array, found '{}'", + *istr.position()); + return on_error_no_throw(); + } } first = false; @@ -455,25 +474,42 @@ static void deserializeTextImpl(IColumn & column, ReadBuffer & istr, Reader && r if (*istr.position() == ']') break; - read_nested(nested_column); + if constexpr (throw_exception) + read_nested(nested_column); + else if (!read_nested(nested_column)) + return on_error_no_throw(); + ++size; skipWhitespaceIfAny(istr); } if (has_braces) - assertChar(']', istr); + { + if constexpr (throw_exception) + assertChar(']', istr); + else if (!checkChar(']', istr)) + return on_error_no_throw(); + } else /// If array is not enclosed in braces, we read until EOF. - assertEOF(istr); + { + if constexpr (throw_exception) + assertEOF(istr); + else if (!istr.eof()) + return on_error_no_throw(); + } } catch (...)
{ if (size) nested_column.popBack(size); - throw; + if constexpr (throw_exception) + throw; + return ReturnType(false); } offsets.push_back(offsets.back() + size); + return ReturnType(true); } @@ -492,8 +528,8 @@ void SerializationArray::deserializeText(IColumn & column, ReadBuffer & istr, co deserializeTextImpl(column, istr, [&](IColumn & nested_column) { - if (settings.null_as_default) - SerializationNullable::deserializeTextQuotedImpl(nested_column, istr, settings, nested); + if (settings.null_as_default && !isColumnNullableOrLowCardinalityNullable(nested_column)) + SerializationNullable::deserializeNullAsDefaultOrNestedTextQuoted(nested_column, istr, settings, nested); else nested->deserializeTextQuoted(nested_column, istr, settings); }, false); @@ -502,6 +538,29 @@ void SerializationArray::deserializeText(IColumn & column, ReadBuffer & istr, co throwUnexpectedDataAfterParsedValue(column, istr, settings, "Array"); } +bool SerializationArray::tryDeserializeText(IColumn & column, ReadBuffer & istr, const FormatSettings & settings, bool whole) const +{ + auto read_nested = [&](IColumn & nested_column) + { + if (settings.null_as_default && !isColumnNullableOrLowCardinalityNullable(nested_column)) + return SerializationNullable::tryDeserializeNullAsDefaultOrNestedTextQuoted(nested_column, istr, settings, nested); + return nested->tryDeserializeTextQuoted(nested_column, istr, settings); + }; + + bool ok = deserializeTextImpl<bool>(column, istr, std::move(read_nested), false); + + if (!ok) + return false; + + if (whole && !istr.eof()) + { + column.popBack(1); + return false; + } + + return true; +} + void SerializationArray::serializeTextJSON(const IColumn & column, size_t row_num, WriteBuffer & ostr, const FormatSettings & settings) const { const ColumnArray & column_array = assert_cast<const ColumnArray &>(column); @@ -557,13 +616,25 @@ void SerializationArray::deserializeTextJSON(IColumn & column, ReadBuffer & istr deserializeTextImpl(column, istr, [&](IColumn & nested_column) { - if (settings.null_as_default) - SerializationNullable::deserializeTextJSONImpl(nested_column, istr, settings, nested); + if (settings.null_as_default && !isColumnNullableOrLowCardinalityNullable(nested_column)) + SerializationNullable::deserializeNullAsDefaultOrNestedTextJSON(nested_column, istr, settings, nested); else nested->deserializeTextJSON(nested_column, istr, settings); }, false); } +bool SerializationArray::tryDeserializeTextJSON(IColumn & column, ReadBuffer & istr, const FormatSettings & settings) const +{ + auto read_nested = [&](IColumn & nested_column) + { + if (settings.null_as_default && !isColumnNullableOrLowCardinalityNullable(nested_column)) + return SerializationNullable::tryDeserializeNullAsDefaultOrNestedTextJSON(nested_column, istr, settings, nested); + return nested->tryDeserializeTextJSON(nested_column, istr, settings); + }; + + return deserializeTextImpl<bool>(column, istr, std::move(read_nested), false); +} + void SerializationArray::serializeTextXML(const IColumn & column, size_t row_num, WriteBuffer & ostr, const FormatSettings & settings) const { @@ -606,8 +677,8 @@ void SerializationArray::deserializeTextCSV(IColumn & column, ReadBuffer & istr, deserializeTextImpl(column, rb, [&](IColumn & nested_column) { - if (settings.null_as_default) - SerializationNullable::deserializeTextCSVImpl(nested_column, rb, settings, nested);
else nested->deserializeTextCSV(nested_column, rb, settings); }, true); @@ -617,12 +688,43 @@ void SerializationArray::deserializeTextCSV(IColumn & column, ReadBuffer & istr, deserializeTextImpl(column, rb, [&](IColumn & nested_column) { - if (settings.null_as_default) - SerializationNullable::deserializeTextQuotedImpl(nested_column, rb, settings, nested); + if (settings.null_as_default && !isColumnNullableOrLowCardinalityNullable(nested_column)) + SerializationNullable::deserializeNullAsDefaultOrNestedTextQuoted(nested_column, rb, settings, nested); else nested->deserializeTextQuoted(nested_column, rb, settings); }, true); } } +bool SerializationArray::tryDeserializeTextCSV(IColumn & column, ReadBuffer & istr, const FormatSettings & settings) const +{ + String s; + if (!tryReadCSV(s, istr, settings.csv)) + return false; + ReadBufferFromString rb(s); + + if (settings.csv.arrays_as_nested_csv) + { + auto read_nested = [&](IColumn & nested_column) + { + if (settings.null_as_default && !isColumnNullableOrLowCardinalityNullable(nested_column)) + return SerializationNullable::tryDeserializeNullAsDefaultOrNestedTextCSV(nested_column, rb, settings, nested); + return nested->tryDeserializeTextCSV(nested_column, rb, settings); + }; + + return deserializeTextImpl<bool>(column, rb, read_nested, true); + } + else + { + auto read_nested = [&](IColumn & nested_column) + { + if (settings.null_as_default && !isColumnNullableOrLowCardinalityNullable(nested_column)) + return SerializationNullable::tryDeserializeNullAsDefaultOrNestedTextQuoted(nested_column, rb, settings, nested); + return nested->tryDeserializeTextQuoted(nested_column, rb, settings); + }; + + return deserializeTextImpl<bool>(column, rb, read_nested, true); + } +} + } diff --git a/src/DataTypes/Serializations/SerializationArray.h b/src/DataTypes/Serializations/SerializationArray.h index de331169db5..82f5e8bce45 100644 --- a/src/DataTypes/Serializations/SerializationArray.h +++ b/src/DataTypes/Serializations/SerializationArray.h @@ -20,15 +20,18 @@ public: void serializeText(const IColumn & column, size_t row_num, WriteBuffer & ostr, const FormatSettings &) const override; void deserializeText(IColumn & column, ReadBuffer & istr, const FormatSettings &, bool whole) const override; + bool tryDeserializeText(IColumn & column, ReadBuffer & istr, const FormatSettings &, bool whole) const override; void serializeTextJSON(const IColumn & column, size_t row_num, WriteBuffer & ostr, const FormatSettings &) const override; void deserializeTextJSON(IColumn & column, ReadBuffer & istr, const FormatSettings &) const override; + bool tryDeserializeTextJSON(IColumn & column, ReadBuffer & istr, const FormatSettings &) const override; void serializeTextJSONPretty(const IColumn & column, size_t row_num, WriteBuffer & ostr, const FormatSettings & settings, size_t indent) const override; void serializeTextXML(const IColumn & column, size_t row_num, WriteBuffer & ostr, const FormatSettings &) const override; void serializeTextCSV(const IColumn & column, size_t row_num, WriteBuffer & ostr, const FormatSettings &) const override; void deserializeTextCSV(IColumn & column, ReadBuffer & istr, const FormatSettings &) const override; + bool tryDeserializeTextCSV(IColumn & column, ReadBuffer & istr, const FormatSettings &) const override; /** Streaming serialization of arrays is arranged in a special way: * - elements placed in a row are written/read without array sizes; diff --git a/src/DataTypes/Serializations/SerializationBool.cpp
b/src/DataTypes/Serializations/SerializationBool.cpp index 41b5bf806e5..f745fac4d30 100644 --- a/src/DataTypes/Serializations/SerializationBool.cpp +++ b/src/DataTypes/Serializations/SerializationBool.cpp @@ -150,30 +150,42 @@ bool tryDeserializeAllVariants(ColumnUInt8 * column, ReadBuffer & istr) return true; } -void deserializeImpl( +template <typename ReturnType = void> +ReturnType deserializeImpl( IColumn & column, ReadBuffer & istr, const FormatSettings & settings, std::function<bool(ReadBuffer &)> check_end_of_value) { + static constexpr bool throw_exception = std::is_same_v<ReturnType, void>; + ColumnUInt8 * col = checkAndGetDeserializeColumnType(column); + auto restore_column_if_needed = [&, prev_size = col->size()]() + { + if (col->size() > prev_size) + col->popBack(1); + }; PeekableReadBuffer buf(istr); buf.setCheckpoint(); if (checkString(settings.bool_true_representation, buf) && check_end_of_value(buf)) { col->insert(true); - return; + return ReturnType(true); } buf.rollbackToCheckpoint(); if (checkString(settings.bool_false_representation, buf) && check_end_of_value(buf)) { - col->insert(false); buf.dropCheckpoint(); if (buf.hasUnreadData()) - throw Exception( - ErrorCodes::CANNOT_PARSE_BOOL, - "Cannot continue parsing after parsed bool value because it will result in the loss of some data. It may happen if " - "bool_true_representation or bool_false_representation contains some delimiters of input format"); - return; + { + if constexpr (throw_exception) + throw Exception( + ErrorCodes::CANNOT_PARSE_BOOL, + "Cannot continue parsing after parsed bool value because it will result in the loss of some data. It may happen if " + "bool_true_representation or bool_false_representation contains some delimiters of input format"); + return ReturnType(false); + } + col->insert(false); + return ReturnType(true); } buf.rollbackToCheckpoint(); @@ -181,22 +193,31 @@ void deserializeImpl( { buf.dropCheckpoint(); if (buf.hasUnreadData()) - throw Exception( - ErrorCodes::CANNOT_PARSE_BOOL, - "Cannot continue parsing after parsed bool value because it will result in the loss of some data. It may happen if " - "bool_true_representation or bool_false_representation contains some delimiters of input format"); - return; + { + if constexpr (throw_exception) + throw Exception( + ErrorCodes::CANNOT_PARSE_BOOL, + "Cannot continue parsing after parsed bool value because it will result in the loss of some data.
It may happen if " + "bool_true_representation or bool_false_representation contains some delimiters of input format"); + restore_column_if_needed(); + return ReturnType(false); + } + return ReturnType(true); } buf.makeContinuousMemoryFromCheckpointToPos(); buf.rollbackToCheckpoint(); - throw Exception( - ErrorCodes::CANNOT_PARSE_BOOL, - "Cannot parse boolean value here: '{}', should be '{}' or '{}' controlled by setting bool_true_representation and " - "bool_false_representation or one of " - "True/False/T/F/Y/N/Yes/No/On/Off/Enable/Disable/Enabled/Disabled/1/0", - String(buf.position(), std::min(10lu, buf.available())), - settings.bool_true_representation, settings.bool_false_representation); + restore_column_if_needed(); + if constexpr (throw_exception) + throw Exception( + ErrorCodes::CANNOT_PARSE_BOOL, + "Cannot parse boolean value here: '{}', should be '{}' or '{}' controlled by setting bool_true_representation and " + "bool_false_representation or one of " + "True/False/T/F/Y/N/Yes/No/On/Off/Enable/Disable/Enabled/Disabled/1/0", + String(buf.position(), std::min(10lu, buf.available())), + settings.bool_true_representation, settings.bool_false_representation); + + return ReturnType(false); } } @@ -225,6 +246,14 @@ void SerializationBool::deserializeTextEscaped(IColumn & column, ReadBuffer & is deserializeImpl(column, istr, settings, [](ReadBuffer & buf){ return buf.eof() || *buf.position() == '\t' || *buf.position() == '\n'; }); } +bool SerializationBool::tryDeserializeTextEscaped(IColumn & column, ReadBuffer & istr, const FormatSettings & settings) const +{ + if (istr.eof()) + return false; + + return deserializeImpl<bool>(column, istr, settings, [](ReadBuffer & buf){ return buf.eof() || *buf.position() == '\t' || *buf.position() == '\n'; }); +} + void SerializationBool::serializeTextJSON(const IColumn &column, size_t row_num, WriteBuffer &ostr, const FormatSettings &settings) const { serializeSimple(column, row_num, ostr, settings); @@ -250,6 +279,33 @@ void SerializationBool::deserializeTextJSON(IColumn &column, ReadBuffer &istr, c col->insert(value); } +bool SerializationBool::tryDeserializeTextJSON(DB::IColumn & column, DB::ReadBuffer & istr, const DB::FormatSettings &) const +{ + if (istr.eof()) + return false; + + ColumnUInt8 * col = checkAndGetDeserializeColumnType(column); + bool value = false; + char first_char = *istr.position(); + if (first_char == 't' || first_char == 'f') + { + if (!readBoolTextWord<bool>(value, istr)) + return false; + } + else if (first_char == '1' || first_char == '0') + { + /// Doesn't throw.
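+ /// ('0' or '1' was checked above, so reading a single digit here always succeeds).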
+ readBoolText(value, istr); + } + else + { + return false; + } + + col->insert(value); + return true; +} + void SerializationBool::serializeTextCSV(const IColumn & column, size_t row_num, WriteBuffer & ostr, const FormatSettings & settings) const { serializeCustom(column, row_num, ostr, settings); @@ -263,6 +319,14 @@ void SerializationBool::deserializeTextCSV(IColumn & column, ReadBuffer & istr, deserializeImpl(column, istr, settings, [&](ReadBuffer & buf){ return buf.eof() || *buf.position() == settings.csv.delimiter || *buf.position() == '\n' || *buf.position() == '\r'; }); } +bool SerializationBool::tryDeserializeTextCSV(IColumn & column, ReadBuffer & istr, const FormatSettings & settings) const +{ + if (istr.eof()) + return false; + + return deserializeImpl<bool>(column, istr, settings, [&](ReadBuffer & buf){ return buf.eof() || *buf.position() == settings.csv.delimiter || *buf.position() == '\n' || *buf.position() == '\r'; }); +} + void SerializationBool::serializeTextRaw(const IColumn & column, size_t row_num, WriteBuffer & ostr, const FormatSettings & settings) const { serializeCustom(column, row_num, ostr, settings); @@ -276,15 +340,30 @@ void SerializationBool::deserializeTextRaw(IColumn & column, ReadBuffer & istr, deserializeImpl(column, istr, settings, [&](ReadBuffer & buf){ return buf.eof() || *buf.position() == '\t' || *buf.position() == '\n'; }); } +bool SerializationBool::tryDeserializeTextRaw(IColumn & column, ReadBuffer & istr, const FormatSettings & settings) const +{ + if (istr.eof()) + return false; + + return deserializeImpl<bool>(column, istr, settings, [&](ReadBuffer & buf){ return buf.eof() || *buf.position() == '\t' || *buf.position() == '\n'; }); +} + void SerializationBool::serializeTextQuoted(const IColumn & column, size_t row_num, WriteBuffer & ostr, const FormatSettings & settings) const { serializeSimple(column, row_num, ostr, settings); } -void SerializationBool::deserializeTextQuoted(IColumn & column, ReadBuffer & istr, const FormatSettings & settings) const +template <typename ReturnType = void> +ReturnType deserializeTextQuotedImpl(IColumn & column, ReadBuffer & istr, const FormatSettings & settings) { + static constexpr bool throw_exception = std::is_same_v<ReturnType, void>; + if (istr.eof()) - throw Exception(ErrorCodes::CANNOT_PARSE_BOOL, "Expected boolean value but get EOF."); + { + if constexpr (throw_exception) + throw Exception(ErrorCodes::CANNOT_PARSE_BOOL, "Expected boolean value but got EOF."); + return ReturnType(false); + } auto * col = checkAndGetDeserializeColumnType(column); @@ -292,11 +371,17 @@ void SerializationBool::deserializeTextQuoted(IColumn & column, ReadBuffer & ist switch (symbol) { case 't': - assertStringCaseInsensitive("true", istr); + if constexpr (throw_exception) + assertStringCaseInsensitive("true", istr); + else if (!checkStringCaseInsensitive("true", istr)) + return ReturnType(false); col->insert(true); break; case 'f': - assertStringCaseInsensitive("false", istr); + if constexpr (throw_exception) + assertStringCaseInsensitive("false", istr); + else if (!checkStringCaseInsensitive("false", istr)) + return ReturnType(false); col->insert(false); break; case '1': @@ -307,16 +392,40 @@ void SerializationBool::deserializeTextQuoted(IColumn & column, ReadBuffer & ist break; case '\'': ++istr.position(); - deserializeImpl(column, istr, settings, [](ReadBuffer & buf){ return !buf.eof() && *buf.position() ==
'\''; }); - assertChar('\'', istr); + if constexpr (throw_exception) + { + deserializeImpl(column, istr, settings, [](ReadBuffer & buf){ return !buf.eof() && *buf.position() == '\''; }); + assertChar('\'', istr); + } + else + { + if (!deserializeImpl<bool>(column, istr, settings, [](ReadBuffer & buf) { return !buf.eof() && *buf.position() == '\''; }) || !checkChar('\'', istr)) + return ReturnType(false); + } break; default: - throw Exception( - ErrorCodes::CANNOT_PARSE_BOOL, - "Cannot parse boolean value here: '{}', should be true/false, 1/0 or on of " - "True/False/T/F/Y/N/Yes/No/On/Off/Enable/Disable/Enabled/Disabled/1/0 in quotes", - String(istr.position(), std::min(10ul, istr.available()))); + { + if constexpr (throw_exception) + throw Exception( + ErrorCodes::CANNOT_PARSE_BOOL, + "Cannot parse boolean value here: '{}', should be true/false, 1/0 or one of " + "True/False/T/F/Y/N/Yes/No/On/Off/Enable/Disable/Enabled/Disabled/1/0 in quotes", + String(istr.position(), std::min(10ul, istr.available()))); + return ReturnType(false); + } } + + return ReturnType(true); +} + +void SerializationBool::deserializeTextQuoted(IColumn & column, ReadBuffer & istr, const FormatSettings & settings) const +{ + deserializeTextQuotedImpl(column, istr, settings); +} + +bool SerializationBool::tryDeserializeTextQuoted(IColumn & column, ReadBuffer & istr, const FormatSettings & settings) const +{ + return deserializeTextQuotedImpl<bool>(column, istr, settings); } void SerializationBool::deserializeWholeText(IColumn & column, ReadBuffer & istr, const FormatSettings & settings) const @@ -327,6 +436,14 @@ deserializeImpl(column, istr, settings, [&](ReadBuffer & buf){ return buf.eof(); }); } +bool SerializationBool::tryDeserializeWholeText(IColumn & column, ReadBuffer & istr, const FormatSettings & settings) const +{ + if (istr.eof()) + return false; + + return deserializeImpl<bool>(column, istr, settings, [&](ReadBuffer & buf){ return buf.eof(); }); +} + void SerializationBool::serializeTextXML(const IColumn & column, size_t row_num, WriteBuffer & ostr, const FormatSettings & settings) const { serializeSimple(column, row_num, ostr, settings); diff --git a/src/DataTypes/Serializations/SerializationBool.h b/src/DataTypes/Serializations/SerializationBool.h index a5aa0ca80a2..3e511b7249e 100644 --- a/src/DataTypes/Serializations/SerializationBool.h +++ b/src/DataTypes/Serializations/SerializationBool.h @@ -15,21 +15,27 @@ public: void serializeText(const IColumn & column, size_t row_num, WriteBuffer & ostr, const FormatSettings &) const override; void serializeTextEscaped(const IColumn & column, size_t row_num, WriteBuffer & ostr, const FormatSettings & settings) const override; - void deserializeTextEscaped(IColumn & column, ReadBuffer & istr, const FormatSettings & settings) const override; + void deserializeTextEscaped(IColumn & column, ReadBuffer & istr, const FormatSettings & settings) const override; + bool tryDeserializeTextEscaped(IColumn & column, ReadBuffer & istr, const FormatSettings & settings) const override; void serializeTextJSON(const IColumn & column, size_t row_num, WriteBuffer & ostr, const FormatSettings &) const override; void deserializeTextJSON(IColumn & column, ReadBuffer & istr, const FormatSettings &) const override; + bool tryDeserializeTextJSON(IColumn & column, ReadBuffer & istr, const FormatSettings &) const override; void serializeTextCSV(const IColumn & column, size_t row_num, WriteBuffer & ostr, const FormatSettings & settings) const override; void deserializeTextCSV(IColumn & column, ReadBuffer & istr, const FormatSettings & settings) const override; + bool tryDeserializeTextCSV(IColumn &
column, ReadBuffer & istr, const FormatSettings & settings) const override; void serializeTextRaw(const IColumn & column, size_t row_num, WriteBuffer & ostr, const FormatSettings & settings) const override; void deserializeTextRaw(IColumn & column, ReadBuffer & istr, const FormatSettings & settings) const override; + bool tryDeserializeTextRaw(IColumn & column, ReadBuffer & istr, const FormatSettings & settings) const override; void serializeTextQuoted(const IColumn & column, size_t row_num, WriteBuffer & ostr, const FormatSettings &) const override; void deserializeTextQuoted(IColumn & column, ReadBuffer & istr, const FormatSettings &) const override; + bool tryDeserializeTextQuoted(IColumn & column, ReadBuffer & istr, const FormatSettings &) const override; void deserializeWholeText(IColumn & column, ReadBuffer & istr, const FormatSettings & settings) const override; + bool tryDeserializeWholeText(IColumn & column, ReadBuffer & istr, const FormatSettings & settings) const override; void serializeTextXML(const IColumn & column, size_t row_num, WriteBuffer & ostr, const FormatSettings & settings) const override; }; diff --git a/src/DataTypes/Serializations/SerializationCustomSimpleText.cpp b/src/DataTypes/Serializations/SerializationCustomSimpleText.cpp index 03564bac64b..abe443cab1b 100644 --- a/src/DataTypes/Serializations/SerializationCustomSimpleText.cpp +++ b/src/DataTypes/Serializations/SerializationCustomSimpleText.cpp @@ -24,6 +24,12 @@ void deserializeFromString(const SerializationCustomSimpleText & domain, IColumn domain.deserializeText(column, istr, settings, true); } +bool tryDeserializeFromString(const SerializationCustomSimpleText & domain, IColumn & column, const String & s, const FormatSettings & settings) +{ + ReadBufferFromString istr(s); + return domain.tryDeserializeText(column, istr, settings, true); +} + } namespace DB @@ -34,6 +40,19 @@ SerializationCustomSimpleText::SerializationCustomSimpleText(const Serialization { } +bool SerializationCustomSimpleText::tryDeserializeText(DB::IColumn & column, DB::ReadBuffer & istr, const DB::FormatSettings & settings, bool whole) const +{ + try + { + deserializeText(column, istr, settings, whole); + return true; + } + catch (...) 
+ { + return false; + } +} + void SerializationCustomSimpleText::deserializeWholeText(IColumn & column, ReadBuffer & istr, const FormatSettings & settings) const { String str; @@ -41,6 +60,13 @@ void SerializationCustomSimpleText::deserializeWholeText(IColumn & column, ReadB deserializeFromString(*this, column, str, settings); } +bool SerializationCustomSimpleText::tryDeserializeWholeText(IColumn & column, ReadBuffer & istr, const FormatSettings & settings) const +{ + String str; + readStringUntilEOF(str, istr); + return tryDeserializeFromString(*this, column, str, settings); +} + void SerializationCustomSimpleText::serializeTextEscaped(const IColumn & column, size_t row_num, WriteBuffer & ostr, const FormatSettings & settings) const { writeEscapedString(serializeToString(*this, column, row_num, settings), ostr); @@ -53,6 +79,13 @@ void SerializationCustomSimpleText::deserializeTextEscaped(IColumn & column, Rea deserializeFromString(*this, column, str, settings); } +bool SerializationCustomSimpleText::tryDeserializeTextEscaped(IColumn & column, ReadBuffer & istr, const FormatSettings & settings) const +{ + String str; + readEscapedString(str, istr); + return tryDeserializeFromString(*this, column, str, settings); +} + void SerializationCustomSimpleText::serializeTextQuoted(const IColumn & column, size_t row_num, WriteBuffer & ostr, const FormatSettings & settings) const { writeQuotedString(serializeToString(*this, column, row_num, settings), ostr); @@ -65,6 +98,14 @@ void SerializationCustomSimpleText::deserializeTextQuoted(IColumn & column, Read deserializeFromString(*this, column, str, settings); } +bool SerializationCustomSimpleText::tryDeserializeTextQuoted(IColumn & column, ReadBuffer & istr, const FormatSettings & settings) const +{ + String str; + if (!tryReadQuotedString(str, istr)) + return false; + return tryDeserializeFromString(*this, column, str, settings); +} + void SerializationCustomSimpleText::serializeTextCSV(const IColumn & column, size_t row_num, WriteBuffer & ostr, const FormatSettings & settings) const { writeCSVString(serializeToString(*this, column, row_num, settings), ostr); @@ -77,6 +118,13 @@ void SerializationCustomSimpleText::deserializeTextCSV(IColumn & column, ReadBuf deserializeFromString(*this, column, str, settings); } +bool SerializationCustomSimpleText::tryDeserializeTextCSV(IColumn & column, ReadBuffer & istr, const FormatSettings & settings) const +{ + String str; + readCSVStringInto(str, istr, settings.csv); + return tryDeserializeFromString(*this, column, str, settings); +} + void SerializationCustomSimpleText::serializeTextJSON(const IColumn & column, size_t row_num, WriteBuffer & ostr, const FormatSettings & settings) const { writeJSONString(serializeToString(*this, column, row_num, settings), ostr, settings); @@ -89,6 +137,14 @@ void SerializationCustomSimpleText::deserializeTextJSON(IColumn & column, ReadBu deserializeFromString(*this, column, str, settings); } +bool SerializationCustomSimpleText::tryDeserializeTextJSON(IColumn & column, ReadBuffer & istr, const FormatSettings & settings) const +{ + String str; + if (!tryReadJSONStringInto(str, istr)) + return false; + return tryDeserializeFromString(*this, column, str, settings); +} + void SerializationCustomSimpleText::serializeTextXML(const IColumn & column, size_t row_num, WriteBuffer & ostr, const FormatSettings & settings) const { writeXMLStringForTextElement(serializeToString(*this, column, row_num, settings), ostr); diff --git a/src/DataTypes/Serializations/SerializationCustomSimpleText.h 
b/src/DataTypes/Serializations/SerializationCustomSimpleText.h index 0c909350002..c80a57e234c 100644 --- a/src/DataTypes/Serializations/SerializationCustomSimpleText.h +++ b/src/DataTypes/Serializations/SerializationCustomSimpleText.h @@ -22,20 +22,24 @@ public: /// whole = true means that buffer contains only one value, so we should read until EOF. /// It's needed to check if there is garbage after parsed field. virtual void deserializeText(IColumn & column, ReadBuffer & istr, const FormatSettings &, bool whole) const = 0; + virtual bool tryDeserializeText(IColumn & column, ReadBuffer & istr, const FormatSettings & settings, bool whole) const; /** Text deserialization in case when buffer contains only one value, without any escaping and delimiters. */ void deserializeWholeText(IColumn & column, ReadBuffer & istr, const FormatSettings &) const override; + bool tryDeserializeWholeText(IColumn & column, ReadBuffer & istr, const FormatSettings &) const override; /** Text serialization with escaping but without quoting. */ void serializeTextEscaped(const IColumn & column, size_t row_num, WriteBuffer & ostr, const FormatSettings &) const override; void deserializeTextEscaped(IColumn & column, ReadBuffer & istr, const FormatSettings &) const override; + bool tryDeserializeTextEscaped(IColumn & column, ReadBuffer & istr, const FormatSettings &) const override; /** Text serialization as a literal that may be inserted into a query. */ void serializeTextQuoted(const IColumn & column, size_t row_num, WriteBuffer & ostr, const FormatSettings &) const override; void deserializeTextQuoted(IColumn & column, ReadBuffer & istr, const FormatSettings &) const override; + bool tryDeserializeTextQuoted(IColumn & column, ReadBuffer & istr, const FormatSettings &) const override; /** Text serialization for the CSV format. */ @@ -44,12 +48,14 @@ public: * (the delimiter is not consumed). */ void deserializeTextCSV(IColumn & column, ReadBuffer & istr, const FormatSettings &) const override; + bool tryDeserializeTextCSV(IColumn & column, ReadBuffer & istr, const FormatSettings &) const override; /** Text serialization intended for using in JSON format. * force_quoting_64bit_integers parameter forces to brace UInt64 and Int64 types into quotes. */ void serializeTextJSON(const IColumn & column, size_t row_num, WriteBuffer & ostr, const FormatSettings &) const override; void deserializeTextJSON(IColumn & column, ReadBuffer & istr, const FormatSettings &) const override; + bool tryDeserializeTextJSON(IColumn & column, ReadBuffer & istr, const FormatSettings &) const override; /** Text serialization for putting into the XML format. 
*/ diff --git a/src/DataTypes/Serializations/SerializationDate.cpp b/src/DataTypes/Serializations/SerializationDate.cpp index 534f599a072..38e1bb87b6d 100644 --- a/src/DataTypes/Serializations/SerializationDate.cpp +++ b/src/DataTypes/Serializations/SerializationDate.cpp @@ -22,6 +22,15 @@ void SerializationDate::deserializeWholeText(IColumn & column, ReadBuffer & istr throwUnexpectedDataAfterParsedValue(column, istr, settings, "Date"); } +bool SerializationDate::tryDeserializeWholeText(IColumn & column, ReadBuffer & istr, const FormatSettings &) const +{ + DayNum x; + if (!tryReadDateText(x, istr, time_zone) || !istr.eof()) + return false; + assert_cast(column).getData().push_back(x); + return true; +} + void SerializationDate::deserializeTextEscaped(IColumn & column, ReadBuffer & istr, const FormatSettings &) const { DayNum x; @@ -29,6 +38,15 @@ void SerializationDate::deserializeTextEscaped(IColumn & column, ReadBuffer & is assert_cast(column).getData().push_back(x); } +bool SerializationDate::tryDeserializeTextEscaped(IColumn & column, ReadBuffer & istr, const FormatSettings &) const +{ + DayNum x; + if (!tryReadDateText(x, istr, time_zone)) + return false; + assert_cast(column).getData().push_back(x); + return true; +} + void SerializationDate::serializeTextEscaped(const IColumn & column, size_t row_num, WriteBuffer & ostr, const FormatSettings & settings) const { serializeText(column, row_num, ostr, settings); @@ -50,6 +68,16 @@ void SerializationDate::deserializeTextQuoted(IColumn & column, ReadBuffer & ist assert_cast(column).getData().push_back(x); /// It's important to do this at the end - for exception safety. } +bool SerializationDate::tryDeserializeTextQuoted(IColumn & column, ReadBuffer & istr, const FormatSettings &) const +{ + DayNum x; + if (!checkChar('\'', istr) || !tryReadDateText(x, istr, time_zone) || !checkChar('\'', istr)) + return false; + + assert_cast(column).getData().push_back(x); + return true; +} + void SerializationDate::serializeTextJSON(const IColumn & column, size_t row_num, WriteBuffer & ostr, const FormatSettings & settings) const { writeChar('"', ostr); @@ -66,6 +94,15 @@ void SerializationDate::deserializeTextJSON(IColumn & column, ReadBuffer & istr, assert_cast(column).getData().push_back(x); } +bool SerializationDate::tryDeserializeTextJSON(IColumn & column, ReadBuffer & istr, const FormatSettings &) const +{ + DayNum x; + if (!checkChar('"', istr) || !tryReadDateText(x, istr, time_zone) || !checkChar('"', istr)) + return false; + assert_cast(column).getData().push_back(x); + return true; +} + void SerializationDate::serializeTextCSV(const IColumn & column, size_t row_num, WriteBuffer & ostr, const FormatSettings & settings) const { writeChar('"', ostr); @@ -80,6 +117,15 @@ void SerializationDate::deserializeTextCSV(IColumn & column, ReadBuffer & istr, assert_cast(column).getData().push_back(value); } +bool SerializationDate::tryDeserializeTextCSV(IColumn & column, ReadBuffer & istr, const FormatSettings &) const +{ + DayNum value; + if (!tryReadCSV(value, istr, time_zone)) + return false; + assert_cast(column).getData().push_back(value); + return true; +} + SerializationDate::SerializationDate(const DateLUTImpl & time_zone_) : time_zone(time_zone_) { } diff --git a/src/DataTypes/Serializations/SerializationDate.h b/src/DataTypes/Serializations/SerializationDate.h index f751b06fba6..dcf79eb49da 100644 --- a/src/DataTypes/Serializations/SerializationDate.h +++ b/src/DataTypes/Serializations/SerializationDate.h @@ -13,14 +13,19 @@ public: void 
serializeText(const IColumn & column, size_t row_num, WriteBuffer & ostr, const FormatSettings &) const override; void deserializeWholeText(IColumn & column, ReadBuffer & istr, const FormatSettings &) const override; + bool tryDeserializeWholeText(IColumn & column, ReadBuffer & istr, const FormatSettings &) const override; void serializeTextEscaped(const IColumn & column, size_t row_num, WriteBuffer & ostr, const FormatSettings &) const override; void deserializeTextEscaped(IColumn & column, ReadBuffer & istr, const FormatSettings &) const override; + bool tryDeserializeTextEscaped(IColumn & column, ReadBuffer & istr, const FormatSettings &) const override; void serializeTextQuoted(const IColumn & column, size_t row_num, WriteBuffer & ostr, const FormatSettings &) const override; void deserializeTextQuoted(IColumn & column, ReadBuffer & istr, const FormatSettings &) const override; + bool tryDeserializeTextQuoted(IColumn & column, ReadBuffer & istr, const FormatSettings &) const override; void serializeTextJSON(const IColumn & column, size_t row_num, WriteBuffer & ostr, const FormatSettings &) const override; void deserializeTextJSON(IColumn & column, ReadBuffer & istr, const FormatSettings &) const override; + bool tryDeserializeTextJSON(IColumn & column, ReadBuffer & istr, const FormatSettings &) const override; void serializeTextCSV(const IColumn & column, size_t row_num, WriteBuffer & ostr, const FormatSettings &) const override; void deserializeTextCSV(IColumn & column, ReadBuffer & istr, const FormatSettings & settings) const override; + bool tryDeserializeTextCSV(IColumn & column, ReadBuffer & istr, const FormatSettings & settings) const override; protected: const DateLUTImpl & time_zone; diff --git a/src/DataTypes/Serializations/SerializationDate32.cpp b/src/DataTypes/Serializations/SerializationDate32.cpp index 851710de839..70a22d59e42 100644 --- a/src/DataTypes/Serializations/SerializationDate32.cpp +++ b/src/DataTypes/Serializations/SerializationDate32.cpp @@ -21,6 +21,15 @@ void SerializationDate32::deserializeWholeText(IColumn & column, ReadBuffer & is throwUnexpectedDataAfterParsedValue(column, istr, settings, "Date32"); } +bool SerializationDate32::tryDeserializeWholeText(IColumn & column, ReadBuffer & istr, const FormatSettings &) const +{ + ExtendedDayNum x; + if (!tryReadDateText(x, istr, time_zone) || !istr.eof()) + return false; + assert_cast(column).getData().push_back(x); + return true; +} + void SerializationDate32::deserializeTextEscaped(IColumn & column, ReadBuffer & istr, const FormatSettings &) const { ExtendedDayNum x; @@ -28,6 +37,15 @@ void SerializationDate32::deserializeTextEscaped(IColumn & column, ReadBuffer & assert_cast(column).getData().push_back(x); } +bool SerializationDate32::tryDeserializeTextEscaped(IColumn & column, ReadBuffer & istr, const FormatSettings &) const +{ + ExtendedDayNum x; + if (!tryReadDateText(x, istr, time_zone)) + return false; + assert_cast(column).getData().push_back(x); + return true; +} + void SerializationDate32::serializeTextEscaped(const IColumn & column, size_t row_num, WriteBuffer & ostr, const FormatSettings & settings) const { serializeText(column, row_num, ostr, settings); @@ -49,6 +67,15 @@ void SerializationDate32::deserializeTextQuoted(IColumn & column, ReadBuffer & i assert_cast(column).getData().push_back(x); /// It's important to do this at the end - for exception safety. 
} +bool SerializationDate32::tryDeserializeTextQuoted(IColumn & column, ReadBuffer & istr, const FormatSettings &) const +{ + ExtendedDayNum x; + if (!checkChar('\'', istr) || !tryReadDateText(x, istr, time_zone) || !checkChar('\'', istr)) + return false; + assert_cast(column).getData().push_back(x); /// It's important to do this at the end - for exception safety. + return true; +} + void SerializationDate32::serializeTextJSON(const IColumn & column, size_t row_num, WriteBuffer & ostr, const FormatSettings & settings) const { writeChar('"', ostr); @@ -65,6 +92,15 @@ void SerializationDate32::deserializeTextJSON(IColumn & column, ReadBuffer & ist assert_cast(column).getData().push_back(x); } +bool SerializationDate32::tryDeserializeTextJSON(IColumn & column, ReadBuffer & istr, const FormatSettings &) const +{ + ExtendedDayNum x; + if (!checkChar('"', istr) || !tryReadDateText(x, istr, time_zone) || !checkChar('"', istr)) + return false; + assert_cast(column).getData().push_back(x); + return true; +} + void SerializationDate32::serializeTextCSV(const IColumn & column, size_t row_num, WriteBuffer & ostr, const FormatSettings & settings) const { writeChar('"', ostr); @@ -79,6 +115,15 @@ void SerializationDate32::deserializeTextCSV(IColumn & column, ReadBuffer & istr assert_cast(column).getData().push_back(value.getExtenedDayNum()); } +bool SerializationDate32::tryDeserializeTextCSV(IColumn & column, ReadBuffer & istr, const FormatSettings &) const +{ + LocalDate value; + if (!tryReadCSV(value, istr)) + return false; + assert_cast(column).getData().push_back(value.getExtenedDayNum()); + return true; +} + SerializationDate32::SerializationDate32(const DateLUTImpl & time_zone_) : time_zone(time_zone_) { } diff --git a/src/DataTypes/Serializations/SerializationDate32.h b/src/DataTypes/Serializations/SerializationDate32.h index 49560fb6c7d..be2e2b76c1d 100644 --- a/src/DataTypes/Serializations/SerializationDate32.h +++ b/src/DataTypes/Serializations/SerializationDate32.h @@ -12,14 +12,19 @@ public: void serializeText(const IColumn & column, size_t row_num, WriteBuffer & ostr, const FormatSettings &) const override; void deserializeWholeText(IColumn & column, ReadBuffer & istr, const FormatSettings &) const override; + bool tryDeserializeWholeText(IColumn & column, ReadBuffer & istr, const FormatSettings &) const override; void serializeTextEscaped(const IColumn & column, size_t row_num, WriteBuffer & ostr, const FormatSettings &) const override; void deserializeTextEscaped(IColumn & column, ReadBuffer & istr, const FormatSettings &) const override; + bool tryDeserializeTextEscaped(IColumn & column, ReadBuffer & istr, const FormatSettings &) const override; void serializeTextQuoted(const IColumn & column, size_t row_num, WriteBuffer & ostr, const FormatSettings &) const override; void deserializeTextQuoted(IColumn & column, ReadBuffer & istr, const FormatSettings &) const override; + bool tryDeserializeTextQuoted(IColumn & column, ReadBuffer & istr, const FormatSettings &) const override; void serializeTextJSON(const IColumn & column, size_t row_num, WriteBuffer & ostr, const FormatSettings &) const override; void deserializeTextJSON(IColumn & column, ReadBuffer & istr, const FormatSettings &) const override; + bool tryDeserializeTextJSON(IColumn & column, ReadBuffer & istr, const FormatSettings &) const override; void serializeTextCSV(const IColumn & column, size_t row_num, WriteBuffer & ostr, const FormatSettings &) const override; void deserializeTextCSV(IColumn & column, ReadBuffer & istr, const 
FormatSettings & settings) const override; + bool tryDeserializeTextCSV(IColumn & column, ReadBuffer & istr, const FormatSettings & settings) const override; protected: const DateLUTImpl & time_zone; diff --git a/src/DataTypes/Serializations/SerializationDateTime.cpp b/src/DataTypes/Serializations/SerializationDateTime.cpp index 77beb0d9b75..17465d85e9d 100644 --- a/src/DataTypes/Serializations/SerializationDateTime.cpp +++ b/src/DataTypes/Serializations/SerializationDateTime.cpp @@ -21,15 +21,56 @@ inline void readText(time_t & x, ReadBuffer & istr, const FormatSettings & setti switch (settings.date_time_input_format) { case FormatSettings::DateTimeInputFormat::Basic: - readDateTimeText(x, istr, time_zone); - return; + readDateTimeTextImpl<>(x, istr, time_zone); + break; case FormatSettings::DateTimeInputFormat::BestEffort: parseDateTimeBestEffort(x, istr, time_zone, utc_time_zone); - return; + break; case FormatSettings::DateTimeInputFormat::BestEffortUS: parseDateTimeBestEffortUS(x, istr, time_zone, utc_time_zone); - return; + break; } + + if (x < 0) + x = 0; +} + +inline void readAsIntText(time_t & x, ReadBuffer & istr) +{ + readIntText(x, istr); + if (x < 0) + x = 0; +} + +inline bool tryReadText(time_t & x, ReadBuffer & istr, const FormatSettings & settings, const DateLUTImpl & time_zone, const DateLUTImpl & utc_time_zone) +{ + bool res; + switch (settings.date_time_input_format) + { + case FormatSettings::DateTimeInputFormat::Basic: + res = tryReadDateTimeText(x, istr, time_zone); + break; + case FormatSettings::DateTimeInputFormat::BestEffort: + res = tryParseDateTimeBestEffort(x, istr, time_zone, utc_time_zone); + break; + case FormatSettings::DateTimeInputFormat::BestEffortUS: + res = tryParseDateTimeBestEffortUS(x, istr, time_zone, utc_time_zone); + break; + } + + if (x < 0) + x = 0; + + return res; +} + +inline bool tryReadAsIntText(time_t & x, ReadBuffer & istr) +{ + if (!tryReadIntText(x, istr)) + return false; + if (x < 0) + x = 0; + return true; } } @@ -68,15 +109,32 @@ void SerializationDateTime::deserializeWholeText(IColumn & column, ReadBuffer & throwUnexpectedDataAfterParsedValue(column, istr, settings, "DateTime"); } +bool SerializationDateTime::tryDeserializeWholeText(IColumn & column, ReadBuffer & istr, const FormatSettings & settings) const +{ + time_t x = 0; + if (!tryReadText(x, istr, settings, time_zone, utc_time_zone) || !istr.eof()) + return false; + + assert_cast(column).getData().push_back(static_cast(x)); + return true; +} + void SerializationDateTime::deserializeTextEscaped(IColumn & column, ReadBuffer & istr, const FormatSettings & settings) const { time_t x = 0; readText(x, istr, settings, time_zone, utc_time_zone); - if (x < 0) - x = 0; assert_cast(column).getData().push_back(static_cast(x)); } +bool SerializationDateTime::tryDeserializeTextEscaped(IColumn & column, ReadBuffer & istr, const FormatSettings & settings) const +{ + time_t x = 0; + if (!tryReadText(x, istr, settings, time_zone, utc_time_zone)) + return false; + assert_cast(column).getData().push_back(static_cast(x)); + return true; +} + void SerializationDateTime::serializeTextQuoted(const IColumn & column, size_t row_num, WriteBuffer & ostr, const FormatSettings & settings) const { writeChar('\'', ostr); @@ -94,15 +152,32 @@ void SerializationDateTime::deserializeTextQuoted(IColumn & column, ReadBuffer & } else /// Just 1504193808 or 01504193808 { - readIntText(x, istr); + readAsIntText(x, istr); } - if (x < 0) - x = 0; /// It's important to do this at the end - for exception safety. 
assert_cast(column).getData().push_back(static_cast(x)); } +bool SerializationDateTime::tryDeserializeTextQuoted(IColumn & column, ReadBuffer & istr, const FormatSettings & settings) const +{ + time_t x = 0; + if (checkChar('\'', istr)) /// Cases: '2017-08-31 18:36:48' or '1504193808' + { + if (!tryReadText(x, istr, settings, time_zone, utc_time_zone) || !checkChar('\'', istr)) + return false; + } + else /// Just 1504193808 or 01504193808 + { + if (!tryReadAsIntText(x, istr)) + return false; + } + + /// It's important to do this at the end - for exception safety. + assert_cast(column).getData().push_back(static_cast(x)); + return true; +} + void SerializationDateTime::serializeTextJSON(const IColumn & column, size_t row_num, WriteBuffer & ostr, const FormatSettings & settings) const { writeChar('"', ostr); @@ -120,13 +195,30 @@ void SerializationDateTime::deserializeTextJSON(IColumn & column, ReadBuffer & i } else { - readIntText(x, istr); + readAsIntText(x, istr); } - if (x < 0) - x = 0; + assert_cast(column).getData().push_back(static_cast(x)); } +bool SerializationDateTime::tryDeserializeTextJSON(IColumn & column, ReadBuffer & istr, const FormatSettings & settings) const +{ + time_t x = 0; + if (checkChar('"', istr)) + { + if (!tryReadText(x, istr, settings, time_zone, utc_time_zone) || !checkChar('"', istr)) + return false; + } + else + { + if (!tryReadIntText(x, istr)) + return false; + } + + assert_cast(column).getData().push_back(static_cast(x)); + return true; +} + void SerializationDateTime::serializeTextCSV(const IColumn & column, size_t row_num, WriteBuffer & ostr, const FormatSettings & settings) const { writeChar('"', ostr); @@ -165,13 +257,48 @@ void SerializationDateTime::deserializeTextCSV(IColumn & column, ReadBuffer & is readCSVString(datetime_str, istr, settings.csv); ReadBufferFromString buf(datetime_str); readText(x, buf, settings, time_zone, utc_time_zone); + if (!buf.eof()) + throwUnexpectedDataAfterParsedValue(column, istr, settings, "DateTime"); } } - if (x < 0) - x = 0; - assert_cast(column).getData().push_back(static_cast(x)); } +bool SerializationDateTime::tryDeserializeTextCSV(IColumn & column, ReadBuffer & istr, const FormatSettings & settings) const +{ + time_t x = 0; + + if (istr.eof()) + return false; + + char maybe_quote = *istr.position(); + + if (maybe_quote == '\'' || maybe_quote == '\"') + { + ++istr.position(); + if (!tryReadText(x, istr, settings, time_zone, utc_time_zone) || !checkChar(maybe_quote, istr)) + return false; + } + else + { + if (settings.csv.delimiter != ',' || settings.date_time_input_format == FormatSettings::DateTimeInputFormat::Basic) + { + if (!tryReadText(x, istr, settings, time_zone, utc_time_zone)) + return false; + } + else + { + String datetime_str; + readCSVString(datetime_str, istr, settings.csv); + ReadBufferFromString buf(datetime_str); + if (!tryReadText(x, buf, settings, time_zone, utc_time_zone) || !buf.eof()) + return false; + } + } + + assert_cast(column).getData().push_back(static_cast(x)); + return true; +} + } diff --git a/src/DataTypes/Serializations/SerializationDateTime.h b/src/DataTypes/Serializations/SerializationDateTime.h index f4a142483e5..584b0c4116b 100644 --- a/src/DataTypes/Serializations/SerializationDateTime.h +++ b/src/DataTypes/Serializations/SerializationDateTime.h @@ -15,14 +15,19 @@ public: void serializeText(const IColumn & column, size_t row_num, WriteBuffer & ostr, const FormatSettings &) const override; void deserializeWholeText(IColumn & column, ReadBuffer & istr, const FormatSettings & 
settings) const override;
+    bool tryDeserializeWholeText(IColumn & column, ReadBuffer & istr, const FormatSettings & settings) const override;
 
     void serializeTextEscaped(const IColumn & column, size_t row_num, WriteBuffer & ostr, const FormatSettings &) const override;
     void deserializeTextEscaped(IColumn & column, ReadBuffer & istr, const FormatSettings &) const override;
+    bool tryDeserializeTextEscaped(IColumn & column, ReadBuffer & istr, const FormatSettings &) const override;
 
     void serializeTextQuoted(const IColumn & column, size_t row_num, WriteBuffer & ostr, const FormatSettings &) const override;
     void deserializeTextQuoted(IColumn & column, ReadBuffer & istr, const FormatSettings &) const override;
+    bool tryDeserializeTextQuoted(IColumn & column, ReadBuffer & istr, const FormatSettings &) const override;
 
     void serializeTextJSON(const IColumn & column, size_t row_num, WriteBuffer & ostr, const FormatSettings &) const override;
     void deserializeTextJSON(IColumn & column, ReadBuffer & istr, const FormatSettings &) const override;
+    bool tryDeserializeTextJSON(IColumn & column, ReadBuffer & istr, const FormatSettings &) const override;
 
     void serializeTextCSV(const IColumn & column, size_t row_num, WriteBuffer & ostr, const FormatSettings &) const override;
     void deserializeTextCSV(IColumn & column, ReadBuffer & istr, const FormatSettings & settings) const override;
+    bool tryDeserializeTextCSV(IColumn & column, ReadBuffer & istr, const FormatSettings & settings) const override;
 };
 
 }
diff --git a/src/DataTypes/Serializations/SerializationDateTime64.cpp b/src/DataTypes/Serializations/SerializationDateTime64.cpp
index 93891886000..a19619bf8d3 100644
--- a/src/DataTypes/Serializations/SerializationDateTime64.cpp
+++ b/src/DataTypes/Serializations/SerializationDateTime64.cpp
@@ -47,6 +47,16 @@ void SerializationDateTime64::deserializeText(IColumn & column, ReadBuffer & ist
     throwUnexpectedDataAfterParsedValue(column, istr, settings, "DateTime64");
 }
 
+bool SerializationDateTime64::tryDeserializeText(IColumn & column, ReadBuffer & istr, const FormatSettings &, bool whole) const
+{
+    DateTime64 result = 0;
+    if (!tryReadDateTime64Text(result, scale, istr, time_zone) || (whole && !istr.eof()))
+        return false;
+
+    assert_cast<ColumnType &>(column).getData().push_back(result);
+    return true;
+}
+
 void SerializationDateTime64::deserializeWholeText(IColumn & column, ReadBuffer & istr, const FormatSettings & settings) const
 {
     deserializeTextEscaped(column, istr, settings);
@@ -75,6 +85,29 @@ static inline void readText(DateTime64 & x, UInt32 scale, ReadBuffer & istr, con
     }
 }
 
+static inline bool tryReadText(DateTime64 & x, UInt32 scale, ReadBuffer & istr, const FormatSettings & settings, const DateLUTImpl & time_zone, const DateLUTImpl & utc_time_zone)
+{
+    switch (settings.date_time_input_format)
+    {
+        case FormatSettings::DateTimeInputFormat::Basic:
+            return tryReadDateTime64Text(x, scale, istr, time_zone);
+        case FormatSettings::DateTimeInputFormat::BestEffort:
+            return tryParseDateTime64BestEffort(x, scale, istr, time_zone, utc_time_zone);
+        case FormatSettings::DateTimeInputFormat::BestEffortUS:
+            return tryParseDateTime64BestEffortUS(x, scale, istr, time_zone, utc_time_zone);
+    }
+}
+
+
+bool SerializationDateTime64::tryDeserializeWholeText(IColumn & column, ReadBuffer & istr, const FormatSettings & settings) const
+{
+    DateTime64 x = 0;
+    if (!tryReadText(x, scale, istr, settings, time_zone, utc_time_zone) || !istr.eof())
+        return false;
+    assert_cast<ColumnType &>(column).getData().push_back(x);
+    return true;
+}
+
 void SerializationDateTime64::deserializeTextEscaped(IColumn & column, ReadBuffer & istr, const FormatSettings & settings) const
 {
     DateTime64 x = 0;
@@ -82,6 +115,15 @@ void SerializationDateTime64::deserializeTextEscaped(IColumn & column, ReadBuffe
     assert_cast<ColumnType &>(column).getData().push_back(x);
 }
 
+bool SerializationDateTime64::tryDeserializeTextEscaped(IColumn & column, ReadBuffer & istr, const FormatSettings & settings) const
+{
+    DateTime64 x = 0;
+    if (!tryReadText(x, scale, istr, settings, time_zone, utc_time_zone))
+        return false;
+    assert_cast<ColumnType &>(column).getData().push_back(x);
+    return true;
+}
+
 void SerializationDateTime64::serializeTextQuoted(const IColumn & column, size_t row_num, WriteBuffer & ostr, const FormatSettings & settings) const
 {
     writeChar('\'', ostr);
@@ -104,6 +146,23 @@ void SerializationDateTime64::deserializeTextQuoted(IColumn & column, ReadBuffer
     assert_cast<ColumnType &>(column).getData().push_back(x);    /// It's important to do this at the end - for exception safety.
 }
 
+bool SerializationDateTime64::tryDeserializeTextQuoted(IColumn & column, ReadBuffer & istr, const FormatSettings & settings) const
+{
+    DateTime64 x = 0;
+    if (checkChar('\'', istr)) /// Cases: '2017-08-31 18:36:48' or '1504193808'
+    {
+        if (!tryReadText(x, scale, istr, settings, time_zone, utc_time_zone) || !checkChar('\'', istr))
+            return false;
+    }
+    else /// Just 1504193808 or 01504193808
+    {
+        if (!tryReadIntText(x, istr))
+            return false;
+    }
+    assert_cast<ColumnType &>(column).getData().push_back(x);    /// It's important to do this at the end - for exception safety.
+    return true;
+}
+
 void SerializationDateTime64::serializeTextJSON(const IColumn & column, size_t row_num, WriteBuffer & ostr, const FormatSettings & settings) const
 {
     writeChar('"', ostr);
@@ -126,6 +185,23 @@ void SerializationDateTime64::deserializeTextJSON(IColumn & column, ReadBuffer &
     assert_cast<ColumnType &>(column).getData().push_back(x);
 }
 
+bool SerializationDateTime64::tryDeserializeTextJSON(IColumn & column, ReadBuffer & istr, const FormatSettings & settings) const
+{
+    DateTime64 x = 0;
+    if (checkChar('"', istr))
+    {
+        if (!tryReadText(x, scale, istr, settings, time_zone, utc_time_zone) || !checkChar('"', istr))
+            return false;
+    }
+    else
+    {
+        if (!tryReadIntText(x, istr))
+            return false;
+    }
+    assert_cast<ColumnType &>(column).getData().push_back(x);
+    return true;
+}
+
 void SerializationDateTime64::serializeTextCSV(const IColumn & column, size_t row_num, WriteBuffer & ostr, const FormatSettings & settings) const
 {
     writeChar('"', ostr);
@@ -170,4 +246,40 @@ void SerializationDateTime64::deserializeTextCSV(IColumn & column, ReadBuffer &
     assert_cast<ColumnType &>(column).getData().push_back(x);
 }
 
+bool SerializationDateTime64::tryDeserializeTextCSV(IColumn & column, ReadBuffer & istr, const FormatSettings & settings) const
+{
+    DateTime64 x = 0;
+
+    if (istr.eof())
+        return false;
+
+    char maybe_quote = *istr.position();
+
+    if (maybe_quote == '\'' || maybe_quote == '\"')
+    {
+        ++istr.position();
+        if (!tryReadText(x, scale, istr, settings, time_zone, utc_time_zone) || !checkChar(maybe_quote, istr))
+            return false;
+    }
+    else
+    {
+        if (settings.csv.delimiter != ',' || settings.date_time_input_format == FormatSettings::DateTimeInputFormat::Basic)
+        {
+            if (!tryReadText(x, scale, istr, settings, time_zone, utc_time_zone))
+                return false;
+        }
+        else
+        {
+            String datetime_str;
+            readCSVString(datetime_str, istr, settings.csv);
+            ReadBufferFromString buf(datetime_str);
+            if (!tryReadText(x, scale, buf, settings, time_zone, utc_time_zone) || !buf.eof())
+                return false;
+        }
+    }
+
+
assert_cast(column).getData().push_back(x); + return true; +} + } diff --git a/src/DataTypes/Serializations/SerializationDateTime64.h b/src/DataTypes/Serializations/SerializationDateTime64.h index f817edbf0dd..b49bd1e9098 100644 --- a/src/DataTypes/Serializations/SerializationDateTime64.h +++ b/src/DataTypes/Serializations/SerializationDateTime64.h @@ -15,15 +15,21 @@ public: void serializeText(const IColumn & column, size_t row_num, WriteBuffer & ostr, const FormatSettings &) const override; void deserializeText(IColumn & column, ReadBuffer & istr, const FormatSettings & settings, bool whole) const override; + bool tryDeserializeText(IColumn & column, ReadBuffer & istr, const FormatSettings & settings, bool whole) const override; void deserializeWholeText(IColumn & column, ReadBuffer & istr, const FormatSettings & settings) const override; + bool tryDeserializeWholeText(IColumn & column, ReadBuffer & istr, const FormatSettings & settings) const override; void serializeTextEscaped(const IColumn & column, size_t row_num, WriteBuffer & ostr, const FormatSettings &) const override; void deserializeTextEscaped(IColumn & column, ReadBuffer & istr, const FormatSettings &) const override; + bool tryDeserializeTextEscaped(IColumn & column, ReadBuffer & istr, const FormatSettings &) const override; void serializeTextQuoted(const IColumn & column, size_t row_num, WriteBuffer & ostr, const FormatSettings &) const override; void deserializeTextQuoted(IColumn & column, ReadBuffer & istr, const FormatSettings &) const override; + bool tryDeserializeTextQuoted(IColumn & column, ReadBuffer & istr, const FormatSettings &) const override; void serializeTextJSON(const IColumn & column, size_t row_num, WriteBuffer & ostr, const FormatSettings &) const override; void deserializeTextJSON(IColumn & column, ReadBuffer & istr, const FormatSettings &) const override; + bool tryDeserializeTextJSON(IColumn & column, ReadBuffer & istr, const FormatSettings &) const override; void serializeTextCSV(const IColumn & column, size_t row_num, WriteBuffer & ostr, const FormatSettings &) const override; void deserializeTextCSV(IColumn & column, ReadBuffer & istr, const FormatSettings & settings) const override; + bool tryDeserializeTextCSV(IColumn & column, ReadBuffer & istr, const FormatSettings & settings) const override; }; } diff --git a/src/DataTypes/Serializations/SerializationDecimal.cpp b/src/DataTypes/Serializations/SerializationDecimal.cpp index b576b7a048c..d632c224783 100644 --- a/src/DataTypes/Serializations/SerializationDecimal.cpp +++ b/src/DataTypes/Serializations/SerializationDecimal.cpp @@ -16,11 +16,19 @@ namespace ErrorCodes } template -bool SerializationDecimal::tryReadText(T & x, ReadBuffer & istr, UInt32 precision, UInt32 scale) +bool SerializationDecimal::tryReadText(T & x, ReadBuffer & istr, UInt32 precision, UInt32 scale, bool csv) { UInt32 unread_scale = scale; - if (!tryReadDecimalText(istr, x, precision, unread_scale)) - return false; + if (csv) + { + if (!tryReadCSVDecimalText(istr, x, precision, unread_scale)) + return false; + } + else + { + if (!tryReadDecimalText(istr, x, precision, unread_scale)) + return false; + } if (common::mulOverflow(x.value, DecimalUtils::scaleMultiplier(unread_scale), x.value)) return false; @@ -59,6 +67,16 @@ void SerializationDecimal::deserializeText(IColumn & column, ReadBuffer & ist ISerialization::throwUnexpectedDataAfterParsedValue(column, istr, settings, "Decimal"); } +template +bool SerializationDecimal::tryDeserializeText(IColumn & column, ReadBuffer & istr, 
const FormatSettings &, bool whole) const +{ + T x; + if (!tryReadText(x, istr) || (whole && !istr.eof())) + return false; + assert_cast(column).getData().push_back(x); + return true; +} + template void SerializationDecimal::deserializeTextCSV(IColumn & column, ReadBuffer & istr, const FormatSettings &) const { @@ -67,6 +85,16 @@ void SerializationDecimal::deserializeTextCSV(IColumn & column, ReadBuffer & assert_cast(column).getData().push_back(x); } +template +bool SerializationDecimal::tryDeserializeTextCSV(IColumn & column, ReadBuffer & istr, const FormatSettings &) const +{ + T x; + if (!tryReadText(x, istr, true)) + return false; + assert_cast(column).getData().push_back(x); + return true; +} + template void SerializationDecimal::serializeTextJSON(const IColumn & column, size_t row_num, WriteBuffer & ostr, const FormatSettings & settings) const { @@ -88,6 +116,18 @@ void SerializationDecimal::deserializeTextJSON(IColumn & column, ReadBuffer & assertChar('"', istr); } +template +bool SerializationDecimal::tryDeserializeTextJSON(IColumn & column, ReadBuffer & istr, const FormatSettings &) const +{ + bool have_quotes = checkChar('"', istr); + T x; + if (!tryReadText(x, istr) || (have_quotes && !checkChar('"', istr))) + return false; + + assert_cast(column).getData().push_back(x); + return true; +} + template class SerializationDecimal; template class SerializationDecimal; diff --git a/src/DataTypes/Serializations/SerializationDecimal.h b/src/DataTypes/Serializations/SerializationDecimal.h index 57decdd0973..22a8eb1a47c 100644 --- a/src/DataTypes/Serializations/SerializationDecimal.h +++ b/src/DataTypes/Serializations/SerializationDecimal.h @@ -16,15 +16,19 @@ public: void serializeText(const IColumn & column, size_t row_num, WriteBuffer & ostr, const FormatSettings &) const override; void deserializeText(IColumn & column, ReadBuffer & istr, const FormatSettings &, bool whole) const override; + bool tryDeserializeText(IColumn & column, ReadBuffer & istr, const FormatSettings &, bool whole) const override; void deserializeTextCSV(IColumn & column, ReadBuffer & istr, const FormatSettings &) const override; + bool tryDeserializeTextCSV(IColumn & column, ReadBuffer & istr, const FormatSettings &) const override; void serializeTextJSON(const IColumn & column, size_t row_num, WriteBuffer & ostr, const FormatSettings & settings) const override; void deserializeTextJSON(IColumn & column, ReadBuffer & istr, const FormatSettings & settings) const override; + bool tryDeserializeTextJSON(IColumn & column, ReadBuffer & istr, const FormatSettings & settings) const override; void readText(T & x, ReadBuffer & istr, bool csv = false) const { readText(x, istr, this->precision, this->scale, csv); } + bool tryReadText(T & x, ReadBuffer & istr, bool csv = false) const { return tryReadText(x, istr, this->precision, this->scale, csv); } static void readText(T & x, ReadBuffer & istr, UInt32 precision_, UInt32 scale_, bool csv = false); - static bool tryReadText(T & x, ReadBuffer & istr, UInt32 precision_, UInt32 scale_); + static bool tryReadText(T & x, ReadBuffer & istr, UInt32 precision_, UInt32 scale_, bool csv = false); }; } diff --git a/src/DataTypes/Serializations/SerializationEnum.cpp b/src/DataTypes/Serializations/SerializationEnum.cpp index 9b3a437e9cf..6ad55913738 100644 --- a/src/DataTypes/Serializations/SerializationEnum.cpp +++ b/src/DataTypes/Serializations/SerializationEnum.cpp @@ -34,6 +34,27 @@ void SerializationEnum::deserializeTextEscaped(IColumn & column, ReadBuffe } } +template +bool 
SerializationEnum::tryDeserializeTextEscaped(IColumn & column, ReadBuffer & istr, const FormatSettings & settings) const +{ + FieldType x; + if (settings.tsv.enum_as_number) + { + if (!tryReadValue(istr, x)) + return false; + } + else + { + std::string field_name; + readEscapedString(field_name, istr); + if (!this->tryGetValue(x, StringRef(field_name), true)) + return false; + } + + assert_cast(column).getData().push_back(x); + return true; +} + template void SerializationEnum::serializeTextQuoted(const IColumn & column, size_t row_num, WriteBuffer & ostr, const FormatSettings &) const { @@ -48,6 +69,18 @@ void SerializationEnum::deserializeTextQuoted(IColumn & column, ReadBuffer assert_cast(column).getData().push_back(ref_enum_values.getValue(StringRef(field_name))); } +template +bool SerializationEnum::tryDeserializeTextQuoted(IColumn & column, ReadBuffer & istr, const FormatSettings &) const +{ + std::string field_name; + readQuotedStringWithSQLStyle(field_name, istr); + FieldType x; + if (!this->tryGetValue(x, StringRef(field_name))) + return false; + assert_cast(column).getData().push_back(x); + return true; +} + template void SerializationEnum::deserializeWholeText(IColumn & column, ReadBuffer & istr, const FormatSettings & settings) const { @@ -65,6 +98,27 @@ void SerializationEnum::deserializeWholeText(IColumn & column, ReadBuffer } } +template +bool SerializationEnum::tryDeserializeWholeText(IColumn & column, ReadBuffer & istr, const FormatSettings & settings) const +{ + FieldType x; + if (settings.tsv.enum_as_number) + { + if (!tryReadValue(istr, x) || !istr.eof()) + return false; + } + else + { + std::string field_name; + readStringUntilEOF(field_name, istr); + if (!this->tryGetValue(x, StringRef(field_name), true)) + return false; + } + + assert_cast(column).getData().push_back(x); + return true; +} + template void SerializationEnum::serializeTextJSON(const IColumn & column, size_t row_num, WriteBuffer & ostr, const FormatSettings & settings) const { @@ -90,6 +144,27 @@ void SerializationEnum::deserializeTextJSON(IColumn & column, ReadBuffer & } } +template +bool SerializationEnum::tryDeserializeTextJSON(IColumn & column, ReadBuffer & istr, const FormatSettings &) const +{ + FieldType x; + if (!istr.eof() && *istr.position() != '"') + { + if (!tryReadValue(istr, x)) + return false; + } + else + { + std::string field_name; + readJSONString(field_name, istr); + if (!this->tryGetValue(x, StringRef(field_name))) + return false; + } + + assert_cast(column).getData().push_back(x); + return true; +} + template void SerializationEnum::serializeTextCSV(const IColumn & column, size_t row_num, WriteBuffer & ostr, const FormatSettings &) const { @@ -109,6 +184,28 @@ void SerializationEnum::deserializeTextCSV(IColumn & column, ReadBuffer & } } +template +bool SerializationEnum::tryDeserializeTextCSV(IColumn & column, ReadBuffer & istr, const FormatSettings & settings) const +{ + FieldType x; + + if (settings.csv.enum_as_number) + { + if (!tryReadValue(istr, x)) + return false; + } + else + { + std::string field_name; + readCSVString(field_name, istr, settings.csv); + if (!this->tryGetValue(x, StringRef(field_name), true)) + return false; + } + + assert_cast(column).getData().push_back(x); + return true; +} + template void SerializationEnum::serializeTextMarkdown( const IColumn & column, size_t row_num, WriteBuffer & ostr, const FormatSettings & settings) const diff --git a/src/DataTypes/Serializations/SerializationEnum.h b/src/DataTypes/Serializations/SerializationEnum.h index 
03b134e59a6..708161dc5fd 100644 --- a/src/DataTypes/Serializations/SerializationEnum.h +++ b/src/DataTypes/Serializations/SerializationEnum.h @@ -34,15 +34,20 @@ public: void serializeText(const IColumn & column, size_t row_num, WriteBuffer & ostr, const FormatSettings &) const override; void serializeTextEscaped(const IColumn & column, size_t row_num, WriteBuffer & ostr, const FormatSettings &) const override; void deserializeTextEscaped(IColumn & column, ReadBuffer & istr, const FormatSettings &) const override; + bool tryDeserializeTextEscaped(IColumn & column, ReadBuffer & istr, const FormatSettings &) const override; void serializeTextQuoted(const IColumn & column, size_t row_num, WriteBuffer & ostr, const FormatSettings &) const override; void deserializeTextQuoted(IColumn & column, ReadBuffer & istr, const FormatSettings &) const override; + bool tryDeserializeTextQuoted(IColumn & column, ReadBuffer & istr, const FormatSettings &) const override; void deserializeWholeText(IColumn & column, ReadBuffer & istr, const FormatSettings &) const override; + bool tryDeserializeWholeText(IColumn & column, ReadBuffer & istr, const FormatSettings &) const override; void serializeTextJSON(const IColumn & column, size_t row_num, WriteBuffer & ostr, const FormatSettings &) const override; void deserializeTextJSON(IColumn & column, ReadBuffer & istr, const FormatSettings &) const override; + bool tryDeserializeTextJSON(IColumn & column, ReadBuffer & istr, const FormatSettings &) const override; void serializeTextXML(const IColumn & column, size_t row_num, WriteBuffer & ostr, const FormatSettings &) const override; void serializeTextCSV(const IColumn & column, size_t row_num, WriteBuffer & ostr, const FormatSettings &) const override; void deserializeTextCSV(IColumn & column, ReadBuffer & istr, const FormatSettings & settings) const override; + bool tryDeserializeTextCSV(IColumn & column, ReadBuffer & istr, const FormatSettings & settings) const override; void serializeTextMarkdown(const IColumn & column, size_t row_num, WriteBuffer & ostr, const FormatSettings &) const override; @@ -53,6 +58,14 @@ public: return ref_enum_values.findByValue(x)->first; } + bool tryReadValue(ReadBuffer & istr, FieldType & x) const + { + if (!tryReadText(x, istr) || !this->hasValue(x)) + return false; + + return true; + } + std::optional> own_enum_values; std::shared_ptr> own_enum_type; const EnumValues & ref_enum_values; diff --git a/src/DataTypes/Serializations/SerializationFixedString.cpp b/src/DataTypes/Serializations/SerializationFixedString.cpp index fa50af52f2f..23e959d80c9 100644 --- a/src/DataTypes/Serializations/SerializationFixedString.cpp +++ b/src/DataTypes/Serializations/SerializationFixedString.cpp @@ -150,12 +150,49 @@ static inline void read(const SerializationFixedString & self, IColumn & column, } } +bool SerializationFixedString::tryAlignStringLength(size_t n, PaddedPODArray & data, size_t string_start) +{ + size_t length = data.size() - string_start; + if (length < n) + { + data.resize_fill(string_start + n); + } + else if (length > n) + { + data.resize_assume_reserved(string_start); + return false; + } + + return true; +} + +template +static inline bool tryRead(const SerializationFixedString & self, IColumn & column, Reader && reader) +{ + ColumnFixedString::Chars & data = typeid_cast(column).getChars(); + size_t prev_size = data.size(); + try + { + return reader(data) && SerializationFixedString::tryAlignStringLength(self.getN(), data, prev_size); + } + catch (...) 
+ { + data.resize_assume_reserved(prev_size); + return false; + } +} + void SerializationFixedString::deserializeTextEscaped(IColumn & column, ReadBuffer & istr, const FormatSettings &) const { read(*this, column, [&istr](ColumnFixedString::Chars & data) { readEscapedStringInto(data, istr); }); } +bool SerializationFixedString::tryDeserializeTextEscaped(IColumn & column, ReadBuffer & istr, const FormatSettings &) const +{ + return tryRead(*this, column, [&istr](ColumnFixedString::Chars & data) { readEscapedStringInto(data, istr); return true; }); +} + void SerializationFixedString::serializeTextQuoted(const IColumn & column, size_t row_num, WriteBuffer & ostr, const FormatSettings &) const { @@ -169,12 +206,22 @@ void SerializationFixedString::deserializeTextQuoted(IColumn & column, ReadBuffe read(*this, column, [&istr](ColumnFixedString::Chars & data) { readQuotedStringInto(data, istr); }); } +bool SerializationFixedString::tryDeserializeTextQuoted(IColumn & column, ReadBuffer & istr, const FormatSettings &) const +{ + return tryRead(*this, column, [&istr](ColumnFixedString::Chars & data) { return tryReadQuotedStringInto(data, istr); }); +} + void SerializationFixedString::deserializeWholeText(IColumn & column, ReadBuffer & istr, const FormatSettings &) const { read(*this, column, [&istr](ColumnFixedString::Chars & data) { readStringUntilEOFInto(data, istr); }); } +bool SerializationFixedString::tryDeserializeWholeText(IColumn & column, ReadBuffer & istr, const FormatSettings &) const +{ + return tryRead(*this, column, [&istr](ColumnFixedString::Chars & data) { readStringUntilEOFInto(data, istr); return true; }); +} + void SerializationFixedString::serializeTextJSON(const IColumn & column, size_t row_num, WriteBuffer & ostr, const FormatSettings & settings) const { @@ -188,6 +235,10 @@ void SerializationFixedString::deserializeTextJSON(IColumn & column, ReadBuffer read(*this, column, [&istr](ColumnFixedString::Chars & data) { readJSONStringInto(data, istr); }); } +bool SerializationFixedString::tryDeserializeTextJSON(IColumn & column, ReadBuffer & istr, const FormatSettings &) const +{ + return tryRead(*this, column, [&istr](ColumnFixedString::Chars & data) { return tryReadJSONStringInto(data, istr); }); +} void SerializationFixedString::serializeTextXML(const IColumn & column, size_t row_num, WriteBuffer & ostr, const FormatSettings &) const { @@ -208,6 +259,11 @@ void SerializationFixedString::deserializeTextCSV(IColumn & column, ReadBuffer & read(*this, column, [&istr, &csv = settings.csv](ColumnFixedString::Chars & data) { readCSVStringInto(data, istr, csv); }); } +bool SerializationFixedString::tryDeserializeTextCSV(IColumn & column, ReadBuffer & istr, const FormatSettings & settings) const +{ + return tryRead(*this, column, [&istr, &csv = settings.csv](ColumnFixedString::Chars & data) { readCSVStringInto(data, istr, csv); return true; }); +} + void SerializationFixedString::serializeTextMarkdown( const DB::IColumn & column, size_t row_num, DB::WriteBuffer & ostr, const DB::FormatSettings & settings) const { diff --git a/src/DataTypes/Serializations/SerializationFixedString.h b/src/DataTypes/Serializations/SerializationFixedString.h index c27b10ad158..8eb4eacdbff 100644 --- a/src/DataTypes/Serializations/SerializationFixedString.h +++ b/src/DataTypes/Serializations/SerializationFixedString.h @@ -26,20 +26,25 @@ public: void serializeText(const IColumn & column, size_t row_num, WriteBuffer & ostr, const FormatSettings &) const override; void deserializeWholeText(IColumn & column, 
ReadBuffer & istr, const FormatSettings &) const override; + bool tryDeserializeWholeText(IColumn & column, ReadBuffer & istr, const FormatSettings &) const override; void serializeTextEscaped(const IColumn & column, size_t row_num, WriteBuffer & ostr, const FormatSettings &) const override; void deserializeTextEscaped(IColumn & column, ReadBuffer & istr, const FormatSettings &) const override; + bool tryDeserializeTextEscaped(IColumn & column, ReadBuffer & istr, const FormatSettings &) const override; void serializeTextQuoted(const IColumn & column, size_t row_num, WriteBuffer & ostr, const FormatSettings &) const override; void deserializeTextQuoted(IColumn & column, ReadBuffer & istr, const FormatSettings &) const override; + bool tryDeserializeTextQuoted(IColumn & column, ReadBuffer & istr, const FormatSettings &) const override; void serializeTextJSON(const IColumn & column, size_t row_num, WriteBuffer & ostr, const FormatSettings &) const override; void deserializeTextJSON(IColumn & column, ReadBuffer & istr, const FormatSettings &) const override; + bool tryDeserializeTextJSON(IColumn & column, ReadBuffer & istr, const FormatSettings &) const override; void serializeTextXML(const IColumn & column, size_t row_num, WriteBuffer & ostr, const FormatSettings &) const override; void serializeTextCSV(const IColumn & column, size_t row_num, WriteBuffer & ostr, const FormatSettings &) const override; void deserializeTextCSV(IColumn & column, ReadBuffer & istr, const FormatSettings &) const override; + bool tryDeserializeTextCSV(IColumn & column, ReadBuffer & istr, const FormatSettings &) const override; void serializeTextMarkdown(const IColumn & column, size_t row_num, WriteBuffer & ostr, const FormatSettings &) const override; @@ -47,6 +52,7 @@ public: /// If the length is less than getN() the function will add zero characters up to getN(). /// If the length is greater than getN() the function will throw an exception. 
static void alignStringLength(size_t n, PaddedPODArray & data, size_t string_start); + static bool tryAlignStringLength(size_t n, PaddedPODArray & data, size_t string_start); }; } diff --git a/src/DataTypes/Serializations/SerializationIPv4andIPv6.cpp b/src/DataTypes/Serializations/SerializationIPv4andIPv6.cpp new file mode 100644 index 00000000000..81c4af97401 --- /dev/null +++ b/src/DataTypes/Serializations/SerializationIPv4andIPv6.cpp @@ -0,0 +1,188 @@ +#include + +namespace DB +{ + +template +void SerializationIP::serializeText(const DB::IColumn & column, size_t row_num, DB::WriteBuffer & ostr, const DB::FormatSettings &) const +{ + writeText(assert_cast &>(column).getData()[row_num], ostr); +} + +template +void SerializationIP::deserializeText(DB::IColumn & column, DB::ReadBuffer & istr, const DB::FormatSettings & settings, bool whole) const +{ + IPv x; + readText(x, istr); + + if (whole && !istr.eof()) + throwUnexpectedDataAfterParsedValue(column, istr, settings, TypeName.data()); + + assert_cast &>(column).getData().push_back(x); +} + +template +bool SerializationIP::tryDeserializeText(DB::IColumn & column, DB::ReadBuffer & istr, const DB::FormatSettings &, bool whole) const +{ + IPv x; + if (!tryReadText(x, istr) || (whole && !istr.eof())) + return false; + + assert_cast &>(column).getData().push_back(x); + return true; +} + +template +void SerializationIP::serializeTextQuoted(const DB::IColumn & column, size_t row_num, DB::WriteBuffer & ostr, const DB::FormatSettings & settings) const +{ + writeChar('\'', ostr); + serializeText(column, row_num, ostr, settings); + writeChar('\'', ostr); +} + +template +void SerializationIP::deserializeTextQuoted(DB::IColumn & column, DB::ReadBuffer & istr, const DB::FormatSettings &) const +{ + IPv x; + assertChar('\'', istr); + readText(x, istr); + assertChar('\'', istr); + assert_cast &>(column).getData().push_back(x); /// It's important to do this at the end - for exception safety. +} + +template +bool SerializationIP::tryDeserializeTextQuoted(DB::IColumn & column, DB::ReadBuffer & istr, const DB::FormatSettings &) const +{ + IPv x; + if (!checkChar('\'', istr) || !tryReadText(x, istr) || !checkChar('\'', istr)) + return false; + assert_cast &>(column).getData().push_back(x); + return true; +} + +template +void SerializationIP::serializeTextJSON(const DB::IColumn & column, size_t row_num, DB::WriteBuffer & ostr, const DB::FormatSettings & settings) const +{ + writeChar('"', ostr); + serializeText(column, row_num, ostr, settings); + writeChar('"', ostr); +} + +template +void SerializationIP::deserializeTextJSON(DB::IColumn & column, DB::ReadBuffer & istr, const DB::FormatSettings & settings) const +{ + IPv x; + assertChar('"', istr); + readText(x, istr); + /// this code looks weird, but we want to throw specific exception to match original behavior... 
+ if (istr.eof()) + assertChar('"', istr); + if (*istr.position() != '"') + throwUnexpectedDataAfterParsedValue(column, istr, settings, TypeName.data()); + istr.ignore(); + + assert_cast &>(column).getData().push_back(x); +} + +template +bool SerializationIP::tryDeserializeTextJSON(DB::IColumn & column, DB::ReadBuffer & istr, const DB::FormatSettings &) const +{ + IPv x; + if (!checkChar('"', istr) || !tryReadText(x, istr) || !checkChar('"', istr)) + return false; + + assert_cast &>(column).getData().push_back(x); + return true; +} + +template +void SerializationIP::serializeTextCSV(const DB::IColumn & column, size_t row_num, DB::WriteBuffer & ostr, const DB::FormatSettings & settings) const +{ + writeChar('"', ostr); + serializeText(column, row_num, ostr, settings); + writeChar('"', ostr); +} + +template +void SerializationIP::deserializeTextCSV(DB::IColumn & column, DB::ReadBuffer & istr, const DB::FormatSettings &) const +{ + IPv value; + readCSV(value, istr); + + assert_cast &>(column).getData().push_back(value); +} + +template +bool SerializationIP::tryDeserializeTextCSV(DB::IColumn & column, DB::ReadBuffer & istr, const DB::FormatSettings &) const +{ + IPv value; + if (!tryReadCSV(value, istr)) + return false; + + assert_cast &>(column).getData().push_back(value); + return true; +} + +template +void SerializationIP::serializeBinary(const Field & field, WriteBuffer & ostr, const FormatSettings &) const +{ + IPv x = field.get(); + if constexpr (std::is_same_v) + writeBinary(x, ostr); + else + writeBinaryLittleEndian(x, ostr); +} + +template +void SerializationIP::deserializeBinary(DB::Field & field, DB::ReadBuffer & istr, const DB::FormatSettings &) const +{ + IPv x; + if constexpr (std::is_same_v) + readBinary(x, istr); + else + readBinaryLittleEndian(x, istr); + field = NearestFieldType(x); +} + +template +void SerializationIP::serializeBinary(const DB::IColumn & column, size_t row_num, DB::WriteBuffer & ostr, const DB::FormatSettings &) const +{ + writeBinary(assert_cast &>(column).getData()[row_num], ostr); +} + +template +void SerializationIP::deserializeBinary(DB::IColumn & column, DB::ReadBuffer & istr, const DB::FormatSettings &) const +{ + IPv x; + readBinary(x.toUnderType(), istr); + assert_cast &>(column).getData().push_back(x); +} + +template +void SerializationIP::serializeBinaryBulk(const DB::IColumn & column, DB::WriteBuffer & ostr, size_t offset, size_t limit) const +{ + const typename ColumnVector::Container & x = typeid_cast &>(column).getData(); + + size_t size = x.size(); + + if (limit == 0 || offset + limit > size) + limit = size - offset; + + if (limit) + ostr.write(reinterpret_cast(&x[offset]), sizeof(IPv) * limit); +} + +template +void SerializationIP::deserializeBinaryBulk(DB::IColumn & column, DB::ReadBuffer & istr, size_t limit, double) const +{ + typename ColumnVector::Container & x = typeid_cast &>(column).getData(); + size_t initial_size = x.size(); + x.resize(initial_size + limit); + size_t size = istr.readBig(reinterpret_cast(&x[initial_size]), sizeof(IPv) * limit); + x.resize(initial_size + size / sizeof(IPv)); +} + +template class SerializationIP; +template class SerializationIP; + +} diff --git a/src/DataTypes/Serializations/SerializationIPv4andIPv6.h b/src/DataTypes/Serializations/SerializationIPv4andIPv6.h index 7d8669fd444..a53f257646b 100644 --- a/src/DataTypes/Serializations/SerializationIPv4andIPv6.h +++ b/src/DataTypes/Serializations/SerializationIPv4andIPv6.h @@ -13,123 +13,30 @@ template class SerializationIP : public SimpleTextSerialization 
diff --git a/src/DataTypes/Serializations/SerializationIPv4andIPv6.h b/src/DataTypes/Serializations/SerializationIPv4andIPv6.h
index 7d8669fd444..a53f257646b 100644
--- a/src/DataTypes/Serializations/SerializationIPv4andIPv6.h
+++ b/src/DataTypes/Serializations/SerializationIPv4andIPv6.h
@@ -13,123 +13,30 @@ template <typename IPv> class SerializationIP : public SimpleTextSerialization
 {
 public:
-    void serializeText(const IColumn & column, size_t row_num, WriteBuffer & ostr, const FormatSettings &) const override
-    {
-        writeText(assert_cast<const ColumnVector<IPv> &>(column).getData()[row_num], ostr);
-    }
-    void deserializeText(IColumn & column, ReadBuffer & istr, const FormatSettings & settings, bool whole) const override
-    {
-        IPv x;
-        readText(x, istr);
+    void serializeText(const IColumn & column, size_t row_num, WriteBuffer & ostr, const FormatSettings &) const override;
+    void deserializeText(IColumn & column, ReadBuffer & istr, const FormatSettings & settings, bool whole) const override;
+    bool tryDeserializeText(IColumn & column, ReadBuffer & istr, const FormatSettings & settings, bool whole) const override;
 
-        if (whole && !istr.eof())
-            throwUnexpectedDataAfterParsedValue(column, istr, settings, TypeName<IPv>.data());
+    void serializeTextQuoted(const IColumn & column, size_t row_num, WriteBuffer & ostr, const FormatSettings & settings) const override;
+    void deserializeTextQuoted(IColumn & column, ReadBuffer & istr, const FormatSettings & settings) const override;
+    bool tryDeserializeTextQuoted(IColumn & column, ReadBuffer & istr, const FormatSettings & settings) const override;
 
-        assert_cast<ColumnVector<IPv> &>(column).getData().push_back(x);
-    }
-    void serializeTextEscaped(const IColumn & column, size_t row_num, WriteBuffer & ostr, const FormatSettings & settings) const override
-    {
-        serializeText(column, row_num, ostr, settings);
-    }
-    void deserializeTextEscaped(IColumn & column, ReadBuffer & istr, const FormatSettings & settings) const override
-    {
-        deserializeText(column, istr, settings, false);
-    }
-    void serializeTextQuoted(const IColumn & column, size_t row_num, WriteBuffer & ostr, const FormatSettings & settings) const override
-    {
-        writeChar('\'', ostr);
-        serializeText(column, row_num, ostr, settings);
-        writeChar('\'', ostr);
-    }
-    void deserializeTextQuoted(IColumn & column, ReadBuffer & istr, const FormatSettings &) const override
-    {
-        IPv x;
-        assertChar('\'', istr);
-        readText(x, istr);
-        assertChar('\'', istr);
-        assert_cast<ColumnVector<IPv> &>(column).getData().push_back(x);    /// It's important to do this at the end - for exception safety.
-    }
-    void serializeTextJSON(const IColumn & column, size_t row_num, WriteBuffer & ostr, const FormatSettings & settings) const override
-    {
-        writeChar('"', ostr);
-        serializeText(column, row_num, ostr, settings);
-        writeChar('"', ostr);
-    }
-    void deserializeTextJSON(IColumn & column, ReadBuffer & istr, const FormatSettings & settings) const override
-    {
-        IPv x;
-        assertChar('"', istr);
-        readText(x, istr);
-        /// this code looks weird, but we want to throw specific exception to match original behavior...
-        if (istr.eof())
-            assertChar('"', istr);
-        if (*istr.position() != '"')
-            throwUnexpectedDataAfterParsedValue(column, istr, settings, TypeName<IPv>.data());
-        istr.ignore();
+    void serializeTextJSON(const IColumn & column, size_t row_num, WriteBuffer & ostr, const FormatSettings & settings) const override;
+    void deserializeTextJSON(IColumn & column, ReadBuffer & istr, const FormatSettings & settings) const override;
+    bool tryDeserializeTextJSON(IColumn & column, ReadBuffer & istr, const FormatSettings & settings) const override;
 
-        assert_cast<ColumnVector<IPv> &>(column).getData().push_back(x);
-    }
-    void serializeTextCSV(const IColumn & column, size_t row_num, WriteBuffer & ostr, const FormatSettings & settings) const override
-    {
-        writeChar('"', ostr);
-        serializeText(column, row_num, ostr, settings);
-        writeChar('"', ostr);
-    }
-    void deserializeTextCSV(IColumn & column, ReadBuffer & istr, const FormatSettings &/* settings*/) const override
-    {
-        IPv value;
-        readCSV(value, istr);
+    void serializeTextCSV(const IColumn & column, size_t row_num, WriteBuffer & ostr, const FormatSettings & settings) const override;
+    void deserializeTextCSV(IColumn & column, ReadBuffer & istr, const FormatSettings &/* settings*/) const override;
+    bool tryDeserializeTextCSV(IColumn & column, ReadBuffer & istr, const FormatSettings &/* settings*/) const override;
 
-        assert_cast<ColumnVector<IPv> &>(column).getData().push_back(value);
-    }
+    void serializeBinary(const Field & field, WriteBuffer & ostr, const FormatSettings &) const override;
+    void deserializeBinary(Field & field, ReadBuffer & istr, const FormatSettings &) const override;
 
-    void serializeBinary(const Field & field, WriteBuffer & ostr, const FormatSettings &) const override
-    {
-        IPv x = field.get<IPv>();
-        if constexpr (std::is_same_v<IPv, IPv6>)
-            writeBinary(x, ostr);
-        else
-            writeBinaryLittleEndian(x, ostr);
-    }
-    void deserializeBinary(Field & field, ReadBuffer & istr, const FormatSettings &) const override
-    {
-        IPv x;
-        if constexpr (std::is_same_v<IPv, IPv6>)
-            readBinary(x, istr);
-        else
-            readBinaryLittleEndian(x, istr);
-        field = NearestFieldType<IPv>(x);
-    }
-    void serializeBinary(const IColumn & column, size_t row_num, WriteBuffer & ostr, const FormatSettings &) const override
-    {
-        writeBinary(assert_cast<const ColumnVector<IPv> &>(column).getData()[row_num], ostr);
-    }
-    void deserializeBinary(IColumn & column, ReadBuffer & istr, const FormatSettings &) const override
-    {
-        IPv x;
-        readBinary(x.toUnderType(), istr);
-        assert_cast<ColumnVector<IPv> &>(column).getData().push_back(x);
-    }
-    void serializeBinaryBulk(const IColumn & column, WriteBuffer & ostr, size_t offset, size_t limit) const override
-    {
-        const typename ColumnVector<IPv>::Container & x = typeid_cast<const ColumnVector<IPv> &>(column).getData();
+    void serializeBinary(const IColumn & column, size_t row_num, WriteBuffer & ostr, const FormatSettings &) const override;
+    void deserializeBinary(IColumn & column, ReadBuffer & istr, const FormatSettings &) const override;
 
-        size_t size = x.size();
-
-        if (limit == 0 || offset + limit > size)
-            limit = size - offset;
-
-        if (limit)
-            ostr.write(reinterpret_cast<const char *>(&x[offset]), sizeof(IPv) * limit);
-    }
-    void deserializeBinaryBulk(IColumn & column, ReadBuffer & istr, size_t limit, double /*avg_value_size_hint*/) const override
-    {
-        typename ColumnVector<IPv>::Container & x = typeid_cast<ColumnVector<IPv> &>(column).getData();
-        size_t initial_size = x.size();
-        x.resize(initial_size + limit);
-        size_t size = istr.readBig(reinterpret_cast<char *>(&x[initial_size]), sizeof(IPv) * limit);
-        x.resize(initial_size + size / sizeof(IPv));
-    }
+    void serializeBinaryBulk(const IColumn & column, WriteBuffer & ostr, size_t offset, size_t limit) const override;
+    void deserializeBinaryBulk(IColumn & column, ReadBuffer & istr, size_t limit, double /*avg_value_size_hint*/) const override;
 };
 
 using SerializationIPv4 = SerializationIP<IPv4>;
 using SerializationIPv6 = SerializationIP<IPv6>;
diff --git a/src/DataTypes/Serializations/SerializationLowCardinality.cpp b/src/DataTypes/Serializations/SerializationLowCardinality.cpp
index 3e1cbdb00f5..9efe05042ed 100644
--- a/src/DataTypes/Serializations/SerializationLowCardinality.cpp
+++ b/src/DataTypes/Serializations/SerializationLowCardinality.cpp
@@ -700,6 +700,11 @@ void SerializationLowCardinality::deserializeTextEscaped(IColumn & column, ReadB
     deserializeImpl(column, &ISerialization::deserializeTextEscaped, istr, settings);
 }
 
+bool SerializationLowCardinality::tryDeserializeTextEscaped(IColumn & column, ReadBuffer & istr, const FormatSettings & settings) const
+{
+    return tryDeserializeImpl(column, &ISerialization::tryDeserializeTextEscaped, istr, settings);
+}
+
 void SerializationLowCardinality::serializeTextQuoted(const IColumn & column, size_t row_num, WriteBuffer & ostr, const FormatSettings & settings) const
 {
     serializeImpl(column, row_num, &ISerialization::serializeTextQuoted, ostr, settings);
@@ -710,11 +715,21 @@ void SerializationLowCardinality::deserializeTextQuoted(IColumn & column, ReadBu
     deserializeImpl(column, &ISerialization::deserializeTextQuoted, istr, settings);
 }
 
+bool SerializationLowCardinality::tryDeserializeTextQuoted(IColumn & column, ReadBuffer & istr, const FormatSettings & settings) const
+{
+    return tryDeserializeImpl(column, &ISerialization::tryDeserializeTextQuoted, istr, settings);
+}
+
 void SerializationLowCardinality::deserializeWholeText(IColumn & column, ReadBuffer & istr, const FormatSettings & settings) const
 {
     deserializeImpl(column, &ISerialization::deserializeWholeText, istr, settings);
 }
 
+bool SerializationLowCardinality::tryDeserializeWholeText(IColumn & column, ReadBuffer & istr, const FormatSettings & settings) const
+{
+    return tryDeserializeImpl(column, &ISerialization::tryDeserializeWholeText, istr, settings);
+}
+
 void SerializationLowCardinality::serializeTextCSV(const IColumn & column, size_t row_num, WriteBuffer & ostr, const FormatSettings & settings) const
 {
     serializeImpl(column, row_num, &ISerialization::serializeTextCSV, ostr, settings);
@@ -725,6 +740,11 @@ void SerializationLowCardinality::deserializeTextCSV(IColumn & column, ReadBuffe
     deserializeImpl(column, &ISerialization::deserializeTextCSV, istr, settings);
 }
 
+bool SerializationLowCardinality::tryDeserializeTextCSV(IColumn & column, ReadBuffer & istr, const FormatSettings & settings) const
+{
+    return tryDeserializeImpl(column, &ISerialization::tryDeserializeTextCSV, istr, settings);
+}
+
 void SerializationLowCardinality::serializeText(const IColumn & column, size_t row_num, WriteBuffer & ostr, const FormatSettings & settings) const
 {
     serializeImpl(column, row_num, &ISerialization::serializeText, ostr, settings);
@@ -740,6 +760,11 @@ void SerializationLowCardinality::deserializeTextJSON(IColumn & column, ReadBuff
     deserializeImpl(column, &ISerialization::deserializeTextJSON, istr, settings);
 }
 
+bool SerializationLowCardinality::tryDeserializeTextJSON(IColumn & column, ReadBuffer & istr, const FormatSettings & settings) const
+{
+    return tryDeserializeImpl(column, &ISerialization::tryDeserializeTextJSON, istr, settings);
+}
+
 void SerializationLowCardinality::serializeTextXML(const IColumn & column, size_t row_num, WriteBuffer & ostr, const FormatSettings & settings) const
 {
     serializeImpl(column, row_num, &ISerialization::serializeTextXML, ostr, settings);
@@ -750,6 +775,11 @@ void SerializationLowCardinality::deserializeTextRaw(IColumn & column, ReadBuffe
     deserializeImpl(column, &ISerialization::deserializeTextRaw, istr, settings);
 }
 
+bool SerializationLowCardinality::tryDeserializeTextRaw(IColumn & column, ReadBuffer & istr, const FormatSettings & settings) const
+{
+    return tryDeserializeImpl(column, &ISerialization::tryDeserializeTextRaw, istr, settings);
+}
+
 void SerializationLowCardinality::serializeTextRaw(const IColumn & column, size_t row_num, WriteBuffer & ostr, const FormatSettings & settings) const
 {
     serializeImpl(column, row_num, &ISerialization::serializeTextRaw, ostr, settings);
@@ -769,7 +799,7 @@ template <typename... Params, typename... Args>
 void SerializationLowCardinality::deserializeImpl(
     IColumn & column, SerializationLowCardinality::DeserializeFunctionPtr<Params...> func, Args &&... args) const
 {
-    auto & low_cardinality_column= getColumnLowCardinality(column);
+    auto & low_cardinality_column = getColumnLowCardinality(column);
     auto temp_column = low_cardinality_column.getDictionary().getNestedColumn()->cloneEmpty();
 
     auto serialization = dictionary_type->getDefaultSerialization();
@@ -778,4 +808,19 @@ void SerializationLowCardinality::deserializeImpl(
     low_cardinality_column.insertFromFullColumn(*temp_column, 0);
 }
 
+template <typename... Params, typename... Args>
+bool SerializationLowCardinality::tryDeserializeImpl(
+    IColumn & column, SerializationLowCardinality::TryDeserializeFunctionPtr<Params...> func, Args &&... args) const
+{
+    auto & low_cardinality_column = getColumnLowCardinality(column);
+    auto temp_column = low_cardinality_column.getDictionary().getNestedColumn()->cloneEmpty();
+
+    auto serialization = dictionary_type->getDefaultSerialization();
+    if (!(serialization.get()->*func)(*temp_column, std::forward<Args>(args)...))
+        return false;
+
+    low_cardinality_column.insertFromFullColumn(*temp_column, 0);
+    return true;
+}
+
 }
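tryDeserializeImpl mirrors deserializeImpl's pointer-to-member dispatch, which is what keeps each wrapper above a single line. A self-contained sketch of the technique, using simplified stand-in types rather than the real ISerialization:

    #include <utility>

    struct Serialization
    {
        bool tryParseA(int & out, int base) const { out = 10 * base; return true; }
        bool tryParseB(int & out, int base) const { out = 20 * base; return true; }
    };

    /// Alias over the shared signature of all try-style members.
    template <typename... Params>
    using TryFn = bool (Serialization::*)(Params...) const;

    /// One generic forwarder instead of one hand-written wrapper per method.
    template <typename... Params, typename... Args>
    bool dispatch(const Serialization & s, TryFn<Params...> fn, Args &&... args)
    {
        return (s.*fn)(std::forward<Args>(args)...);   /// same call shape as (serialization.get()->*func)(...)
    }

    int main()
    {
        Serialization s;
        int v = 0;
        return dispatch(s, &Serialization::tryParseA, v, 2) && v == 20 ? 0 : 1;
    }

The only difference from deserializeImpl is the bool return threaded through, so a failed parse leaves the dictionary untouched.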
diff --git a/src/DataTypes/Serializations/SerializationLowCardinality.h b/src/DataTypes/Serializations/SerializationLowCardinality.h
index 5f56bcf8108..d2c3a95c702 100644
--- a/src/DataTypes/Serializations/SerializationLowCardinality.h
+++ b/src/DataTypes/Serializations/SerializationLowCardinality.h
@@ -55,16 +55,22 @@ public:
     void deserializeBinary(IColumn & column, ReadBuffer & istr, const FormatSettings & settings) const override;
     void serializeTextEscaped(const IColumn & column, size_t row_num, WriteBuffer & ostr, const FormatSettings & settings) const override;
     void deserializeTextEscaped(IColumn & column, ReadBuffer & istr, const FormatSettings & settings) const override;
+    bool tryDeserializeTextEscaped(IColumn & column, ReadBuffer & istr, const FormatSettings & settings) const override;
     void serializeTextQuoted(const IColumn & column, size_t row_num, WriteBuffer & ostr, const FormatSettings & settings) const override;
     void deserializeTextQuoted(IColumn & column, ReadBuffer & istr, const FormatSettings & settings) const override;
+    bool tryDeserializeTextQuoted(IColumn & column, ReadBuffer & istr, const FormatSettings & settings) const override;
     void deserializeWholeText(IColumn & column, ReadBuffer & istr, const FormatSettings & settings) const override;
+    bool tryDeserializeWholeText(IColumn & column, ReadBuffer & istr, const FormatSettings & settings) const override;
     void serializeTextCSV(const IColumn & column, size_t row_num, WriteBuffer & ostr, const FormatSettings & settings) const override;
     void deserializeTextCSV(IColumn & column, ReadBuffer & istr, const FormatSettings & settings) const override;
+    bool tryDeserializeTextCSV(IColumn & column, ReadBuffer & istr, const FormatSettings & settings) const override;
     void serializeText(const IColumn & column, size_t row_num, WriteBuffer & ostr, const FormatSettings & settings) const override;
     void serializeTextJSON(const IColumn & column, size_t row_num, WriteBuffer & ostr, const FormatSettings & settings) const override;
     void deserializeTextJSON(IColumn & column, ReadBuffer & istr, const FormatSettings & settings) const override;
+    bool tryDeserializeTextJSON(IColumn & column, ReadBuffer & istr, const FormatSettings & settings) const override;
     void serializeTextXML(const IColumn & column, size_t row_num, WriteBuffer & ostr, const FormatSettings & settings) const override;
     void deserializeTextRaw(IColumn & column, ReadBuffer & istr, const FormatSettings & settings) const override;
+    bool tryDeserializeTextRaw(IColumn & column, ReadBuffer & istr, const FormatSettings & settings) const override;
     void serializeTextRaw(const IColumn & column, size_t row_num, WriteBuffer & ostr, const FormatSettings & settings) const override;
 
 private:
@@ -79,6 +85,12 @@ private:
     template <typename... Params, typename... Args>
     void deserializeImpl(IColumn & column, DeserializeFunctionPtr<Params...> func, Args &&... args) const;
+
+    template <typename... Params>
+    using TryDeserializeFunctionPtr = bool (ISerialization::*)(IColumn &, Params ...) const;
+
+    template <typename... Params, typename... Args>
+    bool tryDeserializeImpl(IColumn & column, TryDeserializeFunctionPtr<Params...> func, Args &&... args) const;
 };
 
 }
diff --git a/src/DataTypes/Serializations/SerializationMap.cpp b/src/DataTypes/Serializations/SerializationMap.cpp
index 7588e630689..7b6f87baf2e 100644
--- a/src/DataTypes/Serializations/SerializationMap.cpp
+++ b/src/DataTypes/Serializations/SerializationMap.cpp
@@ -115,9 +115,11 @@ void SerializationMap::serializeTextImpl(
     writeChar('}', ostr);
 }
 
-template <typename Reader>
-void SerializationMap::deserializeTextImpl(IColumn & column, ReadBuffer & istr, Reader && reader) const
+template <typename ReturnType, typename Reader>
+ReturnType SerializationMap::deserializeTextImpl(IColumn & column, ReadBuffer & istr, Reader && reader) const
 {
+    static constexpr bool throw_exception = std::is_same_v<ReturnType, void>;
+
     auto & column_map = assert_cast<ColumnMap &>(column);
 
     auto & nested_array = column_map.getNestedColumn();
@@ -128,7 +130,21 @@ void SerializationMap::deserializeTextImpl(IColumn & column, ReadBuffer & istr,
     auto & value_column = nested_tuple.getColumn(1);
 
     size_t size = 0;
-    assertChar('{', istr);
+    if constexpr (throw_exception)
+        assertChar('{', istr);
+    else if (!checkChar('{', istr))
+        return ReturnType(false);
+
+    auto on_error_no_throw = [&]()
+    {
+        if (size)
+        {
+            nested_tuple.getColumnPtr(0) = key_column.cut(0, offsets.back());
+            nested_tuple.getColumnPtr(1) = value_column.cut(0, offsets.back());
+        }
+
+        return ReturnType(false);
+    };
 
     try
     {
@@ -138,9 +154,15 @@ void SerializationMap::deserializeTextImpl(IColumn & column, ReadBuffer & istr,
         if (!first)
         {
             if (*istr.position() == ',')
+            {
                 ++istr.position();
+            }
             else
-                throw Exception(ErrorCodes::CANNOT_READ_MAP_FROM_TEXT, "Cannot read Map from text");
+            {
+                if constexpr (throw_exception)
+                    throw Exception(ErrorCodes::CANNOT_READ_MAP_FROM_TEXT, "Cannot read Map from text");
+                return on_error_no_throw();
+            }
         }
 
         first = false;
@@ -150,19 +172,32 @@ void SerializationMap::deserializeTextImpl(IColumn & column, ReadBuffer & istr,
             if (*istr.position() == '}')
                 break;
 
-            reader(istr, key, key_column);
+            if constexpr (throw_exception)
+                reader(istr, key, key_column);
+            else if (!reader(istr, key, key_column))
+                return on_error_no_throw();
+
             ++size;
             skipWhitespaceIfAny(istr);
-            assertChar(':', istr);
+            if constexpr (throw_exception)
+                assertChar(':', istr);
+            else if (!checkChar(':', istr))
+                return on_error_no_throw();
 
             skipWhitespaceIfAny(istr);
-            reader(istr, value, value_column);
+            if constexpr (throw_exception)
+                reader(istr, value, value_column);
+            else if (!reader(istr, value, value_column))
+                return on_error_no_throw();
 
             skipWhitespaceIfAny(istr);
         }
 
-        assertChar('}', istr);
+        if constexpr (throw_exception)
+            assertChar('}', istr);
+        else if (!checkChar('}', istr))
+            return on_error_no_throw();
     }
     catch (...)
     {
@@ -171,10 +206,14 @@
             nested_tuple.getColumnPtr(0) = key_column.cut(0, offsets.back());
             nested_tuple.getColumnPtr(1) = value_column.cut(0, offsets.back());
         }
-        throw;
+
+        if constexpr (throw_exception)
+            throw;
+        return ReturnType(false);
     }
 
     offsets.push_back(offsets.back() + size);
+    return ReturnType(true);
 }
 
 void SerializationMap::serializeText(const IColumn & column, size_t row_num, WriteBuffer & ostr, const FormatSettings & settings) const
@@ -192,8 +231,8 @@ void SerializationMap::deserializeText(IColumn & column, ReadBuffer & istr, cons
     deserializeTextImpl(column, istr,
         [&settings](ReadBuffer & buf, const SerializationPtr & subcolumn_serialization, IColumn & subcolumn)
         {
-            if (settings.null_as_default)
-                SerializationNullable::deserializeTextQuotedImpl(subcolumn, buf, settings, subcolumn_serialization);
+            if (settings.null_as_default && !isColumnNullableOrLowCardinalityNullable(subcolumn))
+                SerializationNullable::deserializeNullAsDefaultOrNestedTextQuoted(subcolumn, buf, settings, subcolumn_serialization);
             else
                 subcolumn_serialization->deserializeTextQuoted(subcolumn, buf, settings);
         });
@@ -202,6 +241,28 @@ void SerializationMap::deserializeText(IColumn & column, ReadBuffer & istr, cons
         throwUnexpectedDataAfterParsedValue(column, istr, settings, "Map");
 }
 
+bool SerializationMap::tryDeserializeText(IColumn & column, ReadBuffer & istr, const FormatSettings & settings, bool whole) const
+{
+    auto reader = [&settings](ReadBuffer & buf, const SerializationPtr & subcolumn_serialization, IColumn & subcolumn)
+    {
+        if (settings.null_as_default && !isColumnNullableOrLowCardinalityNullable(subcolumn))
+            return SerializationNullable::tryDeserializeNullAsDefaultOrNestedTextQuoted(subcolumn, buf, settings, subcolumn_serialization);
+        return subcolumn_serialization->tryDeserializeTextQuoted(subcolumn, buf, settings);
+    };
+
+    auto ok = deserializeTextImpl<bool>(column, istr, reader);
+    if (!ok)
+        return false;
+
+    if (whole && !istr.eof())
+    {
+        column.popBack(1);
+        return false;
+    }
+
+    return true;
+}
+
 void SerializationMap::serializeTextJSON(const IColumn & column, size_t row_num, WriteBuffer & ostr, const FormatSettings & settings) const
 {
     serializeTextImpl(column, row_num, ostr,
@@ -260,13 +321,25 @@ void SerializationMap::deserializeTextJSON(IColumn & column, ReadBuffer & istr,
     deserializeTextImpl(column, istr,
         [&settings](ReadBuffer & buf, const SerializationPtr & subcolumn_serialization, IColumn & subcolumn)
         {
-            if (settings.null_as_default)
-                SerializationNullable::deserializeTextJSONImpl(subcolumn, buf, settings, subcolumn_serialization);
+            if (settings.null_as_default && !isColumnNullableOrLowCardinalityNullable(subcolumn))
+                SerializationNullable::deserializeNullAsDefaultOrNestedTextJSON(subcolumn, buf, settings, subcolumn_serialization);
             else
                 subcolumn_serialization->deserializeTextJSON(subcolumn, buf, settings);
         });
 }
 
+bool SerializationMap::tryDeserializeTextJSON(IColumn & column, ReadBuffer & istr, const FormatSettings & settings) const
+{
+    auto reader = [&settings](ReadBuffer & buf, const SerializationPtr & subcolumn_serialization, IColumn & subcolumn)
+    {
+        if (settings.null_as_default && !isColumnNullableOrLowCardinalityNullable(subcolumn))
+            return SerializationNullable::tryDeserializeNullAsDefaultOrNestedTextJSON(subcolumn, buf, settings, subcolumn_serialization);
+        return subcolumn_serialization->tryDeserializeTextJSON(subcolumn, buf, settings);
+    };
+
+    return deserializeTextImpl<bool>(column, istr, reader);
+}
+
 void SerializationMap::serializeTextXML(const IColumn & column, size_t row_num, WriteBuffer & ostr, const FormatSettings & settings) const
 {
     const auto & column_map = assert_cast<const ColumnMap &>(column);
@@ -308,6 +381,15 @@ void SerializationMap::deserializeTextCSV(IColumn & column, ReadBuffer & istr, c
     deserializeText(column, rb, settings, true);
 }
 
+bool SerializationMap::tryDeserializeTextCSV(IColumn & column, ReadBuffer & istr, const FormatSettings & settings) const
+{
+    String s;
+    if (!tryReadCSV(s, istr, settings.csv))
+        return false;
+    ReadBufferFromString rb(s);
+    return tryDeserializeText(column, rb, settings, true);
+}
+
 void SerializationMap::enumerateStreams(
     EnumerateStreamsSettings & settings,
     const StreamCallback & callback,
diff --git a/src/DataTypes/Serializations/SerializationMap.h b/src/DataTypes/Serializations/SerializationMap.h
index f32c656757d..3e27ef1b04a 100644
--- a/src/DataTypes/Serializations/SerializationMap.h
+++ b/src/DataTypes/Serializations/SerializationMap.h
@@ -24,13 +24,16 @@ public:
     void deserializeBinary(IColumn & column, ReadBuffer & istr, const FormatSettings &) const override;
     void serializeText(const IColumn & column, size_t row_num, WriteBuffer & ostr, const FormatSettings &) const override;
     void deserializeText(IColumn & column, ReadBuffer & istr, const FormatSettings &, bool whole) const override;
+    bool tryDeserializeText(IColumn & column, ReadBuffer & istr, const FormatSettings &, bool whole) const override;
     void serializeTextJSON(const IColumn & column, size_t row_num, WriteBuffer & ostr, const FormatSettings &) const override;
     void deserializeTextJSON(IColumn & column, ReadBuffer & istr, const FormatSettings &) const override;
+    bool tryDeserializeTextJSON(IColumn & column, ReadBuffer & istr, const FormatSettings &) const override;
     void serializeTextJSONPretty(const IColumn & column, size_t row_num, WriteBuffer & ostr, const FormatSettings & settings, size_t indent) const override;
     void serializeTextXML(const IColumn & column, size_t row_num, WriteBuffer & ostr, const FormatSettings &) const override;
     void serializeTextCSV(const IColumn & column, size_t row_num, WriteBuffer & ostr, const FormatSettings &) const override;
     void deserializeTextCSV(IColumn & column, ReadBuffer & istr, const FormatSettings &) const override;
+    bool tryDeserializeTextCSV(IColumn & column, ReadBuffer & istr, const FormatSettings &) const override;
 
     void enumerateStreams(
         EnumerateStreamsSettings & settings,
@@ -68,8 +71,8 @@ private:
     template <typename KeyWriter, typename ValueWriter>
     void serializeTextImpl(const IColumn & column, size_t row_num, WriteBuffer & ostr, KeyWriter && key_writer, ValueWriter && value_writer) const;
 
-    template <typename Reader>
-    void deserializeTextImpl(IColumn & column, ReadBuffer & istr, Reader && reader) const;
+    template <typename ReturnType = void, typename Reader>
+    ReturnType deserializeTextImpl(IColumn & column, ReadBuffer & istr, Reader && reader) const;
 };
 
 }
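Both Map entry points funnel into one deserializeTextImpl body that is instantiated twice: with ReturnType = void (throwing) and ReturnType = bool (the try flavour). A minimal compilable sketch of that idiom, with illustrative names only:

    #include <stdexcept>
    #include <type_traits>

    /// One body, two behaviours. `return ReturnType(false);` is legal even when
    /// ReturnType is void: a functional cast to void is a valid discarded expression.
    template <typename ReturnType = void>
    ReturnType parseDigit(char c, int & out)
    {
        static constexpr bool throw_exception = std::is_same_v<ReturnType, void>;

        if (c < '0' || c > '9')
        {
            if constexpr (throw_exception)
                throw std::invalid_argument("not a digit");
            return ReturnType(false);
        }

        out = c - '0';
        return ReturnType(true);
    }

    int main()
    {
        int v = 0;
        parseDigit('7', v);                   /// throwing flavour, ReturnType = void
        bool ok = parseDigit<bool>('x', v);   /// try flavour, reports failure instead
        return ok ? 1 : 0;
    }

Branches that only make sense for one instantiation are guarded by `if constexpr (throw_exception)`, so each instantiation compiles only its own error-handling path.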
diff --git a/src/DataTypes/Serializations/SerializationNamed.cpp b/src/DataTypes/Serializations/SerializationNamed.cpp
index ca60948ce68..1a9cbe9a37d 100644
--- a/src/DataTypes/Serializations/SerializationNamed.cpp
+++ b/src/DataTypes/Serializations/SerializationNamed.cpp
@@ -1,4 +1,5 @@
 #include <DataTypes/Serializations/SerializationNamed.h>
+#include
 
 namespace DB
 {
diff --git a/src/DataTypes/Serializations/SerializationNothing.h b/src/DataTypes/Serializations/SerializationNothing.h
index 02974d1ca76..7d1fff55b01 100644
--- a/src/DataTypes/Serializations/SerializationNothing.h
+++ b/src/DataTypes/Serializations/SerializationNothing.h
@@ -25,6 +25,7 @@ public:
     void deserializeBinary(IColumn &, ReadBuffer &, const FormatSettings &) const override { throwNoSerialization(); }
     void serializeText(const IColumn &, size_t, WriteBuffer &, const FormatSettings &) const override { throwNoSerialization(); }
     void deserializeText(IColumn &, ReadBuffer &, const FormatSettings &, bool) const override { throwNoSerialization(); }
+    bool tryDeserializeText(IColumn &, ReadBuffer &, const FormatSettings &, bool) const override { throwNoSerialization(); }
 
     /// These methods read and write zero bytes just to allow to figure out size of column.
     void serializeBinaryBulk(const IColumn & column, WriteBuffer & ostr, size_t offset, size_t limit) const override;
diff --git a/src/DataTypes/Serializations/SerializationNullable.cpp b/src/DataTypes/Serializations/SerializationNullable.cpp
index 15203bdc9fa..e7f0e61f2a5 100644
--- a/src/DataTypes/Serializations/SerializationNullable.cpp
+++ b/src/DataTypes/Serializations/SerializationNullable.cpp
@@ -187,55 +187,59 @@ void SerializationNullable::serializeBinary(const IColumn & column, size_t row_n
     nested->serializeBinary(col.getNestedColumn(), row_num, ostr, settings);
 }
 
-/// Deserialize value into ColumnNullable.
-/// We need to insert both to nested column and to null byte map, or, in case of exception, to not insert at all.
-template <typename ReturnType = void, typename CheckForNull, typename DeserializeNested>
-requires std::same_as<ReturnType, void>
-static ReturnType
-safeDeserialize(IColumn & column, const ISerialization &, CheckForNull && check_for_null, DeserializeNested && deserialize_nested)
+template <typename ReturnType>
+ReturnType safeAppendToNullMap(ColumnNullable & column, bool is_null)
 {
-    ColumnNullable & col = assert_cast<ColumnNullable &>(column);
-
-    if (check_for_null())
+    try
     {
-        col.insertDefault();
+        column.getNullMapData().push_back(is_null);
     }
-    else
+    catch (...)
     {
-        deserialize_nested(col.getNestedColumn());
-
-        try
-        {
-            col.getNullMapData().push_back(0);
-        }
-        catch (...)
-        {
-            col.getNestedColumn().popBack(1);
+        column.getNestedColumn().popBack(1);
+        if constexpr (std::is_same_v<ReturnType, void>)
             throw;
-        }
+        return ReturnType(false);
    }
+
+    return ReturnType(true);
 }
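safeAppendToNullMap exists because a Nullable column is physically two parallel arrays, and the second push_back can throw (for example on allocation failure) after the first already succeeded. A toy illustration of the invariant it restores; these are illustrative types, not ClickHouse's ColumnNullable:

    #include <vector>

    struct ToyNullable
    {
        std::vector<int> data;               /// nested values (default-filled where null)
        std::vector<unsigned char> null_map; /// 1 = null, 0 = value; must stay the same length as data

        void push(int value, bool is_null)
        {
            data.push_back(is_null ? 0 : value);
            try
            {
                null_map.push_back(is_null);
            }
            catch (...)
            {
                data.pop_back();   /// roll back the first insert so both arrays stay aligned
                throw;
            }
        }
    };

In the try flavour the same rollback happens, but the failure is reported as false instead of rethrown.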
 
-/// Deserialize value into non-nullable column. In case of NULL, insert default value and return false.
+/// Deserialize value into non-nullable column. In case of NULL, insert default and set is_null to true.
+/// If ReturnType is bool, return true if parsing was successful and false in case of any error.
 template <typename ReturnType = bool, typename CheckForNull, typename DeserializeNested>
-requires std::same_as<ReturnType, bool>
-static ReturnType
-safeDeserialize(IColumn & column, const ISerialization &, CheckForNull && check_for_null, DeserializeNested && deserialize_nested)
+static ReturnType deserializeImpl(IColumn & column, ReadBuffer & buf, CheckForNull && check_for_null, DeserializeNested && deserialize_nested, bool & is_null)
 {
-    bool insert_default = check_for_null();
-    if (insert_default)
+    is_null = check_for_null(buf);
+    if (is_null)
+    {
         column.insertDefault();
+    }
     else
-        deserialize_nested(column);
-    return !insert_default;
+    {
+        if constexpr (std::is_same_v<ReturnType, void>)
+            deserialize_nested(column, buf);
+        else if (!deserialize_nested(column, buf))
+            return ReturnType(false);
+    }
+
+    return ReturnType(true);
 }
 
 void SerializationNullable::deserializeBinary(IColumn & column, ReadBuffer & istr, const FormatSettings & settings) const
 {
-    safeDeserialize(column, *nested,
-        [&istr] { bool is_null = false; readBinary(is_null, istr); return is_null; },
-        [this, &istr, settings] (IColumn & nested_column) { nested->deserializeBinary(nested_column, istr, settings); });
+    ColumnNullable & col = assert_cast<ColumnNullable &>(column);
+    bool is_null;
+    auto check_for_null = [](ReadBuffer & buf)
+    {
+        bool is_null_ = false;
+        readBinary(is_null_, buf);
+        return is_null_;
+    };
+    auto deserialize_nested = [this, &settings] (IColumn & nested_column, ReadBuffer & buf) { nested->deserializeBinary(nested_column, buf, settings); };
+    deserializeImpl<void>(col.getNestedColumn(), istr, check_for_null, deserialize_nested, is_null);
+    safeAppendToNullMap<void>(col, is_null);
 }
 
@@ -244,20 +248,19 @@ void SerializationNullable::serializeTextEscaped(const IColumn & column, size_t
     const ColumnNullable & col = assert_cast<const ColumnNullable &>(column);
 
     if (col.isNullAt(row_num))
-        writeString(settings.tsv.null_representation, ostr);
+        serializeNullEscaped(ostr, settings);
     else
         nested->serializeTextEscaped(col.getNestedColumn(), row_num, ostr, settings);
 }
 
-
-void SerializationNullable::deserializeTextEscaped(IColumn & column, ReadBuffer & istr, const FormatSettings & settings) const
+void SerializationNullable::serializeNullEscaped(DB::WriteBuffer & ostr, const DB::FormatSettings & settings)
 {
-    deserializeTextEscapedImpl(column, istr, settings, nested);
+    writeString(settings.tsv.null_representation, ostr);
 }
 
-void SerializationNullable::deserializeTextRaw(IColumn & column, ReadBuffer & istr, const FormatSettings & settings) const
+bool SerializationNullable::tryDeserializeNullEscaped(DB::ReadBuffer & istr, const DB::FormatSettings & settings)
 {
-    deserializeTextRawImpl(column, istr, settings, nested);
+    return checkString(settings.tsv.null_representation, istr);
 }
 
 void SerializationNullable::serializeTextRaw(const IColumn & column, size_t row_num, WriteBuffer & ostr, const FormatSettings & settings) const
@@ -265,72 +268,73 @@ void SerializationNullable::serializeTextRaw(const IColumn & column, size_t row_
     const ColumnNullable & col = assert_cast<const ColumnNullable &>(column);
 
     if (col.isNullAt(row_num))
-        writeString(settings.tsv.null_representation, ostr);
+        serializeNullRaw(ostr, settings);
     else
         nested->serializeTextRaw(col.getNestedColumn(), row_num, ostr, settings);
 }
 
-template <typename ReturnType>
-ReturnType SerializationNullable::deserializeTextRawImpl(IColumn & column, ReadBuffer & istr, const FormatSettings & settings, const SerializationPtr & nested)
+void SerializationNullable::serializeNullRaw(DB::WriteBuffer & ostr, const DB::FormatSettings & settings)
 {
-    return deserializeTextEscapedAndRawImpl<ReturnType, false>(column, istr, settings, nested);
+    writeString(settings.tsv.null_representation, ostr);
 }
 
-template <typename ReturnType>
-ReturnType SerializationNullable::deserializeTextEscapedImpl(IColumn & column, ReadBuffer & istr, const FormatSettings & settings,
-                                                    const SerializationPtr & nested)
+bool SerializationNullable::tryDeserializeNullRaw(DB::ReadBuffer & istr, const DB::FormatSettings & settings)
 {
-    return deserializeTextEscapedAndRawImpl<ReturnType, true>(column, istr, settings, nested);
+    return checkString(settings.tsv.null_representation, istr);
 }
 
 template <typename ReturnType, bool escaped>
-ReturnType SerializationNullable::deserializeTextEscapedAndRawImpl(IColumn & column, ReadBuffer & istr, const FormatSettings & settings,
-                                                    const SerializationPtr & nested_serialization)
+ReturnType deserializeTextEscapedAndRawImpl(IColumn & column, ReadBuffer & istr, const FormatSettings & settings, const SerializationPtr & nested_serialization, bool & is_null)
 {
+    static constexpr bool throw_exception = std::is_same_v<ReturnType, void>;
+
     const String & null_representation = settings.tsv.null_representation;
+    auto deserialize_nested = [&nested_serialization, &settings] (IColumn & nested_column, ReadBuffer & buf_)
+    {
+        if constexpr (throw_exception)
+        {
+            if constexpr (escaped)
+                nested_serialization->deserializeTextEscaped(nested_column, buf_, settings);
+            else
+                nested_serialization->deserializeTextRaw(nested_column, buf_, settings);
+        }
+        else
+        {
+            if constexpr (escaped)
+                return nested_serialization->tryDeserializeTextEscaped(nested_column, buf_, settings);
+            else
+                return nested_serialization->tryDeserializeTextRaw(nested_column, buf_, settings);
+        }
+    };
 
     /// Some data types can deserialize absence of data (e.g. empty string), so eof is ok.
     if (istr.eof() || (!null_representation.empty() && *istr.position() != null_representation[0]))
     {
         /// This is not null, surely.
-        return safeDeserialize<ReturnType>(column, *nested_serialization,
-            [] { return false; },
-            [&nested_serialization, &istr, &settings] (IColumn & nested_column)
-            {
-                if constexpr (escaped)
-                    nested_serialization->deserializeTextEscaped(nested_column, istr, settings);
-                else
-                    nested_serialization->deserializeTextRaw(nested_column, istr, settings);
-            });
+        return deserializeImpl<ReturnType>(column, istr, [](ReadBuffer &){ return false; }, deserialize_nested, is_null);
     }
 
     /// Check if we have enough data in buffer to check if it's a null.
     if (istr.available() > null_representation.size())
     {
-        auto check_for_null = [&istr, &null_representation]()
+        auto check_for_null = [&null_representation](ReadBuffer & buf)
         {
-            auto * pos = istr.position();
-            if (checkString(null_representation, istr) && (*istr.position() == '\t' || *istr.position() == '\n'))
+            auto * pos = buf.position();
+            if (checkString(null_representation, buf) && (*buf.position() == '\t' || *buf.position() == '\n'))
                 return true;
-            istr.position() = pos;
+            buf.position() = pos;
             return false;
        };
-        auto deserialize_nested = [&nested_serialization, &settings, &istr] (IColumn & nested_column)
-        {
-            if constexpr (escaped)
-                nested_serialization->deserializeTextEscaped(nested_column, istr, settings);
-            else
-                nested_serialization->deserializeTextRaw(nested_column, istr, settings);
-        };
-        return safeDeserialize<ReturnType>(column, *nested_serialization, check_for_null, deserialize_nested);
+        return deserializeImpl<ReturnType>(column, istr, check_for_null, deserialize_nested, is_null);
     }
 
     /// We don't have enough data in buffer to check if it's a null.
     /// Use PeekableReadBuffer to make a checkpoint before checking null
     /// representation and rollback if check was failed.
-    PeekableReadBuffer buf(istr, true);
-    auto check_for_null = [&buf, &null_representation]()
+    PeekableReadBuffer peekable_buf(istr, true);
+    auto check_for_null = [&null_representation](ReadBuffer & buf_)
     {
+        auto & buf = assert_cast<PeekableReadBuffer &>(buf_);
         buf.setCheckpoint();
         SCOPE_EXIT(buf.dropCheckpoint());
         if (checkString(null_representation, buf) && (buf.eof() || *buf.position() == '\t' || *buf.position() == '\n'))
@@ -340,16 +344,18 @@
         return false;
     };
 
-    auto deserialize_nested = [&nested_serialization, &settings, &buf, &null_representation, &istr] (IColumn & nested_column)
+    auto deserialize_nested_with_check = [&deserialize_nested, &nested_serialization, &settings, &null_representation, &istr] (IColumn & nested_column, ReadBuffer & buf_)
     {
+        auto & buf = assert_cast<PeekableReadBuffer &>(buf_);
         auto * pos = buf.position();
-        if constexpr (escaped)
-            nested_serialization->deserializeTextEscaped(nested_column, buf, settings);
-        else
-            nested_serialization->deserializeTextRaw(nested_column, buf, settings);
+        if constexpr (throw_exception)
+            deserialize_nested(nested_column, buf);
+        else if (!deserialize_nested(nested_column, buf))
+            return ReturnType(false);
+
        /// Check that we don't have any unread data in PeekableReadBuffer own memory.
         if (likely(!buf.hasUnreadData()))
-            return;
+            return ReturnType(true);
 
         /// We have some unread data in PeekableReadBuffer own memory.
         /// It can happen only if there is a string instead of a number
         /// or if someone uses tab or LF in TSV null_representation.
         /// We also should delete incorrectly deserialized value from nested column.
         nested_column.popBack(1);
 
+        if constexpr (!throw_exception)
+            return ReturnType(false);
+
         if (null_representation.find('\t') != std::string::npos || null_representation.find('\n') != std::string::npos)
             throw DB::ParsingException(ErrorCodes::CANNOT_READ_ALL_DATA, "TSV custom null representation "
                                        "containing '\\t' or '\\n' may not work correctly for large input.");
@@ -375,7 +384,63 @@
                 istr.count(), std::string(pos, buf.position() - pos), parsed_value.str());
     };
 
-    return safeDeserialize<ReturnType>(column, *nested_serialization, check_for_null, deserialize_nested);
+    return deserializeImpl<ReturnType>(column, peekable_buf, check_for_null, deserialize_nested_with_check, is_null);
+}
+
+void SerializationNullable::deserializeTextEscaped(IColumn & column, ReadBuffer & istr, const FormatSettings & settings) const
+{
+    ColumnNullable & col = assert_cast<ColumnNullable &>(column);
+    bool is_null;
+    deserializeTextEscapedAndRawImpl<void, true>(col.getNestedColumn(), istr, settings, nested, is_null);
+    safeAppendToNullMap<void>(col, is_null);
+}
+
+bool SerializationNullable::tryDeserializeTextEscaped(IColumn & column, ReadBuffer & istr, const FormatSettings & settings) const
+{
+    ColumnNullable & col = assert_cast<ColumnNullable &>(column);
+    bool is_null;
+    return deserializeTextEscapedAndRawImpl<bool, true>(col.getNestedColumn(), istr, settings, nested, is_null) && safeAppendToNullMap<bool>(col, is_null);
+}
+
+bool SerializationNullable::deserializeNullAsDefaultOrNestedTextEscaped(IColumn & nested_column, ReadBuffer & istr, const FormatSettings & settings, const SerializationPtr & nested_serialization)
+{
+    bool is_null;
+    deserializeTextEscapedAndRawImpl<void, true>(nested_column, istr, settings, nested_serialization, is_null);
+    return !is_null;
+}
+
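The PeekableReadBuffer dance above handles the case where the null representation may be longer than what is currently buffered: the probe must be able to un-read bytes when it fails partway through the token. A reduced model of the checkpoint/rollback contract over an in-memory buffer; this is an analogy, not PeekableReadBuffer itself, whose extra own-memory copy is what the hasUnreadData() check above guards against:

    #include <cstddef>
    #include <string_view>

    struct CheckpointedReader
    {
        std::string_view data;
        size_t pos = 0;
        size_t checkpoint = 0;

        void setCheckpoint() { checkpoint = pos; }
        void rollbackToCheckpoint() { pos = checkpoint; }

        /// Consume `token` if present; otherwise restore the position and report failure.
        bool checkString(std::string_view token)
        {
            setCheckpoint();
            if (data.substr(pos).starts_with(token))
            {
                pos += token.size();
                return true;
            }
            rollbackToCheckpoint();
            return false;
        }
    };

The fast path earlier in the function skips all of this when istr already has more bytes available than the null token is long, probing in place and resetting buf.position() by hand.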
+bool SerializationNullable::tryDeserializeNullAsDefaultOrNestedTextEscaped(IColumn & nested_column, ReadBuffer & istr, const FormatSettings & settings, const SerializationPtr & nested_serialization)
+{
+    bool is_null;
+    return deserializeTextEscapedAndRawImpl<bool, true>(nested_column, istr, settings, nested_serialization, is_null);
+}
+
+void SerializationNullable::deserializeTextRaw(IColumn & column, ReadBuffer & istr, const FormatSettings & settings) const
+{
+    ColumnNullable & col = assert_cast<ColumnNullable &>(column);
+    bool is_null;
+    deserializeTextEscapedAndRawImpl<void, false>(col.getNestedColumn(), istr, settings, nested, is_null);
+    safeAppendToNullMap<void>(col, is_null);
+}
+
+bool SerializationNullable::tryDeserializeTextRaw(IColumn & column, ReadBuffer & istr, const FormatSettings & settings) const
+{
+    ColumnNullable & col = assert_cast<ColumnNullable &>(column);
+    bool is_null;
+    return deserializeTextEscapedAndRawImpl<bool, false>(col.getNestedColumn(), istr, settings, nested, is_null) && safeAppendToNullMap<bool>(col, is_null);
+}
+
+bool SerializationNullable::deserializeNullAsDefaultOrNestedTextRaw(IColumn & nested_column, ReadBuffer & istr, const FormatSettings & settings, const SerializationPtr & nested_serialization)
+{
+    bool is_null;
+    deserializeTextEscapedAndRawImpl<void, false>(nested_column, istr, settings, nested_serialization, is_null);
+    return !is_null;
+}
+
+bool SerializationNullable::tryDeserializeNullAsDefaultOrNestedTextRaw(IColumn & nested_column, ReadBuffer & istr, const FormatSettings & settings, const SerializationPtr & nested_serialization)
+{
+    bool is_null;
+    return deserializeTextEscapedAndRawImpl<bool, false>(nested_column, istr, settings, nested_serialization, is_null);
 }
 
 void SerializationNullable::serializeTextQuoted(const IColumn & column, size_t row_num, WriteBuffer & ostr, const FormatSettings & settings) const
@@ -383,45 +448,51 @@ void SerializationNullable::serializeTextQuoted(const IColumn & column, size_t r
     const ColumnNullable & col = assert_cast<const ColumnNullable &>(column);
 
     if (col.isNullAt(row_num))
-        writeCString("NULL", ostr);
+        serializeNullQuoted(ostr);
     else
         nested->serializeTextQuoted(col.getNestedColumn(), row_num, ostr, settings);
 }
 
-
-void SerializationNullable::deserializeTextQuoted(IColumn & column, ReadBuffer & istr, const FormatSettings & settings) const
+void SerializationNullable::serializeNullQuoted(DB::WriteBuffer & ostr)
 {
-    deserializeTextQuotedImpl(column, istr, settings, nested);
+    writeCString("NULL", ostr);
+}
+
+bool SerializationNullable::tryDeserializeNullQuoted(DB::ReadBuffer & istr)
+{
+    return checkStringCaseInsensitive("NULL", istr);
 }
 
 template <typename ReturnType>
-ReturnType SerializationNullable::deserializeTextQuotedImpl(IColumn & column, ReadBuffer & istr, const FormatSettings & settings,
-                                                    const SerializationPtr & nested)
+ReturnType deserializeTextQuotedImpl(IColumn & column, ReadBuffer & istr, const FormatSettings & settings, const SerializationPtr & nested, bool & is_null)
 {
+    static constexpr bool throw_exception = std::is_same_v<ReturnType, void>;
+
+    auto deserialize_nested = [&nested, &settings] (IColumn & nested_column, ReadBuffer & buf)
+    {
+        if constexpr (!throw_exception)
+            return nested->tryDeserializeTextQuoted(nested_column, buf, settings);
+        nested->deserializeTextQuoted(nested_column, buf, settings);
+    };
+
     if (istr.eof() || (*istr.position() != 'N' && *istr.position() != 'n'))
     {
         /// This is not null, surely.
-        return safeDeserialize<ReturnType>(column, *nested,
-            [] { return false; },
-            [&nested, &istr, &settings] (IColumn & nested_column) { nested->deserializeTextQuoted(nested_column, istr, settings); });
+        return deserializeImpl<ReturnType>(column, istr, [](ReadBuffer &){ return false; }, deserialize_nested, is_null);
     }
 
     /// Check if we have enough data in buffer to check if it's a null.
     if (istr.available() >= 4)
     {
-        auto check_for_null = [&istr]()
+        auto check_for_null = [](ReadBuffer & buf)
         {
-            auto * pos = istr.position();
-            if (checkStringCaseInsensitive("NULL", istr))
+            auto * pos = buf.position();
+            if (checkStringCaseInsensitive("NULL", buf))
                 return true;
-            istr.position() = pos;
+            buf.position() = pos;
             return false;
        };
-        auto deserialize_nested = [&nested, &settings, &istr] (IColumn & nested_column)
-        {
-            nested->deserializeTextQuoted(nested_column, istr, settings);
-        };
-        return safeDeserialize<ReturnType>(column, *nested, check_for_null, deserialize_nested);
+        return deserializeImpl<ReturnType>(column, istr, check_for_null, deserialize_nested, is_null);
     }
 
     /// We don't have enough data in buffer to check if it's a NULL
     /// and we cannot check the rest of the buffer (for example, it's impossible
     /// to differentiate for example NULL and NaN for float)
     /// Use PeekableReadBuffer to make a checkpoint before checking
     /// null and rollback if the check was failed.
-    PeekableReadBuffer buf(istr, true);
-    auto check_for_null = [&buf]()
+    PeekableReadBuffer peekable_buf(istr, true);
+    auto check_for_null = [](ReadBuffer & buf_)
     {
+        auto & buf = assert_cast<PeekableReadBuffer &>(buf_);
         buf.setCheckpoint();
         SCOPE_EXIT(buf.dropCheckpoint());
         if (checkStringCaseInsensitive("NULL", buf))
@@ -441,39 +513,74 @@ ReturnType SerializationNullable::deserializeTextQuotedImpl(IColumn & column, Re
         return false;
    };
 
-    auto deserialize_nested = [&nested, &settings, &buf] (IColumn & nested_column)
+    auto deserialize_nested_with_check = [&deserialize_nested] (IColumn & nested_column, ReadBuffer & buf_)
     {
-        nested->deserializeTextQuoted(nested_column, buf, settings);
+        auto & buf = assert_cast<PeekableReadBuffer &>(buf_);
+
+        if constexpr (throw_exception)
+            deserialize_nested(nested_column, buf);
+        else if (!deserialize_nested(nested_column, buf))
+            return false;
+
         /// Check that we don't have any unread data in PeekableReadBuffer own memory.
         if (likely(!buf.hasUnreadData()))
-            return;
+            return ReturnType(true);
 
         /// We have some unread data in PeekableReadBuffer own memory.
         /// It can happen only if there is an unquoted string instead of a number.
         /// We also should delete incorrectly deserialized value from nested column.
         nested_column.popBack(1);
+
+        if constexpr (!throw_exception)
+            return ReturnType(false);
+
         throw DB::ParsingException(
             ErrorCodes::CANNOT_READ_ALL_DATA,
             "Error while parsing Nullable: got an unquoted string {} instead of a number",
             String(buf.position(), std::min(10ul, buf.available())));
     };
 
-    return safeDeserialize<ReturnType>(column, *nested, check_for_null, deserialize_nested);
+    return deserializeImpl<ReturnType>(column, peekable_buf, check_for_null, deserialize_nested_with_check, is_null);
 }
 
-void SerializationNullable::deserializeWholeText(IColumn & column, ReadBuffer & istr, const FormatSettings & settings) const
+void SerializationNullable::deserializeTextQuoted(IColumn & column, ReadBuffer & istr, const FormatSettings & settings) const
 {
-    deserializeWholeTextImpl(column, istr, settings, nested);
+    ColumnNullable & col = assert_cast<ColumnNullable &>(column);
+    bool is_null;
+    deserializeTextQuotedImpl<void>(col.getNestedColumn(), istr, settings, nested, is_null);
+    safeAppendToNullMap<void>(col, is_null);
+}
+
+bool SerializationNullable::tryDeserializeTextQuoted(IColumn & column, ReadBuffer & istr, const FormatSettings & settings) const
+{
+    ColumnNullable & col = assert_cast<ColumnNullable &>(column);
+    bool is_null;
+    return deserializeTextQuotedImpl<bool>(col.getNestedColumn(), istr, settings, nested, is_null) && safeAppendToNullMap<bool>(col, is_null);
+}
+
+bool SerializationNullable::deserializeNullAsDefaultOrNestedTextQuoted(DB::IColumn & nested_column, DB::ReadBuffer & istr, const DB::FormatSettings & settings, const DB::SerializationPtr & nested_serialization)
+{
+    bool is_null;
+    deserializeTextQuotedImpl<void>(nested_column, istr, settings, nested_serialization, is_null);
+    return !is_null;
+}
+
+bool SerializationNullable::tryDeserializeNullAsDefaultOrNestedTextQuoted(DB::IColumn & nested_column, DB::ReadBuffer & istr, const DB::FormatSettings & settings, const DB::SerializationPtr & nested_serialization)
+{
+    bool is_null;
+    return deserializeTextQuotedImpl<bool>(nested_column, istr, settings, nested_serialization, is_null);
 }
 
 template <typename ReturnType>
-ReturnType SerializationNullable::deserializeWholeTextImpl(IColumn & column, ReadBuffer & istr, const FormatSettings & settings,
-                                                    const SerializationPtr & nested)
+ReturnType deserializeWholeTextImpl(IColumn & column, ReadBuffer & istr, const FormatSettings & settings, const SerializationPtr & nested, bool & is_null)
 {
-    PeekableReadBuffer buf(istr, true);
-    auto check_for_null = [&buf]()
+    static constexpr bool throw_exception = std::is_same_v<ReturnType, void>;
+
+    PeekableReadBuffer peekable_buf(istr, true);
+    auto check_for_null = [](ReadBuffer & buf_)
     {
+        auto & buf = assert_cast<PeekableReadBuffer &>(buf_);
         buf.setCheckpoint();
         SCOPE_EXIT(buf.dropCheckpoint());
@@ -488,15 +595,46 @@ ReturnType SerializationNullable::deserializeWholeTextImpl(IColumn & column, Rea
         return false;
    };
 
-    auto deserialize_nested = [&nested, &settings, &buf] (IColumn & nested_column)
+    auto deserialize_nested = [&nested, &settings] (IColumn & nested_column, ReadBuffer & buf_)
     {
+        auto & buf = assert_cast<PeekableReadBuffer &>(buf_);
+        if constexpr (!throw_exception)
+            return nested->tryDeserializeWholeText(nested_column, buf, settings);
+
         nested->deserializeWholeText(nested_column, buf, settings);
         assert(!buf.hasUnreadData());
    };
 
-    return safeDeserialize<ReturnType>(column, *nested, check_for_null, deserialize_nested);
+    return deserializeImpl<ReturnType>(column, peekable_buf, check_for_null, deserialize_nested, is_null);
 }
 
+void SerializationNullable::deserializeWholeText(IColumn & column, ReadBuffer & istr, const FormatSettings & settings) const
+{
+    ColumnNullable & col = assert_cast<ColumnNullable &>(column);
+    bool is_null;
+    deserializeWholeTextImpl<void>(col.getNestedColumn(), istr, settings, nested, is_null);
+    safeAppendToNullMap<void>(col, is_null);
+}
+
+bool SerializationNullable::tryDeserializeWholeText(IColumn & column, ReadBuffer & istr, const FormatSettings & settings) const
+{
+    ColumnNullable & col = assert_cast<ColumnNullable &>(column);
+    bool is_null;
+    return deserializeWholeTextImpl<bool>(col.getNestedColumn(), istr, settings, nested, is_null) && safeAppendToNullMap<bool>(col, is_null);
+}
+
+bool SerializationNullable::deserializeNullAsDefaultOrNestedWholeText(DB::IColumn & nested_column, DB::ReadBuffer & istr, const DB::FormatSettings & settings, const DB::SerializationPtr & nested_serialization)
+{
+    bool is_null;
+    deserializeWholeTextImpl<void>(nested_column, istr, settings, nested_serialization, is_null);
+    return !is_null;
+}
+
+bool SerializationNullable::tryDeserializeNullAsDefaultOrNestedWholeText(DB::IColumn & nested_column, DB::ReadBuffer & istr, const DB::FormatSettings & settings, const DB::SerializationPtr & nested_serialization)
+{
+    bool is_null;
+    return deserializeWholeTextImpl<bool>(nested_column, istr, settings, nested_serialization, is_null);
+}
 
 void SerializationNullable::serializeTextCSV(const IColumn & column, size_t row_num, WriteBuffer & ostr, const FormatSettings & settings) const
 {
@@ -508,48 +646,56 @@ void SerializationNullable::serializeTextCSV(const IColumn & column, size_t row_
     nested->serializeTextCSV(col.getNestedColumn(), row_num, ostr, settings);
 }
 
-void SerializationNullable::deserializeTextCSV(IColumn & column, ReadBuffer & istr, const FormatSettings & settings) const
+void SerializationNullable::serializeNullCSV(DB::WriteBuffer & ostr, const DB::FormatSettings & settings)
 {
-    deserializeTextCSVImpl(column, istr, settings, nested);
+    writeString(settings.csv.null_representation, ostr);
+}
+
+bool SerializationNullable::tryDeserializeNullCSV(DB::ReadBuffer & istr, const DB::FormatSettings & settings)
+{
+    return checkString(settings.csv.null_representation, istr);
 }
 
 template <typename ReturnType>
-ReturnType SerializationNullable::deserializeTextCSVImpl(IColumn & column, ReadBuffer & istr, const FormatSettings & settings,
-                                                    const SerializationPtr & nested_serialization)
+ReturnType deserializeTextCSVImpl(IColumn & column, ReadBuffer & istr, const FormatSettings & settings, const SerializationPtr & nested_serialization, bool & is_null)
 {
+    static constexpr bool throw_exception = std::is_same_v<ReturnType, void>;
+
+    auto deserialize_nested = [&nested_serialization, &settings] (IColumn & nested_column, ReadBuffer & buf)
+    {
+        if constexpr (!throw_exception)
+            return nested_serialization->tryDeserializeTextCSV(nested_column, buf, settings);
+        nested_serialization->deserializeTextCSV(nested_column, buf, settings);
+    };
+
     const String & null_representation = settings.csv.null_representation;
     if (istr.eof() || (!null_representation.empty() && *istr.position() != null_representation[0]))
     {
         /// This is not null, surely.
-        return safeDeserialize<ReturnType>(column, *nested_serialization,
-            [] { return false; },
-            [&nested_serialization, &istr, &settings] (IColumn & nested_column) { nested_serialization->deserializeTextCSV(nested_column, istr, settings); });
+        return deserializeImpl<ReturnType>(column, istr, [](ReadBuffer &){ return false; }, deserialize_nested, is_null);
     }
 
     /// Check if we have enough data in buffer to check if it's a null.
     if (settings.csv.custom_delimiter.empty() && istr.available() > null_representation.size())
     {
-        auto check_for_null = [&istr, &null_representation, &settings]()
+        auto check_for_null = [&null_representation, &settings](ReadBuffer & buf)
         {
-            auto * pos = istr.position();
-            if (checkString(null_representation, istr) && (*istr.position() == settings.csv.delimiter || *istr.position() == '\r' || *istr.position() == '\n'))
+            auto * pos = buf.position();
+            if (checkString(null_representation, buf) && (*buf.position() == settings.csv.delimiter || *buf.position() == '\r' || *buf.position() == '\n'))
                 return true;
-            istr.position() = pos;
+            buf.position() = pos;
             return false;
        };
-        auto deserialize_nested = [&nested_serialization, &settings, &istr] (IColumn & nested_column)
-        {
-            nested_serialization->deserializeTextCSV(nested_column, istr, settings);
-        };
-        return safeDeserialize<ReturnType>(column, *nested_serialization, check_for_null, deserialize_nested);
+        return deserializeImpl<ReturnType>(column, istr, check_for_null, deserialize_nested, is_null);
     }
 
     /// We don't have enough data in buffer to check if it's a null.
     /// Use PeekableReadBuffer to make a checkpoint before checking null
     /// representation and rollback if the check was failed.
-    PeekableReadBuffer buf(istr, true);
-    auto check_for_null = [&buf, &null_representation, &settings]()
+    PeekableReadBuffer peekable_buf(istr, true);
+    auto check_for_null = [&null_representation, &settings](ReadBuffer & buf_)
     {
+        auto & buf = assert_cast<PeekableReadBuffer &>(buf_);
         buf.setCheckpoint();
         SCOPE_EXIT(buf.dropCheckpoint());
         if (checkString(null_representation, buf))
@@ -572,13 +718,18 @@ ReturnType SerializationNullable::deserializeTextCSVImpl(IColumn & column, ReadB
         return false;
    };
 
-    auto deserialize_nested = [&nested_serialization, &settings, &buf, &null_representation, &istr] (IColumn & nested_column)
+    auto deserialize_nested_with_check = [&deserialize_nested, &nested_serialization, &settings, &null_representation, &istr] (IColumn & nested_column, ReadBuffer & buf_)
     {
+        auto & buf = assert_cast<PeekableReadBuffer &>(buf_);
         auto * pos = buf.position();
-        nested_serialization->deserializeTextCSV(nested_column, buf, settings);
+        if constexpr (throw_exception)
+            deserialize_nested(nested_column, buf);
+        else if (!deserialize_nested(nested_column, buf))
+            return ReturnType(false);
+
         /// Check that we don't have any unread data in PeekableReadBuffer own memory.
         if (likely(!buf.hasUnreadData()))
-            return;
+            return ReturnType(true);
 
         /// We have some unread data in PeekableReadBuffer own memory.
         /// It can happen only if there is an unquoted string instead of a number
         /// or if someone uses csv delimiter, LF or CR in csv null representation.
         /// In the first case we cannot continue reading anyway. The second case seems to be unlikely.
         /// We also should delete incorrectly deserialized value from nested column.
         nested_column.popBack(1);
 
+        if constexpr (!throw_exception)
+            return ReturnType(false);
+
         if (null_representation.find(settings.csv.delimiter) != std::string::npos || null_representation.find('\r') != std::string::npos
             || null_representation.find('\n') != std::string::npos)
             throw DB::ParsingException(ErrorCodes::CANNOT_READ_ALL_DATA, "CSV custom null representation containing "
@@ -602,7 +756,35 @@ ReturnType SerializationNullable::deserializeTextCSVImpl(IColumn & column, ReadB
             istr.count(), std::string(pos, buf.position() - pos), parsed_value.str());
     };
 
-    return safeDeserialize<ReturnType>(column, *nested_serialization, check_for_null, deserialize_nested);
+    return deserializeImpl<ReturnType>(column, peekable_buf, check_for_null, deserialize_nested_with_check, is_null);
+}
+
+void SerializationNullable::deserializeTextCSV(IColumn & column, ReadBuffer & istr, const FormatSettings & settings) const
+{
+    ColumnNullable & col = assert_cast<ColumnNullable &>(column);
+    bool is_null;
+    deserializeTextCSVImpl<void>(col.getNestedColumn(), istr, settings, nested, is_null);
+    safeAppendToNullMap<void>(col, is_null);
+}
+
+bool SerializationNullable::tryDeserializeTextCSV(IColumn & column, ReadBuffer & istr, const FormatSettings & settings) const
+{
+    ColumnNullable & col = assert_cast<ColumnNullable &>(column);
+    bool is_null;
+    return deserializeTextCSVImpl<bool>(col.getNestedColumn(), istr, settings, nested, is_null) && safeAppendToNullMap<bool>(col, is_null);
+}
+
+bool SerializationNullable::deserializeNullAsDefaultOrNestedTextCSV(DB::IColumn & nested_column, DB::ReadBuffer & istr, const DB::FormatSettings & settings, const DB::SerializationPtr & nested_serialization)
+{
+    bool is_null;
+    deserializeTextCSVImpl<void>(nested_column, istr, settings, nested_serialization, is_null);
+    return !is_null;
+}
+
+bool SerializationNullable::tryDeserializeNullAsDefaultOrNestedTextCSV(DB::IColumn & nested_column, DB::ReadBuffer & istr, const DB::FormatSettings & settings, const DB::SerializationPtr & nested_serialization)
+{
+    bool is_null;
+    return deserializeTextCSVImpl<bool>(nested_column, istr, settings, nested_serialization, is_null);
 }
 
 void SerializationNullable::serializeText(const IColumn & column, size_t row_num, WriteBuffer & ostr, const FormatSettings & settings) const
@@ -616,38 +798,86 @@ void SerializationNullable::serializeText(const IColumn & column, size_t row_num
 
     /// This assumes UTF-8 and proper font support. This is Ok, because Pretty formats are "presentational", not for data exchange.
     if (col.isNullAt(row_num))
-    {
-        if (settings.pretty.charset == FormatSettings::Pretty::Charset::UTF8)
-            writeCString("ᴺᵁᴸᴸ", ostr);
-        else
-            writeCString("NULL", ostr);
-    }
+        serializeNullText(ostr, settings);
     else
         nested->serializeText(col.getNestedColumn(), row_num, ostr, settings);
 }
 
+void SerializationNullable::serializeNullText(DB::WriteBuffer & ostr, const DB::FormatSettings & settings)
+{
+    if (settings.pretty.charset == FormatSettings::Pretty::Charset::UTF8)
+        writeCString("ᴺᵁᴸᴸ", ostr);
+    else
+        writeCString("NULL", ostr);
+}
+
+bool SerializationNullable::tryDeserializeNullText(DB::ReadBuffer & istr)
+{
+    if (checkCharCaseInsensitive('N', istr))
+        return checkStringCaseInsensitive("ULL", istr);
+    return checkStringCaseInsensitive("ᴺᵁᴸᴸ", istr);
+}
+
 void SerializationNullable::serializeTextJSON(const IColumn & column, size_t row_num, WriteBuffer & ostr, const FormatSettings & settings) const
 {
     const ColumnNullable & col = assert_cast<const ColumnNullable &>(column);
 
     if (col.isNullAt(row_num))
-        writeCString("null", ostr);
+        serializeNullJSON(ostr);
     else
         nested->serializeTextJSON(col.getNestedColumn(), row_num, ostr, settings);
 }
 
-void SerializationNullable::deserializeTextJSON(IColumn & column, ReadBuffer & istr, const FormatSettings & settings) const
+void SerializationNullable::serializeNullJSON(DB::WriteBuffer & ostr)
 {
-    deserializeTextJSONImpl(column, istr, settings, nested);
+    writeCString("null", ostr);
+}
+
+bool SerializationNullable::tryDeserializeNullJSON(DB::ReadBuffer & istr)
+{
+    return checkString("null", istr);
 }
 
 template <typename ReturnType>
-ReturnType SerializationNullable::deserializeTextJSONImpl(IColumn & column, ReadBuffer & istr, const FormatSettings & settings,
-                                                    const SerializationPtr & nested)
+ReturnType deserializeTextJSONImpl(IColumn & column, ReadBuffer & istr, const FormatSettings & settings, const SerializationPtr & nested, bool & is_null)
 {
-    return safeDeserialize<ReturnType>(column, *nested,
-        [&istr] { return checkStringByFirstCharacterAndAssertTheRest("null", istr); },
-        [&nested, &istr, &settings] (IColumn & nested_column) { nested->deserializeTextJSON(nested_column, istr, settings); });
+    auto check_for_null = [](ReadBuffer & buf){ return checkStringByFirstCharacterAndAssertTheRest("null", buf); };
+    auto deserialize_nested = [&nested, &settings](IColumn & nested_column, ReadBuffer & buf)
+    {
+        if constexpr (std::is_same_v<ReturnType, bool>)
+            return nested->tryDeserializeTextJSON(nested_column, buf, settings);
+        nested->deserializeTextJSON(nested_column, buf, settings);
+    };
+
+    return deserializeImpl<ReturnType>(column, istr, check_for_null, deserialize_nested, is_null);
+}
+
+void SerializationNullable::deserializeTextJSON(IColumn & column, ReadBuffer & istr, const FormatSettings & settings) const
+{
+    ColumnNullable & col = assert_cast<ColumnNullable &>(column);
+    bool is_null;
+    deserializeTextJSONImpl<void>(col.getNestedColumn(), istr, settings, nested, is_null);
+    safeAppendToNullMap<void>(col, is_null);
+}
+
+bool SerializationNullable::tryDeserializeTextJSON(IColumn & column, ReadBuffer & istr, const FormatSettings & settings) const
+{
+    ColumnNullable & col = assert_cast<ColumnNullable &>(column);
+    bool is_null;
+    return deserializeTextJSONImpl<bool>(col.getNestedColumn(), istr, settings, nested, is_null) && safeAppendToNullMap<bool>(col, is_null);
+}
+
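Unlike the TSV/CSV paths, the JSON impl above needs no PeekableReadBuffer: once the stream starts with 'n', the only legal token is `null`, so the probe can commit after one character and fail hard on a mismatch. A sketch of that "check by first character, assert the rest" behaviour; this is a toy over a string view, while the real helper works on a ReadBuffer:

    #include <stdexcept>
    #include <string_view>

    /// Returns true and consumes "null"; returns false without consuming anything if the
    /// first character already rules it out; throws if input starts with 'n' but is not "null".
    bool checkNullByFirstCharacterAndAssertTheRest(std::string_view & s)
    {
        if (s.empty() || s.front() != 'n')
            return false;
        if (!s.starts_with("null"))
            throw std::invalid_argument("expected 'null'");
        s.remove_prefix(4);
        return true;
    }

Because the probe never needs to back up more than one character, the JSON variant is the simplest of the deserializeText*Impl family.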
!is_null; +} + +bool SerializationNullable::tryDeserializeNullAsDefaultOrNestedTextJSON(DB::IColumn & nested_column, DB::ReadBuffer & istr, const DB::FormatSettings & settings, const DB::SerializationPtr & nested_serialization) +{ + bool is_null; + return deserializeTextJSONImpl(nested_column, istr, settings, nested_serialization, is_null); } void SerializationNullable::serializeTextXML(const IColumn & column, size_t row_num, WriteBuffer & ostr, const FormatSettings & settings) const @@ -660,11 +890,9 @@ void SerializationNullable::serializeTextXML(const IColumn & column, size_t row_ nested->serializeTextXML(col.getNestedColumn(), row_num, ostr, settings); } -template bool SerializationNullable::deserializeWholeTextImpl(IColumn & column, ReadBuffer & istr, const FormatSettings & settings, const SerializationPtr & nested); -template bool SerializationNullable::deserializeTextEscapedImpl(IColumn & column, ReadBuffer & istr, const FormatSettings & settings, const SerializationPtr & nested); -template bool SerializationNullable::deserializeTextQuotedImpl(IColumn & column, ReadBuffer & istr, const FormatSettings &, const SerializationPtr & nested); -template bool SerializationNullable::deserializeTextCSVImpl(IColumn & column, ReadBuffer & istr, const FormatSettings & settings, const SerializationPtr & nested); -template bool SerializationNullable::deserializeTextJSONImpl(IColumn & column, ReadBuffer & istr, const FormatSettings &, const SerializationPtr & nested); -template bool SerializationNullable::deserializeTextRawImpl(IColumn & column, ReadBuffer & istr, const FormatSettings &, const SerializationPtr & nested); +void SerializationNullable::serializeNullXML(DB::WriteBuffer & ostr) +{ + writeCString("\\N", ostr); +} } diff --git a/src/DataTypes/Serializations/SerializationNullable.h b/src/DataTypes/Serializations/SerializationNullable.h index 3ec01b46de5..37858ccdefd 100644 --- a/src/DataTypes/Serializations/SerializationNullable.h +++ b/src/DataTypes/Serializations/SerializationNullable.h @@ -51,9 +51,12 @@ public: void deserializeBinary(IColumn & column, ReadBuffer & istr, const FormatSettings & settings) const override; void serializeTextEscaped(const IColumn & column, size_t row_num, WriteBuffer & ostr, const FormatSettings &) const override; void deserializeTextEscaped(IColumn & column, ReadBuffer & istr, const FormatSettings &) const override; + bool tryDeserializeTextEscaped(IColumn & column, ReadBuffer & istr, const FormatSettings &) const override; void serializeTextQuoted(const IColumn & column, size_t row_num, WriteBuffer & ostr, const FormatSettings &) const override; void deserializeTextQuoted(IColumn & column, ReadBuffer & istr, const FormatSettings &) const override; + bool tryDeserializeTextQuoted(IColumn & column, ReadBuffer & istr, const FormatSettings &) const override; void deserializeWholeText(IColumn & column, ReadBuffer & istr, const FormatSettings &) const override; + bool tryDeserializeWholeText(IColumn & column, ReadBuffer & istr, const FormatSettings &) const override; void serializeTextCSV(const IColumn & column, size_t row_num, WriteBuffer & ostr, const FormatSettings &) const override; @@ -66,31 +69,49 @@ public: * In CSV, non-NULL string value, starting with \N characters, must be placed in quotes, to avoid ambiguity. 
      */
     void deserializeTextCSV(IColumn & column, ReadBuffer & istr, const FormatSettings & settings) const override;
+    bool tryDeserializeTextCSV(IColumn & column, ReadBuffer & istr, const FormatSettings & settings) const override;

     void serializeTextJSON(const IColumn & column, size_t row_num, WriteBuffer & ostr, const FormatSettings &) const override;
     void deserializeTextJSON(IColumn & column, ReadBuffer & istr, const FormatSettings &) const override;
+    bool tryDeserializeTextJSON(IColumn & column, ReadBuffer & istr, const FormatSettings &) const override;

     void serializeText(const IColumn & column, size_t row_num, WriteBuffer & ostr, const FormatSettings &) const override;
     void serializeTextXML(const IColumn & column, size_t row_num, WriteBuffer & ostr, const FormatSettings &) const override;

     void deserializeTextRaw(IColumn & column, ReadBuffer & istr, const FormatSettings & settings) const override;
+    bool tryDeserializeTextRaw(IColumn & column, ReadBuffer & istr, const FormatSettings & settings) const override;
     void serializeTextRaw(const IColumn & column, size_t row_num, WriteBuffer & ostr, const FormatSettings & settings) const override;

-    /// If ReturnType is bool, check for NULL and deserialize value into non-nullable column (and return true) or insert default value of nested type (and return false)
-    /// If ReturnType is void, deserialize Nullable(T)
-    template <typename ReturnType>
-    static ReturnType deserializeWholeTextImpl(IColumn & column, ReadBuffer & istr, const FormatSettings & settings, const SerializationPtr & nested);
-    template <typename ReturnType>
-    static ReturnType deserializeTextEscapedImpl(IColumn & column, ReadBuffer & istr, const FormatSettings & settings, const SerializationPtr & nested);
-    template <typename ReturnType>
-    static ReturnType deserializeTextQuotedImpl(IColumn & column, ReadBuffer & istr, const FormatSettings &, const SerializationPtr & nested);
-    template <typename ReturnType>
-    static ReturnType deserializeTextCSVImpl(IColumn & column, ReadBuffer & istr, const FormatSettings & settings, const SerializationPtr & nested);
-    template <typename ReturnType>
-    static ReturnType deserializeTextJSONImpl(IColumn & column, ReadBuffer & istr, const FormatSettings &, const SerializationPtr & nested);
-    template <typename ReturnType>
-    static ReturnType deserializeTextRawImpl(IColumn & column, ReadBuffer & istr, const FormatSettings & settings, const SerializationPtr & nested);
-    template <typename ReturnType>
-    static ReturnType deserializeTextEscapedAndRawImpl(IColumn & column, ReadBuffer & istr, const FormatSettings & settings, const SerializationPtr & nested);
+    /// Check for NULL and deserialize value into non-nullable column (and return true) or insert default value of nested type (and return false).
+    static bool deserializeNullAsDefaultOrNestedWholeText(IColumn & nested_column, ReadBuffer & istr, const FormatSettings & settings, const SerializationPtr & nested_serialization);
+    static bool deserializeNullAsDefaultOrNestedTextEscaped(IColumn & nested_column, ReadBuffer & istr, const FormatSettings & settings, const SerializationPtr & nested_serialization);
+    static bool deserializeNullAsDefaultOrNestedTextQuoted(IColumn & nested_column, ReadBuffer & istr, const FormatSettings &, const SerializationPtr & nested_serialization);
+    static bool deserializeNullAsDefaultOrNestedTextCSV(IColumn & nested_column, ReadBuffer & istr, const FormatSettings & settings, const SerializationPtr & nested_serialization);
+    static bool deserializeNullAsDefaultOrNestedTextJSON(IColumn & nested_column, ReadBuffer & istr, const FormatSettings &, const SerializationPtr & nested_serialization);
+    static bool deserializeNullAsDefaultOrNestedTextRaw(IColumn & nested_column, ReadBuffer & istr, const FormatSettings & settings, const SerializationPtr & nested_serialization);
+
+    /// Check for NULL and deserialize value into non-nullable column or insert default value of nested type.
+    /// Return true if parsing was successful and false in case of any error.
+    static bool tryDeserializeNullAsDefaultOrNestedWholeText(IColumn & nested_column, ReadBuffer & istr, const FormatSettings & settings, const SerializationPtr & nested_serialization);
+    static bool tryDeserializeNullAsDefaultOrNestedTextEscaped(IColumn & nested_column, ReadBuffer & istr, const FormatSettings & settings, const SerializationPtr & nested_serialization);
+    static bool tryDeserializeNullAsDefaultOrNestedTextQuoted(IColumn & nested_column, ReadBuffer & istr, const FormatSettings &, const SerializationPtr & nested_serialization);
+    static bool tryDeserializeNullAsDefaultOrNestedTextCSV(IColumn & nested_column, ReadBuffer & istr, const FormatSettings & settings, const SerializationPtr & nested_serialization);
+    static bool tryDeserializeNullAsDefaultOrNestedTextJSON(IColumn & nested_column, ReadBuffer & istr, const FormatSettings &, const SerializationPtr & nested_serialization);
+    static bool tryDeserializeNullAsDefaultOrNestedTextRaw(IColumn & nested_column, ReadBuffer & istr, const FormatSettings & settings, const SerializationPtr & nested_serialization);
+
+
+    static void serializeNullEscaped(WriteBuffer & ostr, const FormatSettings & settings);
+    static bool tryDeserializeNullEscaped(ReadBuffer & istr, const FormatSettings & settings);
+    static void serializeNullQuoted(WriteBuffer & ostr);
+    static bool tryDeserializeNullQuoted(ReadBuffer & istr);
+    static void serializeNullCSV(WriteBuffer & ostr, const FormatSettings & settings);
+    static bool tryDeserializeNullCSV(ReadBuffer & istr, const FormatSettings & settings);
+    static void serializeNullJSON(WriteBuffer & ostr);
+    static bool tryDeserializeNullJSON(ReadBuffer & istr);
+    static void serializeNullRaw(WriteBuffer & ostr, const FormatSettings & settings);
+    static bool tryDeserializeNullRaw(ReadBuffer & istr, const FormatSettings & settings);
+    static void serializeNullText(WriteBuffer & ostr, const FormatSettings & settings);
+    static bool tryDeserializeNullText(ReadBuffer & istr);
+    static void serializeNullXML(WriteBuffer & ostr);

 private:
     struct SubcolumnCreator : public ISubcolumnCreator
diff --git a/src/DataTypes/Serializations/SerializationNumber.cpp b/src/DataTypes/Serializations/SerializationNumber.cpp
index b6c7e4618b8..bdb4dfc6735 100644
--- a/src/DataTypes/Serializations/SerializationNumber.cpp
+++ b/src/DataTypes/Serializations/SerializationNumber.cpp
@@ -37,6 +37,18 @@ void SerializationNumber<T>::deserializeText(IColumn & column, ReadBuffer & istr
     throwUnexpectedDataAfterParsedValue(column, istr, settings, "Number");
 }

+template <typename T>
+bool SerializationNumber<T>::tryDeserializeText(IColumn & column, ReadBuffer & istr, const FormatSettings &, bool whole) const
+{
+    T x;
+
+    if (!tryReadText(x, istr) || (whole && !istr.eof()))
+        return false;
+
+    assert_cast<ColumnVector<T> &>(column).getData().push_back(x);
+    return true;
+}
+
 template <typename T>
 void SerializationNumber<T>::serializeTextJSON(const IColumn & column, size_t row_num, WriteBuffer & ostr, const FormatSettings & settings) const
 {
@@ -44,9 +56,10 @@ void SerializationNumber<T>::serializeTextJSON(const IColumn & column, size_t ro
     writeJSONNumber(x, ostr, settings);
 }

-template <typename T>
-void SerializationNumber<T>::deserializeTextJSON(IColumn & column,
ReadBuffer & istr, const FormatSettings & settings) const +template +ReturnType deserializeTextJSONImpl(IColumn & column, ReadBuffer & istr, const FormatSettings & settings) { + static constexpr bool throw_exception = std::is_same_v; bool has_quote = false; if (!istr.eof() && *istr.position() == '"') /// We understand the number both in quotes and without. { @@ -54,13 +67,16 @@ void SerializationNumber::deserializeTextJSON(IColumn & column, ReadBuffer & ++istr.position(); } - FieldType x; + T x; /// null if (!has_quote && !istr.eof() && *istr.position() == 'n') { ++istr.position(); - assertString("ull", istr); + if constexpr (throw_exception) + assertString("ull", istr); + else if (!checkString("ull", istr)) + return ReturnType(false); x = NaNOrZero(); } @@ -73,26 +89,62 @@ void SerializationNumber::deserializeTextJSON(IColumn & column, ReadBuffer & { // extra conditions to parse true/false strings into 1/0 if (istr.eof()) - throwReadAfterEOF(); + { + if constexpr (throw_exception) + throwReadAfterEOF(); + else + return false; + } + if (*istr.position() == 't' || *istr.position() == 'f') { bool tmp = false; - readBoolTextWord(tmp, istr); + if constexpr (throw_exception) + readBoolTextWord(tmp, istr); + else if (!readBoolTextWord(tmp, istr)) + return ReturnType(false); + x = tmp; } else - readText(x, istr); + { + if constexpr (throw_exception) + readText(x, istr); + else if (!tryReadText(x, istr)) + return ReturnType(false); + } } else { - readText(x, istr); + if constexpr (throw_exception) + readText(x, istr); + else if (!tryReadText(x, istr)) + return ReturnType(false); } if (has_quote) - assertChar('"', istr); + { + if constexpr (throw_exception) + assertChar('"', istr); + else if (!checkChar('"', istr)) + return ReturnType(false); + } } assert_cast &>(column).getData().push_back(x); + return ReturnType(true); +} + +template +void SerializationNumber::deserializeTextJSON(IColumn & column, ReadBuffer & istr, const FormatSettings & settings) const +{ + deserializeTextJSONImpl(column, istr, settings); +} + +template +bool SerializationNumber::tryDeserializeTextJSON(IColumn & column, ReadBuffer & istr, const FormatSettings & settings) const +{ + return deserializeTextJSONImpl(column, istr, settings); } template @@ -103,6 +155,16 @@ void SerializationNumber::deserializeTextCSV(IColumn & column, ReadBuffer & i assert_cast &>(column).getData().push_back(x); } +template +bool SerializationNumber::tryDeserializeTextCSV(IColumn & column, ReadBuffer & istr, const FormatSettings & /*settings*/) const +{ + FieldType x; + if (!tryReadCSV(x, istr)) + return false; + assert_cast &>(column).getData().push_back(x); + return true; +} + template void SerializationNumber::serializeBinary(const Field & field, WriteBuffer & ostr, const FormatSettings &) const { diff --git a/src/DataTypes/Serializations/SerializationNumber.h b/src/DataTypes/Serializations/SerializationNumber.h index 972c6c9a30f..9d53dc9c494 100644 --- a/src/DataTypes/Serializations/SerializationNumber.h +++ b/src/DataTypes/Serializations/SerializationNumber.h @@ -20,9 +20,12 @@ public: void serializeText(const IColumn & column, size_t row_num, WriteBuffer & ostr, const FormatSettings &) const override; void deserializeText(IColumn & column, ReadBuffer & istr, const FormatSettings & settings, bool whole) const override; + bool tryDeserializeText(IColumn & column, ReadBuffer & istr, const FormatSettings & settings, bool whole) const override; void serializeTextJSON(const IColumn & column, size_t row_num, WriteBuffer & ostr, const FormatSettings & 
settings) const override; void deserializeTextJSON(IColumn & column, ReadBuffer & istr, const FormatSettings &) const override; + bool tryDeserializeTextJSON(IColumn & column, ReadBuffer & istr, const FormatSettings &) const override; void deserializeTextCSV(IColumn & column, ReadBuffer & istr, const FormatSettings & settings) const override; + bool tryDeserializeTextCSV(IColumn & column, ReadBuffer & istr, const FormatSettings & settings) const override; /** Format is platform-dependent. */ void serializeBinary(const Field & field, WriteBuffer & ostr, const FormatSettings &) const override; diff --git a/src/DataTypes/Serializations/SerializationString.cpp b/src/DataTypes/Serializations/SerializationString.cpp index 788ff429088..1680ec8a333 100644 --- a/src/DataTypes/Serializations/SerializationString.cpp +++ b/src/DataTypes/Serializations/SerializationString.cpp @@ -272,40 +272,67 @@ void SerializationString::serializeTextEscaped(const IColumn & column, size_t ro } -template -static inline void read(IColumn & column, Reader && reader) +template +static inline ReturnType read(IColumn & column, Reader && reader) { + static constexpr bool throw_exception = std::is_same_v; ColumnString & column_string = assert_cast(column); ColumnString::Chars & data = column_string.getChars(); ColumnString::Offsets & offsets = column_string.getOffsets(); size_t old_chars_size = data.size(); size_t old_offsets_size = offsets.size(); - try - { - reader(data); - data.push_back(0); - offsets.push_back(data.size()); - } - catch (...) + auto restore_column = [&]() { offsets.resize_assume_reserved(old_offsets_size); data.resize_assume_reserved(old_chars_size); - throw; + }; + + try + { + if constexpr (throw_exception) + { + reader(data); + } + else if (!reader(data)) + { + restore_column(); + return false; + } + + data.push_back(0); + offsets.push_back(data.size()); + return ReturnType(true); + } + catch (...) 
+ { + restore_column(); + if constexpr (throw_exception) + throw; + else + return false; } } void SerializationString::deserializeWholeText(IColumn & column, ReadBuffer & istr, const FormatSettings &) const { - read(column, [&](ColumnString::Chars & data) { readStringUntilEOFInto(data, istr); }); + read(column, [&](ColumnString::Chars & data) { readStringUntilEOFInto(data, istr); }); } +bool SerializationString::tryDeserializeWholeText(IColumn & column, ReadBuffer & istr, const FormatSettings &) const +{ + return read(column, [&](ColumnString::Chars & data) { readStringUntilEOFInto(data, istr); return true; }); +} void SerializationString::deserializeTextEscaped(IColumn & column, ReadBuffer & istr, const FormatSettings &) const { - read(column, [&](ColumnString::Chars & data) { readEscapedStringInto(data, istr); }); + read(column, [&](ColumnString::Chars & data) { readEscapedStringInto(data, istr); }); } +bool SerializationString::tryDeserializeTextEscaped(IColumn & column, ReadBuffer & istr, const FormatSettings &) const +{ + return read(column, [&](ColumnString::Chars & data) { readEscapedStringInto(data, istr); return true; }); +} void SerializationString::serializeTextQuoted(const IColumn & column, size_t row_num, WriteBuffer & ostr, const FormatSettings &) const { @@ -315,7 +342,12 @@ void SerializationString::serializeTextQuoted(const IColumn & column, size_t row void SerializationString::deserializeTextQuoted(IColumn & column, ReadBuffer & istr, const FormatSettings &) const { - read(column, [&](ColumnString::Chars & data) { readQuotedStringInto(data, istr); }); + read(column, [&](ColumnString::Chars & data) { readQuotedStringInto(data, istr); }); +} + +bool SerializationString::tryDeserializeTextQuoted(IColumn & column, ReadBuffer & istr, const FormatSettings &) const +{ + return read(column, [&](ColumnString::Chars & data) { return tryReadQuotedStringInto(data, istr); }); } @@ -329,11 +361,11 @@ void SerializationString::deserializeTextJSON(IColumn & column, ReadBuffer & ist { if (settings.json.read_objects_as_strings && !istr.eof() && *istr.position() == '{') { - read(column, [&](ColumnString::Chars & data) { readJSONObjectPossiblyInvalid(data, istr); }); + read(column, [&](ColumnString::Chars & data) { readJSONObjectPossiblyInvalid(data, istr); }); } else if (settings.json.read_arrays_as_strings && !istr.eof() && *istr.position() == '[') { - read(column, [&](ColumnString::Chars & data) { readJSONArrayInto(data, istr); }); + read(column, [&](ColumnString::Chars & data) { readJSONArrayInto(data, istr); }); } else if (settings.json.read_numbers_as_strings && !istr.eof() && *istr.position() != '"') { @@ -342,12 +374,40 @@ void SerializationString::deserializeTextJSON(IColumn & column, ReadBuffer & ist Float64 tmp; ReadBufferFromString buf(field); if (tryReadFloatText(tmp, buf) && buf.eof()) - read(column, [&](ColumnString::Chars & data) { data.insert(field.begin(), field.end()); }); + read(column, [&](ColumnString::Chars & data) { data.insert(field.begin(), field.end()); }); else throw Exception(ErrorCodes::INCORRECT_DATA, "Cannot parse JSON String value here: {}", field); } else - read(column, [&](ColumnString::Chars & data) { readJSONStringInto(data, istr); }); + read(column, [&](ColumnString::Chars & data) { readJSONStringInto(data, istr); }); +} + +bool SerializationString::tryDeserializeTextJSON(IColumn & column, ReadBuffer & istr, const FormatSettings & settings) const +{ + if (settings.json.read_objects_as_strings && !istr.eof() && *istr.position() == '{') + return 
read(column, [&](ColumnString::Chars & data) { return readJSONObjectPossiblyInvalid(data, istr); }); + + if (settings.json.read_arrays_as_strings && !istr.eof() && *istr.position() == '[') + return read(column, [&](ColumnString::Chars & data) { return readJSONArrayInto(data, istr); }); + + if (settings.json.read_numbers_as_strings && !istr.eof() && *istr.position() != '"') + { + String field; + if (!tryReadJSONField(field, istr)) + return false; + + Float64 tmp; + ReadBufferFromString buf(field); + if (tryReadFloatText(tmp, buf) && buf.eof()) + { + read(column, [&](ColumnString::Chars & data) { data.insert(field.begin(), field.end()); }); + return true; + } + + return false; + } + + return read(column, [&](ColumnString::Chars & data) { return tryReadJSONStringInto(data, istr); }); } @@ -365,7 +425,12 @@ void SerializationString::serializeTextCSV(const IColumn & column, size_t row_nu void SerializationString::deserializeTextCSV(IColumn & column, ReadBuffer & istr, const FormatSettings & settings) const { - read(column, [&](ColumnString::Chars & data) { readCSVStringInto(data, istr, settings.csv); }); + read(column, [&](ColumnString::Chars & data) { readCSVStringInto(data, istr, settings.csv); }); +} + +bool SerializationString::tryDeserializeTextCSV(IColumn & column, ReadBuffer & istr, const FormatSettings & settings) const +{ + return read(column, [&](ColumnString::Chars & data) { readCSVStringInto(data, istr, settings.csv); return true; }); } void SerializationString::serializeTextMarkdown( diff --git a/src/DataTypes/Serializations/SerializationString.h b/src/DataTypes/Serializations/SerializationString.h index cd4cdf79c11..89ab84f0d22 100644 --- a/src/DataTypes/Serializations/SerializationString.h +++ b/src/DataTypes/Serializations/SerializationString.h @@ -18,20 +18,25 @@ public: void serializeText(const IColumn & column, size_t row_num, WriteBuffer & ostr, const FormatSettings &) const override; void deserializeWholeText(IColumn & column, ReadBuffer & istr, const FormatSettings &) const override; + bool tryDeserializeWholeText(IColumn & column, ReadBuffer & istr, const FormatSettings &) const override; void serializeTextEscaped(const IColumn & column, size_t row_num, WriteBuffer & ostr, const FormatSettings &) const override; void deserializeTextEscaped(IColumn & column, ReadBuffer & istr, const FormatSettings &) const override; + bool tryDeserializeTextEscaped(IColumn & column, ReadBuffer & istr, const FormatSettings &) const override; void serializeTextQuoted(const IColumn & column, size_t row_num, WriteBuffer & ostr, const FormatSettings &) const override; void deserializeTextQuoted(IColumn & column, ReadBuffer & istr, const FormatSettings &) const override; + bool tryDeserializeTextQuoted(IColumn & column, ReadBuffer & istr, const FormatSettings &) const override; void serializeTextJSON(const IColumn & column, size_t row_num, WriteBuffer & ostr, const FormatSettings &) const override; void deserializeTextJSON(IColumn & column, ReadBuffer & istr, const FormatSettings &) const override; + bool tryDeserializeTextJSON(IColumn & column, ReadBuffer & istr, const FormatSettings &) const override; void serializeTextXML(const IColumn & column, size_t row_num, WriteBuffer & ostr, const FormatSettings &) const override; void serializeTextCSV(const IColumn & column, size_t row_num, WriteBuffer & ostr, const FormatSettings &) const override; void deserializeTextCSV(IColumn & column, ReadBuffer & istr, const FormatSettings &) const override; + bool tryDeserializeTextCSV(IColumn & column, 
ReadBuffer & istr, const FormatSettings &) const override; void serializeTextMarkdown(const IColumn & column, size_t row_num, WriteBuffer & ostr, const FormatSettings &) const override; }; diff --git a/src/DataTypes/Serializations/SerializationTuple.cpp b/src/DataTypes/Serializations/SerializationTuple.cpp index cbbe97eb05c..c0b0658e6b4 100644 --- a/src/DataTypes/Serializations/SerializationTuple.cpp +++ b/src/DataTypes/Serializations/SerializationTuple.cpp @@ -62,15 +62,35 @@ void SerializationTuple::serializeBinary(const IColumn & column, size_t row_num, } -template -static void addElementSafe(size_t num_elems, IColumn & column, F && impl) +template +static ReturnType addElementSafe(size_t num_elems, IColumn & column, F && impl) { + static constexpr bool throw_exception = std::is_same_v; + /// We use the assumption that tuples of zero size do not exist. size_t old_size = column.size(); + auto restore_elements = [&]() + { + for (size_t i = 0; i < num_elems; ++i) + { + auto & element_column = extractElementColumn(column, i); + if (element_column.size() > old_size) + element_column.popBack(1); + } + }; + try { - impl(); + if constexpr (throw_exception) + { + impl(); + } + else if (!impl()) + { + restore_elements(); + return ReturnType(false); + } // Check that all columns now have the same size. size_t new_size = column.size(); @@ -81,22 +101,23 @@ static void addElementSafe(size_t num_elems, IColumn & column, F && impl) { // This is not a logical error because it may work with // user-supplied data. - throw Exception(ErrorCodes::SIZES_OF_COLUMNS_IN_TUPLE_DOESNT_MATCH, - "Cannot read a tuple because not all elements are present"); + if constexpr (throw_exception) + throw Exception(ErrorCodes::SIZES_OF_COLUMNS_IN_TUPLE_DOESNT_MATCH, + "Cannot read a tuple because not all elements are present"); + restore_elements(); + return ReturnType(false); } } } catch (...) 
{ - for (size_t i = 0; i < num_elems; ++i) - { - auto & element_column = extractElementColumn(column, i); - if (element_column.size() > old_size) - element_column.popBack(1); - } - - throw; + restore_elements(); + if constexpr (throw_exception) + throw; + return ReturnType(false); } + + return ReturnType(true); } void SerializationTuple::deserializeBinary(IColumn & column, ReadBuffer & istr, const FormatSettings & settings) const @@ -120,25 +141,51 @@ void SerializationTuple::serializeText(const IColumn & column, size_t row_num, W writeChar(')', ostr); } -void SerializationTuple::deserializeText(IColumn & column, ReadBuffer & istr, const FormatSettings & settings, bool whole) const +template +ReturnType SerializationTuple::deserializeTextImpl(IColumn & column, ReadBuffer & istr, const FormatSettings & settings, bool whole) const { - const size_t size = elems.size(); - assertChar('(', istr); + static constexpr bool throw_exception = std::is_same_v; - addElementSafe(elems.size(), column, [&] + const size_t size = elems.size(); + if constexpr (throw_exception) + assertChar('(', istr); + else if (!checkChar('(', istr)) + return ReturnType(false); + + auto impl = [&]() { for (size_t i = 0; i < size; ++i) { skipWhitespaceIfAny(istr); if (i != 0) { - assertChar(',', istr); + if constexpr (throw_exception) + assertChar(',', istr); + else if (!checkChar(',', istr)) + return ReturnType(false); + skipWhitespaceIfAny(istr); } - if (settings.null_as_default) - SerializationNullable::deserializeTextQuotedImpl(extractElementColumn(column, i), istr, settings, elems[i]); + + auto & element_column = extractElementColumn(column, i); + if constexpr (throw_exception) + { + if (settings.null_as_default && !isColumnNullableOrLowCardinalityNullable(element_column)) + SerializationNullable::deserializeNullAsDefaultOrNestedTextQuoted(element_column, istr, settings, elems[i]); + else + elems[i]->deserializeTextQuoted(element_column, istr, settings); + } else - elems[i]->deserializeTextQuoted(extractElementColumn(column, i), istr, settings); + { + bool ok; + if (settings.null_as_default && !isColumnNullableOrLowCardinalityNullable(element_column)) + ok = SerializationNullable::tryDeserializeNullAsDefaultOrNestedTextQuoted(element_column, istr, settings, elems[i]); + else + ok = elems[i]->tryDeserializeTextQuoted(element_column, istr, settings); + + if (!ok) + return false; + } } // Special format for one element tuple (1,) @@ -150,11 +197,35 @@ void SerializationTuple::deserializeText(IColumn & column, ReadBuffer & istr, co } skipWhitespaceIfAny(istr); - assertChar(')', istr); + if constexpr (throw_exception) + assertChar(')', istr); + else if (!checkChar(')', istr)) + return ReturnType(false); if (whole && !istr.eof()) - throwUnexpectedDataAfterParsedValue(column, istr, settings, "Tuple"); - }); + { + if constexpr (throw_exception) + throwUnexpectedDataAfterParsedValue(column, istr, settings, "Tuple"); + return ReturnType(false); + } + + return ReturnType(true); + }; + + if constexpr (throw_exception) + addElementSafe(elems.size(), column, impl); + else + return addElementSafe(elems.size(), column, impl); +} + +void SerializationTuple::deserializeText(DB::IColumn & column, DB::ReadBuffer & istr, const DB::FormatSettings & settings, bool whole) const +{ + deserializeTextImpl(column, istr, settings, whole); +} + +bool SerializationTuple::tryDeserializeText(DB::IColumn & column, DB::ReadBuffer & istr, const DB::FormatSettings & settings, bool whole) const +{ + return deserializeTextImpl(column, istr, settings, whole); 
} void SerializationTuple::serializeTextJSON(const IColumn & column, size_t row_num, WriteBuffer & ostr, const FormatSettings & settings) const @@ -239,16 +310,39 @@ void SerializationTuple::serializeTextJSONPretty(const IColumn & column, size_t } } -void SerializationTuple::deserializeTextJSON(IColumn & column, ReadBuffer & istr, const FormatSettings & settings) const +template +ReturnType SerializationTuple::deserializeTextJSONImpl(IColumn & column, ReadBuffer & istr, const FormatSettings & settings) const { + static constexpr bool throw_exception = std::is_same_v; + + auto deserialize_element = [&](IColumn & element_column, size_t element_pos) + { + if constexpr (throw_exception) + { + if (settings.null_as_default && !isColumnNullableOrLowCardinalityNullable(element_column)) + SerializationNullable::deserializeNullAsDefaultOrNestedTextJSON(element_column, istr, settings, elems[element_pos]); + else + elems[element_pos]->deserializeTextJSON(element_column, istr, settings); + } + else + { + if (settings.null_as_default && !isColumnNullableOrLowCardinalityNullable(element_column)) + return SerializationNullable::tryDeserializeNullAsDefaultOrNestedTextJSON(element_column, istr, settings, elems[element_pos]); + return elems[element_pos]->tryDeserializeTextJSON(element_column, istr, settings); + } + }; + if (settings.json.read_named_tuples_as_objects && have_explicit_names) { skipWhitespaceIfAny(istr); - assertChar('{', istr); + if constexpr (throw_exception) + assertChar('{', istr); + else if (!checkChar('{', istr)) + return ReturnType(false); skipWhitespaceIfAny(istr); - addElementSafe(elems.size(), column, [&] + auto impl = [&]() { std::vector seen_elements(elems.size(), 0); size_t processed = 0; @@ -256,18 +350,32 @@ void SerializationTuple::deserializeTextJSON(IColumn & column, ReadBuffer & istr while (!istr.eof() && *istr.position() != '}') { if (!settings.json.ignore_unknown_keys_in_named_tuple && processed == elems.size()) - throw Exception(ErrorCodes::INCORRECT_DATA, "Unexpected number of elements in named tuple. Expected no more than {} (consider enabling input_format_json_ignore_unknown_keys_in_named_tuple setting)", elems.size()); + { + if constexpr (throw_exception) + throw Exception(ErrorCodes::INCORRECT_DATA, "Unexpected number of elements in named tuple. 
Expected no more than {} (consider enabling input_format_json_ignore_unknown_keys_in_named_tuple setting)", elems.size()); + return ReturnType(false); + } if (processed + skipped > 0) { - assertChar(',', istr); + if constexpr (throw_exception) + assertChar(',', istr); + else if (!checkChar(',', istr)) + return ReturnType(false); skipWhitespaceIfAny(istr); } std::string name; - readDoubleQuotedString(name, istr); + if constexpr (throw_exception) + readDoubleQuotedString(name, istr); + else if (!tryReadDoubleQuotedString(name, istr)) + return ReturnType(false); + skipWhitespaceIfAny(istr); - assertChar(':', istr); + if constexpr (throw_exception) + assertChar(':', istr); + else if (!checkChar(':', istr)) + return ReturnType(false); skipWhitespaceIfAny(istr); const size_t element_pos = getPositionByName(name); @@ -275,36 +383,52 @@ void SerializationTuple::deserializeTextJSON(IColumn & column, ReadBuffer & istr { if (settings.json.ignore_unknown_keys_in_named_tuple) { - skipJSONField(istr, name); + if constexpr (throw_exception) + skipJSONField(istr, name); + else if (!trySkipJSONField(istr, name)) + return ReturnType(false); + skipWhitespaceIfAny(istr); ++skipped; continue; } else - throw Exception(ErrorCodes::NOT_FOUND_COLUMN_IN_BLOCK, "Tuple doesn't have element with name '{}', enable setting input_format_json_ignore_unknown_keys_in_named_tuple", name); + { + if constexpr (throw_exception) + throw Exception(ErrorCodes::NOT_FOUND_COLUMN_IN_BLOCK, "Tuple doesn't have element with name '{}', enable setting input_format_json_ignore_unknown_keys_in_named_tuple", name); + return ReturnType(false); + } } seen_elements[element_pos] = 1; auto & element_column = extractElementColumn(column, element_pos); - try + if constexpr (throw_exception) { - if (settings.null_as_default) - SerializationNullable::deserializeTextJSONImpl(element_column, istr, settings, elems[element_pos]); - else - elems[element_pos]->deserializeTextJSON(element_column, istr, settings); + try + { + deserialize_element(element_column, element_pos); + } + catch (Exception & e) + { + e.addMessage("(while reading the value of nested key " + name + ")"); + throw; + } } - catch (Exception & e) + else { - e.addMessage("(while reading the value of nested key " + name + ")"); - throw; + if (!deserialize_element(element_column, element_pos)) + return ReturnType(false); } skipWhitespaceIfAny(istr); ++processed; } - assertChar('}', istr); + if constexpr (throw_exception) + assertChar('}', istr); + else if (!checkChar('}', istr)) + return ReturnType(false); /// Check if we have missing elements. if (processed != elems.size()) @@ -315,41 +439,87 @@ void SerializationTuple::deserializeTextJSON(IColumn & column, ReadBuffer & istr continue; if (!settings.json.defaults_for_missing_elements_in_named_tuple) - throw Exception( - ErrorCodes::INCORRECT_DATA, - "JSON object doesn't contain tuple element {}. If you want to insert defaults in case of missing elements, " - "enable setting input_format_json_defaults_for_missing_elements_in_named_tuple", - elems[element_pos]->getElementName()); + { + if constexpr (throw_exception) + throw Exception( + ErrorCodes::INCORRECT_DATA, + "JSON object doesn't contain tuple element {}. 
If you want to insert defaults in case of missing elements, " + "enable setting input_format_json_defaults_for_missing_elements_in_named_tuple", + elems[element_pos]->getElementName()); + return ReturnType(false); + } auto & element_column = extractElementColumn(column, element_pos); element_column.insertDefault(); } } - }); + + return ReturnType(true); + }; + + if constexpr (throw_exception) + addElementSafe(elems.size(), column, impl); + else + return addElementSafe(elems.size(), column, impl); } else { - assertChar('[', istr); + skipWhitespaceIfAny(istr); + if constexpr (throw_exception) + assertChar('[', istr); + else if (!checkChar('[', istr)) + return ReturnType(false); + skipWhitespaceIfAny(istr); - addElementSafe(elems.size(), column, [&] + auto impl = [&]() { for (size_t i = 0; i < elems.size(); ++i) { skipWhitespaceIfAny(istr); if (i != 0) { - assertChar(',', istr); + if constexpr (throw_exception) + assertChar(',', istr); + else if (!checkChar(',', istr)) + return ReturnType(false); skipWhitespaceIfAny(istr); } - elems[i]->deserializeTextJSON(extractElementColumn(column, i), istr, settings); + + auto & element_column = extractElementColumn(column, i); + + if constexpr (throw_exception) + deserialize_element(element_column, i); + else if (!deserialize_element(element_column, i)) + return ReturnType(false); } skipWhitespaceIfAny(istr); - assertChar(']', istr); - }); + if constexpr (throw_exception) + assertChar(']', istr); + else if (!checkChar(']', istr)) + return ReturnType(false); + + return ReturnType(true); + }; + + if constexpr (throw_exception) + addElementSafe(elems.size(), column, impl); + else + return addElementSafe(elems.size(), column, impl); } } +void SerializationTuple::deserializeTextJSON(DB::IColumn & column, DB::ReadBuffer & istr, const DB::FormatSettings & settings) const +{ + deserializeTextJSONImpl(column, istr, settings); +} + +bool SerializationTuple::tryDeserializeTextJSON(DB::IColumn & column, DB::ReadBuffer & istr, const DB::FormatSettings & settings) const +{ + return deserializeTextJSONImpl(column, istr, settings); +} + + void SerializationTuple::serializeTextXML(const IColumn & column, size_t row_num, WriteBuffer & ostr, const FormatSettings & settings) const { writeCString("", ostr); @@ -385,14 +555,48 @@ void SerializationTuple::deserializeTextCSV(IColumn & column, ReadBuffer & istr, assertChar(settings.csv.tuple_delimiter, istr); skipWhitespaceIfAny(istr); } - if (settings.null_as_default) - SerializationNullable::deserializeTextCSVImpl(extractElementColumn(column, i), istr, settings, elems[i]); + + auto & element_column = extractElementColumn(column, i); + if (settings.null_as_default && !isColumnNullableOrLowCardinalityNullable(element_column)) + SerializationNullable::deserializeNullAsDefaultOrNestedTextCSV(element_column, istr, settings, elems[i]); else - elems[i]->deserializeTextCSV(extractElementColumn(column, i), istr, settings); + elems[i]->deserializeTextCSV(element_column, istr, settings); } }); } +bool SerializationTuple::tryDeserializeTextCSV(IColumn & column, ReadBuffer & istr, const FormatSettings & settings) const +{ + return addElementSafe(elems.size(), column, [&] + { + const size_t size = elems.size(); + for (size_t i = 0; i < size; ++i) + { + if (i != 0) + { + skipWhitespaceIfAny(istr); + if (!checkChar(settings.csv.tuple_delimiter, istr)) + return false; + skipWhitespaceIfAny(istr); + } + + auto & element_column = extractElementColumn(column, i); + if (settings.null_as_default && 
!isColumnNullableOrLowCardinalityNullable(element_column)) + { + if (!SerializationNullable::tryDeserializeNullAsDefaultOrNestedTextCSV(element_column, istr, settings, elems[i])) + return false; + } + else + { + if (!elems[i]->tryDeserializeTextCSV(element_column, istr, settings)) + return false; + } + } + + return true; + }); +} + void SerializationTuple::enumerateStreams( EnumerateStreamsSettings & settings, const StreamCallback & callback, diff --git a/src/DataTypes/Serializations/SerializationTuple.h b/src/DataTypes/Serializations/SerializationTuple.h index 7325259f440..d9c63a05217 100644 --- a/src/DataTypes/Serializations/SerializationTuple.h +++ b/src/DataTypes/Serializations/SerializationTuple.h @@ -23,14 +23,17 @@ public: void deserializeBinary(IColumn & column, ReadBuffer & istr, const FormatSettings & settings) const override; void serializeText(const IColumn & column, size_t row_num, WriteBuffer & ostr, const FormatSettings &) const override; void deserializeText(IColumn & column, ReadBuffer & istr, const FormatSettings &, bool whole) const override; + bool tryDeserializeText(IColumn & column, ReadBuffer & istr, const FormatSettings &, bool whole) const override; void serializeTextJSON(const IColumn & column, size_t row_num, WriteBuffer & ostr, const FormatSettings &) const override; void deserializeTextJSON(IColumn & column, ReadBuffer & istr, const FormatSettings &) const override; + bool tryDeserializeTextJSON(IColumn & column, ReadBuffer & istr, const FormatSettings &) const override; void serializeTextJSONPretty(const IColumn & column, size_t row_num, WriteBuffer & ostr, const FormatSettings & settings, size_t indent) const override; void serializeTextXML(const IColumn & column, size_t row_num, WriteBuffer & ostr, const FormatSettings &) const override; /// Tuples in CSV format will be serialized as separate columns (that is, losing their nesting in the tuple). void serializeTextCSV(const IColumn & column, size_t row_num, WriteBuffer & ostr, const FormatSettings &) const override; void deserializeTextCSV(IColumn & column, ReadBuffer & istr, const FormatSettings &) const override; + bool tryDeserializeTextCSV(IColumn & column, ReadBuffer & istr, const FormatSettings &) const override; /** Each sub-column in a tuple is serialized in separate stream. 
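+      * (For example, a Tuple(UInt32, String) column is written as two element substreams, one per element.)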
*/ @@ -73,6 +76,15 @@ private: bool have_explicit_names; size_t getPositionByName(const String & name) const; + + template + ReturnType deserializeTextImpl(IColumn & column, ReadBuffer & istr, const FormatSettings & settings, bool whole) const; + + template + ReturnType deserializeTextJSONImpl(IColumn & column, ReadBuffer & istr, const FormatSettings & settings) const; + + template + ReturnType deserializeTextCSVImpl(IColumn & column, ReadBuffer & istr, const FormatSettings & settings) const; }; } diff --git a/src/DataTypes/Serializations/SerializationUUID.cpp b/src/DataTypes/Serializations/SerializationUUID.cpp index 5cf17b4c0c8..5a7aeca67a0 100644 --- a/src/DataTypes/Serializations/SerializationUUID.cpp +++ b/src/DataTypes/Serializations/SerializationUUID.cpp @@ -25,15 +25,16 @@ void SerializationUUID::deserializeText(IColumn & column, ReadBuffer & istr, con throwUnexpectedDataAfterParsedValue(column, istr, settings, "UUID"); } -void SerializationUUID::deserializeTextEscaped(IColumn & column, ReadBuffer & istr, const FormatSettings & settings) const +bool SerializationUUID::tryDeserializeText(IColumn & column, ReadBuffer & istr, const FormatSettings &, bool whole) const { - deserializeText(column, istr, settings, false); + UUID x; + if (!tryReadText(x, istr) || (whole && !istr.eof())) + return false; + + assert_cast(column).getData().push_back(x); + return true; } -void SerializationUUID::serializeTextEscaped(const IColumn & column, size_t row_num, WriteBuffer & ostr, const FormatSettings & settings) const -{ - serializeText(column, row_num, ostr, settings); -} void SerializationUUID::serializeTextQuoted(const IColumn & column, size_t row_num, WriteBuffer & ostr, const FormatSettings & settings) const { @@ -76,6 +77,17 @@ void SerializationUUID::deserializeTextQuoted(IColumn & column, ReadBuffer & ist assert_cast(column).getData().push_back(std::move(uuid)); /// It's important to do this at the end - for exception safety. 
} +bool SerializationUUID::tryDeserializeTextQuoted(IColumn & column, ReadBuffer & istr, const FormatSettings &) const +{ + UUID uuid; + String field; + if (!checkChar('\'', istr) || !tryReadText(uuid, istr) || !checkChar('\'', istr)) + return false; + + assert_cast(column).getData().push_back(std::move(uuid)); + return true; +} + void SerializationUUID::serializeTextJSON(const IColumn & column, size_t row_num, WriteBuffer & ostr, const FormatSettings & settings) const { writeChar('"', ostr); @@ -92,6 +104,15 @@ void SerializationUUID::deserializeTextJSON(IColumn & column, ReadBuffer & istr, assert_cast(column).getData().push_back(x); } +bool SerializationUUID::tryDeserializeTextJSON(IColumn & column, ReadBuffer & istr, const FormatSettings &) const +{ + UUID x; + if (!checkChar('"', istr) || !tryReadText(x, istr) || !checkChar('"', istr)) + return false; + assert_cast(column).getData().push_back(x); + return true; +} + void SerializationUUID::serializeTextCSV(const IColumn & column, size_t row_num, WriteBuffer & ostr, const FormatSettings & settings) const { writeChar('"', ostr); @@ -106,6 +127,14 @@ void SerializationUUID::deserializeTextCSV(IColumn & column, ReadBuffer & istr, assert_cast(column).getData().push_back(value); } +bool SerializationUUID::tryDeserializeTextCSV(IColumn & column, ReadBuffer & istr, const FormatSettings &) const +{ + UUID value; + if (!tryReadCSV(value, istr)) + return false; + assert_cast(column).getData().push_back(value); + return true; +} void SerializationUUID::serializeBinary(const Field & field, WriteBuffer & ostr, const FormatSettings &) const { diff --git a/src/DataTypes/Serializations/SerializationUUID.h b/src/DataTypes/Serializations/SerializationUUID.h index da8c15f7279..458504f8f42 100644 --- a/src/DataTypes/Serializations/SerializationUUID.h +++ b/src/DataTypes/Serializations/SerializationUUID.h @@ -10,14 +10,16 @@ class SerializationUUID : public SimpleTextSerialization public: void serializeText(const IColumn & column, size_t row_num, WriteBuffer & ostr, const FormatSettings &) const override; void deserializeText(IColumn & column, ReadBuffer & istr, const FormatSettings & settings, bool whole) const override; - void serializeTextEscaped(const IColumn & column, size_t row_num, WriteBuffer & ostr, const FormatSettings &) const override; - void deserializeTextEscaped(IColumn & column, ReadBuffer & istr, const FormatSettings &) const override; + bool tryDeserializeText(IColumn & column, ReadBuffer & istr, const FormatSettings & settings, bool whole) const override; void serializeTextQuoted(const IColumn & column, size_t row_num, WriteBuffer & ostr, const FormatSettings &) const override; void deserializeTextQuoted(IColumn & column, ReadBuffer & istr, const FormatSettings &) const override; + bool tryDeserializeTextQuoted(IColumn & column, ReadBuffer & istr, const FormatSettings &) const override; void serializeTextJSON(const IColumn & column, size_t row_num, WriteBuffer & ostr, const FormatSettings &) const override; void deserializeTextJSON(IColumn & column, ReadBuffer & istr, const FormatSettings &) const override; + bool tryDeserializeTextJSON(IColumn & column, ReadBuffer & istr, const FormatSettings &) const override; void serializeTextCSV(const IColumn & column, size_t row_num, WriteBuffer & ostr, const FormatSettings &) const override; void deserializeTextCSV(IColumn & column, ReadBuffer & istr, const FormatSettings & settings) const override; + bool tryDeserializeTextCSV(IColumn & column, ReadBuffer & istr, const FormatSettings & settings) 
const override; void serializeBinary(const Field & field, WriteBuffer & ostr, const FormatSettings &) const override; void deserializeBinary(Field & field, ReadBuffer & istr, const FormatSettings &) const override; diff --git a/src/DataTypes/Serializations/SerializationVariant.cpp b/src/DataTypes/Serializations/SerializationVariant.cpp new file mode 100644 index 00000000000..ebd44fd6955 --- /dev/null +++ b/src/DataTypes/Serializations/SerializationVariant.cpp @@ -0,0 +1,828 @@ +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#include +#include +#include +#include +#include + +namespace DB +{ + +namespace ErrorCodes +{ + extern const int LOGICAL_ERROR; + extern const int NOT_IMPLEMENTED; + extern const int INCORRECT_DATA; +} + +void SerializationVariant::enumerateStreams( + EnumerateStreamsSettings & settings, + const StreamCallback & callback, + const SubstreamData & data) const +{ + const auto * type_variant = data.type ? &assert_cast(*data.type) : nullptr; + const auto * column_variant = data.column ? &assert_cast(*data.column) : nullptr; + + auto discriminators_serialization = std::make_shared(std::make_shared>(), "discr", false); + auto local_discriminators = column_variant ? column_variant->getLocalDiscriminatorsPtr() : nullptr; + + settings.path.push_back(Substream::VariantDiscriminators); + auto discriminators_data = SubstreamData(discriminators_serialization) + .withType(type_variant ? std::make_shared>() : nullptr) + .withColumn(column_variant ? column_variant->getLocalDiscriminatorsPtr() : nullptr) + .withSerializationInfo(data.serialization_info); + + settings.path.back().data = discriminators_data; + callback(settings.path); + settings.path.pop_back(); + + settings.path.push_back(Substream::VariantElements); + settings.path.back().data = data; + + for (size_t i = 0; i < variants.size(); ++i) + { + settings.path.back().creator = std::make_shared(local_discriminators, variant_names[i], i, column_variant ? column_variant->localDiscriminatorByGlobal(i) : i); + + auto variant_data = SubstreamData(variants[i]) + .withType(type_variant ? type_variant->getVariant(i) : nullptr) + .withColumn(column_variant ? 
column_variant->getVariantPtrByGlobalDiscriminator(i) : nullptr) + .withSerializationInfo(data.serialization_info); + + addVariantElementToPath(settings.path, i); + settings.path.back().data = variant_data; + variants[i]->enumerateStreams(settings, callback, variant_data); + settings.path.pop_back(); + } + + settings.path.pop_back(); +} + +struct SerializeBinaryBulkStateVariant : public ISerialization::SerializeBinaryBulkState +{ + std::vector states; +}; + +struct DeserializeBinaryBulkStateVariant : public ISerialization::DeserializeBinaryBulkState +{ + std::vector states; +}; + +void SerializationVariant::serializeBinaryBulkStatePrefix( + const IColumn & column, + SerializeBinaryBulkSettings & settings, + SerializeBinaryBulkStatePtr & state) const +{ + const ColumnVariant & col = assert_cast(column); + + auto variant_state = std::make_shared(); + variant_state->states.resize(variants.size()); + + settings.path.push_back(Substream::VariantElements); + + for (size_t i = 0; i < variants.size(); ++i) + { + addVariantElementToPath(settings.path, i); + variants[i]->serializeBinaryBulkStatePrefix(col.getVariantByGlobalDiscriminator(i), settings, variant_state->states[i]); + settings.path.pop_back(); + } + + settings.path.pop_back(); + state = std::move(variant_state); +} + + +void SerializationVariant::serializeBinaryBulkStateSuffix( + SerializeBinaryBulkSettings & settings, + SerializeBinaryBulkStatePtr & state) const +{ + auto * variant_state = checkAndGetState(state); + + settings.path.push_back(Substream::VariantElements); + for (size_t i = 0; i < variants.size(); ++i) + { + addVariantElementToPath(settings.path, i); + variants[i]->serializeBinaryBulkStateSuffix(settings, variant_state->states[i]); + settings.path.pop_back(); + } + settings.path.pop_back(); +} + + +void SerializationVariant::deserializeBinaryBulkStatePrefix( + DeserializeBinaryBulkSettings & settings, + DeserializeBinaryBulkStatePtr & state) const +{ + auto variant_state = std::make_shared(); + variant_state->states.resize(variants.size()); + + settings.path.push_back(Substream::VariantElements); + for (size_t i = 0; i < variants.size(); ++i) + { + addVariantElementToPath(settings.path, i); + variants[i]->deserializeBinaryBulkStatePrefix(settings, variant_state->states[i]); + settings.path.pop_back(); + } + + settings.path.pop_back(); + state = std::move(variant_state); +} + + +void SerializationVariant::serializeBinaryBulkWithMultipleStreams( + const IColumn & column, + size_t offset, + size_t limit, + SerializeBinaryBulkSettings & settings, + SerializeBinaryBulkStatePtr & state) const +{ + const ColumnVariant & col = assert_cast(column); + if (const size_t size = col.size(); limit == 0 || offset + limit > size) + limit = size - offset; + + settings.path.push_back(Substream::VariantDiscriminators); + auto * discriminators_stream = settings.getter(settings.path); + settings.path.pop_back(); + + if (!discriminators_stream) + throw Exception(ErrorCodes::LOGICAL_ERROR, "Got empty stream for VariantDiscriminators in SerializationVariant::serializeBinaryBulkWithMultipleStreams"); + + auto * variant_state = checkAndGetState(state); + + /// If offset = 0 and limit == col.size() or we have only NULLs, we don't need to calculate + /// offsets and limits for variants and need to just serialize whole columns. + if ((offset == 0 && limit == col.size()) || col.hasOnlyNulls()) + { + /// First, serialize discriminators. + /// If we have only NULLs or local and global discriminators are the same, just serialize the column as is. 
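+        /// (Illustration: the global discriminator of a variant is fixed by the canonical order of
+        /// variants in the Variant type, while the local one reflects the layout of this particular
+        /// column, so the two can differ between columns of the same type.)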
+ if (col.hasOnlyNulls() || col.hasGlobalVariantsOrder()) + { + SerializationNumber().serializeBinaryBulk(col.getLocalDiscriminatorsColumn(), *discriminators_stream, offset, limit); + } + /// If local and global discriminators are different, we should convert local to global before serializing (because we don't serialize the mapping). + else + { + const auto & local_discriminators = col.getLocalDiscriminators(); + for (size_t i = offset; i != offset + limit; ++i) + writeBinaryLittleEndian(col.globalDiscriminatorByLocal(local_discriminators[i]), *discriminators_stream); + } + + /// Second, serialize variants in global order. + settings.path.push_back(Substream::VariantElements); + for (size_t i = 0; i != variants.size(); ++i) + { + addVariantElementToPath(settings.path, i); + variants[i]->serializeBinaryBulkWithMultipleStreams(col.getVariantByGlobalDiscriminator(i), 0, 0, settings, variant_state->states[i]); + settings.path.pop_back(); + } + settings.path.pop_back(); + return; + } + + /// If we have only one non empty variant and no NULLs, we can use the same limit offset for this variant. + if (auto non_empty_local_discr = col.getLocalDiscriminatorOfOneNoneEmptyVariantNoNulls()) + { + /// First, serialize discriminators. + /// We know that all discriminators are the same, so we just need to serialize this discriminator limit times. + auto non_empty_global_discr = col.globalDiscriminatorByLocal(*non_empty_local_discr); + for (size_t i = 0; i != limit; ++i) + writeBinaryLittleEndian(non_empty_global_discr, *discriminators_stream); + + /// Second, serialize variants in global order. + settings.path.push_back(Substream::VariantElements); + for (size_t i = 0; i != variants.size(); ++i) + { + addVariantElementToPath(settings.path, i); + /// For non empty variant use the same offset/limit as for whole Variant column + if (i == non_empty_global_discr) + variants[i]->serializeBinaryBulkWithMultipleStreams(col.getVariantByGlobalDiscriminator(i), offset, limit, settings, variant_state->states[i]); + /// For empty variants, use just 0/0, they won't serialize anything. + else + variants[i]->serializeBinaryBulkWithMultipleStreams(col.getVariantByGlobalDiscriminator(i), 0, 0, settings, variant_state->states[i]); + settings.path.pop_back(); + } + settings.path.pop_back(); + return; + } + + /// In general case we should iterate through local discriminators in range [offset, offset + limit] to serialize global discriminators and calculate offset/limit pair for each variant. + const auto & local_discriminators = col.getLocalDiscriminators(); + const auto & offsets = col.getOffsets(); + std::vector> variant_offsets_and_limits(variants.size(), {0, 0}); + size_t end = offset + limit; + for (size_t i = offset; i < end; ++i) + { + auto global_discr = col.globalDiscriminatorByLocal(local_discriminators[i]); + writeBinaryLittleEndian(global_discr, *discriminators_stream); + + if (global_discr != ColumnVariant::NULL_DISCRIMINATOR) + { + /// If we see this discriminator for the first time, update offset + if (!variant_offsets_and_limits[global_discr].second) + variant_offsets_and_limits[global_discr].first = offsets[i]; + /// Update limit for this discriminator. + ++variant_offsets_and_limits[global_discr].second; + } + } + + /// If limit for some variant is 0, it means that we don't have its discriminator in the range. + /// Set offset to the size of column for such variants, so we won't serialize values from them. 
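+    /// (E.g. if variant 1 never appears among the serialized discriminators, its limit stays 0
+    /// and its offset is set below to the variant's full size, so no rows of it are written.)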
+ for (size_t i = 0; i != variant_offsets_and_limits.size(); ++i) + { + if (!variant_offsets_and_limits[i].second) + variant_offsets_and_limits[i].first = col.getVariantByGlobalDiscriminator(i).size(); + } + + /// Serialize variants in global order. + settings.path.push_back(Substream::VariantElements); + for (size_t i = 0; i != variants.size(); ++i) + { + addVariantElementToPath(settings.path, i); + variants[i]->serializeBinaryBulkWithMultipleStreams( + col.getVariantByGlobalDiscriminator(i), + variant_offsets_and_limits[i].first, + variant_offsets_and_limits[i].second, + settings, + variant_state->states[i]); + settings.path.pop_back(); + } + settings.path.pop_back(); +} + + +void SerializationVariant::deserializeBinaryBulkWithMultipleStreams( + ColumnPtr & column, + size_t limit, + DeserializeBinaryBulkSettings & settings, + DeserializeBinaryBulkStatePtr & state, + SubstreamsCache * cache) const +{ + auto mutable_column = column->assumeMutable(); + ColumnVariant & col = assert_cast(*mutable_column); + /// We always serialize Variant column with global variants order, + /// so while deserialization column should be always with global variants order. + if (!col.hasGlobalVariantsOrder()) + throw Exception(ErrorCodes::LOGICAL_ERROR, "Trying to deserialize data into Variant column with not global variants order"); + + /// First, deserialize new discriminators. + /// We deserialize them into a separate column to be able to use substream cache, + /// so if we also need to deserialize some of sub columns, we will read discriminators only once. + settings.path.push_back(Substream::VariantDiscriminators); + ColumnPtr discriminators; + if (auto cached_discriminators = getFromSubstreamsCache(cache, settings.path)) + { + discriminators = cached_discriminators; + } + else + { + auto * discriminators_stream = settings.getter(settings.path); + if (!discriminators_stream) + throw Exception(ErrorCodes::LOGICAL_ERROR, "Got empty stream for VariantDiscriminators in SerializationVariant::deserializeBinaryBulkWithMultipleStreams"); + + discriminators = ColumnVariant::ColumnDiscriminators::create(); + SerializationNumber().deserializeBinaryBulk(*discriminators->assumeMutable(), *discriminators_stream, limit, 0); + addToSubstreamsCache(cache, settings.path, discriminators); + } + + settings.path.pop_back(); + + /// Iterate through new discriminators, append them to column and calculate the limit for each variant. + /// While calculating limits we can also fill offsets column (we store offsets only in memory). + const auto & discriminators_data = assert_cast(*discriminators).getData(); + auto & local_discriminators = col.getLocalDiscriminators(); + local_discriminators.reserve(local_discriminators.size() + limit); + auto & offsets = col.getOffsets(); + offsets.reserve(offsets.size() + limit); + std::vector variant_limits(variants.size(), 0); + for (size_t i = 0; i != limit; ++i) + { + ColumnVariant::Discriminator discr = discriminators_data[i]; + local_discriminators.push_back(discr); + if (discr == ColumnVariant::NULL_DISCRIMINATOR) + offsets.emplace_back(); + else + offsets.push_back(col.getVariantByLocalDiscriminator(discr).size() + variant_limits[discr]++); + } + + /// Now we can deserialize variants according to their limits. 
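+    /// (E.g. new discriminators [1, NULL, 1, 0] give variant_limits = {1, 2}: one new row is
+    /// read into variant 0 and two new rows into variant 1.)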
+ auto * variant_state = checkAndGetState<DeserializeBinaryBulkStateVariant>(state);
+ settings.path.push_back(Substream::VariantElements);
+ for (size_t i = 0; i != variants.size(); ++i)
+ {
+ addVariantElementToPath(settings.path, i);
+ variants[i]->deserializeBinaryBulkWithMultipleStreams(col.getVariantPtrByLocalDiscriminator(i), variant_limits[i], settings, variant_state->states[i], cache);
+ settings.path.pop_back();
+ }
+ settings.path.pop_back();
+}
+
+void SerializationVariant::addVariantElementToPath(DB::ISerialization::SubstreamPath & path, size_t i) const
+{
+ path.push_back(Substream::VariantElement);
+ path.back().variant_element_name = variant_names[i];
+}
+
+void SerializationVariant::serializeBinary(const Field & /*field*/, WriteBuffer & /*ostr*/, const FormatSettings & /*settings*/) const
+{
+ throw Exception(ErrorCodes::NOT_IMPLEMENTED, "Method serializeBinary from a field is not implemented for SerializationVariant");
+}
+
+void SerializationVariant::deserializeBinary(Field & /*field*/, ReadBuffer & /*istr*/, const FormatSettings & /*settings*/) const
+{
+ throw Exception(ErrorCodes::NOT_IMPLEMENTED, "Method deserializeBinary to a field is not implemented for SerializationVariant");
+}
+
+void SerializationVariant::serializeBinary(const IColumn & column, size_t row_num, WriteBuffer & ostr, const FormatSettings & settings) const
+{
+ const ColumnVariant & col = assert_cast<const ColumnVariant &>(column);
+ auto global_discr = col.globalDiscriminatorAt(row_num);
+ writeBinaryLittleEndian(global_discr, ostr);
+ if (global_discr != ColumnVariant::NULL_DISCRIMINATOR)
+ variants[global_discr]->serializeBinary(col.getVariantByGlobalDiscriminator(global_discr), col.offsetAt(row_num), ostr, settings);
+}
+
+void SerializationVariant::deserializeBinary(IColumn & column, ReadBuffer & istr, const FormatSettings & settings) const
+{
+ ColumnVariant & col = assert_cast<ColumnVariant &>(column);
+ ColumnVariant::Discriminator global_discr;
+ readBinaryLittleEndian(global_discr, istr);
+ if (global_discr == ColumnVariant::NULL_DISCRIMINATOR)
+ {
+ col.insertDefault();
+ }
+ else
+ {
+ auto & variant_column = col.getVariantByGlobalDiscriminator(global_discr);
+ variants[global_discr]->deserializeBinary(variant_column, istr, settings);
+ col.getLocalDiscriminators().push_back(col.localDiscriminatorByGlobal(global_discr));
+ col.getOffsets().push_back(variant_column.size() - 1);
+ }
+}
+
+namespace
+{
+
+std::unordered_map<TypeIndex, size_t> getTypesTextDeserializePriorityMap()
+{
+ static const std::vector<TypeIndex> priorities = {
+ /// Complex types have the highest priority.
+ TypeIndex::Array,
+ TypeIndex::Tuple,
+ TypeIndex::Map,
+ TypeIndex::AggregateFunction,
+
+ /// Enums can be parsed both from strings and numbers.
+ /// So they have high enough priority.
+ TypeIndex::Enum8,
+ TypeIndex::Enum16,
+
+ /// Types that can be parsed from strings.
+ TypeIndex::UUID,
+ TypeIndex::IPv4,
+ TypeIndex::IPv6,
+
+ /// Types that can be parsed from numbers.
+ /// The order:
+ /// 1) Integers
+ /// 2) Big Integers
+ /// 3) Decimals
+ /// 4) Floats
+ /// In each group smaller types have higher priority.
+ TypeIndex::Int8,
+ TypeIndex::UInt8,
+ TypeIndex::Int16,
+ TypeIndex::UInt16,
+ TypeIndex::Int32,
+ TypeIndex::UInt32,
+ TypeIndex::Int64,
+ TypeIndex::UInt64,
+ TypeIndex::Int128,
+ TypeIndex::UInt128,
+ TypeIndex::Int256,
+ TypeIndex::UInt256,
+ TypeIndex::Decimal32,
+ TypeIndex::Decimal64,
+ TypeIndex::Decimal128,
+ TypeIndex::Decimal256,
+ TypeIndex::Float32,
+ TypeIndex::Float64,
+
+ /// Dates and DateTimes. Simpler Date types have higher priority.
+ /// They have lower priority than numbers, because some DateTimes can sometimes
+ /// also be parsed from numbers, and we usually don't want that.
+ TypeIndex::Date,
+ TypeIndex::Date32,
+ TypeIndex::DateTime,
+ TypeIndex::DateTime64,
+
+ /// String types have almost the lowest priority,
+ /// as in text formats almost all data can
+ /// be deserialized into the String type.
+ TypeIndex::FixedString,
+ TypeIndex::String,
+ };
+
+ std::unordered_map<TypeIndex, size_t> priority_map;
+ priority_map.reserve(priorities.size());
+ for (size_t i = 0; i != priorities.size(); ++i)
+ priority_map[priorities[i]] = priorities.size() - i;
+ return priority_map;
+}
+
+/// We want to create a more or less optimal order of types in which we will try text deserializations.
+/// To do it, for each type we calculate a priority and then sort the types by this priority.
+/// Above we defined the priority of each data type, but types can be nested, and we can also have LowCardinality and Nullable.
+/// To sort arbitrarily nested types we create a priority that is a tuple of 3 elements:
+/// 1) The maximum depth of nested types like Array/Map/Tuple.
+/// 2) The combination of simple and complex type priorities.
+/// 3) The nesting depth of LowCardinality/Nullable types.
+/// So, when we sort types, we first sort by the maximum depth of nested types, so more nested types are deserialized first;
+/// then, for types with the same depth, we sort by the type priority; and finally we sort by the depth of LowCardinality/Nullable types,
+/// so if we have types with the same level of nesting and the same priority, we will first try to deserialize the LowCardinality/Nullable ones
+/// (for example, if we have the types Array(Array(String)) and Array(Array(Nullable(String)))).
+/// This is just a set of heuristics.
+std::tuple<size_t, size_t, size_t> getTypeTextDeserializePriority(const DataTypePtr & type, size_t nested_depth, size_t simple_nested_depth, std::unordered_map<TypeIndex, size_t> & priority_map)
+{
+ if (const auto * nullable_type = typeid_cast<const DataTypeNullable *>(type.get()))
+ return getTypeTextDeserializePriority(nullable_type->getNestedType(), nested_depth, simple_nested_depth + 1, priority_map);
+
+ if (const auto * lc_type = typeid_cast<const DataTypeLowCardinality *>(type.get()))
+ return getTypeTextDeserializePriority(lc_type->getDictionaryType(), nested_depth, simple_nested_depth + 1, priority_map);
+
+ if (const auto * array_type = typeid_cast<const DataTypeArray *>(type.get()))
+ {
+ auto [elements_nested_depth, elements_priority, elements_simple_nested_depth] = getTypeTextDeserializePriority(array_type->getNestedType(), nested_depth + 1, simple_nested_depth, priority_map);
+ return {elements_nested_depth, elements_priority + priority_map[TypeIndex::Array], elements_simple_nested_depth};
+ }
+
+ if (const auto * tuple_type = typeid_cast<const DataTypeTuple *>(type.get()))
+ {
+ size_t max_nested_depth = 0;
+ size_t sum_priority = 0;
+ size_t max_simple_nested_depth = 0;
+ for (const auto & elem : tuple_type->getElements())
+ {
+ auto [elem_nested_depth, elem_priority, elem_simple_nested_depth] = getTypeTextDeserializePriority(elem, nested_depth + 1, simple_nested_depth, priority_map);
+ sum_priority += elem_priority;
+ if (elem_nested_depth > max_nested_depth)
+ max_nested_depth = elem_nested_depth;
+ if (elem_simple_nested_depth > max_simple_nested_depth)
+ max_simple_nested_depth = elem_simple_nested_depth;
+ }
+
+ return {max_nested_depth, sum_priority + priority_map[TypeIndex::Tuple], max_simple_nested_depth};
+ }
+
+ if (const auto * map_type = typeid_cast<const DataTypeMap *>(type.get()))
+ {
+ auto [key_max_depth, key_priority, key_simple_nested_depth] = getTypeTextDeserializePriority(map_type->getKeyType(), nested_depth + 1, simple_nested_depth, priority_map);
+ auto [value_max_depth, value_priority, value_simple_nested_depth] = getTypeTextDeserializePriority(map_type->getValueType(), nested_depth + 1, simple_nested_depth, priority_map);
+ return {std::max(key_max_depth, value_max_depth), key_priority + value_priority + priority_map[TypeIndex::Map], std::max(key_simple_nested_depth, value_simple_nested_depth)};
+ }
+
+ if (const auto * variant_type = typeid_cast<const DataTypeVariant *>(type.get()))
+ {
+ size_t max_priority = 0;
+ size_t max_depth = 0;
+ size_t max_simple_nested_depth = 0;
+ for (const auto & variant : variant_type->getVariants())
+ {
+ auto [variant_max_depth, variant_priority, variant_simple_nested_depth] = getTypeTextDeserializePriority(variant, nested_depth, simple_nested_depth, priority_map);
+ if (variant_priority > max_priority)
+ max_priority = variant_priority;
+ if (variant_max_depth > max_depth)
+ max_depth = variant_max_depth;
+ if (variant_simple_nested_depth > max_simple_nested_depth)
+ max_simple_nested_depth = variant_simple_nested_depth;
+ }
+
+ return {max_depth, max_priority, max_simple_nested_depth};
+ }
+
+ return {nested_depth, priority_map[type->getTypeId()], simple_nested_depth};
+}
+
+}
+
+std::vector<size_t> SerializationVariant::getVariantsDeserializeTextOrder(const DB::DataTypes & variant_types)
+{
+ std::vector<std::tuple<size_t, size_t, size_t>> priorities;
+ priorities.reserve(variant_types.size());
+ std::vector<size_t> order;
+ order.reserve(variant_types.size());
+ auto priority_map = getTypesTextDeserializePriorityMap();
+ for (size_t i = 0; i != variant_types.size(); ++i)
+ {
+ priorities.push_back(getTypeTextDeserializePriority(variant_types[i], 0, 0, priority_map));
+ order.push_back(i);
+ }
+
+ std::sort(order.begin(), order.end(), [&](size_t left, size_t right) { return priorities[left] > priorities[right]; });
+ return order;
+}
+
+
+bool SerializationVariant::tryDeserializeImpl(
+ IColumn & column,
+ const String & field,
+ std::function<bool(ReadBuffer &)> check_for_null,
+ std::function<bool(IColumn &, const SerializationPtr &, ReadBuffer &)> try_deserialize_variant) const
+{
+ auto & column_variant = assert_cast<ColumnVariant &>(column);
+ ReadBufferFromString null_buf(field);
+ if (check_for_null(null_buf) && null_buf.eof())
+ {
+ column_variant.insertDefault();
+ return true;
+ }
+
+ for (size_t global_discr : deserialize_text_order)
+ {
+ ReadBufferFromString variant_buf(field);
+ /// Usually try_deserialize_variant should not throw an exception, but let's use try/catch just in case.
+ try
+ {
+ auto & variant_column = column_variant.getVariantByGlobalDiscriminator(global_discr);
+ size_t prev_size = variant_column.size();
+ if (try_deserialize_variant(variant_column, variants[global_discr], variant_buf) && variant_buf.eof())
+ {
+ column_variant.getLocalDiscriminators().push_back(column_variant.localDiscriminatorByGlobal(global_discr));
+ column_variant.getOffsets().push_back(prev_size);
+ return true;
+ }
+ else if (variant_column.size() > prev_size)
+ {
+ variant_column.popBack(1);
+ }
+ }
+ catch (...)
+ {
+ /// Try the next variant.
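+ /// For example, for Variant(String, UInt64) the field "abc" fails to parse as UInt64
+ /// (which is tried first, as numbers have higher priority than strings in
+ /// getTypesTextDeserializePriorityMap above) and is parsed as String on the next iteration.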
+ } + } + + return false; +} + +void SerializationVariant::serializeTextEscaped(const IColumn & column, size_t row_num, WriteBuffer & ostr, const FormatSettings & settings) const +{ + const ColumnVariant & col = assert_cast(column); + auto global_discr = col.globalDiscriminatorAt(row_num); + if (global_discr == ColumnVariant::NULL_DISCRIMINATOR) + SerializationNullable::serializeNullEscaped(ostr, settings); + else + variants[global_discr]->serializeTextEscaped(col.getVariantByGlobalDiscriminator(global_discr), col.offsetAt(row_num), ostr, settings); +} + +bool SerializationVariant::tryDeserializeTextEscaped(IColumn & column, ReadBuffer & istr, const FormatSettings & settings) const +{ + String field; + readEscapedString(field, istr); + return tryDeserializeTextEscapedImpl(column, field, settings); +} + +void SerializationVariant::deserializeTextEscaped(IColumn & column, ReadBuffer & istr, const FormatSettings & settings) const +{ + String field; + readEscapedString(field, istr); + if (!tryDeserializeTextEscapedImpl(column, field, settings)) + throw Exception(ErrorCodes::INCORRECT_DATA, "Cannot parse escaped value of type {} here: {}", variant_name, field); +} + +bool SerializationVariant::tryDeserializeTextEscapedImpl(DB::IColumn & column, const String & field, const DB::FormatSettings & settings) const +{ + auto check_for_null = [&](ReadBuffer & buf) + { + return SerializationNullable::tryDeserializeNullEscaped(buf, settings); + }; + auto try_deserialize_variant =[&](IColumn & variant_column, const SerializationPtr & variant_serialization, ReadBuffer & buf) + { + return variant_serialization->tryDeserializeTextEscaped(variant_column, buf, settings); + }; + + return tryDeserializeImpl(column, field, check_for_null, try_deserialize_variant); +} + +void SerializationVariant::serializeTextRaw(const IColumn & column, size_t row_num, WriteBuffer & ostr, const FormatSettings & settings) const +{ + const ColumnVariant & col = assert_cast(column); + auto global_discr = col.globalDiscriminatorAt(row_num); + if (global_discr == ColumnVariant::NULL_DISCRIMINATOR) + SerializationNullable::serializeNullRaw(ostr, settings); + else + variants[global_discr]->serializeTextRaw(col.getVariantByGlobalDiscriminator(global_discr), col.offsetAt(row_num), ostr, settings); +} + +bool SerializationVariant::tryDeserializeTextRaw(IColumn & column, ReadBuffer & istr, const FormatSettings & settings) const +{ + String field; + readString(field, istr); + return tryDeserializeTextRawImpl(column, field, settings); +} + +void SerializationVariant::deserializeTextRaw(IColumn & column, ReadBuffer & istr, const FormatSettings & settings) const +{ + String field; + readString(field, istr); + if (!tryDeserializeTextRawImpl(column, field, settings)) + throw Exception(ErrorCodes::INCORRECT_DATA, "Cannot parse raw value of type {} here: {}", variant_name, field); +} + +bool SerializationVariant::tryDeserializeTextRawImpl(DB::IColumn & column, const String & field, const DB::FormatSettings & settings) const +{ + auto check_for_null = [&](ReadBuffer & buf) + { + return SerializationNullable::tryDeserializeNullRaw(buf, settings); + }; + auto try_deserialize_variant =[&](IColumn & variant_column, const SerializationPtr & variant_serialization, ReadBuffer & buf) + { + return variant_serialization->tryDeserializeTextRaw(variant_column, buf, settings); + }; + + return tryDeserializeImpl(column, field, check_for_null, try_deserialize_variant); +} + +void SerializationVariant::serializeTextQuoted(const IColumn & column, size_t row_num, 
WriteBuffer & ostr, const FormatSettings & settings) const +{ + const ColumnVariant & col = assert_cast(column); + auto global_discr = col.globalDiscriminatorAt(row_num); + if (global_discr == ColumnVariant::NULL_DISCRIMINATOR) + SerializationNullable::serializeNullQuoted(ostr); + else + variants[global_discr]->serializeTextQuoted(col.getVariantByGlobalDiscriminator(global_discr), col.offsetAt(row_num), ostr, settings); +} + +bool SerializationVariant::tryDeserializeTextQuoted(IColumn & column, ReadBuffer & istr, const FormatSettings & settings) const +{ + String field; + if (!tryReadQuotedField(field, istr)) + return false; + return tryDeserializeTextQuotedImpl(column, field, settings); +} + +void SerializationVariant::deserializeTextQuoted(IColumn & column, ReadBuffer & istr, const FormatSettings & settings) const +{ + String field; + readQuotedField(field, istr); + if (!tryDeserializeTextQuotedImpl(column, field, settings)) + throw Exception(ErrorCodes::INCORRECT_DATA, "Cannot parse quoted value of type {} here: {}", variant_name, field); +} + +bool SerializationVariant::tryDeserializeTextQuotedImpl(DB::IColumn & column, const String & field, const DB::FormatSettings & settings) const +{ + auto check_for_null = [&](ReadBuffer & buf) + { + return SerializationNullable::tryDeserializeNullQuoted(buf); + }; + auto try_deserialize_variant =[&](IColumn & variant_column, const SerializationPtr & variant_serialization, ReadBuffer & buf) + { + return variant_serialization->tryDeserializeTextQuoted(variant_column, buf, settings); + }; + + return tryDeserializeImpl(column, field, check_for_null, try_deserialize_variant); +} + +void SerializationVariant::serializeTextCSV(const IColumn & column, size_t row_num, WriteBuffer & ostr, const FormatSettings & settings) const +{ + const ColumnVariant & col = assert_cast(column); + auto global_discr = col.globalDiscriminatorAt(row_num); + if (global_discr == ColumnVariant::NULL_DISCRIMINATOR) + SerializationNullable::serializeNullCSV(ostr, settings); + else + variants[global_discr]->serializeTextCSV(col.getVariantByGlobalDiscriminator(global_discr), col.offsetAt(row_num), ostr, settings); +} + +bool SerializationVariant::tryDeserializeTextCSV(IColumn & column, ReadBuffer & istr, const FormatSettings & settings) const +{ + String field; + readCSVStringInto(field, istr, settings.csv); + return tryDeserializeTextCSVImpl(column, field, settings); +} + +void SerializationVariant::deserializeTextCSV(IColumn & column, ReadBuffer & istr, const FormatSettings & settings) const +{ + String field; + readCSVField(field, istr, settings.csv); + if (!tryDeserializeTextCSVImpl(column, field, settings)) + throw Exception(ErrorCodes::INCORRECT_DATA, "Cannot parse CSV value of type {} here: {}", variant_name, field); +} + +bool SerializationVariant::tryDeserializeTextCSVImpl(DB::IColumn & column, const String & field, const DB::FormatSettings & settings) const +{ + auto check_for_null = [&](ReadBuffer & buf) + { + return SerializationNullable::tryDeserializeNullCSV(buf, settings); + }; + auto try_deserialize_variant =[&](IColumn & variant_column, const SerializationPtr & variant_serialization, ReadBuffer & buf) + { + return variant_serialization->tryDeserializeTextCSV(variant_column, buf, settings); + }; + + return tryDeserializeImpl(column, field, check_for_null, try_deserialize_variant); +} + +void SerializationVariant::serializeText(const IColumn & column, size_t row_num, WriteBuffer & ostr, const FormatSettings & settings) const +{ + const ColumnVariant & col = 
assert_cast(column); + auto global_discr = col.globalDiscriminatorAt(row_num); + if (global_discr == ColumnVariant::NULL_DISCRIMINATOR) + SerializationNullable::serializeNullText(ostr, settings); + else + variants[global_discr]->serializeText(col.getVariantByGlobalDiscriminator(global_discr), col.offsetAt(row_num), ostr, settings); +} + +bool SerializationVariant::tryDeserializeWholeText(IColumn & column, ReadBuffer & istr, const FormatSettings & settings) const +{ + String field; + readStringUntilEOF(field, istr); + return tryDeserializeWholeTextImpl(column, field, settings); +} + +void SerializationVariant::deserializeWholeText(IColumn & column, ReadBuffer & istr, const FormatSettings & settings) const +{ + String field; + readStringUntilEOF(field, istr); + if (!tryDeserializeWholeTextImpl(column, field, settings)) + throw Exception(ErrorCodes::INCORRECT_DATA, "Cannot parse text value of type {} here: {}", variant_name, field); +} + +bool SerializationVariant::tryDeserializeWholeTextImpl(DB::IColumn & column, const String & field, const DB::FormatSettings & settings) const +{ + auto check_for_null = [&](ReadBuffer & buf) + { + return SerializationNullable::tryDeserializeNullText(buf); + }; + auto try_deserialize_variant =[&](IColumn & variant_column, const SerializationPtr & variant_serialization, ReadBuffer & buf) + { + return variant_serialization->tryDeserializeWholeText(variant_column, buf, settings); + }; + + return tryDeserializeImpl(column, field, check_for_null, try_deserialize_variant); +} + +void SerializationVariant::serializeTextJSON(const IColumn & column, size_t row_num, WriteBuffer & ostr, const FormatSettings & settings) const +{ + const ColumnVariant & col = assert_cast(column); + auto global_discr = col.globalDiscriminatorAt(row_num); + if (global_discr == ColumnVariant::NULL_DISCRIMINATOR) + SerializationNullable::serializeNullJSON(ostr); + else + variants[global_discr]->serializeTextJSON(col.getVariantByGlobalDiscriminator(global_discr), col.offsetAt(row_num), ostr, settings); +} + +bool SerializationVariant::tryDeserializeTextJSON(IColumn & column, ReadBuffer & istr, const FormatSettings & settings) const +{ + String field; + if (!tryReadJSONField(field, istr)) + return false; + return tryDeserializeTextJSONImpl(column, field, settings); +} + +void SerializationVariant::deserializeTextJSON(IColumn & column, ReadBuffer & istr, const FormatSettings & settings) const +{ + String field; + readJSONField(field, istr); + if (!tryDeserializeTextJSONImpl(column, field, settings)) + throw Exception(ErrorCodes::INCORRECT_DATA, "Cannot parse JSON value of type {} here: {}", variant_name, field); +} + +bool SerializationVariant::tryDeserializeTextJSONImpl(DB::IColumn & column, const String & field, const DB::FormatSettings & settings) const +{ + auto check_for_null = [&](ReadBuffer & buf) + { + return SerializationNullable::tryDeserializeNullJSON(buf); + }; + auto try_deserialize_variant =[&](IColumn & variant_column, const SerializationPtr & variant_serialization, ReadBuffer & buf) + { + return variant_serialization->tryDeserializeTextJSON(variant_column, buf, settings); + }; + + return tryDeserializeImpl(column, field, check_for_null, try_deserialize_variant); +} + +void SerializationVariant::serializeTextXML(const IColumn & column, size_t row_num, WriteBuffer & ostr, const FormatSettings & settings) const +{ + const ColumnVariant & col = assert_cast(column); + auto global_discr = col.globalDiscriminatorAt(row_num); + if (global_discr == ColumnVariant::NULL_DISCRIMINATOR) + 
SerializationNullable::serializeNullXML(ostr);
+ else
+ variants[global_discr]->serializeTextXML(col.getVariantByGlobalDiscriminator(global_discr), col.offsetAt(row_num), ostr, settings);
+}
+
+}
diff --git a/src/DataTypes/Serializations/SerializationVariant.h b/src/DataTypes/Serializations/SerializationVariant.h
new file mode 100644
index 00000000000..b6bee94c65f
--- /dev/null
+++ b/src/DataTypes/Serializations/SerializationVariant.h
@@ -0,0 +1,116 @@
+#pragma once
+
+#include
+#include
+
+namespace DB
+{
+
+class SerializationVariant : public ISerialization
+{
+public:
+ using VariantSerializations = std::vector<SerializationPtr>;
+
+ explicit SerializationVariant(
+ const VariantSerializations & variants_,
+ const std::vector<String> & variant_names_,
+ const std::vector<size_t> & deserialize_text_order_,
+ const String & variant_name_)
+ : variants(variants_), variant_names(variant_names_), deserialize_text_order(deserialize_text_order_), variant_name(variant_name_)
+ {
+ }
+
+ void enumerateStreams(
+ EnumerateStreamsSettings & settings,
+ const StreamCallback & callback,
+ const SubstreamData & data) const override;
+
+ void serializeBinaryBulkStatePrefix(
+ const IColumn & column,
+ SerializeBinaryBulkSettings & settings,
+ SerializeBinaryBulkStatePtr & state) const override;
+
+ void serializeBinaryBulkStateSuffix(
+ SerializeBinaryBulkSettings & settings,
+ SerializeBinaryBulkStatePtr & state) const override;
+
+ void deserializeBinaryBulkStatePrefix(
+ DeserializeBinaryBulkSettings & settings,
+ DeserializeBinaryBulkStatePtr & state) const override;
+
+ void serializeBinaryBulkWithMultipleStreams(
+ const IColumn & column,
+ size_t offset,
+ size_t limit,
+ SerializeBinaryBulkSettings & settings,
+ SerializeBinaryBulkStatePtr & state) const override;
+
+ void deserializeBinaryBulkWithMultipleStreams(
+ ColumnPtr & column,
+ size_t limit,
+ DeserializeBinaryBulkSettings & settings,
+ DeserializeBinaryBulkStatePtr & state,
+ SubstreamsCache * cache) const override;
+
+ void serializeBinary(const Field & field, WriteBuffer & ostr, const FormatSettings & settings) const override;
+ void deserializeBinary(Field & field, ReadBuffer & istr, const FormatSettings & settings) const override;
+
+ void serializeBinary(const IColumn & column, size_t row_num, WriteBuffer & ostr, const FormatSettings & settings) const override;
+ void deserializeBinary(IColumn & column, ReadBuffer & istr, const FormatSettings & settings) const override;
+
+ void serializeTextEscaped(const IColumn & column, size_t row_num, WriteBuffer & ostr, const FormatSettings & settings) const override;
+ void deserializeTextEscaped(IColumn & column, ReadBuffer & istr, const FormatSettings & settings) const override;
+ bool tryDeserializeTextEscaped(IColumn & column, ReadBuffer & istr, const FormatSettings & settings) const override;
+
+ void serializeTextQuoted(const IColumn & column, size_t row_num, WriteBuffer & ostr, const FormatSettings & settings) const override;
+ void deserializeTextQuoted(IColumn & column, ReadBuffer & istr, const FormatSettings & settings) const override;
+ bool tryDeserializeTextQuoted(IColumn & column, ReadBuffer & istr, const FormatSettings & settings) const override;
+
+ void serializeText(const IColumn & column, size_t row_num, WriteBuffer & ostr, const FormatSettings & settings) const override;
+ void deserializeWholeText(IColumn & column, ReadBuffer & istr, const FormatSettings & settings) const override;
+ bool tryDeserializeWholeText(IColumn & column, ReadBuffer & istr, const FormatSettings & settings) const override;
+
+ 
void serializeTextCSV(const IColumn & column, size_t row_num, WriteBuffer & ostr, const FormatSettings & settings) const override;
+ void deserializeTextCSV(IColumn & column, ReadBuffer & istr, const FormatSettings & settings) const override;
+ bool tryDeserializeTextCSV(IColumn & column, ReadBuffer & istr, const FormatSettings & settings) const override;
+
+ void serializeTextJSON(const IColumn & column, size_t row_num, WriteBuffer & ostr, const FormatSettings & settings) const override;
+ void deserializeTextJSON(IColumn & column, ReadBuffer & istr, const FormatSettings & settings) const override;
+ bool tryDeserializeTextJSON(IColumn & column, ReadBuffer & istr, const FormatSettings & settings) const override;
+
+ void serializeTextRaw(const IColumn & column, size_t row_num, WriteBuffer & ostr, const FormatSettings & settings) const override;
+ void deserializeTextRaw(IColumn & column, ReadBuffer & istr, const FormatSettings & settings) const override;
+ bool tryDeserializeTextRaw(IColumn & column, ReadBuffer & istr, const FormatSettings & settings) const override;
+
+ void serializeTextXML(const IColumn & column, size_t row_num, WriteBuffer & ostr, const FormatSettings & settings) const override;
+
+ /// Determine the order in which we should try to deserialize variants.
+ /// In some cases the text representation of a value can be deserialized
+ /// into several types (for example, almost all text values can be deserialized
+ /// into the String type), so we use some heuristics to determine a more optimal order.
+ static std::vector<size_t> getVariantsDeserializeTextOrder(const DataTypes & variant_types);
+
+private:
+ void addVariantElementToPath(SubstreamPath & path, size_t i) const;
+
+ bool tryDeserializeTextEscapedImpl(IColumn & column, const String & field, const FormatSettings & settings) const;
+ bool tryDeserializeTextQuotedImpl(IColumn & column, const String & field, const FormatSettings & settings) const;
+ bool tryDeserializeWholeTextImpl(IColumn & column, const String & field, const FormatSettings & settings) const;
+ bool tryDeserializeTextCSVImpl(IColumn & column, const String & field, const FormatSettings & settings) const;
+ bool tryDeserializeTextJSONImpl(IColumn & column, const String & field, const FormatSettings & settings) const;
+ bool tryDeserializeTextRawImpl(IColumn & column, const String & field, const FormatSettings & settings) const;
+
+ bool tryDeserializeImpl(
+ IColumn & column,
+ const String & field,
+ std::function<bool(ReadBuffer &)> check_for_null,
+ std::function<bool(IColumn &, const SerializationPtr &, ReadBuffer &)> try_deserialize_nested) const;
+
+ VariantSerializations variants;
+ std::vector<String> variant_names;
+ std::vector<size_t> deserialize_text_order;
+ /// Name of the Variant data type for better exception messages.
+ String variant_name;
+};
+
+}
diff --git a/src/DataTypes/Serializations/SerializationVariantElement.cpp b/src/DataTypes/Serializations/SerializationVariantElement.cpp
new file mode 100644
index 00000000000..4b24ee5754e
--- /dev/null
+++ b/src/DataTypes/Serializations/SerializationVariantElement.cpp
@@ -0,0 +1,241 @@
+#include
+#include
+#include
+#include
+
+namespace DB
+{
+
+namespace ErrorCodes
+{
+ extern const int NOT_IMPLEMENTED;
+ extern const int LOGICAL_ERROR;
+}
+
+void SerializationVariantElement::enumerateStreams(
+ DB::ISerialization::EnumerateStreamsSettings & settings,
+ const DB::ISerialization::StreamCallback & callback,
+ const DB::ISerialization::SubstreamData & data) const
+{
+ /// We will need the discriminators stream during deserialization.
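+ /// For example, reading the subcolumn `v.UInt64` of a column `v Variant(String, UInt64)`
+ /// requires the shared discriminators stream as well as the streams of the UInt64 variant itself.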
+ settings.path.push_back(Substream::VariantDiscriminators);
+ callback(settings.path);
+ settings.path.pop_back();
+
+ addVariantToPath(settings.path);
+ settings.path.back().data = data;
+ nested_serialization->enumerateStreams(settings, callback, data);
+ removeVariantFromPath(settings.path);
+}
+
+void SerializationVariantElement::serializeBinaryBulkStatePrefix(const IColumn &, SerializeBinaryBulkSettings &, SerializeBinaryBulkStatePtr &) const
+{
+ throw Exception(
+ ErrorCodes::NOT_IMPLEMENTED, "Method serializeBinaryBulkStatePrefix is not implemented for SerializationVariantElement");
+}
+
+void SerializationVariantElement::serializeBinaryBulkStateSuffix(SerializeBinaryBulkSettings &, SerializeBinaryBulkStatePtr &) const
+{
+ throw Exception(
+ ErrorCodes::NOT_IMPLEMENTED, "Method serializeBinaryBulkStateSuffix is not implemented for SerializationVariantElement");
+}
+
+void SerializationVariantElement::deserializeBinaryBulkStatePrefix(DeserializeBinaryBulkSettings & settings, DeserializeBinaryBulkStatePtr & state) const
+{
+ addVariantToPath(settings.path);
+ nested_serialization->deserializeBinaryBulkStatePrefix(settings, state);
+ removeVariantFromPath(settings.path);
+}
+
+void SerializationVariantElement::serializeBinaryBulkWithMultipleStreams(const IColumn &, size_t, size_t, SerializeBinaryBulkSettings &, SerializeBinaryBulkStatePtr &) const
+{
+ throw Exception(ErrorCodes::NOT_IMPLEMENTED, "Method serializeBinaryBulkWithMultipleStreams is not implemented for SerializationVariantElement");
+}
+
+void SerializationVariantElement::deserializeBinaryBulkWithMultipleStreams(
+ ColumnPtr & column,
+ size_t limit,
+ DeserializeBinaryBulkSettings & settings,
+ DeserializeBinaryBulkStatePtr & state,
+ SubstreamsCache * cache) const
+{
+ auto mutable_column = column->assumeMutable();
+ ColumnNullable * nullable_col = typeid_cast<ColumnNullable *>(mutable_column.get());
+ NullMap * null_map = nullable_col ? &nullable_col->getNullMapData() : nullptr;
+
+ /// First, deserialize discriminators from the Variant column.
+ settings.path.push_back(Substream::VariantDiscriminators);
+ ColumnPtr discriminators;
+ if (auto cached_discriminators = getFromSubstreamsCache(cache, settings.path))
+ {
+ discriminators = cached_discriminators;
+ }
+ else
+ {
+ auto * discriminators_stream = settings.getter(settings.path);
+ if (!discriminators_stream)
+ throw Exception(ErrorCodes::LOGICAL_ERROR, "Got empty stream for VariantDiscriminators in SerializationVariantElement::deserializeBinaryBulkWithMultipleStreams");
+
+ discriminators = ColumnVariant::ColumnDiscriminators::create();
+ SerializationNumber<ColumnVariant::Discriminator>().deserializeBinaryBulk(*discriminators->assumeMutable(), *discriminators_stream, limit, 0);
+ addToSubstreamsCache(cache, settings.path, discriminators);
+ }
+ settings.path.pop_back();
+
+ /// Iterate through the discriminators to calculate the size of the variant.
+ const auto & discriminators_data = assert_cast<const ColumnVariant::ColumnDiscriminators &>(*discriminators).getData();
+ size_t variant_size = 0;
+ for (auto discr : discriminators_data)
+ variant_size += discr == variant_discriminator;
+
+ /// Now we know the size of the variant and can deserialize it.
+
+ /// If the size of the variant is the same as the size of the discriminators, then all rows
+ /// in the range contain this variant and we can deserialize new values directly into our column.
+ if (variant_size == discriminators_data.size())
+ {
+ addVariantToPath(settings.path);
+ /// Special case when our result column is LowCardinality(Nullable(T)).
+ /// In this case the variant type is LowCardinality(T), and we cannot just
+ /// deserialize its values directly into a LowCardinality(Nullable(T)) column.
+ /// We create a separate column with type LowCardinality(T), deserialize
+ /// values into it and then insert them into the result column using insertRangeFrom.
+ if (isColumnLowCardinalityNullable(*column))
+ {
+ ColumnPtr variant_col = mutable_column->cloneEmpty();
+ /// LowCardinality(Nullable(T)) -> LowCardinality(T)
+ assert_cast<ColumnLowCardinality &>(*variant_col->assumeMutable()).nestedRemoveNullable();
+ nested_serialization->deserializeBinaryBulkWithMultipleStreams(variant_col, limit, settings, state, cache);
+ mutable_column->insertRangeFrom(*variant_col, 0, variant_col->size());
+ }
+ else
+ {
+ nested_serialization->deserializeBinaryBulkWithMultipleStreams(nullable_col ? nullable_col->getNestedColumnPtr() : column, limit, settings, state, cache);
+ }
+ if (nullable_col)
+ null_map->resize_fill(null_map->size() + limit, 0);
+ removeVariantFromPath(settings.path);
+ return;
+ }
+
+ /// If the variant size is 0, just fill the column with default values.
+ if (variant_size == 0)
+ {
+ mutable_column->insertManyDefaults(limit);
+ return;
+ }
+
+ /// In the general case we should deserialize the variant into a separate column,
+ /// iterate through the discriminators and insert values from the variant only for
+ /// rows that contain its discriminator, inserting default values otherwise.
+ mutable_column->reserve(mutable_column->size() + limit);
+ mutable_column = nullable_col ? nullable_col->getNestedColumnPtr()->assumeMutable() : std::move(mutable_column);
+ ColumnPtr variant_col = mutable_column->cloneEmpty();
+
+ /// Special case when our result column is LowCardinality(Nullable(T)).
+ /// We should remove Nullable from the variant column before deserialization.
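+ /// For example, if the discriminators are [d, x, d] (where d is our variant's discriminator)
+ /// and the deserialized variant holds values [a, b], the result column becomes [a, <default>, b]
+ /// with null-map [0, 1, 0] when the result column is nullable.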
+ if (isColumnLowCardinalityNullable(*column))
+ assert_cast<ColumnLowCardinality &>(*variant_col->assumeMutable()).nestedRemoveNullable();
+
+ addVariantToPath(settings.path);
+ nested_serialization->deserializeBinaryBulkWithMultipleStreams(variant_col, variant_size, settings, state, cache);
+ removeVariantFromPath(settings.path);
+
+ size_t variant_index = 0;
+ for (auto discr : discriminators_data)
+ {
+ if (discr == variant_discriminator)
+ {
+ if (null_map)
+ null_map->push_back(0);
+ mutable_column->insertFrom(*variant_col, variant_index++);
+ }
+ else
+ {
+ if (null_map)
+ null_map->push_back(1);
+ mutable_column->insertDefault();
+ }
+ }
+}
+
+void SerializationVariantElement::addVariantToPath(DB::ISerialization::SubstreamPath & path) const
+{
+ path.push_back(Substream::VariantElements);
+ path.push_back(Substream::VariantElement);
+ path.back().variant_element_name = variant_element_name;
+}
+
+void SerializationVariantElement::removeVariantFromPath(DB::ISerialization::SubstreamPath & path) const
+{
+ path.pop_back();
+ path.pop_back();
+}
+
+SerializationVariantElement::VariantSubcolumnCreator::VariantSubcolumnCreator(
+ const DB::ColumnPtr & local_discriminators_,
+ const DB::String & variant_element_name_,
+ const ColumnVariant::Discriminator global_variant_discriminator_,
+ const ColumnVariant::Discriminator local_variant_discriminator_)
+ : local_discriminators(local_discriminators_)
+ , variant_element_name(variant_element_name_)
+ , global_variant_discriminator(global_variant_discriminator_)
+ , local_variant_discriminator(local_variant_discriminator_)
+{
+}
+
+DataTypePtr SerializationVariantElement::VariantSubcolumnCreator::create(const DB::DataTypePtr & prev) const
+{
+ return makeNullableOrLowCardinalityNullableSafe(prev);
+}
+
+SerializationPtr SerializationVariantElement::VariantSubcolumnCreator::create(const DB::SerializationPtr & prev) const
+{
+ return std::make_shared<SerializationVariantElement>(prev, variant_element_name, global_variant_discriminator);
+}
+
+ColumnPtr SerializationVariantElement::VariantSubcolumnCreator::create(const DB::ColumnPtr & prev) const
+{
+ /// Case when the original Variant column contained only one non-empty variant and no NULLs.
+ /// In this case just use this variant.
+ if (prev->size() == local_discriminators->size())
+ return makeNullableOrLowCardinalityNullableSafe(prev);
+
+ /// If this variant is empty, fill the result column with default values.
+ if (prev->empty())
+ {
+ auto res = IColumn::mutate(makeNullableOrLowCardinalityNullableSafe(prev));
+ res->insertManyDefaults(local_discriminators->size());
+ return res;
+ }
+
+ /// In the general case we should iterate through the discriminators and create a null-map for our variant.
+ NullMap null_map;
+ null_map.reserve(local_discriminators->size());
+ const auto & local_discriminators_data = assert_cast<const ColumnVariant::ColumnDiscriminators &>(*local_discriminators).getData();
+ for (auto local_discr : local_discriminators_data)
+ null_map.push_back(local_discr != local_variant_discriminator);
+
+ /// Now we can create a new column from the null-map and the variant column using IColumn::expand.
+ auto res_column = IColumn::mutate(prev);
+
+ /// Special case for LowCardinality. We want the result to be LowCardinality(Nullable),
+ /// but we don't have a good way to apply a null-mask for LowCardinality(), so, we first
+ /// convert our column to LowCardinality(Nullable()) and then use expand which will
+ /// fill rows with 0 in mask with default value (that is NULL).
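+ /// For example, for local discriminators [d, x, d] (where d is our local discriminator) the
+ /// null-map is [0, 1, 0]; expanding the variant values [a, b] with the inverted mask gives
+ /// [a, <default>, b], and attaching the null-map yields the final Nullable column (a, NULL, b).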
+ if (prev->lowCardinality()) + res_column = assert_cast(*res_column).cloneNullable(); + + res_column->expand(null_map, /*inverted = */ true); + + if (res_column->canBeInsideNullable()) + { + auto null_map_col = ColumnUInt8::create(); + null_map_col->getData() = std::move(null_map); + return ColumnNullable::create(std::move(res_column), std::move(null_map_col)); + } + + return res_column; +} + +} diff --git a/src/DataTypes/Serializations/SerializationVariantElement.h b/src/DataTypes/Serializations/SerializationVariantElement.h new file mode 100644 index 00000000000..c343c219cf3 --- /dev/null +++ b/src/DataTypes/Serializations/SerializationVariantElement.h @@ -0,0 +1,87 @@ +#pragma once + +#include +#include +#include +#include + +namespace DB +{ + +class SerializationVariant; + +/// Serialization for Variant element when we read it as a subcolumn. +class SerializationVariantElement final : public SerializationWrapper +{ +private: + /// To be able to deserialize Variant element as a subcolumn + /// we need its type name and global discriminator. + String variant_element_name; + ColumnVariant::Discriminator variant_discriminator; + +public: + SerializationVariantElement(const SerializationPtr & nested_, const String & variant_element_name_, ColumnVariant::Discriminator variant_discriminator_) + : SerializationWrapper(nested_) + , variant_element_name(variant_element_name_) + , variant_discriminator(variant_discriminator_) + { + } + + void enumerateStreams( + EnumerateStreamsSettings & settings, + const StreamCallback & callback, + const SubstreamData & data) const override; + + void serializeBinaryBulkStatePrefix( + const IColumn & column, + SerializeBinaryBulkSettings & settings, + SerializeBinaryBulkStatePtr & state) const override; + + void serializeBinaryBulkStateSuffix( + SerializeBinaryBulkSettings & settings, + SerializeBinaryBulkStatePtr & state) const override; + + void deserializeBinaryBulkStatePrefix( + DeserializeBinaryBulkSettings & settings, + DeserializeBinaryBulkStatePtr & state) const override; + + void serializeBinaryBulkWithMultipleStreams( + const IColumn & column, + size_t offset, + size_t limit, + SerializeBinaryBulkSettings & settings, + SerializeBinaryBulkStatePtr & state) const override; + + void deserializeBinaryBulkWithMultipleStreams( + ColumnPtr & column, + size_t limit, + DeserializeBinaryBulkSettings & settings, + DeserializeBinaryBulkStatePtr & state, + SubstreamsCache * cache) const override; + +private: + friend SerializationVariant; + + void addVariantToPath(SubstreamPath & path) const; + void removeVariantFromPath(SubstreamPath & path) const; + + struct VariantSubcolumnCreator : public ISubcolumnCreator + { + const ColumnPtr local_discriminators; + const String variant_element_name; + const ColumnVariant::Discriminator global_variant_discriminator; + const ColumnVariant::Discriminator local_variant_discriminator; + + VariantSubcolumnCreator( + const ColumnPtr & local_discriminators_, + const String & variant_element_name_, + const ColumnVariant::Discriminator global_variant_discriminator_, + const ColumnVariant::Discriminator local_variant_discriminator_); + + DataTypePtr create(const DataTypePtr & prev) const override; + ColumnPtr create(const ColumnPtr & prev) const override; + SerializationPtr create(const SerializationPtr & prev) const override; + }; +}; + +} diff --git a/src/DataTypes/Serializations/SerializationWrapper.cpp b/src/DataTypes/Serializations/SerializationWrapper.cpp index 18e4891ee65..bde52bb8096 100644 --- 
a/src/DataTypes/Serializations/SerializationWrapper.cpp +++ b/src/DataTypes/Serializations/SerializationWrapper.cpp @@ -96,6 +96,11 @@ void SerializationWrapper::deserializeTextEscaped(IColumn & column, ReadBuffer & nested_serialization->deserializeTextEscaped(column, istr, settings); } +bool SerializationWrapper::tryDeserializeTextEscaped(IColumn & column, ReadBuffer & istr, const FormatSettings & settings) const +{ + return nested_serialization->tryDeserializeTextEscaped(column, istr, settings); +} + void SerializationWrapper::serializeTextQuoted(const IColumn & column, size_t row_num, WriteBuffer & ostr, const FormatSettings & settings) const { nested_serialization->serializeTextQuoted(column, row_num, ostr, settings); @@ -106,6 +111,11 @@ void SerializationWrapper::deserializeTextQuoted(IColumn & column, ReadBuffer & nested_serialization->deserializeTextQuoted(column, istr, settings); } +bool SerializationWrapper::tryDeserializeTextQuoted(IColumn & column, ReadBuffer & istr, const FormatSettings & settings) const +{ + return nested_serialization->tryDeserializeTextQuoted(column, istr, settings); +} + void SerializationWrapper::serializeTextCSV(const IColumn & column, size_t row_num, WriteBuffer & ostr, const FormatSettings & settings) const { nested_serialization->serializeTextCSV(column, row_num, ostr, settings); @@ -116,6 +126,11 @@ void SerializationWrapper::deserializeTextCSV(IColumn & column, ReadBuffer & ist nested_serialization->deserializeTextCSV(column, istr, settings); } +bool SerializationWrapper::tryDeserializeTextCSV(IColumn & column, ReadBuffer & istr, const FormatSettings & settings) const +{ + return nested_serialization->tryDeserializeTextCSV(column, istr, settings); +} + void SerializationWrapper::serializeText(const IColumn & column, size_t row_num, WriteBuffer & ostr, const FormatSettings & settings) const { nested_serialization->serializeText(column, row_num, ostr, settings); @@ -126,6 +141,11 @@ void SerializationWrapper::deserializeWholeText(IColumn & column, ReadBuffer & i nested_serialization->deserializeWholeText(column, istr, settings); } +bool SerializationWrapper::tryDeserializeWholeText(IColumn & column, ReadBuffer & istr, const FormatSettings & settings) const +{ + return nested_serialization->tryDeserializeWholeText(column, istr, settings); +} + void SerializationWrapper::serializeTextJSON(const IColumn & column, size_t row_num, WriteBuffer & ostr, const FormatSettings & settings) const { nested_serialization->serializeTextJSON(column, row_num, ostr, settings); @@ -136,6 +156,11 @@ void SerializationWrapper::deserializeTextJSON(IColumn & column, ReadBuffer & is nested_serialization->deserializeTextJSON(column, istr, settings); } +bool SerializationWrapper::tryDeserializeTextJSON(IColumn & column, ReadBuffer & istr, const FormatSettings & settings) const +{ + return nested_serialization->tryDeserializeTextJSON(column, istr, settings); +} + void SerializationWrapper::serializeTextJSONPretty(const IColumn & column, size_t row_num, WriteBuffer & ostr, const FormatSettings & settings, size_t indent) const { nested_serialization->serializeTextJSONPretty(column, row_num, ostr, settings, indent); diff --git a/src/DataTypes/Serializations/SerializationWrapper.h b/src/DataTypes/Serializations/SerializationWrapper.h index 31900f93148..6c5e2046062 100644 --- a/src/DataTypes/Serializations/SerializationWrapper.h +++ b/src/DataTypes/Serializations/SerializationWrapper.h @@ -63,18 +63,23 @@ public: void serializeTextEscaped(const IColumn & column, size_t row_num, 
WriteBuffer & ostr, const FormatSettings &) const override; void deserializeTextEscaped(IColumn & column, ReadBuffer & istr, const FormatSettings &) const override; + bool tryDeserializeTextEscaped(IColumn & column, ReadBuffer & istr, const FormatSettings &) const override; void serializeTextQuoted(const IColumn & column, size_t row_num, WriteBuffer & ostr, const FormatSettings &) const override; void deserializeTextQuoted(IColumn & column, ReadBuffer & istr, const FormatSettings &) const override; + bool tryDeserializeTextQuoted(IColumn & column, ReadBuffer & istr, const FormatSettings &) const override; void serializeTextCSV(const IColumn & column, size_t row_num, WriteBuffer & ostr, const FormatSettings &) const override; void deserializeTextCSV(IColumn & column, ReadBuffer & istr, const FormatSettings &) const override; + bool tryDeserializeTextCSV(IColumn & column, ReadBuffer & istr, const FormatSettings &) const override; void serializeText(const IColumn & column, size_t row_num, WriteBuffer & ostr, const FormatSettings &) const override; void deserializeWholeText(IColumn & column, ReadBuffer & istr, const FormatSettings &) const override; + bool tryDeserializeWholeText(IColumn & column, ReadBuffer & istr, const FormatSettings &) const override; void serializeTextJSON(const IColumn & column, size_t row_num, WriteBuffer & ostr, const FormatSettings &) const override; void deserializeTextJSON(IColumn & column, ReadBuffer & istr, const FormatSettings &) const override; + bool tryDeserializeTextJSON(IColumn & column, ReadBuffer & istr, const FormatSettings &) const override; void serializeTextJSONPretty(const IColumn & column, size_t row_num, WriteBuffer & ostr, const FormatSettings & settings, size_t indent) const override; void serializeTextXML(const IColumn & column, size_t row_num, WriteBuffer & ostr, const FormatSettings & settings) const override; diff --git a/src/DataTypes/Serializations/SimpleTextSerialization.h b/src/DataTypes/Serializations/SimpleTextSerialization.h index 0247f30b30a..11f56de73d1 100644 --- a/src/DataTypes/Serializations/SimpleTextSerialization.h +++ b/src/DataTypes/Serializations/SimpleTextSerialization.h @@ -36,29 +36,67 @@ protected: deserializeText(column, istr, settings, true); } + bool tryDeserializeWholeText(IColumn & column, ReadBuffer & istr, const FormatSettings & settings) const override + { + return tryDeserializeText(column, istr, settings, true); + } + void deserializeTextEscaped(IColumn & column, ReadBuffer & istr, const FormatSettings & settings) const override { deserializeText(column, istr, settings, false); } + bool tryDeserializeTextEscaped(IColumn & column, ReadBuffer & istr, const FormatSettings & settings) const override + { + return tryDeserializeText(column, istr, settings, false); + } + void deserializeTextQuoted(IColumn & column, ReadBuffer & istr, const FormatSettings & settings) const override { deserializeText(column, istr, settings, false); } + bool tryDeserializeTextQuoted(IColumn & column, ReadBuffer & istr, const FormatSettings & settings) const override + { + return tryDeserializeText(column, istr, settings, false); + } + void deserializeTextJSON(IColumn & column, ReadBuffer & istr, const FormatSettings & settings) const override { deserializeText(column, istr, settings, false); } + bool tryDeserializeTextJSON(IColumn & column, ReadBuffer & istr, const FormatSettings & settings) const override + { + return tryDeserializeText(column, istr, settings, false); + } + void deserializeTextCSV(IColumn & column, ReadBuffer & istr, 
const FormatSettings & settings) const override { deserializeText(column, istr, settings, false); } + bool tryDeserializeTextCSV(IColumn & column, ReadBuffer & istr, const FormatSettings & settings) const override + { + return tryDeserializeText(column, istr, settings, false); + } + /// whole = true means that buffer contains only one value, so we should read until EOF. /// It's needed to check if there is garbage after parsed field. virtual void deserializeText(IColumn & column, ReadBuffer & istr, const FormatSettings &, bool whole) const = 0; + + virtual bool tryDeserializeText(IColumn & column, ReadBuffer & istr, const FormatSettings & settings, bool whole) const + { + try + { + deserializeText(column, istr, settings, whole); + return true; + } + catch (...) + { + return false; + } + } }; } diff --git a/src/DataTypes/Utils.cpp b/src/DataTypes/Utils.cpp index e58331a8bcb..2f29d57d454 100644 --- a/src/DataTypes/Utils.cpp +++ b/src/DataTypes/Utils.cpp @@ -223,6 +223,7 @@ bool canBeSafelyCasted(const DataTypePtr & from_type, const DataTypePtr & to_typ case TypeIndex::AggregateFunction: case TypeIndex::Nothing: case TypeIndex::JSONPaths: + case TypeIndex::Variant: return false; } diff --git a/src/Databases/DatabaseReplicated.cpp b/src/Databases/DatabaseReplicated.cpp index 36dd858dcf7..76f6dc25aae 100644 --- a/src/Databases/DatabaseReplicated.cpp +++ b/src/Databases/DatabaseReplicated.cpp @@ -907,6 +907,7 @@ void DatabaseReplicated::recoverLostReplica(const ZooKeeperPtr & current_zookeep query_context->setSetting("allow_experimental_nlp_functions", 1); query_context->setSetting("allow_experimental_hash_functions", 1); query_context->setSetting("allow_experimental_object_type", 1); + query_context->setSetting("allow_experimental_variant_type", 1); query_context->setSetting("allow_experimental_annoy_index", 1); query_context->setSetting("allow_experimental_usearch_index", 1); query_context->setSetting("allow_experimental_bigint_types", 1); diff --git a/src/Formats/EscapingRuleUtils.cpp b/src/Formats/EscapingRuleUtils.cpp index 9cc7cb3b89e..a2528f9f948 100644 --- a/src/Formats/EscapingRuleUtils.cpp +++ b/src/Formats/EscapingRuleUtils.cpp @@ -109,31 +109,31 @@ bool deserializeFieldByEscapingRule( { case FormatSettings::EscapingRule::Escaped: if (parse_as_nullable) - read = SerializationNullable::deserializeTextEscapedImpl(column, buf, format_settings, serialization); + read = SerializationNullable::deserializeNullAsDefaultOrNestedTextEscaped(column, buf, format_settings, serialization); else serialization->deserializeTextEscaped(column, buf, format_settings); break; case FormatSettings::EscapingRule::Quoted: if (parse_as_nullable) - read = SerializationNullable::deserializeTextQuotedImpl(column, buf, format_settings, serialization); + read = SerializationNullable::deserializeNullAsDefaultOrNestedTextQuoted(column, buf, format_settings, serialization); else serialization->deserializeTextQuoted(column, buf, format_settings); break; case FormatSettings::EscapingRule::CSV: if (parse_as_nullable) - read = SerializationNullable::deserializeTextCSVImpl(column, buf, format_settings, serialization); + read = SerializationNullable::deserializeNullAsDefaultOrNestedTextCSV(column, buf, format_settings, serialization); else serialization->deserializeTextCSV(column, buf, format_settings); break; case FormatSettings::EscapingRule::JSON: if (parse_as_nullable) - read = SerializationNullable::deserializeTextJSONImpl(column, buf, format_settings, serialization); + read = 
SerializationNullable::deserializeNullAsDefaultOrNestedTextJSON(column, buf, format_settings, serialization); else serialization->deserializeTextJSON(column, buf, format_settings); break; case FormatSettings::EscapingRule::Raw: if (parse_as_nullable) - read = SerializationNullable::deserializeTextRawImpl(column, buf, format_settings, serialization); + read = SerializationNullable::deserializeNullAsDefaultOrNestedTextRaw(column, buf, format_settings, serialization); else serialization->deserializeTextRaw(column, buf, format_settings); break; diff --git a/src/Formats/JSONUtils.cpp b/src/Formats/JSONUtils.cpp index 7ddfdb6b572..4e7795f61bd 100644 --- a/src/Formats/JSONUtils.cpp +++ b/src/Formats/JSONUtils.cpp @@ -288,14 +288,14 @@ namespace JSONUtils ReadBufferFromString buf(str); if (as_nullable) - return SerializationNullable::deserializeWholeTextImpl(column, buf, format_settings, serialization); + return SerializationNullable::deserializeNullAsDefaultOrNestedWholeText(column, buf, format_settings, serialization); serialization->deserializeWholeText(column, buf, format_settings); return true; } if (as_nullable) - return SerializationNullable::deserializeTextJSONImpl(column, in, format_settings, serialization); + return SerializationNullable::deserializeNullAsDefaultOrNestedTextJSON(column, in, format_settings, serialization); serialization->deserializeTextJSON(column, in, format_settings); return true; diff --git a/src/Formats/SchemaInferenceUtils.cpp b/src/Formats/SchemaInferenceUtils.cpp index e2ba188d015..6890e412f75 100644 --- a/src/Formats/SchemaInferenceUtils.cpp +++ b/src/Formats/SchemaInferenceUtils.cpp @@ -946,7 +946,7 @@ namespace if constexpr (is_json) ok = tryReadJSONStringInto(field, buf); else - ok = tryReadQuotedStringInto(field, buf); + ok = tryReadQuotedString(field, buf); if (!ok) return nullptr; diff --git a/src/Functions/FunctionsConversion.h b/src/Functions/FunctionsConversion.h index bef1e7b420a..f9f61ceed0d 100644 --- a/src/Functions/FunctionsConversion.h +++ b/src/Functions/FunctionsConversion.h @@ -28,6 +28,7 @@ #include #include #include +#include #include #include #include @@ -40,6 +41,7 @@ #include #include #include +#include #include #include #include @@ -4067,6 +4069,259 @@ arguments, result_type, input_rows_count); \ "Cast to Object can be performed only from flatten named Tuple, Map or String. Got: {}", from_type->getName()); } + WrapperType createVariantToVariantWrapper(const DataTypeVariant & from_variant, const DataTypeVariant & to_variant) const + { + /// We support only extension of variant type, so, only new types can be added. + /// For example: Variant(T1, T2) -> Variant(T1, T2, T3) is supported, but Variant(T1, T2) -> Variant(T1, T3) is not supported. + /// We want to extend Variant type for free without rewriting the data, but we sort data types inside Variant during type creation + /// (we do it because we want Variant(T1, T2) to be the same as Variant(T2, T1)), but after extension the order of variant types + /// (and so their discriminators) can be different. For example: Variant(T1, T3) -> Variant(T1, T2, T3). + /// To avoid full rewrite of discriminators column, ColumnVariant supports it's local order of variant columns (and so local + /// discriminators) and stores mapping global order -> local order. + /// So, to extend Variant with new types for free, we should keep old local order for old variants, append new variants and change + /// mapping global order -> local order according to the new global order. 
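+ /// For example, when Variant(T1, T3) is extended to Variant(T1, T2, T3), the mapping of old
+ /// global discriminators to new ones is {0 -> 0, 1 -> 2}, and T2 is appended as a new empty
+ /// variant with global discriminator 1, while the local order of the existing columns is kept.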
+ + /// Create map (new variant type) -> (it's global discriminator in new order). + const auto & new_variants = to_variant.getVariants(); + std::unordered_map new_variant_types_to_new_global_discriminator; + new_variant_types_to_new_global_discriminator.reserve(new_variants.size()); + for (size_t i = 0; i != new_variants.size(); ++i) + new_variant_types_to_new_global_discriminator[new_variants[i]->getName()] = i; + + /// Create set of old variant types. + const auto & old_variants = from_variant.getVariants(); + std::unordered_map old_variant_types_to_old_global_discriminator; + old_variant_types_to_old_global_discriminator.reserve(old_variants.size()); + for (size_t i = 0; i != old_variants.size(); ++i) + old_variant_types_to_old_global_discriminator[old_variants[i]->getName()] = i; + + /// Check that the set of old variants types is a subset of new variant types and collect new global discriminator for each old global discriminator. + std::unordered_map old_global_discriminator_to_new; + old_global_discriminator_to_new.reserve(old_variants.size()); + for (const auto & [old_variant_type, old_discriminator] : old_variant_types_to_old_global_discriminator) + { + auto it = new_variant_types_to_new_global_discriminator.find(old_variant_type); + if (it == new_variant_types_to_new_global_discriminator.end()) + throw Exception( + ErrorCodes::CANNOT_CONVERT_TYPE, + "Cannot convert type {} to {}. Conversion between Variant types is allowed only when new Variant type is an extension " + "of an initial one", from_variant.getName(), to_variant.getName()); + old_global_discriminator_to_new[old_discriminator] = it->second; + } + + /// Collect variant types and their global discriminators that should be added to the old Variant to get the new Variant. + std::vector> variant_types_and_discriminators_to_add; + variant_types_and_discriminators_to_add.reserve(new_variants.size() - old_variants.size()); + for (size_t i = 0; i != new_variants.size(); ++i) + { + if (!old_variant_types_to_old_global_discriminator.contains(new_variants[i]->getName())) + variant_types_and_discriminators_to_add.emplace_back(new_variants[i], i); + } + + return [old_global_discriminator_to_new, variant_types_and_discriminators_to_add] + (ColumnsWithTypeAndName & arguments, const DataTypePtr &, const ColumnNullable *, size_t) -> ColumnPtr + { + const auto & column_variant = assert_cast(*arguments.front().column.get()); + size_t num_old_variants = column_variant.getNumVariants(); + Columns new_variant_columns; + new_variant_columns.reserve(num_old_variants + variant_types_and_discriminators_to_add.size()); + std::vector new_local_to_global_discriminators; + new_local_to_global_discriminators.reserve(num_old_variants + variant_types_and_discriminators_to_add.size()); + for (size_t i = 0; i != num_old_variants; ++i) + { + new_variant_columns.push_back(column_variant.getVariantPtrByLocalDiscriminator(i)); + new_local_to_global_discriminators.push_back(old_global_discriminator_to_new.at(column_variant.globalDiscriminatorByLocal(i))); + } + + for (const auto & [new_variant_type, new_global_discriminator] : variant_types_and_discriminators_to_add) + { + new_variant_columns.push_back(new_variant_type->createColumn()); + new_local_to_global_discriminators.push_back(new_global_discriminator); + } + + return ColumnVariant::create(column_variant.getLocalDiscriminatorsPtr(), column_variant.getOffsetsPtr(), new_variant_columns, new_local_to_global_discriminators); + }; + } + + WrapperType createVariantToColumnWrapper(const DataTypeVariant & 
from_variant, const DataTypePtr & to_type) const
+    {
+        const auto & variant_types = from_variant.getVariants();
+        std::vector<WrapperType> variant_wrappers;
+        variant_wrappers.reserve(variant_types.size());
+
+        /// Create conversion wrapper for each variant.
+        for (const auto & variant_type : variant_types)
+            variant_wrappers.push_back(prepareUnpackDictionaries(variant_type, to_type));
+
+        return [variant_wrappers, variant_types, to_type]
+               (ColumnsWithTypeAndName & arguments, const DataTypePtr & result_type, const ColumnNullable *, size_t input_rows_count) -> ColumnPtr
+        {
+            const auto & column_variant = assert_cast<const ColumnVariant &>(*arguments.front().column.get());
+
+            /// First, cast each variant to the result type.
+            std::vector<ColumnPtr> casted_variant_columns;
+            casted_variant_columns.reserve(variant_types.size());
+            for (size_t i = 0; i != variant_types.size(); ++i)
+            {
+                auto variant_col = column_variant.getVariantPtrByLocalDiscriminator(i);
+                ColumnsWithTypeAndName variant = {{variant_col, variant_types[i], "" }};
+                const auto & variant_wrapper = variant_wrappers[column_variant.globalDiscriminatorByLocal(i)];
+                casted_variant_columns.push_back(variant_wrapper(variant, result_type, nullptr, variant_col->size()));
+            }
+
+            /// Second, construct resulting column from casted variant columns according to discriminators.
+            const auto & local_discriminators = column_variant.getLocalDiscriminators();
+            auto res = result_type->createColumn();
+            res->reserve(input_rows_count);
+            for (size_t i = 0; i != input_rows_count; ++i)
+            {
+                auto local_discr = local_discriminators[i];
+                if (local_discr == ColumnVariant::NULL_DISCRIMINATOR)
+                    res->insertDefault();
+                else
+                    res->insertFrom(*casted_variant_columns[local_discr], column_variant.offsetAt(i));
+            }
+
+            return res;
+        };
+    }
+
+    static ColumnPtr createVariantFromDescriptorsAndOneNonEmptyVariant(const DataTypes & variant_types, const ColumnPtr & discriminators, const ColumnPtr & variant, ColumnVariant::Discriminator variant_discr)
+    {
+        Columns variants;
+        variants.reserve(variant_types.size());
+        for (size_t i = 0; i != variant_types.size(); ++i)
+        {
+            if (i == variant_discr)
+                variants.emplace_back(variant);
+            else
+                variants.push_back(variant_types[i]->createColumn());
+        }
+
+        return ColumnVariant::create(discriminators, variants);
+    }
+
+    WrapperType createColumnToVariantWrapper(const DataTypePtr & from_type, const DataTypeVariant & to_variant) const
+    {
+        /// We allow converting NULL to Variant(...) as Variant can store NULLs.
+        if (from_type->onlyNull())
+        {
+            return [](ColumnsWithTypeAndName &, const DataTypePtr & result_type, const ColumnNullable *, size_t input_rows_count) -> ColumnPtr
+            {
+                auto result_column = result_type->createColumn();
+                result_column->insertManyDefaults(input_rows_count);
+                return result_column;
+            };
+        }
+
+        auto variant_discr_opt = to_variant.tryGetVariantDiscriminator(removeNullableOrLowCardinalityNullable(from_type));
+        if (!variant_discr_opt)
+            throw Exception(ErrorCodes::CANNOT_CONVERT_TYPE, "Cannot convert type {} to {}. 
Conversion to Variant allowed only for types from this Variant", from_type->getName(), to_variant.getName()); + + return [variant_discr = *variant_discr_opt] + (ColumnsWithTypeAndName & arguments, const DataTypePtr & result_type, const ColumnNullable *, size_t) -> ColumnPtr + { + const auto & result_variant_type = assert_cast(*result_type); + const auto & variant_types = result_variant_type.getVariants(); + if (const ColumnNullable * col_nullable = typeid_cast(arguments.front().column.get())) + { + const auto & column = col_nullable->getNestedColumnPtr(); + const auto & null_map = col_nullable->getNullMapData(); + IColumn::Filter filter; + filter.reserve(column->size()); + auto discriminators = ColumnVariant::ColumnDiscriminators::create(); + auto & discriminators_data = discriminators->getData(); + discriminators_data.reserve(column->size()); + size_t variant_size_hint = 0; + for (size_t i = 0; i != column->size(); ++i) + { + if (null_map[i]) + { + discriminators_data.push_back(ColumnVariant::NULL_DISCRIMINATOR); + filter.push_back(0); + } + else + { + discriminators_data.push_back(variant_discr); + filter.push_back(1); + ++variant_size_hint; + } + } + + ColumnPtr variant_column; + /// If there were no NULLs, just use the column. + if (variant_size_hint == column->size()) + variant_column = column; + /// Otherwise we should use filtered column. + else + variant_column = column->filter(filter, variant_size_hint); + return createVariantFromDescriptorsAndOneNonEmptyVariant(variant_types, std::move(discriminators), variant_column, variant_discr); + } + else if (isColumnLowCardinalityNullable(*arguments.front().column)) + { + const auto & column = arguments.front().column; + + /// Variant column cannot have LowCardinality(Nullable(...)) variant, as Variant column stores NULLs itself. + /// We should create a null-map, insert NULL_DISCRIMINATOR on NULL values and filter initial column. + const auto & col_lc = assert_cast(*column); + const auto & indexes = col_lc.getIndexes(); + auto null_index = col_lc.getDictionary().getNullValueIndex(); + IColumn::Filter filter; + filter.reserve(col_lc.size()); + auto discriminators = ColumnVariant::ColumnDiscriminators::create(); + auto & discriminators_data = discriminators->getData(); + discriminators_data.reserve(col_lc.size()); + size_t variant_size_hint = 0; + for (size_t i = 0; i != col_lc.size(); ++i) + { + if (indexes.getUInt(i) == null_index) + { + discriminators_data.push_back(ColumnVariant::NULL_DISCRIMINATOR); + filter.push_back(0); + } + else + { + discriminators_data.push_back(variant_discr); + filter.push_back(1); + ++variant_size_hint; + } + } + + MutableColumnPtr variant_column; + /// If there were no NULLs, we can just clone the column. + if (variant_size_hint == col_lc.size()) + variant_column = IColumn::mutate(column); + /// Otherwise we should filter column. 
+ else + variant_column = column->filter(filter, variant_size_hint)->assumeMutable(); + + assert_cast(*variant_column).nestedRemoveNullable(); + return createVariantFromDescriptorsAndOneNonEmptyVariant(variant_types, std::move(discriminators), std::move(variant_column), variant_discr); + } + else + { + const auto & column = arguments.front().column; + auto discriminators = ColumnVariant::ColumnDiscriminators::create(); + discriminators->getData().resize_fill(column->size(), variant_discr); + return createVariantFromDescriptorsAndOneNonEmptyVariant(variant_types, std::move(discriminators), column, variant_discr); + } + }; + } + + /// Wrapper for conversion to/from Variant type + WrapperType createVariantWrapper(const DataTypePtr & from_type, const DataTypePtr & to_type) const + { + if (const auto * from_variant = checkAndGetDataType(from_type.get())) + { + if (const auto * to_variant = checkAndGetDataType(to_type.get())) + return createVariantToVariantWrapper(*from_variant, *to_variant); + + return createVariantToColumnWrapper(*from_variant, to_type); + } + + return createColumnToVariantWrapper(from_type, assert_cast(*to_type)); + } + template WrapperType createEnumWrapper(const DataTypePtr & from_type, const DataTypeEnum * to_type) const { @@ -4246,6 +4501,11 @@ arguments, result_type, input_rows_count); \ WrapperType prepareUnpackDictionaries(const DataTypePtr & from_type, const DataTypePtr & to_type) const { + /// Conversion from/to Variant data type is processed in a special way. + /// We don't need to remove LowCardinality/Nullable. + if (isVariant(to_type) || isVariant(from_type)) + return createVariantWrapper(from_type, to_type); + const auto * from_low_cardinality = typeid_cast(from_type.get()); const auto * to_low_cardinality = typeid_cast(to_type.get()); const auto & from_nested = from_low_cardinality ? 
from_low_cardinality->getDictionaryType() : from_type; @@ -4253,7 +4513,7 @@ arguments, result_type, input_rows_count); \ if (from_type->onlyNull()) { - if (!to_nested->isNullable()) + if (!to_nested->isNullable() && !isVariant(to_type)) { if (cast_type == CastType::accurateOrNull) { diff --git a/src/Functions/if.cpp b/src/Functions/if.cpp index eba1733c683..b15bc5938be 100644 --- a/src/Functions/if.cpp +++ b/src/Functions/if.cpp @@ -4,6 +4,7 @@ #include #include #include +#include #include #include #include @@ -14,6 +15,7 @@ #include #include #include +#include #include #include #include @@ -22,6 +24,8 @@ #include #include #include +#include + #include @@ -215,9 +219,16 @@ class FunctionIf : public FunctionIfBase { public: static constexpr auto name = "if"; - static FunctionPtr create(ContextPtr) { return std::make_shared(); } + static FunctionPtr create(ContextPtr context) + { + return std::make_shared(context->getSettingsRef().allow_experimental_variant_type && context->getSettingsRef().use_variant_when_no_common_type_in_if); + } + + FunctionIf(bool use_variant_when_no_common_type_ = false) : FunctionIfBase(), use_variant_when_no_common_type(use_variant_when_no_common_type_) {} private: + bool use_variant_when_no_common_type = false; + template static UInt32 decimalScale(const ColumnsWithTypeAndName & arguments [[maybe_unused]]) { @@ -626,13 +637,23 @@ private: } static ColumnPtr executeGeneric( - const ColumnUInt8 * cond_col, const ColumnsWithTypeAndName & arguments, size_t input_rows_count) + const ColumnUInt8 * cond_col, const ColumnsWithTypeAndName & arguments, size_t input_rows_count, bool use_variant_when_no_common_type) { /// Convert both columns to the common type (if needed). const ColumnWithTypeAndName & arg1 = arguments[1]; const ColumnWithTypeAndName & arg2 = arguments[2]; - DataTypePtr common_type = getLeastSupertype(DataTypes{arg1.type, arg2.type}); + DataTypePtr common_type; + if (use_variant_when_no_common_type) + { + common_type = tryGetLeastSupertype(DataTypes{arg1.type, arg2.type}); + if (!common_type) + common_type = std::make_shared(DataTypes{removeNullableOrLowCardinalityNullable(arg1.type), removeNullableOrLowCardinalityNullable(arg2.type)}); + } + else + { + common_type = getLeastSupertype(DataTypes{arg1.type, arg2.type}); + } ColumnPtr col_then = castColumn(arg1, common_type); ColumnPtr col_else = castColumn(arg2, common_type); @@ -807,6 +828,10 @@ private: ColumnPtr executeForNullableThenElse(const ColumnsWithTypeAndName & arguments, const DataTypePtr & result_type, size_t input_rows_count) const { + /// If result type is Variant, we don't need to remove Nullable. 
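+        /// Illustrative example (editor's note, not part of the original patch),
+        /// assuming allow_experimental_variant_type = 1 and
+        /// use_variant_when_no_common_type_in_if = 1:
+        ///     SELECT if(number % 2, number, 'str') FROM numbers(2)
+        /// UInt64 and String have no common type, so the result becomes
+        /// Variant(String, UInt64), which encodes NULLs itself via
+        /// NULL_DISCRIMINATOR instead of a ColumnNullable wrapper.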
+ if (isVariant(result_type)) + return nullptr; + const ColumnWithTypeAndName & arg_cond = arguments[0]; const ColumnWithTypeAndName & arg_then = arguments[1]; const ColumnWithTypeAndName & arg_else = arguments[2]; @@ -912,6 +937,11 @@ private: assert_cast(*result_column).applyNullMap(assert_cast(*arg_cond.column)); return result_column; } + else if (auto * variant_column = typeid_cast(result_column.get())) + { + variant_column->applyNullMap(assert_cast(*arg_cond.column).getData()); + return result_column; + } else return ColumnNullable::create(materializeColumnIfConst(result_column), arg_cond.column); } @@ -950,6 +980,11 @@ private: assert_cast(*result_column).applyNegatedNullMap(assert_cast(*arg_cond.column)); return result_column; } + else if (auto * variant_column = typeid_cast(result_column.get())) + { + variant_column->applyNegatedNullMap(assert_cast(*arg_cond.column).getData()); + return result_column; + } else { size_t size = input_rows_count; @@ -1039,6 +1074,13 @@ public: throw Exception(ErrorCodes::ILLEGAL_TYPE_OF_ARGUMENT, "Illegal type {} of first argument (condition) of function if. " "Must be UInt8.", arguments[0]->getName()); + if (use_variant_when_no_common_type) + { + if (auto res = tryGetLeastSupertype(DataTypes{arguments[1], arguments[2]})) + return res; + return std::make_shared(DataTypes{removeNullableOrLowCardinalityNullable(arguments[1]), removeNullableOrLowCardinalityNullable(arguments[2])}); + } + return getLeastSupertype(DataTypes{arguments[1], arguments[2]}); } @@ -1122,7 +1164,7 @@ public: || (res = executeGenericArray(cond_col, arguments, result_type)) || (res = executeTuple(arguments, result_type, input_rows_count)))) { - return executeGeneric(cond_col, arguments, input_rows_count); + return executeGeneric(cond_col, arguments, input_rows_count, use_variant_when_no_common_type); } return res; diff --git a/src/Functions/isNotNull.cpp b/src/Functions/isNotNull.cpp index cbdc08c2fab..360c2fc7f9f 100644 --- a/src/Functions/isNotNull.cpp +++ b/src/Functions/isNotNull.cpp @@ -5,6 +5,7 @@ #include #include #include +#include #include @@ -45,6 +46,18 @@ public: ColumnPtr executeImpl(const ColumnsWithTypeAndName & arguments, const DataTypePtr &, size_t input_rows_count) const override { const ColumnWithTypeAndName & elem = arguments[0]; + + if (isVariant(elem.type)) + { + const auto & discriminators = checkAndGetColumn(*elem.column)->getLocalDiscriminators(); + auto res = DataTypeUInt8().createColumn(); + auto & data = typeid_cast(*res).getData(); + data.reserve(discriminators.size()); + for (auto discr : discriminators) + data.push_back(discr != ColumnVariant::NULL_DISCRIMINATOR); + return res; + } + if (elem.type->isLowCardinalityNullable()) { const auto * low_cardinality_column = checkAndGetColumn(*elem.column); diff --git a/src/Functions/isNull.cpp b/src/Functions/isNull.cpp index cdce037088d..4bf4e44f866 100644 --- a/src/Functions/isNull.cpp +++ b/src/Functions/isNull.cpp @@ -5,6 +5,7 @@ #include #include #include +#include namespace DB @@ -44,6 +45,18 @@ public: ColumnPtr executeImpl(const ColumnsWithTypeAndName & arguments, const DataTypePtr &, size_t) const override { const ColumnWithTypeAndName & elem = arguments[0]; + + if (isVariant(elem.type)) + { + const auto & discriminators = checkAndGetColumn(*elem.column)->getLocalDiscriminators(); + auto res = DataTypeUInt8().createColumn(); + auto & data = typeid_cast(*res).getData(); + data.reserve(discriminators.size()); + for (auto discr : discriminators) + data.push_back(discr == 
ColumnVariant::NULL_DISCRIMINATOR); + return res; + } + if (elem.type->isLowCardinalityNullable()) { const auto * low_cardinality_column = checkAndGetColumn(*elem.column); diff --git a/src/Functions/multiIf.cpp b/src/Functions/multiIf.cpp index cdb9ca061c3..7a2e9444b2c 100644 --- a/src/Functions/multiIf.cpp +++ b/src/Functions/multiIf.cpp @@ -9,6 +9,7 @@ #include #include #include +#include #include @@ -117,6 +118,15 @@ public: types_of_branches.emplace_back(arg); }); + if (context->getSettingsRef().allow_experimental_variant_type && context->getSettingsRef().use_variant_when_no_common_type_in_if) + { + if (auto res = tryGetLeastSupertype(types_of_branches)) + return res; + for (auto & type : types_of_branches) + type = removeNullableOrLowCardinalityNullable(type); + return std::make_shared(types_of_branches); + } + return getLeastSupertype(types_of_branches); } diff --git a/src/Functions/variantElement.cpp b/src/Functions/variantElement.cpp new file mode 100644 index 00000000000..7c63e1266e6 --- /dev/null +++ b/src/Functions/variantElement.cpp @@ -0,0 +1,238 @@ +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + + +namespace DB +{ + +namespace ErrorCodes +{ + extern const int ILLEGAL_TYPE_OF_ARGUMENT; + extern const int NUMBER_OF_ARGUMENTS_DOESNT_MATCH; +} + +namespace +{ + +/** Extract element of Variant by variant type name. + * Also the function looks through Arrays: you can get Array of Variant elements from Array of Variants. + */ +class FunctionVariantElement : public IFunction +{ +public: + static constexpr auto name = "variantElement"; + + static FunctionPtr create(ContextPtr) { return std::make_shared(); } + String getName() const override { return name; } + bool isVariadic() const override { return true; } + size_t getNumberOfArguments() const override { return 0; } + bool useDefaultImplementationForConstants() const override { return true; } + ColumnNumbers getArgumentsThatAreAlwaysConstant() const override { return {1}; } + bool useDefaultImplementationForNulls() const override { return false; } + bool useDefaultImplementationForLowCardinalityColumns() const override { return false; } + bool isSuitableForShortCircuitArgumentsExecution(const DataTypesWithConstInfo & /*arguments*/) const override { return true; } + + DataTypePtr getReturnTypeImpl(const ColumnsWithTypeAndName & arguments) const override + { + const size_t number_of_arguments = arguments.size(); + + if (number_of_arguments < 2 || number_of_arguments > 3) + throw Exception(ErrorCodes::NUMBER_OF_ARGUMENTS_DOESNT_MATCH, + "Number of arguments for function {} doesn't match: passed {}, should be 2 or 3", + getName(), number_of_arguments); + + size_t count_arrays = 0; + const IDataType * input_type = arguments[0].type.get(); + while (const DataTypeArray * array = checkAndGetDataType(input_type)) + { + input_type = array->getNestedType().get(); + ++count_arrays; + } + + const DataTypeVariant * variant_type = checkAndGetDataType(input_type); + if (!variant_type) + throw Exception(ErrorCodes::ILLEGAL_TYPE_OF_ARGUMENT, + "First argument for function {} must be Variant or Array of Variant. 
Actual {}",
+                getName(),
+                arguments[0].type->getName());
+
+        std::optional<ColumnVariant::Discriminator> variant_global_discr = getVariantGlobalDiscriminator(arguments[1].column, *variant_type, number_of_arguments);
+        if (variant_global_discr.has_value())
+        {
+            DataTypePtr return_type = makeNullableOrLowCardinalityNullableSafe(variant_type->getVariant(variant_global_discr.value()));
+
+            for (; count_arrays; --count_arrays)
+                return_type = std::make_shared<DataTypeArray>(return_type);
+
+            return return_type;
+        }
+        else
+            return arguments[2].type;
+    }
+
+    ColumnPtr executeImpl(const ColumnsWithTypeAndName & arguments, const DataTypePtr &, size_t input_rows_count) const override
+    {
+        const auto & input_arg = arguments[0];
+        const IDataType * input_type = input_arg.type.get();
+        const IColumn * input_col = input_arg.column.get();
+
+        bool input_arg_is_const = false;
+        if (typeid_cast<const ColumnConst *>(input_col))
+        {
+            input_col = assert_cast<const ColumnConst *>(input_col)->getDataColumnPtr().get();
+            input_arg_is_const = true;
+        }
+
+        Columns array_offsets;
+        while (const DataTypeArray * array_type = checkAndGetDataType<DataTypeArray>(input_type))
+        {
+            const ColumnArray * array_col = assert_cast<const ColumnArray *>(input_col);
+
+            input_type = array_type->getNestedType().get();
+            input_col = &array_col->getData();
+            array_offsets.push_back(array_col->getOffsetsPtr());
+        }
+
+        const DataTypeVariant * input_type_as_variant = checkAndGetDataType<DataTypeVariant>(input_type);
+        const ColumnVariant * input_col_as_variant = checkAndGetColumn<ColumnVariant>(input_col);
+        if (!input_type_as_variant || !input_col_as_variant)
+            throw Exception(ErrorCodes::ILLEGAL_TYPE_OF_ARGUMENT,
+                "First argument for function {} must be Variant or Array of Variant. Actual {}", getName(), input_arg.type->getName());
+
+        std::optional<ColumnVariant::Discriminator> variant_global_discr = getVariantGlobalDiscriminator(arguments[1].column, *input_type_as_variant, arguments.size());
+
+        if (!variant_global_discr.has_value())
+            return arguments[2].column;
+
+        const auto & variant_type = input_type_as_variant->getVariant(*variant_global_discr);
+        const auto & variant_column = input_col_as_variant->getVariantPtrByGlobalDiscriminator(*variant_global_discr);
+
+        /// If the Variant has only NULLs or the requested variant doesn't have any real values,
+        /// just create a column with default values and a null mask filled with 1.
+        if (input_col_as_variant->hasOnlyNulls() || variant_column->empty())
+        {
+            auto res = variant_type->createColumn();
+
+            if (variant_type->lowCardinality())
+                assert_cast<ColumnLowCardinality &>(*res).nestedToNullable();
+
+            res->insertManyDefaults(input_col_as_variant->size());
+            if (!variant_type->canBeInsideNullable())
+                return wrapInArraysAndConstIfNeeded(std::move(res), array_offsets, input_arg_is_const, input_rows_count);
+
+            auto null_map = ColumnUInt8::create();
+            auto & null_map_data = null_map->getData();
+            null_map_data.resize_fill(input_col_as_variant->size(), 1);
+            return wrapInArraysAndConstIfNeeded(ColumnNullable::create(std::move(res), std::move(null_map)), array_offsets, input_arg_is_const, input_rows_count);
+        }
+
+        /// If we extract a single non-empty column and have no NULLs, then just return this variant.
+        if (auto non_empty_local_discr = input_col_as_variant->getLocalDiscriminatorOfOneNoneEmptyVariantNoNulls())
+        {
+            /// If we were trying to extract some other variant,
+            /// it would be empty and we would have already processed this case above. 
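+            /// Illustrative walk-through (editor's note, not part of the original
+            /// patch): for Variant(String, UInt64) holding only UInt64 values and
+            /// no NULLs, variantElement(v, 'UInt64') returns the variant column
+            /// as-is (made Nullable), while variantElement(v, 'String') is fully
+            /// handled by the empty-variant branch above.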
+            chassert(input_col_as_variant->globalDiscriminatorByLocal(*non_empty_local_discr) == variant_global_discr);
+            return wrapInArraysAndConstIfNeeded(makeNullableOrLowCardinalityNullableSafe(variant_column), array_offsets, input_arg_is_const, input_rows_count);
+        }
+
+        /// In the general case we should calculate a null-mask for the requested variant
+        /// according to the discriminators column and expand
+        /// the variant column by this mask to get a full column (with default values on NULLs).
+        const auto & local_discriminators = input_col_as_variant->getLocalDiscriminators();
+        auto null_map = ColumnUInt8::create();
+        auto & null_map_data = null_map->getData();
+        null_map_data.reserve(local_discriminators.size());
+        auto variant_local_discr = input_col_as_variant->localDiscriminatorByGlobal(*variant_global_discr);
+        for (auto local_discr : local_discriminators)
+            null_map_data.push_back(local_discr != variant_local_discr);
+
+        auto expanded_variant_column = IColumn::mutate(variant_column);
+        if (variant_type->lowCardinality())
+            expanded_variant_column = assert_cast<ColumnLowCardinality &>(*expanded_variant_column).cloneNullable();
+        expanded_variant_column->expand(null_map_data, /*inverted = */ true);
+        if (variant_type->canBeInsideNullable())
+            return wrapInArraysAndConstIfNeeded(ColumnNullable::create(std::move(expanded_variant_column), std::move(null_map)), array_offsets, input_arg_is_const, input_rows_count);
+        return wrapInArraysAndConstIfNeeded(std::move(expanded_variant_column), array_offsets, input_arg_is_const, input_rows_count);
+    }
+private:
+    std::optional<ColumnVariant::Discriminator> getVariantGlobalDiscriminator(const ColumnPtr & index_column, const DataTypeVariant & variant_type, size_t argument_size) const
+    {
+        const auto * name_col = checkAndGetColumnConst<ColumnString>(index_column.get());
+        if (!name_col)
+            throw Exception(ErrorCodes::ILLEGAL_TYPE_OF_ARGUMENT,
+                "Second argument to {} with Variant argument must be a constant String",
+                getName());
+
+        String variant_element_name = name_col->getValue<String>();
+        auto variant_element_type = DataTypeFactory::instance().tryGet(variant_element_name);
+        if (variant_element_type)
+        {
+            const auto & variants = variant_type.getVariants();
+            for (size_t i = 0; i != variants.size(); ++i)
+            {
+                if (variants[i]->getName() == variant_element_type->getName())
+                    return i;
+            }
+        }
+
+        if (argument_size == 2)
+            throw Exception(ErrorCodes::ILLEGAL_TYPE_OF_ARGUMENT, "{} doesn't contain a variant with type {}", variant_type.getName(), variant_element_name);
+        return std::nullopt;
+    }
+
+    ColumnPtr wrapInArraysAndConstIfNeeded(ColumnPtr res, const Columns & array_offsets, bool input_arg_is_const, size_t input_rows_count) const
+    {
+        for (auto it = array_offsets.rbegin(); it != array_offsets.rend(); ++it)
+            res = ColumnArray::create(res, *it);
+
+        if (input_arg_is_const)
+            res = ColumnConst::create(res, input_rows_count);
+
+        return res;
+    }
+};
+
+}
+
+REGISTER_FUNCTION(VariantElement)
+{
+    factory.registerFunction<FunctionVariantElement>(FunctionDocumentation{
+        .description = R"(
+Extracts a column with specified type from a `Variant` column.
+)",
+        .syntax{"variantElement(variant, type_name[, default_value])"},
+        .arguments{{
+            {"variant", "Variant column"},
+            {"type_name", "The name of the variant type to extract"},
+            {"default_value", "The default value that will be used if the variant doesn't contain a variant with the specified type. Can be any type. 
Optional"}}}, + .examples{{{ + "Example", + R"( +CREATE TABLE test (v Variant(UInt64, String, Array(UInt64))) ENGINE = Memory; +INSERT INTO test VALUES (NULL), (42), ('Hello, World!'), ([1, 2, 3]); +SELECT v, variantElement(v, 'String'), variantElement(v, 'UInt64'), variantElement(v, 'Array(UInt64)') FROM test;)", + R"( +┌─v─────────────┬─variantElement(v, 'String')─┬─variantElement(v, 'UInt64')─┬─variantElement(v, 'Array(UInt64)')─┐ +│ ᴺᵁᴸᴸ │ ᴺᵁᴸᴸ │ ᴺᵁᴸᴸ │ [] │ +│ 42 │ ᴺᵁᴸᴸ │ 42 │ [] │ +│ Hello, World! │ Hello, World! │ ᴺᵁᴸᴸ │ [] │ +│ [1,2,3] │ ᴺᵁᴸᴸ │ ᴺᵁᴸᴸ │ [1,2,3] │ +└───────────────┴─────────────────────────────┴─────────────────────────────┴────────────────────────────────────┘ +)"}}}, + .categories{"Variant"}, + }); +} + +} diff --git a/src/IO/ReadHelpers.cpp b/src/IO/ReadHelpers.cpp index ff5743a63af..2534f248d83 100644 --- a/src/IO/ReadHelpers.cpp +++ b/src/IO/ReadHelpers.cpp @@ -619,13 +619,16 @@ void readQuotedStringInto(Vector & s, ReadBuffer & buf) readAnyQuotedStringInto<'\'', enable_sql_style_quoting>(s, buf); } -template +template bool tryReadQuotedStringInto(Vector & s, ReadBuffer & buf) { - return readAnyQuotedStringInto<'\'', false, Vector, bool>(s, buf); + return readAnyQuotedStringInto<'\'', enable_sql_style_quoting, Vector, bool>(s, buf); } -template bool tryReadQuotedStringInto(String & s, ReadBuffer & buf); +template bool tryReadQuotedStringInto(String & s, ReadBuffer & buf); +template bool tryReadQuotedStringInto(String & s, ReadBuffer & buf); +template bool tryReadQuotedStringInto>(PaddedPODArray & s, ReadBuffer & buf); +template bool tryReadQuotedStringInto>(PaddedPODArray & s, ReadBuffer & buf); template void readDoubleQuotedStringInto(Vector & s, ReadBuffer & buf) @@ -633,6 +636,16 @@ void readDoubleQuotedStringInto(Vector & s, ReadBuffer & buf) readAnyQuotedStringInto<'"', enable_sql_style_quoting>(s, buf); } +template +bool tryReadDoubleQuotedStringInto(Vector & s, ReadBuffer & buf) +{ + return readAnyQuotedStringInto<'"', enable_sql_style_quoting, Vector, bool>(s, buf); +} + +template bool tryReadDoubleQuotedStringInto(String & s, ReadBuffer & buf); +template bool tryReadDoubleQuotedStringInto(String & s, ReadBuffer & buf); + + template void readBackQuotedStringInto(Vector & s, ReadBuffer & buf) { @@ -652,6 +665,18 @@ void readQuotedStringWithSQLStyle(String & s, ReadBuffer & buf) readQuotedStringInto(s, buf); } +bool tryReadQuotedString(String & s, ReadBuffer & buf) +{ + s.clear(); + return tryReadQuotedStringInto(s, buf); +} + +bool tryReadQuotedStringWithSQLStyle(String & s, ReadBuffer & buf) +{ + s.clear(); + return tryReadQuotedStringInto(s, buf); +} + template void readQuotedStringInto(PaddedPODArray & s, ReadBuffer & buf); template void readQuotedStringInto(String & s, ReadBuffer & buf); @@ -672,6 +697,18 @@ void readDoubleQuotedStringWithSQLStyle(String & s, ReadBuffer & buf) readDoubleQuotedStringInto(s, buf); } +bool tryReadDoubleQuotedString(String & s, ReadBuffer & buf) +{ + s.clear(); + return tryReadDoubleQuotedStringInto(s, buf); +} + +bool tryReadDoubleQuotedStringWithSQLStyle(String & s, ReadBuffer & buf) +{ + s.clear(); + return tryReadDoubleQuotedStringInto(s, buf); +} + void readBackQuotedString(String & s, ReadBuffer & buf) { s.clear(); @@ -691,7 +728,7 @@ concept WithResize = requires (T value) { value.size() } -> std::integral<>; }; -template +template void readCSVStringInto(Vector & s, ReadBuffer & buf, const FormatSettings::CSV & settings) { /// Empty string @@ -754,12 +791,20 @@ void readCSVStringInto(Vector & s, ReadBuffer & buf, 
const FormatSettings::CSV & { PeekableReadBuffer * peekable_buf = dynamic_cast(&buf); if (!peekable_buf) - throw Exception(ErrorCodes::LOGICAL_ERROR, "Reading CSV string with custom delimiter is allowed only when using PeekableReadBuffer"); + { + if constexpr (allow_throw) + throw Exception(ErrorCodes::LOGICAL_ERROR, "Reading CSV string with custom delimiter is allowed only when using PeekableReadBuffer"); + return; + } while (true) { if (peekable_buf->eof()) - throw Exception(ErrorCodes::INCORRECT_DATA, "Unexpected EOF while reading CSV string, expected custom delimiter \"{}\"", custom_delimiter); + { + if constexpr (allow_throw) + throw Exception(ErrorCodes::INCORRECT_DATA, "Unexpected EOF while reading CSV string, expected custom delimiter \"{}\"", custom_delimiter); + return; + } char * next_pos = reinterpret_cast(memchr(peekable_buf->position(), custom_delimiter[0], peekable_buf->available())); if (!next_pos) @@ -948,6 +993,9 @@ String readCSVFieldWithTwoPossibleDelimiters(PeekableReadBuffer & buf, const For template void readCSVStringInto>(PaddedPODArray & s, ReadBuffer & buf, const FormatSettings::CSV & settings); template void readCSVStringInto(NullOutput & s, ReadBuffer & buf, const FormatSettings::CSV & settings); +template void readCSVStringInto(String & s, ReadBuffer & buf, const FormatSettings::CSV & settings); +template void readCSVStringInto(String & s, ReadBuffer & buf, const FormatSettings::CSV & settings); +template void readCSVStringInto, false, false>(PaddedPODArray & s, ReadBuffer & buf, const FormatSettings::CSV & settings); template @@ -1069,15 +1117,18 @@ ReturnType readJSONObjectPossiblyInvalid(Vector & s, ReadBuffer & buf) } template void readJSONObjectPossiblyInvalid(String & s, ReadBuffer & buf); +template bool readJSONObjectPossiblyInvalid(String & s, ReadBuffer & buf); template void readJSONObjectPossiblyInvalid>(PaddedPODArray & s, ReadBuffer & buf); +template bool readJSONObjectPossiblyInvalid, bool>(PaddedPODArray & s, ReadBuffer & buf); -template -void readJSONArrayInto(Vector & s, ReadBuffer & buf) +template +ReturnType readJSONArrayInto(Vector & s, ReadBuffer & buf) { - readJSONObjectOrArrayPossiblyInvalid(s, buf); + return readJSONObjectOrArrayPossiblyInvalid(s, buf); } -template void readJSONArrayInto>(PaddedPODArray & s, ReadBuffer & buf); +template void readJSONArrayInto, void>(PaddedPODArray & s, ReadBuffer & buf); +template bool readJSONArrayInto, bool>(PaddedPODArray & s, ReadBuffer & buf); template ReturnType readDateTextFallback(LocalDate & date, ReadBuffer & buf) @@ -1217,6 +1268,13 @@ ReturnType readDateTimeTextFallback(time_t & datetime, ReadBuffer & buf, const D return false; } + if constexpr (!throw_exception) + { + if (!isNumericASCII(s[0]) || !isNumericASCII(s[1]) || !isNumericASCII(s[2]) || !isNumericASCII(s[3]) + || !isNumericASCII(s[5]) || !isNumericASCII(s[6]) || !isNumericASCII(s[8]) || !isNumericASCII(s[9])) + return false; + } + UInt16 year = (s[0] - '0') * 1000 + (s[1] - '0') * 100 + (s[2] - '0') * 10 + (s[3] - '0'); UInt8 month = (s[5] - '0') * 10 + (s[6] - '0'); UInt8 day = (s[8] - '0') * 10 + (s[9] - '0'); @@ -1240,6 +1298,13 @@ ReturnType readDateTimeTextFallback(time_t & datetime, ReadBuffer & buf, const D return false; } + if constexpr (!throw_exception) + { + if (!isNumericASCII(s[0]) || !isNumericASCII(s[1]) || !isNumericASCII(s[3]) || !isNumericASCII(s[4]) + || !isNumericASCII(s[6]) || !isNumericASCII(s[7])) + return false; + } + hour = (s[0] - '0') * 10 + (s[1] - '0'); minute = (s[3] - '0') * 10 + (s[4] - '0'); second = 
(s[6] - '0') * 10 + (s[7] - '0'); @@ -1259,7 +1324,14 @@ ReturnType readDateTimeTextFallback(time_t & datetime, ReadBuffer & buf, const D { /// Not very efficient. for (const char * digit_pos = s; digit_pos < s_pos; ++digit_pos) + { + if constexpr (!throw_exception) + { + if (!isNumericASCII(*digit_pos)) + return false; + } datetime = datetime * 10 + *digit_pos - '0'; + } } datetime *= negative_multiplier; @@ -1282,14 +1354,24 @@ template bool readDateTimeTextFallback(time_t &, ReadBuffer &, cons template bool readDateTimeTextFallback(time_t &, ReadBuffer &, const DateLUTImpl &); -void skipJSONField(ReadBuffer & buf, StringRef name_of_field) +template +ReturnType skipJSONFieldImpl(ReadBuffer & buf, StringRef name_of_field) { + static constexpr bool throw_exception = std::is_same_v; + if (buf.eof()) - throw Exception(ErrorCodes::INCORRECT_DATA, "Unexpected EOF for key '{}'", name_of_field.toString()); + { + if constexpr (throw_exception) + throw Exception(ErrorCodes::INCORRECT_DATA, "Unexpected EOF for key '{}'", name_of_field.toString()); + return ReturnType(false); + } else if (*buf.position() == '"') /// skip double-quoted string { NullOutput sink; - readJSONStringInto(sink, buf); + if constexpr (throw_exception) + readJSONStringInto(sink, buf); + else if (!tryReadJSONStringInto(sink, buf)) + return ReturnType(false); } else if (isNumericASCII(*buf.position()) || *buf.position() == '-' || *buf.position() == '+' || *buf.position() == '.') /// skip number { @@ -1298,19 +1380,32 @@ void skipJSONField(ReadBuffer & buf, StringRef name_of_field) double v; if (!tryReadFloatText(v, buf)) - throw Exception(ErrorCodes::INCORRECT_DATA, "Expected a number field for key '{}'", name_of_field.toString()); + { + if constexpr (throw_exception) + throw Exception(ErrorCodes::INCORRECT_DATA, "Expected a number field for key '{}'", name_of_field.toString()); + return ReturnType(false); + } } else if (*buf.position() == 'n') /// skip null { - assertString("null", buf); + if constexpr (throw_exception) + assertString("null", buf); + else if (!checkString("null", buf)) + return ReturnType(false); } else if (*buf.position() == 't') /// skip true { - assertString("true", buf); + if constexpr (throw_exception) + assertString("true", buf); + else if (!checkString("true", buf)) + return ReturnType(false); } else if (*buf.position() == 'f') /// skip false { - assertString("false", buf); + if constexpr (throw_exception) + assertString("false", buf); + else if (!checkString("false", buf)) + return ReturnType(false); } else if (*buf.position() == '[') { @@ -1320,12 +1415,16 @@ void skipJSONField(ReadBuffer & buf, StringRef name_of_field) if (!buf.eof() && *buf.position() == ']') /// skip empty array { ++buf.position(); - return; + return ReturnType(true); } while (true) { - skipJSONField(buf, name_of_field); + if constexpr (throw_exception) + skipJSONFieldImpl(buf, name_of_field); + else if (!skipJSONFieldImpl(buf, name_of_field)) + return ReturnType(false); + skipWhitespaceIfAny(buf); if (!buf.eof() && *buf.position() == ',') @@ -1339,7 +1438,11 @@ void skipJSONField(ReadBuffer & buf, StringRef name_of_field) break; } else - throw Exception(ErrorCodes::INCORRECT_DATA, "Unexpected symbol for key '{}'", name_of_field.toString()); + { + if constexpr (throw_exception) + throw Exception(ErrorCodes::INCORRECT_DATA, "Unexpected symbol for key '{}'", name_of_field.toString()); + return ReturnType(false); + } } } else if (*buf.position() == '{') /// skip whole object @@ -1353,19 +1456,34 @@ void skipJSONField(ReadBuffer & buf, 
StringRef name_of_field) if (*buf.position() == '"') { NullOutput sink; - readJSONStringInto(sink, buf); + if constexpr (throw_exception) + readJSONStringInto(sink, buf); + else if (!tryReadJSONStringInto(sink, buf)) + return ReturnType(false); } else - throw Exception(ErrorCodes::INCORRECT_DATA, "Unexpected symbol for key '{}'", name_of_field.toString()); + { + if constexpr (throw_exception) + throw Exception(ErrorCodes::INCORRECT_DATA, "Unexpected symbol for key '{}'", name_of_field.toString()); + return ReturnType(false); + } // ':' skipWhitespaceIfAny(buf); if (buf.eof() || !(*buf.position() == ':')) - throw Exception(ErrorCodes::INCORRECT_DATA, "Unexpected symbol for key '{}'", name_of_field.toString()); + { + if constexpr (throw_exception) + throw Exception(ErrorCodes::INCORRECT_DATA, "Unexpected symbol for key '{}'", name_of_field.toString()); + return ReturnType(false); + } ++buf.position(); skipWhitespaceIfAny(buf); - skipJSONField(buf, name_of_field); + if constexpr (throw_exception) + skipJSONFieldImpl(buf, name_of_field); + else if (!skipJSONFieldImpl(buf, name_of_field)) + return ReturnType(false); + skipWhitespaceIfAny(buf); // optional ',' @@ -1377,14 +1495,32 @@ void skipJSONField(ReadBuffer & buf, StringRef name_of_field) } if (buf.eof()) - throw Exception(ErrorCodes::INCORRECT_DATA, "Unexpected EOF for key '{}'", name_of_field.toString()); + { + if constexpr (throw_exception) + throw Exception(ErrorCodes::INCORRECT_DATA, "Unexpected EOF for key '{}'", name_of_field.toString()); + return ReturnType(false); + } ++buf.position(); } else { - throw Exception(ErrorCodes::INCORRECT_DATA, "Unexpected symbol '{}' for key '{}'", - std::string(*buf.position(), 1), name_of_field.toString()); + if constexpr (throw_exception) + throw Exception(ErrorCodes::INCORRECT_DATA, "Unexpected symbol '{}' for key '{}'", + std::string(*buf.position(), 1), name_of_field.toString()); + return ReturnType(false); } + + return ReturnType(true); +} + +void skipJSONField(ReadBuffer & buf, StringRef name_of_field) +{ + skipJSONFieldImpl(buf, name_of_field); +} + +bool trySkipJSONField(ReadBuffer & buf, StringRef name_of_field) +{ + return skipJSONFieldImpl(buf, name_of_field); } @@ -1597,23 +1733,31 @@ void skipToNextRowOrEof(PeekableReadBuffer & buf, const String & row_after_delim } // Use PeekableReadBuffer to copy field to string after parsing. 
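Editor's note: skipJSONFieldImpl above is the template for every try-variant added in this patch: one body, instantiated with ReturnType = void (throwing) or bool (non-throwing), with `if constexpr` choosing between assert-style and check-style calls. A minimal standalone sketch of the idiom; the function and names below are illustrative, not from the patch:

    #include <stdexcept>
    #include <type_traits>

    template <typename ReturnType = void>
    ReturnType parseDigit(char c, int & out)
    {
        static constexpr bool throw_exception = std::is_same_v<ReturnType, void>;
        if (c < '0' || c > '9')
        {
            if constexpr (throw_exception)
                throw std::runtime_error("not a digit");
            return ReturnType(false); /// functional cast: no-op for void, false for bool
        }
        out = c - '0';
        return ReturnType(true);
    }

    /// parseDigit('x', d) throws; parseDigit<bool>('x', d) returns false.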
-template -static void readParsedValueInto(Vector & s, ReadBuffer & buf, ParseFunc parse_func) +template +static ReturnType readParsedValueInto(Vector & s, ReadBuffer & buf, ParseFunc parse_func) { PeekableReadBuffer peekable_buf(buf); peekable_buf.setCheckpoint(); - parse_func(peekable_buf); + if constexpr (std::is_same_v) + parse_func(peekable_buf); + else if (!parse_func(peekable_buf)) + return ReturnType(false); peekable_buf.makeContinuousMemoryFromCheckpointToPos(); auto * end = peekable_buf.position(); peekable_buf.rollbackToCheckpoint(); s.append(peekable_buf.position(), end); peekable_buf.position() = end; + return ReturnType(true); } -template -static void readQuotedStringFieldInto(Vector & s, ReadBuffer & buf) +template +static ReturnType readQuotedStringFieldInto(Vector & s, ReadBuffer & buf) { - assertChar('\'', buf); + if constexpr (std::is_same_v) + assertChar('\'', buf); + else if (!checkChar('\'', buf)) + return ReturnType(false); + s.push_back('\''); while (!buf.eof()) { @@ -1641,16 +1785,23 @@ static void readQuotedStringFieldInto(Vector & s, ReadBuffer & buf) } if (buf.eof()) - return; + return ReturnType(false); ++buf.position(); s.push_back('\''); + return ReturnType(true); } -template -static void readQuotedFieldInBracketsInto(Vector & s, ReadBuffer & buf) +template +static ReturnType readQuotedFieldInBracketsInto(Vector & s, ReadBuffer & buf) { - assertChar(opening_bracket, buf); + static constexpr bool throw_exception = std::is_same_v; + + if constexpr (throw_exception) + assertChar(opening_bracket, buf); + else if (!checkChar(opening_bracket, buf)) + return ReturnType(false); + s.push_back(opening_bracket); size_t balance = 1; @@ -1666,7 +1817,10 @@ static void readQuotedFieldInBracketsInto(Vector & s, ReadBuffer & buf) if (*buf.position() == '\'') { - readQuotedStringFieldInto(s, buf); + if constexpr (throw_exception) + readQuotedStringFieldInto(s, buf); + else if (!readQuotedStringFieldInto(s, buf)) + return ReturnType(false); } else if (*buf.position() == opening_bracket) { @@ -1681,13 +1835,20 @@ static void readQuotedFieldInBracketsInto(Vector & s, ReadBuffer & buf) ++buf.position(); } } + + if (balance) + return ReturnType(false); + + return ReturnType(true); } -template -void readQuotedFieldInto(Vector & s, ReadBuffer & buf) +template +ReturnType readQuotedFieldInto(Vector & s, ReadBuffer & buf) { + static constexpr bool throw_exception = std::is_same_v; + if (buf.eof()) - return; + return ReturnType(false); /// Possible values in 'Quoted' field: /// - Strings: '...' @@ -1699,35 +1860,47 @@ void readQuotedFieldInto(Vector & s, ReadBuffer & buf) /// - Number: integer, float, decimal. 
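+    /// Illustrative inputs and results (editor's note, not part of the original
+    /// patch):
+    ///     'abc'          -> "'abc'" (quotes preserved)
+    ///     [1, 'x', NULL] -> copied verbatim once brackets are balanced
+    ///     nan / null     -> normalized to "NaN" / "NULL"
+    ///     -12.5e3        -> validated as a float, then copied as-is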
if (*buf.position() == '\'') - readQuotedStringFieldInto(s, buf); + return readQuotedStringFieldInto(s, buf); else if (*buf.position() == '[') - readQuotedFieldInBracketsInto<'[', ']'>(s, buf); + return readQuotedFieldInBracketsInto(s, buf); else if (*buf.position() == '(') - readQuotedFieldInBracketsInto<'(', ')'>(s, buf); + return readQuotedFieldInBracketsInto(s, buf); else if (*buf.position() == '{') - readQuotedFieldInBracketsInto<'{', '}'>(s, buf); + return readQuotedFieldInBracketsInto(s, buf); else if (checkCharCaseInsensitive('n', buf)) { /// NULL or NaN if (checkCharCaseInsensitive('u', buf)) { - assertStringCaseInsensitive("ll", buf); + if constexpr (throw_exception) + assertStringCaseInsensitive("ll", buf); + else if (!checkStringCaseInsensitive("ll", buf)) + return ReturnType(false); s.append("NULL"); } else { - assertStringCaseInsensitive("an", buf); + if constexpr (throw_exception) + assertStringCaseInsensitive("an", buf); + else if (!checkStringCaseInsensitive("an", buf)) + return ReturnType(false); s.append("NaN"); } } else if (checkCharCaseInsensitive('t', buf)) { - assertStringCaseInsensitive("rue", buf); + if constexpr (throw_exception) + assertStringCaseInsensitive("rue", buf); + else if (!checkStringCaseInsensitive("rue", buf)) + return ReturnType(false); s.append("true"); } else if (checkCharCaseInsensitive('f', buf)) { - assertStringCaseInsensitive("alse", buf); + if constexpr (throw_exception) + assertStringCaseInsensitive("alse", buf); + else if (!checkStringCaseInsensitive("alse", buf)) + return ReturnType(false); s.append("false"); } else @@ -1736,13 +1909,19 @@ void readQuotedFieldInto(Vector & s, ReadBuffer & buf) auto parse_func = [](ReadBuffer & in) { Float64 tmp; - readFloatText(tmp, in); + if constexpr (throw_exception) + readFloatText(tmp, in); + else + return tryReadFloatText(tmp, in); }; - readParsedValueInto(s, buf, parse_func); + + return readParsedValueInto(s, buf, parse_func); } + + return ReturnType(true); } -template void readQuotedFieldInto(NullOutput & s, ReadBuffer & buf); +template void readQuotedFieldInto(NullOutput & s, ReadBuffer & buf); void readQuotedField(String & s, ReadBuffer & buf) { @@ -1750,11 +1929,24 @@ void readQuotedField(String & s, ReadBuffer & buf) readQuotedFieldInto(s, buf); } +bool tryReadQuotedField(String & s, ReadBuffer & buf) +{ + s.clear(); + return readQuotedFieldInto(s, buf); +} + void readJSONField(String & s, ReadBuffer & buf) { s.clear(); auto parse_func = [](ReadBuffer & in) { skipJSONField(in, "json_field"); }; - readParsedValueInto(s, buf, parse_func); + readParsedValueInto(s, buf, parse_func); +} + +bool tryReadJSONField(String & s, ReadBuffer & buf) +{ + s.clear(); + auto parse_func = [](ReadBuffer & in) { return trySkipJSONField(in, "json_field"); }; + return readParsedValueInto(s, buf, parse_func); } void readTSVField(String & s, ReadBuffer & buf) diff --git a/src/IO/ReadHelpers.h b/src/IO/ReadHelpers.h index 17f3d3d4151..ad62a3deaca 100644 --- a/src/IO/ReadHelpers.h +++ b/src/IO/ReadHelpers.h @@ -257,26 +257,43 @@ inline void readBoolText(bool & x, ReadBuffer & buf) x = tmp != '0'; } -inline void readBoolTextWord(bool & x, ReadBuffer & buf, bool support_upper_case = false) +template +inline ReturnType readBoolTextWord(bool & x, ReadBuffer & buf, bool support_upper_case = false) { + static constexpr bool throw_exception = std::is_same_v; + if (buf.eof()) [[unlikely]] - throwReadAfterEOF(); + { + if constexpr (throw_exception) + throwReadAfterEOF(); + else + return ReturnType(false); + } switch 
(*buf.position()) { case 't': - assertString("true", buf); + if constexpr (throw_exception) + assertString("true", buf); + else if (!checkString("true", buf)) + return ReturnType(false); x = true; break; case 'f': - assertString("false", buf); + if constexpr (throw_exception) + assertString("false", buf); + else if (!checkString("false", buf)) + return ReturnType(false); x = false; break; case 'T': { if (support_upper_case) { - assertString("TRUE", buf); + if constexpr (throw_exception) + assertString("TRUE", buf); + else if (!checkString("TRUE", buf)) + return ReturnType(false); x = true; break; } @@ -287,7 +304,10 @@ inline void readBoolTextWord(bool & x, ReadBuffer & buf, bool support_upper_case { if (support_upper_case) { - assertString("FALSE", buf); + if constexpr (throw_exception) + assertString("FALSE", buf); + else if (!checkString("FALSE", buf)) + return ReturnType(false); x = false; break; } @@ -295,8 +315,15 @@ inline void readBoolTextWord(bool & x, ReadBuffer & buf, bool support_upper_case [[fallthrough]]; } default: - throw ParsingException(ErrorCodes::CANNOT_PARSE_BOOL, "Unexpected Bool value"); + { + if constexpr (throw_exception) + throw ParsingException(ErrorCodes::CANNOT_PARSE_BOOL, "Unexpected Bool value"); + else + return ReturnType(false); + } } + + return ReturnType(true); } enum class ReadIntTextCheckOverflow @@ -468,7 +495,10 @@ void readIntText(T & x, ReadBuffer & buf) template bool tryReadIntText(T & x, ReadBuffer & buf) { - return readIntTextImpl(x, buf); + if constexpr (is_decimal) + return tryReadIntText(x.value, buf); + else + return readIntTextImpl(x, buf); } @@ -477,16 +507,18 @@ bool tryReadIntText(T & x, ReadBuffer & buf) * - for numbers starting with zero, parsed only zero; * - symbol '+' before number is not supported; */ -template -void readIntTextUnsafe(T & x, ReadBuffer & buf) +template +ReturnType readIntTextUnsafe(T & x, ReadBuffer & buf) { + static constexpr bool throw_exception = std::is_same_v; bool negative = false; make_unsigned_t res = 0; auto on_error = [] { - if (throw_on_error) + if constexpr (throw_exception) throwReadAfterEOF(); + return ReturnType(false); }; if (buf.eof()) [[unlikely]] @@ -504,7 +536,7 @@ void readIntTextUnsafe(T & x, ReadBuffer & buf) { ++buf.position(); x = 0; - return; + return ReturnType(true); } while (!buf.eof()) @@ -523,12 +555,13 @@ void readIntTextUnsafe(T & x, ReadBuffer & buf) /// See note about undefined behaviour above. x = is_signed_v && negative ? 
-res : res; + return ReturnType(true); } template -void tryReadIntTextUnsafe(T & x, ReadBuffer & buf) +bool tryReadIntTextUnsafe(T & x, ReadBuffer & buf) { - return readIntTextUnsafe(x, buf); + return readIntTextUnsafe(x, buf); } @@ -550,9 +583,15 @@ void readEscapedString(String & s, ReadBuffer & buf); void readQuotedString(String & s, ReadBuffer & buf); void readQuotedStringWithSQLStyle(String & s, ReadBuffer & buf); +bool tryReadQuotedString(String & s, ReadBuffer & buf); +bool tryReadQuotedStringWithSQLStyle(String & s, ReadBuffer & buf); + void readDoubleQuotedString(String & s, ReadBuffer & buf); void readDoubleQuotedStringWithSQLStyle(String & s, ReadBuffer & buf); +bool tryReadDoubleQuotedString(String & s, ReadBuffer & buf); +bool tryReadDoubleQuotedStringWithSQLStyle(String & s, ReadBuffer & buf); + void readJSONString(String & s, ReadBuffer & buf); void readBackQuotedString(String & s, ReadBuffer & buf); @@ -615,7 +654,7 @@ void readBackQuotedStringInto(Vector & s, ReadBuffer & buf); template void readStringUntilEOFInto(Vector & s, ReadBuffer & buf); -template +template void readCSVStringInto(Vector & s, ReadBuffer & buf, const FormatSettings::CSV & settings); /// ReturnType is either bool or void. If bool, the function will return false instead of throwing an exception. @@ -628,7 +667,7 @@ bool tryReadJSONStringInto(Vector & s, ReadBuffer & buf) return readJSONStringInto(s, buf); } -template +template bool tryReadQuotedStringInto(Vector & s, ReadBuffer & buf); /// Reads chunk of data between {} in that way, @@ -637,8 +676,8 @@ bool tryReadQuotedStringInto(Vector & s, ReadBuffer & buf); template ReturnType readJSONObjectPossiblyInvalid(Vector & s, ReadBuffer & buf); -template -void readJSONArrayInto(Vector & s, ReadBuffer & buf); +template +ReturnType readJSONArrayInto(Vector & s, ReadBuffer & buf); template void readStringUntilWhitespaceInto(Vector & s, ReadBuffer & buf); @@ -962,6 +1001,13 @@ inline ReturnType readDateTimeTextImpl(time_t & datetime, ReadBuffer & buf, cons { if (s[4] < '0' || s[4] > '9') { + if constexpr (!throw_exception) + { + if (!isNumericASCII(s[0]) || !isNumericASCII(s[1]) || !isNumericASCII(s[2]) || !isNumericASCII(s[3]) + || !isNumericASCII(s[5]) || !isNumericASCII(s[6]) || !isNumericASCII(s[8]) || !isNumericASCII(s[9])) + return ReturnType(false); + } + UInt16 year = (s[0] - '0') * 1000 + (s[1] - '0') * 100 + (s[2] - '0') * 10 + (s[3] - '0'); UInt8 month = (s[5] - '0') * 10 + (s[6] - '0'); UInt8 day = (s[8] - '0') * 10 + (s[9] - '0'); @@ -974,6 +1020,13 @@ inline ReturnType readDateTimeTextImpl(time_t & datetime, ReadBuffer & buf, cons bool dt_long = (s[10] == ' ' || s[10] == 'T'); if (dt_long) { + if constexpr (!throw_exception) + { + if (!isNumericASCII(s[11]) || !isNumericASCII(s[12]) || !isNumericASCII(s[14]) || !isNumericASCII(s[15]) + || !isNumericASCII(s[17]) || !isNumericASCII(s[18])) + return ReturnType(false); + } + hour = (s[11] - '0') * 10 + (s[12] - '0'); minute = (s[14] - '0') * 10 + (s[15] - '0'); second = (s[17] - '0') * 10 + (s[18] - '0'); @@ -1311,6 +1364,11 @@ inline bool tryReadText(is_integer auto & x, ReadBuffer & buf) return tryReadIntText(x, buf); } +inline bool tryReadText(is_floating_point auto & x, ReadBuffer & buf) +{ + return tryReadFloatText(x, buf); +} + inline bool tryReadText(UUID & x, ReadBuffer & buf) { return tryReadUUIDText(x, buf); } inline bool tryReadText(IPv4 & x, ReadBuffer & buf) { return tryReadIPv4Text(x, buf); } inline bool tryReadText(IPv6 & x, ReadBuffer & buf) { return tryReadIPv6Text(x, buf); } @@ 
-1320,9 +1378,20 @@ inline void readText(is_floating_point auto & x, ReadBuffer & buf) { readFloatTe inline void readText(String & x, ReadBuffer & buf) { readEscapedString(x, buf); } inline void readText(DayNum & x, ReadBuffer & buf, const DateLUTImpl & time_zone = DateLUT::instance()) { readDateText(x, buf, time_zone); } +inline bool tryReadText(DayNum & x, ReadBuffer & buf, const DateLUTImpl & time_zone = DateLUT::instance()) { return tryReadDateText(x, buf, time_zone); } inline void readText(LocalDate & x, ReadBuffer & buf) { readDateText(x, buf); } +inline bool tryReadText(LocalDate & x, ReadBuffer & buf) { return tryReadDateText(x, buf); } inline void readText(LocalDateTime & x, ReadBuffer & buf) { readDateTimeText(x, buf); } +inline bool tryReadText(LocalDateTime & x, ReadBuffer & buf) +{ + time_t time; + if (!tryReadDateTimeText(time, buf)) + return false; + x = LocalDateTime(time, DateLUT::instance()); + return true; +} + inline void readText(UUID & x, ReadBuffer & buf) { readUUIDText(x, buf); } inline void readText(IPv4 & x, ReadBuffer & buf) { readIPv4Text(x, buf); } inline void readText(IPv6 & x, ReadBuffer & buf) { readIPv6Text(x, buf); } @@ -1400,39 +1469,71 @@ inline void readDoubleQuoted(LocalDateTime & x, ReadBuffer & buf) } /// CSV for numbers: quotes are optional, no special escaping rules. -template -inline void readCSVSimple(T & x, ReadBuffer & buf) +template +inline ReturnType readCSVSimple(T & x, ReadBuffer & buf) { + static constexpr bool throw_exception = std::is_same_v; + if (buf.eof()) [[unlikely]] - throwReadAfterEOF(); + { + if constexpr (throw_exception) + throwReadAfterEOF(); + return ReturnType(false); + } char maybe_quote = *buf.position(); if (maybe_quote == '\'' || maybe_quote == '\"') ++buf.position(); - readText(x, buf); + if constexpr (throw_exception) + readText(x, buf); + else if (!tryReadText(x, buf)) + return ReturnType(false); if (maybe_quote == '\'' || maybe_quote == '\"') - assertChar(maybe_quote, buf); + { + if constexpr (throw_exception) + assertChar(maybe_quote, buf); + else if (!checkChar(maybe_quote, buf)) + return ReturnType(false); + } + + return ReturnType(true); } // standalone overload for dates: to avoid instantiating DateLUTs while parsing other types -template -inline void readCSVSimple(T & x, ReadBuffer & buf, const DateLUTImpl & time_zone) +template +inline ReturnType readCSVSimple(T & x, ReadBuffer & buf, const DateLUTImpl & time_zone) { + static constexpr bool throw_exception = std::is_same_v; + if (buf.eof()) [[unlikely]] - throwReadAfterEOF(); + { + if constexpr (throw_exception) + throwReadAfterEOF(); + return ReturnType(false); + } char maybe_quote = *buf.position(); if (maybe_quote == '\'' || maybe_quote == '\"') ++buf.position(); - readText(x, buf, time_zone); + if constexpr (throw_exception) + readText(x, buf, time_zone); + else if (!tryReadText(x, buf, time_zone)) + return ReturnType(false); if (maybe_quote == '\'' || maybe_quote == '\"') - assertChar(maybe_quote, buf); + { + if constexpr (throw_exception) + assertChar(maybe_quote, buf); + else if (!checkChar(maybe_quote, buf)) + return ReturnType(false); + } + + return ReturnType(true); } template @@ -1442,18 +1543,52 @@ inline void readCSV(T & x, ReadBuffer & buf) readCSVSimple(x, buf); } +template +requires is_arithmetic_v +inline bool tryReadCSV(T & x, ReadBuffer & buf) +{ + return readCSVSimple(x, buf); +} + inline void readCSV(String & x, ReadBuffer & buf, const FormatSettings::CSV & settings) { readCSVString(x, buf, settings); } +inline bool tryReadCSV(String & x, 
ReadBuffer & buf, const FormatSettings::CSV & settings) +{ + x.clear(); + readCSVStringInto(x, buf, settings); + return true; +} + inline void readCSV(LocalDate & x, ReadBuffer & buf) { readCSVSimple(x, buf); } +inline bool tryReadCSV(LocalDate & x, ReadBuffer & buf) { return readCSVSimple(x, buf); } + inline void readCSV(DayNum & x, ReadBuffer & buf) { readCSVSimple(x, buf); } +inline bool tryReadCSV(DayNum & x, ReadBuffer & buf) { return readCSVSimple(x, buf); } inline void readCSV(DayNum & x, ReadBuffer & buf, const DateLUTImpl & time_zone) { readCSVSimple(x, buf, time_zone); } +inline bool tryReadCSV(DayNum & x, ReadBuffer & buf, const DateLUTImpl & time_zone) { return readCSVSimple(x, buf, time_zone); } + inline void readCSV(LocalDateTime & x, ReadBuffer & buf) { readCSVSimple(x, buf); } +inline bool tryReadCSV(LocalDateTime & x, ReadBuffer & buf) { return readCSVSimple(x, buf); } + inline void readCSV(UUID & x, ReadBuffer & buf) { readCSVSimple(x, buf); } +inline bool tryReadCSV(UUID & x, ReadBuffer & buf) { return readCSVSimple(x, buf); } + inline void readCSV(IPv4 & x, ReadBuffer & buf) { readCSVSimple(x, buf); } +inline bool tryReadCSV(IPv4 & x, ReadBuffer & buf) { return readCSVSimple(x, buf); } + inline void readCSV(IPv6 & x, ReadBuffer & buf) { readCSVSimple(x, buf); } +inline bool tryReadCSV(IPv6 & x, ReadBuffer & buf) { return readCSVSimple(x, buf); } + inline void readCSV(UInt128 & x, ReadBuffer & buf) { readCSVSimple(x, buf); } +inline bool tryReadCSV(UInt128 & x, ReadBuffer & buf) { return readCSVSimple(x, buf); } + inline void readCSV(Int128 & x, ReadBuffer & buf) { readCSVSimple(x, buf); } +inline bool tryReadCSV(Int128 & x, ReadBuffer & buf) { return readCSVSimple(x, buf); } + inline void readCSV(UInt256 & x, ReadBuffer & buf) { readCSVSimple(x, buf); } +inline bool tryReadCSV(UInt256 & x, ReadBuffer & buf) { return readCSVSimple(x, buf); } + inline void readCSV(Int256 & x, ReadBuffer & buf) { readCSVSimple(x, buf); } +inline bool tryReadCSV(Int256 & x, ReadBuffer & buf) { return readCSVSimple(x, buf); } template void readBinary(std::vector & x, ReadBuffer & buf) @@ -1535,6 +1670,7 @@ inline void skipWhitespaceIfAny(ReadBuffer & buf, bool one_line = false) /// Skips json value. void skipJSONField(ReadBuffer & buf, StringRef name_of_field); +bool trySkipJSONField(ReadBuffer & buf, StringRef name_of_field); /** Read serialized exception. 
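Editor's note: the readCSVSimple overloads above implement the CSV convention that surrounding quotes are optional but, when present, must be closed by the same quote character. A standalone sketch of that convention under the same rules; the function below is illustrative and not part of the patch:

    #include <cctype>
    #include <string>

    /// Accepts 42, '42' and "42"; rejects mismatched or unterminated quotes.
    bool tryParseCSVUInt(const std::string & s, unsigned & out)
    {
        size_t pos = 0;
        if (s.empty())
            return false;
        char maybe_quote = s[pos];
        if (maybe_quote == '\'' || maybe_quote == '"')
            ++pos;
        size_t start = pos;
        unsigned value = 0;
        while (pos < s.size() && std::isdigit(static_cast<unsigned char>(s[pos])))
            value = value * 10 + static_cast<unsigned>(s[pos++] - '0');
        if (pos == start)
            return false; /// no digits after the optional quote
        if (maybe_quote == '\'' || maybe_quote == '"')
        {
            if (pos == s.size() || s[pos] != maybe_quote)
                return false; /// closing quote must match the opening one
            ++pos;
        }
        out = value;
        return pos == s.size();
    }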
@@ -1749,12 +1885,14 @@ struct PcgDeserializer } }; -template -void readQuotedFieldInto(Vector & s, ReadBuffer & buf); +template +ReturnType readQuotedFieldInto(Vector & s, ReadBuffer & buf); void readQuotedField(String & s, ReadBuffer & buf); +bool tryReadQuotedField(String & s, ReadBuffer & buf); void readJSONField(String & s, ReadBuffer & buf); +bool tryReadJSONField(String & s, ReadBuffer & buf); void readTSVField(String & s, ReadBuffer & buf); diff --git a/src/IO/readDecimalText.h b/src/IO/readDecimalText.h index 9fd9c439b87..81bde87f1f1 100644 --- a/src/IO/readDecimalText.h +++ b/src/IO/readDecimalText.h @@ -224,4 +224,24 @@ inline void readCSVDecimalText(ReadBuffer & buf, T & x, uint32_t precision, uint assertChar(maybe_quote, buf); } +template +inline bool tryReadCSVDecimalText(ReadBuffer & buf, T & x, uint32_t precision, uint32_t & scale) +{ + if (buf.eof()) + return false; + + char maybe_quote = *buf.position(); + + if (maybe_quote == '\'' || maybe_quote == '\"') + ++buf.position(); + + if (!tryReadDecimalText(buf, x, precision, scale)) + return false; + + if ((maybe_quote == '\'' || maybe_quote == '\"') && !checkChar(maybe_quote, buf)) + return false; + + return true; +} + } diff --git a/src/Interpreters/InterpreterCreateQuery.cpp b/src/Interpreters/InterpreterCreateQuery.cpp index bf07f4ed3ee..51f767afc04 100644 --- a/src/Interpreters/InterpreterCreateQuery.cpp +++ b/src/Interpreters/InterpreterCreateQuery.cpp @@ -939,6 +939,20 @@ void InterpreterCreateQuery::validateTableStructure(const ASTCreateQuery & creat } } } + if (!create.attach && !settings.allow_experimental_variant_type) + { + for (const auto & [name, type] : properties.columns.getAllPhysical()) + { + if (isVariant(type)) + { + throw Exception(ErrorCodes::ILLEGAL_COLUMN, + "Cannot create table with column '{}' which type is '{}' " + "because experimental Variant type is not allowed. " + "Set setting allow_experimental_variant_type = 1 in order to allow it", + name, type->getName()); + } + } + } } namespace diff --git a/src/Interpreters/InterpreterInsertQuery.cpp b/src/Interpreters/InterpreterInsertQuery.cpp index 283289f0dfc..32b24cba940 100644 --- a/src/Interpreters/InterpreterInsertQuery.cpp +++ b/src/Interpreters/InterpreterInsertQuery.cpp @@ -532,7 +532,7 @@ BlockIO InterpreterInsertQuery::execute() { /// Change query sample block columns to Nullable to allow inserting nullable columns, where NULL values will be substituted with /// default column values (in AddingDefaultsTransform), so all values will be cast correctly. 
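+            /// (Editor's note, not part of the original patch: Variant target
+            /// columns are excluded below because Variant represents NULL itself
+            /// via NULL_DISCRIMINATOR, so the query column does not need to be
+            /// wrapped into Nullable first.)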
-                if (isNullableOrLowCardinalityNullable(input_columns[col_idx].type) && !isNullableOrLowCardinalityNullable(query_columns[col_idx].type) && output_columns.has(query_columns[col_idx].name))
+                if (isNullableOrLowCardinalityNullable(input_columns[col_idx].type) && !isNullableOrLowCardinalityNullable(query_columns[col_idx].type) && !isVariant(query_columns[col_idx].type) && output_columns.has(query_columns[col_idx].name))
                     query_sample_block.setColumn(col_idx, ColumnWithTypeAndName(makeNullableOrLowCardinalityNullable(query_columns[col_idx].column), makeNullableOrLowCardinalityNullable(query_columns[col_idx].type), query_columns[col_idx].name));
             }
         }
diff --git a/src/Interpreters/inplaceBlockConversions.cpp b/src/Interpreters/inplaceBlockConversions.cpp
index c7a1cab8bac..fd8f5b154c4 100644
--- a/src/Interpreters/inplaceBlockConversions.cpp
+++ b/src/Interpreters/inplaceBlockConversions.cpp
@@ -237,17 +237,36 @@ static std::unordered_map<String, ColumnPtr> collectOffsetsColumns(
             {
                 auto & offsets_column = offsets_columns[stream_name];
                 if (!offsets_column)
+                {
                     offsets_column = current_offsets_column;
+                }
+                else
+                {
+                    /// If we are inside a Variant element, the offsets may
+                    /// differ: when a Variant element is read as a subcolumn,
+                    /// the column is expanded according to the discriminators,
+                    /// so its offsets column can change. In that case select
+                    /// the original offsets column of this stream, which is
+                    /// the smaller one.
+                    bool inside_variant_element = false;
+                    for (const auto & elem : subpath)
+                        inside_variant_element |= elem.type == ISerialization::Substream::VariantElement;

-    #ifndef NDEBUG
-                const auto & offsets_data = assert_cast<const ColumnUInt64 &>(*offsets_column).getData();
-                const auto & current_offsets_data = assert_cast<const ColumnUInt64 &>(*current_offsets_column).getData();
+                    if (offsets_column->size() != current_offsets_column->size() && inside_variant_element)
+                        offsets_column = offsets_column->size() < current_offsets_column->size() ? offsets_column : current_offsets_column;
+#ifndef NDEBUG
+                    else
+                    {
+                        const auto & offsets_data = assert_cast<const ColumnUInt64 &>(*offsets_column).getData();
+                        const auto & current_offsets_data = assert_cast<const ColumnUInt64 &>(*current_offsets_column).getData();

-                if (offsets_data != current_offsets_data)
-                    throw Exception(ErrorCodes::LOGICAL_ERROR,
-                        "Found non-equal columns with offsets (sizes: {} and {}) for stream {}",
-                        offsets_data.size(), current_offsets_data.size(), stream_name);
-    #endif
+                        if (offsets_data != current_offsets_data)
+                            throw Exception(ErrorCodes::LOGICAL_ERROR,
+                                "Found non-equal columns with offsets (sizes: {} and {}) for stream {}",
+                                offsets_data.size(), current_offsets_data.size(), stream_name);
+                    }
+#endif
+                }
             }
         }, available_column->type, res_columns[i]);
     }
diff --git a/src/Interpreters/parseColumnsListForTableFunction.cpp b/src/Interpreters/parseColumnsListForTableFunction.cpp
index 87f76f7f824..551a883d093 100644
--- a/src/Interpreters/parseColumnsListForTableFunction.cpp
+++ b/src/Interpreters/parseColumnsListForTableFunction.cpp
@@ -60,6 +60,17 @@ void validateDataType(const DataTypePtr & type, const DataTypeValidationSettings
                 MAX_FIXEDSTRING_SIZE_WITHOUT_SUSPICIOUS);
         }
     }
+
+    if (!settings.allow_experimental_variant_type)
+    {
+        if (isVariant(type))
+        {
+            throw Exception(
+                ErrorCodes::ILLEGAL_COLUMN,
+                "Cannot create column with type '{}' because experimental Variant type is not allowed. "
" + "Set setting allow_experimental_variant_type = 1 in order to allow it", type->getName()); + } + } } ColumnsDescription parseColumnsListFromString(const std::string & structure, const ContextPtr & context) diff --git a/src/Interpreters/parseColumnsListForTableFunction.h b/src/Interpreters/parseColumnsListForTableFunction.h index ef1bbe5498e..1fbbfa4b12f 100644 --- a/src/Interpreters/parseColumnsListForTableFunction.h +++ b/src/Interpreters/parseColumnsListForTableFunction.h @@ -18,12 +18,14 @@ struct DataTypeValidationSettings : allow_suspicious_low_cardinality_types(settings.allow_suspicious_low_cardinality_types) , allow_experimental_object_type(settings.allow_experimental_object_type) , allow_suspicious_fixed_string_types(settings.allow_suspicious_fixed_string_types) + , allow_experimental_variant_type(settings.allow_experimental_variant_type) { } bool allow_suspicious_low_cardinality_types = true; bool allow_experimental_object_type = true; bool allow_suspicious_fixed_string_types = true; + bool allow_experimental_variant_type = true; }; void validateDataType(const DataTypePtr & type, const DataTypeValidationSettings & settings); diff --git a/src/Parsers/ExpressionElementParsers.cpp b/src/Parsers/ExpressionElementParsers.cpp index 594221fe050..5dc791f7003 100644 --- a/src/Parsers/ExpressionElementParsers.cpp +++ b/src/Parsers/ExpressionElementParsers.cpp @@ -250,7 +250,7 @@ bool ParserTableAsStringLiteralIdentifier::parseImpl(Pos & pos, ASTPtr & node, E ReadBufferFromMemory in(pos->begin, pos->size()); String s; - if (!tryReadQuotedStringInto(s, in)) + if (!tryReadQuotedString(s, in)) { expected.add(pos, "string literal"); return false; diff --git a/src/Processors/Formats/Impl/CSVRowInputFormat.cpp b/src/Processors/Formats/Impl/CSVRowInputFormat.cpp index 5e12ec18d27..cab0f7523f1 100644 --- a/src/Processors/Formats/Impl/CSVRowInputFormat.cpp +++ b/src/Processors/Formats/Impl/CSVRowInputFormat.cpp @@ -391,7 +391,7 @@ bool CSVFormatReader::readFieldImpl(ReadBuffer & istr, DB::IColumn & column, con if (format_settings.null_as_default && !isNullableOrLowCardinalityNullable(type)) { /// If value is null but type is not nullable then use default value instead. - return SerializationNullable::deserializeTextCSVImpl(column, istr, format_settings, serialization); + return SerializationNullable::deserializeNullAsDefaultOrNestedTextCSV(column, istr, format_settings, serialization); } /// Read the column normally. 
diff --git a/src/Processors/Formats/Impl/MySQLDumpRowInputFormat.cpp b/src/Processors/Formats/Impl/MySQLDumpRowInputFormat.cpp
index 7e8b4accf4d..9c7f095e661 100644
--- a/src/Processors/Formats/Impl/MySQLDumpRowInputFormat.cpp
+++ b/src/Processors/Formats/Impl/MySQLDumpRowInputFormat.cpp
@@ -409,7 +409,7 @@ bool MySQLDumpRowInputFormat::readField(IColumn & column, size_t column_idx)
     const auto & type = types[column_idx];
     const auto & serialization = serializations[column_idx];
     if (format_settings.null_as_default && !isNullableOrLowCardinalityNullable(type))
-        return SerializationNullable::deserializeTextQuotedImpl(column, *in, format_settings, serialization);
+        return SerializationNullable::deserializeNullAsDefaultOrNestedTextQuoted(column, *in, format_settings, serialization);

     serialization->deserializeTextQuoted(column, *in, format_settings);
     return true;
diff --git a/src/Processors/Formats/Impl/TSKVRowInputFormat.cpp b/src/Processors/Formats/Impl/TSKVRowInputFormat.cpp
index f4f92583473..0f68c28ab1f 100644
--- a/src/Processors/Formats/Impl/TSKVRowInputFormat.cpp
+++ b/src/Processors/Formats/Impl/TSKVRowInputFormat.cpp
@@ -147,7 +147,7 @@ bool TSKVRowInputFormat::readRow(MutableColumns & columns, RowReadExtension & ex
                 const auto & type = getPort().getHeader().getByPosition(index).type;
                 const auto & serialization = serializations[index];
                 if (format_settings.null_as_default && !isNullableOrLowCardinalityNullable(type))
-                    read_columns[index] = SerializationNullable::deserializeTextEscapedImpl(*columns[index], *in, format_settings, serialization);
+                    read_columns[index] = SerializationNullable::deserializeNullAsDefaultOrNestedTextEscaped(*columns[index], *in, format_settings, serialization);
                 else
                     serialization->deserializeTextEscaped(*columns[index], *in, format_settings);
             }
diff --git a/src/Processors/Formats/Impl/TabSeparatedRowInputFormat.cpp b/src/Processors/Formats/Impl/TabSeparatedRowInputFormat.cpp
index 3205adc2a48..88eb11d130d 100644
--- a/src/Processors/Formats/Impl/TabSeparatedRowInputFormat.cpp
+++ b/src/Processors/Formats/Impl/TabSeparatedRowInputFormat.cpp
@@ -167,7 +167,7 @@ bool TabSeparatedFormatReader::readField(IColumn & column, const DataTypePtr & t
     if (is_raw)
     {
         if (as_nullable)
-            return SerializationNullable::deserializeTextRawImpl(column, *buf, format_settings, serialization);
+            return SerializationNullable::deserializeNullAsDefaultOrNestedTextRaw(column, *buf, format_settings, serialization);

         serialization->deserializeTextRaw(column, *buf, format_settings);
         return true;
@@ -175,7 +175,7 @@ bool TabSeparatedFormatReader::readField(IColumn & column, const DataTypePtr & t
     if (as_nullable)
-        return SerializationNullable::deserializeTextEscapedImpl(column, *buf, format_settings, serialization);
+        return SerializationNullable::deserializeNullAsDefaultOrNestedTextEscaped(column, *buf, format_settings, serialization);

     serialization->deserializeTextEscaped(column, *buf, format_settings);
     return true;
diff --git a/src/Processors/Formats/Impl/ValuesBlockInputFormat.cpp b/src/Processors/Formats/Impl/ValuesBlockInputFormat.cpp
index d55ccce8879..a7b5795b89e 100644
--- a/src/Processors/Formats/Impl/ValuesBlockInputFormat.cpp
+++ b/src/Processors/Formats/Impl/ValuesBlockInputFormat.cpp
@@ -293,7 +293,7 @@ bool ValuesBlockInputFormat::tryReadValue(IColumn & column, size_t column_idx)
         const auto & type = types[column_idx];
         const auto & serialization = serializations[column_idx];
         if (format_settings.null_as_default && !isNullableOrLowCardinalityNullable(type))
-            read =
SerializationNullable::deserializeTextQuotedImpl(column, *buf, format_settings, serialization); + read = SerializationNullable::deserializeNullAsDefaultOrNestedTextQuoted(column, *buf, format_settings, serialization); else serialization->deserializeTextQuoted(column, *buf, format_settings); } diff --git a/tests/queries/0_stateless/02940_variant_text_deserialization.reference b/tests/queries/0_stateless/02940_variant_text_deserialization.reference new file mode 100644 index 00000000000..98725917567 --- /dev/null +++ b/tests/queries/0_stateless/02940_variant_text_deserialization.reference @@ -0,0 +1,516 @@ +JSON +String +{"v":null,"variantElement(v, 'String')":null} +{"v":"string","variantElement(v, 'String')":"string"} +{"v":"42","variantElement(v, 'String')":null} +FixedString +{"v":null,"variantElement(v, 'FixedString(4)')":null} +{"v":"string","variantElement(v, 'FixedString(4)')":null} +{"v":"abcd","variantElement(v, 'FixedString(4)')":"abcd"} +Bool +{"v":null,"variantElement(v, 'Bool')":null} +{"v":"string","variantElement(v, 'Bool')":null} +{"v":true,"variantElement(v, 'Bool')":true} +Integers +{"v":null,"variantElement(v, 'Int8')":null} +{"v":"string","variantElement(v, 'Int8')":null} +{"v":-1,"variantElement(v, 'Int8')":-1} +{"v":0,"variantElement(v, 'Int8')":0} +{"v":"10000000000","variantElement(v, 'Int8')":null} +{"v":null,"variantElement(v, 'UInt8')":null} +{"v":"string","variantElement(v, 'UInt8')":null} +{"v":"-1","variantElement(v, 'UInt8')":null} +{"v":0,"variantElement(v, 'UInt8')":0} +{"v":"10000000000","variantElement(v, 'UInt8')":null} +{"v":null,"variantElement(v, 'Int16')":null} +{"v":"string","variantElement(v, 'Int16')":null} +{"v":-1,"variantElement(v, 'Int16')":-1} +{"v":0,"variantElement(v, 'Int16')":0} +{"v":"10000000000","variantElement(v, 'Int16')":null} +{"v":null,"variantElement(v, 'UInt16')":null} +{"v":"string","variantElement(v, 'UInt16')":null} +{"v":"-1","variantElement(v, 'UInt16')":null} +{"v":0,"variantElement(v, 'UInt16')":0} +{"v":"10000000000","variantElement(v, 'UInt16')":null} +{"v":null,"variantElement(v, 'Int32')":null} +{"v":"string","variantElement(v, 'Int32')":null} +{"v":-1,"variantElement(v, 'Int32')":-1} +{"v":0,"variantElement(v, 'Int32')":0} +{"v":"10000000000","variantElement(v, 'Int32')":null} +{"v":null,"variantElement(v, 'UInt32')":null} +{"v":"string","variantElement(v, 'UInt32')":null} +{"v":"-1","variantElement(v, 'UInt32')":null} +{"v":0,"variantElement(v, 'UInt32')":0} +{"v":"10000000000","variantElement(v, 'UInt32')":null} +{"v":null,"variantElement(v, 'Int64')":null} +{"v":"string","variantElement(v, 'Int64')":null} +{"v":"-1","variantElement(v, 'Int64')":"-1"} +{"v":"0","variantElement(v, 'Int64')":"0"} +{"v":"10000000000000000000000","variantElement(v, 'Int64')":null} +{"v":null,"variantElement(v, 'UInt64')":null} +{"v":"string","variantElement(v, 'UInt64')":null} +{"v":"-1","variantElement(v, 'UInt64')":null} +{"v":"0","variantElement(v, 'UInt64')":"0"} +{"v":"10000000000000000000000","variantElement(v, 'UInt64')":null} +{"v":null,"variantElement(v, 'Int128')":null} +{"v":"string","variantElement(v, 'Int128')":null} +{"v":"-1","variantElement(v, 'Int128')":"-1"} +{"v":"0","variantElement(v, 'Int128')":"0"} +{"v":null,"variantElement(v, 'UInt128')":null} +{"v":"string","variantElement(v, 'UInt128')":null} +{"v":"-1","variantElement(v, 'UInt128')":null} +{"v":"0","variantElement(v, 'UInt128')":"0"} +Floats +{"v":null,"variantElement(v, 'Float32')":null} +{"v":"string","variantElement(v, 'Float32')":null} 
+{"v":42.42,"variantElement(v, 'Float32')":42.42} +{"v":null,"variantElement(v, 'Float64')":null} +{"v":"string","variantElement(v, 'Float64')":null} +{"v":42.42,"variantElement(v, 'Float64')":42.42} +Decimals +{"v":null,"variantElement(v, 'Decimal32(6)')":null} +{"v":"string","variantElement(v, 'Decimal32(6)')":null} +{"v":42.42,"variantElement(v, 'Decimal32(6)')":42.42} +{"v":"4242424242424242424242424242424242424242424242424242424242424242424242424242424242424242424242.424242424242424242","variantElement(v, 'Decimal32(6)')":null} +{"v":null,"variantElement(v, 'Decimal64(6)')":null} +{"v":"string","variantElement(v, 'Decimal64(6)')":null} +{"v":42.42,"variantElement(v, 'Decimal64(6)')":42.42} +{"v":"4242424242424242424242424242424242424242424242424242424242424242424242424242424242424242424242.424242424242424242","variantElement(v, 'Decimal64(6)')":null} +{"v":null,"variantElement(v, 'Decimal128(6)')":null} +{"v":"string","variantElement(v, 'Decimal128(6)')":null} +{"v":42.42,"variantElement(v, 'Decimal128(6)')":42.42} +{"v":"4242424242424242424242424242424242424242424242424242424242424242424242424242424242424242424242.424242424242424242","variantElement(v, 'Decimal128(6)')":null} +{"v":null,"variantElement(v, 'Decimal256(6)')":null} +{"v":"string","variantElement(v, 'Decimal256(6)')":null} +{"v":42.42,"variantElement(v, 'Decimal256(6)')":42.42} +{"v":"4242424242424242424242424242424242424242424242424242424242424242424242424242424242424242424242.424242424242424242","variantElement(v, 'Decimal256(6)')":null} +Dates and DateTimes +{"v":null,"variantElement(v, 'Date')":null} +{"v":"string","variantElement(v, 'Date')":null} +{"v":"2020-01-01","variantElement(v, 'Date')":"2020-01-01"} +{"v":"2020-01-01 00:00:00.999","variantElement(v, 'Date')":null} +{"v":null,"variantElement(v, 'Date32')":null} +{"v":"string","variantElement(v, 'Date32')":null} +{"v":"1900-01-01","variantElement(v, 'Date32')":"1900-01-01"} +{"v":"2020-01-01 00:00:00.999","variantElement(v, 'Date32')":null} +{"v":null,"variantElement(v, 'DateTime')":null} +{"v":"string","variantElement(v, 'DateTime')":null} +{"v":"2020-01-01 00:00:00","variantElement(v, 'DateTime')":"2020-01-01 00:00:00"} +{"v":"2020-01-01 00:00:00.999","variantElement(v, 'DateTime')":null} +{"v":null,"variantElement(v, 'DateTime64')":null} +{"v":"string","variantElement(v, 'DateTime64')":null} +{"v":"2020-01-01 00:00:00.999","variantElement(v, 'DateTime64')":"2020-01-01 00:00:00.999"} +{"v":"2020-01-01 00:00:00.999999999 ABC","variantElement(v, 'DateTime64')":null} +UUID +{"v":null,"variantElement(v, 'UUID')":null} +{"v":"string","variantElement(v, 'UUID')":null} +{"v":"c8619cca-0caa-445e-ae76-1d4f6e0b3927","variantElement(v, 'UUID')":"c8619cca-0caa-445e-ae76-1d4f6e0b3927"} +IPv4 +{"v":null,"variantElement(v, 'IPv4')":null} +{"v":"string","variantElement(v, 'IPv4')":null} +{"v":"127.0.0.1","variantElement(v, 'IPv4')":"127.0.0.1"} +IPv6 +{"v":null,"variantElement(v, 'IPv6')":null} +{"v":"string","variantElement(v, 'IPv6')":null} +{"v":"2001:db8:85a3::8a2e:370:7334","variantElement(v, 'IPv6')":"2001:db8:85a3::8a2e:370:7334"} +Enum +{"v":null,"variantElement(v, 'Enum(\\'a\\' = 1)')":null} +{"v":"string","variantElement(v, 'Enum(\\'a\\' = 1)')":null} +{"v":"a","variantElement(v, 'Enum(\\'a\\' = 1)')":"a"} +{"v":"a","variantElement(v, 'Enum(\\'a\\' = 1)')":"a"} +{"v":2,"variantElement(v, 'Enum(\\'a\\' = 1)')":null} +Map +{"v":null,"variantElement(v, 'Map(String, UInt64)')":{}} +{"v":"string","variantElement(v, 'Map(String, UInt64)')":{}} 
+{"v":{"a":"42","b":"43","c":"0"},"variantElement(v, 'Map(String, UInt64)')":{"a":"42","b":"43","c":"0"}} +{"v":"{\"c\" : 44, \"d\" : [1,2,3]}","variantElement(v, 'Map(String, UInt64)')":{}} +Tuple +{"v":null,"variantElement(v, 'Tuple(a UInt64, b UInt64)')":{"a":"0","b":"0"}} +{"v":"string","variantElement(v, 'Tuple(a UInt64, b UInt64)')":{"a":"0","b":"0"}} +{"v":{"a":"42","b":"0"},"variantElement(v, 'Tuple(a UInt64, b UInt64)')":{"a":"42","b":"0"}} +{"v":{"a":"44","b":"0"},"variantElement(v, 'Tuple(a UInt64, b UInt64)')":{"a":"44","b":"0"}} +\N (0,0) +string (0,0) +(42,0) (42,0) +{"a" : 44, "d" : 32} (0,0) +Array +{"v":null,"variantElement(v, 'Array(UInt64)')":[]} +{"v":"string","variantElement(v, 'Array(UInt64)')":[]} +{"v":["1","2","3"],"variantElement(v, 'Array(UInt64)')":["1","2","3"]} +{"v":["0","0","0"],"variantElement(v, 'Array(UInt64)')":["0","0","0"]} +{"v":"[1, 2, \"hello\"]","variantElement(v, 'Array(UInt64)')":[]} +LowCardinality +{"v":null,"variantElement(v, 'LowCardinality(String)')":null} +{"v":"string","variantElement(v, 'LowCardinality(String)')":"string"} +{"v":"42","variantElement(v, 'LowCardinality(String)')":null} +{"v":null,"variantElement(v, 'Array(LowCardinality(Nullable(String)))')":[]} +{"v":["string",null],"variantElement(v, 'Array(LowCardinality(Nullable(String)))')":["string",null]} +{"v":"42","variantElement(v, 'Array(LowCardinality(Nullable(String)))')":[]} +Nullable +{"v":null,"variantElement(v, 'Array(Nullable(String))')":[]} +{"v":"string","variantElement(v, 'Array(Nullable(String))')":[]} +{"v":["hello",null,"world"],"variantElement(v, 'Array(Nullable(String))')":["hello",null,"world"]} +{"repeat('-', 80)":"--------------------------------------------------------------------------------"} +CSV +String +\N,\N +"string","string" +"string","string" +42,\N +FixedString +\N,\N +"string",\N +"string",\N +"abcd","abcd" +Bool +\N,\N +"Truee",\N +true,true +Integers +\N,\N +"string",\N +-1,-1 +0,0 +10000000000,\N +"42d42",\N +\N,\N +"string",\N +-1,\N +0,0 +10000000000,\N +"42d42",\N +\N,\N +"string",\N +-1,-1 +0,0 +10000000000,\N +"42d42",\N +\N,\N +"string",\N +-1,\N +0,0 +10000000000,\N +"42d42",\N +\N,\N +"string",\N +-1,-1 +0,0 +10000000000,\N +"42d42",\N +\N,\N +"string",\N +-1,\N +0,0 +10000000000,\N +"42d42",\N +\N,\N +"string",\N +-1,-1 +0,0 +10000000000000000000000,\N +"42d42",\N +\N,\N +"string",\N +-1,\N +0,0 +10000000000000000000000,\N +"42d42",\N +\N,\N +"string",\N +-1,-1 +0,0 +"42d42",\N +\N,\N +"string",\N +-1,\N +0,0 +"42d42",\N +Floats +\N,\N +"string",\N +42.42,42.42 +"42.d42",\N +\N,\N +"string",\N +42.42,42.42 +"42.d42",\N +Decimals +\N,\N +"string",\N +42.42,42.42 +"42d42",\N +"4242424242424242424242424242424242424242424242424242424242424242424242424242424242424242424242.424242424242424242",\N +\N,\N +"string",\N +42.42,42.42 +"42d42",\N +"4242424242424242424242424242424242424242424242424242424242424242424242424242424242424242424242.424242424242424242",\N +\N,\N +"string",\N +42.42,42.42 +"42d42",\N +"4242424242424242424242424242424242424242424242424242424242424242424242424242424242424242424242.424242424242424242",\N +\N,\N +"string",\N +42.42,42.42 +"42d42",\N +"4242424242424242424242424242424242424242424242424242424242424242424242424242424242424242424242.424242424242424242",\N +Dates and DateTimes +\N,\N +"string",\N +"2020-01-d1",\N +"2020-01-01","2020-01-01" +"2020-01-01 00:00:00.999",\N +\N,\N +"string",\N +"2020-01-d1",\N +"1900-01-01","1900-01-01" +"2020-01-01 00:00:00.999",\N +\N,\N +"string",\N +"2020-01-d1",\N +"2020-01-01 
00:00:00","2020-01-01 00:00:00" +"2020-01-01 00:00:00.999",\N +\N,\N +"string",\N +"2020-01-d1",\N +"2020-01-01 00:00:00.999","2020-01-01 00:00:00.999" +"2020-01-01 00:00:00.999999999 ABC",\N +UUID +\N,\N +"string",\N +"c8619cca-0caa-445e-ae76-1d4f6e0b3927","c8619cca-0caa-445e-ae76-1d4f6e0b3927" +"c8619cca-0caa-445e-ae76-1d4f6e0b3927AAA",\N +IPv4 +\N,\N +"string",\N +"127.0.0.1","127.0.0.1" +"127.0.0.1AAA",\N +IPv6 +\N,\N +"string",\N +"2001:db8:85a3::8a2e:370:7334","2001:db8:85a3::8a2e:370:7334" +"2001:0db8:85a3:0000:0000:8a2e:0370:7334AAA",\N +Enum +\N,\N +"string",\N +"a","a" +"a","a" +2,\N +"aa",\N +Map +\N,"{}" +"string","{}" +"{'a':42,'b':43,'c':0}","{'a':42,'b':43,'c':0}" +"{'c' : 44, 'd' : [1,2,3]}","{}" +"{'c' : 44","{}" +Array +\N,"[]" +"string","[]" +"[1,2,3]","[1,2,3]" +"[0,0,0]","[0,0,0]" +"[1, 2, 'hello']","[]" +"[1, 2","[]" +LowCardinality +\N,\N +"string","string" +42,\N +\N,"[]" +"['string',NULL]","['string',NULL]" +"['string', nul]","[]" +42,"[]" +Nullable +\N,"[]" +"string","[]" +"['hello',NULL,'world']","['hello',NULL,'world']" +"['hello', nul]","[]" +{"repeat('-', 80)":"--------------------------------------------------------------------------------"} +TSV +String +\N \N +string string +42 \N +FixedString +\N \N +string \N +abcd abcd +Bool +\N \N +Truee \N +true true +Integers +\N \N +string \N +-1 -1 +0 0 +10000000000 \N +42d42 \N +\N \N +string \N +-1 \N +0 0 +10000000000 \N +42d42 \N +\N \N +string \N +-1 -1 +0 0 +10000000000 \N +42d42 \N +\N \N +string \N +-1 \N +0 0 +10000000000 \N +42d42 \N +\N \N +string \N +-1 -1 +0 0 +10000000000 \N +42d42 \N +\N \N +string \N +-1 \N +0 0 +10000000000 \N +42d42 \N +\N \N +string \N +-1 -1 +0 0 +10000000000000000000000 \N +42d42 \N +\N \N +string \N +-1 \N +0 0 +10000000000000000000000 \N +42d42 \N +\N \N +string \N +-1 -1 +0 0 +42d42 \N +\N \N +string \N +-1 \N +0 0 +42d42 \N +Floats +\N \N +string \N +42.42 42.42 +42.d42 \N +\N \N +string \N +42.42 42.42 +42.d42 \N +Decimals +\N \N +string \N +42.42 42.42 +42d42 \N +4242424242424242424242424242424242424242424242424242424242424242424242424242424242424242424242.424242424242424242 \N +\N \N +string \N +42.42 42.42 +42d42 \N +4242424242424242424242424242424242424242424242424242424242424242424242424242424242424242424242.424242424242424242 \N +\N \N +string \N +42.42 42.42 +42d42 \N +4242424242424242424242424242424242424242424242424242424242424242424242424242424242424242424242.424242424242424242 \N +\N \N +string \N +42.42 42.42 +42d42 \N +4242424242424242424242424242424242424242424242424242424242424242424242424242424242424242424242.424242424242424242 \N +Dates and DateTimes +\N \N +string \N +2020-01-d1 \N +2020-01-01 2020-01-01 +2020-01-01 00:00:00.999 \N +\N \N +string \N +2020-01-d1 \N +1900-01-01 1900-01-01 +2020-01-01 00:00:00.999 \N +\N \N +string \N +2020-01-d1 \N +2020-01-01 00:00:00 2020-01-01 00:00:00 +2020-01-01 00:00:00.999 \N +\N \N +string \N +2020-01-d1 \N +2020-01-01 00:00:00.999 2020-01-01 00:00:00.999 +2020-01-01 00:00:00.999999999 ABC \N +UUID +\N \N +string \N +c8619cca-0caa-445e-ae76-1d4f6e0b3927 c8619cca-0caa-445e-ae76-1d4f6e0b3927 +c8619cca-0caa-445e-ae76-1d4f6e0b3927AAA \N +IPv4 +\N \N +string \N +127.0.0.1 127.0.0.1 +127.0.0.1AAA \N +IPv6 +\N \N +string \N +2001:db8:85a3::8a2e:370:7334 2001:db8:85a3::8a2e:370:7334 +2001:0db8:85a3:0000:0000:8a2e:0370:7334AAA \N +Enum +\N \N +string \N +a a +a a +2 \N +aa \N +Map +\N {} +string {} +{'a':42,'b':43,'c':0} {'a':42,'b':43,'c':0} +{\'c\' : 44, \'d\' : [1,2,3]} {} +{\'c\' : 44 {} +Array +\N [] +string [] +[1,2,3] 
[1,2,3] +[0,0,0] [0,0,0] +[1, 2, \'hello\'] [] +[1, 2 [] +LowCardinality +\N \N +string string +42 \N +\N [] +['string',NULL] ['string',NULL] +[\'string\', nul] [] +42 [] +Nullable +\N [] +string [] +['hello',NULL,'world'] ['hello',NULL,'world'] +[\'hello\', nul] [] +{"repeat('-', 80)":"--------------------------------------------------------------------------------"} +Values +String +(NULL,NULL),('string','string'),(42,NULL)FixedString +(NULL,NULL),('string',NULL),('abcd','abcd')Bool +(NULL,NULL),(true,true)Integers +(NULL,NULL),('string',NULL),(-1,-1),(0,0),(10000000000,NULL)(NULL,NULL),('string',NULL),(-1,NULL),(0,0),(10000000000,NULL)(NULL,NULL),('string',NULL),(-1,-1),(0,0),(10000000000,NULL)(NULL,NULL),('string',NULL),(-1,NULL),(0,0),(10000000000,NULL)(NULL,NULL),('string',NULL),(-1,-1),(0,0),(10000000000,NULL)(NULL,NULL),('string',NULL),(-1,NULL),(0,0),(10000000000,NULL)(NULL,NULL),('string',NULL),(-1,-1),(0,0),(10000000000000000000000,NULL)(NULL,NULL),('string',NULL),(-1,NULL),(0,0),(10000000000000000000000,NULL)(NULL,NULL),('string',NULL),(-1,-1),(0,0)(NULL,NULL),('string',NULL),(-1,NULL),(0,0)Floats +(NULL,NULL),('string',NULL),(42.42,42.42)(NULL,NULL),('string',NULL),(42.42,42.42)Decimals +(NULL,NULL),('string',NULL),(42.42,42.42)(NULL,NULL),('string',NULL),(42.42,42.42)(NULL,NULL),('string',NULL),(42.42,42.42)(NULL,NULL),('string',NULL),(42.42,42.42)Dates and DateTimes +(NULL,NULL),('string',NULL),('1970-01-01 00:00:00.000',NULL),('2020-01-01','2020-01-01'),('2020-01-01 00:00:00.999',NULL)(NULL,NULL),('string',NULL),('1970-01-01 00:00:00.000',NULL),('1900-01-01','1900-01-01'),('2020-01-01 00:00:00.999',NULL)(NULL,NULL),('string',NULL),('1970-01-01 00:00:00.000',NULL),('2020-01-01 00:00:00','2020-01-01 00:00:00'),('2020-01-01 00:00:00.999',NULL)(NULL,NULL),('string',NULL),('1970-01-01 00:00:00.000','1970-01-01 00:00:00.000'),('2020-01-01 00:00:00.999',NULL),('2020-01-01 00:00:00.999999999 ABC',NULL)UUID +(NULL,NULL),('string',NULL),('c8619cca-0caa-445e-ae76-1d4f6e0b3927','c8619cca-0caa-445e-ae76-1d4f6e0b3927'),('c8619cca-0caa-445e-ae76-1d4f6e0b3927AAA',NULL)IPv4 +(NULL,NULL),('string',NULL),('127.0.0.1','127.0.0.1'),('127.0.0.1AAA',NULL)IPv6 +(NULL,NULL),('string',NULL),('2001:db8:85a3::8a2e:370:7334','2001:db8:85a3::8a2e:370:7334'),('2001:0db8:85a3:0000:0000:8a2e:0370:7334AAA',NULL)Enum +(NULL,NULL),('string',NULL),('a','a'),(1,NULL),(2,NULL),('aa',NULL)Map +(NULL,{}),('string',{}),({'a':42,'b':43,'c':0},{'a':42,'b':43,'c':0})Array +(NULL,[]),('string',[]),([1,2,3],[1,2,3]),([0,0,0],[0,0,0])LowCardinality +(NULL,NULL),('string','string'),(42,NULL)(NULL,[]),(['string',NULL],['string',NULL]),(42,[])Nullable +(NULL,[]),('string',[]),(['hello',NULL,'world'],['hello',NULL,'world']) diff --git a/tests/queries/0_stateless/02940_variant_text_deserialization.sql b/tests/queries/0_stateless/02940_variant_text_deserialization.sql new file mode 100644 index 00000000000..041d02088ef --- /dev/null +++ b/tests/queries/0_stateless/02940_variant_text_deserialization.sql @@ -0,0 +1,266 @@ +set allow_experimental_variant_type = 1; +set session_timezone = 'UTC'; + +select 'JSON'; +select 'String'; +select v, variantElement(v, 'String') from format(JSONEachRow, 'v Variant(String, UInt64)', '{"v" : null}, {"v" : "string"}, {"v" : 42}') format JSONEachRow; + +select 'FixedString'; +select v, variantElement(v, 'FixedString(4)') from format(JSONEachRow, 'v Variant(String, FixedString(4))', '{"v" : null}, {"v" : "string"}, {"v" : "abcd"}') format JSONEachRow; + +select 'Bool'; +select v, 
variantElement(v, 'Bool') from format(JSONEachRow, 'v Variant(String, Bool)', '{"v" : null}, {"v" : "string"}, {"v" : true}') format JSONEachRow; + +select 'Integers'; +select v, variantElement(v, 'Int8') from format(JSONEachRow, 'v Variant(String, Int8, UInt64)', '{"v" : null}, {"v" : "string"}, {"v" : -1}, {"v" : 0}, {"v" : 10000000000}') format JSONEachRow; +select v, variantElement(v, 'UInt8') from format(JSONEachRow, 'v Variant(String, UInt8, Int64)', '{"v" : null}, {"v" : "string"}, {"v" : -1}, {"v" : 0}, {"v" : 10000000000}') format JSONEachRow; +select v, variantElement(v, 'Int16') from format(JSONEachRow, 'v Variant(String, Int16, Int64)', '{"v" : null}, {"v" : "string"}, {"v" : -1}, {"v" : 0}, {"v" : 10000000000}') format JSONEachRow; +select v, variantElement(v, 'UInt16') from format(JSONEachRow, 'v Variant(String, UInt16, Int64)', '{"v" : null}, {"v" : "string"}, {"v" : -1}, {"v" : 0}, {"v" : 10000000000}') format JSONEachRow; +select v, variantElement(v, 'Int32') from format(JSONEachRow, 'v Variant(String, Int32, Int64)', '{"v" : null}, {"v" : "string"}, {"v" : -1}, {"v" : 0}, {"v" : 10000000000}') format JSONEachRow; +select v, variantElement(v, 'UInt32') from format(JSONEachRow, 'v Variant(String, UInt32, Int64)', '{"v" : null}, {"v" : "string"}, {"v" : -1}, {"v" : 0}, {"v" : 10000000000}') format JSONEachRow; +select v, variantElement(v, 'Int64') from format(JSONEachRow, 'v Variant(String, Int64, Int128)', '{"v" : null}, {"v" : "string"}, {"v" : -1}, {"v" : 0}, {"v" : 10000000000000000000000}') format JSONEachRow; +select v, variantElement(v, 'UInt64') from format(JSONEachRow, 'v Variant(String, UInt64, Int128)', '{"v" : null}, {"v" : "string"}, {"v" : -1}, {"v" : 0}, {"v" : 10000000000000000000000}') format JSONEachRow; +select v, variantElement(v, 'Int128') from format(JSONEachRow, 'v Variant(String, Int128, Int256)', '{"v" : null}, {"v" : "string"}, {"v" : -1}, {"v" : 0}') format JSONEachRow; +select v, variantElement(v, 'UInt128') from format(JSONEachRow, 'v Variant(String, UInt128, Int256)', '{"v" : null}, {"v" : "string"}, {"v" : -1}, {"v" : 0}') format JSONEachRow; + +select 'Floats'; +select v, variantElement(v, 'Float32') from format(JSONEachRow, 'v Variant(String, Float32)', '{"v" : null}, {"v" : "string"}, {"v" : 42.42}') format JSONEachRow; +select v, variantElement(v, 'Float64') from format(JSONEachRow, 'v Variant(String, Float64)', '{"v" : null}, {"v" : "string"}, {"v" : 42.42}') format JSONEachRow; + +select 'Decimals'; +select v, variantElement(v, 'Decimal32(6)') from format(JSONEachRow, 'v Variant(String, Decimal32(6))', '{"v" : null}, {"v" : "string"}, {"v" : 42.42}, {"v" : 4242424242424242424242424242424242424242424242424242424242424242424242424242424242424242424242.424242424242424242}') format JSONEachRow; +select v, variantElement(v, 'Decimal64(6)') from format(JSONEachRow, 'v Variant(String, Decimal64(6))', '{"v" : null}, {"v" : "string"}, {"v" : 42.42}, {"v" : 4242424242424242424242424242424242424242424242424242424242424242424242424242424242424242424242.424242424242424242}') format JSONEachRow; +select v, variantElement(v, 'Decimal128(6)') from format(JSONEachRow, 'v Variant(String, Decimal128(6))', '{"v" : null}, {"v" : "string"}, {"v" : 42.42}, {"v" : 4242424242424242424242424242424242424242424242424242424242424242424242424242424242424242424242.424242424242424242}') format JSONEachRow; +select v, variantElement(v, 'Decimal256(6)') from format(JSONEachRow, 'v Variant(String, Decimal256(6))', '{"v" : null}, {"v" : "string"}, {"v" : 42.42}, {"v" : 
4242424242424242424242424242424242424242424242424242424242424242424242424242424242424242424242.424242424242424242}') format JSONEachRow; + +select 'Dates and DateTimes'; +select v, variantElement(v, 'Date') from format(JSONEachRow, 'v Variant(String, Date, DateTime64)', '{"v" : null}, {"v" : "string"}, {"v" : "2020-01-01"}, {"v" : "2020-01-01 00:00:00.999"}') format JSONEachRow; +select v, variantElement(v, 'Date32') from format(JSONEachRow, 'v Variant(String, Date32, DateTime64)', '{"v" : null}, {"v" : "string"}, {"v" : "1900-01-01"}, {"v" : "2020-01-01 00:00:00.999"}') format JSONEachRow; +select v, variantElement(v, 'DateTime') from format(JSONEachRow, 'v Variant(String, DateTime, DateTime64)', '{"v" : null}, {"v" : "string"}, {"v" : "2020-01-01 00:00:00"}, {"v" : "2020-01-01 00:00:00.999"}') format JSONEachRow; +select v, variantElement(v, 'DateTime64') from format(JSONEachRow, 'v Variant(String, DateTime64)', '{"v" : null}, {"v" : "string"}, {"v" : "2020-01-01 00:00:00.999"}, {"v" : "2020-01-01 00:00:00.999999999 ABC"}') format JSONEachRow; + +select 'UUID'; +select v, variantElement(v, 'UUID') from format(JSONEachRow, 'v Variant(String, UUID)', '{"v" : null}, {"v" : "string"}, {"v" : "c8619cca-0caa-445e-ae76-1d4f6e0b3927"}') format JSONEachRow; + +select 'IPv4'; +select v, variantElement(v, 'IPv4') from format(JSONEachRow, 'v Variant(String, IPv4)', '{"v" : null}, {"v" : "string"}, {"v" : "127.0.0.1"}') format JSONEachRow; + +select 'IPv6'; +select v, variantElement(v, 'IPv6') from format(JSONEachRow, 'v Variant(String, IPv6)', '{"v" : null}, {"v" : "string"}, {"v" : "2001:0db8:85a3:0000:0000:8a2e:0370:7334"}') format JSONEachRow; + +select 'Enum'; +select v, variantElement(v, 'Enum(''a'' = 1)') from format(JSONEachRow, 'v Variant(String, UInt32, Enum(''a'' = 1))', '{"v" : null}, {"v" : "string"}, {"v" : "a"}, {"v" : 1}, {"v" : 2}') format JSONEachRow; + +select 'Map'; +select v, variantElement(v, 'Map(String, UInt64)') from format(JSONEachRow, 'v Variant(String, Map(String, UInt64))', '{"v" : null}, {"v" : "string"}, {"v" : {"a" : 42, "b" : 43, "c" : null}}, {"v" : {"c" : 44, "d" : [1,2,3]}}') format JSONEachRow; + +select 'Tuple'; +select v, variantElement(v, 'Tuple(a UInt64, b UInt64)') from format(JSONEachRow, 'v Variant(String, Tuple(a UInt64, b UInt64))', '{"v" : null}, {"v" : "string"}, {"v" : {"a" : 42, "b" : null}}, {"v" : {"a" : 44, "d" : 32}}') format JSONEachRow; +select v, variantElement(v, 'Tuple(a UInt64, b UInt64)') from format(JSONEachRow, 'v Variant(String, Tuple(a UInt64, b UInt64))', '{"v" : null}, {"v" : "string"}, {"v" : {"a" : 42, "b" : null}}, {"v" : {"a" : 44, "d" : 32}}') settings input_format_json_defaults_for_missing_elements_in_named_tuple=0; + +select 'Array'; +select v, variantElement(v, 'Array(UInt64)') from format(JSONEachRow, 'v Variant(String, Array(UInt64))', '{"v" : null}, {"v" : "string"}, {"v" : [1, 2, 3]}, {"v" : [null, null, null]} {"v" : [1, 2, "hello"]}') format JSONEachRow; + +select 'LowCardinality'; +select v, variantElement(v, 'LowCardinality(String)') from format(JSONEachRow, 'v Variant(LowCardinality(String), UInt64)', '{"v" : null}, {"v" : "string"}, {"v" : 42}') format JSONEachRow; +select v, variantElement(v, 'Array(LowCardinality(Nullable(String)))') from format(JSONEachRow, 'v Variant(Array(LowCardinality(Nullable(String))), UInt64)', '{"v" : null}, {"v" : ["string", null]}, {"v" : 42}') format JSONEachRow; + +select 'Nullable'; +select v, variantElement(v, 'Array(Nullable(String))') from format(JSONEachRow, 'v Variant(String, 
Array(Nullable(String)))', '{"v" : null}, {"v" : "string"}, {"v" : ["hello", null, "world"]}') format JSONEachRow; + +select repeat('-', 80) format JSONEachRow; + +select 'CSV'; +select 'String'; +select v, variantElement(v, 'String') from format(CSV, 'v Variant(String, UInt64)', '\\N\n"string"\nstring\n42') format CSV; + +select 'FixedString'; +select v, variantElement(v, 'FixedString(4)') from format(CSV, 'v Variant(String, FixedString(4))', '\\N\n"string"\nstring\n"abcd"') format CSV; + +select 'Bool'; +select v, variantElement(v, 'Bool') from format(CSV, 'v Variant(String, Bool)', '\\N\nTruee\nTrue') format CSV; + +select 'Integers'; +select v, variantElement(v, 'Int8') from format(CSV, 'v Variant(String, Int8, UInt64)', '\n"string"\n-1\n0\n10000000000\n42d42') format CSV; +select v, variantElement(v, 'UInt8') from format(CSV, 'v Variant(String, UInt8, Int64)', '\\N\n"string"\n-1\n0\n10000000000\n42d42') format CSV; +select v, variantElement(v, 'Int16') from format(CSV, 'v Variant(String, Int16, Int64)', '\\N\n"string"\n-1\n0\n10000000000\n42d42') format CSV; +select v, variantElement(v, 'UInt16') from format(CSV, 'v Variant(String, UInt16, Int64)', '\\N\n"string"\n-1\n0\n10000000000\n42d42') format CSV; +select v, variantElement(v, 'Int32') from format(CSV, 'v Variant(String, Int32, Int64)', '\\N\n"string"\n-1\n0\n10000000000\n42d42') format CSV; +select v, variantElement(v, 'UInt32') from format(CSV, 'v Variant(String, UInt32, Int64)', '\\N\n"string"\n-1\n0\n10000000000\n42d42') format CSV; +select v, variantElement(v, 'Int64') from format(CSV, 'v Variant(String, Int64, Int128)', '\\N\n"string"\n-1\n0\n10000000000000000000000\n42d42') format CSV; +select v, variantElement(v, 'UInt64') from format(CSV, 'v Variant(String, UInt64, Int128)', '\\N\n"string"\n-1\n0\n10000000000000000000000\n42d42') format CSV; +select v, variantElement(v, 'Int128') from format(CSV, 'v Variant(String, Int128, Int256)', '\\N\n"string"\n-1\n0\n42d42') format CSV; +select v, variantElement(v, 'UInt128') from format(CSV, 'v Variant(String, UInt128, Int256)', '\\N\n"string"\n-1\n0\n42d42') format CSV; + +select 'Floats'; +select v, variantElement(v, 'Float32') from format(CSV, 'v Variant(String, Float32)', '\\N\n"string"\n42.42\n42.d42') format CSV; +select v, variantElement(v, 'Float64') from format(CSV, 'v Variant(String, Float64)', '\\N\n"string"\n42.42\n42.d42') format CSV; + +select 'Decimals'; +select v, variantElement(v, 'Decimal32(6)') from format(CSV, 'v Variant(String, Decimal32(6))', '\\N\n"string"\n42.42\n42d42\n4242424242424242424242424242424242424242424242424242424242424242424242424242424242424242424242.424242424242424242') format CSV; +select v, variantElement(v, 'Decimal64(6)') from format(CSV, 'v Variant(String, Decimal64(6))', '\\N\n"string"\n42.42\n42d42\n4242424242424242424242424242424242424242424242424242424242424242424242424242424242424242424242.424242424242424242') format CSV; +select v, variantElement(v, 'Decimal128(6)') from format(CSV, 'v Variant(String, Decimal128(6))', '\\N\n"string"\n42.42\n42d42\n4242424242424242424242424242424242424242424242424242424242424242424242424242424242424242424242.424242424242424242') format CSV; +select v, variantElement(v, 'Decimal256(6)') from format(CSV, 'v Variant(String, Decimal256(6))', '\\N\n"string"\n42.42\n42d42\n4242424242424242424242424242424242424242424242424242424242424242424242424242424242424242424242.424242424242424242') format CSV; + +select 'Dates and DateTimes'; +select v, variantElement(v, 'Date') from format(CSV, 'v Variant(String, 
Date, DateTime64)', '\\N\n"string"\n"2020-01-d1"\n"2020-01-01"\n"2020-01-01 00:00:00.999"') format CSV; +select v, variantElement(v, 'Date32') from format(CSV, 'v Variant(String, Date32, DateTime64)', '\\N\n"string"\n"2020-01-d1"\n"1900-01-01"\n"2020-01-01 00:00:00.999"') format CSV; +select v, variantElement(v, 'DateTime') from format(CSV, 'v Variant(String, DateTime, DateTime64)', '\\N\n"string"\n"2020-01-d1"\n"2020-01-01 00:00:00"\n"2020-01-01 00:00:00.999"') format CSV; +select v, variantElement(v, 'DateTime64') from format(CSV, 'v Variant(String, DateTime64)', '\\N\n"string"\n"2020-01-d1"\n"2020-01-01 00:00:00.999"\n"2020-01-01 00:00:00.999999999 ABC"') format CSV; + +select 'UUID'; +select v, variantElement(v, 'UUID') from format(CSV, 'v Variant(String, UUID)', '\\N\n"string"\n"c8619cca-0caa-445e-ae76-1d4f6e0b3927"\nc8619cca-0caa-445e-ae76-1d4f6e0b3927AAA') format CSV; + +select 'IPv4'; +select v, variantElement(v, 'IPv4') from format(CSV, 'v Variant(String, IPv4)', '\\N\n"string"\n"127.0.0.1"\n"127.0.0.1AAA"') format CSV; + +select 'IPv6'; +select v, variantElement(v, 'IPv6') from format(CSV, 'v Variant(String, IPv6)', '\\N\n"string"\n"2001:0db8:85a3:0000:0000:8a2e:0370:7334"\n2001:0db8:85a3:0000:0000:8a2e:0370:7334AAA') format CSV; + +select 'Enum'; +select v, variantElement(v, 'Enum(''a'' = 1)') from format(CSV, 'v Variant(String, UInt32, Enum(''a'' = 1))', '\\N\n"string"\n"a"\n1\n2\naa') format CSV; + +select 'Map'; +select v, variantElement(v, 'Map(String, UInt64)') from format(CSV, 'v Variant(String, Map(String, UInt64))', '\\N\n"string"\n"{''a'' : 42, ''b'' : 43, ''c'' : null}"\n"{''c'' : 44, ''d'' : [1,2,3]}"\n"{''c'' : 44"') format CSV; + +select 'Array'; +select v, variantElement(v, 'Array(UInt64)') from format(CSV, 'v Variant(String, Array(UInt64))', '\\N\n"string"\n"[1, 2, 3]"\n"[null, null, null]"\n"[1, 2, ''hello'']"\n"[1, 2"') format CSV; + +select 'LowCardinality'; +select v, variantElement(v, 'LowCardinality(String)') from format(CSV, 'v Variant(LowCardinality(String), UInt64)', '\\N\n"string"\n42') format CSV; +select v, variantElement(v, 'Array(LowCardinality(Nullable(String)))') from format(CSV, 'v Variant(Array(LowCardinality(Nullable(String))), UInt64, String)', '\\N\n"[''string'', null]"\n"[''string'', nul]"\n42') format CSV; + +select 'Nullable'; +select v, variantElement(v, 'Array(Nullable(String))') from format(CSV, 'v Variant(String, Array(Nullable(String)))', '\\N\n"string"\n"[''hello'', null, ''world'']"\n"[''hello'', nul]"') format CSV; + +select repeat('-', 80) format JSONEachRow; + +select 'TSV'; +select 'String'; +select v, variantElement(v, 'String') from format(TSV, 'v Variant(String, UInt64)', '\\N\nstring\n42') format TSV; + +select 'FixedString'; +select v, variantElement(v, 'FixedString(4)') from format(TSV, 'v Variant(String, FixedString(4))', '\\N\nstring\nabcd') format TSV; + +select 'Bool'; +select v, variantElement(v, 'Bool') from format(TSV, 'v Variant(String, Bool)', '\\N\nTruee\nTrue') format TSV; + +select 'Integers'; +select v, variantElement(v, 'Int8') from format(TSV, 'v Variant(String, Int8, UInt64)', '\\N\nstring\n-1\n0\n10000000000\n42d42') format TSV; +select v, variantElement(v, 'UInt8') from format(TSV, 'v Variant(String, UInt8, Int64)', '\\N\nstring\n-1\n0\n10000000000\n42d42') format TSV; +select v, variantElement(v, 'Int16') from format(TSV, 'v Variant(String, Int16, Int64)', '\\N\nstring\n-1\n0\n10000000000\n42d42') format TSV; +select v, variantElement(v, 'UInt16') from format(TSV, 'v Variant(String, UInt16, Int64)', 
'\\N\nstring\n-1\n0\n10000000000\n42d42') format TSV; +select v, variantElement(v, 'Int32') from format(TSV, 'v Variant(String, Int32, Int64)', '\\N\nstring\n-1\n0\n10000000000\n42d42') format TSV; +select v, variantElement(v, 'UInt32') from format(TSV, 'v Variant(String, UInt32, Int64)', '\\N\nstring\n-1\n0\n10000000000\n42d42') format TSV; +select v, variantElement(v, 'Int64') from format(TSV, 'v Variant(String, Int64, Int128)', '\\N\nstring\n-1\n0\n10000000000000000000000\n42d42') format TSV; +select v, variantElement(v, 'UInt64') from format(TSV, 'v Variant(String, UInt64, Int128)', '\\N\nstring\n-1\n0\n10000000000000000000000\n42d42') format TSV; +select v, variantElement(v, 'Int128') from format(TSV, 'v Variant(String, Int128, Int256)', '\\N\nstring\n-1\n0\n42d42') format TSV; +select v, variantElement(v, 'UInt128') from format(TSV, 'v Variant(String, UInt128, Int256)', '\\N\nstring\n-1\n0\n42d42') format TSV; + +select 'Floats'; +select v, variantElement(v, 'Float32') from format(TSV, 'v Variant(String, Float32)', '\\N\nstring\n42.42\n42.d42') format TSV; +select v, variantElement(v, 'Float64') from format(TSV, 'v Variant(String, Float64)', '\\N\nstring\n42.42\n42.d42') format TSV; + +select 'Decimals'; +select v, variantElement(v, 'Decimal32(6)') from format(TSV, 'v Variant(String, Decimal32(6))', '\\N\nstring\n42.42\n42d42\n4242424242424242424242424242424242424242424242424242424242424242424242424242424242424242424242.424242424242424242') format TSV; +select v, variantElement(v, 'Decimal64(6)') from format(TSV, 'v Variant(String, Decimal64(6))', '\\N\nstring\n42.42\n42d42\n4242424242424242424242424242424242424242424242424242424242424242424242424242424242424242424242.424242424242424242') format TSV; +select v, variantElement(v, 'Decimal128(6)') from format(TSV, 'v Variant(String, Decimal128(6))', '\\N\nstring\n42.42\n42d42\n4242424242424242424242424242424242424242424242424242424242424242424242424242424242424242424242.424242424242424242') format TSV; +select v, variantElement(v, 'Decimal256(6)') from format(TSV, 'v Variant(String, Decimal256(6))', '\\N\nstring\n42.42\n42d42\n4242424242424242424242424242424242424242424242424242424242424242424242424242424242424242424242.424242424242424242') format TSV; + +select 'Dates and DateTimes'; +select v, variantElement(v, 'Date') from format(TSV, 'v Variant(String, Date, DateTime64)', '\\N\nstring\n2020-01-d1\n2020-01-01\n2020-01-01 00:00:00.999') format TSV; +select v, variantElement(v, 'Date32') from format(TSV, 'v Variant(String, Date32, DateTime64)', '\\N\nstring\n2020-01-d1\n1900-01-01\n2020-01-01 00:00:00.999') format TSV; +select v, variantElement(v, 'DateTime') from format(TSV, 'v Variant(String, DateTime, DateTime64)', '\\N\nstring\n2020-01-d1\n2020-01-01 00:00:00\n2020-01-01 00:00:00.999') format TSV; +select v, variantElement(v, 'DateTime64') from format(TSV, 'v Variant(String, DateTime64)', '\\N\nstring\n2020-01-d1\n2020-01-01 00:00:00.999\n2020-01-01 00:00:00.999999999 ABC') format TSV; + +select 'UUID'; +select v, variantElement(v, 'UUID') from format(TSV, 'v Variant(String, UUID)', '\\N\nstring\nc8619cca-0caa-445e-ae76-1d4f6e0b3927\nc8619cca-0caa-445e-ae76-1d4f6e0b3927AAA') format TSV; + +select 'IPv4'; +select v, variantElement(v, 'IPv4') from format(TSV, 'v Variant(String, IPv4)', '\\N\nstring\n127.0.0.1\n127.0.0.1AAA') format TSV; + +select 'IPv6'; +select v, variantElement(v, 'IPv6') from format(TSV, 'v Variant(String, IPv6)', '\\N\nstring\n2001:0db8:85a3:0000:0000:8a2e:0370:7334\n2001:0db8:85a3:0000:0000:8a2e:0370:7334AAA') 
format TSV; + +select 'Enum'; +select v, variantElement(v, 'Enum(''a'' = 1)') from format(TSV, 'v Variant(String, UInt32, Enum(''a'' = 1))', '\\N\nstring\na\n1\n2\naa') format TSV; + +select 'Map'; +select v, variantElement(v, 'Map(String, UInt64)') from format(TSV, 'v Variant(String, Map(String, UInt64))', '\\N\nstring\n{''a'' : 42, ''b'' : 43, ''c'' : null}\n{''c'' : 44, ''d'' : [1,2,3]}\n{''c'' : 44') format TSV; + +select 'Array'; +select v, variantElement(v, 'Array(UInt64)') from format(TSV, 'v Variant(String, Array(UInt64))', '\\N\nstring\n[1, 2, 3]\n[null, null, null]\n[1, 2, ''hello'']\n[1, 2') format TSV; + +select 'LowCardinality'; +select v, variantElement(v, 'LowCardinality(String)') from format(TSV, 'v Variant(LowCardinality(String), UInt64)', '\\N\nstring\n42') format TSV; +select v, variantElement(v, 'Array(LowCardinality(Nullable(String)))') from format(TSV, 'v Variant(Array(LowCardinality(Nullable(String))), UInt64, String)', '\\N\n[''string'', null]\n[''string'', nul]\n42') format TSV; + +select 'Nullable'; +select v, variantElement(v, 'Array(Nullable(String))') from format(TSV, 'v Variant(String, Array(Nullable(String)))', '\\N\nstring\n[''hello'', null, ''world'']\n[''hello'', nul]') format TSV; + +select repeat('-', 80) format JSONEachRow; + +select 'Values'; +select 'String'; +select v, variantElement(v, 'String') from format(Values, 'v Variant(String, UInt64)', '(NULL), (''string''), (42)') format Values; + +select 'FixedString'; +select v, variantElement(v, 'FixedString(4)') from format(Values, 'v Variant(String, FixedString(4))', '(NULL), (''string''), (''abcd'')') format Values; + +select 'Bool'; +select v, variantElement(v, 'Bool') from format(Values, 'v Variant(String, Bool)', '(NULL), (True)') format Values; + +select 'Integers'; +select v, variantElement(v, 'Int8') from format(Values, 'v Variant(String, Int8, UInt64)', '(NULL), (''string''), (-1), (0), (10000000000)') format Values; +select v, variantElement(v, 'UInt8') from format(Values, 'v Variant(String, UInt8, Int64)', '(NULL), (''string''), (-1), (0), (10000000000)') format Values; +select v, variantElement(v, 'Int16') from format(Values, 'v Variant(String, Int16, Int64)', '(NULL), (''string''), (-1), (0), (10000000000)') format Values; +select v, variantElement(v, 'UInt16') from format(Values, 'v Variant(String, UInt16, Int64)', '(NULL), (''string''), (-1), (0), (10000000000)') format Values; +select v, variantElement(v, 'Int32') from format(Values, 'v Variant(String, Int32, Int64)', '(NULL), (''string''), (-1), (0), (10000000000)') format Values; +select v, variantElement(v, 'UInt32') from format(Values, 'v Variant(String, UInt32, Int64)', '(NULL), (''string''), (-1), (0), (10000000000)') format Values; +select v, variantElement(v, 'Int64') from format(Values, 'v Variant(String, Int64, Int128)', '(NULL), (''string''), (-1), (0), (10000000000000000000000)') format Values; +select v, variantElement(v, 'UInt64') from format(Values, 'v Variant(String, UInt64, Int128)', '(NULL), (''string''), (-1), (0), (10000000000000000000000)') format Values; +select v, variantElement(v, 'Int128') from format(Values, 'v Variant(String, Int128, Int256)', '(NULL), (''string''), (-1), (0)') format Values; +select v, variantElement(v, 'UInt128') from format(Values, 'v Variant(String, UInt128, Int256)', '(NULL), (''string''), (-1), (0)') format Values; + +select 'Floats'; +select v, variantElement(v, 'Float32') from format(Values, 'v Variant(String, Float32)', '(NULL), (''string''), (42.42)') format Values; +select v, 
variantElement(v, 'Float64') from format(Values, 'v Variant(String, Float64)', '(NULL), (''string''), (42.42)') format Values; + +select 'Decimals'; +select v, variantElement(v, 'Decimal32(6)') from format(Values, 'v Variant(String, Decimal32(6))', '(NULL), (''string''), (42.42)') format Values; +select v, variantElement(v, 'Decimal64(6)') from format(Values, 'v Variant(String, Decimal64(6))', '(NULL), (''string''), (42.42)') format Values; +select v, variantElement(v, 'Decimal128(6)') from format(Values, 'v Variant(String, Decimal128(6))', '(NULL), (''string''), (42.42)') format Values; +select v, variantElement(v, 'Decimal256(6)') from format(Values, 'v Variant(String, Decimal256(6))', '(NULL), (''string''), (42.42)') format Values; + +select 'Dates and DateTimes'; +select v, variantElement(v, 'Date') from format(Values, 'v Variant(String, Date, DateTime64)', '(NULL), (''string''), (''2020-01-d1''), (''2020-01-01''), (''2020-01-01 00:00:00.999'')') format Values; +select v, variantElement(v, 'Date32') from format(Values, 'v Variant(String, Date32, DateTime64)', '(NULL), (''string''), (''2020-01-d1''), (''1900-01-01''), (''2020-01-01 00:00:00.999'')') format Values; +select v, variantElement(v, 'DateTime') from format(Values, 'v Variant(String, DateTime, DateTime64)', '(NULL), (''string''), (''2020-01-d1''), (''2020-01-01 00:00:00''), (''2020-01-01 00:00:00.999'')') format Values; +select v, variantElement(v, 'DateTime64') from format(Values, 'v Variant(String, DateTime64)', '(NULL), (''string''), (''2020-01-d1''), (''2020-01-01 00:00:00.999''), (''2020-01-01 00:00:00.999999999 ABC'')') format Values; + +select 'UUID'; +select v, variantElement(v, 'UUID') from format(Values, 'v Variant(String, UUID)', '(NULL), (''string''), (''c8619cca-0caa-445e-ae76-1d4f6e0b3927''), (''c8619cca-0caa-445e-ae76-1d4f6e0b3927AAA'')') format Values; + +select 'IPv4'; +select v, variantElement(v, 'IPv4') from format(Values, 'v Variant(String, IPv4)', '(NULL), (''string''), (''127.0.0.1''), (''127.0.0.1AAA'')') format Values; + +select 'IPv6'; +select v, variantElement(v, 'IPv6') from format(Values, 'v Variant(String, IPv6)', '(NULL), (''string''), (''2001:0db8:85a3:0000:0000:8a2e:0370:7334''), (''2001:0db8:85a3:0000:0000:8a2e:0370:7334AAA'')') format Values; + +select 'Enum'; +select v, variantElement(v, 'Enum(''a'' = 1)') from format(Values, 'v Variant(String, UInt32, Enum(''a'' = 1))', '(NULL), (''string''), (''a''), (1), (2), (''aa'')') format Values; + +select 'Map'; +select v, variantElement(v, 'Map(String, UInt64)') from format(Values, 'v Variant(String, Map(String, UInt64))', '(NULL), (''string''), ({''a'' : 42, ''b'' : 43, ''c'' : null})') format Values; + +select 'Array'; +select v, variantElement(v, 'Array(UInt64)') from format(Values, 'v Variant(String, Array(UInt64))', '(NULL), (''string''), ([1, 2, 3]), ([null, null, null])') format Values; + +select 'LowCardinality'; +select v, variantElement(v, 'LowCardinality(String)') from format(Values, 'v Variant(LowCardinality(String), UInt64)', '(NULL), (''string''), (42)') format Values; +select v, variantElement(v, 'Array(LowCardinality(Nullable(String)))') from format(Values, 'v Variant(Array(LowCardinality(Nullable(String))), UInt64, String)', '(NULL), ([''string'', null]), (42)') format Values; + +select 'Nullable'; +select v, variantElement(v, 'Array(Nullable(String))') from format(Values, 'v Variant(String, Array(Nullable(String)))', '(NULL), (''string''), ([''hello'', null, ''world''])') format Values; + +select ''; \ No newline at end of file diff 
--git a/tests/queries/0_stateless/02941_variant_type_1.reference b/tests/queries/0_stateless/02941_variant_type_1.reference new file mode 100644 index 00000000000..8a6e77d4f6d --- /dev/null +++ b/tests/queries/0_stateless/02941_variant_type_1.reference @@ -0,0 +1,2472 @@ +Memory +test1 insert +test1 select +\N +\N +\N +0 +1 +2 +str_0 +str_1 +str_2 +lc_str_0 +lc_str_1 +lc_str_2 +(0,1) +(1,2) +(2,3) +[0] +[0,1] +[0,1,2] +\N +\N +\N +\N +\N +\N +str_0 +str_1 +str_2 +\N +\N +\N +\N +\N +\N +\N +\N +\N +\N +\N +\N +0 +1 +2 +\N +\N +\N +\N +\N +\N +\N +\N +\N +\N +\N +\N +\N +\N +\N +\N +\N +\N +\N +\N +\N +lc_str_0 +lc_str_1 +lc_str_2 +\N +\N +\N +\N +\N +\N +(0,0) +(0,0) +(0,0) +(0,0) +(0,0) +(0,0) +(0,0) +(0,0) +(0,0) +(0,0) +(0,0) +(0,0) +(0,1) +(1,2) +(2,3) +(0,0) +(0,0) +(0,0) +\N +\N +\N +\N +\N +\N +\N +\N +\N +\N +\N +\N +0 +1 +2 +\N +\N +\N +\N +\N +\N +\N +\N +\N +\N +\N +\N +\N +\N +\N +1 +2 +3 +\N +\N +\N +[] +[] +[] +[] +[] +[] +[] +[] +[] +[] +[] +[] +[] +[] +[] +[0] +[0,1] +[0,1,2] +\N +\N +\N +\N +\N +\N +\N +\N +\N +\N +\N +\N +\N +\N +\N +1 +2 +3 +----------------------------------------------------------------------------------------------------------- +test2 insert +test2 select +\N +\N +\N +0 +\N +2 +str_0 +\N +str_2 +lc_str_0 +\N +lc_str_2 +(0,1) +\N +(2,3) +[0] +\N +[0,1,2] +\N +\N +\N +\N +\N +\N +str_0 +\N +str_2 +\N +\N +\N +\N +\N +\N +\N +\N +\N +\N +\N +\N +0 +\N +2 +\N +\N +\N +\N +\N +\N +\N +\N +\N +\N +\N +\N +\N +\N +\N +\N +\N +\N +\N +\N +\N +lc_str_0 +\N +lc_str_2 +\N +\N +\N +\N +\N +\N +(0,0) +(0,0) +(0,0) +(0,0) +(0,0) +(0,0) +(0,0) +(0,0) +(0,0) +(0,0) +(0,0) +(0,0) +(0,1) +(0,0) +(2,3) +(0,0) +(0,0) +(0,0) +\N +\N +\N +\N +\N +\N +\N +\N +\N +\N +\N +\N +0 +\N +2 +\N +\N +\N +\N +\N +\N +\N +\N +\N +\N +\N +\N +\N +\N +\N +1 +\N +3 +\N +\N +\N +[] +[] +[] +[] +[] +[] +[] +[] +[] +[] +[] +[] +[] +[] +[] +[0] +[] +[0,1,2] +\N +\N +\N +\N +\N +\N +\N +\N +\N +\N +\N +\N +\N +\N +\N +1 +\N +3 +----------------------------------------------------------------------------------------------------------- +test3 insert +test3 select +\N +str_1 +2 +lc_str_3 +(4,5) +[0,1,2,3,4,5] +\N +str_7 +8 +lc_str_9 +(10,11) +[0,1,2,3,4,5,6,7,8,9,10,11] +\N +str_13 +14 +lc_str_15 +(16,17) +[0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,16,17] +\N +str_1 +\N +\N +\N +\N +\N +str_7 +\N +\N +\N +\N +\N +str_13 +\N +\N +\N +\N +\N +\N +2 +\N +\N +\N +\N +\N +8 +\N +\N +\N +\N +\N +14 +\N +\N +\N +\N +\N +\N +lc_str_3 +\N +\N +\N +\N +\N +lc_str_9 +\N +\N +\N +\N +\N +lc_str_15 +\N +\N +(0,0) +(0,0) +(0,0) +(0,0) +(4,5) +(0,0) +(0,0) +(0,0) +(0,0) +(0,0) +(10,11) +(0,0) +(0,0) +(0,0) +(0,0) +(0,0) +(16,17) +(0,0) +\N +\N +\N +\N +4 +\N +\N +\N +\N +\N +10 +\N +\N +\N +\N +\N +16 +\N +\N +\N +\N +\N +5 +\N +\N +\N +\N +\N +11 +\N +\N +\N +\N +\N +17 +\N +[] +[] +[] +[] +[] +[0,1,2,3,4,5] +[] +[] +[] +[] +[] +[0,1,2,3,4,5,6,7,8,9,10,11] +[] +[] +[] +[] +[] +[0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,16,17] +\N +\N +\N +\N +\N +6 +\N +\N +\N +\N +\N +12 +\N +\N +\N +\N +\N +18 +----------------------------------------------------------------------------------------------------------- +MergeTree compact +test1 insert +test1 select +\N +\N +\N +0 +1 +2 +str_0 +str_1 +str_2 +lc_str_0 +lc_str_1 +lc_str_2 +(0,1) +(1,2) +(2,3) +[0] +[0,1] +[0,1,2] +\N +\N +\N +\N +\N +\N +str_0 +str_1 +str_2 +\N +\N +\N +\N +\N +\N +\N +\N +\N +\N +\N +\N +0 +1 +2 +\N +\N +\N +\N +\N +\N +\N +\N +\N +\N +\N +\N +\N +\N +\N +\N +\N +\N +\N +\N +\N +lc_str_0 +lc_str_1 +lc_str_2 +\N +\N +\N +\N +\N +\N +(0,0) +(0,0) +(0,0) 
+(0,0) +(0,0) +(0,0) +(0,0) +(0,0) +(0,0) +(0,0) +(0,0) +(0,0) +(0,1) +(1,2) +(2,3) +(0,0) +(0,0) +(0,0) +\N +\N +\N +\N +\N +\N +\N +\N +\N +\N +\N +\N +0 +1 +2 +\N +\N +\N +\N +\N +\N +\N +\N +\N +\N +\N +\N +\N +\N +\N +1 +2 +3 +\N +\N +\N +[] +[] +[] +[] +[] +[] +[] +[] +[] +[] +[] +[] +[] +[] +[] +[0] +[0,1] +[0,1,2] +\N +\N +\N +\N +\N +\N +\N +\N +\N +\N +\N +\N +\N +\N +\N +1 +2 +3 +----------------------------------------------------------------------------------------------------------- +test1 select +\N +\N +\N +0 +1 +2 +str_0 +str_1 +str_2 +lc_str_0 +lc_str_1 +lc_str_2 +(0,1) +(1,2) +(2,3) +[0] +[0,1] +[0,1,2] +\N +\N +\N +\N +\N +\N +str_0 +str_1 +str_2 +\N +\N +\N +\N +\N +\N +\N +\N +\N +\N +\N +\N +0 +1 +2 +\N +\N +\N +\N +\N +\N +\N +\N +\N +\N +\N +\N +\N +\N +\N +\N +\N +\N +\N +\N +\N +lc_str_0 +lc_str_1 +lc_str_2 +\N +\N +\N +\N +\N +\N +(0,0) +(0,0) +(0,0) +(0,0) +(0,0) +(0,0) +(0,0) +(0,0) +(0,0) +(0,0) +(0,0) +(0,0) +(0,1) +(1,2) +(2,3) +(0,0) +(0,0) +(0,0) +\N +\N +\N +\N +\N +\N +\N +\N +\N +\N +\N +\N +0 +1 +2 +\N +\N +\N +\N +\N +\N +\N +\N +\N +\N +\N +\N +\N +\N +\N +1 +2 +3 +\N +\N +\N +[] +[] +[] +[] +[] +[] +[] +[] +[] +[] +[] +[] +[] +[] +[] +[0] +[0,1] +[0,1,2] +\N +\N +\N +\N +\N +\N +\N +\N +\N +\N +\N +\N +\N +\N +\N +1 +2 +3 +----------------------------------------------------------------------------------------------------------- +test2 insert +test2 select +\N +\N +\N +0 +\N +2 +str_0 +\N +str_2 +lc_str_0 +\N +lc_str_2 +(0,1) +\N +(2,3) +[0] +\N +[0,1,2] +\N +\N +\N +\N +\N +\N +str_0 +\N +str_2 +\N +\N +\N +\N +\N +\N +\N +\N +\N +\N +\N +\N +0 +\N +2 +\N +\N +\N +\N +\N +\N +\N +\N +\N +\N +\N +\N +\N +\N +\N +\N +\N +\N +\N +\N +\N +lc_str_0 +\N +lc_str_2 +\N +\N +\N +\N +\N +\N +(0,0) +(0,0) +(0,0) +(0,0) +(0,0) +(0,0) +(0,0) +(0,0) +(0,0) +(0,0) +(0,0) +(0,0) +(0,1) +(0,0) +(2,3) +(0,0) +(0,0) +(0,0) +\N +\N +\N +\N +\N +\N +\N +\N +\N +\N +\N +\N +0 +\N +2 +\N +\N +\N +\N +\N +\N +\N +\N +\N +\N +\N +\N +\N +\N +\N +1 +\N +3 +\N +\N +\N +[] +[] +[] +[] +[] +[] +[] +[] +[] +[] +[] +[] +[] +[] +[] +[0] +[] +[0,1,2] +\N +\N +\N +\N +\N +\N +\N +\N +\N +\N +\N +\N +\N +\N +\N +1 +\N +3 +----------------------------------------------------------------------------------------------------------- +test2 select +\N +\N +\N +0 +\N +2 +str_0 +\N +str_2 +lc_str_0 +\N +lc_str_2 +(0,1) +\N +(2,3) +[0] +\N +[0,1,2] +\N +\N +\N +\N +\N +\N +str_0 +\N +str_2 +\N +\N +\N +\N +\N +\N +\N +\N +\N +\N +\N +\N +0 +\N +2 +\N +\N +\N +\N +\N +\N +\N +\N +\N +\N +\N +\N +\N +\N +\N +\N +\N +\N +\N +\N +\N +lc_str_0 +\N +lc_str_2 +\N +\N +\N +\N +\N +\N +(0,0) +(0,0) +(0,0) +(0,0) +(0,0) +(0,0) +(0,0) +(0,0) +(0,0) +(0,0) +(0,0) +(0,0) +(0,1) +(0,0) +(2,3) +(0,0) +(0,0) +(0,0) +\N +\N +\N +\N +\N +\N +\N +\N +\N +\N +\N +\N +0 +\N +2 +\N +\N +\N +\N +\N +\N +\N +\N +\N +\N +\N +\N +\N +\N +\N +1 +\N +3 +\N +\N +\N +[] +[] +[] +[] +[] +[] +[] +[] +[] +[] +[] +[] +[] +[] +[] +[0] +[] +[0,1,2] +\N +\N +\N +\N +\N +\N +\N +\N +\N +\N +\N +\N +\N +\N +\N +1 +\N +3 +----------------------------------------------------------------------------------------------------------- +test3 insert +test3 select +\N +str_1 +2 +lc_str_3 +(4,5) +[0,1,2,3,4,5] +\N +str_7 +8 +lc_str_9 +(10,11) +[0,1,2,3,4,5,6,7,8,9,10,11] +\N +str_13 +14 +lc_str_15 +(16,17) +[0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,16,17] +\N +str_1 +\N +\N +\N +\N +\N +str_7 +\N +\N +\N +\N +\N +str_13 +\N +\N +\N +\N +\N +\N +2 +\N +\N +\N +\N +\N +8 +\N +\N +\N +\N +\N +14 +\N +\N +\N +\N +\N +\N +lc_str_3 +\N +\N +\N +\N 
+\N +lc_str_9 +\N +\N +\N +\N +\N +lc_str_15 +\N +\N +(0,0) +(0,0) +(0,0) +(0,0) +(4,5) +(0,0) +(0,0) +(0,0) +(0,0) +(0,0) +(10,11) +(0,0) +(0,0) +(0,0) +(0,0) +(0,0) +(16,17) +(0,0) +\N +\N +\N +\N +4 +\N +\N +\N +\N +\N +10 +\N +\N +\N +\N +\N +16 +\N +\N +\N +\N +\N +5 +\N +\N +\N +\N +\N +11 +\N +\N +\N +\N +\N +17 +\N +[] +[] +[] +[] +[] +[0,1,2,3,4,5] +[] +[] +[] +[] +[] +[0,1,2,3,4,5,6,7,8,9,10,11] +[] +[] +[] +[] +[] +[0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,16,17] +\N +\N +\N +\N +\N +6 +\N +\N +\N +\N +\N +12 +\N +\N +\N +\N +\N +18 +----------------------------------------------------------------------------------------------------------- +test3 select +\N +str_1 +2 +lc_str_3 +(4,5) +[0,1,2,3,4,5] +\N +str_7 +8 +lc_str_9 +(10,11) +[0,1,2,3,4,5,6,7,8,9,10,11] +\N +str_13 +14 +lc_str_15 +(16,17) +[0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,16,17] +\N +str_1 +\N +\N +\N +\N +\N +str_7 +\N +\N +\N +\N +\N +str_13 +\N +\N +\N +\N +\N +\N +2 +\N +\N +\N +\N +\N +8 +\N +\N +\N +\N +\N +14 +\N +\N +\N +\N +\N +\N +lc_str_3 +\N +\N +\N +\N +\N +lc_str_9 +\N +\N +\N +\N +\N +lc_str_15 +\N +\N +(0,0) +(0,0) +(0,0) +(0,0) +(4,5) +(0,0) +(0,0) +(0,0) +(0,0) +(0,0) +(10,11) +(0,0) +(0,0) +(0,0) +(0,0) +(0,0) +(16,17) +(0,0) +\N +\N +\N +\N +4 +\N +\N +\N +\N +\N +10 +\N +\N +\N +\N +\N +16 +\N +\N +\N +\N +\N +5 +\N +\N +\N +\N +\N +11 +\N +\N +\N +\N +\N +17 +\N +[] +[] +[] +[] +[] +[0,1,2,3,4,5] +[] +[] +[] +[] +[] +[0,1,2,3,4,5,6,7,8,9,10,11] +[] +[] +[] +[] +[] +[0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,16,17] +\N +\N +\N +\N +\N +6 +\N +\N +\N +\N +\N +12 +\N +\N +\N +\N +\N +18 +----------------------------------------------------------------------------------------------------------- +MergeTree wide +test1 insert +test1 select +\N +\N +\N +0 +1 +2 +str_0 +str_1 +str_2 +lc_str_0 +lc_str_1 +lc_str_2 +(0,1) +(1,2) +(2,3) +[0] +[0,1] +[0,1,2] +\N +\N +\N +\N +\N +\N +str_0 +str_1 +str_2 +\N +\N +\N +\N +\N +\N +\N +\N +\N +\N +\N +\N +0 +1 +2 +\N +\N +\N +\N +\N +\N +\N +\N +\N +\N +\N +\N +\N +\N +\N +\N +\N +\N +\N +\N +\N +lc_str_0 +lc_str_1 +lc_str_2 +\N +\N +\N +\N +\N +\N +(0,0) +(0,0) +(0,0) +(0,0) +(0,0) +(0,0) +(0,0) +(0,0) +(0,0) +(0,0) +(0,0) +(0,0) +(0,1) +(1,2) +(2,3) +(0,0) +(0,0) +(0,0) +\N +\N +\N +\N +\N +\N +\N +\N +\N +\N +\N +\N +0 +1 +2 +\N +\N +\N +\N +\N +\N +\N +\N +\N +\N +\N +\N +\N +\N +\N +1 +2 +3 +\N +\N +\N +[] +[] +[] +[] +[] +[] +[] +[] +[] +[] +[] +[] +[] +[] +[] +[0] +[0,1] +[0,1,2] +\N +\N +\N +\N +\N +\N +\N +\N +\N +\N +\N +\N +\N +\N +\N +1 +2 +3 +----------------------------------------------------------------------------------------------------------- +test1 select +\N +\N +\N +0 +1 +2 +str_0 +str_1 +str_2 +lc_str_0 +lc_str_1 +lc_str_2 +(0,1) +(1,2) +(2,3) +[0] +[0,1] +[0,1,2] +\N +\N +\N +\N +\N +\N +str_0 +str_1 +str_2 +\N +\N +\N +\N +\N +\N +\N +\N +\N +\N +\N +\N +0 +1 +2 +\N +\N +\N +\N +\N +\N +\N +\N +\N +\N +\N +\N +\N +\N +\N +\N +\N +\N +\N +\N +\N +lc_str_0 +lc_str_1 +lc_str_2 +\N +\N +\N +\N +\N +\N +(0,0) +(0,0) +(0,0) +(0,0) +(0,0) +(0,0) +(0,0) +(0,0) +(0,0) +(0,0) +(0,0) +(0,0) +(0,1) +(1,2) +(2,3) +(0,0) +(0,0) +(0,0) +\N +\N +\N +\N +\N +\N +\N +\N +\N +\N +\N +\N +0 +1 +2 +\N +\N +\N +\N +\N +\N +\N +\N +\N +\N +\N +\N +\N +\N +\N +1 +2 +3 +\N +\N +\N +[] +[] +[] +[] +[] +[] +[] +[] +[] +[] +[] +[] +[] +[] +[] +[0] +[0,1] +[0,1,2] +\N +\N +\N +\N +\N +\N +\N +\N +\N +\N +\N +\N +\N +\N +\N +1 +2 +3 +----------------------------------------------------------------------------------------------------------- +test2 insert +test2 select +\N +\N 
+\N +0 +\N +2 +str_0 +\N +str_2 +lc_str_0 +\N +lc_str_2 +(0,1) +\N +(2,3) +[0] +\N +[0,1,2] +\N +\N +\N +\N +\N +\N +str_0 +\N +str_2 +\N +\N +\N +\N +\N +\N +\N +\N +\N +\N +\N +\N +0 +\N +2 +\N +\N +\N +\N +\N +\N +\N +\N +\N +\N +\N +\N +\N +\N +\N +\N +\N +\N +\N +\N +\N +lc_str_0 +\N +lc_str_2 +\N +\N +\N +\N +\N +\N +(0,0) +(0,0) +(0,0) +(0,0) +(0,0) +(0,0) +(0,0) +(0,0) +(0,0) +(0,0) +(0,0) +(0,0) +(0,1) +(0,0) +(2,3) +(0,0) +(0,0) +(0,0) +\N +\N +\N +\N +\N +\N +\N +\N +\N +\N +\N +\N +0 +\N +2 +\N +\N +\N +\N +\N +\N +\N +\N +\N +\N +\N +\N +\N +\N +\N +1 +\N +3 +\N +\N +\N +[] +[] +[] +[] +[] +[] +[] +[] +[] +[] +[] +[] +[] +[] +[] +[0] +[] +[0,1,2] +\N +\N +\N +\N +\N +\N +\N +\N +\N +\N +\N +\N +\N +\N +\N +1 +\N +3 +----------------------------------------------------------------------------------------------------------- +test2 select +\N +\N +\N +0 +\N +2 +str_0 +\N +str_2 +lc_str_0 +\N +lc_str_2 +(0,1) +\N +(2,3) +[0] +\N +[0,1,2] +\N +\N +\N +\N +\N +\N +str_0 +\N +str_2 +\N +\N +\N +\N +\N +\N +\N +\N +\N +\N +\N +\N +0 +\N +2 +\N +\N +\N +\N +\N +\N +\N +\N +\N +\N +\N +\N +\N +\N +\N +\N +\N +\N +\N +\N +\N +lc_str_0 +\N +lc_str_2 +\N +\N +\N +\N +\N +\N +(0,0) +(0,0) +(0,0) +(0,0) +(0,0) +(0,0) +(0,0) +(0,0) +(0,0) +(0,0) +(0,0) +(0,0) +(0,1) +(0,0) +(2,3) +(0,0) +(0,0) +(0,0) +\N +\N +\N +\N +\N +\N +\N +\N +\N +\N +\N +\N +0 +\N +2 +\N +\N +\N +\N +\N +\N +\N +\N +\N +\N +\N +\N +\N +\N +\N +1 +\N +3 +\N +\N +\N +[] +[] +[] +[] +[] +[] +[] +[] +[] +[] +[] +[] +[] +[] +[] +[0] +[] +[0,1,2] +\N +\N +\N +\N +\N +\N +\N +\N +\N +\N +\N +\N +\N +\N +\N +1 +\N +3 +----------------------------------------------------------------------------------------------------------- +test3 insert +test3 select +\N +str_1 +2 +lc_str_3 +(4,5) +[0,1,2,3,4,5] +\N +str_7 +8 +lc_str_9 +(10,11) +[0,1,2,3,4,5,6,7,8,9,10,11] +\N +str_13 +14 +lc_str_15 +(16,17) +[0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,16,17] +\N +str_1 +\N +\N +\N +\N +\N +str_7 +\N +\N +\N +\N +\N +str_13 +\N +\N +\N +\N +\N +\N +2 +\N +\N +\N +\N +\N +8 +\N +\N +\N +\N +\N +14 +\N +\N +\N +\N +\N +\N +lc_str_3 +\N +\N +\N +\N +\N +lc_str_9 +\N +\N +\N +\N +\N +lc_str_15 +\N +\N +(0,0) +(0,0) +(0,0) +(0,0) +(4,5) +(0,0) +(0,0) +(0,0) +(0,0) +(0,0) +(10,11) +(0,0) +(0,0) +(0,0) +(0,0) +(0,0) +(16,17) +(0,0) +\N +\N +\N +\N +4 +\N +\N +\N +\N +\N +10 +\N +\N +\N +\N +\N +16 +\N +\N +\N +\N +\N +5 +\N +\N +\N +\N +\N +11 +\N +\N +\N +\N +\N +17 +\N +[] +[] +[] +[] +[] +[0,1,2,3,4,5] +[] +[] +[] +[] +[] +[0,1,2,3,4,5,6,7,8,9,10,11] +[] +[] +[] +[] +[] +[0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,16,17] +\N +\N +\N +\N +\N +6 +\N +\N +\N +\N +\N +12 +\N +\N +\N +\N +\N +18 +----------------------------------------------------------------------------------------------------------- +test3 select +\N +str_1 +2 +lc_str_3 +(4,5) +[0,1,2,3,4,5] +\N +str_7 +8 +lc_str_9 +(10,11) +[0,1,2,3,4,5,6,7,8,9,10,11] +\N +str_13 +14 +lc_str_15 +(16,17) +[0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,16,17] +\N +str_1 +\N +\N +\N +\N +\N +str_7 +\N +\N +\N +\N +\N +str_13 +\N +\N +\N +\N +\N +\N +2 +\N +\N +\N +\N +\N +8 +\N +\N +\N +\N +\N +14 +\N +\N +\N +\N +\N +\N +lc_str_3 +\N +\N +\N +\N +\N +lc_str_9 +\N +\N +\N +\N +\N +lc_str_15 +\N +\N +(0,0) +(0,0) +(0,0) +(0,0) +(4,5) +(0,0) +(0,0) +(0,0) +(0,0) +(0,0) +(10,11) +(0,0) +(0,0) +(0,0) +(0,0) +(0,0) +(16,17) +(0,0) +\N +\N +\N +\N +4 +\N +\N +\N +\N +\N +10 +\N +\N +\N +\N +\N +16 +\N +\N +\N +\N +\N +5 +\N +\N +\N +\N +\N +11 +\N +\N +\N +\N +\N +17 +\N +[] +[] +[] +[] +[] +[0,1,2,3,4,5] +[] +[] +[] +[] 
+[] +[0,1,2,3,4,5,6,7,8,9,10,11] +[] +[] +[] +[] +[] +[0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,16,17] +\N +\N +\N +\N +\N +6 +\N +\N +\N +\N +\N +12 +\N +\N +\N +\N +\N +18 +----------------------------------------------------------------------------------------------------------- diff --git a/tests/queries/0_stateless/02941_variant_type_1.sh b/tests/queries/0_stateless/02941_variant_type_1.sh new file mode 100755 index 00000000000..774acb4bbef --- /dev/null +++ b/tests/queries/0_stateless/02941_variant_type_1.sh @@ -0,0 +1,124 @@ +#!/usr/bin/env bash + +CUR_DIR=$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd) +# reset --log_comment +CLICKHOUSE_LOG_COMMENT= +# shellcheck source=../shell_config.sh +. "$CUR_DIR"/../shell_config.sh + +CH_CLIENT="$CLICKHOUSE_CLIENT --allow_experimental_variant_type=1" + +function test1_insert() +{ + echo "test1 insert" + $CH_CLIENT -q "insert into test select number, NULL from numbers(3);" + $CH_CLIENT -q "insert into test select number + 3, number from numbers(3);" + $CH_CLIENT -q "insert into test select number + 6, 'str_' || toString(number) from numbers(3);" + $CH_CLIENT -q "insert into test select number + 9, ('lc_str_' || toString(number))::LowCardinality(String) from numbers(3);" + $CH_CLIENT -q "insert into test select number + 12, tuple(number, number + 1)::Tuple(a UInt32, b UInt32) from numbers(3);" + $CH_CLIENT -q "insert into test select number + 15, range(number + 1)::Array(UInt64) from numbers(3);" +} + +function test1_select() +{ + echo "test1 select" + $CH_CLIENT -q "select v from test order by id;" + $CH_CLIENT -q "select v.String from test order by id;" + $CH_CLIENT -q "select v.UInt64 from test order by id;" + $CH_CLIENT -q "select v.\`LowCardinality(String)\` from test order by id;" + $CH_CLIENT -q "select v.\`Tuple(a UInt32, b UInt32)\` from test order by id;" + $CH_CLIENT -q "select v.\`Tuple(a UInt32, b UInt32)\`.a from test order by id;" + $CH_CLIENT -q "select v.\`Tuple(a UInt32, b UInt32)\`.b from test order by id;" + $CH_CLIENT -q "select v.\`Array(UInt64)\` from test order by id;" + $CH_CLIENT -q "select v.\`Array(UInt64)\`.size0 from test order by id;" + echo "-----------------------------------------------------------------------------------------------------------" +} + +function test2_insert() +{ + echo "test2 insert" + $CH_CLIENT -q "insert into test select number, NULL from numbers(3);" + $CH_CLIENT -q "insert into test select number + 3, number % 2 ? NULL : number from numbers(3);" + $CH_CLIENT -q "insert into test select number + 6, number % 2 ? NULL : 'str_' || toString(number) from numbers(3);" + $CH_CLIENT -q "insert into test select number + 9, number % 2 ? CAST(NULL, 'Variant(String, UInt64, LowCardinality(String), Tuple(a UInt32, b UInt32), Array(UInt64))') : CAST(('lc_str_' || toString(number))::LowCardinality(String), 'Variant(String, UInt64, LowCardinality(String), Tuple(a UInt32, b UInt32), Array(UInt64))') from numbers(3);" + $CH_CLIENT -q "insert into test select number + 12, number % 2 ? CAST(NULL, 'Variant(String, UInt64, LowCardinality(String), Tuple(a UInt32, b UInt32), Array(UInt64))') : CAST(tuple(number, number + 1)::Tuple(a UInt32, b UInt32), 'Variant(String, UInt64, LowCardinality(String), Tuple(a UInt32, b UInt32), Array(UInt64))') from numbers(3);" + $CH_CLIENT -q "insert into test select number + 15, number % 2 ? 
CAST(NULL, 'Variant(String, UInt64, LowCardinality(String), Tuple(a UInt32, b UInt32), Array(UInt64))') : CAST(range(number + 1)::Array(UInt64), 'Variant(String, UInt64, LowCardinality(String), Tuple(a UInt32, b UInt32), Array(UInt64))') from numbers(3);" +} + +function test2_select() +{ + echo "test2 select" + $CH_CLIENT -q "select v from test order by id;" + $CH_CLIENT -q "select v.String from test order by id;" + $CH_CLIENT -q "select v.UInt64 from test order by id;" + $CH_CLIENT -q "select v.\`LowCardinality(String)\` from test order by id;" + $CH_CLIENT -q "select v.\`Tuple(a UInt32, b UInt32)\` from test order by id;" + $CH_CLIENT -q "select v.\`Tuple(a UInt32, b UInt32)\`.a from test order by id;" + $CH_CLIENT -q "select v.\`Tuple(a UInt32, b UInt32)\`.b from test order by id;" + $CH_CLIENT -q "select v.\`Array(UInt64)\` from test order by id;" + $CH_CLIENT -q "select v.\`Array(UInt64)\`.size0 from test order by id;" + echo "-----------------------------------------------------------------------------------------------------------" +} + +function test3_insert() +{ + echo "test3 insert" + $CH_CLIENT -q "insert into test with 'Variant(String, UInt64, LowCardinality(String), Tuple(a UInt32, b UInt32), Array(UInt64))' as type select number, multiIf(number % 6 == 0, CAST(NULL, type), number % 6 == 1, CAST('str_' || toString(number), type), number % 6 == 2, CAST(number, type), number % 6 == 3, CAST(('lc_str_' || toString(number))::LowCardinality(String), type), number % 6 == 4, CAST(tuple(number, number + 1)::Tuple(a UInt32, b UInt32), type), CAST(range(number + 1)::Array(UInt64), type)) as res from numbers(18);" +} + +function test3_select() +{ + echo "test3 select" + $CH_CLIENT -q "select v from test order by id;" + $CH_CLIENT -q "select v.String from test order by id;" + $CH_CLIENT -q "select v.UInt64 from test order by id;" + $CH_CLIENT -q "select v.\`LowCardinality(String)\` from test order by id;" + $CH_CLIENT -q "select v.\`Tuple(a UInt32, b UInt32)\` from test order by id;" + $CH_CLIENT -q "select v.\`Tuple(a UInt32, b UInt32)\`.a from test order by id;" + $CH_CLIENT -q "select v.\`Tuple(a UInt32, b UInt32)\`.b from test order by id;" + $CH_CLIENT -q "select v.\`Array(UInt64)\` from test order by id;" + $CH_CLIENT -q "select v.\`Array(UInt64)\`.size0 from test order by id;" + echo "-----------------------------------------------------------------------------------------------------------" +} + +function run() +{ + test1_insert + test1_select + if [ $1 == 1 ]; then + $CH_CLIENT -q "optimize table test final;" + test1_select + fi + $CH_CLIENT -q "truncate table test;" + test2_insert + test2_select + if [ $1 == 1 ]; then + $CH_CLIENT -q "optimize table test final;" + test2_select + fi + $CH_CLIENT -q "truncate table test;" + test3_insert + test3_select + if [ $1 == 1 ]; then + $CH_CLIENT -q "optimize table test final;" + test3_select + fi + $CH_CLIENT -q "truncate table test;" +} + +$CH_CLIENT -q "drop table if exists test;" + +echo "Memory" +$CH_CLIENT -q "create table test (id UInt64, v Variant(String, UInt64, LowCardinality(String), Tuple(a UInt32, b UInt32), Array(UInt64))) engine=Memory;" +run 0 +$CH_CLIENT -q "drop table test;" + +echo "MergeTree compact" +$CH_CLIENT -q "create table test (id UInt64, v Variant(String, UInt64, LowCardinality(String), Tuple(a UInt32, b UInt32), Array(UInt64))) engine=MergeTree order by id settings min_rows_for_wide_part=100000000, min_bytes_for_wide_part=1000000000;" +run 1 +$CH_CLIENT -q "drop table test;" + +echo "MergeTree wide" +$CH_CLIENT -q 
"create table test (id UInt64, v Variant(String, UInt64, LowCardinality(String), Tuple(a UInt32, b UInt32), Array(UInt64))) engine=MergeTree order by id settings min_rows_for_wide_part=1, min_bytes_for_wide_part=1;" +run 1 +$CH_CLIENT -q "drop table test;" diff --git a/tests/queries/0_stateless/02941_variant_type_2.reference b/tests/queries/0_stateless/02941_variant_type_2.reference new file mode 100644 index 00000000000..4b6d53c52ac --- /dev/null +++ b/tests/queries/0_stateless/02941_variant_type_2.reference @@ -0,0 +1,51 @@ +Memory +test4 insert +test4 select +1000000 +200000 +200000 +200000 +200000 +200000 +200000 +200000 +MergeTree compact +test4 insert +test4 select +1000000 +200000 +200000 +200000 +200000 +200000 +200000 +200000 +test4 select +1000000 +200000 +200000 +200000 +200000 +200000 +200000 +200000 +MergeTree wide +test4 insert +test4 select +1000000 +200000 +200000 +200000 +200000 +200000 +200000 +200000 +test4 select +1000000 +200000 +200000 +200000 +200000 +200000 +200000 +200000 diff --git a/tests/queries/0_stateless/02941_variant_type_2.sh b/tests/queries/0_stateless/02941_variant_type_2.sh new file mode 100755 index 00000000000..aef5bc3fe02 --- /dev/null +++ b/tests/queries/0_stateless/02941_variant_type_2.sh @@ -0,0 +1,71 @@ +#!/usr/bin/env bash +# tags: long + +CUR_DIR=$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd) +# reset --log_comment +CLICKHOUSE_LOG_COMMENT= +# shellcheck source=../shell_config.sh +. "$CUR_DIR"/../shell_config.sh + +CH_CLIENT="$CLICKHOUSE_CLIENT --allow_experimental_variant_type=1" + +function test4_insert() +{ + echo "test4 insert" + $CH_CLIENT -q "insert into test select number, NULL from numbers(200000);" + $CH_CLIENT -q "insert into test select number + 200000, number from numbers(200000);" + $CH_CLIENT -q "insert into test select number + 400000, 'str_' || toString(number) from numbers(200000);" + $CH_CLIENT -q "insert into test select number + 600000, ('lc_str_' || toString(number))::LowCardinality(String) from numbers(200000);" + $CH_CLIENT -q "insert into test select number + 800000, tuple(number, number + 1)::Tuple(a UInt32, b UInt32) from numbers(200000);" + $CH_CLIENT -q "insert into test select number + 1000000, range(number % 20 + 1)::Array(UInt64) from numbers(200000);" +} + +function test4_select +{ + echo "test4 select" + $CH_CLIENT -q "select v from test format Null;" + $CH_CLIENT -q "select count() from test where isNotNull(v);" + $CH_CLIENT -q "select v.String from test format Null;" + $CH_CLIENT -q "select count() from test where isNotNull(v.String);" + $CH_CLIENT -q "select v.UInt64 from test format Null;" + $CH_CLIENT -q "select count() from test where isNotNull(v.UInt64);" + $CH_CLIENT -q "select v.\`LowCardinality(String)\` from test format Null;" + $CH_CLIENT -q "select count() from test where isNotNull(v.\`LowCardinality(String)\`);" + $CH_CLIENT -q "select v.\`Tuple(a UInt32, b UInt32)\` from test format Null;" + $CH_CLIENT -q "select v.\`Tuple(a UInt32, b UInt32)\`.a from test format Null;" + $CH_CLIENT -q "select count() from test where isNotNull(v.\`Tuple(a UInt32, b UInt32)\`.a);" + $CH_CLIENT -q "select v.\`Tuple(a UInt32, b UInt32)\`.b from test format Null;" + $CH_CLIENT -q "select count() from test where isNotNull(v.\`Tuple(a UInt32, b UInt32)\`.b);" + $CH_CLIENT -q "select v.\`Array(UInt64)\` from test format Null;" + $CH_CLIENT -q "select count() from test where not empty(v.\`Array(UInt64)\`);" + $CH_CLIENT -q "select v.\`Array(UInt64)\`.size0 from test format Null;" + $CH_CLIENT -q "select count() from test 
where isNotNull(v.\`Array(UInt64)\`.size0);" +} + +function run() +{ + test4_insert + test4_select + if [ $1 == 1 ]; then + $CH_CLIENT -q "optimize table test final;" + test4_select + fi + $CH_CLIENT -q "truncate table test;" +} + +$CH_CLIENT -q "drop table if exists test;" + +echo "Memory" +$CH_CLIENT -q "create table test (id UInt64, v Variant(String, UInt64, LowCardinality(String), Tuple(a UInt32, b UInt32), Array(UInt64))) engine=Memory;" +run 0 +$CH_CLIENT -q "drop table test;" + +echo "MergeTree compact" +$CH_CLIENT -q "create table test (id UInt64, v Variant(String, UInt64, LowCardinality(String), Tuple(a UInt32, b UInt32), Array(UInt64))) engine=MergeTree order by id settings min_rows_for_wide_part=100000000, min_bytes_for_wide_part=1000000000;" +run 1 +$CH_CLIENT -q "drop table test;" + +echo "MergeTree wide" +$CH_CLIENT -q "create table test (id UInt64, v Variant(String, UInt64, LowCardinality(String), Tuple(a UInt32, b UInt32), Array(UInt64))) engine=MergeTree order by id settings min_rows_for_wide_part=1, min_bytes_for_wide_part=1;" +run 1 +$CH_CLIENT -q "drop table test;" diff --git a/tests/queries/0_stateless/02941_variant_type_3.reference b/tests/queries/0_stateless/02941_variant_type_3.reference new file mode 100644 index 00000000000..1ccdb3acdff --- /dev/null +++ b/tests/queries/0_stateless/02941_variant_type_3.reference @@ -0,0 +1,51 @@ +Memory +test5 insert +test5 select +500000 +100000 +100000 +100000 +100000 +100000 +100000 +100000 +MergeTree compact +test5 insert +test5 select +500000 +100000 +100000 +100000 +100000 +100000 +100000 +100000 +test5 select +500000 +100000 +100000 +100000 +100000 +100000 +100000 +100000 +MergeTree wide +test5 insert +test5 select +500000 +100000 +100000 +100000 +100000 +100000 +100000 +100000 +test5 select +500000 +100000 +100000 +100000 +100000 +100000 +100000 +100000 diff --git a/tests/queries/0_stateless/02941_variant_type_3.sh b/tests/queries/0_stateless/02941_variant_type_3.sh new file mode 100755 index 00000000000..d3692270deb --- /dev/null +++ b/tests/queries/0_stateless/02941_variant_type_3.sh @@ -0,0 +1,71 @@ +#!/usr/bin/env bash +# tags: long + +CUR_DIR=$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd) +# reset --log_comment +CLICKHOUSE_LOG_COMMENT= +# shellcheck source=../shell_config.sh +. "$CUR_DIR"/../shell_config.sh + +CH_CLIENT="$CLICKHOUSE_CLIENT --allow_experimental_variant_type=1" + +function test5_insert() +{ + echo "test5 insert" + $CH_CLIENT -q "insert into test select number, NULL from numbers(200000);" + $CH_CLIENT -q "insert into test select number + 200000, number % 2 ? NULL : number from numbers(200000);" + $CH_CLIENT -q "insert into test select number + 400000, number % 2 ? NULL : 'str_' || toString(number) from numbers(200000);" + $CH_CLIENT -q "insert into test select number + 600000, number % 2 ? CAST(NULL, 'Variant(String, UInt64, LowCardinality(String), Tuple(a UInt32, b UInt32), Array(UInt64))') : CAST(('lc_str_' || toString(number))::LowCardinality(String), 'Variant(String, UInt64, LowCardinality(String), Tuple(a UInt32, b UInt32), Array(UInt64))') from numbers(200000);" + $CH_CLIENT -q "insert into test select number + 800000, number % 2 ? CAST(NULL, 'Variant(String, UInt64, LowCardinality(String), Tuple(a UInt32, b UInt32), Array(UInt64))') : CAST(tuple(number, number + 1)::Tuple(a UInt32, b UInt32), 'Variant(String, UInt64, LowCardinality(String), Tuple(a UInt32, b UInt32), Array(UInt64))') from numbers(200000);" + $CH_CLIENT -q "insert into test select number + 1000000, number % 2 ? 
CAST(NULL, 'Variant(String, UInt64, LowCardinality(String), Tuple(a UInt32, b UInt32), Array(UInt64))') : CAST(range(number % 20 + 1)::Array(UInt64), 'Variant(String, UInt64, LowCardinality(String), Tuple(a UInt32, b UInt32), Array(UInt64))') from numbers(200000);" +} + +function test5_select() +{ + echo "test5 select" + $CH_CLIENT -q "select v from test format Null;" + $CH_CLIENT -q "select count() from test where isNotNull(v);" + $CH_CLIENT -q "select v.String from test format Null;" + $CH_CLIENT -q "select count() from test where isNotNull(v.String);" + $CH_CLIENT -q "select v.UInt64 from test format Null;" + $CH_CLIENT -q "select count() from test where isNotNull(v.UInt64);" + $CH_CLIENT -q "select v.\`LowCardinality(String)\` from test format Null;" + $CH_CLIENT -q "select count() from test where isNotNull(v.\`LowCardinality(String)\`);" + $CH_CLIENT -q "select v.\`Tuple(a UInt32, b UInt32)\` from test format Null;" + $CH_CLIENT -q "select v.\`Tuple(a UInt32, b UInt32)\`.a from test format Null;" + $CH_CLIENT -q "select count() from test where isNotNull(v.\`Tuple(a UInt32, b UInt32)\`.a);" + $CH_CLIENT -q "select v.\`Tuple(a UInt32, b UInt32)\`.b from test format Null;" + $CH_CLIENT -q "select count() from test where isNotNull(v.\`Tuple(a UInt32, b UInt32)\`.b);" + $CH_CLIENT -q "select v.\`Array(UInt64)\` from test format Null;" + $CH_CLIENT -q "select count() from test where not empty(v.\`Array(UInt64)\`);" + $CH_CLIENT -q "select v.\`Array(UInt64)\`.size0 from test format Null;" + $CH_CLIENT -q "select count() from test where isNotNull(v.\`Array(UInt64)\`.size0);" +} + +function run() +{ + test5_insert + test5_select + if [ $1 == 1 ]; then + $CH_CLIENT -q "optimize table test final;" + test5_select + fi + $CH_CLIENT -q "truncate table test;" +} + +$CH_CLIENT -q "drop table if exists test;" + +echo "Memory" +$CH_CLIENT -q "create table test (id UInt64, v Variant(String, UInt64, LowCardinality(String), Tuple(a UInt32, b UInt32), Array(UInt64))) engine=Memory;" +run 0 +$CH_CLIENT -q "drop table test;" + +echo "MergeTree compact" +$CH_CLIENT -q "create table test (id UInt64, v Variant(String, UInt64, LowCardinality(String), Tuple(a UInt32, b UInt32), Array(UInt64))) engine=MergeTree order by id settings min_rows_for_wide_part=100000000, min_bytes_for_wide_part=1000000000;" +run 1 +$CH_CLIENT -q "drop table test;" + +echo "MergeTree wide" +$CH_CLIENT -q "create table test (id UInt64, v Variant(String, UInt64, LowCardinality(String), Tuple(a UInt32, b UInt32), Array(UInt64))) engine=MergeTree order by id settings min_rows_for_wide_part=1, min_bytes_for_wide_part=1;" +run 1 +$CH_CLIENT -q "drop table test;" diff --git a/tests/queries/0_stateless/02941_variant_type_4.reference b/tests/queries/0_stateless/02941_variant_type_4.reference new file mode 100644 index 00000000000..e13d5820343 --- /dev/null +++ b/tests/queries/0_stateless/02941_variant_type_4.reference @@ -0,0 +1,56 @@ +Memory +test6 insert +test6 select +1000000 +200000 +200000 +200000 +200000 +200000 +200000 +200000 +----------------------------------------------------------------------------------------------------------- +MergeTree compact +test6 insert +test6 select +1000000 +200000 +200000 +200000 +200000 +200000 +200000 +200000 +----------------------------------------------------------------------------------------------------------- +test6 select +1000000 +200000 +200000 +200000 +200000 +200000 +200000 +200000 +----------------------------------------------------------------------------------------------------------- 
+MergeTree wide +test6 insert +test6 select +1000000 +200000 +200000 +200000 +200000 +200000 +200000 +200000 +----------------------------------------------------------------------------------------------------------- +test6 select +1000000 +200000 +200000 +200000 +200000 +200000 +200000 +200000 +----------------------------------------------------------------------------------------------------------- diff --git a/tests/queries/0_stateless/02941_variant_type_4.sh b/tests/queries/0_stateless/02941_variant_type_4.sh new file mode 100755 index 00000000000..b3cc041bcd8 --- /dev/null +++ b/tests/queries/0_stateless/02941_variant_type_4.sh @@ -0,0 +1,66 @@ +#!/usr/bin/env bash + +CUR_DIR=$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd) +# reset --log_comment +CLICKHOUSE_LOG_COMMENT= +# shellcheck source=../shell_config.sh +. "$CUR_DIR"/../shell_config.sh + +CH_CLIENT="$CLICKHOUSE_CLIENT --allow_experimental_variant_type=1" + +function test6_insert() +{ + echo "test6 insert" + $CH_CLIENT -q "insert into test with 'Variant(String, UInt64, LowCardinality(String), Tuple(a UInt32, b UInt32), Array(UInt64))' as type select number, multiIf(number % 6 == 0, CAST(NULL, type), number % 6 == 1, CAST('str_' || toString(number), type), number % 6 == 2, CAST(number, type), number % 6 == 3, CAST(('lc_str_' || toString(number))::LowCardinality(String), type), number % 6 == 4, CAST(tuple(number, number + 1)::Tuple(a UInt32, b UInt32), type), CAST(range(number % 20 + 1)::Array(UInt64), type)) as res from numbers(1200000);" +} + +function test6_select() +{ + echo "test6 select" + $CH_CLIENT -q "select v from test format Null;" + $CH_CLIENT -q "select count() from test where isNotNull(v);" + $CH_CLIENT -q "select v.String from test format Null;" + $CH_CLIENT -q "select count() from test where isNotNull(v.String);" + $CH_CLIENT -q "select v.UInt64 from test format Null;" + $CH_CLIENT -q "select count() from test where isNotNull(v.UInt64);" + $CH_CLIENT -q "select v.\`LowCardinality(String)\` from test format Null;" + $CH_CLIENT -q "select count() from test where isNotNull(v.\`LowCardinality(String)\`);" + $CH_CLIENT -q "select v.\`Tuple(a UInt32, b UInt32)\` from test format Null;" + $CH_CLIENT -q "select v.\`Tuple(a UInt32, b UInt32)\`.a from test format Null;" + $CH_CLIENT -q "select count() from test where isNotNull(v.\`Tuple(a UInt32, b UInt32)\`.a);" + $CH_CLIENT -q "select v.\`Tuple(a UInt32, b UInt32)\`.b from test format Null;" + $CH_CLIENT -q "select count() from test where isNotNull(v.\`Tuple(a UInt32, b UInt32)\`.b);" + $CH_CLIENT -q "select v.\`Array(UInt64)\` from test format Null;" + $CH_CLIENT -q "select count() from test where not empty(v.\`Array(UInt64)\`);" + $CH_CLIENT -q "select v.\`Array(UInt64)\`.size0 from test format Null;" + $CH_CLIENT -q "select count() from test where isNotNull(v.\`Array(UInt64)\`.size0);" + echo "-----------------------------------------------------------------------------------------------------------" +} + +function run() +{ + test6_insert + test6_select + if [ $1 == 1 ]; then + $CH_CLIENT -q "optimize table test final;" + test6_select + fi + $CH_CLIENT -q "truncate table test;" +} + +$CH_CLIENT -q "drop table if exists test;" + +echo "Memory" +$CH_CLIENT -q "create table test (id UInt64, v Variant(String, UInt64, LowCardinality(String), Tuple(a UInt32, b UInt32), Array(UInt64))) engine=Memory;" +run 0 +$CH_CLIENT -q "drop table test;" + +echo "MergeTree compact" +$CH_CLIENT -q "create table test (id UInt64, v Variant(String, UInt64, LowCardinality(String), Tuple(a 
UInt32, b UInt32), Array(UInt64))) engine=MergeTree order by id settings min_rows_for_wide_part=100000000, min_bytes_for_wide_part=1000000000;" +run 1 +$CH_CLIENT -q "drop table test;" + +echo "MergeTree wide" +$CH_CLIENT -q "create table test (id UInt64, v Variant(String, UInt64, LowCardinality(String), Tuple(a UInt32, b UInt32), Array(UInt64))) engine=MergeTree order by id settings min_rows_for_wide_part=1, min_bytes_for_wide_part=1;" +run 1 +$CH_CLIENT -q "drop table test;" diff --git a/tests/queries/0_stateless/02942_variant_cast.reference b/tests/queries/0_stateless/02942_variant_cast.reference new file mode 100644 index 00000000000..f3fd7a9ba33 --- /dev/null +++ b/tests/queries/0_stateless/02942_variant_cast.reference @@ -0,0 +1,25 @@ +\N +42 +0 +\N +2 +\N +Hello +Hello +NULL +Hello +Hello +\N +Hello +\N +0 +\N +42 +\N +Hello +2 +\N +Hello +5 +0 +1 diff --git a/tests/queries/0_stateless/02942_variant_cast.sql b/tests/queries/0_stateless/02942_variant_cast.sql new file mode 100644 index 00000000000..33587e3e438 --- /dev/null +++ b/tests/queries/0_stateless/02942_variant_cast.sql @@ -0,0 +1,23 @@ +set allow_experimental_variant_type=1; + +select NULL::Variant(String, UInt64); +select 42::UInt64::Variant(String, UInt64); +select 42::UInt32::Variant(String, UInt64); -- {serverError CANNOT_CONVERT_TYPE} +select now()::Variant(String, UInt64); -- {serverError CANNOT_CONVERT_TYPE} +select CAST(number % 2 ? NULL : number, 'Variant(String, UInt64)') from numbers(4); +select 'Hello'::LowCardinality(String)::Variant(LowCardinality(String), UInt64); +select 'Hello'::LowCardinality(Nullable(String))::Variant(LowCardinality(String), UInt64); +select 'NULL'::LowCardinality(Nullable(String))::Variant(LowCardinality(String), UInt64); +select 'Hello'::LowCardinality(Nullable(String))::Variant(LowCardinality(String), UInt64); +select CAST(CAST(number % 2 ? 
NULL : 'Hello', 'LowCardinality(Nullable(String))'), 'Variant(LowCardinality(String), UInt64)') from numbers(4); + +select NULL::Variant(String, UInt64)::UInt64; +select NULL::Variant(String, UInt64)::Nullable(UInt64); +select '42'::Variant(String, UInt64)::UInt64; +select 'str'::Variant(String, UInt64)::UInt64; -- {serverError CANNOT_PARSE_TEXT} +select CAST(multiIf(number % 3 == 0, NULL::Variant(String, UInt64), number % 3 == 1, 'Hello'::Variant(String, UInt64), number::Variant(String, UInt64)), 'Nullable(String)') from numbers(6); +select CAST(multiIf(number == 1, NULL::Variant(String, UInt64), number == 2, 'Hello'::Variant(String, UInt64), number::Variant(String, UInt64)), 'UInt64') from numbers(6); -- {serverError CANNOT_PARSE_TEXT} + + +select number::Variant(UInt64)::Variant(String, UInt64)::Variant(Array(String), String, UInt64) from numbers(2); +select 'str'::Variant(String, UInt64)::Variant(String, Array(UInt64)); -- {serverError CANNOT_CONVERT_TYPE} diff --git a/tests/queries/0_stateless/02943_variant_element.reference b/tests/queries/0_stateless/02943_variant_element.reference new file mode 100644 index 00000000000..ab8aaa8fdef --- /dev/null +++ b/tests/queries/0_stateless/02943_variant_element.reference @@ -0,0 +1,44 @@ +\N +\N +\N +\N +0 +1 +2 +3 +\N +\N +\N +\N +0 +\N +2 +\N +\N +\N +\N +\N +str_0 +\N +str_2 +\N +\N +\N +\N +\N +[] +[] +[] +[] +[] +[] +[] +[] +[0] +[] +[0,1,2] +[] +[[0]] +[[NULL]] +[[2]] +[[NULL]] diff --git a/tests/queries/0_stateless/02943_variant_element.sql b/tests/queries/0_stateless/02943_variant_element.sql new file mode 100644 index 00000000000..c8eff9775ad --- /dev/null +++ b/tests/queries/0_stateless/02943_variant_element.sql @@ -0,0 +1,16 @@ +set allow_experimental_variant_type=1; +set use_variant_when_no_common_type_in_if=1; + +select variantElement(NULL::Variant(String, UInt64), 'UInt64') from numbers(4); +select variantElement(number::Variant(String, UInt64), 'UInt64') from numbers(4); +select variantElement(number::Variant(String, UInt64), 'String') from numbers(4); +select variantElement((number % 2 ? NULL : number)::Variant(String, UInt64), 'UInt64') from numbers(4); +select variantElement((number % 2 ? NULL : number)::Variant(String, UInt64), 'String') from numbers(4); +select variantElement((number % 2 ? NULL : 'str_' || toString(number))::LowCardinality(Nullable(String))::Variant(LowCardinality(String), UInt64), 'LowCardinality(String)') from numbers(4); +select variantElement(NULL::LowCardinality(Nullable(String))::Variant(LowCardinality(String), UInt64), 'LowCardinality(String)') from numbers(4); +select variantElement((number % 2 ? NULL : number)::Variant(Array(UInt64), UInt64), 'Array(UInt64)') from numbers(4); +select variantElement(NULL::Variant(Array(UInt64), UInt64), 'Array(UInt64)') from numbers(4); +select variantElement(number % 2 ? NULL : range(number + 1), 'Array(UInt64)') from numbers(4); + +select variantElement([[(number % 2 ? 
NULL : number)::Variant(String, UInt64)]], 'UInt64') from numbers(4); + diff --git a/tests/queries/0_stateless/02944_variant_as_if_multi_if_result.reference b/tests/queries/0_stateless/02944_variant_as_if_multi_if_result.reference new file mode 100644 index 00000000000..3803f39253c --- /dev/null +++ b/tests/queries/0_stateless/02944_variant_as_if_multi_if_result.reference @@ -0,0 +1,96 @@ +Array(UInt8) [1,2,3] +Array(UInt8) [1,2,3] +String str_1 +Nullable(String) str_1 +String str_1 +Nullable(String) str_1 +Variant(Array(UInt8), String) str_1 +Variant(Array(UInt8), String) str_1 +Array(UInt8) [1,2,3] +Array(UInt8) [1,2,3] +String str_1 +Nullable(String) str_1 +String str_1 +Nullable(String) str_1 +Variant(Array(UInt8), String) str_1 +Variant(Array(UInt8), String) str_1 +Array(UInt8) [1,2,3] +Array(UInt8) [1,2,3] +String str_1 +Nullable(String) str_1 +String str_1 +Nullable(String) str_1 +Variant(Array(UInt8), String) str_1 +Variant(Array(UInt8), String) str_1 +String str_0 +String str_1 +String str_2 +String str_3 +Nullable(String) str_0 +Nullable(String) str_1 +Nullable(String) str_2 +Nullable(String) str_3 +Array(UInt64) [0] +Array(UInt64) [0,1] +Array(UInt64) [0,1,2] +Array(UInt64) [0,1,2,3] +Array(UInt64) [0] +Array(UInt64) [0,1] +Array(UInt64) [0,1,2] +Array(UInt64) [0,1,2,3] +String str_0 +String str_1 +String str_2 +String str_3 +Nullable(String) str_0 +Nullable(String) str_1 +Nullable(String) str_2 +Nullable(String) str_3 +Variant(Array(UInt64), String) str_0 +Variant(Array(UInt64), String) str_1 +Variant(Array(UInt64), String) str_2 +Variant(Array(UInt64), String) str_3 +Variant(Array(UInt64), String) str_0 +Variant(Array(UInt64), String) str_1 +Variant(Array(UInt64), String) str_2 +Variant(Array(UInt64), String) str_3 +Variant(Array(UInt64), String) str_0 +Variant(Array(UInt64), String) [0,1] +Variant(Array(UInt64), String) str_2 +Variant(Array(UInt64), String) [0,1,2,3] +Variant(Array(UInt64), String) str_0 +Variant(Array(UInt64), String) [0,1] +Variant(Array(UInt64), String) str_2 +Variant(Array(UInt64), String) [0,1,2,3] +Variant(Array(UInt64), String) str_0 +Variant(Array(UInt64), String) [0,1] +Variant(Array(UInt64), String) str_2 +Variant(Array(UInt64), String) [0,1,2,3] +Variant(Array(UInt64), String) str_0 +Variant(Array(UInt64), String) [0,1] +Variant(Array(UInt64), String) str_2 +Variant(Array(UInt64), String) [0,1,2,3] +Variant(Array(UInt64), String, UInt64) [0] +Variant(Array(UInt64), String, UInt64) 1 +Variant(Array(UInt64), String, UInt64) str_2 +Variant(Array(UInt64), String, UInt64) [0,1,2,3] +Variant(Array(UInt64), String, UInt64) 4 +Variant(Array(UInt64), String, UInt64) str_5 +Variant(Array(UInt64), String, UInt64) [0] +Variant(Array(UInt64), String, UInt64) 1 +Variant(Array(UInt64), String, UInt64) str_2 +Variant(Array(UInt64), String, UInt64) [0,1,2,3] +Variant(Array(UInt64), String, UInt64) 4 +Variant(Array(UInt64), String, UInt64) str_5 +Variant(Array(UInt64), String, UInt64) [0] +Variant(Array(UInt64), String, UInt64) 1 +Variant(Array(UInt64), String, UInt64) str_2 +Variant(Array(UInt64), String, UInt64) [0,1,2,3] +Variant(Array(UInt64), String, UInt64) 4 +Variant(Array(UInt64), String, UInt64) str_5 +Variant(Array(UInt64), String, UInt64) [0] +Variant(Array(UInt64), String, UInt64) 1 +Variant(Array(UInt64), String, UInt64) str_2 +Variant(Array(UInt64), String, UInt64) [0,1,2,3] +Variant(Array(UInt64), String, UInt64) 4 +Variant(Array(UInt64), String, UInt64) str_5 diff --git a/tests/queries/0_stateless/02944_variant_as_if_multi_if_result.sql 
b/tests/queries/0_stateless/02944_variant_as_if_multi_if_result.sql new file mode 100644 index 00000000000..da36863bfda --- /dev/null +++ b/tests/queries/0_stateless/02944_variant_as_if_multi_if_result.sql @@ -0,0 +1,64 @@ +set allow_experimental_variant_type=1; +set use_variant_when_no_common_type_in_if=1; + +select toTypeName(res), if(1, [1,2,3], 'str_1') as res; +select toTypeName(res), if(1, [1,2,3], 'str_1'::Nullable(String)) as res; + +select toTypeName(res), if(0, [1,2,3], 'str_1') as res; +select toTypeName(res), if(0, [1,2,3], 'str_1'::Nullable(String)) as res; + +select toTypeName(res), if(NULL, [1,2,3], 'str_1') as res; +select toTypeName(res), if(NULL, [1,2,3], 'str_1'::Nullable(String)) as res; + +select toTypeName(res), if(materialize(NULL::Nullable(UInt8)), [1,2,3], 'str_1') as res; +select toTypeName(res), if(materialize(NULL::Nullable(UInt8)), [1,2,3], 'str_1'::Nullable(String)) as res; + +select toTypeName(res), if(1, materialize([1,2,3]), 'str_1') as res; +select toTypeName(res), if(1, materialize([1,2,3]), 'str_1'::Nullable(String)) as res; + +select toTypeName(res), if(0, materialize([1,2,3]), 'str_1') as res; +select toTypeName(res), if(0, materialize([1,2,3]), 'str_1'::Nullable(String)) as res; + +select toTypeName(res), if(NULL, materialize([1,2,3]), 'str_1') as res; +select toTypeName(res), if(NULL, materialize([1,2,3]), 'str_1'::Nullable(String)) as res; + +select toTypeName(res), if(materialize(NULL::Nullable(UInt8)), materialize([1,2,3]), 'str_1') as res; +select toTypeName(res), if(materialize(NULL::Nullable(UInt8)), materialize([1,2,3]), 'str_1'::Nullable(String)) as res; + +select toTypeName(res), if(1, [1,2,3], materialize('str_1')) as res; +select toTypeName(res), if(1, [1,2,3], materialize('str_1')::Nullable(String)) as res; + +select toTypeName(res), if(0, [1,2,3], materialize('str_1')) as res; +select toTypeName(res), if(0, [1,2,3], materialize('str_1')::Nullable(String)) as res; + +select toTypeName(res), if(NULL, [1,2,3], materialize('str_1')) as res; +select toTypeName(res), if(NULL, [1,2,3], materialize('str_1')::Nullable(String)) as res; + +select toTypeName(res), if(materialize(NULL::Nullable(UInt8)), [1,2,3], materialize('str_1')) as res; +select toTypeName(res), if(materialize(NULL::Nullable(UInt8)), [1,2,3], materialize('str_1')::Nullable(String)) as res; + + +select toTypeName(res), if(0, range(number + 1), 'str_' || toString(number)) as res from numbers(4); +select toTypeName(res), if(0, range(number + 1), ('str_' || toString(number))::Nullable(String)) as res from numbers(4); + +select toTypeName(res), if(1, range(number + 1), 'str_' || toString(number)) as res from numbers(4); +select toTypeName(res), if(1, range(number + 1), ('str_' || toString(number))::Nullable(String)) as res from numbers(4); + +select toTypeName(res), if(NULL, range(number + 1), 'str_' || toString(number)) as res from numbers(4); +select toTypeName(res), if(NULL, range(number + 1), ('str_' || toString(number))::Nullable(String)) as res from numbers(4); + +select toTypeName(res), if(materialize(NULL::Nullable(UInt8)), range(number + 1), 'str_' || toString(number)) as res from numbers(4); +select toTypeName(res), if(materialize(NULL::Nullable(UInt8)), range(number + 1), ('str_' || toString(number))::Nullable(String)) as res from numbers(4); + +select toTypeName(res), if(number % 2, range(number + 1), 'str_' || toString(number)) as res from numbers(4); +select toTypeName(res), if(number % 2, range(number + 1), ('str_' || toString(number))::Nullable(String)) as res from 
numbers(4);
+
+select toTypeName(res), if(number % 2, range(number + 1), ('str_' || toString(number))::LowCardinality(String)) as res from numbers(4);
+select toTypeName(res), if(number % 2, range(number + 1), ('str_' || toString(number))::LowCardinality(Nullable(String))) as res from numbers(4);
+
+
+select toTypeName(res), multiIf(number % 3 == 0, range(number + 1), number % 3 == 1, number, 'str_' || toString(number)) as res from numbers(6);
+select toTypeName(res), multiIf(number % 3 == 0, range(number + 1), number % 3 == 1, number, ('str_' || toString(number))::Nullable(String)) as res from numbers(6);
+select toTypeName(res), multiIf(number % 3 == 0, range(number + 1), number % 3 == 1, number, ('str_' || toString(number))::LowCardinality(String)) as res from numbers(6);
+select toTypeName(res), multiIf(number % 3 == 0, range(number + 1), number % 3 == 1, number, ('str_' || toString(number))::LowCardinality(Nullable(String))) as res from numbers(6);
+

From 0a7ca36e7fbd02b4b64a30371fa3118144179e51 Mon Sep 17 00:00:00 2001
From: avogar
Date: Tue, 19 Dec 2023 17:08:35 +0000
Subject: [PATCH 027/245] Remove unneeded changes in IColumn.h

---
 src/Columns/IColumn.h | 11 -----------
 1 file changed, 11 deletions(-)

diff --git a/src/Columns/IColumn.h b/src/Columns/IColumn.h
index 0dcba5b310c..3f866e6213d 100644
--- a/src/Columns/IColumn.h
+++ b/src/Columns/IColumn.h
@@ -631,17 +631,6 @@ struct IsMutableColumns

 template <>
 struct IsMutableColumns<> { static const bool value = true; };

-template
-struct IsMutableColumnsOrRvalueReferences;
-
-template
-struct IsMutableColumnsOrRvalueReferences
-{
-    static const bool value = (std::is_assignable::value || std::is_rvalue_reference_v) && IsMutableColumnsOrRvalueReferences::value;
-};
-
-template <>
-struct IsMutableColumnsOrRvalueReferences<> { static const bool value = true; };

 template
 const Type * checkAndGetColumn(const IColumn & column)

From bd84799aecb0f8103fd88e9fb1491720f9ec90c8 Mon Sep 17 00:00:00 2001
From: avogar
Date: Tue, 19 Dec 2023 17:21:45 +0000
Subject: [PATCH 028/245] Fix style

---
 src/Columns/ColumnVariant.h                        | 4 ++--
 utils/check-style/aspell-ignore/en/aspell-dict.txt | 1 +
 2 files changed, 3 insertions(+), 2 deletions(-)

diff --git a/src/Columns/ColumnVariant.h b/src/Columns/ColumnVariant.h
index 692fdd1709e..702107504f0 100644
--- a/src/Columns/ColumnVariant.h
+++ b/src/Columns/ColumnVariant.h
@@ -9,7 +9,7 @@ namespace DB

 namespace ErrorCodes
 {
-    extern const int LOGICAL_ERROR;
+    extern const int NOT_IMPLEMENTED;
 }

 /**
@@ -263,7 +263,7 @@ public:
     bool hasOnlyNulls() const
     {
         /// If all variants are empty, we have only NULL values.
-        return std::all_of(variants.begin(), variants.end(), [](const auto & v){ return v->empty(); } );
+        return std::all_of(variants.begin(), variants.end(), [](const WrappedPtr & v){ return v->empty(); });
     }

     /// Check if local and global order is the same.
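A note on the invariant behind hasOnlyNulls() above: a row of a Variant column is NULL exactly when none of the variants holds a value for it, so the whole column is all-NULL precisely when every variant column is empty. The following minimal SQL sketch shows how this surfaces to queries; it assumes a server built with this patch series and the experimental setting enabled, and is an illustration rather than part of the patch:

set allow_experimental_variant_type = 1;

-- Odd rows carry no variant at all: they read back as NULL from the column
-- itself and from every variant extracted with variantElement.
-- Expected rows: (0, 0, \N), (\N, \N, \N), (2, 2, \N), (\N, \N, \N).
select v,
       variantElement(v, 'UInt64') as as_uint64,
       variantElement(v, 'String') as as_string
from (select (number % 2 ? NULL : number)::Variant(String, UInt64) as v from numbers(4));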
diff --git a/utils/check-style/aspell-ignore/en/aspell-dict.txt b/utils/check-style/aspell-ignore/en/aspell-dict.txt
index 637ab0ce6d4..bc03f4b39f8 100644
--- a/utils/check-style/aspell-ignore/en/aspell-dict.txt
+++ b/utils/check-style/aspell-ignore/en/aspell-dict.txt
@@ -2594,6 +2594,7 @@ uuid
 varPop
 varSamp
 variadic
+variantElement
 varint
 varpop
 varsamp

From e74ae96dd006f8ff5fc8150eba5ab0beb47ddba3 Mon Sep 17 00:00:00 2001
From: avogar
Date: Tue, 19 Dec 2023 17:35:13 +0000
Subject: [PATCH 029/245] Fix typo

---
 src/DataTypes/Serializations/SerializationNullable.cpp | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/src/DataTypes/Serializations/SerializationNullable.cpp b/src/DataTypes/Serializations/SerializationNullable.cpp
index e7f0e61f2a5..05c70827c35 100644
--- a/src/DataTypes/Serializations/SerializationNullable.cpp
+++ b/src/DataTypes/Serializations/SerializationNullable.cpp
@@ -206,7 +206,7 @@ ReturnType safeAppendToNullMap(ColumnNullable & column, bool is_null)
 }

 /// Deserialize value into non-nullable column. In case of NULL, insert default and set is_null to true.
-/// If ReturnType is bool, return true if parsing was succesfull and false in case of any error.
+/// If ReturnType is bool, return true if parsing was successful and false in case of any error.
 template
 static ReturnType deserializeImpl(IColumn & column, ReadBuffer & buf, CheckForNull && check_for_null, DeserializeNested && deserialize_nested, bool & is_null)
 {

From 9edbfb3a31e67722a6af3b418a119e9b2bbb164e Mon Sep 17 00:00:00 2001
From: avogar
Date: Tue, 19 Dec 2023 17:55:42 +0000
Subject: [PATCH 030/245] Fix build after merging with master

---
 src/DataTypes/Serializations/SerializationEnum.cpp | 10 +++++-----
 src/DataTypes/Serializations/SerializationEnum.h   |  2 +-
 2 files changed, 6 insertions(+), 6 deletions(-)

diff --git a/src/DataTypes/Serializations/SerializationEnum.cpp b/src/DataTypes/Serializations/SerializationEnum.cpp
index 6ad55913738..fb384547d64 100644
--- a/src/DataTypes/Serializations/SerializationEnum.cpp
+++ b/src/DataTypes/Serializations/SerializationEnum.cpp
@@ -47,7 +47,7 @@ bool SerializationEnum::tryDeserializeTextEscaped(IColumn & column, ReadBu
     {
         std::string field_name;
         readEscapedString(field_name, istr);
-        if (!this->tryGetValue(x, StringRef(field_name), true))
+        if (!ref_enum_values.tryGetValue(x, StringRef(field_name), true))
             return false;
     }

@@ -75,7 +75,7 @@ bool SerializationEnum::tryDeserializeTextQuoted(IColumn & column, ReadBuf
     std::string field_name;
     readQuotedStringWithSQLStyle(field_name, istr);
     FieldType x;
-    if (!this->tryGetValue(x, StringRef(field_name)))
+    if (!ref_enum_values.tryGetValue(x, StringRef(field_name)))
         return false;
     assert_cast(column).getData().push_back(x);
     return true;
@@ -111,7 +111,7 @@ bool SerializationEnum::tryDeserializeWholeText(IColumn & column, ReadBuff
     {
         std::string field_name;
         readStringUntilEOF(field_name, istr);
-        if (!this->tryGetValue(x, StringRef(field_name), true))
+        if (!ref_enum_values.tryGetValue(x, StringRef(field_name), true))
             return false;
     }

@@ -157,7 +157,7 @@ bool SerializationEnum::tryDeserializeTextJSON(IColumn & column, ReadBuffe
     {
         std::string field_name;
         readJSONString(field_name, istr);
-        if (!this->tryGetValue(x, StringRef(field_name)))
+        if (!ref_enum_values.tryGetValue(x, StringRef(field_name)))
             return false;
     }

@@ -198,7 +198,7 @@ bool SerializationEnum::tryDeserializeTextCSV(IColumn & column, ReadBuffer
     {
         std::string field_name;
         readCSVString(field_name, istr, settings.csv);
-        if (!this->tryGetValue(x,
StringRef(field_name), true)) + if (!ref_enum_values.tryGetValue(x, StringRef(field_name), true)) return false; } diff --git a/src/DataTypes/Serializations/SerializationEnum.h b/src/DataTypes/Serializations/SerializationEnum.h index 708161dc5fd..5152a3fbc93 100644 --- a/src/DataTypes/Serializations/SerializationEnum.h +++ b/src/DataTypes/Serializations/SerializationEnum.h @@ -60,7 +60,7 @@ public: bool tryReadValue(ReadBuffer & istr, FieldType & x) const { - if (!tryReadText(x, istr) || !this->hasValue(x)) + if (!tryReadText(x, istr) || !ref_enum_values.hasValue(x)) return false; return true; From 3c9dd07f7b2c036f5d299869f16ae0a39621b25f Mon Sep 17 00:00:00 2001 From: avogar Date: Tue, 19 Dec 2023 21:17:39 +0000 Subject: [PATCH 031/245] Fix special builds, fix test --- src/Columns/tests/gtest_column_variant.cpp | 5 ++++- src/DataTypes/Serializations/SerializationIPv4andIPv6.cpp | 7 +++---- tests/queries/0_stateless/02941_variant_type_1.sh | 1 + tests/queries/0_stateless/02941_variant_type_2.sh | 2 +- tests/queries/0_stateless/02941_variant_type_3.sh | 2 +- tests/queries/0_stateless/02941_variant_type_4.sh | 1 + 6 files changed, 11 insertions(+), 7 deletions(-) diff --git a/src/Columns/tests/gtest_column_variant.cpp b/src/Columns/tests/gtest_column_variant.cpp index b701e2d3183..0a6512c46b7 100644 --- a/src/Columns/tests/gtest_column_variant.cpp +++ b/src/Columns/tests/gtest_column_variant.cpp @@ -582,7 +582,10 @@ TEST(ColumnVariant, PermuteAndIndexOneColumnNoNulls) ASSERT_EQ((*permuted_column)[2].get(), 2); auto index = ColumnUInt64::create(); - index->getData() = std::move(permutation); + index->getData().push_back(1); + index->getData().push_back(3); + index->getData().push_back(2); + index->getData().push_back(0); auto indexed_column = column->index(*index, 3); ASSERT_EQ(indexed_column->size(), 3); ASSERT_EQ((*indexed_column)[0].get(), 1); diff --git a/src/DataTypes/Serializations/SerializationIPv4andIPv6.cpp b/src/DataTypes/Serializations/SerializationIPv4andIPv6.cpp index 81c4af97401..dfcd24aff58 100644 --- a/src/DataTypes/Serializations/SerializationIPv4andIPv6.cpp +++ b/src/DataTypes/Serializations/SerializationIPv4andIPv6.cpp @@ -15,10 +15,10 @@ void SerializationIP::deserializeText(DB::IColumn & column, DB::ReadBuffer IPv x; readText(x, istr); + assert_cast &>(column).getData().push_back(x); + if (whole && !istr.eof()) throwUnexpectedDataAfterParsedValue(column, istr, settings, TypeName.data()); - - assert_cast &>(column).getData().push_back(x); } template @@ -77,11 +77,10 @@ void SerializationIP::deserializeTextJSON(DB::IColumn & column, DB::ReadBuf /// this code looks weird, but we want to throw specific exception to match original behavior... 
if (istr.eof()) assertChar('"', istr); + assert_cast &>(column).getData().push_back(x); if (*istr.position() != '"') throwUnexpectedDataAfterParsedValue(column, istr, settings, TypeName.data()); istr.ignore(); - - assert_cast &>(column).getData().push_back(x); } template diff --git a/tests/queries/0_stateless/02941_variant_type_1.sh b/tests/queries/0_stateless/02941_variant_type_1.sh index 774acb4bbef..4cf8ad25122 100755 --- a/tests/queries/0_stateless/02941_variant_type_1.sh +++ b/tests/queries/0_stateless/02941_variant_type_1.sh @@ -1,4 +1,5 @@ #!/usr/bin/env bash +# Tags: long CUR_DIR=$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd) # reset --log_comment diff --git a/tests/queries/0_stateless/02941_variant_type_2.sh b/tests/queries/0_stateless/02941_variant_type_2.sh index aef5bc3fe02..7064dfbf4ec 100755 --- a/tests/queries/0_stateless/02941_variant_type_2.sh +++ b/tests/queries/0_stateless/02941_variant_type_2.sh @@ -1,5 +1,5 @@ #!/usr/bin/env bash -# tags: long +# Tags: long CUR_DIR=$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd) # reset --log_comment diff --git a/tests/queries/0_stateless/02941_variant_type_3.sh b/tests/queries/0_stateless/02941_variant_type_3.sh index d3692270deb..303039edef7 100755 --- a/tests/queries/0_stateless/02941_variant_type_3.sh +++ b/tests/queries/0_stateless/02941_variant_type_3.sh @@ -1,5 +1,5 @@ #!/usr/bin/env bash -# tags: long +# Tags: long CUR_DIR=$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd) # reset --log_comment diff --git a/tests/queries/0_stateless/02941_variant_type_4.sh b/tests/queries/0_stateless/02941_variant_type_4.sh index b3cc041bcd8..169e43c6d69 100755 --- a/tests/queries/0_stateless/02941_variant_type_4.sh +++ b/tests/queries/0_stateless/02941_variant_type_4.sh @@ -1,4 +1,5 @@ #!/usr/bin/env bash +# Tags: long CUR_DIR=$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd) # reset --log_comment From 1efd65b8c73951e60e94f74ccc45141a5b39d85e Mon Sep 17 00:00:00 2001 From: avogar Date: Wed, 20 Dec 2023 17:43:04 +0000 Subject: [PATCH 032/245] Fix tests --- src/Columns/ColumnVariant.cpp | 10 ++++++++++ src/Columns/ColumnVariant.h | 1 + src/DataTypes/DataTypeVariant.cpp | 12 ++++++++++++ src/DataTypes/DataTypeVariant.h | 1 + src/DataTypes/IDataType.h | 2 +- .../Serializations/SerializationVariantElement.cpp | 2 +- 6 files changed, 26 insertions(+), 2 deletions(-) diff --git a/src/Columns/ColumnVariant.cpp b/src/Columns/ColumnVariant.cpp index 67754e77992..a3a0362b646 100644 --- a/src/Columns/ColumnVariant.cpp +++ b/src/Columns/ColumnVariant.cpp @@ -480,6 +480,16 @@ void ColumnVariant::insertFrom(const IColumn & src_, size_t n) } } +void ColumnVariant::insertIntoVariant(const DB::Field & x, Discriminator global_discr) +{ + if (global_discr > variants.size()) + throw Exception(ErrorCodes::LOGICAL_ERROR, "Invalid global discriminator: {}. 
The number of variants is {}", size_t(global_discr), variants.size()); + auto & variant = getVariantByGlobalDiscriminator(global_discr); + variant.insert(x); + getLocalDiscriminators().push_back(localDiscriminatorByGlobal(global_discr)); + getOffsets().push_back(variant.size() - 1); +} + void ColumnVariant::insertRangeFrom(const IColumn & src_, size_t start, size_t length) { const size_t num_variants = variants.size(); diff --git a/src/Columns/ColumnVariant.h b/src/Columns/ColumnVariant.h index 702107504f0..b388b118a69 100644 --- a/src/Columns/ColumnVariant.h +++ b/src/Columns/ColumnVariant.h @@ -174,6 +174,7 @@ public: StringRef getDataAt(size_t n) const override; void insertData(const char * pos, size_t length) override; void insert(const Field & x) override; + void insertIntoVariant(const Field & x, Discriminator global_discr); void insertFrom(const IColumn & src_, size_t n) override; void insertRangeFrom(const IColumn & src, size_t start, size_t length) override; void insertManyFrom(const IColumn & src, size_t position, size_t length) override; diff --git a/src/DataTypes/DataTypeVariant.cpp b/src/DataTypes/DataTypeVariant.cpp index 77e1c504cf8..334ed2c7b10 100644 --- a/src/DataTypes/DataTypeVariant.cpp +++ b/src/DataTypes/DataTypeVariant.cpp @@ -1,9 +1,11 @@ #include +#include #include #include #include #include #include +#include #include #include #include @@ -94,6 +96,16 @@ MutableColumnPtr DataTypeVariant::createColumn() const return ColumnVariant::create(std::move(nested_columns)); } +ColumnPtr DataTypeVariant::createColumnConst(size_t size, const DB::Field & field) const +{ + auto field_type = applyVisitor(FieldToDataType(), field); + auto discr = tryGetVariantDiscriminator(field_type); + if (!discr) + throw Exception(ErrorCodes::BAD_ARGUMENTS, "Cannot insert field \"{}\" into column with type {}", toString(field), getName()); + auto column = createColumn(); + assert_cast(*column).insertIntoVariant(field, *discr); + return ColumnConst::create(std::move(column), size); +} Field DataTypeVariant::getDefault() const { diff --git a/src/DataTypes/DataTypeVariant.h b/src/DataTypes/DataTypeVariant.h index 60113a188b0..ca15dff1476 100644 --- a/src/DataTypes/DataTypeVariant.h +++ b/src/DataTypes/DataTypeVariant.h @@ -37,6 +37,7 @@ public: MutableColumnPtr createColumn() const override; + ColumnPtr createColumnConst(size_t size, const Field & field) const override; Field getDefault() const override; bool equals(const IDataType & rhs) const override; diff --git a/src/DataTypes/IDataType.h b/src/DataTypes/IDataType.h index ccdf54f57c3..4533c23a89f 100644 --- a/src/DataTypes/IDataType.h +++ b/src/DataTypes/IDataType.h @@ -150,7 +150,7 @@ public: /** Create ColumnConst for corresponding type, with specified size and value. */ - ColumnPtr createColumnConst(size_t size, const Field & field) const; + virtual ColumnPtr createColumnConst(size_t size, const Field & field) const; ColumnPtr createColumnConstWithDefaultValue(size_t size) const; /** Get default value of data type. diff --git a/src/DataTypes/Serializations/SerializationVariantElement.cpp b/src/DataTypes/Serializations/SerializationVariantElement.cpp index 4b24ee5754e..1c0808db2a0 100644 --- a/src/DataTypes/Serializations/SerializationVariantElement.cpp +++ b/src/DataTypes/Serializations/SerializationVariantElement.cpp @@ -204,7 +204,7 @@ ColumnPtr SerializationVariantElement::VariantSubcolumnCreator::create(const DB: /// If this variant is empty, fill result column with default values. 
if (prev->empty()) { - auto res = IColumn::mutate(makeNullableOrLowCardinalityNullableSafe(prev)); + auto res = makeNullableOrLowCardinalityNullableSafe(prev)->cloneEmpty(); res->insertManyDefaults(local_discriminators->size()); return res; } From 4f8789927db4dd0d9c79a80bebc805895d82297c Mon Sep 17 00:00:00 2001 From: avogar Date: Thu, 21 Dec 2023 15:53:21 +0000 Subject: [PATCH 033/245] Fix tests with analyzer, add more tests --- src/DataTypes/DataTypeVariant.cpp | 18 +- ...different_local_and_global_order.reference | 244 ++++++++++++++++++ ...e_with_different_local_and_global_order.sh | 82 ++++++ .../02944_variant_as_if_multi_if_result.sql | 1 + 4 files changed, 340 insertions(+), 5 deletions(-) create mode 100644 tests/queries/0_stateless/02943_variant_type_with_different_local_and_global_order.reference create mode 100755 tests/queries/0_stateless/02943_variant_type_with_different_local_and_global_order.sh diff --git a/src/DataTypes/DataTypeVariant.cpp b/src/DataTypes/DataTypeVariant.cpp index 334ed2c7b10..0575f220f22 100644 --- a/src/DataTypes/DataTypeVariant.cpp +++ b/src/DataTypes/DataTypeVariant.cpp @@ -98,12 +98,20 @@ MutableColumnPtr DataTypeVariant::createColumn() const ColumnPtr DataTypeVariant::createColumnConst(size_t size, const DB::Field & field) const { - auto field_type = applyVisitor(FieldToDataType(), field); - auto discr = tryGetVariantDiscriminator(field_type); - if (!discr) - throw Exception(ErrorCodes::BAD_ARGUMENTS, "Cannot insert field \"{}\" into column with type {}", toString(field), getName()); auto column = createColumn(); - assert_cast(*column).insertIntoVariant(field, *discr); + if (field.isNull()) + { + column->insertDefault(); + } + else + { + auto field_type = applyVisitor(FieldToDataType(), field); + auto discr = tryGetVariantDiscriminator(field_type); + if (!discr) + throw Exception(ErrorCodes::BAD_ARGUMENTS, "Cannot insert field \"{}\" into column with type {}", toString(field), getName()); + assert_cast(*column).insertIntoVariant(field, *discr); + } + return ColumnConst::create(std::move(column), size); } diff --git a/tests/queries/0_stateless/02943_variant_type_with_different_local_and_global_order.reference b/tests/queries/0_stateless/02943_variant_type_with_different_local_and_global_order.reference new file mode 100644 index 00000000000..f2e355824f9 --- /dev/null +++ b/tests/queries/0_stateless/02943_variant_type_with_different_local_and_global_order.reference @@ -0,0 +1,244 @@ +Memory +test1 insert +test1 select +0 \N 0 +1 \N 1 +2 \N 2 +3 \N 3 +4 \N 4 +5 \N 5 +6 \N 6 +7 \N 7 +8 \N 8 +9 \N 9 +10 \N 10 +\N \N \N +12 \N 12 +\N \N \N +14 \N 14 +\N \N \N +16 \N 16 +\N \N \N +18 \N 18 +\N \N \N +str_20 str_20 \N +\N \N \N +str_22 str_22 \N +\N \N \N +str_24 str_24 \N +\N \N \N +str_26 str_26 \N +\N \N \N +str_28 str_28 \N +\N \N \N +30 \N 30 +\N \N \N +32 \N 32 +\N \N \N +34 \N 34 +\N \N \N +str_36 str_36 \N +\N \N \N +str_38 str_38 \N +\N \N \N +----------------------------------------------------------------------------------------------------------- +test2 insert +test2 select +7000000 +1000000 +6000000 +----------------------------------------------------------------------------------------------------------- +MergeTree compact +test1 insert +test1 select +0 \N 0 +1 \N 1 +2 \N 2 +3 \N 3 +4 \N 4 +5 \N 5 +6 \N 6 +7 \N 7 +8 \N 8 +9 \N 9 +10 \N 10 +\N \N \N +12 \N 12 +\N \N \N +14 \N 14 +\N \N \N +16 \N 16 +\N \N \N +18 \N 18 +\N \N \N +str_20 str_20 \N +\N \N \N +str_22 str_22 \N +\N \N \N +str_24 str_24 \N +\N \N \N +str_26 str_26 \N +\N \N \N +str_28 
str_28 \N +\N \N \N +30 \N 30 +\N \N \N +32 \N 32 +\N \N \N +34 \N 34 +\N \N \N +str_36 str_36 \N +\N \N \N +str_38 str_38 \N +\N \N \N +----------------------------------------------------------------------------------------------------------- +test1 select +0 \N 0 +1 \N 1 +2 \N 2 +3 \N 3 +4 \N 4 +5 \N 5 +6 \N 6 +7 \N 7 +8 \N 8 +9 \N 9 +10 \N 10 +\N \N \N +12 \N 12 +\N \N \N +14 \N 14 +\N \N \N +16 \N 16 +\N \N \N +18 \N 18 +\N \N \N +str_20 str_20 \N +\N \N \N +str_22 str_22 \N +\N \N \N +str_24 str_24 \N +\N \N \N +str_26 str_26 \N +\N \N \N +str_28 str_28 \N +\N \N \N +30 \N 30 +\N \N \N +32 \N 32 +\N \N \N +34 \N 34 +\N \N \N +str_36 str_36 \N +\N \N \N +str_38 str_38 \N +\N \N \N +----------------------------------------------------------------------------------------------------------- +test2 insert +test2 select +7000000 +1000000 +6000000 +----------------------------------------------------------------------------------------------------------- +test2 select +7000000 +1000000 +6000000 +----------------------------------------------------------------------------------------------------------- +MergeTree wide +test1 insert +test1 select +0 \N 0 +1 \N 1 +2 \N 2 +3 \N 3 +4 \N 4 +5 \N 5 +6 \N 6 +7 \N 7 +8 \N 8 +9 \N 9 +10 \N 10 +\N \N \N +12 \N 12 +\N \N \N +14 \N 14 +\N \N \N +16 \N 16 +\N \N \N +18 \N 18 +\N \N \N +str_20 str_20 \N +\N \N \N +str_22 str_22 \N +\N \N \N +str_24 str_24 \N +\N \N \N +str_26 str_26 \N +\N \N \N +str_28 str_28 \N +\N \N \N +30 \N 30 +\N \N \N +32 \N 32 +\N \N \N +34 \N 34 +\N \N \N +str_36 str_36 \N +\N \N \N +str_38 str_38 \N +\N \N \N +----------------------------------------------------------------------------------------------------------- +test1 select +0 \N 0 +1 \N 1 +2 \N 2 +3 \N 3 +4 \N 4 +5 \N 5 +6 \N 6 +7 \N 7 +8 \N 8 +9 \N 9 +10 \N 10 +\N \N \N +12 \N 12 +\N \N \N +14 \N 14 +\N \N \N +16 \N 16 +\N \N \N +18 \N 18 +\N \N \N +str_20 str_20 \N +\N \N \N +str_22 str_22 \N +\N \N \N +str_24 str_24 \N +\N \N \N +str_26 str_26 \N +\N \N \N +str_28 str_28 \N +\N \N \N +30 \N 30 +\N \N \N +32 \N 32 +\N \N \N +34 \N 34 +\N \N \N +str_36 str_36 \N +\N \N \N +str_38 str_38 \N +\N \N \N +----------------------------------------------------------------------------------------------------------- +test2 insert +test2 select +7000000 +1000000 +6000000 +----------------------------------------------------------------------------------------------------------- +test2 select +7000000 +1000000 +6000000 +----------------------------------------------------------------------------------------------------------- diff --git a/tests/queries/0_stateless/02943_variant_type_with_different_local_and_global_order.sh b/tests/queries/0_stateless/02943_variant_type_with_different_local_and_global_order.sh new file mode 100755 index 00000000000..88bd2d3bd42 --- /dev/null +++ b/tests/queries/0_stateless/02943_variant_type_with_different_local_and_global_order.sh @@ -0,0 +1,82 @@ +#!/usr/bin/env bash + +CUR_DIR=$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd) +# reset --log_comment +CLICKHOUSE_LOG_COMMENT= +# shellcheck source=../shell_config.sh +. 
"$CUR_DIR"/../shell_config.sh + +CH_CLIENT="$CLICKHOUSE_CLIENT --allow_experimental_variant_type=1 --use_variant_when_no_common_type_in_if=1 " + + +function test1_insert() +{ + echo "test1 insert" + $CH_CLIENT -q "insert into test select number, number::Variant(UInt64)::Variant(UInt64, Array(UInt64)) from numbers(10) settings max_block_size=3" + $CH_CLIENT -q "insert into test select number, if(number % 2, NULL, number)::Variant(UInt64)::Variant(UInt64, String, Array(UInt64)) as res from numbers(10, 10) settings max_block_size=3" + $CH_CLIENT -q "insert into test select number, if(number % 2, NULL, 'str_' || toString(number))::Variant(String)::Variant(UInt64, String, Array(UInt64)) as res from numbers(20, 10) settings max_block_size=3" + $CH_CLIENT -q "insert into test select number, if(number < 35, if(number % 2, NULL, number)::Variant(UInt64)::Variant(UInt64, String, Array(UInt64)), if(number % 2, NULL, 'str_' || toString(number))::Variant(String)::Variant(UInt64, String, Array(UInt64))) from numbers(30, 10) settings max_block_size=3" +} + +function test1_select() +{ + echo "test1 select" + $CH_CLIENT -q "select v, v.String, v.UInt64 from test order by id;" + echo "-----------------------------------------------------------------------------------------------------------" +} + +function test2_insert() +{ + echo "test2 insert" + $CH_CLIENT -q "insert into test select number, number::Variant(UInt64)::Variant(UInt64, Array(UInt64)) from numbers(1000000) settings max_insert_block_size = 100000, min_insert_block_size_rows=100000" + $CH_CLIENT -q "insert into test select number, if(number % 2, NULL, number)::Variant(UInt64)::Variant(UInt64, String, Array(UInt64)) as res from numbers(1000000, 10000000) settings max_insert_block_size = 100000, min_insert_block_size_rows=100000" + $CH_CLIENT -q "insert into test select number, if(number % 2, NULL, 'str_' || toString(number))::Variant(String)::Variant(UInt64, String, Array(UInt64)) as res from numbers(2000000, 1000000) settings max_insert_block_size = 100000, min_insert_block_size_rows=100000" + $CH_CLIENT -q "insert into test select number, if(number < 5, if(number % 2, NULL, number)::Variant(UInt64)::Variant(UInt64, String, Array(UInt64)), if(number % 2, NULL, 'str_' || toString(number))::Variant(String)::Variant(UInt64, String, Array(UInt64))) from numbers(3000000, 1000000) settings max_insert_block_size = 100000, min_insert_block_size_rows=100000" + } + +function test2_select() +{ + echo "test2 select" + $CH_CLIENT -q "select v from test format Null;" + $CH_CLIENT -q "select count() from test where isNotNull(v);" + $CH_CLIENT -q "select v.String from test format Null;" + $CH_CLIENT -q "select count() from test where isNotNull(v.String);" + $CH_CLIENT -q "select v.UInt64 from test format Null;" + $CH_CLIENT -q "select count() from test where isNotNull(v.UInt64);" + echo "-----------------------------------------------------------------------------------------------------------" +} + +function run() +{ + test1_insert + test1_select + if [ $1 == 1 ]; then + $CH_CLIENT -q "optimize table test final;" + test1_select + fi + $CH_CLIENT -q "truncate table test;" + test2_insert + test2_select + if [ $1 == 1 ]; then + $CH_CLIENT -q "optimize table test final;" + test2_select + fi + $CH_CLIENT -q "truncate table test;" +} + +$CH_CLIENT -q "drop table if exists test;" + +echo "Memory" +$CH_CLIENT -q "create table test (id UInt64, v Variant(UInt64, String, Array(UInt64))) engine=Memory;" +run 0 +$CH_CLIENT -q "drop table test;" + +echo "MergeTree compact" 
+$CH_CLIENT -q "create table test (id UInt64, v Variant(UInt64, String, Array(UInt64))) engine=MergeTree order by id settings min_rows_for_wide_part=100000000, min_bytes_for_wide_part=1000000000;" +run 1 +$CH_CLIENT -q "drop table test;" + +echo "MergeTree wide" +$CH_CLIENT -q "create table test (id UInt64, v Variant(UInt64, String, Array(UInt64))) engine=MergeTree order by id settings min_rows_for_wide_part=1, min_bytes_for_wide_part=1;" +run 1 +$CH_CLIENT -q "drop table test;" diff --git a/tests/queries/0_stateless/02944_variant_as_if_multi_if_result.sql b/tests/queries/0_stateless/02944_variant_as_if_multi_if_result.sql index da36863bfda..1121b21e383 100644 --- a/tests/queries/0_stateless/02944_variant_as_if_multi_if_result.sql +++ b/tests/queries/0_stateless/02944_variant_as_if_multi_if_result.sql @@ -1,3 +1,4 @@ +set allow_experimental_analyzer=0; -- The result type for if function with constant is different with analyzer. set allow_experimental_variant_type=1; set use_variant_when_no_common_type_in_if=1; From 38ec9b5f719740b4e94758f9e5578acd562df939 Mon Sep 17 00:00:00 2001 From: avogar Date: Fri, 22 Dec 2023 00:11:39 +0000 Subject: [PATCH 034/245] Fix variant element deserialization --- .../Serializations/SerializationVariant.cpp | 27 ++-- .../SerializationVariantElement.cpp | 149 ++++++++++-------- ...different_local_and_global_order.reference | 30 ++-- ...e_with_different_local_and_global_order.sh | 8 +- 4 files changed, 117 insertions(+), 97 deletions(-) diff --git a/src/DataTypes/Serializations/SerializationVariant.cpp b/src/DataTypes/Serializations/SerializationVariant.cpp index ebd44fd6955..910ad1da303 100644 --- a/src/DataTypes/Serializations/SerializationVariant.cpp +++ b/src/DataTypes/Serializations/SerializationVariant.cpp @@ -277,13 +277,10 @@ void SerializationVariant::deserializeBinaryBulkWithMultipleStreams( throw Exception(ErrorCodes::LOGICAL_ERROR, "Trying to deserialize data into Variant column with not global variants order"); /// First, deserialize new discriminators. - /// We deserialize them into a separate column to be able to use substream cache, - /// so if we also need to deserialize some of sub columns, we will read discriminators only once. settings.path.push_back(Substream::VariantDiscriminators); - ColumnPtr discriminators; if (auto cached_discriminators = getFromSubstreamsCache(cache, settings.path)) { - discriminators = cached_discriminators; + col.getLocalDiscriminatorsPtr() = cached_discriminators; } else { @@ -291,29 +288,31 @@ void SerializationVariant::deserializeBinaryBulkWithMultipleStreams( if (!discriminators_stream) throw Exception(ErrorCodes::LOGICAL_ERROR, "Got empty stream for VariantDiscriminators in SerializationVariant::deserializeBinaryBulkWithMultipleStreams"); - discriminators = ColumnVariant::ColumnDiscriminators::create(); - SerializationNumber().deserializeBinaryBulk(*discriminators->assumeMutable(), *discriminators_stream, limit, 0); - addToSubstreamsCache(cache, settings.path, discriminators); + SerializationNumber().deserializeBinaryBulk(*col.getLocalDiscriminatorsPtr()->assumeMutable(), *discriminators_stream, limit, 0); + addToSubstreamsCache(cache, settings.path, col.getLocalDiscriminatorsPtr()); } settings.path.pop_back(); - /// Iterate through new discriminators, append them to column and calculate the limit for each variant. + /// Iterate through new discriminators and calculate the limit for each variant. /// While calculating limits we can also fill offsets column (we store offsets only in memory). 
-    const auto & discriminators_data = assert_cast<const ColumnVariant::ColumnDiscriminators &>(*discriminators).getData();
-    auto & local_discriminators = col.getLocalDiscriminators();
-    local_discriminators.reserve(local_discriminators.size() + limit);
+    auto & discriminators_data = col.getLocalDiscriminators();
     auto & offsets = col.getOffsets();
     offsets.reserve(offsets.size() + limit);
 
     std::vector<size_t> variant_limits(variants.size(), 0);
-    for (size_t i = 0; i != limit; ++i)
+    size_t discriminators_offset = discriminators_data.size() - limit;
+    for (size_t i = discriminators_offset; i != discriminators_data.size(); ++i)
     {
         ColumnVariant::Discriminator discr = discriminators_data[i];
-        local_discriminators.push_back(discr);
         if (discr == ColumnVariant::NULL_DISCRIMINATOR)
+        {
             offsets.emplace_back();
+        }
         else
-            offsets.push_back(col.getVariantByLocalDiscriminator(discr).size() + variant_limits[discr]++);
+        {
+            offsets.push_back(col.getVariantByLocalDiscriminator(discr).size() + variant_limits[discr]);
+            ++variant_limits[discr];
+        }
     }
 
     /// Now we can deserialize variants according to their limits.
diff --git a/src/DataTypes/Serializations/SerializationVariantElement.cpp b/src/DataTypes/Serializations/SerializationVariantElement.cpp
index 1c0808db2a0..e06a20d2990 100644
--- a/src/DataTypes/Serializations/SerializationVariantElement.cpp
+++ b/src/DataTypes/Serializations/SerializationVariantElement.cpp
@@ -40,11 +40,31 @@ void SerializationVariantElement::serializeBinaryBulkStateSuffix(SerializeBinary
         ErrorCodes::NOT_IMPLEMENTED,
         "Method serializeBinaryBulkStateSuffix is not implemented for SerializationVariantElement");
 }
 
+struct DeserializeBinaryBulkStateVariantElement : public ISerialization::DeserializeBinaryBulkState
+{
+    /// During deserialization, discriminators and variant streams can be shared.
+    /// For example we can read several variant elements together: "select v.UInt32, v.String from table",
+    /// or we can read the whole variant and some of the variant elements: "select v, v.UInt32 from table".
+    /// To read the same column from the same stream more than once we use substream cache,
+    /// but this cache stores the whole column, not only the current range.
+    /// During deserialization of a variant element, discriminators and variant columns are not stored
+    /// in the result column, so we need to store them inside deserialization state, so we can use
+    /// substream cache correctly.
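+    /// Note: both columns accumulate rows across deserialize calls and are reset only when
+    /// reading of a new result column starts (see the result_column->empty() checks below).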
+ ColumnPtr discriminators; + ColumnPtr variant; + + ISerialization::DeserializeBinaryBulkStatePtr variant_element_state; +}; + void SerializationVariantElement::deserializeBinaryBulkStatePrefix(DeserializeBinaryBulkSettings & settings, DeserializeBinaryBulkStatePtr & state) const { + auto variant_element_state = std::make_shared(); + addVariantToPath(settings.path); - nested_serialization->deserializeBinaryBulkStatePrefix(settings, state); + nested_serialization->deserializeBinaryBulkStatePrefix(settings, variant_element_state->variant_element_state); removeVariantFromPath(settings.path); + + state = std::move(variant_element_state); } void SerializationVariantElement::serializeBinaryBulkWithMultipleStreams(const IColumn &, size_t, size_t, SerializeBinaryBulkSettings &, SerializeBinaryBulkStatePtr &) const @@ -53,22 +73,19 @@ void SerializationVariantElement::serializeBinaryBulkWithMultipleStreams(const I } void SerializationVariantElement::deserializeBinaryBulkWithMultipleStreams( - ColumnPtr & column, + ColumnPtr & result_column, size_t limit, DeserializeBinaryBulkSettings & settings, DeserializeBinaryBulkStatePtr & state, SubstreamsCache * cache) const { - auto mutable_column = column->assumeMutable(); - ColumnNullable * nullable_col = typeid_cast(mutable_column.get()); - NullMap * null_map = nullable_col ? &nullable_col->getNullMapData() : nullptr; + auto * variant_element_state = checkAndGetState(state); /// First, deserialize discriminators from Variant column. settings.path.push_back(Substream::VariantDiscriminators); - ColumnPtr discriminators; if (auto cached_discriminators = getFromSubstreamsCache(cache, settings.path)) { - discriminators = cached_discriminators; + variant_element_state->discriminators = cached_discriminators; } else { @@ -76,85 +93,87 @@ void SerializationVariantElement::deserializeBinaryBulkWithMultipleStreams( if (!discriminators_stream) throw Exception(ErrorCodes::LOGICAL_ERROR, "Got empty stream for VariantDiscriminators in SerializationVariantElement::deserializeBinaryBulkWithMultipleStreams"); - discriminators = ColumnVariant::ColumnDiscriminators::create(); - SerializationNumber().deserializeBinaryBulk(*discriminators->assumeMutable(), *discriminators_stream, limit, 0); - addToSubstreamsCache(cache, settings.path, discriminators); + /// If we started to read a new column, reinitialize discriminators column in deserialization state. + if (!variant_element_state->discriminators || result_column->empty()) + variant_element_state->discriminators = ColumnVariant::ColumnDiscriminators::create(); + + SerializationNumber().deserializeBinaryBulk(*variant_element_state->discriminators->assumeMutable(), *discriminators_stream, limit, 0); + addToSubstreamsCache(cache, settings.path, variant_element_state->discriminators); } settings.path.pop_back(); - /// Iterate through discriminators to calculate the size of the variant. - const auto & discriminators_data = assert_cast(*discriminators).getData(); - size_t variant_size = 0; - for (auto discr : discriminators_data) - variant_size += discr == variant_discriminator; + /// Iterate through new discriminators to calculate the limit for our variant. 
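+    /// Only the last `limit` discriminators belong to the current range; everything before them
+    /// was already deserialized by previous calls and is kept in the state.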
+    const auto & discriminators_data = assert_cast<const ColumnVariant::ColumnDiscriminators &>(*variant_element_state->discriminators).getData();
+    size_t discriminators_offset = variant_element_state->discriminators->size() - limit;
+    size_t variant_limit = 0;
+    for (size_t i = discriminators_offset; i != discriminators_data.size(); ++i)
+        variant_limit += (discriminators_data[i] == variant_discriminator);
 
-    /// Now we know the size of the variant and can deserialize it.
+    /// Now we know the limit for our variant and can deserialize it.
 
-    /// If the size of variant column is the same as the size of discriminators,
-    /// we can deserialize new values directly into our column.
-    if (variant_size == discriminators_data.size())
+    /// If result column is Nullable, fill null map and extract nested column.
+    MutableColumnPtr mutable_column = result_column->assumeMutable();
+    if (isColumnNullable(*mutable_column))
     {
-        addVariantToPath(settings.path);
-        /// Special case when our result column is LowCardinality(Nullable(T)).
-        /// In this case the variant type is LowCardinality(T), and we cannot just
-        /// deserialize its values directly into LowCardinality(Nullable(T)) column.
-        /// We create a separate column with type LowCardinality(T), deserialize
-        /// values into it and then insert into result column using insertRangeFrom.
-        if (isColumnLowCardinalityNullable(*column))
+        auto & nullable_column = assert_cast<ColumnNullable &>(*mutable_column);
+        NullMap & null_map = nullable_column.getNullMapData();
+        /// If we have only our discriminator in range, fill null map with 0.
+        if (variant_limit == limit)
         {
-            ColumnPtr variant_col = mutable_column->cloneEmpty();
-            /// LowCardinality(Nullable(T)) -> LowCardinality(T)
-            assert_cast<ColumnLowCardinality &>(*variant_col->assumeMutable()).nestedRemoveNullable();
-            nested_serialization->deserializeBinaryBulkWithMultipleStreams(variant_col, limit, settings, state, cache);
-            mutable_column->insertRangeFrom(*variant_col, 0, variant_col->size());
+            null_map.resize_fill(null_map.size() + limit, 0);
         }
+        /// If our discriminator doesn't appear in the current range, fill null map with 1.
+        else if (variant_limit == 0)
+        {
+            null_map.resize_fill(null_map.size() + limit, 1);
+        }
+        /// Otherwise we should iterate through discriminators to fill null map.
         else
         {
-            nested_serialization->deserializeBinaryBulkWithMultipleStreams(nullable_col ? nullable_col->getNestedColumnPtr() : column, limit, settings, state, cache);
+            null_map.reserve(null_map.size() + limit);
+            for (size_t i = discriminators_offset; i != discriminators_data.size(); ++i)
+                null_map.push_back(discriminators_data[i] != variant_discriminator);
         }
-        if (nullable_col)
-            null_map->resize_fill(null_map->size() + limit, 0);
-        removeVariantFromPath(settings.path);
-        return;
+
+        mutable_column = nullable_column.getNestedColumnPtr()->assumeMutable();
     }
 
-    /// If variant size is 0, just fill column with default values.
-    if (variant_size == 0)
+    /// If we started to read a new column, reinitialize variant column in deserialization state.
+    if (!variant_element_state->variant || result_column->empty())
     {
-        mutable_column->insertManyDefaults(limit);
-        return;
+        variant_element_state->variant = mutable_column->cloneEmpty();
+
+        /// When result column is LowCardinality(Nullable(T)) we should
+        /// remove Nullable from variant column before deserialization.
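+        /// (the variant column itself has type LowCardinality(T), since Variant never
+        /// wraps its nested variants in Nullable).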
+        if (isColumnLowCardinalityNullable(*mutable_column))
+            assert_cast<ColumnLowCardinality &>(*variant_element_state->variant->assumeMutable()).nestedRemoveNullable();
     }
 
-    /// In general case we should deserialize variant into a separate column,
-    /// iterate through discriminators and insert values from variant only when
-    /// row contains its discriminator and default value otherwise.
-    mutable_column->reserve(mutable_column->size() + limit);
-    mutable_column = nullable_col ? nullable_col->getNestedColumnPtr()->assumeMutable() : std::move(mutable_column);
-    ColumnPtr variant_col = mutable_column->cloneEmpty();
-
-    /// Special case when our result column is LowCardinality(Nullable(T)).
-    /// We should remove Nullable from variant column before deserialization.
-    if (isColumnLowCardinalityNullable(*column))
-        assert_cast<ColumnLowCardinality &>(*variant_col->assumeMutable()).nestedRemoveNullable();
-
     addVariantToPath(settings.path);
-    nested_serialization->deserializeBinaryBulkWithMultipleStreams(variant_col, variant_size, settings, state, cache);
+    nested_serialization->deserializeBinaryBulkWithMultipleStreams(variant_element_state->variant, variant_limit, settings, variant_element_state->variant_element_state, cache);
     removeVariantFromPath(settings.path);
 
-    size_t variant_index = 0;
-    for (auto discr : discriminators_data)
+    size_t variant_offset = variant_element_state->variant->size() - variant_limit;
+
+    /// If we don't have our discriminator in range, just insert defaults.
+    if (variant_limit == 0)
     {
-        if (discr == variant_discriminator)
+        mutable_column->insertManyDefaults(limit);
+    }
+    /// If we have only our discriminator in range, insert the whole range into the result column.
+    else if (variant_limit == limit)
+    {
+        mutable_column->insertRangeFrom(*variant_element_state->variant, variant_offset, variant_limit);
+    }
+    /// Otherwise iterate through discriminators and insert value from variant or default value depending on the discriminator.
+ else + { + for (size_t i = discriminators_offset; i != discriminators_data.size(); ++i) { - if (null_map) - null_map->push_back(0); - mutable_column->insertFrom(*variant_col, variant_index++); - } - else - { - if (null_map) - null_map->push_back(1); - mutable_column->insertDefault(); + if (discriminators_data[i] == variant_discriminator) + mutable_column->insertFrom(*variant_element_state->variant, variant_offset++); + else + mutable_column->insertDefault(); } } } diff --git a/tests/queries/0_stateless/02943_variant_type_with_different_local_and_global_order.reference b/tests/queries/0_stateless/02943_variant_type_with_different_local_and_global_order.reference index f2e355824f9..1736a307c42 100644 --- a/tests/queries/0_stateless/02943_variant_type_with_different_local_and_global_order.reference +++ b/tests/queries/0_stateless/02943_variant_type_with_different_local_and_global_order.reference @@ -44,9 +44,9 @@ str_38 str_38 \N ----------------------------------------------------------------------------------------------------------- test2 insert test2 select -7000000 -1000000 -6000000 +2500000 +750000 +1750000 ----------------------------------------------------------------------------------------------------------- MergeTree compact test1 insert @@ -136,14 +136,14 @@ str_38 str_38 \N ----------------------------------------------------------------------------------------------------------- test2 insert test2 select -7000000 -1000000 -6000000 +2500000 +750000 +1750000 ----------------------------------------------------------------------------------------------------------- test2 select -7000000 -1000000 -6000000 +2500000 +750000 +1750000 ----------------------------------------------------------------------------------------------------------- MergeTree wide test1 insert @@ -233,12 +233,12 @@ str_38 str_38 \N ----------------------------------------------------------------------------------------------------------- test2 insert test2 select -7000000 -1000000 -6000000 +2500000 +750000 +1750000 ----------------------------------------------------------------------------------------------------------- test2 select -7000000 -1000000 -6000000 +2500000 +750000 +1750000 ----------------------------------------------------------------------------------------------------------- diff --git a/tests/queries/0_stateless/02943_variant_type_with_different_local_and_global_order.sh b/tests/queries/0_stateless/02943_variant_type_with_different_local_and_global_order.sh index 88bd2d3bd42..9f4df8d7466 100755 --- a/tests/queries/0_stateless/02943_variant_type_with_different_local_and_global_order.sh +++ b/tests/queries/0_stateless/02943_variant_type_with_different_local_and_global_order.sh @@ -1,4 +1,5 @@ #!/usr/bin/env bash +# Tags: long CUR_DIR=$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd) # reset --log_comment @@ -29,14 +30,15 @@ function test2_insert() { echo "test2 insert" $CH_CLIENT -q "insert into test select number, number::Variant(UInt64)::Variant(UInt64, Array(UInt64)) from numbers(1000000) settings max_insert_block_size = 100000, min_insert_block_size_rows=100000" - $CH_CLIENT -q "insert into test select number, if(number % 2, NULL, number)::Variant(UInt64)::Variant(UInt64, String, Array(UInt64)) as res from numbers(1000000, 10000000) settings max_insert_block_size = 100000, min_insert_block_size_rows=100000" + $CH_CLIENT -q "insert into test select number, if(number % 2, NULL, number)::Variant(UInt64)::Variant(UInt64, String, Array(UInt64)) as res from numbers(1000000, 1000000) settings 
max_insert_block_size = 100000, min_insert_block_size_rows=100000" $CH_CLIENT -q "insert into test select number, if(number % 2, NULL, 'str_' || toString(number))::Variant(String)::Variant(UInt64, String, Array(UInt64)) as res from numbers(2000000, 1000000) settings max_insert_block_size = 100000, min_insert_block_size_rows=100000" - $CH_CLIENT -q "insert into test select number, if(number < 5, if(number % 2, NULL, number)::Variant(UInt64)::Variant(UInt64, String, Array(UInt64)), if(number % 2, NULL, 'str_' || toString(number))::Variant(String)::Variant(UInt64, String, Array(UInt64))) from numbers(3000000, 1000000) settings max_insert_block_size = 100000, min_insert_block_size_rows=100000" - } + $CH_CLIENT -q "insert into test select number, if(number < 3500000, if(number % 2, NULL, number)::Variant(UInt64)::Variant(UInt64, String, Array(UInt64)), if(number % 2, NULL, 'str_' || toString(number))::Variant(String)::Variant(UInt64, String, Array(UInt64))) from numbers(3000000, 1000000) settings max_insert_block_size = 100000, min_insert_block_size_rows=100000" +} function test2_select() { echo "test2 select" + $CH_CLIENT -q "select v, v.String, v.UInt64 from test format Null;" $CH_CLIENT -q "select v from test format Null;" $CH_CLIENT -q "select count() from test where isNotNull(v);" $CH_CLIENT -q "select v.String from test format Null;" From 319c20091efe8eebee5bde9bb8bae67e58a589d9 Mon Sep 17 00:00:00 2001 From: avogar Date: Fri, 22 Dec 2023 00:15:44 +0000 Subject: [PATCH 035/245] Fix comments --- src/DataTypes/Serializations/SerializationVariant.cpp | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/src/DataTypes/Serializations/SerializationVariant.cpp b/src/DataTypes/Serializations/SerializationVariant.cpp index 910ad1da303..3b51c51872f 100644 --- a/src/DataTypes/Serializations/SerializationVariant.cpp +++ b/src/DataTypes/Serializations/SerializationVariant.cpp @@ -276,7 +276,7 @@ void SerializationVariant::deserializeBinaryBulkWithMultipleStreams( if (!col.hasGlobalVariantsOrder()) throw Exception(ErrorCodes::LOGICAL_ERROR, "Trying to deserialize data into Variant column with not global variants order"); - /// First, deserialize new discriminators. + /// First, deserialize discriminators. settings.path.push_back(Substream::VariantDiscriminators); if (auto cached_discriminators = getFromSubstreamsCache(cache, settings.path)) { @@ -451,7 +451,7 @@ std::unordered_map getTypesTextDeserializePriorityMap() /// then for types with the same depth we sort by the types priority, and last we sort by the depth of LowCardinality/Nullable types, /// so if we have types with the same level of nesting and the same priority, we will first try to deserialize LowCardinality/Nullable types /// (for example if we have types Array(Array(String)) and Array(Array(Nullable(String))). -/// This is just a batch of heuristics, +/// This is just a batch of heuristics. std::tuple getTypeTextDeserializePriority(const DataTypePtr & type, size_t nested_depth, size_t simple_nested_depth, std::unordered_map & priority_map) { if (const auto * nullable_type = typeid_cast(type.get())) @@ -553,7 +553,7 @@ bool SerializationVariant::tryDeserializeImpl( for (size_t global_discr : deserialize_text_order) { ReadBufferFromString variant_buf(field); - /// Usually try_deserialize_variant should not throw an exception, but let's use try/catch just in case. + /// Usually try_deserialize_variant should not throw any exception, but let's use try/catch just in case. 
try { auto & variant_column = column_variant.getVariantByGlobalDiscriminator(global_discr); From a61efedba8854e8f06b549deb595315ee40eb303 Mon Sep 17 00:00:00 2001 From: avogar Date: Sat, 23 Dec 2023 00:10:58 +0000 Subject: [PATCH 036/245] Fix serialization again, add more tests --- src/Columns/ColumnVariant.h | 2 +- src/Columns/ColumnVector.cpp | 2 +- src/DataTypes/DataTypeVariant.cpp | 15 +++++- .../Serializations/ISerialization.cpp | 7 +++ src/DataTypes/Serializations/ISerialization.h | 1 + .../Serializations/SerializationVariant.cpp | 52 ++++++++++++++----- .../02943_variant_read_subcolumns_1.reference | 6 +++ .../02943_variant_read_subcolumns_1.sh | 38 ++++++++++++++ .../02943_variant_read_subcolumns_2.reference | 6 +++ .../02943_variant_read_subcolumns_2.sh | 38 ++++++++++++++ 10 files changed, 150 insertions(+), 17 deletions(-) create mode 100644 tests/queries/0_stateless/02943_variant_read_subcolumns_1.reference create mode 100755 tests/queries/0_stateless/02943_variant_read_subcolumns_1.sh create mode 100644 tests/queries/0_stateless/02943_variant_read_subcolumns_2.reference create mode 100755 tests/queries/0_stateless/02943_variant_read_subcolumns_2.sh diff --git a/src/Columns/ColumnVariant.h b/src/Columns/ColumnVariant.h index b388b118a69..ec58553f5f3 100644 --- a/src/Columns/ColumnVariant.h +++ b/src/Columns/ColumnVariant.h @@ -163,7 +163,7 @@ public: size_t size() const override { - return local_discriminators->size(); + return offsets->size(); } Field operator[](size_t n) const override; diff --git a/src/Columns/ColumnVector.cpp b/src/Columns/ColumnVector.cpp index 37e62c76596..b4e3fee5e42 100644 --- a/src/Columns/ColumnVector.cpp +++ b/src/Columns/ColumnVector.cpp @@ -469,7 +469,7 @@ void ColumnVector::insertRangeFrom(const IColumn & src, size_t start, size_t const ColumnVector & src_vec = assert_cast(src); if (start + length > src_vec.data.size()) - throw Exception(ErrorCodes::PARAMETER_OUT_OF_BOUND, + throw Exception(ErrorCodes::LOGICAL_ERROR, "Parameters start = {}, length = {} are out of bound " "in ColumnVector::insertRangeFrom method (data.size() = {}).", toString(start), toString(length), toString(src_vec.data.size())); diff --git a/src/DataTypes/DataTypeVariant.cpp b/src/DataTypes/DataTypeVariant.cpp index 0575f220f22..5dc42cc7443 100644 --- a/src/DataTypes/DataTypeVariant.cpp +++ b/src/DataTypes/DataTypeVariant.cpp @@ -108,7 +108,20 @@ ColumnPtr DataTypeVariant::createColumnConst(size_t size, const DB::Field & fiel auto field_type = applyVisitor(FieldToDataType(), field); auto discr = tryGetVariantDiscriminator(field_type); if (!discr) - throw Exception(ErrorCodes::BAD_ARGUMENTS, "Cannot insert field \"{}\" into column with type {}", toString(field), getName()); + { + for (size_t i = 0; i != variants.size(); ++i) + { + if (field.getType() == variants[i]->getDefault().getType()) + { + discr = i; + break; + } + } + } + + if (!discr) + throw Exception(ErrorCodes::BAD_ARGUMENTS, "Cannot insert field \"{}\" with type {} into column with type {}", toString(field), field.getTypeName(), getName()); + assert_cast(*column).insertIntoVariant(field, *discr); } diff --git a/src/DataTypes/Serializations/ISerialization.cpp b/src/DataTypes/Serializations/ISerialization.cpp index 86a37949dc8..46353fffb48 100644 --- a/src/DataTypes/Serializations/ISerialization.cpp +++ b/src/DataTypes/Serializations/ISerialization.cpp @@ -124,15 +124,20 @@ void ISerialization::deserializeBinaryBulkWithMultipleStreams( DeserializeBinaryBulkStatePtr & /* state */, SubstreamsCache * cache) const { + 
LOG_DEBUG(&Poco::Logger::get("ISerialization"), "Deserialize path {}. Initial column size: {}", settings.path.toString(), column->size()); + auto cached_column = getFromSubstreamsCache(cache, settings.path); if (cached_column) { column = cached_column; + LOG_DEBUG(&Poco::Logger::get("ISerialization"), "Use column from cache. Size: {}", cached_column->size()); } else if (ReadBuffer * stream = settings.getter(settings.path)) { auto mutable_column = column->assumeMutable(); + LOG_DEBUG(&Poco::Logger::get("ISerialization"), "Deserialize column. Initial size: {}", mutable_column->size()); deserializeBinaryBulk(*mutable_column, *stream, limit, settings.avg_value_size_hint); + LOG_DEBUG(&Poco::Logger::get("ISerialization"), "Deserialized column. Size: {}", mutable_column->size()); column = std::move(mutable_column); addToSubstreamsCache(cache, settings.path, column); } @@ -177,6 +182,8 @@ String getNameForSubstreamPath( } else if (it->type == Substream::VariantDiscriminators) stream_name += ".discr"; + else if (it->type == Substream::VariantOffsets) + stream_name += ".variant_offsets"; else if (it->type == Substream::VariantElement) stream_name += "." + it->variant_element_name; } diff --git a/src/DataTypes/Serializations/ISerialization.h b/src/DataTypes/Serializations/ISerialization.h index f0273f59d1f..5c6fe31ed9e 100644 --- a/src/DataTypes/Serializations/ISerialization.h +++ b/src/DataTypes/Serializations/ISerialization.h @@ -153,6 +153,7 @@ public: ObjectData, VariantDiscriminators, + VariantOffsets, VariantElements, VariantElement, diff --git a/src/DataTypes/Serializations/SerializationVariant.cpp b/src/DataTypes/Serializations/SerializationVariant.cpp index 3b51c51872f..d36151fe8e9 100644 --- a/src/DataTypes/Serializations/SerializationVariant.cpp +++ b/src/DataTypes/Serializations/SerializationVariant.cpp @@ -291,28 +291,17 @@ void SerializationVariant::deserializeBinaryBulkWithMultipleStreams( SerializationNumber().deserializeBinaryBulk(*col.getLocalDiscriminatorsPtr()->assumeMutable(), *discriminators_stream, limit, 0); addToSubstreamsCache(cache, settings.path, col.getLocalDiscriminatorsPtr()); } - settings.path.pop_back(); - /// Iterate through new discriminators and calculate the limit for each variant. - /// While calculating limits we can also fill offsets column (we store offsets only in memory). - auto & discriminators_data = col.getLocalDiscriminators(); - auto & offsets = col.getOffsets(); - offsets.reserve(offsets.size() + limit); + /// Second, calculate limits for each variant by iterating through new discriminators. std::vector variant_limits(variants.size(), 0); + auto & discriminators_data = col.getLocalDiscriminators(); size_t discriminators_offset = discriminators_data.size() - limit; for (size_t i = discriminators_offset; i != discriminators_data.size(); ++i) { ColumnVariant::Discriminator discr = discriminators_data[i]; - if (discr == ColumnVariant::NULL_DISCRIMINATOR) - { - offsets.emplace_back(); - } - else - { - offsets.push_back(col.getVariantByLocalDiscriminator(discr).size() + variant_limits[discr]); + if (discr != ColumnVariant::NULL_DISCRIMINATOR) ++variant_limits[discr]; - } } /// Now we can deserialize variants according to their limits. @@ -325,6 +314,41 @@ void SerializationVariant::deserializeBinaryBulkWithMultipleStreams( settings.path.pop_back(); } settings.path.pop_back(); + + /// Fill offsets column. 
+    /// It's important to do it after deserialization of all variants, because to fill offsets we need
+    /// initial variant sizes without values in the current range, but some variants can be shared with
+    /// other columns via substream cache and they can already contain values from this range even
+    /// before we call deserialize for them. So, before deserializing we cannot know for sure if
+    /// variant columns already contain values from the current range or not. But after calling deserialize
+    /// we know for sure that they contain these values, so we can use variant limits and their
+    /// new sizes to calculate correct offsets.
+    settings.path.push_back(Substream::VariantOffsets);
+    if (auto cached_offsets = getFromSubstreamsCache(cache, settings.path))
+    {
+        col.getOffsetsPtr() = cached_offsets;
+    }
+    else
+    {
+        auto & offsets = col.getOffsets();
+        offsets.reserve(offsets.size() + limit);
+        std::vector<size_t> variant_offsets;
+        variant_offsets.reserve(variants.size());
+        for (size_t i = 0; i != variants.size(); ++i)
+            variant_offsets.push_back(col.getVariantByLocalDiscriminator(i).size() - variant_limits[i]);
+
+        for (size_t i = discriminators_offset; i != discriminators_data.size(); ++i)
+        {
+            ColumnVariant::Discriminator discr = discriminators_data[i];
+            if (discr == ColumnVariant::NULL_DISCRIMINATOR)
+                offsets.emplace_back();
+            else
+                offsets.push_back(variant_offsets[discr]++);
+        }
+
+        addToSubstreamsCache(cache, settings.path, col.getOffsetsPtr());
+    }
+    settings.path.pop_back();
 }
 
 void SerializationVariant::addVariantElementToPath(DB::ISerialization::SubstreamPath & path, size_t i) const
diff --git a/tests/queries/0_stateless/02943_variant_read_subcolumns_1.reference b/tests/queries/0_stateless/02943_variant_read_subcolumns_1.reference
new file mode 100644
index 00000000000..4b93782cddf
--- /dev/null
+++ b/tests/queries/0_stateless/02943_variant_read_subcolumns_1.reference
@@ -0,0 +1,6 @@
+Memory
+test
+MergeTree compact
+test
+MergeTree wide
+test
diff --git a/tests/queries/0_stateless/02943_variant_read_subcolumns_1.sh b/tests/queries/0_stateless/02943_variant_read_subcolumns_1.sh
new file mode 100755
index 00000000000..9ccad55191f
--- /dev/null
+++ b/tests/queries/0_stateless/02943_variant_read_subcolumns_1.sh
@@ -0,0 +1,38 @@
+#!/usr/bin/env bash
+# Tags: long
+
+CUR_DIR=$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd)
+# reset --log_comment
+CLICKHOUSE_LOG_COMMENT=
+# shellcheck source=../shell_config.sh
+. 
"$CUR_DIR"/../shell_config.sh + +CH_CLIENT="$CLICKHOUSE_CLIENT --allow_experimental_variant_type=1 --use_variant_when_no_common_type_in_if=1 " + + +function test() +{ + echo "test" + $CH_CLIENT -q "insert into test select number, multiIf(number % 3 == 2, NULL, number % 3 == 1, number, arrayMap(x -> multiIf(number % 9 == 0, NULL, number % 9 == 3, 'str_' || toString(number), number), range(number % 10))) from numbers(1000000) settings min_insert_block_size_rows=100000" + $CH_CLIENT -q "select v, v.UInt64, v.\`Array(Variant(String, UInt64))\`, v.\`Array(Variant(String, UInt64))\`.size0, v.\`Array(Variant(String, UInt64))\`.UInt64 from test order by id format Null" + $CH_CLIENT -q "select v.UInt64, v.\`Array(Variant(String, UInt64))\`, v.\`Array(Variant(String, UInt64))\`.size0, v.\`Array(Variant(String, UInt64))\`.UInt64 from test order by id format Null" + $CH_CLIENT -q "select v.\`Array(Variant(String, UInt64))\`, v.\`Array(Variant(String, UInt64))\`.size0, v.\`Array(Variant(String, UInt64))\`.UInt64, v.\`Array(Variant(String, UInt64))\`.String from test order by id format Null" +} + +$CH_CLIENT -q "drop table if exists test;" + +echo "Memory" +$CH_CLIENT -q "create table test (id UInt64, v Variant(UInt64, Array(Variant(String, UInt64)))) engine=Memory" +test +$CH_CLIENT -q "drop table test;" + +echo "MergeTree compact" +$CH_CLIENT -q "create table test (id UInt64, v Variant(UInt64, Array(Variant(String, UInt64)))) engine=MergeTree order by id settings min_rows_for_wide_part=1000000000, min_bytes_for_wide_part=10000000000;" +test +$CH_CLIENT -q "drop table test;" + +echo "MergeTree wide" +$CH_CLIENT -q "create table test (id UInt64, v Variant(UInt64, Array(Variant(String, UInt64)))) engine=MergeTree order by id settings min_rows_for_wide_part=1, min_bytes_for_wide_part=1;" +test +$CH_CLIENT -q "drop table test;" + diff --git a/tests/queries/0_stateless/02943_variant_read_subcolumns_2.reference b/tests/queries/0_stateless/02943_variant_read_subcolumns_2.reference new file mode 100644 index 00000000000..4b93782cddf --- /dev/null +++ b/tests/queries/0_stateless/02943_variant_read_subcolumns_2.reference @@ -0,0 +1,6 @@ +Memory +test +MergeTree compact +test +MergeTree wide +test diff --git a/tests/queries/0_stateless/02943_variant_read_subcolumns_2.sh b/tests/queries/0_stateless/02943_variant_read_subcolumns_2.sh new file mode 100755 index 00000000000..9ccad55191f --- /dev/null +++ b/tests/queries/0_stateless/02943_variant_read_subcolumns_2.sh @@ -0,0 +1,38 @@ +#!/usr/bin/env bash +# Tags: long + +CUR_DIR=$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd) +# reset --log_comment +CLICKHOUSE_LOG_COMMENT= +# shellcheck source=../shell_config.sh +. 
"$CUR_DIR"/../shell_config.sh + +CH_CLIENT="$CLICKHOUSE_CLIENT --allow_experimental_variant_type=1 --use_variant_when_no_common_type_in_if=1 " + + +function test() +{ + echo "test" + $CH_CLIENT -q "insert into test select number, multiIf(number % 3 == 2, NULL, number % 3 == 1, number, arrayMap(x -> multiIf(number % 9 == 0, NULL, number % 9 == 3, 'str_' || toString(number), number), range(number % 10))) from numbers(1000000) settings min_insert_block_size_rows=100000" + $CH_CLIENT -q "select v, v.UInt64, v.\`Array(Variant(String, UInt64))\`, v.\`Array(Variant(String, UInt64))\`.size0, v.\`Array(Variant(String, UInt64))\`.UInt64 from test order by id format Null" + $CH_CLIENT -q "select v.UInt64, v.\`Array(Variant(String, UInt64))\`, v.\`Array(Variant(String, UInt64))\`.size0, v.\`Array(Variant(String, UInt64))\`.UInt64 from test order by id format Null" + $CH_CLIENT -q "select v.\`Array(Variant(String, UInt64))\`, v.\`Array(Variant(String, UInt64))\`.size0, v.\`Array(Variant(String, UInt64))\`.UInt64, v.\`Array(Variant(String, UInt64))\`.String from test order by id format Null" +} + +$CH_CLIENT -q "drop table if exists test;" + +echo "Memory" +$CH_CLIENT -q "create table test (id UInt64, v Variant(UInt64, Array(Variant(String, UInt64)))) engine=Memory" +test +$CH_CLIENT -q "drop table test;" + +echo "MergeTree compact" +$CH_CLIENT -q "create table test (id UInt64, v Variant(UInt64, Array(Variant(String, UInt64)))) engine=MergeTree order by id settings min_rows_for_wide_part=1000000000, min_bytes_for_wide_part=10000000000;" +test +$CH_CLIENT -q "drop table test;" + +echo "MergeTree wide" +$CH_CLIENT -q "create table test (id UInt64, v Variant(UInt64, Array(Variant(String, UInt64)))) engine=MergeTree order by id settings min_rows_for_wide_part=1, min_bytes_for_wide_part=1;" +test +$CH_CLIENT -q "drop table test;" + From 4931b363079aa5dd4fbc35ff6faea62efaf218de Mon Sep 17 00:00:00 2001 From: avogar Date: Sat, 23 Dec 2023 00:32:28 +0000 Subject: [PATCH 037/245] Fix style --- src/Columns/ColumnVector.cpp | 2 +- src/DataTypes/Serializations/SerializationArray.cpp | 5 +++-- 2 files changed, 4 insertions(+), 3 deletions(-) diff --git a/src/Columns/ColumnVector.cpp b/src/Columns/ColumnVector.cpp index b4e3fee5e42..37e62c76596 100644 --- a/src/Columns/ColumnVector.cpp +++ b/src/Columns/ColumnVector.cpp @@ -469,7 +469,7 @@ void ColumnVector::insertRangeFrom(const IColumn & src, size_t start, size_t const ColumnVector & src_vec = assert_cast(src); if (start + length > src_vec.data.size()) - throw Exception(ErrorCodes::LOGICAL_ERROR, + throw Exception(ErrorCodes::PARAMETER_OUT_OF_BOUND, "Parameters start = {}, length = {} are out of bound " "in ColumnVector::insertRangeFrom method (data.size() = {}).", toString(start), toString(length), toString(src_vec.data.size())); diff --git a/src/DataTypes/Serializations/SerializationArray.cpp b/src/DataTypes/Serializations/SerializationArray.cpp index be23278ef25..bb22af16c69 100644 --- a/src/DataTypes/Serializations/SerializationArray.cpp +++ b/src/DataTypes/Serializations/SerializationArray.cpp @@ -348,6 +348,7 @@ void SerializationArray::deserializeBinaryBulkWithMultipleStreams( { auto mutable_column = column->assumeMutable(); ColumnArray & column_array = typeid_cast(*mutable_column); + size_t prev_last_offset = column_array.getOffsets().back(); settings.path.push_back(Substream::ArraySizes); if (auto cached_column = getFromSubstreamsCache(cache, settings.path)) @@ -371,9 +372,9 @@ void SerializationArray::deserializeBinaryBulkWithMultipleStreams( /// Number 
of values corresponding with `offset_values` must be read. size_t last_offset = offset_values.back(); - if (last_offset < nested_column->size()) + if (last_offset < prev_last_offset) throw Exception(ErrorCodes::LOGICAL_ERROR, "Nested column is longer than last offset"); - size_t nested_limit = last_offset - nested_column->size(); + size_t nested_limit = last_offset - prev_last_offset; if (unlikely(nested_limit > MAX_ARRAYS_SIZE)) throw Exception(ErrorCodes::TOO_LARGE_ARRAY_SIZE, "Array sizes are too large: {}", nested_limit); From 4e4aa90430d02f1fcc17b517946799f23c59b83e Mon Sep 17 00:00:00 2001 From: avogar Date: Sat, 23 Dec 2023 00:35:20 +0000 Subject: [PATCH 038/245] Remove debug logging --- src/DataTypes/Serializations/ISerialization.cpp | 5 ----- 1 file changed, 5 deletions(-) diff --git a/src/DataTypes/Serializations/ISerialization.cpp b/src/DataTypes/Serializations/ISerialization.cpp index 46353fffb48..08575f06f2a 100644 --- a/src/DataTypes/Serializations/ISerialization.cpp +++ b/src/DataTypes/Serializations/ISerialization.cpp @@ -124,20 +124,15 @@ void ISerialization::deserializeBinaryBulkWithMultipleStreams( DeserializeBinaryBulkStatePtr & /* state */, SubstreamsCache * cache) const { - LOG_DEBUG(&Poco::Logger::get("ISerialization"), "Deserialize path {}. Initial column size: {}", settings.path.toString(), column->size()); - auto cached_column = getFromSubstreamsCache(cache, settings.path); if (cached_column) { column = cached_column; - LOG_DEBUG(&Poco::Logger::get("ISerialization"), "Use column from cache. Size: {}", cached_column->size()); } else if (ReadBuffer * stream = settings.getter(settings.path)) { auto mutable_column = column->assumeMutable(); - LOG_DEBUG(&Poco::Logger::get("ISerialization"), "Deserialize column. Initial size: {}", mutable_column->size()); deserializeBinaryBulk(*mutable_column, *stream, limit, settings.avg_value_size_hint); - LOG_DEBUG(&Poco::Logger::get("ISerialization"), "Deserialized column. 
Size: {}", mutable_column->size()); column = std::move(mutable_column); addToSubstreamsCache(cache, settings.path, column); } From f594ab34f50c1bcd860bd3b950c8d74ffe09662d Mon Sep 17 00:00:00 2001 From: avogar Date: Sat, 23 Dec 2023 12:56:11 +0000 Subject: [PATCH 039/245] Fix special build --- src/Columns/ColumnVariant.cpp | 8 +++--- src/Columns/ColumnVariant.h | 2 +- .../Serializations/SerializationArray.cpp | 5 ++-- .../Serializations/SerializationVariant.cpp | 26 +++++++------------ src/Functions/if.cpp | 2 +- 5 files changed, 17 insertions(+), 26 deletions(-) diff --git a/src/Columns/ColumnVariant.cpp b/src/Columns/ColumnVariant.cpp index a3a0362b646..f90ebfc54bb 100644 --- a/src/Columns/ColumnVariant.cpp +++ b/src/Columns/ColumnVariant.cpp @@ -102,7 +102,7 @@ ColumnVariant::ColumnVariant(MutableColumnPtr local_discriminators_, MutableColu { } -ColumnVariant::ColumnVariant(MutableColumnPtr local_discriminators_, MutableColumns && variants_, const std::vector & global_discriminators) : ColumnVariant(std::move(local_discriminators_), nullptr, std::move(variants_), global_discriminators) +ColumnVariant::ColumnVariant(MutableColumnPtr local_discriminators_, MutableColumns && variants_, const std::vector & local_to_global_discriminators_) : ColumnVariant(std::move(local_discriminators_), nullptr, std::move(variants_), local_to_global_discriminators_) { } @@ -449,12 +449,12 @@ void ColumnVariant::insertData(const char *, size_t) throw Exception(ErrorCodes::NOT_IMPLEMENTED, "Method insertData is not supported for {}", getName()); } -void ColumnVariant::insert(const Field & field) +void ColumnVariant::insert(const Field & x) { - if (field.isNull()) + if (x.isNull()) insertDefault(); else - throw Exception(ErrorCodes::NOT_IMPLEMENTED, "Cannot insert field {} to column {}", toString(field), getName()); + throw Exception(ErrorCodes::NOT_IMPLEMENTED, "Cannot insert field {} to column {}", toString(x), getName()); } void ColumnVariant::insertFrom(const IColumn & src_, size_t n) diff --git a/src/Columns/ColumnVariant.h b/src/Columns/ColumnVariant.h index ec58553f5f3..eb96205924c 100644 --- a/src/Columns/ColumnVariant.h +++ b/src/Columns/ColumnVariant.h @@ -194,7 +194,7 @@ public: template ColumnPtr indexImpl(const PaddedPODArray & indexes, size_t limit) const; ColumnPtr replicate(const Offsets & replicate_offsets) const override; - MutableColumns scatter(ColumnIndex num_variants, const Selector & selector) const override; + MutableColumns scatter(ColumnIndex num_columns, const Selector & selector) const override; void gather(ColumnGathererStream & gatherer_stream) override; /// Variant type is not comparable. diff --git a/src/DataTypes/Serializations/SerializationArray.cpp b/src/DataTypes/Serializations/SerializationArray.cpp index bb22af16c69..be23278ef25 100644 --- a/src/DataTypes/Serializations/SerializationArray.cpp +++ b/src/DataTypes/Serializations/SerializationArray.cpp @@ -348,7 +348,6 @@ void SerializationArray::deserializeBinaryBulkWithMultipleStreams( { auto mutable_column = column->assumeMutable(); ColumnArray & column_array = typeid_cast(*mutable_column); - size_t prev_last_offset = column_array.getOffsets().back(); settings.path.push_back(Substream::ArraySizes); if (auto cached_column = getFromSubstreamsCache(cache, settings.path)) @@ -372,9 +371,9 @@ void SerializationArray::deserializeBinaryBulkWithMultipleStreams( /// Number of values corresponding with `offset_values` must be read. 
size_t last_offset = offset_values.back(); - if (last_offset < prev_last_offset) + if (last_offset < nested_column->size()) throw Exception(ErrorCodes::LOGICAL_ERROR, "Nested column is longer than last offset"); - size_t nested_limit = last_offset - prev_last_offset; + size_t nested_limit = last_offset - nested_column->size(); if (unlikely(nested_limit > MAX_ARRAYS_SIZE)) throw Exception(ErrorCodes::TOO_LARGE_ARRAY_SIZE, "Array sizes are too large: {}", nested_limit); diff --git a/src/DataTypes/Serializations/SerializationVariant.cpp b/src/DataTypes/Serializations/SerializationVariant.cpp index d36151fe8e9..c88dd8e9e0d 100644 --- a/src/DataTypes/Serializations/SerializationVariant.cpp +++ b/src/DataTypes/Serializations/SerializationVariant.cpp @@ -564,7 +564,7 @@ bool SerializationVariant::tryDeserializeImpl( IColumn & column, const String & field, std::function check_for_null, - std::function try_deserialize_variant) const + std::function try_deserialize_nested) const { auto & column_variant = assert_cast(column); ReadBufferFromString null_buf(field); @@ -577,25 +577,17 @@ bool SerializationVariant::tryDeserializeImpl( for (size_t global_discr : deserialize_text_order) { ReadBufferFromString variant_buf(field); - /// Usually try_deserialize_variant should not throw any exception, but let's use try/catch just in case. - try + auto & variant_column = column_variant.getVariantByGlobalDiscriminator(global_discr); + size_t prev_size = variant_column.size(); + if (try_deserialize_nested(variant_column, variants[global_discr], variant_buf) && variant_buf.eof()) { - auto & variant_column = column_variant.getVariantByGlobalDiscriminator(global_discr); - size_t prev_size = variant_column.size(); - if (try_deserialize_variant(variant_column, variants[global_discr], variant_buf) && variant_buf.eof()) - { - column_variant.getLocalDiscriminators().push_back(column_variant.localDiscriminatorByGlobal(global_discr)); - column_variant.getOffsets().push_back(prev_size); - return true; - } - else if (variant_column.size() > prev_size) - { - variant_column.popBack(1); - } + column_variant.getLocalDiscriminators().push_back(column_variant.localDiscriminatorByGlobal(global_discr)); + column_variant.getOffsets().push_back(prev_size); + return true; } - catch (...) + else if (variant_column.size() > prev_size) { - /// Try next variant. 
+ variant_column.popBack(1); } } diff --git a/src/Functions/if.cpp b/src/Functions/if.cpp index b15bc5938be..9ca4b487119 100644 --- a/src/Functions/if.cpp +++ b/src/Functions/if.cpp @@ -224,7 +224,7 @@ public: return std::make_shared(context->getSettingsRef().allow_experimental_variant_type && context->getSettingsRef().use_variant_when_no_common_type_in_if); } - FunctionIf(bool use_variant_when_no_common_type_ = false) : FunctionIfBase(), use_variant_when_no_common_type(use_variant_when_no_common_type_) {} + explicit FunctionIf(bool use_variant_when_no_common_type_ = false) : FunctionIfBase(), use_variant_when_no_common_type(use_variant_when_no_common_type_) {} private: bool use_variant_when_no_common_type = false; From 5497fa79edfa6fdc2559d516486f80f88af40c68 Mon Sep 17 00:00:00 2001 From: avogar Date: Wed, 27 Dec 2023 14:11:53 +0000 Subject: [PATCH 040/245] Fix tests --- src/DataTypes/Serializations/SerializationEnum.cpp | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/src/DataTypes/Serializations/SerializationEnum.cpp b/src/DataTypes/Serializations/SerializationEnum.cpp index fb384547d64..14b1a33e2ce 100644 --- a/src/DataTypes/Serializations/SerializationEnum.cpp +++ b/src/DataTypes/Serializations/SerializationEnum.cpp @@ -73,7 +73,9 @@ template bool SerializationEnum::tryDeserializeTextQuoted(IColumn & column, ReadBuffer & istr, const FormatSettings &) const { std::string field_name; - readQuotedStringWithSQLStyle(field_name, istr); + if (!tryReadQuotedStringWithSQLStyle(field_name, istr)) + return false; + FieldType x; if (!ref_enum_values.tryGetValue(x, StringRef(field_name))) return false; From 4b2a0b99fc094e6b70e516af0360f126f62a886d Mon Sep 17 00:00:00 2001 From: Kruglov Pavel <48961922+Avogar@users.noreply.github.com> Date: Wed, 27 Dec 2023 20:02:50 +0100 Subject: [PATCH 041/245] Update docs/en/sql-reference/functions/other-functions.md --- docs/en/sql-reference/functions/other-functions.md | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/docs/en/sql-reference/functions/other-functions.md b/docs/en/sql-reference/functions/other-functions.md index d69d692d055..ebc80e4d308 100644 --- a/docs/en/sql-reference/functions/other-functions.md +++ b/docs/en/sql-reference/functions/other-functions.md @@ -2839,7 +2839,7 @@ Extracts a column with specified type from a `Variant` column. **Syntax** ```sql -tupleElement(variant, type_name, [, default_value]) +variantElement(variant, type_name, [, default_value]) ``` **Arguments** From 275fbe3e986c8faee3bd396e3ed87e3707f0f25f Mon Sep 17 00:00:00 2001 From: avogar Date: Wed, 27 Dec 2023 20:04:14 +0000 Subject: [PATCH 042/245] Support function to subcolumns optimization for Variant, better text priority for reading Bool --- .../Passes/FunctionToSubcolumnsPass.cpp | 17 +++++++++++++++++ .../Serializations/SerializationVariant.cpp | 4 ++++ .../RewriteFunctionToSubcolumnVisitor.cpp | 15 +++++++++++++++ 3 files changed, 36 insertions(+) diff --git a/src/Analyzer/Passes/FunctionToSubcolumnsPass.cpp b/src/Analyzer/Passes/FunctionToSubcolumnsPass.cpp index cd635f87e0e..c74c1038173 100644 --- a/src/Analyzer/Passes/FunctionToSubcolumnsPass.cpp +++ b/src/Analyzer/Passes/FunctionToSubcolumnsPass.cpp @@ -176,6 +176,23 @@ public: node = std::make_shared(column, column_source); } + else if (function_name == "variantElement" && isVariant(column_type) && second_argument_constant_node) + { + /// Replace `variantElement(variant_argument, type_name)` with `variant_argument.type_name`. 
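+                /// For example, `variantElement(v, 'UInt64')` becomes a read of the subcolumn `v.UInt64`.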
+                const auto & variant_element_constant_value = second_argument_constant_node->getValue();
+                String subcolumn_name;
+
+                if (variant_element_constant_value.getType() != Field::Types::String)
+                    return;
+
+                subcolumn_name = variant_element_constant_value.get<String>();
+
+                column.name += '.';
+                column.name += subcolumn_name;
+                column.type = function_node->getResultType();
+
+                node = std::make_shared<ColumnNode>(column, column_source);
+            }
             else if (function_name == "mapContains" && column_type.isMap())
             {
                 const auto & data_type_map = assert_cast<const DataTypeMap &>(*column.type);
diff --git a/src/DataTypes/Serializations/SerializationVariant.cpp b/src/DataTypes/Serializations/SerializationVariant.cpp
index c88dd8e9e0d..49ecb2fc546 100644
--- a/src/DataTypes/Serializations/SerializationVariant.cpp
+++ b/src/DataTypes/Serializations/SerializationVariant.cpp
@@ -534,6 +534,10 @@ std::tuple getTypeTextDeserializePriority(const DataType
         return {max_depth, max_priority, max_simple_nested_depth};
     }
 
+    /// Bool type should have priority higher than all integers.
+    if (isBool(type))
+        return {nested_depth, priority_map[TypeIndex::Int8] + 1 , simple_nested_depth};
+
     return {nested_depth, priority_map[type->getTypeId()], simple_nested_depth};
 }
 
diff --git a/src/Interpreters/RewriteFunctionToSubcolumnVisitor.cpp b/src/Interpreters/RewriteFunctionToSubcolumnVisitor.cpp
index 506fa13b7ba..0717abd4782 100644
--- a/src/Interpreters/RewriteFunctionToSubcolumnVisitor.cpp
+++ b/src/Interpreters/RewriteFunctionToSubcolumnVisitor.cpp
@@ -122,6 +122,21 @@ void RewriteFunctionToSubcolumnData::visit(ASTFunction & function, ASTPtr & ast)
         ast = transformToSubcolumn(name_in_storage, subcolumn_name);
         ast->setAlias(alias);
     }
+    else if (function.name == "variantElement" && column_type_id == TypeIndex::Variant)
+    {
+        const auto * literal = arguments[1]->as<ASTLiteral>();
+        if (!literal)
+            return;
+
+        String subcolumn_name;
+        auto value_type = literal->value.getType();
+        if (value_type != Field::Types::String)
+            return;
+
+        subcolumn_name = literal->value.get<String>();
+        ast = transformToSubcolumn(name_in_storage, subcolumn_name);
+        ast->setAlias(alias);
+    }
     else
     {
         auto it = binary_function_to_subcolumn.find(function.name);
From 8b4157141c0501d4498278947b468d03638cdf8a Mon Sep 17 00:00:00 2001
From: avogar
Date: Wed, 27 Dec 2023 20:36:10 +0000
Subject: [PATCH 043/245] Fix style

---
 src/DataTypes/Serializations/SerializationVariant.cpp | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/src/DataTypes/Serializations/SerializationVariant.cpp b/src/DataTypes/Serializations/SerializationVariant.cpp
index 49ecb2fc546..9cfc4b9e26f 100644
--- a/src/DataTypes/Serializations/SerializationVariant.cpp
+++ b/src/DataTypes/Serializations/SerializationVariant.cpp
@@ -536,7 +536,7 @@ std::tuple getTypeTextDeserializePriority(const DataType
     /// Bool type should have priority higher than all integers.
if (isBool(type)) - return {nested_depth, priority_map[TypeIndex::Int8] + 1 , simple_nested_depth}; + return {nested_depth, priority_map[TypeIndex::Int8] + 1, simple_nested_depth}; return {nested_depth, priority_map[type->getTypeId()], simple_nested_depth}; } From 4bb63f0a6f066bca972b5b3754a20f0a56354b8d Mon Sep 17 00:00:00 2001 From: kssenii Date: Thu, 28 Dec 2023 13:05:15 +0100 Subject: [PATCH 044/245] Update test --- .../02916_broken_projection.reference | 124 ------------------ .../0_stateless/02916_broken_projection.sh | 16 +-- 2 files changed, 8 insertions(+), 132 deletions(-) diff --git a/tests/queries/0_stateless/02916_broken_projection.reference b/tests/queries/0_stateless/02916_broken_projection.reference index 358304de74a..d340326455a 100644 --- a/tests/queries/0_stateless/02916_broken_projection.reference +++ b/tests/queries/0_stateless/02916_broken_projection.reference @@ -2,11 +2,6 @@ insert new part insert new part insert new part insert new part -system.parts -all_0_0_0 1 ['proj','proj_2'] -all_1_1_0 1 ['proj','proj_2'] -all_2_2_0 1 ['proj','proj_2'] -all_3_3_0 1 ['proj','proj_2'] select from projection 'proj' 12 16 @@ -21,11 +16,6 @@ check table 1 0 broke metadata of part 'proj' (parent part: all_2_2_0) -system.parts -all_0_0_0 1 ['proj','proj_2'] -all_1_1_0 1 ['proj','proj_2'] -all_2_2_0 1 ['proj','proj_2'] -all_3_3_0 1 ['proj','proj_2'] select from projection 'proj' 12 16 @@ -46,11 +36,6 @@ all_2_2_0 broke data of part 'proj_2' (parent part: all_2_2_0) broken projections info all_2_2_0 proj FILE_DOESNT_EXIST -system.parts -all_0_0_0 1 ['proj','proj_2'] -all_1_1_0 1 ['proj','proj_2'] -all_2_2_0 1 ['proj','proj_2'] -all_3_3_0 1 ['proj','proj_2'] select from projection 'proj' 12 16 @@ -63,11 +48,6 @@ check table broken projections info all_2_2_0 proj FILE_DOESNT_EXIST all_2_2_0 proj_2 NO_FILE_IN_DATA_PART -system.parts -all_0_0_0 1 ['proj','proj_2'] -all_1_1_0 1 ['proj','proj_2'] -all_2_2_0 1 ['proj','proj_2'] -all_3_3_0 1 ['proj','proj_2'] select from projection 'proj' 12 16 @@ -94,14 +74,6 @@ broken projections info all_2_2_0 proj FILE_DOESNT_EXIST all_2_2_0 proj_2 NO_FILE_IN_DATA_PART all_3_3_0 proj_2 NO_FILE_IN_DATA_PART -system.parts -all_0_0_0 1 ['proj','proj_2'] -all_1_1_0 1 ['proj','proj_2'] -all_2_2_0 1 ['proj','proj_2'] -all_3_3_0 0 ['proj','proj_2'] -all_3_5_1 1 ['proj'] -all_4_4_0 0 ['proj','proj_2'] -all_5_5_0 0 ['proj','proj_2'] select from projection 'proj' 12 16 @@ -131,14 +103,6 @@ all_1_1_0 proj_2 FILE_DOESNT_EXIST all_2_2_0 proj NO_FILE_IN_DATA_PART all_2_2_0 proj_2 FILE_DOESNT_EXIST all_3_3_0 proj_2 FILE_DOESNT_EXIST -system.parts -all_0_0_0 1 ['proj','proj_2'] -all_1_1_0 1 ['proj','proj_2'] -all_2_2_0 1 ['proj','proj_2'] -all_3_3_0 0 ['proj','proj_2'] -all_3_5_1 1 ['proj'] -all_4_4_0 0 ['proj','proj_2'] -all_5_5_0 0 ['proj','proj_2'] select from projection 'proj' 12 16 @@ -155,18 +119,6 @@ check table full (test - all_1_1_0) all_1_1_0 materialize projection proj check table full (test - ) -system.parts -all_0_0_0 0 ['proj','proj_2'] -all_0_0_0_6 1 ['proj','proj_2'] -all_1_1_0 0 ['proj','proj_2'] -all_1_1_0_6 1 ['proj','proj_2'] -all_2_2_0 0 ['proj','proj_2'] -all_2_2_0_6 1 ['proj','proj_2'] -all_3_3_0 0 ['proj','proj_2'] -all_3_5_1 0 ['proj'] -all_3_5_1_6 1 ['proj'] -all_4_4_0 0 ['proj','proj_2'] -all_5_5_0 0 ['proj','proj_2'] select from projection 'proj' 12 16 @@ -189,25 +141,6 @@ OPTIMIZE TABLE test FINAL insert new part optimize OPTIMIZE TABLE test FINAL -system.parts -all_0_0_0 0 ['proj','proj_2'] -all_0_0_0_6 0 ['proj','proj_2'] -all_0_0_0_7 0 
['proj','proj_2'] -all_0_8_2_7 1 ['proj_2'] -all_1_1_0 0 ['proj','proj_2'] -all_1_1_0_6 0 ['proj','proj_2'] -all_1_1_0_7 0 ['proj','proj_2'] -all_2_2_0 0 ['proj','proj_2'] -all_2_2_0_6 0 ['proj','proj_2'] -all_2_2_0_7 0 ['proj','proj_2'] -all_3_3_0 0 ['proj','proj_2'] -all_3_5_1 0 ['proj'] -all_3_5_1_6 0 ['proj'] -all_3_5_1_7 0 ['proj','proj_2'] -all_4_4_0 0 ['proj','proj_2'] -all_5_5_0 0 ['proj','proj_2'] -all_8_8_0 0 ['proj','proj_2'] -all_9_9_0 1 ['proj','proj_2'] select from projection 'proj' 12 16 @@ -224,9 +157,6 @@ insert new part insert new part insert new part insert new part -system.parts -all_0_0_0 1 ['proj','proj_2'] -all_1_1_0 1 ['proj','proj_2'] select from projection 'proj' used projections SELECT c FROM test2 WHERE d == 12 OR d == 16 ORDER BY c; proj @@ -235,7 +165,6 @@ used projections SELECT d FROM test2 WHERE c == 12 OR c == 16 ORDER BY d; proj_2 check table 1 -system.parts select from projection 'proj' used projections SELECT c FROM test2_replica WHERE d == 12 OR d == 16 ORDER BY c; proj @@ -248,9 +177,6 @@ check table broke data of part 'proj' (parent part: all_0_0_0) check table full (test2 - all_0_0_0) all_0_0_0 -system.parts -all_0_0_0 1 ['proj','proj_2'] -all_1_1_0 1 ['proj','proj_2'] select from projection 'proj' used projections SELECT c FROM test2 WHERE d == 12 OR d == 16 ORDER BY c; proj @@ -262,9 +188,6 @@ check table broke data of part 'all_0_0_0' check table full (test2 - all_0_0_0) all_0_0_0 -system.parts -all_0_0_0 0 ['proj','proj_2'] -all_1_1_0 1 ['proj','proj_2'] select from projection 'proj' used projections SELECT c FROM test2 WHERE d == 12 OR d == 16 ORDER BY c; proj @@ -273,9 +196,6 @@ used projections SELECT d FROM test2 WHERE c == 12 OR c == 16 ORDER BY d; proj_2 check table 1 -system.parts -all_0_0_0 1 ['proj','proj_2'] -all_1_1_0 1 ['proj','proj_2'] select from projection 'proj' used projections SELECT c FROM test2 WHERE d == 12 OR d == 16 ORDER BY c; proj @@ -288,11 +208,6 @@ insert new part insert new part insert new part insert new part -system.parts -all_0_0_0 1 ['proj','proj_2'] -all_1_1_0 1 ['proj','proj_2'] -all_2_2_0 1 ['proj','proj_2'] -all_3_3_0 1 ['proj','proj_2'] select from projection 'proj' 12 16 @@ -307,11 +222,6 @@ check table 1 0 broke data of part 'proj' (parent part: all_2_2_0) -system.parts -all_0_0_0 1 ['proj','proj_2'] -all_1_1_0 1 ['proj','proj_2'] -all_2_2_0 1 ['proj','proj_2'] -all_3_3_0 1 ['proj','proj_2'] select from projection 'proj', expect error: proj FILE_DOESNT_EXIST select from projection 'proj_2' @@ -325,11 +235,6 @@ broken projections info all_2_2_0 proj NO_FILE_IN_DATA_PART BACKUP_CREATED RESTORED -system.parts -all_0_0_0 1 ['proj','proj_2'] -all_1_1_0 1 ['proj','proj_2'] -all_2_2_0 1 ['proj','proj_2'] -all_3_3_0 1 ['proj','proj_2'] select from projection 'proj' 12 16 @@ -345,11 +250,6 @@ check table broken projections info 0 broke all data of part 'proj' (parent part: all_2_2_0) -system.parts -all_0_0_0 1 ['proj','proj_2'] -all_1_1_0 1 ['proj','proj_2'] -all_2_2_0 1 ['proj','proj_2'] -all_3_3_0 1 ['proj','proj_2'] select from projection 'proj', expect error: proj STD_EXCEPTION select from projection 'proj_2' @@ -363,15 +263,6 @@ broken projections info all_2_2_0 proj FILE_DOESNT_EXIST FILE_DOESNT_EXIST materialize projection proj -system.parts -all_0_0_0 0 ['proj','proj_2'] -all_0_0_0_4 1 ['proj','proj_2'] -all_1_1_0 0 ['proj','proj_2'] -all_1_1_0_4 1 ['proj','proj_2'] -all_2_2_0 0 ['proj','proj_2'] -all_2_2_0_4 1 ['proj','proj_2'] -all_3_3_0 0 ['proj','proj_2'] -all_3_3_0_4 1 ['proj','proj_2'] select from 
projection 'proj' 12 16 @@ -388,11 +279,6 @@ broken projections info all_2_2_0 proj FILE_DOESNT_EXIST BACKUP_CREATED RESTORED -system.parts -all_0_0_0 1 ['proj','proj_2'] -all_1_1_0 1 ['proj','proj_2'] -all_2_2_0 1 ['proj','proj_2'] -all_3_3_0 1 ['proj','proj_2'] select from projection 'proj' 12 16 @@ -407,11 +293,6 @@ check table 1 0 broke all data of part 'proj' (parent part: all_1_1_0) -system.parts -all_0_0_0 1 ['proj','proj_2'] -all_1_1_0 1 ['proj','proj_2'] -all_2_2_0 1 ['proj','proj_2'] -all_3_3_0 1 ['proj','proj_2'] select from projection 'proj', expect error: proj select from projection 'proj_2' 12 @@ -424,11 +305,6 @@ broken projections info all_1_1_0 proj FILE_DOESNT_EXIST BACKUP_CREATED RESTORED -system.parts -all_0_0_0 1 ['proj','proj_2'] -all_1_1_0 1 ['proj','proj_2'] -all_2_2_0 1 ['proj','proj_2'] -all_3_3_0 1 ['proj','proj_2'] select from projection 'proj' 12 16 diff --git a/tests/queries/0_stateless/02916_broken_projection.sh b/tests/queries/0_stateless/02916_broken_projection.sh index 55e613b8f3a..a1df5dc858d 100755 --- a/tests/queries/0_stateless/02916_broken_projection.sh +++ b/tests/queries/0_stateless/02916_broken_projection.sh @@ -1,5 +1,5 @@ #!/usr/bin/env bash -# Tags: long, no-random-merge-tree-settings, no-random-settings +# Tags: long, no-random-merge-tree-settings, no-random-settings, no-s3-storage # shellcheck disable=SC2046 CURDIR=$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd) @@ -149,12 +149,12 @@ function check() expected_error=$3 fi - echo 'system.parts' - $CLICKHOUSE_CLIENT -q " - SELECT name, active, projections - FROM system.parts - WHERE table='$table' AND database=currentDatabase() - ORDER BY name;" + #echo 'system.parts' + #$CLICKHOUSE_CLIENT -q " + #SELECT name, active, projections + #FROM system.parts + #WHERE table='$table' AND database=currentDatabase() + #ORDER BY name;" query_id=$(random 8) @@ -447,7 +447,7 @@ function test3() break_projection test proj all_2_2_0 part - check test proj STD_EXCEPTION + check test broken_projections_info test From 3d2e95dbf5f81185d2a091d5e58490f66ed04bef Mon Sep 17 00:00:00 2001 From: kssenii Date: Thu, 28 Dec 2023 13:49:49 +0100 Subject: [PATCH 045/245] Fix build --- src/Storages/MergeTree/checkDataPart.cpp | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/Storages/MergeTree/checkDataPart.cpp b/src/Storages/MergeTree/checkDataPart.cpp index ea46b6f0d56..5b60f0a7fc2 100644 --- a/src/Storages/MergeTree/checkDataPart.cpp +++ b/src/Storages/MergeTree/checkDataPart.cpp @@ -332,7 +332,7 @@ static IMergeTreeDataPart::Checksums checkDataPart( if (throw_on_broken_projection && !broken_projections_message.empty()) { - throw Exception(ErrorCodes::BROKEN_PROJECTION, broken_projections_message.data()); + throw Exception(ErrorCodes::BROKEN_PROJECTION, "{}", broken_projections_message); } if (require_checksums && !projections_on_disk.empty()) From 493f938c455e9bd507d521b7974b1e7a9e7c81b2 Mon Sep 17 00:00:00 2001 From: Kseniia Sumarokova <54203879+kssenii@users.noreply.github.com> Date: Thu, 28 Dec 2023 17:29:25 +0100 Subject: [PATCH 046/245] Update 02916_broken_projection.sh --- tests/queries/0_stateless/02916_broken_projection.sh | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/tests/queries/0_stateless/02916_broken_projection.sh b/tests/queries/0_stateless/02916_broken_projection.sh index a1df5dc858d..ca62d275189 100755 --- a/tests/queries/0_stateless/02916_broken_projection.sh +++ b/tests/queries/0_stateless/02916_broken_projection.sh @@ -447,7 +447,7 @@ function test3() break_projection 
test proj all_2_2_0 part - check test + check test proj ErrnoException broken_projections_info test From 91657185c8fc4349cb8825ac2e5d6126fddb8289 Mon Sep 17 00:00:00 2001 From: kssenii Date: Fri, 29 Dec 2023 13:05:15 +0100 Subject: [PATCH 047/245] Fxi --- tests/queries/0_stateless/02916_broken_projection.reference | 2 +- tests/queries/0_stateless/02916_broken_projection.sh | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/tests/queries/0_stateless/02916_broken_projection.reference b/tests/queries/0_stateless/02916_broken_projection.reference index d340326455a..beaca49f99c 100644 --- a/tests/queries/0_stateless/02916_broken_projection.reference +++ b/tests/queries/0_stateless/02916_broken_projection.reference @@ -251,7 +251,7 @@ broken projections info 0 broke all data of part 'proj' (parent part: all_2_2_0) select from projection 'proj', expect error: proj -STD_EXCEPTION +Errno select from projection 'proj_2' 12 16 diff --git a/tests/queries/0_stateless/02916_broken_projection.sh b/tests/queries/0_stateless/02916_broken_projection.sh index ca62d275189..99e54b08b74 100755 --- a/tests/queries/0_stateless/02916_broken_projection.sh +++ b/tests/queries/0_stateless/02916_broken_projection.sh @@ -447,7 +447,7 @@ function test3() break_projection test proj all_2_2_0 part - check test proj ErrnoException + check test proj Errno broken_projections_info test From e0f0100332085f3075951a6d9bf5c8d69f6d9940 Mon Sep 17 00:00:00 2001 From: Kseniia Sumarokova <54203879+kssenii@users.noreply.github.com> Date: Fri, 29 Dec 2023 15:38:15 +0100 Subject: [PATCH 048/245] Update 02916_broken_projection.reference --- tests/queries/0_stateless/02916_broken_projection.reference | 1 + 1 file changed, 1 insertion(+) diff --git a/tests/queries/0_stateless/02916_broken_projection.reference b/tests/queries/0_stateless/02916_broken_projection.reference index beaca49f99c..3967215e5de 100644 --- a/tests/queries/0_stateless/02916_broken_projection.reference +++ b/tests/queries/0_stateless/02916_broken_projection.reference @@ -252,6 +252,7 @@ broken projections info broke all data of part 'proj' (parent part: all_2_2_0) select from projection 'proj', expect error: proj Errno +Errno select from projection 'proj_2' 12 16 From 3de5b27c48483962285de0b16f152cc35eadd1a6 Mon Sep 17 00:00:00 2001 From: Kruglov Pavel <48961922+Avogar@users.noreply.github.com> Date: Mon, 8 Jan 2024 16:50:17 +0100 Subject: [PATCH 049/245] Fix conflicts --- .../Serializations/SerializationString.cpp | 22 ++++++++++++++++++- 1 file changed, 21 insertions(+), 1 deletion(-) diff --git a/src/DataTypes/Serializations/SerializationString.cpp b/src/DataTypes/Serializations/SerializationString.cpp index a6bf29336b7..b2c254e63c5 100644 --- a/src/DataTypes/Serializations/SerializationString.cpp +++ b/src/DataTypes/Serializations/SerializationString.cpp @@ -381,7 +381,7 @@ void SerializationString::deserializeTextJSON(IColumn & column, ReadBuffer & ist str_value = "false"; } - read(column, [&](ColumnString::Chars & data) { data.insert(str_value.begin(), str_value.end()); }); + read(column, [&](ColumnString::Chars & data) { data.insert(str_value.begin(), str_value.end()); }); } else if (settings.json.read_numbers_as_strings && !istr.eof() && *istr.position() != '"') { @@ -406,6 +406,26 @@ bool SerializationString::tryDeserializeTextJSON(IColumn & column, ReadBuffer & if (settings.json.read_arrays_as_strings && !istr.eof() && *istr.position() == '[') return read(column, [&](ColumnString::Chars & data) { return readJSONArrayInto(data, istr); }); + if 
(settings.json.read_bools_as_strings && !istr.eof() && (*istr.position() == 't' || *istr.position() == 'f')) + { + String str_value; + if (*istr.position() == 't') + { + if (!checkString("true", istr)) + return false; + str_value = "true"; + } + else if (*istr.position() == 'f') + { + if (!checkString("false", istr)) + return false; + str_value = "false"; + } + + read(column, [&](ColumnString::Chars & data) { data.insert(str_value.begin(), str_value.end()); }); + return true; + } + if (settings.json.read_numbers_as_strings && !istr.eof() && *istr.position() != '"') { String field; From 21e4b453dfc7df905ed304c5513b50f57ef19228 Mon Sep 17 00:00:00 2001 From: Kruglov Pavel <48961922+Avogar@users.noreply.github.com> Date: Mon, 8 Jan 2024 22:02:40 +0100 Subject: [PATCH 050/245] Fix pretty type name --- src/DataTypes/DataTypeVariant.cpp | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/src/DataTypes/DataTypeVariant.cpp b/src/DataTypes/DataTypeVariant.cpp index 5dc42cc7443..2bc4dfa5a7a 100644 --- a/src/DataTypes/DataTypeVariant.cpp +++ b/src/DataTypes/DataTypeVariant.cpp @@ -71,17 +71,17 @@ std::string DataTypeVariant::doGetPrettyName(size_t indent) const { size_t size = variants.size(); WriteBufferFromOwnString s; - s << "Variant(\n"; + s << "Variant("; for (size_t i = 0; i != size; ++i) { if (i != 0) - s << ",\n"; + s << ", "; - s << fourSpaceIndent(indent + 1) << variants[i]->getPrettyName(indent + 1); + s << variants[i]->getPrettyName(indent); } - s << '\n' << fourSpaceIndent(indent) << ')'; + s << ')'; return s.str(); } From 633b4a5dcfcf63bec8e2b5a1b5f38e648348639d Mon Sep 17 00:00:00 2001 From: Kruglov Pavel <48961922+Avogar@users.noreply.github.com> Date: Tue, 9 Jan 2024 19:23:34 +0100 Subject: [PATCH 051/245] Apply suggestions from code review Co-authored-by: Antonio Andelic --- src/Columns/ColumnNullable.cpp | 2 +- src/Columns/ColumnVariant.cpp | 8 +++++--- src/Columns/ColumnVariant.h | 2 +- src/DataTypes/EnumValues.cpp | 4 +--- 4 files changed, 8 insertions(+), 8 deletions(-) diff --git a/src/Columns/ColumnNullable.cpp b/src/Columns/ColumnNullable.cpp index d2a579d6800..25b0e35e15e 100644 --- a/src/Columns/ColumnNullable.cpp +++ b/src/Columns/ColumnNullable.cpp @@ -928,7 +928,7 @@ ColumnPtr makeNullableOrLowCardinalityNullableSafe(const ColumnPtr & column) return assert_cast(*column).cloneNullable(); if (column->canBeInsideNullable()) - return makeNullableSafe(column); + return makeNullable(column); return column; } diff --git a/src/Columns/ColumnVariant.cpp b/src/Columns/ColumnVariant.cpp index f90ebfc54bb..10d79f59d37 100644 --- a/src/Columns/ColumnVariant.cpp +++ b/src/Columns/ColumnVariant.cpp @@ -631,9 +631,9 @@ void ColumnVariant::popBack(size_t n) size_t size = local_discriminators_data.size(); const size_t num_variants = variants.size(); std::vector nested_n(num_variants, 0); - for (size_t i = 0; i != n; ++i) + for (size_t i = size - n; i < size; ++i) { - Discriminator discr = local_discriminators_data[size - i - 1]; + Discriminator discr = local_discriminators_data[i]; if (discr != NULL_DISCRIMINATOR) ++nested_n[discr]; } @@ -966,7 +966,7 @@ ColumnPtr ColumnVariant::replicate(const Offsets & replicate_offsets) const { new_offsets_data.reserve(new_size); for (size_t i = old_size; i < new_size; ++i) - new_offsets_data.push_back(new_offsets_data[i - 1] + 1); + new_offsets_data.push_back(i); } else { @@ -1260,6 +1260,8 @@ std::optional ColumnVariant::getLocalDiscriminator { if (variants[i]->size() == local_discriminators->size()) return i; + if 
(!variants[i]->empty()) + return std::nullopt } return std::nullopt; diff --git a/src/Columns/ColumnVariant.h b/src/Columns/ColumnVariant.h index eb96205924c..8f0c5a6eef9 100644 --- a/src/Columns/ColumnVariant.h +++ b/src/Columns/ColumnVariant.h @@ -205,7 +205,7 @@ public: void compareColumn(const IColumn &, size_t, PaddedPODArray *, PaddedPODArray &, int, int) const override { - throw Exception(ErrorCodes::NOT_IMPLEMENTED, "Method compareColumn is not supported for ColumnAggregateFunction"); + throw Exception(ErrorCodes::NOT_IMPLEMENTED, "Method compareColumn is not supported for ColumnVariant"); } bool hasEqualValues() const override; diff --git a/src/DataTypes/EnumValues.cpp b/src/DataTypes/EnumValues.cpp index 8a4b1304d5e..a15136b9335 100644 --- a/src/DataTypes/EnumValues.cpp +++ b/src/DataTypes/EnumValues.cpp @@ -85,9 +85,7 @@ bool EnumValues::tryGetValue(T & x, StringRef field_name, bool try_treat_as_i if (try_treat_as_id) { ReadBufferFromMemory tmp_buf(field_name.data, field_name.size); - if (!tryReadText(x, tmp_buf) || !tmp_buf.eof() || !value_to_name_map.contains(x)) - return false; - return true; + return tryReadText(x, tmp_buf) && tmp_buf.eof() && value_to_name_map.contains(x); } return false; } From fb758e48b04c5f799a5169af584f6a562866640d Mon Sep 17 00:00:00 2001 From: avogar Date: Tue, 9 Jan 2024 19:02:20 +0000 Subject: [PATCH 052/245] Apply suggestions --- src/Columns/ColumnVariant.cpp | 172 +++++++++++++++------------------- 1 file changed, 74 insertions(+), 98 deletions(-) diff --git a/src/Columns/ColumnVariant.cpp b/src/Columns/ColumnVariant.cpp index 10d79f59d37..a707ec8e153 100644 --- a/src/Columns/ColumnVariant.cpp +++ b/src/Columns/ColumnVariant.cpp @@ -204,10 +204,13 @@ ColumnVariant::ColumnVariant(DB::MutableColumnPtr local_discriminators_, DB::Mut } } -ColumnVariant::Ptr ColumnVariant::create(const Columns & variants, const std::vector & local_to_global_discriminators) +namespace +{ + +MutableColumns getVariantsAssumeMutable(const Columns & variants) { MutableColumns mutable_variants; - mutable_variants.reserve(variants.size()); + for (const auto & variant : variants) { if (isColumnConst(*variant)) @@ -215,35 +218,24 @@ ColumnVariant::Ptr ColumnVariant::create(const Columns & variants, const std::ve mutable_variants.emplace_back(variant->assumeMutable()); } - return ColumnVariant::create(std::move(mutable_variants), local_to_global_discriminators); + return mutable_variants; +} + +} + +ColumnVariant::Ptr ColumnVariant::create(const Columns & variants, const std::vector & local_to_global_discriminators) +{ + return ColumnVariant::create(getVariantsAssumeMutable(variants), local_to_global_discriminators); } ColumnVariant::Ptr ColumnVariant::create(const DB::ColumnPtr & local_discriminators, const DB::Columns & variants, const std::vector & local_to_global_discriminators) { - MutableColumns mutable_variants; - mutable_variants.reserve(variants.size()); - for (const auto & variant : variants) - { - if (isColumnConst(*variant)) - throw Exception(ErrorCodes::ILLEGAL_COLUMN, "ColumnVariant cannot have ColumnConst as its element"); - mutable_variants.emplace_back(variant->assumeMutable()); - } - - return ColumnVariant::create(local_discriminators->assumeMutable(), std::move(mutable_variants), local_to_global_discriminators); + return ColumnVariant::create(local_discriminators->assumeMutable(), getVariantsAssumeMutable(variants), local_to_global_discriminators); } ColumnVariant::Ptr ColumnVariant::create(const DB::ColumnPtr & local_discriminators, const DB::ColumnPtr & 
offsets, const DB::Columns & variants, const std::vector & local_to_global_discriminators) { - MutableColumns mutable_variants; - mutable_variants.reserve(variants.size()); - for (const auto & variant : variants) - { - if (isColumnConst(*variant)) - throw Exception(ErrorCodes::ILLEGAL_COLUMN, "ColumnVariant cannot have ColumnConst as its element"); - mutable_variants.emplace_back(variant->assumeMutable()); - } - - return ColumnVariant::create(local_discriminators->assumeMutable(), offsets->assumeMutable(), std::move(mutable_variants), local_to_global_discriminators); + return ColumnVariant::create(local_discriminators->assumeMutable(), offsets->assumeMutable(), getVariantsAssumeMutable(variants), local_to_global_discriminators); } MutableColumnPtr ColumnVariant::cloneEmpty() const @@ -309,104 +301,88 @@ MutableColumnPtr ColumnVariant::cloneResized(size_t new_size) const const auto & local_discriminators_data = getLocalDiscriminators(); const auto & offsets_data = getOffsets(); - /// We can find all variants sizes by scanning all new_size local_discriminators and calculating - /// sizes for all new variants. This code is below and commented. - -// std::vector new_nested_sizes(num_variants, 0); -// for (size_t i = 0; i != new_size; ++i) -// { -// Discriminator discr = local_discriminators_data[i]; -// if (discr != NULL_DISCRIMINATOR) -// ++new_nested_sizes[discr]; -// } -// -// MutableColumns new_variants; -// new_variants.reserve(num_variants); -// for (size_t i = 0; i != num_variants; ++i) -// { -// if (new_nested_sizes[i]) -// new_variants.emplace_back(variants[i]->cloneResized(new_nested_sizes[i])); -// else -// new_variants.emplace_back(variants[i]->cloneEmpty()); -// } -// -// return ColumnVariant::create(local_discriminators->cloneResized(new_size), offsets->cloneResized(new_size), std::move(new_variants), local_to_global_discriminators); - + /// We can find all variants sizes by scanning all new_size local_discriminators and calculating sizes for all new variants. /// But instead we are trying to optimize it using offsets column: /// For all non-empty variants we are trying to find last occurrence of its discriminator in local_discriminators[:new_size] or - /// first occurrence in local_discriminators[new_size:]. The same row in offsets column will contain the desired size (or size - 1) of variant. + /// first occurrence in local_discriminators[new_size:] depending on what range is smaller. The same row in offsets column will + /// contain the desired size (or size - 1) of variant. /// All empty variants will remain empty. - /// Not sure how good this optimization is, feel free to remove it and use simpler version above. + /// Not sure how good this optimization is, feel free to remove it and use simpler version without using offsets. MutableColumns new_variants(num_variants); - std::unordered_set seen_variants; + std::vector seen_variants(num_variants, 0); + size_t number_of_seen_variants = 0; /// First, check which variants are empty. They will remain empty. for (Discriminator i = 0; i != num_variants; ++i) { if (variants[i]->empty()) { - seen_variants.insert(i); + seen_variants[i] = 1; + ++number_of_seen_variants; new_variants[i] = variants[i]->cloneEmpty(); } } - /// Now, iterate through local discriminators using two pointers. - /// First will go from new_size - 1 to 0, second from new_size to size. - /// Finish when we find all variants or hit lower or upper bound. 
- ssize_t i = new_size - 1; - size_t j = new_size; - while (i != -1 && j != size) + /// Now, choose what range is smaller and use it. + /// [0, new_size) + if (2 * new_size <= size) { - Discriminator i_discr = local_discriminators_data[i]; - if (i_discr != NULL_DISCRIMINATOR) + for (ssize_t i = new_size - 1; i > -1; --i) { - auto [_, inserted] = seen_variants.insert(i_discr); - /// If this is the first occurrence of this discriminator, - /// we can get new size for this variant. - if (inserted) + Discriminator discr = local_discriminators_data[i]; + if (discr != NULL_DISCRIMINATOR) { - new_variants[i_discr] = variants[i_discr]->cloneResized(offsets_data[i] + 1); - if (seen_variants.size() == num_variants) - break; + /// If this is the first occurrence of this discriminator, + /// we can get new size for this variant. + if (!seen_variants[discr]) + { + seen_variants[discr] = 1; + ++number_of_seen_variants; + new_variants[discr] = variants[discr]->cloneResized(offsets_data[i] + 1); + /// Break if we found sizes for all variants. + if (number_of_seen_variants == num_variants) + break; + } } } - Discriminator j_discr = local_discriminators_data[j]; - if (j_discr != NULL_DISCRIMINATOR) + /// All variants that weren't found in range [0, new_size] will be empty in the result column. + if (number_of_seen_variants != num_variants) { - auto [_, inserted] = seen_variants.insert(j_discr); - /// If this is the first occurrence of this discriminator, - /// we can get new size for this variant. - if (inserted) - { - new_variants[j_discr] = variants[j_discr]->cloneResized(offsets_data[j]); - if (seen_variants.size() == num_variants) - break; - } - } - - --i; - ++j; - } - - /// We can finish in 3 cases: - /// 1) seen_variants.size() == num_variants - we found local_discriminators of all variants, nothing to do. - /// 2) i == -1 - we scanned all values in local_discriminators[:new_size]. Not found variants doesn't have - /// values in local_discriminators[:new_size], so they should be empty in the resized version. - /// 3) j == size - we scanned all values in local_discriminators[new_size:]. Not found variants doesn't have - /// values in local_discriminators[new_size:], so, we should use the full variant in the resized version. - if (seen_variants.size() != num_variants) - { - for (size_t discr = 0; discr != num_variants; ++discr) - { - if (!seen_variants.contains(discr)) - { - if (i == -1) + for (size_t discr = 0; discr != num_variants; ++discr) + if (!seen_variants[discr]) new_variants[discr] = variants[discr]->cloneEmpty(); - else - new_variants[discr] = IColumn::mutate(variants[discr]); + } + } + /// [new_size, size) + else + { + for (size_t i = new_size; i < size; ++i) + { + Discriminator discr = local_discriminators_data[i]; + if (discr != NULL_DISCRIMINATOR) + { + /// If this is the first occurrence of this discriminator, + /// we can get new size for this variant. + if (!seen_variants[discr]) + { + seen_variants[discr] = 1; + ++number_of_seen_variants; + new_variants[discr] = variants[discr]->cloneResized(offsets_data[i]); + /// Break if we found sizes for all variants. + if (number_of_seen_variants == num_variants) + break; + } } } + + if (number_of_seen_variants != num_variants) + { + /// All variants that weren't found in range [new_size, size) will not change their sizes. 
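+                /// Such a variant's rows all lie in [0, new_size), so it keeps its full column,
+                /// which is why IColumn::mutate is used here instead of cloneResized.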
+ for (size_t discr = 0; discr != num_variants; ++discr) + if (!seen_variants[discr]) + new_variants[discr] = IColumn::mutate(variants[discr]); + } } return ColumnVariant::create(local_discriminators->cloneResized(new_size), offsets->cloneResized(new_size), std::move(new_variants), local_to_global_discriminators); @@ -1261,7 +1237,7 @@ std::optional ColumnVariant::getLocalDiscriminator if (variants[i]->size() == local_discriminators->size()) return i; if (!variants[i]->empty()) - return std::nullopt + return std::nullopt; } return std::nullopt; From 10af0d406fb536917a84d23f4bacba073ea9443e Mon Sep 17 00:00:00 2001 From: Kseniia Sumarokova <54203879+kssenii@users.noreply.github.com> Date: Wed, 10 Jan 2024 16:55:58 +0100 Subject: [PATCH 053/245] Update 02916_broken_projection.sh --- tests/queries/0_stateless/02916_broken_projection.sh | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/tests/queries/0_stateless/02916_broken_projection.sh b/tests/queries/0_stateless/02916_broken_projection.sh index 99e54b08b74..fbd26e59f6f 100755 --- a/tests/queries/0_stateless/02916_broken_projection.sh +++ b/tests/queries/0_stateless/02916_broken_projection.sh @@ -1,5 +1,5 @@ #!/usr/bin/env bash -# Tags: long, no-random-merge-tree-settings, no-random-settings, no-s3-storage +# Tags: long, no-random-merge-tree-settings, no-random-settings, no-s3-storage, no-parallel # shellcheck disable=SC2046 CURDIR=$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd) From 1deaaf5466a2633d58fba87521435491546df0a2 Mon Sep 17 00:00:00 2001 From: Kruglov Pavel <48961922+Avogar@users.noreply.github.com> Date: Thu, 11 Jan 2024 15:20:06 +0100 Subject: [PATCH 054/245] Apply suggestions from code review Co-authored-by: Antonio Andelic --- .../Serializations/SerializationDateTime64.cpp | 6 +++--- src/DataTypes/Serializations/SerializationEnum.h | 5 +---- .../Serializations/SerializationNamed.cpp | 1 - .../Serializations/SerializationTuple.cpp | 15 +++------------ 4 files changed, 7 insertions(+), 20 deletions(-) diff --git a/src/DataTypes/Serializations/SerializationDateTime64.cpp b/src/DataTypes/Serializations/SerializationDateTime64.cpp index a19619bf8d3..442e29edd52 100644 --- a/src/DataTypes/Serializations/SerializationDateTime64.cpp +++ b/src/DataTypes/Serializations/SerializationDateTime64.cpp @@ -50,7 +50,7 @@ void SerializationDateTime64::deserializeText(IColumn & column, ReadBuffer & ist bool SerializationDateTime64::tryDeserializeText(IColumn & column, ReadBuffer & istr, const FormatSettings &, bool whole) const { DateTime64 result = 0; - if (tryReadDateTime64Text(result, scale, istr, time_zone) || (whole && istr.eof())) + if (!tryReadDateTime64Text(result, scale, istr, time_zone) || (whole && !istr.eof())) return false; assert_cast(column).getData().push_back(result); @@ -151,7 +151,7 @@ bool SerializationDateTime64::tryDeserializeTextQuoted(IColumn & column, ReadBuf DateTime64 x = 0; if (checkChar('\'', istr)) /// Cases: '2017-08-31 18:36:48' or '1504193808' { - if (tryReadText(x, scale, istr, settings, time_zone, utc_time_zone) || !checkChar('\'', istr)) + if (!tryReadText(x, scale, istr, settings, time_zone, utc_time_zone) || !checkChar('\'', istr)) return false; } else /// Just 1504193808 or 01504193808 @@ -265,7 +265,7 @@ bool SerializationDateTime64::tryDeserializeTextCSV(IColumn & column, ReadBuffer { if (settings.csv.delimiter != ',' || settings.date_time_input_format == FormatSettings::DateTimeInputFormat::Basic) { - if (tryReadText(x, scale, istr, settings, time_zone, utc_time_zone)) + if (!tryReadText(x, 
scale, istr, settings, time_zone, utc_time_zone)) return false; } else diff --git a/src/DataTypes/Serializations/SerializationEnum.h b/src/DataTypes/Serializations/SerializationEnum.h index 5152a3fbc93..bb720ee9b1f 100644 --- a/src/DataTypes/Serializations/SerializationEnum.h +++ b/src/DataTypes/Serializations/SerializationEnum.h @@ -60,10 +60,7 @@ public: bool tryReadValue(ReadBuffer & istr, FieldType & x) const { - if (!tryReadText(x, istr) || !ref_enum_values.hasValue(x)) - return false; - - return true; + return tryReadText(x, istr) && ref_enum_values.hasValue(x); } std::optional> own_enum_values; diff --git a/src/DataTypes/Serializations/SerializationNamed.cpp b/src/DataTypes/Serializations/SerializationNamed.cpp index 1a9cbe9a37d..ca60948ce68 100644 --- a/src/DataTypes/Serializations/SerializationNamed.cpp +++ b/src/DataTypes/Serializations/SerializationNamed.cpp @@ -1,5 +1,4 @@ #include -#include namespace DB { diff --git a/src/DataTypes/Serializations/SerializationTuple.cpp b/src/DataTypes/Serializations/SerializationTuple.cpp index c0b0658e6b4..79b7fa84242 100644 --- a/src/DataTypes/Serializations/SerializationTuple.cpp +++ b/src/DataTypes/Serializations/SerializationTuple.cpp @@ -212,10 +212,7 @@ ReturnType SerializationTuple::deserializeTextImpl(IColumn & column, ReadBuffer return ReturnType(true); }; - if constexpr (throw_exception) - addElementSafe(elems.size(), column, impl); - else - return addElementSafe(elems.size(), column, impl); + return addElementSafe(elems.size(), column, impl); } void SerializationTuple::deserializeText(DB::IColumn & column, DB::ReadBuffer & istr, const DB::FormatSettings & settings, bool whole) const @@ -457,10 +454,7 @@ ReturnType SerializationTuple::deserializeTextJSONImpl(IColumn & column, ReadBuf return ReturnType(true); }; - if constexpr (throw_exception) - addElementSafe(elems.size(), column, impl); - else - return addElementSafe(elems.size(), column, impl); + return addElementSafe(elems.size(), column, impl); } else { @@ -502,10 +496,7 @@ ReturnType SerializationTuple::deserializeTextJSONImpl(IColumn & column, ReadBuf return ReturnType(true); }; - if constexpr (throw_exception) - addElementSafe(elems.size(), column, impl); - else - return addElementSafe(elems.size(), column, impl); + return addElementSafe(elems.size(), column, impl); } } From f05d89bc2b26206b1b6854ad48dd35840b82a123 Mon Sep 17 00:00:00 2001 From: avogar Date: Thu, 11 Jan 2024 14:48:57 +0000 Subject: [PATCH 055/245] Apply review suggestions --- .../Serializations/ISerialization.cpp | 47 +++--- .../Serializations/SerializationTuple.cpp | 3 + .../Serializations/SerializationVariant.cpp | 135 +++++++++--------- 3 files changed, 101 insertions(+), 84 deletions(-) diff --git a/src/DataTypes/Serializations/ISerialization.cpp b/src/DataTypes/Serializations/ISerialization.cpp index 08575f06f2a..c699b3b0748 100644 --- a/src/DataTypes/Serializations/ISerialization.cpp +++ b/src/DataTypes/Serializations/ISerialization.cpp @@ -176,7 +176,7 @@ String getNameForSubstreamPath( stream_name += "." 
+ it->tuple_element_name; } else if (it->type == Substream::VariantDiscriminators) - stream_name += ".discr"; + stream_name += ".variant_discr"; else if (it->type == Substream::VariantOffsets) stream_name += ".variant_offsets"; else if (it->type == Substream::VariantElement) @@ -261,43 +261,51 @@ bool ISerialization::isSpecialCompressionAllowed(const SubstreamPath & path) return true; } -#define TRY_DESERIALIZE_TEXT(deserialize) \ - size_t prev_size = column.size(); \ - try \ - { \ - deserialize(column, istr, settings); \ - return true; \ - } \ - catch (...) \ - { \ - if (column.size() > prev_size) \ - column.popBack(column.size() - prev_size); \ - return false; \ - } \ +namespace +{ + +template +bool tryDeserializeText(const F deserialize, DB::IColumn & column) +{ + size_t prev_size = column.size(); + try + { + deserialize(column); + return true; + } + catch (...) + { + if (column.size() > prev_size) + column.popBack(column.size() - prev_size); + return false; + } +} + +} bool ISerialization::tryDeserializeTextCSV(DB::IColumn & column, DB::ReadBuffer & istr, const DB::FormatSettings & settings) const { - TRY_DESERIALIZE_TEXT(deserializeTextCSV) + return tryDeserializeText([&](DB::IColumn & my_column) { deserializeTextCSV(my_column, istr, settings); }, column); } bool ISerialization::tryDeserializeTextEscaped(DB::IColumn & column, DB::ReadBuffer & istr, const DB::FormatSettings & settings) const { - TRY_DESERIALIZE_TEXT(deserializeTextEscaped) + return tryDeserializeText([&](DB::IColumn & my_column) { deserializeTextEscaped(my_column, istr, settings); }, column); } bool ISerialization::tryDeserializeTextJSON(DB::IColumn & column, DB::ReadBuffer & istr, const DB::FormatSettings & settings) const { - TRY_DESERIALIZE_TEXT(deserializeTextJSON) + return tryDeserializeText([&](DB::IColumn & my_column) { deserializeTextJSON(my_column, istr, settings); }, column); } bool ISerialization::tryDeserializeTextQuoted(DB::IColumn & column, DB::ReadBuffer & istr, const DB::FormatSettings & settings) const { - TRY_DESERIALIZE_TEXT(deserializeTextQuoted) + return tryDeserializeText([&](DB::IColumn & my_column) { deserializeTextQuoted(my_column, istr, settings); }, column); } bool ISerialization::tryDeserializeWholeText(DB::IColumn & column, DB::ReadBuffer & istr, const DB::FormatSettings & settings) const { - TRY_DESERIALIZE_TEXT(deserializeWholeText) + return tryDeserializeText([&](DB::IColumn & my_column) { deserializeWholeText(my_column, istr, settings); }, column); } void ISerialization::deserializeTextRaw(IColumn & column, ReadBuffer & istr, const FormatSettings & settings) const @@ -346,7 +354,6 @@ bool ISerialization::hasSubcolumnForPath(const SubstreamPath & path, size_t pref return path[last_elem].type == Substream::NullMap || path[last_elem].type == Substream::TupleElement || path[last_elem].type == Substream::ArraySizes - || path[last_elem].type == Substream::VariantDiscriminators || path[last_elem].type == Substream::VariantElement; } diff --git a/src/DataTypes/Serializations/SerializationTuple.cpp b/src/DataTypes/Serializations/SerializationTuple.cpp index 79b7fa84242..c249ee69e46 100644 --- a/src/DataTypes/Serializations/SerializationTuple.cpp +++ b/src/DataTypes/Serializations/SerializationTuple.cpp @@ -76,7 +76,10 @@ static ReturnType addElementSafe(size_t num_elems, IColumn & column, F && impl) { auto & element_column = extractElementColumn(column, i); if (element_column.size() > old_size) + { + chassert(old_size - element_column.size() == 1); element_column.popBack(1); + } } }; diff --git 
a/src/DataTypes/Serializations/SerializationVariant.cpp b/src/DataTypes/Serializations/SerializationVariant.cpp index 9cfc4b9e26f..64fcb63d604 100644 --- a/src/DataTypes/Serializations/SerializationVariant.cpp +++ b/src/DataTypes/Serializations/SerializationVariant.cpp @@ -397,70 +397,76 @@ void SerializationVariant::deserializeBinary(IColumn & column, ReadBuffer & istr namespace { -std::unordered_map getTypesTextDeserializePriorityMap() +const std::unordered_map & getTypesTextDeserializePriorityMap() { - static const std::vector priorities = { - /// Complex types have highest priority. - TypeIndex::Array, - TypeIndex::Tuple, - TypeIndex::Map, - TypeIndex::AggregateFunction, + static std::unordered_map priority_map = [] + { + static constexpr std::array priorities = { + /// Complex types have highest priority. + TypeIndex::Array, + TypeIndex::Tuple, + TypeIndex::Map, + TypeIndex::AggregateFunction, - /// Enums can be parsed both from strings and numbers. - /// So they have high enough priority. - TypeIndex::Enum8, - TypeIndex::Enum16, + /// Enums can be parsed both from strings and numbers. + /// So they have high enough priority. + TypeIndex::Enum8, + TypeIndex::Enum16, - /// Types that can be parsed from strings. - TypeIndex::UUID, - TypeIndex::IPv4, - TypeIndex::IPv6, + /// Types that can be parsed from strings. + TypeIndex::UUID, + TypeIndex::IPv4, + TypeIndex::IPv6, - /// Types that can be parsed from numbers. - /// The order: - /// 1) Integers - /// 2) Big Integers - /// 3) Decimals - /// 4) Floats - /// In each group small types have higher priority. - TypeIndex::Int8, - TypeIndex::UInt8, - TypeIndex::Int16, - TypeIndex::UInt16, - TypeIndex::Int32, - TypeIndex::UInt32, - TypeIndex::Int64, - TypeIndex::UInt64, - TypeIndex::Int128, - TypeIndex::UInt128, - TypeIndex::Int256, - TypeIndex::UInt256, - TypeIndex::Decimal32, - TypeIndex::Decimal64, - TypeIndex::Decimal128, - TypeIndex::Decimal256, - TypeIndex::Float32, - TypeIndex::Float64, + /// Types that can be parsed from numbers. + /// The order: + /// 1) Integers + /// 2) Big Integers + /// 3) Decimals + /// 4) Floats + /// In each group small types have higher priority. + TypeIndex::Int8, + TypeIndex::UInt8, + TypeIndex::Int16, + TypeIndex::UInt16, + TypeIndex::Int32, + TypeIndex::UInt32, + TypeIndex::Int64, + TypeIndex::UInt64, + TypeIndex::Int128, + TypeIndex::UInt128, + TypeIndex::Int256, + TypeIndex::UInt256, + TypeIndex::Decimal32, + TypeIndex::Decimal64, + TypeIndex::Decimal128, + TypeIndex::Decimal256, + TypeIndex::Float32, + TypeIndex::Float64, - /// Dates and DateTimes. More simple Date types have higher priority. - /// They have lower priority as numbers as some DateTimes sometimes can - /// be also parsed from numbers, but we don't want it usually. - TypeIndex::Date, - TypeIndex::Date32, - TypeIndex::DateTime, - TypeIndex::DateTime64, + /// Dates and DateTimes. More simple Date types have higher priority. + /// They have lower priority as numbers as some DateTimes sometimes can + /// be also parsed from numbers, but we don't want it usually. + TypeIndex::Date, + TypeIndex::Date32, + TypeIndex::DateTime, + TypeIndex::DateTime64, - /// String types have almost the lowest priority, - /// as in text formats almost all data can - /// be deserialized into String type. - TypeIndex::FixedString, - TypeIndex::String, - }; + /// String types have almost the lowest priority, + /// as in text formats almost all data can + /// be deserialized into String type. 
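+            /// For example, the input "42" is claimed by the numeric variants above,
+            /// while arbitrary text usually falls through to the String variants here.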
+ TypeIndex::FixedString, + TypeIndex::String, + }; + + std::unordered_map pm; + + pm.reserve(priorities.size()); + for (size_t i = 0; i != priorities.size(); ++i) + pm[priorities[i]] = priorities.size() - i; + return pm; + }(); - std::unordered_map priority_map; - priority_map.reserve(priorities.size()); - for (size_t i = 0; i != priorities.size(); ++i) - priority_map[priorities[i]] = priorities.size() - i; return priority_map; } @@ -476,7 +482,7 @@ std::unordered_map getTypesTextDeserializePriorityMap() /// so if we have types with the same level of nesting and the same priority, we will first try to deserialize LowCardinality/Nullable types /// (for example if we have types Array(Array(String)) and Array(Array(Nullable(String))). /// This is just a batch of heuristics. -std::tuple getTypeTextDeserializePriority(const DataTypePtr & type, size_t nested_depth, size_t simple_nested_depth, std::unordered_map & priority_map) +std::tuple getTypeTextDeserializePriority(const DataTypePtr & type, size_t nested_depth, size_t simple_nested_depth, const std::unordered_map & priority_map) { if (const auto * nullable_type = typeid_cast(type.get())) return getTypeTextDeserializePriority(nullable_type->getNestedType(), nested_depth, simple_nested_depth + 1, priority_map); @@ -487,7 +493,7 @@ std::tuple getTypeTextDeserializePriority(const DataType if (const auto * array_type = typeid_cast(type.get())) { auto [elements_nested_depth, elements_priority, elements_simple_nested_depth] = getTypeTextDeserializePriority(array_type->getNestedType(), nested_depth + 1, simple_nested_depth, priority_map); - return {elements_nested_depth, elements_priority + priority_map[TypeIndex::Array], elements_simple_nested_depth}; + return {elements_nested_depth, elements_priority + priority_map.at(TypeIndex::Array), elements_simple_nested_depth}; } if (const auto * tuple_type = typeid_cast(type.get())) @@ -505,14 +511,14 @@ std::tuple getTypeTextDeserializePriority(const DataType max_simple_nested_depth = elem_simple_nested_depth; } - return {max_nested_depth, sum_priority + priority_map[TypeIndex::Tuple], max_simple_nested_depth}; + return {max_nested_depth, sum_priority + priority_map.at(TypeIndex::Tuple), max_simple_nested_depth}; } if (const auto * map_type = typeid_cast(type.get())) { auto [key_max_depth, key_priority, key_simple_nested_depth] = getTypeTextDeserializePriority(map_type->getKeyType(), nested_depth + 1, simple_nested_depth, priority_map); auto [value_max_depth, value_priority, value_simple_nested_depth] = getTypeTextDeserializePriority(map_type->getValueType(), nested_depth + 1, simple_nested_depth, priority_map); - return {std::max(key_max_depth, value_max_depth), key_priority + value_priority + priority_map[TypeIndex::Map], std::max(key_simple_nested_depth, value_simple_nested_depth)}; + return {std::max(key_max_depth, value_max_depth), key_priority + value_priority + priority_map.at(TypeIndex::Map), std::max(key_simple_nested_depth, value_simple_nested_depth)}; } if (const auto * variant_type = typeid_cast(type.get())) @@ -536,9 +542,10 @@ std::tuple getTypeTextDeserializePriority(const DataType /// Bool type should have priority higher then all integers. 
if (isBool(type)) - return {nested_depth, priority_map[TypeIndex::Int8] + 1, simple_nested_depth}; + return {nested_depth, priority_map.at(TypeIndex::Int8) + 1, simple_nested_depth}; - return {nested_depth, priority_map[type->getTypeId()], simple_nested_depth}; + auto it = priority_map.find(type->getTypeId()); + return {nested_depth, it == priority_map.end() ? 0 : it->second, simple_nested_depth}; } } @@ -549,7 +556,7 @@ std::vector SerializationVariant::getVariantsDeserializeTextOrder(const priorities.reserve(variant_types.size()); std::vector order; order.reserve(variant_types.size()); - auto priority_map = getTypesTextDeserializePriorityMap(); + const auto & priority_map = getTypesTextDeserializePriorityMap(); for (size_t i = 0; i != variant_types.size(); ++i) { priorities.push_back(getTypeTextDeserializePriority(variant_types[i], 0, 0, priority_map)); From 9e639df12e69c7373e400115977c432b8fdf31f2 Mon Sep 17 00:00:00 2001 From: avogar Date: Thu, 11 Jan 2024 18:44:05 +0000 Subject: [PATCH 056/245] Add fixes, add new mode to getLeastSupertype and use it in if/multiIf --- .../Serializations/SerializationTuple.cpp | 2 +- src/DataTypes/getLeastSupertype.cpp | 73 ++++++++++++++++--- src/DataTypes/getLeastSupertype.h | 12 +++ src/Functions/if.cpp | 14 +--- src/Functions/multiIf.cpp | 8 +- ...940_variant_text_deserialization.reference | 2 +- 6 files changed, 79 insertions(+), 32 deletions(-) diff --git a/src/DataTypes/Serializations/SerializationTuple.cpp b/src/DataTypes/Serializations/SerializationTuple.cpp index c249ee69e46..5d8c84b70bf 100644 --- a/src/DataTypes/Serializations/SerializationTuple.cpp +++ b/src/DataTypes/Serializations/SerializationTuple.cpp @@ -77,7 +77,7 @@ static ReturnType addElementSafe(size_t num_elems, IColumn & column, F && impl) auto & element_column = extractElementColumn(column, i); if (element_column.size() > old_size) { - chassert(old_size - element_column.size() == 1); + chassert(element_column.size() - old_size == 1); element_column.popBack(1); } } diff --git a/src/DataTypes/getLeastSupertype.cpp b/src/DataTypes/getLeastSupertype.cpp index e5bdb4b267f..5d67f888c4b 100644 --- a/src/DataTypes/getLeastSupertype.cpp +++ b/src/DataTypes/getLeastSupertype.cpp @@ -18,6 +18,7 @@ #include #include #include +#include namespace DB @@ -58,6 +59,25 @@ DataTypePtr throwOrReturn(const DataTypes & types, std::string_view message_suff if constexpr (on_error == LeastSupertypeOnError::String) return std::make_shared(); + if constexpr (on_error == LeastSupertypeOnError::Variant && std::is_same_v>) + { + DataTypes variants; + for (const auto & type : types) + { + if (isVariant(type)) + { + const DataTypes & nested_variants = assert_cast(*type).getVariants(); + variants.insert(variants.end(), nested_variants.begin(), nested_variants.end()); + } + else + { + variants.push_back(removeNullableOrLowCardinalityNullable(type)); + } + } + + return std::make_shared(variants); + } + if constexpr (on_error == LeastSupertypeOnError::Null) return nullptr; @@ -67,8 +87,8 @@ DataTypePtr throwOrReturn(const DataTypes & types, std::string_view message_suff throw Exception(error_code, "There is no supertype for types {} {}", getExceptionMessagePrefix(types), message_suffix); } -template -DataTypePtr getNumericType(const TypeIndexSet & types) +template +DataTypePtr getNumericType(const TypeIndexSet & types, ThrowOrReturnFunc throwOrReturnFunc) { bool all_numbers = true; @@ -119,7 +139,7 @@ DataTypePtr getNumericType(const TypeIndexSet & types) if (max_bits_of_signed_integer || 
max_bits_of_unsigned_integer || max_mantissa_bits_of_floating) { if (!all_numbers) - return throwOrReturn(types, "because some of them are numbers and some of them are not", ErrorCodes::NO_COMMON_TYPE); + return throwOrReturnFunc(types, "because some of them are numbers and some of them are not", ErrorCodes::NO_COMMON_TYPE); /// If there are signed and unsigned types of same bit-width, the result must be signed number with at least one more bit. /// Example, common of Int32, UInt32 = Int64. @@ -134,7 +154,7 @@ DataTypePtr getNumericType(const TypeIndexSet & types) if (min_bit_width_of_integer != 64) ++min_bit_width_of_integer; else - return throwOrReturn(types, + return throwOrReturnFunc(types, "because some of them are signed integers and some are unsigned integers," " but there is no signed integer type, that can exactly represent all required unsigned integer values", ErrorCodes::NO_COMMON_TYPE); @@ -149,7 +169,7 @@ DataTypePtr getNumericType(const TypeIndexSet & types) else if (min_mantissa_bits <= 53) return std::make_shared(); else - return throwOrReturn(types, + return throwOrReturnFunc(types, " because some of them are integers and some are floating point," " but there is no floating point type, that can exactly represent all required integers", ErrorCodes::NO_COMMON_TYPE); } @@ -170,7 +190,7 @@ DataTypePtr getNumericType(const TypeIndexSet & types) else if (min_bit_width_of_integer <= 256) return std::make_shared(); else - return throwOrReturn(types, + return throwOrReturnFunc(types, " because some of them are signed integers and some are unsigned integers," " but there is no signed integer type, that can exactly represent all required unsigned integer values", ErrorCodes::NO_COMMON_TYPE); } @@ -190,7 +210,7 @@ DataTypePtr getNumericType(const TypeIndexSet & types) else if (min_bit_width_of_integer <= 256) return std::make_shared(); else - return throwOrReturn(types, + return throwOrReturnFunc(types, " but as all data types are unsigned integers, we must have found maximum unsigned integer type", ErrorCodes::NO_COMMON_TYPE); } } @@ -382,7 +402,18 @@ DataTypePtr getLeastSupertype(const DataTypes & types) if (!all_maps) return throwOrReturn(types, "because some of them are Maps and some of them are not", ErrorCodes::NO_COMMON_TYPE); - auto keys_common_type = getLeastSupertype(key_types); + DataTypePtr keys_common_type; + if constexpr (on_error == LeastSupertypeOnError::Variant) + { + keys_common_type = getLeastSupertype(key_types); + if (!keys_common_type) + return throwOrReturn(types, "", ErrorCodes::NO_COMMON_TYPE); + } + else + { + keys_common_type = getLeastSupertype(key_types); + } + auto values_common_type = getLeastSupertype(value_types); /// When on_error == LeastSupertypeOnError::Null and we cannot get least supertype for keys or values, /// keys_common_type or values_common_type will be nullptr, we should return nullptr in this case. @@ -423,7 +454,18 @@ DataTypePtr getLeastSupertype(const DataTypes & types) return getLeastSupertype(nested_types); else { - auto nested_type = getLeastSupertype(nested_types); + DataTypePtr nested_type; + if constexpr (on_error == LeastSupertypeOnError::Variant) + { + nested_type = getLeastSupertype(nested_types); + if (!nested_type) + return throwOrReturn(types, "", ErrorCodes::NO_COMMON_TYPE); + } + else + { + nested_type = getLeastSupertype(nested_types); + } + /// When on_error == LeastSupertypeOnError::Null and we cannot get least supertype, /// nested_type will be nullptr, we should return nullptr in this case. 
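+        /// e.g. in Variant mode, Array(UInt64) and Array(String) produce
+        /// Array(Variant(UInt64, String)) (see the examples in getLeastSupertype.h).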
if (!nested_type) @@ -456,6 +498,8 @@ DataTypePtr getLeastSupertype(const DataTypes & types) if (have_nullable) { auto nested_type = getLeastSupertype(nested_types); + if (isVariant(nested_type)) + return nested_type; /// When on_error == LeastSupertypeOnError::Null and we cannot get least supertype, /// nested_type will be nullptr, we should return nullptr in this case. if (!nested_type) @@ -623,7 +667,8 @@ DataTypePtr getLeastSupertype(const DataTypes & types) { /// First, if we have signed integers, try to convert all UInt64 to Int64 if possible. convertUInt64toInt64IfPossible(types, type_ids); - auto numeric_type = getNumericType(type_ids); + auto throw_or_return = [&](const TypeIndexSet &, std::string_view message_suffix, int error_code){ return throwOrReturn(types, message_suffix, error_code); }; + auto numeric_type = getNumericType(type_ids, throw_or_return); if (numeric_type) return numeric_type; } @@ -637,6 +682,11 @@ DataTypePtr getLeastSupertypeOrString(const DataTypes & types) return getLeastSupertype(types); } +DataTypePtr getLeastSupertypeOrVariant(const DataTypes & types) +{ + return getLeastSupertype(types); +} + DataTypePtr tryGetLeastSupertype(const DataTypes & types) { return getLeastSupertype(types); @@ -676,7 +726,8 @@ DataTypePtr getLeastSupertype(const TypeIndexSet & types) return std::make_shared(); } - auto numeric_type = getNumericType(types); + auto throw_or_return = [](const TypeIndexSet & type_ids, std::string_view message_suffix, int error_code){ return throwOrReturn(type_ids, message_suffix, error_code); }; + auto numeric_type = getNumericType(types, throw_or_return); if (numeric_type) return numeric_type; diff --git a/src/DataTypes/getLeastSupertype.h b/src/DataTypes/getLeastSupertype.h index 2ef4a0e6850..d949fad69c5 100644 --- a/src/DataTypes/getLeastSupertype.h +++ b/src/DataTypes/getLeastSupertype.h @@ -8,6 +8,7 @@ enum class LeastSupertypeOnError { Throw, String, + Variant, Null, }; @@ -24,6 +25,17 @@ DataTypePtr getLeastSupertype(const DataTypes & types); /// All types can be casted to String, because they can be serialized to String. DataTypePtr getLeastSupertypeOrString(const DataTypes & types); +/// Same as getLeastSupertype but in case when there is no supertype for some types +/// it uses Variant of these types as a supertype. Any type can be casted to a Variant +/// that contains this type. +/// As nested Variants are not allowed, if one of the types is Variant, it's variants +/// are used in the resulting Variant. +/// Examples: +/// (UInt64, String) -> Variant(UInt64, String) +/// (Array(UInt64), Array(String)) -> Array(Variant(UInt64, String)) +/// (Variant(UInt64, String), Array(UInt32)) -> Variant(UInt64, String, Array(UInt32)) +DataTypePtr getLeastSupertypeOrVariant(const DataTypes & types); + /// Same as above but return nullptr instead of throwing exception. 
DataTypePtr tryGetLeastSupertype(const DataTypes & types); diff --git a/src/Functions/if.cpp b/src/Functions/if.cpp index 1dc7443f124..c247938f885 100644 --- a/src/Functions/if.cpp +++ b/src/Functions/if.cpp @@ -688,15 +688,9 @@ private: DataTypePtr common_type; if (use_variant_when_no_common_type) - { - common_type = tryGetLeastSupertype(DataTypes{arg1.type, arg2.type}); - if (!common_type) - common_type = std::make_shared(DataTypes{removeNullableOrLowCardinalityNullable(arg1.type), removeNullableOrLowCardinalityNullable(arg2.type)}); - } + common_type = getLeastSupertypeOrVariant(DataTypes{arg1.type, arg2.type}); else - { common_type = getLeastSupertype(DataTypes{arg1.type, arg2.type}); - } ColumnPtr col_then = castColumn(arg1, common_type); ColumnPtr col_else = castColumn(arg2, common_type); @@ -1118,11 +1112,7 @@ public: "Must be UInt8.", arguments[0]->getName()); if (use_variant_when_no_common_type) - { - if (auto res = tryGetLeastSupertype(DataTypes{arguments[1], arguments[2]})) - return res; - return std::make_shared(DataTypes{removeNullableOrLowCardinalityNullable(arguments[1]), removeNullableOrLowCardinalityNullable(arguments[2])}); - } + return getLeastSupertypeOrVariant(DataTypes{arguments[1], arguments[2]}); return getLeastSupertype(DataTypes{arguments[1], arguments[2]}); } diff --git a/src/Functions/multiIf.cpp b/src/Functions/multiIf.cpp index 7a2e9444b2c..cefbea9f352 100644 --- a/src/Functions/multiIf.cpp +++ b/src/Functions/multiIf.cpp @@ -119,13 +119,7 @@ public: }); if (context->getSettingsRef().allow_experimental_variant_type && context->getSettingsRef().use_variant_when_no_common_type_in_if) - { - if (auto res = tryGetLeastSupertype(types_of_branches)) - return res; - for (auto & type : types_of_branches) - type = removeNullableOrLowCardinalityNullable(type); - return std::make_shared(types_of_branches); - } + return getLeastSupertypeOrVariant(types_of_branches); return getLeastSupertype(types_of_branches); } diff --git a/tests/queries/0_stateless/02940_variant_text_deserialization.reference b/tests/queries/0_stateless/02940_variant_text_deserialization.reference index 98725917567..8836e6c4e57 100644 --- a/tests/queries/0_stateless/02940_variant_text_deserialization.reference +++ b/tests/queries/0_stateless/02940_variant_text_deserialization.reference @@ -505,7 +505,7 @@ String (NULL,NULL),('string',NULL),(-1,-1),(0,0),(10000000000,NULL)(NULL,NULL),('string',NULL),(-1,NULL),(0,0),(10000000000,NULL)(NULL,NULL),('string',NULL),(-1,-1),(0,0),(10000000000,NULL)(NULL,NULL),('string',NULL),(-1,NULL),(0,0),(10000000000,NULL)(NULL,NULL),('string',NULL),(-1,-1),(0,0),(10000000000,NULL)(NULL,NULL),('string',NULL),(-1,NULL),(0,0),(10000000000,NULL)(NULL,NULL),('string',NULL),(-1,-1),(0,0),(10000000000000000000000,NULL)(NULL,NULL),('string',NULL),(-1,NULL),(0,0),(10000000000000000000000,NULL)(NULL,NULL),('string',NULL),(-1,-1),(0,0)(NULL,NULL),('string',NULL),(-1,NULL),(0,0)Floats (NULL,NULL),('string',NULL),(42.42,42.42)(NULL,NULL),('string',NULL),(42.42,42.42)Decimals (NULL,NULL),('string',NULL),(42.42,42.42)(NULL,NULL),('string',NULL),(42.42,42.42)(NULL,NULL),('string',NULL),(42.42,42.42)(NULL,NULL),('string',NULL),(42.42,42.42)Dates and DateTimes -(NULL,NULL),('string',NULL),('1970-01-01 00:00:00.000',NULL),('2020-01-01','2020-01-01'),('2020-01-01 00:00:00.999',NULL)(NULL,NULL),('string',NULL),('1970-01-01 00:00:00.000',NULL),('1900-01-01','1900-01-01'),('2020-01-01 00:00:00.999',NULL)(NULL,NULL),('string',NULL),('1970-01-01 00:00:00.000',NULL),('2020-01-01 
00:00:00','2020-01-01 00:00:00'),('2020-01-01 00:00:00.999',NULL)(NULL,NULL),('string',NULL),('1970-01-01 00:00:00.000','1970-01-01 00:00:00.000'),('2020-01-01 00:00:00.999',NULL),('2020-01-01 00:00:00.999999999 ABC',NULL)UUID +(NULL,NULL),('string',NULL),('2020-01-d1',NULL),('2020-01-01','2020-01-01'),('2020-01-01 00:00:00.999',NULL)(NULL,NULL),('string',NULL),('2020-01-d1',NULL),('1900-01-01','1900-01-01'),('2020-01-01 00:00:00.999',NULL)(NULL,NULL),('string',NULL),('2020-01-d1',NULL),('2020-01-01 00:00:00','2020-01-01 00:00:00'),('2020-01-01 00:00:00.999',NULL)(NULL,NULL),('string',NULL),('2020-01-d1',NULL),('2020-01-01 00:00:00.999','2020-01-01 00:00:00.999'),('2020-01-01 00:00:00.999999999 ABC',NULL)UUID (NULL,NULL),('string',NULL),('c8619cca-0caa-445e-ae76-1d4f6e0b3927','c8619cca-0caa-445e-ae76-1d4f6e0b3927'),('c8619cca-0caa-445e-ae76-1d4f6e0b3927AAA',NULL)IPv4 (NULL,NULL),('string',NULL),('127.0.0.1','127.0.0.1'),('127.0.0.1AAA',NULL)IPv6 (NULL,NULL),('string',NULL),('2001:db8:85a3::8a2e:370:7334','2001:db8:85a3::8a2e:370:7334'),('2001:0db8:85a3:0000:0000:8a2e:0370:7334AAA',NULL)Enum From 3eba7678057df92e8a7f91912863843d377eecd4 Mon Sep 17 00:00:00 2001 From: yariks5s Date: Mon, 15 Jan 2024 19:17:13 +0000 Subject: [PATCH 057/245] init --- src/Interpreters/InterpreterCreateQuery.cpp | 10 ++++++++-- .../02973_dictionary_table_exception_fix.reference | 0 .../02973_dictionary_table_exception_fix.sql | 6 ++++++ 3 files changed, 14 insertions(+), 2 deletions(-) create mode 100644 tests/queries/0_stateless/02973_dictionary_table_exception_fix.reference create mode 100644 tests/queries/0_stateless/02973_dictionary_table_exception_fix.sql diff --git a/src/Interpreters/InterpreterCreateQuery.cpp b/src/Interpreters/InterpreterCreateQuery.cpp index 36e864ace26..c00f58de59a 100644 --- a/src/Interpreters/InterpreterCreateQuery.cpp +++ b/src/Interpreters/InterpreterCreateQuery.cpp @@ -1405,8 +1405,14 @@ bool InterpreterCreateQuery::doCreateTable(ASTCreateQuery & create, interpreter.execute(); } else - throw Exception(storage_already_exists_error_code, - "{} {}.{} already exists", storage_name, backQuoteIfNeed(create.getDatabase()), backQuoteIfNeed(create.getTable())); + { + if (database->getTable(create.getTable(), getContext())->isDictionary()) + throw Exception(ErrorCodes::DICTIONARY_ALREADY_EXISTS, + "Dictionary {}.{} already exists", backQuoteIfNeed(create.getDatabase()), backQuoteIfNeed(create.getTable())); + else + throw Exception(ErrorCodes::TABLE_ALREADY_EXISTS, + "Table {}.{} already exists", backQuoteIfNeed(create.getDatabase()), backQuoteIfNeed(create.getTable())); + } } else if (!create.attach) { diff --git a/tests/queries/0_stateless/02973_dictionary_table_exception_fix.reference b/tests/queries/0_stateless/02973_dictionary_table_exception_fix.reference new file mode 100644 index 00000000000..e69de29bb2d diff --git a/tests/queries/0_stateless/02973_dictionary_table_exception_fix.sql b/tests/queries/0_stateless/02973_dictionary_table_exception_fix.sql new file mode 100644 index 00000000000..f8061b42670 --- /dev/null +++ b/tests/queries/0_stateless/02973_dictionary_table_exception_fix.sql @@ -0,0 +1,6 @@ +CREATE TABLE test_table (i Int64) engine=MergeTree order by i; +CREATE DICTIONARY test_dict (y String, value UInt64 DEFAULT 0) PRIMARY KEY y SOURCE(CLICKHOUSE(TABLE 'test_table')) LAYOUT(DIRECT()); +CREATE TABLE test_dict (y Int64) engine=MergeTree order by y; -- { serverError DICTIONARY_ALREADY_EXISTS } +CREATE DICTIONARY test_table (y String, value UInt64 DEFAULT 0) PRIMARY KEY y 
SOURCE(CLICKHOUSE(TABLE 'test_table')) LAYOUT(DIRECT()); -- { serverError TABLE_ALREADY_EXISTS }
+CREATE DICTIONARY test_dict (y String, value UInt64 DEFAULT 0) PRIMARY KEY y SOURCE(CLICKHOUSE(TABLE 'test_table')) LAYOUT(DIRECT()); -- { serverError DICTIONARY_ALREADY_EXISTS }
+CREATE TABLE test_table (y Int64) engine=MergeTree order by y; -- { serverError TABLE_ALREADY_EXISTS }

From 7bc6a858c7778911a51e4c2430125f9c3741a535 Mon Sep 17 00:00:00 2001
From: kssenii
Date: Tue, 16 Jan 2024 19:44:55 +0100
Subject: [PATCH 058/245] Rewrite bash test to integration test

---
 .../test_broken_projections/__init__.py       |   0
 .../config.d/backups.xml                      |  13 +
 .../test_broken_projections/test.py           | 492 +++++++++++++++++
 .../02916_broken_projection.reference         | 322 -----------
 .../0_stateless/02916_broken_projection.sh    | 515 ------------------
 5 files changed, 505 insertions(+), 837 deletions(-)
 create mode 100644 tests/integration/test_broken_projections/__init__.py
 create mode 100644 tests/integration/test_broken_projections/config.d/backups.xml
 create mode 100644 tests/integration/test_broken_projections/test.py
 delete mode 100644 tests/queries/0_stateless/02916_broken_projection.reference
 delete mode 100755 tests/queries/0_stateless/02916_broken_projection.sh

diff --git a/tests/integration/test_broken_projections/__init__.py b/tests/integration/test_broken_projections/__init__.py
new file mode 100644
index 00000000000..e69de29bb2d
diff --git a/tests/integration/test_broken_projections/config.d/backups.xml b/tests/integration/test_broken_projections/config.d/backups.xml
new file mode 100644
index 00000000000..4da8edffd67
--- /dev/null
+++ b/tests/integration/test_broken_projections/config.d/backups.xml
@@ -0,0 +1,13 @@
+<clickhouse>
+    <storage_configuration>
+        <disks>
+            <backups>
+                <type>local</type>
+                <path>/var/lib/clickhouse/disks/backups/</path>
+            </backups>
+        </disks>
+    </storage_configuration>
+    <backups>
+        <allowed_disk>backups</allowed_disk>
+    </backups>
+</clickhouse>
diff --git a/tests/integration/test_broken_projections/test.py b/tests/integration/test_broken_projections/test.py
new file mode 100644
index 00000000000..ca1a29817a5
--- /dev/null
+++ b/tests/integration/test_broken_projections/test.py
@@ -0,0 +1,492 @@
+import time
+import pytest
+import logging
+import string
+import random
+from helpers.cluster import ClickHouseCluster
+
+cluster = ClickHouseCluster(__file__)
+
+
+@pytest.fixture(scope="module")
+def cluster():
+    try:
+        cluster = ClickHouseCluster(__file__)
+        cluster.add_instance(
+            "node",
+            main_configs=["config.d/backups.xml"],
+            stay_alive=True,
+            with_zookeeper=True,
+        )
+
+        logging.info("Starting cluster...")
+        cluster.start()
+        logging.info("Cluster started")
+
+        yield cluster
+    finally:
+        cluster.shutdown()
+
+
+def create_table(node, table, replica, data_prefix = ""):
+    if data_prefix == "":
+        data_prefix = table
+
+    node.query(
+        f"""
+        DROP TABLE IF EXISTS {table} SYNC;
+        CREATE TABLE {table}
+        (
+            a String,
+            b String,
+            c Int64,
+            d Int64,
+            e Int64,
+
+            PROJECTION proj
+            (
+                SELECT c ORDER BY d
+            ),
+            PROJECTION proj_2
+            (
+                SELECT d ORDER BY c
+            )
+        )
+        ENGINE = ReplicatedMergeTree('/test_broken_projection_{data_prefix}/data/', '{replica}') ORDER BY a
+        SETTINGS min_bytes_for_wide_part = 0,
+            max_parts_to_merge_at_once=3,
+            enable_vertical_merge_algorithm=1,
+            vertical_merge_algorithm_min_rows_to_activate = 1,
+            vertical_merge_algorithm_min_columns_to_activate = 1,
+            vertical_merge_algorithm_min_columns_to_activate = 1,
+            compress_primary_key=0;
+        """
+    )
+
+
+def insert(node, table, offset, size):
+    node.query(
+        f"""
+        INSERT INTO {table}
+        SELECT number, number, number, number, number%2 FROM numbers({offset}, {size})
+        SETTINGS 
insert_keeper_fault_injection_probability=0.0;
+        """
+    )
+
+
+def get_parts(node, table):
+    return (
+        node.query(
+            f"""
+            SELECT name
+            FROM system.parts
+            WHERE table='{table}' AND database=currentDatabase() AND active = 1
+            ORDER BY name;
+            """
+        )
+        .strip()
+        .split("\n")
+    )
+
+
+def bash(node, command):
+    node.exec_in_container(["bash", "-c", command], privileged=True, user="root")
+
+
+def break_projection(node, table, part, parent_part, break_type):
+    part_path = node.query(
+        f"""
+        SELECT path
+        FROM system.projection_parts
+        WHERE table='{table}'
+        AND database=currentDatabase()
+        AND active=1
+        AND part_name='{part}'
+        AND parent_name='{parent_part}'
+        ORDER BY modification_time DESC
+        LIMIT 1;
+        """
+    ).strip()
+
+    node.query(
+        f"select throwIf(substring('{part_path}', 1, 1) != '/', 'Path is relative: {part_path}')"
+    )
+
+    if break_type == "data":
+        bash(node, f"rm '{part_path}/d.bin'")
+        bash(node, f"rm '{part_path}/c.bin'")
+    elif break_type == "metadata":
+        bash(node, f"rm '{part_path}/columns.txt'")
+    elif break_type == "part":
+        bash(node, f"rm -r '{part_path}'")
+
+
+def break_part(node, table, part):
+    part_path = node.query(
+        f"""
+        SELECT path
+        FROM system.parts
+        WHERE table='{table}'
+        AND database=currentDatabase()
+        AND active=1
+        AND part_name='{part}'
+        ORDER BY modification_time DESC
+        LIMIT 1;
+        """
+    ).strip()
+
+    node.query(
+        f"select throwIf(substring('{part_path}', 1, 1) != '/', 'Path is relative: {part_path}')"
+    )
+    bash(node, f"rm '{part_path}/columns.txt'")
+
+
+def get_broken_projections_info(node, table):
+    return node.query(
+        f"""
+        SELECT parent_name, name, errors.name FROM
+        (
+            SELECT parent_name, name, exception_code
+            FROM system.projection_parts
+            WHERE table='{table}'
+            AND database=currentDatabase()
+            AND is_broken = 1
+        ) AS parts_info
+        INNER JOIN system.errors AS errors
+        ON parts_info.exception_code = errors.code
+        ORDER BY parent_name, name
+        """
+    ).strip()
+
+
+def optimize(node, table, final, no_wait):
+    query = f"OPTIMIZE TABLE {table}"
+    if final:
+        query += " FINAL"
+    if no_wait:
+        query += " SETTINGS alter_sync=0"
+    node.query(query)
+
+
+def reattach(node, table):
+    node.query(
+        f"""
+        DETACH TABLE {table};
+        ATTACH TABLE {table};
+        """
+    )
+
+
+def materialize_projection(node, table, proj):
+    node.query(
+        f"ALTER TABLE {table} MATERIALIZE PROJECTION {proj} SETTINGS mutations_sync=2"
+    )
+
+
+def check_table_full(node, table):
+    return node.query(
+        f"CHECK TABLE {table} SETTINGS check_query_single_value_result = 0;"
+    ).strip()
+
+
+def random_str(length=6):
+    alphabet = string.ascii_lowercase + string.digits
+    return "".join(random.SystemRandom().choice(alphabet) for _ in range(length))
+
+
+def check(node, table, check_result, expect_broken_part="", expected_error=""):
+    query_id = random_str()
+
+    if expect_broken_part == "proj":
+        assert expected_error in node.query_and_get_error(
+            f"SELECT c FROM '{table}' WHERE d == 12 ORDER BY c"
+        )
+    else:
+        node.query(
+            f"SELECT c FROM '{table}' WHERE d == 12 OR d == 16 ORDER BY c",
+            query_id=query_id,
+        )
+        assert "proj" in node.query(
+            f"""
+            SYSTEM FLUSH LOGS;
+            SELECT query, splitByChar('.', arrayJoin(projections))[-1]
+            FROM system.query_log
+            WHERE current_database=currentDatabase() AND query_id='{query_id}' AND type='QueryFinish'
+            """
+        )
+
+    query_id = random_str()
+
+    if expect_broken_part == "proj_2":
+        assert expected_error in node.query_and_get_error(
+            f"SELECT d FROM '{table}' WHERE c == 12 ORDER BY d"
+        )
+    else:
+        node.query(
+            f"SELECT d FROM '{table}' 
WHERE c == 12 OR c == 16 ORDER BY d",
+            query_id=query_id,
+        )
+        assert "proj" in node.query(
+            f"""
+            SYSTEM FLUSH LOGS;
+            SELECT query, splitByChar('.', arrayJoin(projections))[-1]
+            FROM system.query_log
+            WHERE current_database=currentDatabase() AND query_id='{query_id}' AND type='QueryFinish'
+            """
+        )
+
+    assert check_result == int(node.query(f"CHECK TABLE {table}"))
+
+
+def test_broken_ignored(cluster):
+    node = cluster.instances["node"]
+
+    table_name = "test1"
+    create_table(node, table_name, 1)
+
+    insert(node, table_name, 0, 5)
+    insert(node, table_name, 5, 5)
+    insert(node, table_name, 10, 5)
+    insert(node, table_name, 15, 5)
+
+    assert ["all_0_0_0", "all_1_1_0", "all_2_2_0", "all_3_3_0"] == get_parts(
+        node, table_name
+    )
+
+    # Break the metadata (columns.txt) file of projection 'proj'.
+    break_projection(node, table_name, "proj", "all_2_2_0", "metadata")
+
+    # Run a select and then a "check table" query.
+    # The select works because it does not read columns.txt,
+    # but we expect the check table result to be 0.
+    check(node, table_name, 0)
+
+    # Projection 'proj' from part all_2_2_0 will now appear in broken parts info
+    # because it was marked broken during the "check table" query.
+    assert "all_2_2_0\tproj\tFILE_DOESNT_EXIST" in get_broken_projections_info(
+        node, table_name
+    )
+
+    # The check table query will also show a list of parts which have broken projections.
+    assert "all_2_2_0" in check_table_full(node, table_name)
+
+    # Break the data file of projection 'proj_2' for part all_2_2_0.
+    break_projection(node, table_name, "proj_2", "all_2_2_0", "data")
+
+    # It will not yet appear in broken projections info.
+    assert "proj_2" not in get_broken_projections_info(node, table_name)
+
+    # The select now fails with the error "File doesn't exist".
+    check(node, table_name, 0, "proj_2", "FILE_DOESNT_EXIST")
+
+    # Projection 'proj_2' from part all_2_2_0 will now appear in broken parts info.
+    assert "all_2_2_0\tproj_2\tNO_FILE_IN_DATA_PART" in get_broken_projections_info(
+        node, table_name
+    )
+
+    # A second select works, because the projection is now marked as broken.
+    check(node, table_name, 0)
+
+    # Break the data file of projection 'proj_2' for part all_3_3_0.
+    break_projection(node, table_name, "proj_2", "all_3_3_0", "data")
+
+    # It will not yet appear in broken projections info.
+    assert "all_3_3_0" not in get_broken_projections_info(node, table_name)
+
+    insert(node, table_name, 20, 5)
+    insert(node, table_name, 25, 5)
+
+    # Part all_3_3_0 has 'proj' and 'proj_2' projections, but 'proj_2' is broken and the server does NOT know it yet.
+    # Parts all_4_4_0 and all_5_5_0 both have non-broken projections.
+    # So a merge will be created for the future part all_3_5_1.
+    # During the merge it will fail to read from 'proj_2' of part all_3_3_0 and proj_2 will be marked broken.
+    # The merge will be retried and on the second attempt it will succeed.
+    # The result part all_3_5_1 will have only 1 projection - 'proj', because
+    # it will skip 'proj_2' once it sees that one part no longer has it in the set of valid projections.
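+    # optimize() below is called with final=0 and no_wait=1, i.e. it runs
+    # "OPTIMIZE TABLE ... SETTINGS alter_sync=0" and returns without waiting;
+    # the sleep afterwards gives the background merge time to fail on the
+    # broken 'proj_2', retry and succeed.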
+ optimize(node, table_name, 0, 1) + time.sleep(5) + + # table_uuid=node.query(f"SELECT uuid FROM system.tables WHERE table='{table_name}' and database=currentDatabase()").strip() + # assert 0 < int( + # node.query( + # f""" + # SYSTEM FLUSH LOGS; + # SELECT count() FROM system.text_log + # WHERE level='Error' + # AND logger_name='MergeTreeBackgroundExecutor' + # AND message like 'Exception while executing background task %{table_uuid}:all_3_5_1%%Cannot open file%proj_2.proj/c.bin%' + # """) + # ) + + assert "all_3_3_0" in get_broken_projections_info(node, table_name) + check(node, table_name, 0) + + +def test_materialize_broken_projection(cluster): + node = cluster.instances["node"] + + table_name = "test2" + create_table(node, table_name, 1) + + insert(node, table_name, 0, 5) + insert(node, table_name, 5, 5) + insert(node, table_name, 10, 5) + insert(node, table_name, 15, 5) + + assert ["all_0_0_0", "all_1_1_0", "all_2_2_0", "all_3_3_0"] == get_parts( + node, table_name + ) + + break_projection(node, table_name, "proj", "all_1_1_0", "metadata") + reattach(node, table_name) + + assert "all_1_1_0\tproj\tNO_FILE_IN_DATA_PART" in get_broken_projections_info( + node, table_name + ) + assert "Part all_1_1_0 has a broken projection proj" in check_table_full( + node, table_name + ) + + break_projection(node, table_name, "proj_2", "all_1_1_0", "data") + reattach(node, table_name) + + assert "all_1_1_0\tproj_2\tFILE_DOESNT_EXIST" in get_broken_projections_info( + node, table_name + ) + assert "Part all_1_1_0 has a broken projection proj_2" in check_table_full( + node, table_name + ) + + materialize_projection(node, table_name, "proj") + + assert "has a broken projection" not in check_table_full(node, table_name) + + +def test_broken_ignored_replicated(cluster): + node = cluster.instances["node"] + + table_name = "test3" + table_name2 = "test3_replica" + create_table(node, table_name, 1) + + insert(node, table_name, 0, 5) + insert(node, table_name, 5, 5) + insert(node, table_name, 10, 5) + insert(node, table_name, 15, 5) + + check(node, table_name, 1) + + create_table(node, table_name2, 2, table_name) + check(node, table_name2, 1) + + break_projection(node, table_name, "proj", "all_0_0_0", "data") + assert "Part all_0_0_0 has a broken projection proj" in check_table_full( + node, table_name + ) + + break_part(node, table_name, "all_0_0_0") + node.query(f"SYSTEM SYNC REPLICA {table_name}") + assert "has a broken projection" not in check_table_full(node, table_name) + + +def test_broken_projections_in_backups(cluster): + node = cluster.instances["node"] + + table_name = "test4" + create_table(node, table_name, 1) + + insert(node, table_name, 0, 5) + insert(node, table_name, 5, 5) + insert(node, table_name, 10, 5) + insert(node, table_name, 15, 5) + + check(node, table_name, 1) + + break_projection(node, table_name, "proj", "all_2_2_0", "data") + check(node, table_name, 0, "proj", "FILE_DOESNT_EXIST") + + assert "all_2_2_0\tproj\tNO_FILE_IN_DATA_PART" in get_broken_projections_info( + node, table_name + ) + + assert "BACKUP_CREATED" in node.query( + f""" + set backup_restore_keeper_fault_injection_probability=0.0; + backup table {table_name} to Disk('backups', 'b1') settings check_projection_parts=false; + """ + ) + + assert "RESTORED" in node.query( + f""" + drop table {table_name} sync; + set backup_restore_keeper_fault_injection_probability=0.0; + restore table {table_name} from Disk('backups', 'b1'); + """ + ) + + check(node, table_name, 1) + + assert "" == get_broken_projections_info(node, 
table_name) + # TODO: add a check for what projections are loaded + + break_projection(node, table_name, "proj", "all_2_2_0", "part") + + check(node, table_name, 0, "proj", "ErrnoException") + + assert "all_2_2_0\tproj\tFILE_DOESNT_EXIST" == get_broken_projections_info( + node, table_name + ) + + assert "FILE_DOESNT_EXIST" in node.query_and_get_error( + f""" + set backup_restore_keeper_fault_injection_probability=0.0; + backup table {table_name} to Disk('backups', 'b2') + """ + ) + + materialize_projection(node, table_name, "proj") + check(node, table_name, 1) + # TODO: + # assert "all_2_2_0\tproj\tFILE_DOESNT_EXIST" == get_broken_projections_info(node, table_name) + + assert "BACKUP_CREATED" in node.query( + f""" + set backup_restore_keeper_fault_injection_probability=0.0; + backup table {table_name} to Disk('backups', 'b3') settings check_projection_parts=false; + """ + ) + + assert "RESTORED" in node.query( + f""" + drop table {table_name} sync; + set backup_restore_keeper_fault_injection_probability=0.0; + restore table {table_name} from Disk('backups', 'b3'); + """ + ) + check(node, table_name, 1) + + break_projection(node, table_name, "proj", "all_1_1_0", "part") + # TODO: check(node, table_name, 0, "proj", "FILE_DOESNT_EXIST") + assert "Part all_1_1_0 has a broken projection proj" in check_table_full( + node, table_name + ) + assert "all_1_1_0\tproj\tFILE_DOESNT_EXIST" == get_broken_projections_info( + node, table_name + ) + + assert "BACKUP_CREATED" in node.query( + f""" + set backup_restore_keeper_fault_injection_probability=0.0; + backup table {table_name} to Disk('backups', 'b4') settings check_projection_parts=false; + """ + ) + + assert "RESTORED" in node.query( + f""" + drop table {table_name} sync; + set backup_restore_keeper_fault_injection_probability=0.0; + restore table {table_name} from Disk('backups', 'b4'); + """ + ) + check(node, table_name, 1) + assert "" == get_broken_projections_info(node, table_name) diff --git a/tests/queries/0_stateless/02916_broken_projection.reference b/tests/queries/0_stateless/02916_broken_projection.reference deleted file mode 100644 index 3967215e5de..00000000000 --- a/tests/queries/0_stateless/02916_broken_projection.reference +++ /dev/null @@ -1,322 +0,0 @@ -insert new part -insert new part -insert new part -insert new part -select from projection 'proj' -12 -16 -used projections -SELECT c FROM test WHERE d == 12 OR d == 16 ORDER BY c; proj -select from projection 'proj_2' -12 -16 -used projections -SELECT d FROM test WHERE c == 12 OR c == 16 ORDER BY d; proj_2 -check table -1 -0 -broke metadata of part 'proj' (parent part: all_2_2_0) -select from projection 'proj' -12 -16 -used projections -SELECT c FROM test WHERE d == 12 OR d == 16 ORDER BY c; proj -select from projection 'proj_2' -12 -16 -used projections -SELECT d FROM test WHERE c == 12 OR c == 16 ORDER BY d; proj_2 -check table -0 -broken projections info -all_2_2_0 proj FILE_DOESNT_EXIST -check table full (test - all_2_2_0) -all_2_2_0 -0 -broke data of part 'proj_2' (parent part: all_2_2_0) -broken projections info -all_2_2_0 proj FILE_DOESNT_EXIST -select from projection 'proj' -12 -16 -used projections -SELECT c FROM test WHERE d == 12 OR d == 16 ORDER BY c; proj -select from projection 'proj_2', expect error: proj_2 -FILE_DOESNT_EXIST -check table -0 -broken projections info -all_2_2_0 proj FILE_DOESNT_EXIST -all_2_2_0 proj_2 NO_FILE_IN_DATA_PART -select from projection 'proj' -12 -16 -used projections -SELECT c FROM test WHERE d == 12 OR d == 16 ORDER BY c; proj -select from 
projection 'proj_2' -12 -16 -used projections -SELECT d FROM test WHERE c == 12 OR c == 16 ORDER BY d; proj_2 -check table -0 -0 -broke data of part 'proj_2' (parent part: all_3_3_0) -broken projections info -all_2_2_0 proj FILE_DOESNT_EXIST -all_2_2_0 proj_2 NO_FILE_IN_DATA_PART -insert new part -insert new part -optimize -OPTIMIZE TABLE test SETTINGS alter_sync=0 -0 -broken projections info -all_2_2_0 proj FILE_DOESNT_EXIST -all_2_2_0 proj_2 NO_FILE_IN_DATA_PART -all_3_3_0 proj_2 NO_FILE_IN_DATA_PART -select from projection 'proj' -12 -16 -used projections -SELECT c FROM test WHERE d == 12 OR d == 16 ORDER BY c; proj -select from projection 'proj_2' -12 -16 -used projections -SELECT d FROM test WHERE c == 12 OR c == 16 ORDER BY d; proj_2 -check table -0 -0 -broke metadata of part 'proj' (parent part: all_1_1_0) -Detach - Attach -broken projections info -all_1_1_0 proj NO_FILE_IN_DATA_PART -all_2_2_0 proj NO_FILE_IN_DATA_PART -all_2_2_0 proj_2 FILE_DOESNT_EXIST -all_3_3_0 proj_2 FILE_DOESNT_EXIST -0 -broke data of part 'proj_2' (parent part: all_1_1_0) -Detach - Attach -broken projections info -all_1_1_0 proj NO_FILE_IN_DATA_PART -all_1_1_0 proj_2 FILE_DOESNT_EXIST -all_2_2_0 proj NO_FILE_IN_DATA_PART -all_2_2_0 proj_2 FILE_DOESNT_EXIST -all_3_3_0 proj_2 FILE_DOESNT_EXIST -select from projection 'proj' -12 -16 -used projections -SELECT c FROM test WHERE d == 12 OR d == 16 ORDER BY c; proj -select from projection 'proj_2' -12 -16 -used projections -SELECT d FROM test WHERE c == 12 OR c == 16 ORDER BY d; proj_2 -check table -0 -check table full (test - all_1_1_0) -all_1_1_0 -materialize projection proj -check table full (test - ) -select from projection 'proj' -12 -16 -used projections -SELECT c FROM test WHERE d == 12 OR d == 16 ORDER BY c; proj -select from projection 'proj_2' -12 -16 -used projections -SELECT d FROM test WHERE c == 12 OR c == 16 ORDER BY d; proj_2 -check table -1 -materialize projection proj_2 -check table full (test - ) -0 -broke data of part 'proj' (parent part: all_3_5_1_7) -insert new part -optimize -OPTIMIZE TABLE test FINAL -insert new part -optimize -OPTIMIZE TABLE test FINAL -select from projection 'proj' -12 -16 -used projections -SELECT c FROM test WHERE d == 12 OR d == 16 ORDER BY c; proj_2 -select from projection 'proj_2' -12 -16 -used projections -SELECT d FROM test WHERE c == 12 OR c == 16 ORDER BY d; proj_2 -check table -1 -insert new part -insert new part -insert new part -insert new part -select from projection 'proj' -used projections -SELECT c FROM test2 WHERE d == 12 OR d == 16 ORDER BY c; proj -select from projection 'proj_2' -used projections -SELECT d FROM test2 WHERE c == 12 OR c == 16 ORDER BY d; proj_2 -check table -1 -select from projection 'proj' -used projections -SELECT c FROM test2_replica WHERE d == 12 OR d == 16 ORDER BY c; proj -select from projection 'proj_2' -used projections -SELECT d FROM test2_replica WHERE c == 12 OR c == 16 ORDER BY d; proj_2 -check table -1 -0 -broke data of part 'proj' (parent part: all_0_0_0) -check table full (test2 - all_0_0_0) -all_0_0_0 -select from projection 'proj' -used projections -SELECT c FROM test2 WHERE d == 12 OR d == 16 ORDER BY c; proj -select from projection 'proj_2' -used projections -SELECT d FROM test2 WHERE c == 12 OR c == 16 ORDER BY d; proj_2 -check table -0 -broke data of part 'all_0_0_0' -check table full (test2 - all_0_0_0) -all_0_0_0 -select from projection 'proj' -used projections -SELECT c FROM test2 WHERE d == 12 OR d == 16 ORDER BY c; proj -select from projection 'proj_2' -used 
projections -SELECT d FROM test2 WHERE c == 12 OR c == 16 ORDER BY d; proj_2 -check table -1 -select from projection 'proj' -used projections -SELECT c FROM test2 WHERE d == 12 OR d == 16 ORDER BY c; proj -select from projection 'proj_2' -used projections -SELECT d FROM test2 WHERE c == 12 OR c == 16 ORDER BY d; proj_2 -check table -1 -insert new part -insert new part -insert new part -insert new part -select from projection 'proj' -12 -16 -used projections -SELECT c FROM test WHERE d == 12 OR d == 16 ORDER BY c; proj -select from projection 'proj_2' -12 -16 -used projections -SELECT d FROM test WHERE c == 12 OR c == 16 ORDER BY d; proj_2 -check table -1 -0 -broke data of part 'proj' (parent part: all_2_2_0) -select from projection 'proj', expect error: proj -FILE_DOESNT_EXIST -select from projection 'proj_2' -12 -16 -used projections -SELECT d FROM test WHERE c == 12 OR c == 16 ORDER BY d; proj_2 -check table -0 -broken projections info -all_2_2_0 proj NO_FILE_IN_DATA_PART -BACKUP_CREATED -RESTORED -select from projection 'proj' -12 -16 -used projections -SELECT c FROM test WHERE d == 12 OR d == 16 ORDER BY c; proj -select from projection 'proj_2' -12 -16 -used projections -SELECT d FROM test WHERE c == 12 OR c == 16 ORDER BY d; proj_2 -check table -1 -broken projections info -0 -broke all data of part 'proj' (parent part: all_2_2_0) -select from projection 'proj', expect error: proj -Errno -Errno -select from projection 'proj_2' -12 -16 -used projections -SELECT d FROM test WHERE c == 12 OR c == 16 ORDER BY d; proj_2 -check table -0 -broken projections info -all_2_2_0 proj FILE_DOESNT_EXIST -FILE_DOESNT_EXIST -materialize projection proj -select from projection 'proj' -12 -16 -used projections -SELECT c FROM test WHERE d == 12 OR d == 16 ORDER BY c; proj -select from projection 'proj_2' -12 -16 -used projections -SELECT d FROM test WHERE c == 12 OR c == 16 ORDER BY d; proj_2 -check table -1 -broken projections info -all_2_2_0 proj FILE_DOESNT_EXIST -BACKUP_CREATED -RESTORED -select from projection 'proj' -12 -16 -used projections -SELECT c FROM test WHERE d == 12 OR d == 16 ORDER BY c; proj -select from projection 'proj_2' -12 -16 -used projections -SELECT d FROM test WHERE c == 12 OR c == 16 ORDER BY d; proj_2 -check table -1 -0 -broke all data of part 'proj' (parent part: all_1_1_0) -select from projection 'proj', expect error: proj -select from projection 'proj_2' -12 -16 -used projections -SELECT d FROM test WHERE c == 12 OR c == 16 ORDER BY d; proj_2 -check table -0 -broken projections info -all_1_1_0 proj FILE_DOESNT_EXIST -BACKUP_CREATED -RESTORED -select from projection 'proj' -12 -16 -used projections -SELECT c FROM test WHERE d == 12 OR d == 16 ORDER BY c; proj -select from projection 'proj_2' -12 -16 -used projections -SELECT d FROM test WHERE c == 12 OR c == 16 ORDER BY d; proj_2 -check table -0 -broken projections info -all_1_1_0 proj NO_FILE_IN_DATA_PART diff --git a/tests/queries/0_stateless/02916_broken_projection.sh b/tests/queries/0_stateless/02916_broken_projection.sh deleted file mode 100755 index fbd26e59f6f..00000000000 --- a/tests/queries/0_stateless/02916_broken_projection.sh +++ /dev/null @@ -1,515 +0,0 @@ -#!/usr/bin/env bash -# Tags: long, no-random-merge-tree-settings, no-random-settings, no-s3-storage, no-parallel -# shellcheck disable=SC2046 - -CURDIR=$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd) -# shellcheck source=../shell_config.sh -. 
"$CURDIR"/../shell_config.sh - -function create_table() -{ - test_id=$1 - name=$2 - replica=$3 - $CLICKHOUSE_CLIENT -nm -q " - DROP TABLE IF EXISTS $name SYNC; - CREATE TABLE $name - ( - a String, - b String, - c Int64, - d Int64, - e Int64, - - PROJECTION proj - ( - SELECT c ORDER BY d - ), - PROJECTION proj_2 - ( - SELECT d ORDER BY c - ) - ) - ENGINE = ReplicatedMergeTree('/test_broken_projection_32_$test_id/$CLICKHOUSE_TEST_ZOOKEEPER_PREFIX/', '$replica') ORDER BY a - SETTINGS min_bytes_for_wide_part = 0, - max_parts_to_merge_at_once=3, - enable_vertical_merge_algorithm=1, - vertical_merge_algorithm_min_rows_to_activate = 1, - vertical_merge_algorithm_min_columns_to_activate = 1, - vertical_merge_algorithm_min_columns_to_activate = 1, - compress_primary_key=0; - " -} - -function random() -{ - cat /dev/urandom | LC_ALL=C tr -dc 'a-zA-Z' | fold -w ${1:-8} | head -n 1 -} - -function insert() -{ - table=$1 - offset=$2 - size=$3 - echo 'insert new part' - $CLICKHOUSE_CLIENT -q "INSERT INTO $table SELECT number, number, number, number, number%2 FROM numbers($offset, $size) SETTINGS insert_keeper_fault_injection_probability=0.0;" -} - -function break_projection() -{ - table=$1 - part_name=$2 - parent_name=$3 - break_type=$4 - - read -r part_path <<< $($CLICKHOUSE_CLIENT -nm -q " - SELECT path - FROM system.projection_parts - WHERE table='$table' - AND database=currentDatabase() - AND active=1 - AND part_name='$part_name' - AND parent_name='$parent_name' - ORDER BY modification_time DESC - LIMIT 1; - ") - - $CLICKHOUSE_CLIENT -q "select throwIf(substring('$part_path', 1, 1) != '/', 'Path is relative: $part_path')" || exit - - if [ "$break_type" = "data" ] - then - rm "$part_path/d.bin" - rm "$part_path/c.bin" - echo "broke data of part '$part_name' (parent part: $parent_name)" - fi - if [ "$break_type" = "metadata" ] - then - rm "$part_path/columns.txt" - echo "broke metadata of part '$part_name' (parent part: $parent_name)" - fi - if [ "$break_type" = "part" ] - then - rm -r "$part_path" - echo "broke all data of part '$part_name' (parent part: $parent_name)" - fi -} - -function break_part() -{ - table=$1 - part_name=$2 - - read -r part_path <<< $($CLICKHOUSE_CLIENT -nm -q " - SELECT path - FROM system.parts - WHERE table='$table' - AND database=currentDatabase() - AND active=1 - AND part_name='$part_name' - ORDER BY modification_time DESC - LIMIT 1; - ") - - if [ "$part_path" = "" ] - then - echo "Part path is empty" - exit - fi - - rm $part_path/columns.txt - echo "broke data of part '$part_name'" -} - -function broken_projections_info() -{ - table=$1 - echo 'broken projections info' - $CLICKHOUSE_CLIENT -q " - SELECT parent_name, name, errors.name FROM - ( - SELECT parent_name, name, exception_code - FROM system.projection_parts - WHERE table='$table' - AND database=currentDatabase() - AND is_broken = 1 - ) AS parts_info - INNER JOIN system.errors AS errors - ON parts_info.exception_code = errors.code - ORDER BY parent_name, name -" -} - -function check() -{ - table=$1 - expect_broken_part="" - expected_error="" - if [ $# -gt 1 ]; then - expect_broken_part=$2 - expected_error=$3 - fi - - #echo 'system.parts' - #$CLICKHOUSE_CLIENT -q " - #SELECT name, active, projections - #FROM system.parts - #WHERE table='$table' AND database=currentDatabase() - #ORDER BY name;" - - query_id=$(random 8) - - if [ "$expect_broken_part" = "proj" ] - then - echo "select from projection 'proj', expect error: $expect_broken_part" - $CLICKHOUSE_CLIENT --optimize_use_projections 1 --query_id $query_id -nm -q " - 
SET send_logs_level='fatal'; - SELECT c FROM $table WHERE d == 12 ORDER BY c; - " 2>&1 | grep -oF "$expected_error" - else - echo "select from projection 'proj'" - $CLICKHOUSE_CLIENT --optimize_use_projections 1 --query_id $query_id -q "SELECT c FROM $table WHERE d == 12 OR d == 16 ORDER BY c;" - echo 'used projections' - $CLICKHOUSE_CLIENT -nm -q " - SYSTEM FLUSH LOGS; - SELECT query, splitByChar('.', arrayJoin(projections))[-1] FROM system.query_log WHERE current_database=currentDatabase() AND query_id='$query_id' AND type='QueryFinish' - " - fi - - query_id=$(random 8) - - if [ "$expect_broken_part" = "proj_2" ] - then - echo "select from projection 'proj_2', expect error: $expect_broken_part" - $CLICKHOUSE_CLIENT --optimize_use_projections 1 --query_id $query_id -nm -q " - SET send_logs_level='fatal'; - SELECT d FROM $table WHERE c == 12 ORDER BY d; - " 2>&1 | grep -oF "$expected_error" - else - echo "select from projection 'proj_2'" - $CLICKHOUSE_CLIENT --optimize_use_projections 1 --query_id $query_id -q "SELECT d FROM $table WHERE c == 12 OR c == 16 ORDER BY d;" - echo 'used projections' - $CLICKHOUSE_CLIENT -nm -q " - SYSTEM FLUSH LOGS; - SELECT query, splitByChar('.', arrayJoin(projections))[-1] FROM system.query_log WHERE current_database=currentDatabase() AND query_id='$query_id' AND type='QueryFinish' - " - fi - - echo 'check table' - $CLICKHOUSE_CLIENT -nm -q " - SET send_logs_level='fatal'; - CHECK TABLE $table;" -} - -function optimize() -{ - final=$1 - no_wait=$2 - - echo 'optimize' - query="OPTIMIZE TABLE test" - - if [ $final -eq 1 ]; then - query="$query FINAL" - fi - if [ $no_wait -eq 1 ]; then - query="$query SETTINGS alter_sync=0" - fi - - echo $query - - $CLICKHOUSE_CLIENT -q "$query" -} - -function reattach() -{ - echo 'Detach - Attach' - $CLICKHOUSE_CLIENT -nm -q " - SET send_logs_level='fatal'; - DETACH TABLE test; - ATTACH TABLE test; - " -} - -function materialize_projection -{ - table=$1 - projection=$2 - echo "materialize projection $projection" - $CLICKHOUSE_CLIENT -q "ALTER TABLE $table MATERIALIZE PROJECTION $projection SETTINGS mutations_sync=2" -} - -function check_table_full() -{ - table=$1 - expect_broken_part=$2 - echo "check table full ($1 - $2)" - if [ "$expect_broken_part" = "" ] - then - $CLICKHOUSE_CLIENT -nm -q " - SET send_logs_level='fatal'; - CHECK TABLE $table SETTINGS check_query_single_value_result = 0; - " | grep "broken" - else - $CLICKHOUSE_CLIENT -nm -q " - SET send_logs_level='fatal'; - CHECK TABLE $table SETTINGS check_query_single_value_result = 0; - " | grep "broken" | grep -o $expect_broken_part | head -n 1 - fi -} - -function test1() -{ - create_table test1 test 1 - - table_uuid=$($CLICKHOUSE_CLIENT -q "SELECT uuid FROM system.tables WHERE table='test' and database=currentDatabase()") - - insert test 0 5 - - insert test 5 5 - - insert test 10 5 - - insert test 15 5 - - check test - - # Break metadata file of projection 'proj' - break_projection test proj all_2_2_0 metadata - - # Do select and after "check table" query. - # Select works because it does not read columns.txt. - check test - - # Projection 'proj' from part all_2_2_0 will now appear in broken parts info - # because it was marked broken during "check table" query. - # TODO: try to mark it during select as well - broken_projections_info test - - # Check table query will also show a list of parts which have broken projections. 
- check_table_full test "all_2_2_0" - - # Break data file of projection 'proj_2' for part all_2_2_0 - break_projection test proj_2 all_2_2_0 data - - # It will not yet appear in broken projections info. - broken_projections_info test - - # Select now fails with error "File doesn't exist" - check test "proj_2" FILE_DOESNT_EXIST - - # Projection 'proj_2' from part all_2_2_0 will now appear in broken parts info. - broken_projections_info test - - # Second select works, because projection is now marked as broken. - check test - - # Break data file of projection 'proj_2' for part all_3_3_0 - break_projection test proj_2 all_3_3_0 data - - # It will not yet appear in broken projections info. - broken_projections_info test - - insert test 20 5 - - insert test 25 5 - - # Part all_3_3_0 has 'proj' and 'proj_2' projections, but 'proj_2' is broken and server does NOT know it yet. - # Parts all_4_4_0 and all_5_5_0 have both non-broken projections. - # So a merge will be create for future part all_3_5_1. - # During merge it will fail to read from 'proj_2' of part all_3_3_0 and proj_2 will be marked broken. - # Merge will be retried and on second attempt it will succeed. - # The result part all_3_5_1 will have only 1 projection - 'proj', because - # it will skip 'proj_2' as it will see that one part does not have it anymore in the set of valid projections. - optimize 0 1 - sleep 2 - - $CLICKHOUSE_CLIENT -nm -q " - SYSTEM FLUSH LOGS; - SELECT count() FROM system.text_log - WHERE level='Error' - AND logger_name='MergeTreeBackgroundExecutor' - AND message like 'Exception while executing background task {$table_uuid:all_3_5_1}%Cannot open file%proj_2.proj/c.bin%' - " - - # Projection 'proj_2' from part all_2_2_0 will now appear in broken parts info. - broken_projections_info test - - check test - - break_projection test proj all_1_1_0 metadata - - reattach - - broken_projections_info test - - break_projection test proj_2 all_1_1_0 data - - reattach - - broken_projections_info test - - check test - - check_table_full test all_1_1_0 - - materialize_projection test proj - - check_table_full test - - check test - - materialize_projection test proj_2 - - check_table_full test - - break_projection test proj all_3_5_1_7 data - - insert test 30 5 - - optimize 1 0 - - insert test 35 5 - - optimize 1 0 - - check test -} - -function test2() -{ - create_table test2 test2 1 - - insert test2 0 5 - - insert test2 5 5 - - insert test 10 5 - - insert test 15 5 - - check test2 - - create_table test2 test2_replica 2 - - check test2_replica - - break_projection test2 proj all_0_0_0 data - - check_table_full test2 all_0_0_0 - - check test2 - - break_part test2 all_0_0_0 - - check_table_full test2 all_0_0_0 - - check test2 - - $CLICKHOUSE_CLIENT -q "SYSTEM SYNC REPLICA test2;" - - check test2 -} - -function test3() -{ - create_table test3 test 1 - - insert test 0 5 - - insert test 5 5 - - insert test 10 5 - - insert test 15 5 - - check test - - break_projection test proj all_2_2_0 data - - check test proj FILE_DOESNT_EXIST - - broken_projections_info test - - ${CLICKHOUSE_CLIENT} -nm --query " - set backup_restore_keeper_fault_injection_probability=0.0; - backup table ${CLICKHOUSE_DATABASE}.test to Disk('backups', '${CLICKHOUSE_TEST_UNIQUE_NAME}') settings check_projection_parts=false; - " | grep -o "BACKUP_CREATED" - - ${CLICKHOUSE_CLIENT} -nm --stacktrace --query " - drop table test sync; - set backup_restore_keeper_fault_injection_probability=0.0; - restore table ${CLICKHOUSE_DATABASE}.test from Disk('backups', 
'${CLICKHOUSE_TEST_UNIQUE_NAME}'); - " | grep -o "RESTORED" - - check test - - broken_projections_info test - - break_projection test proj all_2_2_0 part - - check test proj Errno - - broken_projections_info test - - ${CLICKHOUSE_CLIENT} -nm --query " - set send_logs_level='fatal'; - set backup_restore_keeper_fault_injection_probability=0.0; - backup table ${CLICKHOUSE_DATABASE}.test to Disk('backups', '${CLICKHOUSE_TEST_UNIQUE_NAME}_2') - " 2>&1 | grep -o "FILE_DOESNT_EXIST" - - materialize_projection test proj - - check test - - broken_projections_info test - - ${CLICKHOUSE_CLIENT} -nm --query " - set send_logs_level='fatal'; - set backup_restore_keeper_fault_injection_probability=0.0; - backup table ${CLICKHOUSE_DATABASE}.test to Disk('backups', '${CLICKHOUSE_TEST_UNIQUE_NAME}_3') - " | grep -o "BACKUP_CREATED" - - ${CLICKHOUSE_CLIENT} -nm --stacktrace --query " - drop table test sync; - set send_logs_level='fatal'; - set backup_restore_keeper_fault_injection_probability=0.0; - restore table ${CLICKHOUSE_DATABASE}.test from Disk('backups', '${CLICKHOUSE_TEST_UNIQUE_NAME}_3'); - " | grep -o "RESTORED" - - check test - - break_projection test proj all_1_1_0 part - - check test proj FILE_DOESNT_EXIST - - broken_projections_info test - - ${CLICKHOUSE_CLIENT} -nm --query " - set send_logs_level='fatal'; - set backup_restore_keeper_fault_injection_probability=0.0; - backup table ${CLICKHOUSE_DATABASE}.test to Disk('backups', '${CLICKHOUSE_TEST_UNIQUE_NAME}_4') - settings check_projection_parts=false, allow_backup_broken_projections=true; - " | grep -o "BACKUP_CREATED" - - ${CLICKHOUSE_CLIENT} -nm --stacktrace --query " - drop table test sync; - set send_logs_level='fatal'; - set backup_restore_keeper_fault_injection_probability=0.0; - restore table ${CLICKHOUSE_DATABASE}.test from Disk('backups', '${CLICKHOUSE_TEST_UNIQUE_NAME}_4'); - " | grep -o "RESTORED" - - check test - - broken_projections_info test -} - -test1 -test2 -test3 - - -$CLICKHOUSE_CLIENT -nm -q " -DROP TABLE IF EXISTS test SYNC; -DROP TABLE IF EXISTS test2 SYNC; -DROP TABLE IF EXISTS test2_replica SYNC; -" From 216769f43ea536a38b9e7d5650cdd02fae972caf Mon Sep 17 00:00:00 2001 From: robot-clickhouse Date: Tue, 16 Jan 2024 18:55:23 +0000 Subject: [PATCH 059/245] Automatic style fix --- tests/integration/test_broken_projections/test.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/tests/integration/test_broken_projections/test.py b/tests/integration/test_broken_projections/test.py index ca1a29817a5..cc3e55402b3 100644 --- a/tests/integration/test_broken_projections/test.py +++ b/tests/integration/test_broken_projections/test.py @@ -28,7 +28,7 @@ def cluster(): cluster.shutdown() -def create_table(node, table, replica, data_prefix = ""): +def create_table(node, table, replica, data_prefix=""): if data_prefix == "": data_prefix = table From dea8b10ae972b4fc2b20dbf90d90e362bf7e4207 Mon Sep 17 00:00:00 2001 From: kssenii Date: Wed, 17 Jan 2024 15:39:57 +0100 Subject: [PATCH 060/245] Fix test --- tests/integration/test_broken_projections/test.py | 11 +++-------- 1 file changed, 3 insertions(+), 8 deletions(-) diff --git a/tests/integration/test_broken_projections/test.py b/tests/integration/test_broken_projections/test.py index ca1a29817a5..90d82f9f010 100644 --- a/tests/integration/test_broken_projections/test.py +++ b/tests/integration/test_broken_projections/test.py @@ -425,9 +425,7 @@ def test_broken_projections_in_backups(cluster): ) check(node, table_name, 1) - assert "" == get_broken_projections_info(node, 
table_name)
-
-    # TODO: add a check for what projections are loaded
 
     break_projection(node, table_name, "proj", "all_2_2_0", "part")
 
@@ -446,8 +444,6 @@ def test_broken_projections_in_backups(cluster):
 
     materialize_projection(node, table_name, "proj")
     check(node, table_name, 1)
-    # TODO:
-    # assert "all_2_2_0\tproj\tFILE_DOESNT_EXIST" == get_broken_projections_info(node, table_name)
 
     assert "BACKUP_CREATED" in node.query(
         f"""
@@ -466,7 +462,6 @@ def test_broken_projections_in_backups(cluster):
     check(node, table_name, 1)
 
     break_projection(node, table_name, "proj", "all_1_1_0", "part")
-    # TODO: check(node, table_name, 0, "proj", "FILE_DOESNT_EXIST")
     assert "Part all_1_1_0 has a broken projection proj" in check_table_full(
         node, table_name
     )
@@ -477,7 +472,7 @@ def test_broken_projections_in_backups(cluster):
     assert "BACKUP_CREATED" in node.query(
         f"""
         set backup_restore_keeper_fault_injection_probability=0.0;
-        backup table {table_name} to Disk('backups', 'b4') settings check_projection_parts=false;
+        backup table {table_name} to Disk('backups', 'b4') settings check_projection_parts=false, allow_backup_broken_projections=true;
         """
     )
 
@@ -488,5 +483,5 @@ def test_broken_projections_in_backups(cluster):
         restore table {table_name} from Disk('backups', 'b4');
         """
     )
-    check(node, table_name, 1)
-    assert "" == get_broken_projections_info(node, table_name)
+    check(node, table_name, 0)
+    assert "all_1_1_0\tproj\tNO_FILE_IN_DATA_PART" == get_broken_projections_info(node, table_name)

From 1e9de73bf57de1eb66007cba0fecb9f0459c973e Mon Sep 17 00:00:00 2001
From: kssenii
Date: Wed, 17 Jan 2024 15:40:46 +0100
Subject: [PATCH 061/245] Fix style check

---
 tests/integration/test_broken_projections/test.py | 4 +++-
 1 file changed, 3 insertions(+), 1 deletion(-)

diff --git a/tests/integration/test_broken_projections/test.py b/tests/integration/test_broken_projections/test.py
index eb8c452fff0..1b192e0df24 100644
--- a/tests/integration/test_broken_projections/test.py
+++ b/tests/integration/test_broken_projections/test.py
@@ -484,4 +484,6 @@ def test_broken_projections_in_backups(cluster):
         """
     )
     check(node, table_name, 0)
-    assert "all_1_1_0\tproj\tNO_FILE_IN_DATA_PART" == get_broken_projections_info(node, table_name)
+    assert "all_1_1_0\tproj\tNO_FILE_IN_DATA_PART" == get_broken_projections_info(
+        node, table_name
+    )

From 8ac04c6dd8a945e0f189aae572c54ee4458f75dd Mon Sep 17 00:00:00 2001
From: avogar
Date: Thu, 18 Jan 2024 15:15:57 +0000
Subject: [PATCH 062/245] Address comments

---
 src/DataTypes/DataTypeVariant.cpp             |  9 ++++
 .../Serializations/SerializationVariant.cpp   | 48 +++++++------------
 .../Serializations/SerializationVariant.h     | 23 +++++++++
 .../SerializationVariantElement.cpp           | 14 +++---
 4 files changed, 57 insertions(+), 37 deletions(-)

diff --git a/src/DataTypes/DataTypeVariant.cpp b/src/DataTypes/DataTypeVariant.cpp
index 2bc4dfa5a7a..e0510373960 100644
--- a/src/DataTypes/DataTypeVariant.cpp
+++ b/src/DataTypes/DataTypeVariant.cpp
@@ -105,6 +105,15 @@ ColumnPtr DataTypeVariant::createColumnConst(size_t size, const DB::Field & fiel
     }
     else
     {
+        /// We don't have an exact mapping Field type -> Data type, so we cannot
+        /// always know in which variant we need to insert the field by its type.
+        /// Examples:
+        /// Field(42) and Variant(UInt16, String). Type of the Field - UInt64, but we can insert it in UInt16
+        /// Field(42) and Variant(Date, String). Type of the Field - UInt64, but we can insert it in Date
+
+        /// Let's first apply the FieldToDataType visitor to find the best Data type for this field.
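+        /// (e.g. for Field(42) the visitor returns UInt64, as in the examples above).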
+ /// If we have variant with such type, we will insert this field into it. + /// Otherwise we will try to find the first variant that has default Field value with the same type. auto field_type = applyVisitor(FieldToDataType(), field); auto discr = tryGetVariantDiscriminator(field_type); if (!discr) diff --git a/src/DataTypes/Serializations/SerializationVariant.cpp b/src/DataTypes/Serializations/SerializationVariant.cpp index 64fcb63d604..78ec0a5e2da 100644 --- a/src/DataTypes/Serializations/SerializationVariant.cpp +++ b/src/DataTypes/Serializations/SerializationVariant.cpp @@ -200,19 +200,12 @@ void SerializationVariant::serializeBinaryBulkWithMultipleStreams( for (size_t i = 0; i != limit; ++i) writeBinaryLittleEndian(non_empty_global_discr, *discriminators_stream); - /// Second, serialize variants in global order. + /// Second, serialize non-empty variant (other variants are empty and we can skip their serialization). settings.path.push_back(Substream::VariantElements); - for (size_t i = 0; i != variants.size(); ++i) - { - addVariantElementToPath(settings.path, i); - /// For non empty variant use the same offset/limit as for whole Variant column - if (i == non_empty_global_discr) - variants[i]->serializeBinaryBulkWithMultipleStreams(col.getVariantByGlobalDiscriminator(i), offset, limit, settings, variant_state->states[i]); - /// For empty variants, use just 0/0, they won't serialize anything. - else - variants[i]->serializeBinaryBulkWithMultipleStreams(col.getVariantByGlobalDiscriminator(i), 0, 0, settings, variant_state->states[i]); - settings.path.pop_back(); - } + addVariantElementToPath(settings.path, non_empty_global_discr); + /// We can use the same offset/limit as for whole Variant column + variants[non_empty_global_discr]->serializeBinaryBulkWithMultipleStreams(col.getVariantByGlobalDiscriminator(non_empty_global_discr), offset, limit, settings, variant_state->states[non_empty_global_discr]); + settings.path.pop_back(); settings.path.pop_back(); return; } @@ -237,26 +230,22 @@ void SerializationVariant::serializeBinaryBulkWithMultipleStreams( } } - /// If limit for some variant is 0, it means that we don't have its discriminator in the range. - /// Set offset to the size of column for such variants, so we won't serialize values from them. - for (size_t i = 0; i != variant_offsets_and_limits.size(); ++i) - { - if (!variant_offsets_and_limits[i].second) - variant_offsets_and_limits[i].first = col.getVariantByGlobalDiscriminator(i).size(); - } - /// Serialize variants in global order. settings.path.push_back(Substream::VariantElements); for (size_t i = 0; i != variants.size(); ++i) { - addVariantElementToPath(settings.path, i); - variants[i]->serializeBinaryBulkWithMultipleStreams( - col.getVariantByGlobalDiscriminator(i), - variant_offsets_and_limits[i].first, - variant_offsets_and_limits[i].second, - settings, - variant_state->states[i]); - settings.path.pop_back(); + /// Serialize variant only if we have its discriminator in the range. 
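+        /// (a limit of 0 for variant i means that its discriminator does not occur in the serialized range).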
+        if (variant_offsets_and_limits[i].second)
+        {
+            addVariantElementToPath(settings.path, i);
+            variants[i]->serializeBinaryBulkWithMultipleStreams(
+                col.getVariantByGlobalDiscriminator(i),
+                variant_offsets_and_limits[i].first,
+                variant_offsets_and_limits[i].second,
+                settings,
+                variant_state->states[i]);
+            settings.path.pop_back();
+        }
     }
     settings.path.pop_back();
 }
diff --git a/src/DataTypes/Serializations/SerializationVariant.h b/src/DataTypes/Serializations/SerializationVariant.h
index b6bee94c65f..3f53dcf1339 100644
--- a/src/DataTypes/Serializations/SerializationVariant.h
+++ b/src/DataTypes/Serializations/SerializationVariant.h
@@ -6,6 +6,29 @@
 namespace DB
 {
 
+/// Class for serializing/deserializing a column with Variant type.
+/// It supports both text and binary bulk serializations/deserializations.
+///
+/// During text serialization it checks the discriminator of the current row and
+/// uses the corresponding text serialization of this variant.
+///
+/// During text deserialization it tries the deserialization of each variant
+/// (using the tryDeserializeText* methods of ISerialization) in a predefined order
+/// and inserts the data into the first variant for which deserialization succeeds.
+///
+/// During binary bulk serialization it transforms local discriminators
+/// to global ones and serializes them into a separate stream VariantDiscriminators.
+/// Each variant is serialized into a separate stream with path VariantElements/VariantElement
+/// (the VariantElements stream is needed for correct sub-columns creation). We store and serialize
+/// variants in a sparse form (the size of a variant column equals the number of rows with its
+/// discriminator in the discriminators column), so during deserialization the limit for each
+/// variant is calculated according to the discriminators column.
+/// The offsets column is not serialized and is stored only in memory.
+///
+/// During binary bulk deserialization we first deserialize discriminators from the corresponding
+/// stream and use them to calculate the limit for each variant. Each variant is deserialized from
+/// the corresponding stream using the calculated limit. The offsets column is not deserialized;
+/// it is constructed according to the discriminators.
 class SerializationVariant : public ISerialization
 {
 public:
diff --git a/src/DataTypes/Serializations/SerializationVariantElement.cpp b/src/DataTypes/Serializations/SerializationVariantElement.cpp
index e06a20d2990..be91e0ba2ee 100644
--- a/src/DataTypes/Serializations/SerializationVariantElement.cpp
+++ b/src/DataTypes/Serializations/SerializationVariantElement.cpp
@@ -149,19 +149,21 @@ void SerializationVariantElement::deserializeBinaryBulkWithMultipleStreams(
             assert_cast<ColumnLowCardinality &>(*variant_element_state->variant->assumeMutable()).nestedRemoveNullable();
     }
 
+    /// If there is nothing to deserialize, just insert defaults.
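+    /// (variant_limit == 0 means that no discriminator in the deserialized range
+    /// belongs to this variant, so all requested rows are defaults for this sub-column).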
+    if (variant_limit == 0)
+    {
+        mutable_column->insertManyDefaults(limit);
+        return;
+    }
+
     addVariantToPath(settings.path);
     nested_serialization->deserializeBinaryBulkWithMultipleStreams(variant_element_state->variant, variant_limit, settings, variant_element_state->variant_element_state, cache);
     removeVariantFromPath(settings.path);
 
     size_t variant_offset = variant_element_state->variant->size() - variant_limit;
 
-    /// If don't have our discriminator in range, just insert defaults.
-    if (variant_limit == 0)
-    {
-        mutable_column->insertManyDefaults(limit);
-    }
     /// If we have only our discriminator in range, insert the whole range to result column.
-    else if (variant_limit == limit)
+    if (variant_limit == limit)
     {
         mutable_column->insertRangeFrom(*variant_element_state->variant, variant_offset, variant_limit);
     }

From f202d713711857c083de5aaba1198198d2eaa3a4 Mon Sep 17 00:00:00 2001
From: avogar
Date: Thu, 18 Jan 2024 17:29:36 +0000
Subject: [PATCH 063/245] Make alter with variant expansion a no-op, add tests for alters

---
 src/DataTypes/DataTypeVariant.cpp             |  22 ++
 src/DataTypes/DataTypeVariant.h               |   4 +
 .../Serializations/SerializationVariant.cpp   |   2 +-
 .../SerializationVariantElement.cpp           |  16 +-
 src/Storages/MergeTree/MutateTask.cpp         |  20 +-
 .../02941_variant_type_alters.reference       | 330 ++++++++++++++++++
 .../0_stateless/02941_variant_type_alters.sh  |  61 ++++
 7 files changed, 452 insertions(+), 3 deletions(-)
 create mode 100644 tests/queries/0_stateless/02941_variant_type_alters.reference
 create mode 100755 tests/queries/0_stateless/02941_variant_type_alters.sh

diff --git a/src/DataTypes/DataTypeVariant.cpp b/src/DataTypes/DataTypeVariant.cpp
index e0510373960..3a39fdf9ea8 100644
--- a/src/DataTypes/DataTypeVariant.cpp
+++ b/src/DataTypes/DataTypeVariant.cpp
@@ -230,6 +230,28 @@ static DataTypePtr create(const ASTPtr & arguments)
     return std::make_shared<DataTypeVariant>(nested_types);
 }
 
+bool isVariantExtension(const DataTypePtr & from_type, const DataTypePtr & to_type)
+{
+    const auto * from_variant = typeid_cast<const DataTypeVariant *>(from_type.get());
+    const auto * to_variant = typeid_cast<const DataTypeVariant *>(to_type.get());
+    if (!from_variant || !to_variant)
+        return false;
+
+    const auto & to_variants = to_variant->getVariants();
+    std::unordered_set<String> to_variant_types;
+    to_variant_types.reserve(to_variants.size());
+    for (const auto & variant : to_variants)
+        to_variant_types.insert(variant->getName());
+
+    for (const auto & variant : from_variant->getVariants())
+    {
+        if (!to_variant_types.contains(variant->getName()))
+            return false;
+    }
+
+    return true;
+}
+
 
 void registerDataTypeVariant(DataTypeFactory & factory)
 {
diff --git a/src/DataTypes/DataTypeVariant.h b/src/DataTypes/DataTypeVariant.h
index ca15dff1476..1a1cb6c12f2 100644
--- a/src/DataTypes/DataTypeVariant.h
+++ b/src/DataTypes/DataTypeVariant.h
@@ -61,5 +61,9 @@ private:
     SerializationPtr doGetDefaultSerialization() const override;
 };
 
+/// Check if the conversion from from_type to to_type is a Variant extension
+/// (both types are Variants and to_type contains all variants from from_type).
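+/// For example, Variant(UInt64, String) -> Variant(UInt64, String, Date) is an extension,
+/// while Variant(UInt64, String) -> Variant(UInt64, Date) is not.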
+bool isVariantExtension(const DataTypePtr & from_type, const DataTypePtr & to_type);
+
 }
 
diff --git a/src/DataTypes/Serializations/SerializationVariant.cpp b/src/DataTypes/Serializations/SerializationVariant.cpp
index 78ec0a5e2da..48a78dd54a9 100644
--- a/src/DataTypes/Serializations/SerializationVariant.cpp
+++ b/src/DataTypes/Serializations/SerializationVariant.cpp
@@ -275,7 +275,7 @@ void SerializationVariant::deserializeBinaryBulkWithMultipleStreams(
     {
         auto * discriminators_stream = settings.getter(settings.path);
         if (!discriminators_stream)
-            throw Exception(ErrorCodes::LOGICAL_ERROR, "Got empty stream for VariantDiscriminators in SerializationVariant::deserializeBinaryBulkWithMultipleStreams");
+            return;
 
         SerializationNumber<ColumnVariant::Discriminator>().deserializeBinaryBulk(*col.getLocalDiscriminatorsPtr()->assumeMutable(), *discriminators_stream, limit, 0);
         addToSubstreamsCache(cache, settings.path, col.getLocalDiscriminatorsPtr());
diff --git a/src/DataTypes/Serializations/SerializationVariantElement.cpp b/src/DataTypes/Serializations/SerializationVariantElement.cpp
index be91e0ba2ee..80524cbd814 100644
--- a/src/DataTypes/Serializations/SerializationVariantElement.cpp
+++ b/src/DataTypes/Serializations/SerializationVariantElement.cpp
@@ -91,7 +91,7 @@ void SerializationVariantElement::deserializeBinaryBulkWithMultipleStreams(
     {
         auto * discriminators_stream = settings.getter(settings.path);
         if (!discriminators_stream)
-            throw Exception(ErrorCodes::LOGICAL_ERROR, "Got empty stream for VariantDiscriminators in SerializationVariantElement::deserializeBinaryBulkWithMultipleStreams");
+            return;
 
         /// If we started to read a new column, reinitialize discriminators column in deserialization state.
         if (!variant_element_state->discriminators || result_column->empty())
@@ -156,10 +156,24 @@ void SerializationVariantElement::deserializeBinaryBulkWithMultipleStreams(
         return;
     }
 
+    size_t prev_variant_size = variant_element_state->variant->size();
     addVariantToPath(settings.path);
     nested_serialization->deserializeBinaryBulkWithMultipleStreams(variant_element_state->variant, variant_limit, settings, variant_element_state->variant_element_state, cache);
     removeVariantFromPath(settings.path);
 
+    /// If nothing was deserialized when variant_limit > 0, it means
+    /// that we don't have a stream for such a sub-column.
+    /// It may happen during ALTER MODIFY COLUMN with a Variant extension.
+    /// In this case we should just insert default values.
+    if (variant_element_state->variant->empty())
+    {
+        mutable_column->insertManyDefaults(limit);
+        return;
+    }
+
+    if (variant_element_state->variant->size() != prev_variant_size + variant_limit)
+        throw Exception(ErrorCodes::LOGICAL_ERROR, "Unexpected variant column size after deserialization. Expected {}, got {}", prev_variant_size + variant_limit, variant_element_state->variant->size());
+
     size_t variant_offset = variant_element_state->variant->size() - variant_limit;
 
     /// If we have only our discriminator in range, insert the whole range to result column.
diff --git a/src/Storages/MergeTree/MutateTask.cpp b/src/Storages/MergeTree/MutateTask.cpp
index e4070aa8262..44734ec98c0 100644
--- a/src/Storages/MergeTree/MutateTask.cpp
+++ b/src/Storages/MergeTree/MutateTask.cpp
@@ -22,6 +22,7 @@
 #include
 #include
 #include
+#include <DataTypes/DataTypeVariant.h>
 #include
 #include
 
@@ -1921,7 +1922,7 @@ static bool canSkipConversionToNullable(const MergeTreeDataPartPtr & part, const
     if (!part_column)
         return false;
 
-    /// For ALTER MODIFY COLUMN from 'Type' to 'Nullable(Type)' we can skip mutatation and
+    /// For ALTER MODIFY COLUMN from 'Type' to 'Nullable(Type)' we can skip mutation and
     /// apply only metadata conversion. But it doesn't work for custom serialization.
     const auto * to_nullable = typeid_cast<const DataTypeNullable *>(command.data_type.get());
     if (!to_nullable)
@@ -1937,6 +1938,20 @@ static bool canSkipConversionToNullable(const MergeTreeDataPartPtr & part, const
     return true;
 }
 
+static bool canSkipConversionToVariant(const MergeTreeDataPartPtr & part, const MutationCommand & command)
+{
+    if (command.type != MutationCommand::READ_COLUMN)
+        return false;
+
+    auto part_column = part->tryGetColumn(command.column_name);
+    if (!part_column)
+        return false;
+
+    /// For ALTER MODIFY COLUMN with Variant extension (like 'Variant(T1, T2)' to 'Variant(T1, T2, T3, ...)')
+    /// we can skip mutation and apply only metadata conversion.
+    return isVariantExtension(part_column->type, command.data_type);
+}
+
 static bool canSkipMutationCommandForPart(const MergeTreeDataPartPtr & part, const MutationCommand & command, const ContextPtr & context)
 {
     if (command.partition)
@@ -1952,6 +1967,9 @@ static bool canSkipMutationCommandForPart(const MergeTreeDataPartPtr & part, con
     if (canSkipConversionToNullable(part, command))
         return true;
 
+    if (canSkipConversionToVariant(part, command))
+        return true;
+
     return false;
 }
 
diff --git a/tests/queries/0_stateless/02941_variant_type_alters.reference b/tests/queries/0_stateless/02941_variant_type_alters.reference
new file mode 100644
index 00000000000..52c834e455b
--- /dev/null
+++ b/tests/queries/0_stateless/02941_variant_type_alters.reference
@@ -0,0 +1,330 @@
+Memory
+initial insert
+alter add column 1
+0 0 \N \N \N
+1 1 \N \N \N
+2 2 \N \N \N
+insert after alter add column 1
+0 0 \N \N \N
+1 1 \N \N \N
+2 2 \N \N \N
+3 3 3 \N 3
+4 4 4 \N 4
+5 5 5 \N 5
+6 6 str_6 str_6 \N
+7 7 str_7 str_7 \N
+8 8 str_8 str_8 \N
+9 9 \N \N \N
+10 10 \N \N \N
+11 11 \N \N \N
+12 12 12 \N 12
+13 13 str_13 str_13 \N
+14 14 \N \N \N
+alter modify column 1
+0 0 \N \N \N \N
+1 1 \N \N \N \N
+2 2 \N \N \N \N
+3 3 3 \N 3 \N
+4 4 4 \N 4 \N
+5 5 5 \N 5 \N
+6 6 str_6 str_6 \N \N
+7 7 str_7 str_7 \N \N
+8 8 str_8 str_8 \N \N
+9 9 \N \N \N \N
+10 10 \N \N \N \N
+11 11 \N \N \N \N
+12 12 12 \N 12 \N
+13 13 str_13 str_13 \N \N
+14 14 \N \N \N \N
+insert after alter modify column 1
+0 0 \N \N \N \N
+1 1 \N \N \N \N
+2 2 \N \N \N \N
+3 3 3 \N 3 \N
+4 4 4 \N 4 \N
+5 5 5 \N 5 \N
+6 6 str_6 str_6 \N \N
+7 7 str_7 str_7 \N \N
+8 8 str_8 str_8 \N \N
+9 9 \N \N \N \N
+10 10 \N \N \N \N
+11 11 \N \N \N \N
+12 12 12 \N 12 \N
+13 13 str_13 str_13 \N \N
+14 14 \N \N \N \N
+15 15 1970-01-16 \N \N 1970-01-16
+16 16 1970-01-17 \N \N 1970-01-17
+17 17 1970-01-18 \N \N 1970-01-18
+18 18 1970-01-19 \N \N 1970-01-19
+19 19 \N \N \N \N
+20 20 20 \N 20 \N
+21 21 str_21 str_21 \N \N
+alter modify column 2
+0 0 \N \N \N \N \N \N
+1 1 \N \N \N \N \N \N
+2 2 \N \N \N \N \N \N
+3 3 \N \N 3 \N 3 \N
+4 4 \N \N 4 \N 4 \N
+5 5 \N \N 5 \N 5 \N
+6 6 \N \N str_6 str_6 \N \N
+7 7 \N \N str_7 str_7 \N \N
+8 8 \N \N str_8 str_8 \N \N
+9 9 \N \N 
\N \N \N \N +10 10 \N \N \N \N \N \N +11 11 \N \N \N \N \N \N +12 12 \N \N 12 \N 12 \N +13 13 \N \N str_13 str_13 \N \N +14 14 \N \N \N \N \N \N +15 15 \N \N 1970-01-16 \N \N 1970-01-16 +16 16 \N \N 1970-01-17 \N \N 1970-01-17 +17 17 \N \N 1970-01-18 \N \N 1970-01-18 +18 18 \N \N 1970-01-19 \N \N 1970-01-19 +19 19 \N \N \N \N \N \N +20 20 \N \N 20 \N 20 \N +21 21 \N \N str_21 str_21 \N \N +insert after alter modify column 2 +0 0 \N \N \N \N \N \N +1 1 \N \N \N \N \N \N +2 2 \N \N \N \N \N \N +3 3 \N \N 3 \N 3 \N +4 4 \N \N 4 \N 4 \N +5 5 \N \N 5 \N 5 \N +6 6 \N \N str_6 str_6 \N \N +7 7 \N \N str_7 str_7 \N \N +8 8 \N \N str_8 str_8 \N \N +9 9 \N \N \N \N \N \N +10 10 \N \N \N \N \N \N +11 11 \N \N \N \N \N \N +12 12 \N \N 12 \N 12 \N +13 13 \N \N str_13 str_13 \N \N +14 14 \N \N \N \N \N \N +15 15 \N \N 1970-01-16 \N \N 1970-01-16 +16 16 \N \N 1970-01-17 \N \N 1970-01-17 +17 17 \N \N 1970-01-18 \N \N 1970-01-18 +18 18 \N \N 1970-01-19 \N \N 1970-01-19 +19 19 \N \N \N \N \N \N +20 20 \N \N 20 \N 20 \N +21 21 \N \N str_21 str_21 \N \N +22 str_22 \N str_22 \N \N \N \N +23 \N \N \N \N \N \N \N +24 24 24 \N \N \N \N \N +MergeTree compact +initial insert +alter add column 1 +0 0 \N \N \N +1 1 \N \N \N +2 2 \N \N \N +insert after alter add column 1 +0 0 \N \N \N +1 1 \N \N \N +2 2 \N \N \N +3 3 3 \N 3 +4 4 4 \N 4 +5 5 5 \N 5 +6 6 str_6 str_6 \N +7 7 str_7 str_7 \N +8 8 str_8 str_8 \N +9 9 \N \N \N +10 10 \N \N \N +11 11 \N \N \N +12 12 12 \N 12 +13 13 str_13 str_13 \N +14 14 \N \N \N +alter modify column 1 +0 0 \N \N \N \N +1 1 \N \N \N \N +2 2 \N \N \N \N +3 3 3 \N 3 \N +4 4 4 \N 4 \N +5 5 5 \N 5 \N +6 6 str_6 str_6 \N \N +7 7 str_7 str_7 \N \N +8 8 str_8 str_8 \N \N +9 9 \N \N \N \N +10 10 \N \N \N \N +11 11 \N \N \N \N +12 12 12 \N 12 \N +13 13 str_13 str_13 \N \N +14 14 \N \N \N \N +insert after alter modify column 1 +0 0 \N \N \N \N +1 1 \N \N \N \N +2 2 \N \N \N \N +3 3 3 \N 3 \N +4 4 4 \N 4 \N +5 5 5 \N 5 \N +6 6 str_6 str_6 \N \N +7 7 str_7 str_7 \N \N +8 8 str_8 str_8 \N \N +9 9 \N \N \N \N +10 10 \N \N \N \N +11 11 \N \N \N \N +12 12 12 \N 12 \N +13 13 str_13 str_13 \N \N +14 14 \N \N \N \N +15 15 1970-01-16 \N \N 1970-01-16 +16 16 1970-01-17 \N \N 1970-01-17 +17 17 1970-01-18 \N \N 1970-01-18 +18 18 1970-01-19 \N \N 1970-01-19 +19 19 \N \N \N \N +20 20 20 \N 20 \N +21 21 str_21 str_21 \N \N +alter modify column 2 +0 0 0 \N \N \N \N \N +1 1 1 \N \N \N \N \N +2 2 2 \N \N \N \N \N +3 3 3 \N 3 \N 3 \N +4 4 4 \N 4 \N 4 \N +5 5 5 \N 5 \N 5 \N +6 6 6 \N str_6 str_6 \N \N +7 7 7 \N str_7 str_7 \N \N +8 8 8 \N str_8 str_8 \N \N +9 9 9 \N \N \N \N \N +10 10 10 \N \N \N \N \N +11 11 11 \N \N \N \N \N +12 12 12 \N 12 \N 12 \N +13 13 13 \N str_13 str_13 \N \N +14 14 14 \N \N \N \N \N +15 15 15 \N 1970-01-16 \N \N 1970-01-16 +16 16 16 \N 1970-01-17 \N \N 1970-01-17 +17 17 17 \N 1970-01-18 \N \N 1970-01-18 +18 18 18 \N 1970-01-19 \N \N 1970-01-19 +19 19 19 \N \N \N \N \N +20 20 20 \N 20 \N 20 \N +21 21 21 \N str_21 str_21 \N \N +insert after alter modify column 2 +0 0 0 \N \N \N \N \N +1 1 1 \N \N \N \N \N +2 2 2 \N \N \N \N \N +3 3 3 \N 3 \N 3 \N +4 4 4 \N 4 \N 4 \N +5 5 5 \N 5 \N 5 \N +6 6 6 \N str_6 str_6 \N \N +7 7 7 \N str_7 str_7 \N \N +8 8 8 \N str_8 str_8 \N \N +9 9 9 \N \N \N \N \N +10 10 10 \N \N \N \N \N +11 11 11 \N \N \N \N \N +12 12 12 \N 12 \N 12 \N +13 13 13 \N str_13 str_13 \N \N +14 14 14 \N \N \N \N \N +15 15 15 \N 1970-01-16 \N \N 1970-01-16 +16 16 16 \N 1970-01-17 \N \N 1970-01-17 +17 17 17 \N 1970-01-18 \N \N 1970-01-18 +18 18 18 \N 1970-01-19 \N \N 1970-01-19 +19 19 19 \N \N 
\N \N \N +20 20 20 \N 20 \N 20 \N +21 21 21 \N str_21 str_21 \N \N +22 str_22 \N str_22 \N \N \N \N +23 \N \N \N \N \N \N \N +24 24 24 \N \N \N \N \N +MergeTree wide +initial insert +alter add column 1 +0 0 \N \N \N +1 1 \N \N \N +2 2 \N \N \N +insert after alter add column 1 +0 0 \N \N \N +1 1 \N \N \N +2 2 \N \N \N +3 3 3 \N 3 +4 4 4 \N 4 +5 5 5 \N 5 +6 6 str_6 str_6 \N +7 7 str_7 str_7 \N +8 8 str_8 str_8 \N +9 9 \N \N \N +10 10 \N \N \N +11 11 \N \N \N +12 12 12 \N 12 +13 13 str_13 str_13 \N +14 14 \N \N \N +alter modify column 1 +0 0 \N \N \N \N +1 1 \N \N \N \N +2 2 \N \N \N \N +3 3 3 \N 3 \N +4 4 4 \N 4 \N +5 5 5 \N 5 \N +6 6 str_6 str_6 \N \N +7 7 str_7 str_7 \N \N +8 8 str_8 str_8 \N \N +9 9 \N \N \N \N +10 10 \N \N \N \N +11 11 \N \N \N \N +12 12 12 \N 12 \N +13 13 str_13 str_13 \N \N +14 14 \N \N \N \N +insert after alter modify column 1 +0 0 \N \N \N \N +1 1 \N \N \N \N +2 2 \N \N \N \N +3 3 3 \N 3 \N +4 4 4 \N 4 \N +5 5 5 \N 5 \N +6 6 str_6 str_6 \N \N +7 7 str_7 str_7 \N \N +8 8 str_8 str_8 \N \N +9 9 \N \N \N \N +10 10 \N \N \N \N +11 11 \N \N \N \N +12 12 12 \N 12 \N +13 13 str_13 str_13 \N \N +14 14 \N \N \N \N +15 15 1970-01-16 \N \N 1970-01-16 +16 16 1970-01-17 \N \N 1970-01-17 +17 17 1970-01-18 \N \N 1970-01-18 +18 18 1970-01-19 \N \N 1970-01-19 +19 19 \N \N \N \N +20 20 20 \N 20 \N +21 21 str_21 str_21 \N \N +alter modify column 2 +0 0 0 \N \N \N \N \N +1 1 1 \N \N \N \N \N +2 2 2 \N \N \N \N \N +3 3 3 \N 3 \N 3 \N +4 4 4 \N 4 \N 4 \N +5 5 5 \N 5 \N 5 \N +6 6 6 \N str_6 str_6 \N \N +7 7 7 \N str_7 str_7 \N \N +8 8 8 \N str_8 str_8 \N \N +9 9 9 \N \N \N \N \N +10 10 10 \N \N \N \N \N +11 11 11 \N \N \N \N \N +12 12 12 \N 12 \N 12 \N +13 13 13 \N str_13 str_13 \N \N +14 14 14 \N \N \N \N \N +15 15 15 \N 1970-01-16 \N \N 1970-01-16 +16 16 16 \N 1970-01-17 \N \N 1970-01-17 +17 17 17 \N 1970-01-18 \N \N 1970-01-18 +18 18 18 \N 1970-01-19 \N \N 1970-01-19 +19 19 19 \N \N \N \N \N +20 20 20 \N 20 \N 20 \N +21 21 21 \N str_21 str_21 \N \N +insert after alter modify column 2 +0 0 0 \N \N \N \N \N +1 1 1 \N \N \N \N \N +2 2 2 \N \N \N \N \N +3 3 3 \N 3 \N 3 \N +4 4 4 \N 4 \N 4 \N +5 5 5 \N 5 \N 5 \N +6 6 6 \N str_6 str_6 \N \N +7 7 7 \N str_7 str_7 \N \N +8 8 8 \N str_8 str_8 \N \N +9 9 9 \N \N \N \N \N +10 10 10 \N \N \N \N \N +11 11 11 \N \N \N \N \N +12 12 12 \N 12 \N 12 \N +13 13 13 \N str_13 str_13 \N \N +14 14 14 \N \N \N \N \N +15 15 15 \N 1970-01-16 \N \N 1970-01-16 +16 16 16 \N 1970-01-17 \N \N 1970-01-17 +17 17 17 \N 1970-01-18 \N \N 1970-01-18 +18 18 18 \N 1970-01-19 \N \N 1970-01-19 +19 19 19 \N \N \N \N \N +20 20 20 \N 20 \N 20 \N +21 21 21 \N str_21 str_21 \N \N +22 str_22 \N str_22 \N \N \N \N +23 \N \N \N \N \N \N \N +24 24 24 \N \N \N \N \N diff --git a/tests/queries/0_stateless/02941_variant_type_alters.sh b/tests/queries/0_stateless/02941_variant_type_alters.sh new file mode 100755 index 00000000000..9b0d4febd65 --- /dev/null +++ b/tests/queries/0_stateless/02941_variant_type_alters.sh @@ -0,0 +1,61 @@ +#!/usr/bin/env bash +# Tags: long + +CUR_DIR=$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd) +# reset --log_comment +CLICKHOUSE_LOG_COMMENT= +# shellcheck source=../shell_config.sh +. 
"$CUR_DIR"/../shell_config.sh + +CH_CLIENT="$CLICKHOUSE_CLIENT --allow_experimental_variant_type=1 --use_variant_when_no_common_type_in_if=1 " + +function run() +{ + echo "initial insert" + $CH_CLIENT -q "insert into test select number, number from numbers(3)" + + echo "alter add column 1" + $CH_CLIENT -q "alter table test add column v Variant(UInt64, String) settings mutations_sync=1" + $CH_CLIENT -q "select x, y, v, v.String, v.UInt64 from test order by x" + + echo "insert after alter add column 1" + $CH_CLIENT -q "insert into test select number, number, number from numbers(3, 3)" + $CH_CLIENT -q "insert into test select number, number, 'str_' || toString(number) from numbers(6, 3)" + $CH_CLIENT -q "insert into test select number, number, NULL from numbers(9, 3)" + $CH_CLIENT -q "insert into test select number, number, multiIf(number % 3 == 0, number, number % 3 == 1, 'str_' || toString(number), NULL) from numbers(12, 3)" + $CH_CLIENT -q "select x, y, v, v.String, v.UInt64 from test order by x" + + echo "alter modify column 1" + $CH_CLIENT -q "alter table test modify column v Variant(UInt64, String, Date) settings mutations_sync=1" + $CH_CLIENT -q "select x, y, v, v.String, v.UInt64, v.Date from test order by x" + + echo "insert after alter modify column 1" + $CH_CLIENT -q "insert into test select number, number, toDate(number) from numbers(15, 3)" + $CH_CLIENT -q "insert into test select number, number, multiIf(number % 4 == 0, number, number % 4 == 1, 'str_' || toString(number), number % 4 == 2, toDate(number), NULL) from numbers(18, 4)" + $CH_CLIENT -q "select x, y, v, v.String, v.UInt64, v.Date from test order by x" + + echo "alter modify column 2" + $CH_CLIENT -q "alter table test modify column y Variant(UInt64, String) settings mutations_sync=1" + $CH_CLIENT -q "select x, y, y.UInt64, y.String, v, v.String, v.UInt64, v.Date from test order by x" + + echo "insert after alter modify column 2" + $CH_CLIENT -q "insert into test select number, multiIf(number % 3 == 0, number, number % 3 == 1, 'str_' || toString(number), NULL), NULL from numbers(22, 3)" + $CH_CLIENT -q "select x, y, y.UInt64, y.String, v, v.String, v.UInt64, v.Date from test order by x" +} + +$CH_CLIENT -q "drop table if exists test;" + +echo "Memory" +$CH_CLIENT -q "create table test (x UInt64, y UInt64) engine=Memory" +run +$CH_CLIENT -q "drop table test;" + +echo "MergeTree compact" +$CH_CLIENT -q "create table test (x UInt64, y UInt64) engine=MergeTree order by x settings min_rows_for_wide_part=100000000, min_bytes_for_wide_part=1000000000;" +run +$CH_CLIENT -q "drop table test;" + +echo "MergeTree wide" +$CH_CLIENT -q "create table test (x UInt64, y UInt64 ) engine=MergeTree order by x settings min_rows_for_wide_part=1, min_bytes_for_wide_part=1;" +run +$CH_CLIENT -q "drop table test;" From 4109b6608186b1b9d9dce60f1821313294b7e7c4 Mon Sep 17 00:00:00 2001 From: avogar Date: Thu, 18 Jan 2024 17:30:32 +0000 Subject: [PATCH 064/245] Remove unneded tag from test --- tests/queries/0_stateless/02941_variant_type_alters.sh | 1 - 1 file changed, 1 deletion(-) diff --git a/tests/queries/0_stateless/02941_variant_type_alters.sh b/tests/queries/0_stateless/02941_variant_type_alters.sh index 9b0d4febd65..7e2ecbd67aa 100755 --- a/tests/queries/0_stateless/02941_variant_type_alters.sh +++ b/tests/queries/0_stateless/02941_variant_type_alters.sh @@ -1,5 +1,4 @@ #!/usr/bin/env bash -# Tags: long CUR_DIR=$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd) # reset --log_comment From 3dcc2056a59f9b374b4de3b72c30107dd7825d47 Mon Sep 17 
00:00:00 2001
From: avogar
Date: Thu, 18 Jan 2024 20:39:36 +0000
Subject: [PATCH 065/245] Fix conflicts

---
 src/DataTypes/Serializations/ISerialization.cpp       | 1 +
 src/DataTypes/Serializations/ISerialization.h         | 1 +
 src/DataTypes/Serializations/SerializationVariant.cpp | 2 +-
 3 files changed, 3 insertions(+), 1 deletion(-)

diff --git a/src/DataTypes/Serializations/ISerialization.cpp b/src/DataTypes/Serializations/ISerialization.cpp
index 2f1eb1887af..7d57d72090b 100644
--- a/src/DataTypes/Serializations/ISerialization.cpp
+++ b/src/DataTypes/Serializations/ISerialization.cpp
@@ -54,6 +54,7 @@ const std::set<SubstreamType> ISerialization::Substream::named_types
     TupleElement,
     NamedOffsets,
     NamedNullMap,
+    NamedVariantDiscriminators,
 };
 
 String ISerialization::Substream::toString() const
diff --git a/src/DataTypes/Serializations/ISerialization.h b/src/DataTypes/Serializations/ISerialization.h
index 64a7a889640..7fba9db4acf 100644
--- a/src/DataTypes/Serializations/ISerialization.h
+++ b/src/DataTypes/Serializations/ISerialization.h
@@ -155,6 +155,7 @@ public:
         ObjectData,
 
         VariantDiscriminators,
+        NamedVariantDiscriminators,
         VariantOffsets,
         VariantElements,
         VariantElement,
diff --git a/src/DataTypes/Serializations/SerializationVariant.cpp b/src/DataTypes/Serializations/SerializationVariant.cpp
index 48a78dd54a9..5af94364167 100644
--- a/src/DataTypes/Serializations/SerializationVariant.cpp
+++ b/src/DataTypes/Serializations/SerializationVariant.cpp
@@ -36,7 +36,7 @@ void SerializationVariant::enumerateStreams(
     const auto * type_variant = data.type ? &assert_cast<const DataTypeVariant &>(*data.type) : nullptr;
     const auto * column_variant = data.column ? &assert_cast<const ColumnVariant &>(*data.column) : nullptr;
 
-    auto discriminators_serialization = std::make_shared<SerializationNamed>(std::make_shared<SerializationNumber<ColumnVariant::Discriminator>>(), "discr", false);
+    auto discriminators_serialization = std::make_shared<SerializationNamed>(std::make_shared<SerializationNumber<ColumnVariant::Discriminator>>(), "discr", SubstreamType::NamedVariantDiscriminators);
 
     auto local_discriminators = column_variant ? 
column_variant->getLocalDiscriminatorsPtr() : nullptr;
 
     settings.path.push_back(Substream::VariantDiscriminators);

From cfc8c60aa70917e48281e3583adc922967326d50 Mon Sep 17 00:00:00 2001
From: avogar
Date: Thu, 18 Jan 2024 21:26:55 +0000
Subject: [PATCH 066/245] Fix build

---
 src/DataTypes/Serializations/SerializationVariantElement.cpp | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/src/DataTypes/Serializations/SerializationVariantElement.cpp b/src/DataTypes/Serializations/SerializationVariantElement.cpp
index 80524cbd814..8d0acee1c2b 100644
--- a/src/DataTypes/Serializations/SerializationVariantElement.cpp
+++ b/src/DataTypes/Serializations/SerializationVariantElement.cpp
@@ -208,8 +208,8 @@ void SerializationVariantElement::removeVariantFromPath(DB::ISerialization::Subs
 }
 
 SerializationVariantElement::VariantSubcolumnCreator::VariantSubcolumnCreator(
-    const DB::ColumnPtr & local_discriminators_,
-    const DB::String & variant_element_name_,
+    const ColumnPtr & local_discriminators_,
+    const String & variant_element_name_,
     const ColumnVariant::Discriminator global_variant_discriminator_,
     const ColumnVariant::Discriminator local_variant_discriminator_)
     : local_discriminators(local_discriminators_)

From 7ae631de1ed1ff4bcb8bac5e06c2026db3ff972c Mon Sep 17 00:00:00 2001
From: avogar
Date: Fri, 19 Jan 2024 10:23:40 +0000
Subject: [PATCH 067/245] Remove wrong check, remove duplicate tests

---
 .../SerializationVariantElement.cpp           |  4 --
 ...> 02943_variant_read_subcolumns.reference} |  0
 ..._1.sh => 02943_variant_read_subcolumns.sh} |  0
 .../02943_variant_read_subcolumns_2.reference |  6 ---
 .../02943_variant_read_subcolumns_2.sh        | 38 -------------------
 5 files changed, 48 deletions(-)
 rename tests/queries/0_stateless/{02943_variant_read_subcolumns_1.reference => 02943_variant_read_subcolumns.reference} (100%)
 rename tests/queries/0_stateless/{02943_variant_read_subcolumns_1.sh => 02943_variant_read_subcolumns.sh} (100%)
 delete mode 100644 tests/queries/0_stateless/02943_variant_read_subcolumns_2.reference
 delete mode 100755 tests/queries/0_stateless/02943_variant_read_subcolumns_2.sh

diff --git a/src/DataTypes/Serializations/SerializationVariantElement.cpp b/src/DataTypes/Serializations/SerializationVariantElement.cpp
index 8d0acee1c2b..56f0e5d77be 100644
--- a/src/DataTypes/Serializations/SerializationVariantElement.cpp
+++ b/src/DataTypes/Serializations/SerializationVariantElement.cpp
@@ -156,7 +156,6 @@ void SerializationVariantElement::deserializeBinaryBulkWithMultipleStreams(
         return;
     }
 
-    size_t prev_variant_size = variant_element_state->variant->size();
     addVariantToPath(settings.path);
     nested_serialization->deserializeBinaryBulkWithMultipleStreams(variant_element_state->variant, variant_limit, settings, variant_element_state->variant_element_state, cache);
     removeVariantFromPath(settings.path);
@@ -171,9 +170,6 @@ void SerializationVariantElement::deserializeBinaryBulkWithMultipleStreams(
         return;
     }
 
-    if (variant_element_state->variant->size() != prev_variant_size + variant_limit)
-        throw Exception(ErrorCodes::LOGICAL_ERROR, "Unexpected variant column size after deserialization. Expected {}, got {}", prev_variant_size + variant_limit, variant_element_state->variant->size());
-
    size_t variant_offset = variant_element_state->variant->size() - variant_limit;
 
    /// If we have only our discriminator in range, insert the whole range to result column.
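A short note on why the size check removed above had to go, with a hedged sketch (the names follow the diffs above; the control flow is simplified, not the literal function body): after a metadata-only Variant extension an old part has no stream at all for a newly added variant, so the nested deserialization can legitimately read nothing even when variant_limit > 0, and the deserialized size is not guaranteed to advance by exactly variant_limit on every valid read path. The empty() branch kept by the earlier patch already turns the missing-stream case into default values:

    /// Simplified shape of the read path after this patch:
    addVariantToPath(settings.path);
    nested_serialization->deserializeBinaryBulkWithMultipleStreams(
        variant_element_state->variant, variant_limit, settings,
        variant_element_state->variant_element_state, cache);
    removeVariantFromPath(settings.path);

    if (variant_element_state->variant->empty())
    {
        /// No stream for this variant in the part, e.g. the part predates
        /// ALTER MODIFY COLUMN Variant(T1, T2) -> Variant(T1, T2, T3):
        /// fill the requested range with defaults and finish.
        mutable_column->insertManyDefaults(limit);
        return;
    }

    /// Asserting size() == prev_size + variant_limit here could misfire
    /// on valid data, which is presumably why the check was removed.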
diff --git a/tests/queries/0_stateless/02943_variant_read_subcolumns_1.reference b/tests/queries/0_stateless/02943_variant_read_subcolumns.reference similarity index 100% rename from tests/queries/0_stateless/02943_variant_read_subcolumns_1.reference rename to tests/queries/0_stateless/02943_variant_read_subcolumns.reference diff --git a/tests/queries/0_stateless/02943_variant_read_subcolumns_1.sh b/tests/queries/0_stateless/02943_variant_read_subcolumns.sh similarity index 100% rename from tests/queries/0_stateless/02943_variant_read_subcolumns_1.sh rename to tests/queries/0_stateless/02943_variant_read_subcolumns.sh diff --git a/tests/queries/0_stateless/02943_variant_read_subcolumns_2.reference b/tests/queries/0_stateless/02943_variant_read_subcolumns_2.reference deleted file mode 100644 index 4b93782cddf..00000000000 --- a/tests/queries/0_stateless/02943_variant_read_subcolumns_2.reference +++ /dev/null @@ -1,6 +0,0 @@ -Memory -test -MergeTree compact -test -MergeTree wide -test diff --git a/tests/queries/0_stateless/02943_variant_read_subcolumns_2.sh b/tests/queries/0_stateless/02943_variant_read_subcolumns_2.sh deleted file mode 100755 index 9ccad55191f..00000000000 --- a/tests/queries/0_stateless/02943_variant_read_subcolumns_2.sh +++ /dev/null @@ -1,38 +0,0 @@ -#!/usr/bin/env bash -# Tags: long - -CUR_DIR=$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd) -# reset --log_comment -CLICKHOUSE_LOG_COMMENT= -# shellcheck source=../shell_config.sh -. "$CUR_DIR"/../shell_config.sh - -CH_CLIENT="$CLICKHOUSE_CLIENT --allow_experimental_variant_type=1 --use_variant_when_no_common_type_in_if=1 " - - -function test() -{ - echo "test" - $CH_CLIENT -q "insert into test select number, multiIf(number % 3 == 2, NULL, number % 3 == 1, number, arrayMap(x -> multiIf(number % 9 == 0, NULL, number % 9 == 3, 'str_' || toString(number), number), range(number % 10))) from numbers(1000000) settings min_insert_block_size_rows=100000" - $CH_CLIENT -q "select v, v.UInt64, v.\`Array(Variant(String, UInt64))\`, v.\`Array(Variant(String, UInt64))\`.size0, v.\`Array(Variant(String, UInt64))\`.UInt64 from test order by id format Null" - $CH_CLIENT -q "select v.UInt64, v.\`Array(Variant(String, UInt64))\`, v.\`Array(Variant(String, UInt64))\`.size0, v.\`Array(Variant(String, UInt64))\`.UInt64 from test order by id format Null" - $CH_CLIENT -q "select v.\`Array(Variant(String, UInt64))\`, v.\`Array(Variant(String, UInt64))\`.size0, v.\`Array(Variant(String, UInt64))\`.UInt64, v.\`Array(Variant(String, UInt64))\`.String from test order by id format Null" -} - -$CH_CLIENT -q "drop table if exists test;" - -echo "Memory" -$CH_CLIENT -q "create table test (id UInt64, v Variant(UInt64, Array(Variant(String, UInt64)))) engine=Memory" -test -$CH_CLIENT -q "drop table test;" - -echo "MergeTree compact" -$CH_CLIENT -q "create table test (id UInt64, v Variant(UInt64, Array(Variant(String, UInt64)))) engine=MergeTree order by id settings min_rows_for_wide_part=1000000000, min_bytes_for_wide_part=10000000000;" -test -$CH_CLIENT -q "drop table test;" - -echo "MergeTree wide" -$CH_CLIENT -q "create table test (id UInt64, v Variant(UInt64, Array(Variant(String, UInt64)))) engine=MergeTree order by id settings min_rows_for_wide_part=1, min_bytes_for_wide_part=1;" -test -$CH_CLIENT -q "drop table test;" - From 0c85339ddb26e00ac64d6c763a0f5019b7ee2619 Mon Sep 17 00:00:00 2001 From: avogar Date: Fri, 19 Jan 2024 10:50:13 +0000 Subject: [PATCH 068/245] Fix style --- src/DataTypes/Serializations/SerializationVariantElement.cpp | 1 - 1 file 
changed, 1 deletion(-) diff --git a/src/DataTypes/Serializations/SerializationVariantElement.cpp b/src/DataTypes/Serializations/SerializationVariantElement.cpp index 56f0e5d77be..053f8d22d5a 100644 --- a/src/DataTypes/Serializations/SerializationVariantElement.cpp +++ b/src/DataTypes/Serializations/SerializationVariantElement.cpp @@ -9,7 +9,6 @@ namespace DB namespace ErrorCodes { extern const int NOT_IMPLEMENTED; - extern const int LOGICAL_ERROR; } void SerializationVariantElement::enumerateStreams( From a196d04a1c2d5f36ec43c2b0947916be7321037c Mon Sep 17 00:00:00 2001 From: kssenii Date: Fri, 19 Jan 2024 12:04:16 +0100 Subject: [PATCH 069/245] Update test --- .../test_broken_projections/test.py | 94 +++++++++++++++++-- 1 file changed, 84 insertions(+), 10 deletions(-) diff --git a/tests/integration/test_broken_projections/test.py b/tests/integration/test_broken_projections/test.py index 1b192e0df24..48ed10d0f87 100644 --- a/tests/integration/test_broken_projections/test.py +++ b/tests/integration/test_broken_projections/test.py @@ -159,6 +159,19 @@ def get_broken_projections_info(node, table): ).strip() +def get_projections_info(node, table): + return node.query( + f""" + SELECT parent_name, name, is_broken + FROM system.projection_parts + WHERE table='{table}' + AND active = 1 + AND database=currentDatabase() + ORDER BY parent_name, name + """ + ).strip() + + def optimize(node, table, final, no_wait): query = f"OPTIMIZE TABLE {table}" if final: @@ -389,6 +402,11 @@ def test_broken_ignored_replicated(cluster): assert "has a broken projection" not in check_table_full(node, table_name) +def get_random_string(string_length=8): + alphabet = string.ascii_letters + string.digits + return "".join((random.choice(alphabet) for _ in range(string_length))) + + def test_broken_projections_in_backups(cluster): node = cluster.instances["node"] @@ -400,6 +418,10 @@ def test_broken_projections_in_backups(cluster): insert(node, table_name, 10, 5) insert(node, table_name, 15, 5) + assert ["all_0_0_0", "all_1_1_0", "all_2_2_0", "all_3_3_0"] == get_parts( + node, table_name + ) + check(node, table_name, 1) break_projection(node, table_name, "proj", "all_2_2_0", "data") @@ -409,10 +431,23 @@ def test_broken_projections_in_backups(cluster): node, table_name ) + assert ( + "all_0_0_0\tproj\t0\n" + "all_0_0_0\tproj_2\t0\n" + "all_1_1_0\tproj\t0\n" + "all_1_1_0\tproj_2\t0\n" + "all_2_2_0\tproj\t1\n" + "all_2_2_0\tproj_2\t0\n" + "all_3_3_0\tproj\t0\n" + "all_3_3_0\tproj_2\t0" + == get_projections_info(node, table_name) + ) + + backup_name = f"b1-{get_random_string()}" assert "BACKUP_CREATED" in node.query( f""" set backup_restore_keeper_fault_injection_probability=0.0; - backup table {table_name} to Disk('backups', 'b1') settings check_projection_parts=false; + backup table {table_name} to Disk('backups', '{backup_name}') settings check_projection_parts=false; """ ) @@ -420,18 +455,30 @@ def test_broken_projections_in_backups(cluster): f""" drop table {table_name} sync; set backup_restore_keeper_fault_injection_probability=0.0; - restore table {table_name} from Disk('backups', 'b1'); + restore table {table_name} from Disk('backups', '{backup_name}'); """ ) + assert ( + "all_0_0_0\tproj\t0\n" + "all_0_0_0\tproj_2\t0\n" + "all_1_1_0\tproj\t0\n" + "all_1_1_0\tproj_2\t0\n" + "all_2_2_0\tproj\t0\n" + "all_2_2_0\tproj_2\t0\n" + "all_3_3_0\tproj\t0\n" + "all_3_3_0\tproj_2\t0" + == get_projections_info(node, table_name) + ) + check(node, table_name, 1) assert "" == get_broken_projections_info(node, table_name) - 
break_projection(node, table_name, "proj", "all_2_2_0", "part") + break_projection(node, table_name, "proj_2", "all_2_2_0", "part") - check(node, table_name, 0, "proj", "ErrnoException") + check(node, table_name, 0, "proj_2", "ErrnoException") - assert "all_2_2_0\tproj\tFILE_DOESNT_EXIST" == get_broken_projections_info( + assert "all_2_2_0\tproj_2\tFILE_DOESNT_EXIST" == get_broken_projections_info( node, table_name ) @@ -442,13 +489,14 @@ def test_broken_projections_in_backups(cluster): """ ) - materialize_projection(node, table_name, "proj") + materialize_projection(node, table_name, "proj_2") check(node, table_name, 1) + backup_name = f"b3-{get_random_string()}" assert "BACKUP_CREATED" in node.query( f""" set backup_restore_keeper_fault_injection_probability=0.0; - backup table {table_name} to Disk('backups', 'b3') settings check_projection_parts=false; + backup table {table_name} to Disk('backups', '{backup_name}') settings check_projection_parts=false; """ ) @@ -456,11 +504,23 @@ def test_broken_projections_in_backups(cluster): f""" drop table {table_name} sync; set backup_restore_keeper_fault_injection_probability=0.0; - restore table {table_name} from Disk('backups', 'b3'); + restore table {table_name} from Disk('backups', '{backup_name}'); """ ) check(node, table_name, 1) + assert ( + "all_0_0_0\tproj\t0\n" + "all_0_0_0\tproj_2\t0\n" + "all_1_1_0\tproj\t0\n" + "all_1_1_0\tproj_2\t0\n" + "all_2_2_0\tproj\t0\n" + "all_2_2_0\tproj_2\t0\n" + "all_3_3_0\tproj\t0\n" + "all_3_3_0\tproj_2\t0" + == get_projections_info(node, table_name) + ) + break_projection(node, table_name, "proj", "all_1_1_0", "part") assert "Part all_1_1_0 has a broken projection proj" in check_table_full( node, table_name @@ -469,10 +529,11 @@ def test_broken_projections_in_backups(cluster): node, table_name ) + backup_name = f"b4-{get_random_string()}" assert "BACKUP_CREATED" in node.query( f""" set backup_restore_keeper_fault_injection_probability=0.0; - backup table {table_name} to Disk('backups', 'b4') settings check_projection_parts=false, allow_backup_broken_projections=true; + backup table {table_name} to Disk('backups', '{backup_name}') settings check_projection_parts=false, allow_backup_broken_projections=true; """ ) @@ -480,9 +541,22 @@ def test_broken_projections_in_backups(cluster): f""" drop table {table_name} sync; set backup_restore_keeper_fault_injection_probability=0.0; - restore table {table_name} from Disk('backups', 'b4'); + restore table {table_name} from Disk('backups', '{backup_name}'); """ ) + + assert ( + "all_0_0_0\tproj\t0\n" + "all_0_0_0\tproj_2\t0\n" + "all_1_1_0\tproj\t1\n" + "all_1_1_0\tproj_2\t0\n" + "all_2_2_0\tproj\t0\n" + "all_2_2_0\tproj_2\t0\n" + "all_3_3_0\tproj\t0\n" + "all_3_3_0\tproj_2\t0" + == get_projections_info(node, table_name) + ) + check(node, table_name, 0) assert "all_1_1_0\tproj\tNO_FILE_IN_DATA_PART" == get_broken_projections_info( node, table_name From 580501c2b42231eacc4e843968aeb876ff784297 Mon Sep 17 00:00:00 2001 From: avogar Date: Fri, 19 Jan 2024 11:08:56 +0000 Subject: [PATCH 070/245] Add new settings to settings changes history --- src/Core/SettingsChangesHistory.h | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/src/Core/SettingsChangesHistory.h b/src/Core/SettingsChangesHistory.h index 62ffd837a33..af213983b66 100644 --- a/src/Core/SettingsChangesHistory.h +++ b/src/Core/SettingsChangesHistory.h @@ -83,7 +83,9 @@ static std::map sett { {"24.1", {{"print_pretty_type_names", false, true, "Better user experience."}, 
{"input_format_json_read_bools_as_strings", false, true, "Allow to read bools as strings in JSON formats by default"}, - {"output_format_arrow_use_signed_indexes_for_dictionary", false, true, "Use signed indexes type for Arrow dictionaries by default as it's recommended"}}}, + {"output_format_arrow_use_signed_indexes_for_dictionary", false, true, "Use signed indexes type for Arrow dictionaries by default as it's recommended"}, + {"allow_experimental_variant_type", false, false, "Add new experimental Variant type"}, + {"use_variant_when_no_common_type_in_if", false, false, "Allow to use Variant in if/multiIf if there is no common type"}}}, {"23.12", {{"allow_suspicious_ttl_expressions", true, false, "It is a new setting, and in previous versions the behavior was equivalent to allowing."}, {"input_format_parquet_allow_missing_columns", false, true, "Allow missing columns in Parquet files by default"}, {"input_format_orc_allow_missing_columns", false, true, "Allow missing columns in ORC files by default"}, From 7c66141e08ec203dbff908d69d929ea3bfc0995f Mon Sep 17 00:00:00 2001 From: robot-clickhouse Date: Fri, 19 Jan 2024 11:11:13 +0000 Subject: [PATCH 071/245] Automatic style fix --- tests/integration/test_broken_projections/test.py | 12 ++++-------- 1 file changed, 4 insertions(+), 8 deletions(-) diff --git a/tests/integration/test_broken_projections/test.py b/tests/integration/test_broken_projections/test.py index 48ed10d0f87..8e3978a078e 100644 --- a/tests/integration/test_broken_projections/test.py +++ b/tests/integration/test_broken_projections/test.py @@ -439,8 +439,7 @@ def test_broken_projections_in_backups(cluster): "all_2_2_0\tproj\t1\n" "all_2_2_0\tproj_2\t0\n" "all_3_3_0\tproj\t0\n" - "all_3_3_0\tproj_2\t0" - == get_projections_info(node, table_name) + "all_3_3_0\tproj_2\t0" == get_projections_info(node, table_name) ) backup_name = f"b1-{get_random_string()}" @@ -467,8 +466,7 @@ def test_broken_projections_in_backups(cluster): "all_2_2_0\tproj\t0\n" "all_2_2_0\tproj_2\t0\n" "all_3_3_0\tproj\t0\n" - "all_3_3_0\tproj_2\t0" - == get_projections_info(node, table_name) + "all_3_3_0\tproj_2\t0" == get_projections_info(node, table_name) ) check(node, table_name, 1) @@ -517,8 +515,7 @@ def test_broken_projections_in_backups(cluster): "all_2_2_0\tproj\t0\n" "all_2_2_0\tproj_2\t0\n" "all_3_3_0\tproj\t0\n" - "all_3_3_0\tproj_2\t0" - == get_projections_info(node, table_name) + "all_3_3_0\tproj_2\t0" == get_projections_info(node, table_name) ) break_projection(node, table_name, "proj", "all_1_1_0", "part") @@ -553,8 +550,7 @@ def test_broken_projections_in_backups(cluster): "all_2_2_0\tproj\t0\n" "all_2_2_0\tproj_2\t0\n" "all_3_3_0\tproj\t0\n" - "all_3_3_0\tproj_2\t0" - == get_projections_info(node, table_name) + "all_3_3_0\tproj_2\t0" == get_projections_info(node, table_name) ) check(node, table_name, 0) From c51d1f04f6d135c63f5123d4aaef47cef5474525 Mon Sep 17 00:00:00 2001 From: Smita Kulkarni Date: Fri, 19 Jan 2024 15:57:20 +0100 Subject: [PATCH 072/245] Add settings max_unexpected_write_error_retries for Azure Blob Storage --- src/Core/Settings.h | 1 + src/Disks/IO/WriteBufferFromAzureBlobStorage.cpp | 10 +++++----- src/Disks/IO/WriteBufferFromAzureBlobStorage.h | 2 ++ .../AzureBlobStorage/AzureBlobStorageAuth.cpp | 3 ++- .../AzureBlobStorage/AzureObjectStorage.cpp | 1 + .../AzureBlobStorage/AzureObjectStorage.h | 5 ++++- 6 files changed, 15 insertions(+), 7 deletions(-) diff --git a/src/Core/Settings.h b/src/Core/Settings.h index 292e945a29c..59f32c60f63 100644 --- a/src/Core/Settings.h 
+++ b/src/Core/Settings.h
@@ -86,6 +86,7 @@ class IColumn;
     M(UInt64, azure_max_single_part_upload_size, 100*1024*1024, "The maximum size of object to upload using singlepart upload to Azure blob storage.", 0) \
     M(UInt64, s3_max_single_read_retries, 4, "The maximum number of retries during single S3 read.", 0) \
     M(UInt64, azure_max_single_read_retries, 4, "The maximum number of retries during single Azure blob storage read.", 0) \
+    M(UInt64, azure_max_unexpected_write_error_retries, 4, "The maximum number of retries in case of unexpected errors during Azure blob storage write", 0) \
     M(UInt64, s3_max_unexpected_write_error_retries, 4, "The maximum number of retries in case of unexpected errors during S3 write.", 0) \
     M(UInt64, s3_max_redirects, 10, "Max number of S3 redirects hops allowed.", 0) \
     M(UInt64, s3_max_connections, 1024, "The maximum number of connections per server.", 0) \
diff --git a/src/Disks/IO/WriteBufferFromAzureBlobStorage.cpp b/src/Disks/IO/WriteBufferFromAzureBlobStorage.cpp
index 60bc04f5f95..b4665eb7346 100644
--- a/src/Disks/IO/WriteBufferFromAzureBlobStorage.cpp
+++ b/src/Disks/IO/WriteBufferFromAzureBlobStorage.cpp
@@ -18,17 +18,17 @@ namespace ProfileEvents
 namespace DB
 {
 
-static constexpr auto DEFAULT_RETRY_NUM = 3;
-
 WriteBufferFromAzureBlobStorage::WriteBufferFromAzureBlobStorage(
     std::shared_ptr<const Azure::Storage::Blobs::BlobContainerClient> blob_container_client_,
     const String & blob_path_,
     size_t max_single_part_upload_size_,
+    size_t max_unexpected_write_error_retries_,
     size_t buf_size_,
     const WriteSettings & write_settings_)
     : WriteBufferFromFileBase(buf_size_, nullptr, 0)
     , log(&Poco::Logger::get("WriteBufferFromAzureBlobStorage"))
     , max_single_part_upload_size(max_single_part_upload_size_)
+    , max_unexpected_write_error_retries(max_unexpected_write_error_retries_)
     , blob_path(blob_path_)
     , write_settings(write_settings_)
     , blob_container_client(blob_container_client_)
@@ -77,13 +77,13 @@ void WriteBufferFromAzureBlobStorage::execWithRetry(std::function<void()> func,
 
 void WriteBufferFromAzureBlobStorage::finalizeImpl()
 {
-    execWithRetry([this](){ next(); }, DEFAULT_RETRY_NUM);
+    execWithRetry([this](){ next(); }, max_unexpected_write_error_retries);
 
     if (tmp_buffer_write_offset > 0)
         uploadBlock(tmp_buffer->data(), tmp_buffer_write_offset);
 
     auto block_blob_client = blob_container_client->GetBlockBlobClient(blob_path);
-    execWithRetry([&](){ block_blob_client.CommitBlockList(block_ids); }, DEFAULT_RETRY_NUM);
+    execWithRetry([&](){ block_blob_client.CommitBlockList(block_ids); }, max_unexpected_write_error_retries);
 
     LOG_TRACE(log, "Committed {} blocks for blob `{}`", block_ids.size(), blob_path);
 }
@@ -94,7 +94,7 @@ void WriteBufferFromAzureBlobStorage::uploadBlock(const char * data, size_t size
     const std::string & block_id = block_ids.emplace_back(getRandomASCIIString(64));
 
     Azure::Core::IO::MemoryBodyStream memory_stream(reinterpret_cast<const uint8_t *>(data), size);
-    execWithRetry([&](){ block_blob_client.StageBlock(block_id, memory_stream); }, DEFAULT_RETRY_NUM, size);
+    execWithRetry([&](){ block_blob_client.StageBlock(block_id, memory_stream); }, max_unexpected_write_error_retries, size);
     tmp_buffer_write_offset = 0;
 
     LOG_TRACE(log, "Staged block (id: {}) of size {} (blob path: {}).", block_id, size, blob_path);
diff --git a/src/Disks/IO/WriteBufferFromAzureBlobStorage.h b/src/Disks/IO/WriteBufferFromAzureBlobStorage.h
index f1be81922e1..7494130134b 100644
--- a/src/Disks/IO/WriteBufferFromAzureBlobStorage.h
+++ b/src/Disks/IO/WriteBufferFromAzureBlobStorage.h
@@ -30,6 +30,7 @@ public:
         AzureClientPtr 
blob_container_client_,
         const String & blob_path_,
         size_t max_single_part_upload_size_,
+        size_t max_unexpected_write_error_retries_,
         size_t buf_size_,
         const WriteSettings & write_settings_);
 
@@ -48,6 +49,7 @@ private:
     Poco::Logger * log;
 
     const size_t max_single_part_upload_size;
+    const size_t max_unexpected_write_error_retries;
     const std::string blob_path;
     const WriteSettings write_settings;
diff --git a/src/Disks/ObjectStorages/AzureBlobStorage/AzureBlobStorageAuth.cpp b/src/Disks/ObjectStorages/AzureBlobStorage/AzureBlobStorageAuth.cpp
index a5c8afe0264..a209049ceee 100644
--- a/src/Disks/ObjectStorages/AzureBlobStorage/AzureBlobStorageAuth.cpp
+++ b/src/Disks/ObjectStorages/AzureBlobStorage/AzureBlobStorageAuth.cpp
@@ -164,7 +164,8 @@ std::unique_ptr<AzureObjectStorageSettings> getAzureBlobStorageSettings(const Po
         config.getUInt64(config_prefix + ".min_bytes_for_seek", 1024 * 1024),
         config.getInt(config_prefix + ".max_single_read_retries", 3),
         config.getInt(config_prefix + ".max_single_download_retries", 3),
-        config.getInt(config_prefix + ".list_object_keys_size", 1000)
+        config.getInt(config_prefix + ".list_object_keys_size", 1000),
+        config.getUInt64(config_prefix + ".max_unexpected_write_error_retries", 4)
     );
 }
 
diff --git a/src/Disks/ObjectStorages/AzureBlobStorage/AzureObjectStorage.cpp b/src/Disks/ObjectStorages/AzureBlobStorage/AzureObjectStorage.cpp
index 068e2aebab1..683bfeb74a7 100644
--- a/src/Disks/ObjectStorages/AzureBlobStorage/AzureObjectStorage.cpp
+++ b/src/Disks/ObjectStorages/AzureBlobStorage/AzureObjectStorage.cpp
@@ -268,6 +268,7 @@ std::unique_ptr<WriteBufferFromFileBase> AzureObjectStorage::writeObject( /// NO
         client.get(),
         object.remote_path,
         settings.get()->max_single_part_upload_size,
+        settings.get()->max_unexpected_write_error_retries,
         buf_size,
         patchSettings(write_settings));
 }
 
diff --git a/src/Disks/ObjectStorages/AzureBlobStorage/AzureObjectStorage.h b/src/Disks/ObjectStorages/AzureBlobStorage/AzureObjectStorage.h
index 4718dce9bf9..2d505c6a022 100644
--- a/src/Disks/ObjectStorages/AzureBlobStorage/AzureObjectStorage.h
+++ b/src/Disks/ObjectStorages/AzureBlobStorage/AzureObjectStorage.h
@@ -23,12 +23,14 @@ struct AzureObjectStorageSettings
         uint64_t min_bytes_for_seek_,
         int max_single_read_retries_,
         int max_single_download_retries_,
-        int list_object_keys_size_)
+        int list_object_keys_size_,
+        size_t max_unexpected_write_error_retries_)
         : max_single_part_upload_size(max_single_part_upload_size_)
         , min_bytes_for_seek(min_bytes_for_seek_)
         , max_single_read_retries(max_single_read_retries_)
         , max_single_download_retries(max_single_download_retries_)
         , list_object_keys_size(list_object_keys_size_)
+        , max_unexpected_write_error_retries (max_unexpected_write_error_retries_)
     {
     }
 
@@ -39,6 +41,7 @@ struct AzureObjectStorageSettings
     size_t max_single_read_retries = 3;
     size_t max_single_download_retries = 3;
     int list_object_keys_size = 1000;
+    size_t max_unexpected_write_error_retries = 4;
 };
 
 using AzureClient = Azure::Storage::Blobs::BlobContainerClient;

From d22fc3a224ac29857f3dc3eb60ff872221829006 Mon Sep 17 00:00:00 2001
From: Smita Kulkarni
Date: Fri, 19 Jan 2024 16:13:58 +0100
Subject: [PATCH 073/245] Updated to fetch default from settings

---
 .../ObjectStorages/AzureBlobStorage/AzureBlobStorageAuth.cpp | 5 +++--
 1 file changed, 3 insertions(+), 2 deletions(-)

diff --git a/src/Disks/ObjectStorages/AzureBlobStorage/AzureBlobStorageAuth.cpp b/src/Disks/ObjectStorages/AzureBlobStorage/AzureBlobStorageAuth.cpp
index a209049ceee..e0199fde98b 100644
--- 
a/src/Disks/ObjectStorages/AzureBlobStorage/AzureBlobStorageAuth.cpp
+++ b/src/Disks/ObjectStorages/AzureBlobStorage/AzureBlobStorageAuth.cpp
@@ -7,6 +7,7 @@
 #include
 #include
 #include
+#include <Interpreters/Context.h>
 
 using namespace Azure::Storage::Blobs;
 
@@ -157,7 +158,7 @@ std::unique_ptr<BlobContainerClient> getAzureBlobContainerClient(
     }
 }
 
-std::unique_ptr<AzureObjectStorageSettings> getAzureBlobStorageSettings(const Poco::Util::AbstractConfiguration & config, const String & config_prefix, ContextPtr /*context*/)
+std::unique_ptr<AzureObjectStorageSettings> getAzureBlobStorageSettings(const Poco::Util::AbstractConfiguration & config, const String & config_prefix, ContextPtr context)
 {
     return std::make_unique<AzureObjectStorageSettings>(
         config.getUInt64(config_prefix + ".max_single_part_upload_size", 100 * 1024 * 1024),
@@ -165,7 +166,7 @@ std::unique_ptr<AzureObjectStorageSettings> getAzureBlobStorageSettings(const Po
         config.getInt(config_prefix + ".max_single_read_retries", 3),
         config.getInt(config_prefix + ".max_single_download_retries", 3),
         config.getInt(config_prefix + ".list_object_keys_size", 1000),
-        config.getUInt64(config_prefix + ".max_unexpected_write_error_retries", 4)
+        config.getUInt64(config_prefix + ".max_unexpected_write_error_retries", context->getSettings().azure_max_unexpected_write_error_retries)
     );
 }

From caf9d8df6d789203a4e408341c9494952eb14ad2 Mon Sep 17 00:00:00 2001
From: Kseniia Sumarokova <54203879+kssenii@users.noreply.github.com>
Date: Sat, 20 Jan 2024 14:59:13 +0100
Subject: [PATCH 074/245] Update test.py

---
 tests/integration/test_broken_projections/test.py | 3 +++
 1 file changed, 3 insertions(+)

diff --git a/tests/integration/test_broken_projections/test.py b/tests/integration/test_broken_projections/test.py
index 8e3978a078e..d750bb5827d 100644
--- a/tests/integration/test_broken_projections/test.py
+++ b/tests/integration/test_broken_projections/test.py
@@ -413,6 +413,8 @@ def test_broken_projections_in_backups(cluster):
     table_name = "test4"
     create_table(node, table_name, 1)
 
+    node.qeury("SYSTEM STOP MERGES")
+
     insert(node, table_name, 0, 5)
     insert(node, table_name, 5, 5)
     insert(node, table_name, 10, 5)
@@ -557,3 +559,4 @@ def test_broken_projections_in_backups(cluster):
     assert "all_1_1_0\tproj\tNO_FILE_IN_DATA_PART" == get_broken_projections_info(
         node, table_name
     )
+    node.qeury("SYSTEM START MERGES")

From e5c3b67f379efdd6d403be08f8bce164348663a1 Mon Sep 17 00:00:00 2001
From: Kseniia Sumarokova <54203879+kssenii@users.noreply.github.com>
Date: Sat, 20 Jan 2024 16:10:09 +0100
Subject: [PATCH 075/245] Update test.py

---
 tests/integration/test_broken_projections/test.py | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/tests/integration/test_broken_projections/test.py b/tests/integration/test_broken_projections/test.py
index d750bb5827d..acf0160cf1b 100644
--- a/tests/integration/test_broken_projections/test.py
+++ b/tests/integration/test_broken_projections/test.py
@@ -413,7 +413,7 @@ def test_broken_projections_in_backups(cluster):
     table_name = "test4"
     create_table(node, table_name, 1)
 
-    node.qeury("SYSTEM STOP MERGES")
+    node.query("SYSTEM STOP MERGES")
 
     insert(node, table_name, 0, 5)
     insert(node, table_name, 5, 5)
@@ -559,4 +559,4 @@ def test_broken_projections_in_backups(cluster):
     assert "all_1_1_0\tproj\tNO_FILE_IN_DATA_PART" == get_broken_projections_info(
         node, table_name
     )
-    node.qeury("SYSTEM START MERGES")
+    node.query("SYSTEM START MERGES")

From 02b178cc9c9b38a30344e2ddd896c1eaf429c3de Mon Sep 17 00:00:00 2001
From: MochiXu
Date: Mon, 22 Jan 2024 11:08:03 +0800
Subject: [PATCH 076/245] fix drop inverted index

---
 src/Storages/MergeTree/MutateTask.cpp | 22 
++++++++++++++-------- 1 file changed, 14 insertions(+), 8 deletions(-) diff --git a/src/Storages/MergeTree/MutateTask.cpp b/src/Storages/MergeTree/MutateTask.cpp index e4070aa8262..8ed8b8bba4c 100644 --- a/src/Storages/MergeTree/MutateTask.cpp +++ b/src/Storages/MergeTree/MutateTask.cpp @@ -674,15 +674,21 @@ static NameToNameVector collectFilesForRenames( { if (command.type == MutationCommand::Type::DROP_INDEX) { - if (source_part->checksums.has(INDEX_FILE_PREFIX + command.column_name + ".idx2")) + const std::vector suffixes = {".idx2", ".idx", ".gin_dict", ".gin_post", ".gin_seg", ".gin_sid"}; + + for (const auto& suffix : suffixes) { - add_rename(INDEX_FILE_PREFIX + command.column_name + ".idx2", ""); - add_rename(INDEX_FILE_PREFIX + command.column_name + mrk_extension, ""); - } - else if (source_part->checksums.has(INDEX_FILE_PREFIX + command.column_name + ".idx")) - { - add_rename(INDEX_FILE_PREFIX + command.column_name + ".idx", ""); - add_rename(INDEX_FILE_PREFIX + command.column_name + mrk_extension, ""); + String filename = INDEX_FILE_PREFIX + command.column_name + suffix; + + if ((suffix == ".idx2" || suffix == ".idx") && source_part->checksums.has(filename)) + { + add_rename(filename, ""); + add_rename(INDEX_FILE_PREFIX + command.column_name + mrk_extension, ""); + } + else if (source_part->checksums.has(filename)) + { + add_rename(filename, ""); + } } } else if (command.type == MutationCommand::Type::DROP_PROJECTION) From b4dcd6755a8a2384e5937991e0656058aed4f95a Mon Sep 17 00:00:00 2001 From: avogar Date: Mon, 22 Jan 2024 13:41:58 +0000 Subject: [PATCH 077/245] Use ColumnConst instead of Field in IColumn::createWithOffsets --- src/Columns/ColumnConst.cpp | 22 +++++++++++++++++++ src/Columns/ColumnConst.h | 5 +++++ src/Columns/ColumnNullable.cpp | 16 ++++++-------- src/Columns/ColumnNullable.h | 2 +- src/Columns/ColumnObject.cpp | 3 ++- src/Columns/ColumnSparse.cpp | 3 ++- src/Columns/ColumnVector.cpp | 5 +++-- src/Columns/ColumnVector.h | 2 +- src/Columns/IColumn.cpp | 6 ++--- src/Columns/IColumn.h | 5 +++-- src/Functions/IFunction.cpp | 4 ++-- .../0_stateless/02941_variant_type_alters.sh | 1 + 12 files changed, 52 insertions(+), 22 deletions(-) diff --git a/src/Columns/ColumnConst.cpp b/src/Columns/ColumnConst.cpp index 9aa0f5cfa49..6e5a3c45c4e 100644 --- a/src/Columns/ColumnConst.cpp +++ b/src/Columns/ColumnConst.cpp @@ -159,4 +159,26 @@ void ColumnConst::compareColumn( std::fill(compare_results.begin(), compare_results.end(), res); } +ColumnConst::Ptr createColumnConst(const ColumnPtr & column, Field value) +{ + auto data = column->cloneEmpty(); + data->insert(value); + return ColumnConst::create(std::move(data), 1); +} + +ColumnConst::Ptr createColumnConst(const ColumnPtr & column, size_t const_value_index) +{ + auto data = column->cloneEmpty(); + data->insertFrom(*column, const_value_index); + return ColumnConst::create(std::move(data), 1); +} + +ColumnConst::Ptr createColumnConstWithDefaultValue(const ColumnPtr & column) +{ + auto data = column->cloneEmpty(); + data->insertDefault(); + return ColumnConst::create(std::move(data), 1); +} + + } diff --git a/src/Columns/ColumnConst.h b/src/Columns/ColumnConst.h index dc84e0c2402..3c646a62795 100644 --- a/src/Columns/ColumnConst.h +++ b/src/Columns/ColumnConst.h @@ -292,4 +292,9 @@ public: bool isCollationSupported() const override { return data->isCollationSupported(); } }; +ColumnConst::Ptr createColumnConst(const ColumnPtr & column, Field value); +ColumnConst::Ptr createColumnConst(const ColumnPtr & column, size_t 
const_value_index);
+ColumnConst::Ptr createColumnConstWithDefaultValue(const ColumnPtr &column);
+
+
 }
diff --git a/src/Columns/ColumnNullable.cpp b/src/Columns/ColumnNullable.cpp
index 25b0e35e15e..c0b13204b8e 100644
--- a/src/Columns/ColumnNullable.cpp
+++ b/src/Columns/ColumnNullable.cpp
@@ -827,24 +827,22 @@ void ColumnNullable::checkConsistency() const
             "Logical error: Sizes of nested column and null map of Nullable column are not equal");
 }
 
-ColumnPtr ColumnNullable::createWithOffsets(const IColumn::Offsets & offsets, const Field & default_field, size_t total_rows, size_t shift) const
+ColumnPtr ColumnNullable::createWithOffsets(const IColumn::Offsets & offsets, const ColumnConst & column_with_default_value, size_t total_rows, size_t shift) const
 {
     ColumnPtr new_values;
     ColumnPtr new_null_map;
 
-    if (default_field.getType() == Field::Types::Null)
+    const ColumnNullable & nullable_column_with_default_value = assert_cast<const ColumnNullable &>(column_with_default_value.getDataColumn());
+    if (nullable_column_with_default_value.isNullAt(0))
     {
-        auto default_column = nested_column->cloneEmpty();
-        default_column->insertDefault();
-
         /// Value in main column, when null map is 1 is implementation defined. So, take any value.
-        new_values = nested_column->createWithOffsets(offsets, (*default_column)[0], total_rows, shift);
-        new_null_map = null_map->createWithOffsets(offsets, Field(1u), total_rows, shift);
+        new_values = nested_column->createWithOffsets(offsets, *createColumnConstWithDefaultValue(nested_column), total_rows, shift);
+        new_null_map = null_map->createWithOffsets(offsets, *createColumnConst(null_map, Field(1u)), total_rows, shift);
     }
     else
     {
-        new_values = nested_column->createWithOffsets(offsets, default_field, total_rows, shift);
-        new_null_map = null_map->createWithOffsets(offsets, Field(0u), total_rows, shift);
+        new_values = nested_column->createWithOffsets(offsets, *ColumnConst::create(nullable_column_with_default_value.getNestedColumnPtr(), 1), total_rows, shift);
+        new_null_map = null_map->createWithOffsets(offsets, *createColumnConst(null_map, Field(0u)), total_rows, shift);
     }
 
     return ColumnNullable::create(new_values, new_null_map);
diff --git a/src/Columns/ColumnNullable.h b/src/Columns/ColumnNullable.h
index 60c7750f8fc..3e04ba8a180 100644
--- a/src/Columns/ColumnNullable.h
+++ b/src/Columns/ColumnNullable.h
@@ -167,7 +167,7 @@ public:
         getIndicesOfNonDefaultRowsImpl(indices, from, limit);
     }
 
-    ColumnPtr createWithOffsets(const Offsets & offsets, const Field & default_field, size_t total_rows, size_t shift) const override;
+    ColumnPtr createWithOffsets(const Offsets & offsets, const ColumnConst & column_with_default_value, size_t total_rows, size_t shift) const override;
 
     bool isNullable() const override { return true; }
     bool isFixedAndContiguous() const override { return false; }
diff --git a/src/Columns/ColumnObject.cpp b/src/Columns/ColumnObject.cpp
index f7176568a1b..0ec9c616bab 100644
--- a/src/Columns/ColumnObject.cpp
+++ b/src/Columns/ColumnObject.cpp
@@ -2,6 +2,7 @@
 #include
 #include
 #include
+#include <Columns/ColumnConst.h>
 #include
 #include
 #include
@@ -475,7 +476,7 @@ void ColumnObject::Subcolumn::finalize()
         {
             auto values = part->index(*offsets, offsets->size());
             values = castColumn({values, from_type, ""}, to_type);
-            part = values->createWithOffsets(offsets_data, to_type->getDefault(), part_size, /*shift=*/ 0);
+            part = values->createWithOffsets(offsets_data, *createColumnConstWithDefaultValue(result_column->getPtr()), part_size, /*shift=*/ 0);
         }
     }
 
diff --git a/src/Columns/ColumnSparse.cpp 
b/src/Columns/ColumnSparse.cpp
index 02e6e9e56b4..eeeec912ce8 100644
--- a/src/Columns/ColumnSparse.cpp
+++ b/src/Columns/ColumnSparse.cpp
@@ -1,6 +1,7 @@
 #include
 #include
 #include
+#include <Columns/ColumnConst.h>
 #include
 #include
 #include
@@ -130,7 +131,7 @@ StringRef ColumnSparse::getDataAt(size_t n) const
 
 ColumnPtr ColumnSparse::convertToFullColumnIfSparse() const
 {
-    return values->createWithOffsets(getOffsetsData(), (*values)[0], _size, /*shift=*/ 1);
+    return values->createWithOffsets(getOffsetsData(), *createColumnConst(values, 0), _size, /*shift=*/ 1);
 }
 
 void ColumnSparse::insertSingleValue(const Inserter & inserter)
diff --git a/src/Columns/ColumnVector.cpp b/src/Columns/ColumnVector.cpp
index b1cf449dfde..3aadc530878 100644
--- a/src/Columns/ColumnVector.cpp
+++ b/src/Columns/ColumnVector.cpp
@@ -2,6 +2,7 @@
 
 #include
 #include
+#include <Columns/ColumnConst.h>
 #include
 #include
 #include
@@ -940,7 +941,7 @@ ColumnPtr ColumnVector<T>::compress() const
 }
 
 template <typename T>
-ColumnPtr ColumnVector<T>::createWithOffsets(const IColumn::Offsets & offsets, const Field & default_field, size_t total_rows, size_t shift) const
+ColumnPtr ColumnVector<T>::createWithOffsets(const IColumn::Offsets & offsets, const ColumnConst & column_with_default_value, size_t total_rows, size_t shift) const
 {
     if (offsets.size() + shift != size())
         throw Exception(ErrorCodes::LOGICAL_ERROR,
@@ -949,7 +950,7 @@ ColumnPtr ColumnVector<T>::createWithOffsets(const IColumn::Offsets & offsets, c
     auto res = this->create();
     auto & res_data = res->getData();
 
-    T default_value = static_cast<T>(default_field.safeGet<T>());
+    T default_value = assert_cast<const ColumnVector<T> &>(column_with_default_value.getDataColumn()).getElement(0);
     res_data.resize_fill(total_rows, default_value);
     for (size_t i = 0; i < offsets.size(); ++i)
         res_data[offsets[i]] = data[i + shift];
diff --git a/src/Columns/ColumnVector.h b/src/Columns/ColumnVector.h
index fab2d5f06aa..652cc1f5ff9 100644
--- a/src/Columns/ColumnVector.h
+++ b/src/Columns/ColumnVector.h
@@ -379,7 +379,7 @@ public:
         return this->template getIndicesOfNonDefaultRowsImpl(indices, from, limit);
     }
 
-    ColumnPtr createWithOffsets(const IColumn::Offsets & offsets, const Field & default_field, size_t total_rows, size_t shift) const override;
+    ColumnPtr createWithOffsets(const IColumn::Offsets & offsets, const ColumnConst & column_with_default_value, size_t total_rows, size_t shift) const override;
 
     ColumnPtr compress() const override;
 
diff --git a/src/Columns/IColumn.cpp b/src/Columns/IColumn.cpp
index 82dc82e0bd9..d7f83b822d2 100644
--- a/src/Columns/IColumn.cpp
+++ b/src/Columns/IColumn.cpp
@@ -35,7 +35,7 @@ void IColumn::insertFrom(const IColumn & src, size_t n)
     insert(src[n]);
 }
 
-ColumnPtr IColumn::createWithOffsets(const Offsets & offsets, const Field & default_field, size_t total_rows, size_t shift) const
+ColumnPtr IColumn::createWithOffsets(const Offsets & offsets, const ColumnConst & column_with_default_value, size_t total_rows, size_t shift) const
 {
     if (offsets.size() + shift != size())
         throw Exception(ErrorCodes::LOGICAL_ERROR,
@@ -51,14 +51,14 @@ ColumnPtr IColumn::createWithOffsets(const Offsets & offsets, const Field & defa
             current_offset = offsets[i];
 
             if (offsets_diff > 1)
-                res->insertMany(default_field, offsets_diff - 1);
+                res->insertManyFrom(column_with_default_value.getDataColumn(), 0, offsets_diff - 1);
 
             res->insertFrom(*this, i + shift);
     }
 
     ssize_t offsets_diff = static_cast<ssize_t>(total_rows) - current_offset;
     if (offsets_diff > 1)
-        res->insertMany(default_field, offsets_diff - 1);
+        res->insertManyFrom(column_with_default_value.getDataColumn(), 0, offsets_diff - 
1); return res; } diff --git a/src/Columns/IColumn.h b/src/Columns/IColumn.h index 3f866e6213d..1dcd3acdd19 100644 --- a/src/Columns/IColumn.h +++ b/src/Columns/IColumn.h @@ -34,6 +34,7 @@ class Arena; class ColumnGathererStream; class Field; class WeakHash32; +class ColumnConst; /* * Represents a set of equal ranges in previous column to perform sorting in current column. @@ -462,10 +463,10 @@ public: /// Returns column with @total_size elements. /// In result column values from current column are at positions from @offsets. - /// Other values are filled by @default_value. + /// Other values are filled by value from @column_with_default_value. /// @shift means how much rows to skip from the beginning of current column. /// Used to create full column from sparse. - [[nodiscard]] virtual Ptr createWithOffsets(const Offsets & offsets, const Field & default_field, size_t total_rows, size_t shift) const; + [[nodiscard]] virtual Ptr createWithOffsets(const Offsets & offsets, const ColumnConst & column_with_default_value, size_t total_rows, size_t shift) const; /// Compress column in memory to some representation that allows to decompress it back. /// Return itself if compression is not applicable for this column type. diff --git a/src/Functions/IFunction.cpp b/src/Functions/IFunction.cpp index a46f4d2a11d..d4c6b8f4ba6 100644 --- a/src/Functions/IFunction.cpp +++ b/src/Functions/IFunction.cpp @@ -313,7 +313,7 @@ ColumnPtr IExecutableFunction::execute(const ColumnsWithTypeAndName & arguments, { bool use_default_implementation_for_sparse_columns = useDefaultImplementationForSparseColumns(); /// DataTypeFunction does not support obtaining default (isDefaultAt()) - /// ColumnFunction does not support getting specific values + /// ColumnFunction does not support getting specific values. 
if (result_type->getTypeId() != TypeIndex::Function && use_default_implementation_for_sparse_columns)
         {
             size_t num_sparse_columns = 0;
@@ -368,7 +368,7 @@ ColumnPtr IExecutableFunction::execute(const ColumnsWithTypeAndName & arguments,
         if (!result_type->canBeInsideSparseColumns() || !res->isDefaultAt(0) || res->getNumberOfDefaultRows() != 1)
         {
             const auto & offsets_data = assert_cast<const ColumnVector<UInt64> &>(*sparse_offsets).getData();
-            return res->createWithOffsets(offsets_data, (*res)[0], input_rows_count, /*shift=*/ 1);
+            return res->createWithOffsets(offsets_data, *createColumnConst(res, 0), input_rows_count, /*shift=*/ 1);
         }
 
         return ColumnSparse::create(res, sparse_offsets, input_rows_count);
diff --git a/tests/queries/0_stateless/02941_variant_type_alters.sh b/tests/queries/0_stateless/02941_variant_type_alters.sh
index 7e2ecbd67aa..9b0d4febd65 100755
--- a/tests/queries/0_stateless/02941_variant_type_alters.sh
+++ b/tests/queries/0_stateless/02941_variant_type_alters.sh
@@ -1,4 +1,5 @@
 #!/usr/bin/env bash
+# Tags: long
 
 CUR_DIR=$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd)
 # reset --log_comment

From 15e3a5b3961ac304a30ef211594f57bda3a2f584 Mon Sep 17 00:00:00 2001
From: kssenii
Date: Mon, 22 Jan 2024 14:57:10 +0100
Subject: [PATCH 078/245] Try fix flaky test

---
 tests/integration/test_broken_projections/test.py | 16 +++++++++++-----
 1 file changed, 11 insertions(+), 5 deletions(-)

diff --git a/tests/integration/test_broken_projections/test.py b/tests/integration/test_broken_projections/test.py
index acf0160cf1b..87d910b9c77 100644
--- a/tests/integration/test_broken_projections/test.py
+++ b/tests/integration/test_broken_projections/test.py
@@ -28,10 +28,17 @@ def cluster():
         cluster.shutdown()
 
 
-def create_table(node, table, replica, data_prefix=""):
+def create_table(node, table, replica, data_prefix="", aggressive_merge=True):
     if data_prefix == "":
         data_prefix = table
 
+    if aggressive_merge:
+        vertical_merge_algorithm_min_rows_to_activate = 1
+        vertical_merge_algorithm_min_columns_to_activate = 1
+    else:
+        vertical_merge_algorithm_min_rows_to_activate = 100000
+        vertical_merge_algorithm_min_columns_to_activate = 100
+
     node.query(
         f"""
         DROP TABLE IF EXISTS {table} SYNC;
@@ -56,9 +63,8 @@ def create_table(node, table, replica, data_prefix=""):
         SETTINGS min_bytes_for_wide_part = 0,
         max_parts_to_merge_at_once=3,
         enable_vertical_merge_algorithm=1,
-        vertical_merge_algorithm_min_rows_to_activate = 1,
-        vertical_merge_algorithm_min_columns_to_activate = 1,
-        vertical_merge_algorithm_min_columns_to_activate = 1,
+        vertical_merge_algorithm_min_rows_to_activate = {vertical_merge_algorithm_min_rows_to_activate},
+        vertical_merge_algorithm_min_columns_to_activate = {vertical_merge_algorithm_min_columns_to_activate},
         compress_primary_key=0;
         """
     )
@@ -411,7 +417,7 @@ def test_broken_projections_in_backups(cluster):
     node = cluster.instances["node"]
 
     table_name = "test4"
-    create_table(node, table_name, 1)
+    create_table(node, table_name, 1, aggressive_merge=False)
 
     node.query("SYSTEM STOP MERGES")
 
From 0606a772674fdecf08a9a904ef46293e8bba9acc Mon Sep 17 00:00:00 2001
From: mochi
Date: Mon, 22 Jan 2024 22:02:50 +0800
Subject: [PATCH 079/245] Update src/Storages/MergeTree/MutateTask.cpp

Co-authored-by: Dmitry Novik

---
 src/Storages/MergeTree/MutateTask.cpp | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/src/Storages/MergeTree/MutateTask.cpp b/src/Storages/MergeTree/MutateTask.cpp
index 8ed8b8bba4c..fccee6bd887 100644
--- a/src/Storages/MergeTree/MutateTask.cpp
+++ b/src/Storages/MergeTree/MutateTask.cpp
@@ 
-676,7 +676,7 @@ static NameToNameVector collectFilesForRenames( { const std::vector suffixes = {".idx2", ".idx", ".gin_dict", ".gin_post", ".gin_seg", ".gin_sid"}; - for (const auto& suffix : suffixes) + for (const auto & suffix : suffixes) { String filename = INDEX_FILE_PREFIX + command.column_name + suffix; From 4e5249275ed67c52d958007978c66619db22a1a5 Mon Sep 17 00:00:00 2001 From: Dmitry Novik Date: Mon, 22 Jan 2024 16:45:25 +0100 Subject: [PATCH 080/245] Do not resolve remote table id on initiator --- src/Storages/StorageDistributed.cpp | 9 +-------- 1 file changed, 1 insertion(+), 8 deletions(-) diff --git a/src/Storages/StorageDistributed.cpp b/src/Storages/StorageDistributed.cpp index 987ea4a4957..9972517bbac 100644 --- a/src/Storages/StorageDistributed.cpp +++ b/src/Storages/StorageDistributed.cpp @@ -779,18 +779,11 @@ QueryTreeNodePtr buildQueryTreeDistributed(SelectQueryInfo & query_info, } else { - auto resolved_remote_storage_id = remote_storage_id; - // In case of cross-replication we don't know what database is used for the table. - // `storage_id.hasDatabase()` can return false only on the initiator node. - // Each shard will use the default database (in the case of cross-replication shards may have different defaults). - if (remote_storage_id.hasDatabase()) - resolved_remote_storage_id = query_context->resolveStorageID(remote_storage_id); - auto get_column_options = GetColumnsOptions(GetColumnsOptions::All).withExtendedObjects().withVirtuals(); auto column_names_and_types = distributed_storage_snapshot->getColumns(get_column_options); - auto storage = std::make_shared(resolved_remote_storage_id, ColumnsDescription{column_names_and_types}); + auto storage = std::make_shared(remote_storage_id, ColumnsDescription{column_names_and_types}); auto table_node = std::make_shared(std::move(storage), query_context); if (table_expression_modifiers) From c59f3e164ffedf1a7561b0fd0a65c2555685ca91 Mon Sep 17 00:00:00 2001 From: Bhavna Jindal Date: Thu, 18 Jan 2024 12:08:29 -0800 Subject: [PATCH 081/245] Return baseline component as fourth array --- .../sql-reference/functions/time-series-functions.md | 8 ++++++-- src/Functions/seriesDecomposeSTL.cpp | 10 +++++++++- .../0_stateless/02813_seriesDecomposeSTL.reference | 8 ++++---- 3 files changed, 19 insertions(+), 7 deletions(-) diff --git a/docs/en/sql-reference/functions/time-series-functions.md b/docs/en/sql-reference/functions/time-series-functions.md index 144d832b36a..2e42aa884b4 100644 --- a/docs/en/sql-reference/functions/time-series-functions.md +++ b/docs/en/sql-reference/functions/time-series-functions.md @@ -77,8 +77,8 @@ The number of data points in `series` should be at least twice the value of `per **Returned value** -- An array of three arrays where the first array include seasonal components, the second array - trend, -and the third array - residue component. +- An array of four arrays where the first array include seasonal components, the second array - trend, +the third array - residue component, and the fourth array - baseline component. Type: [Array](../../sql-reference/data-types/array.md). 
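Since the baseline is by construction the element-wise sum of the seasonal and trend components, the new fourth array can be sanity-checked directly. A quick check, reusing the series and period from the documentation example above (the commented output is the expectation, not a recorded result):

```sql
WITH seriesDecomposeSTL(
        [10.1, 20.45, 40.34, 10.1, 20.45, 40.34, 10.1, 20.45, 40.34,
         10.1, 20.45, 40.34, 10.1, 20.45, 40.34, 10.1, 20.45, 40.34,
         10.1, 20.45, 40.34, 10.1, 20.45, 40.34], 3) AS d
SELECT arrayMap((s, t, b) -> round(b - (s + t), 4), d[1], d[2], d[4]) AS baseline_minus_sum;
-- Expected: an array of 24 zeros, up to Float32 rounding.
```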
@@ -107,6 +107,10 @@ Result: [ 0, 0.0000019073486, -0.0000019073486, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, -0.0000019073486, 0, 0 + ], + [ + 10.1, 20.449999, 40.340004, 10.100001, 20.45, 40.34, 10.100001, 20.45, 40.34, 10.1, 20.45, 40.34, + 10.1, 20.45, 40.34, 10.1, 20.45, 40.34, 10.1, 20.45, 40.34, 10.100002, 20.45, 40.34 ]] │ └────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────┘ ``` diff --git a/src/Functions/seriesDecomposeSTL.cpp b/src/Functions/seriesDecomposeSTL.cpp index 21e36761213..0c44afa32a6 100644 --- a/src/Functions/seriesDecomposeSTL.cpp +++ b/src/Functions/seriesDecomposeSTL.cpp @@ -128,6 +128,10 @@ public: res_data.insert(residue.begin(), residue.end()); res_col_offsets_data.push_back(res_data.size()); + // Create Baseline = seasonal + trend + std::transform(seasonal.begin(), seasonal.end(), trend.begin(), std::back_inserter(res_data), std::plus()); + res_col_offsets_data.push_back(res_data.size()); + root_offsets_data.push_back(res_col_offsets->size()); prev_src_offset = curr_offset; @@ -201,7 +205,7 @@ The number of data points in `series` should be at least twice the value of `per **Returned value** -- An array of three arrays where the first array include seasonal components, the second array - trend, and the third array - residue component. +- An array of four arrays where the first array include seasonal components, the second array - trend, the third array - residue component, and the fourth array - baseline component. Type: [Array](../../sql-reference/data-types/array.md). @@ -230,6 +234,10 @@ Result: [ 0, 0.0000019073486, -0.0000019073486, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, -0.0000019073486, 0, 0 + ], + [ + 10.1, 20.449999, 40.340004, 10.100001, 20.45, 40.34, 10.100001, 20.45, 40.34, 10.1, 20.45, 40.34, + 10.1, 20.45, 40.34, 10.1, 20.45, 40.34, 10.1, 20.45, 40.34, 10.100002, 20.45, 40.34 ]] │ └────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────┘ ```)", diff --git a/tests/queries/0_stateless/02813_seriesDecomposeSTL.reference b/tests/queries/0_stateless/02813_seriesDecomposeSTL.reference index dc30e7f8371..28dae705335 100644 --- a/tests/queries/0_stateless/02813_seriesDecomposeSTL.reference +++ b/tests/queries/0_stateless/02813_seriesDecomposeSTL.reference @@ -1,4 +1,4 @@ -[[-13.529999,-3.1799996,16.71,-13.53,-3.1799996,16.71,-13.53,-3.1799996,16.71,-13.530001,-3.18,16.710001,-13.530001,-3.1800003,16.710001,-13.530001,-3.1800003,16.710001,-13.530001,-3.1799994,16.71,-13.529999,-3.1799994,16.709997],[23.63,23.63,23.630003,23.630001,23.630001,23.630001,23.630001,23.630001,23.630001,23.630001,23.630001,23.63,23.630001,23.630001,23.63,23.630001,23.630001,23.63,23.630001,23.630001,23.630001,23.630001,23.630001,23.630003],[0,0.0000019073486,-0.0000019073486,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,-0.0000019073486,0,0]] -[[4.04452e-8,-1.7846537e-8,-5.9488454e-9,0,0,0,0,0,0,-1.9868216e-8,-9.5297715e-8,2.2540547e-9,3.4229203e-8,8.573613e-8],[1.9999999,2,2,2,2,2,2,2,2,2,2,2,1.9999996,1.9999996],[1.1920929e-7,0,0,0,0,0,0,0,0,0,0,0,3.5762787e-7,2.3841858e-7]] 
-[[-13.529999,-3.1799996,16.71,-13.53,-3.1799996,16.71,-13.53,-3.1799996,16.71,-13.530001,-3.18,16.710001,-13.530001,-3.1800003,16.710001,-13.530001,-3.1800003,16.710001,-13.530001,-3.1799994,16.71,-13.529999,-3.1799994,16.709997],[23.63,23.63,23.630003,23.630001,23.630001,23.630001,23.630001,23.630001,23.630001,23.630001,23.630001,23.63,23.630001,23.630001,23.63,23.630001,23.630001,23.63,23.630001,23.630001,23.630001,23.630001,23.630001,23.630003],[0,0.0000019073486,-0.0000019073486,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,-0.0000019073486,0,0]] -[[53.946846,-4.8119445,43.525013,-23.71359,-42.472305,-51.636955,-50.458298,-51.982674,37.62072,-15.9006605,56.65076,-5.809669,57.143845,-2.0370207,54.050922,-4.897961,43.954018,-23.808758,-42.651337,-51.86827,-50.709732,-52.18156,37.734905,-15.853402,56.91643,-5.8815174,57.253094,-2.012879,54.157806,-4.9817176,44.384747,-23.902956,-42.830154,-52.10025,-50.96271,-52.3829,37.84573,-15.81032,57.177113,-5.958963,57.356136,-1.9952412,54.27533,-5.066312,44.878296,-23.956438,-42.993656,-52.337124,-51.208073,-52.615646,37.91102,-15.8062525,57.49891,-6.056076,57.45604,-1.9797823,54.39525,-5.1483474,45.374573],[88.028534,88.95315,89.87776,90.802376,91.64913,92.49588,93.342636,94.19737,95.0521,95.90684,96.712975,97.51912,98.32526,98.36342,98.40158,98.43974,98.36777,98.29579,98.223816,98.536446,98.849075,99.161705,99.7552,100.348694,100.94219,101.53184,102.12149,102.711136,103.79921,104.88729,105.975365,107.50462,109.033875,110.56313,111.79767,113.032196,114.26673,115.02128,115.775826,116.53037,117.15541,117.78044,118.40548,118.86489,119.3243,119.783714,120.04031,120.29691,120.55351,120.78621,121.01891,121.25161,121.533585,121.81555,122.09753,122.41821,122.7389,123.059586,123.39267],[-2.97538,2.8587952,-23.402771,0.91121674,4.8231735,9.141075,8.115662,10.785301,0.32717896,5.99382,-12.363731,5.29055,0.53089905,-2.3264008,-3.4524994,1.4582214,-2.321785,2.51297,5.4275208,3.3318253,5.8606567,0.019859314,-4.4901123,-12.495293,-5.8586197,-1.650322,-11.374588,4.3017426,4.042984,1.094429,9.639885,3.3983307,-3.20372,-5.462883,-5.834961,-6.649292,-1.1124649,3.7890396,16.047066,-2.5714111,8.488449,-2.785202,2.319191,-0.79857635,13.797401,-5.827278,-6.0466614,-5.9597855,-7.3454437,-3.1705627,6.0700684,3.5546417,1.9675064,-0.7594757,2.446434,0.5615692,0.86585236,-3.9112396,1.2327576]] +[[-13.529999,-3.1799996,16.71,-13.53,-3.1799996,16.71,-13.53,-3.1799996,16.71,-13.530001,-3.18,16.710001,-13.530001,-3.1800003,16.710001,-13.530001,-3.1800003,16.710001,-13.530001,-3.1799994,16.71,-13.529999,-3.1799994,16.709997],[23.63,23.63,23.630003,23.630001,23.630001,23.630001,23.630001,23.630001,23.630001,23.630001,23.630001,23.63,23.630001,23.630001,23.63,23.630001,23.630001,23.63,23.630001,23.630001,23.630001,23.630001,23.630001,23.630003],[0,0.0000019073486,-0.0000019073486,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,-0.0000019073486,0,0],[10.1,20.449999,40.340004,10.100001,20.45,40.34,10.100001,20.45,40.34,10.1,20.45,40.34,10.1,20.45,40.34,10.1,20.45,40.34,10.1,20.45,40.34,10.100002,20.45,40.34]] +[[4.04452e-8,-1.7846537e-8,-5.9488454e-9,0,0,0,0,0,0,-1.9868216e-8,-9.5297715e-8,2.2540547e-9,3.4229203e-8,8.573613e-8],[1.9999999,2,2,2,2,2,2,2,2,2,2,2,1.9999996,1.9999996],[1.1920929e-7,0,0,0,0,0,0,0,0,0,0,0,3.5762787e-7,2.3841858e-7],[1.9999999,2,2,2,2,2,2,2,2,2,1.9999999,2,1.9999996,1.9999998]] 
+[[-13.529999,-3.1799996,16.71,-13.53,-3.1799996,16.71,-13.53,-3.1799996,16.71,-13.530001,-3.18,16.710001,-13.530001,-3.1800003,16.710001,-13.530001,-3.1800003,16.710001,-13.530001,-3.1799994,16.71,-13.529999,-3.1799994,16.709997],[23.63,23.63,23.630003,23.630001,23.630001,23.630001,23.630001,23.630001,23.630001,23.630001,23.630001,23.63,23.630001,23.630001,23.63,23.630001,23.630001,23.63,23.630001,23.630001,23.630001,23.630001,23.630001,23.630003],[0,0.0000019073486,-0.0000019073486,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,-0.0000019073486,0,0],[10.1,20.449999,40.340004,10.100001,20.45,40.34,10.100001,20.45,40.34,10.1,20.45,40.34,10.1,20.45,40.34,10.1,20.45,40.34,10.1,20.45,40.34,10.100002,20.45,40.34]] +[[53.946846,-4.8119445,43.525013,-23.71359,-42.472305,-51.636955,-50.458298,-51.982674,37.62072,-15.9006605,56.65076,-5.809669,57.143845,-2.0370207,54.050922,-4.897961,43.954018,-23.808758,-42.651337,-51.86827,-50.709732,-52.18156,37.734905,-15.853402,56.91643,-5.8815174,57.253094,-2.012879,54.157806,-4.9817176,44.384747,-23.902956,-42.830154,-52.10025,-50.96271,-52.3829,37.84573,-15.81032,57.177113,-5.958963,57.356136,-1.9952412,54.27533,-5.066312,44.878296,-23.956438,-42.993656,-52.337124,-51.208073,-52.615646,37.91102,-15.8062525,57.49891,-6.056076,57.45604,-1.9797823,54.39525,-5.1483474,45.374573],[88.028534,88.95315,89.87776,90.802376,91.64913,92.49588,93.342636,94.19737,95.0521,95.90684,96.712975,97.51912,98.32526,98.36342,98.40158,98.43974,98.36777,98.29579,98.223816,98.536446,98.849075,99.161705,99.7552,100.348694,100.94219,101.53184,102.12149,102.711136,103.79921,104.88729,105.975365,107.50462,109.033875,110.56313,111.79767,113.032196,114.26673,115.02128,115.775826,116.53037,117.15541,117.78044,118.40548,118.86489,119.3243,119.783714,120.04031,120.29691,120.55351,120.78621,121.01891,121.25161,121.533585,121.81555,122.09753,122.41821,122.7389,123.059586,123.39267],[-2.97538,2.8587952,-23.402771,0.91121674,4.8231735,9.141075,8.115662,10.785301,0.32717896,5.99382,-12.363731,5.29055,0.53089905,-2.3264008,-3.4524994,1.4582214,-2.321785,2.51297,5.4275208,3.3318253,5.8606567,0.019859314,-4.4901123,-12.495293,-5.8586197,-1.650322,-11.374588,4.3017426,4.042984,1.094429,9.639885,3.3983307,-3.20372,-5.462883,-5.834961,-6.649292,-1.1124649,3.7890396,16.047066,-2.5714111,8.488449,-2.785202,2.319191,-0.79857635,13.797401,-5.827278,-6.0466614,-5.9597855,-7.3454437,-3.1705627,6.0700684,3.5546417,1.9675064,-0.7594757,2.446434,0.5615692,0.86585236,-3.9112396,1.2327576],[141.97537,84.141205,133.40277,67.08878,49.176826,40.858925,42.88434,42.2147,132.67282,80.00618,153.36374,91.70945,155.4691,96.3264,152.4525,93.54178,142.32178,74.48703,55.57248,46.668175,48.139343,46.980145,137.49011,84.49529,157.85863,95.65032,159.37459,100.69826,157.95702,99.90557,150.3601,83.60167,66.20372,58.462883,60.834957,60.649296,152.11246,99.21096,172.95294,110.57141,174.51155,115.7852,172.68082,113.79858,164.2026,95.82728,77.04666,67.95979,69.34544,68.17056,158.92993,105.44536,179.0325,115.759476,179.55356,120.43843,177.13416,117.91124,168.76724]] From f935493f284e1acd94eacdf0c50f91de688d817e Mon Sep 17 00:00:00 2001 From: Bhavna Jindal Date: Fri, 19 Jan 2024 09:05:04 -0800 Subject: [PATCH 082/245] fix trailing whitespaces --- docs/en/sql-reference/functions/time-series-functions.md | 2 +- src/Functions/seriesDecomposeSTL.cpp | 4 ++-- 2 files changed, 3 insertions(+), 3 deletions(-) diff --git a/docs/en/sql-reference/functions/time-series-functions.md b/docs/en/sql-reference/functions/time-series-functions.md index 
2e42aa884b4..21e66302ad2 100644 --- a/docs/en/sql-reference/functions/time-series-functions.md +++ b/docs/en/sql-reference/functions/time-series-functions.md @@ -78,7 +78,7 @@ The number of data points in `series` should be at least twice the value of `per **Returned value** - An array of four arrays where the first array include seasonal components, the second array - trend, -the third array - residue component, and the fourth array - baseline component. +the third array - residue component, and the fourth array - baseline(seasoanl + trend) component. Type: [Array](../../sql-reference/data-types/array.md). diff --git a/src/Functions/seriesDecomposeSTL.cpp b/src/Functions/seriesDecomposeSTL.cpp index 0c44afa32a6..9a6a229e282 100644 --- a/src/Functions/seriesDecomposeSTL.cpp +++ b/src/Functions/seriesDecomposeSTL.cpp @@ -128,7 +128,7 @@ public: res_data.insert(residue.begin(), residue.end()); res_col_offsets_data.push_back(res_data.size()); - // Create Baseline = seasonal + trend + // Create Baseline = seasonal + trend std::transform(seasonal.begin(), seasonal.end(), trend.begin(), std::back_inserter(res_data), std::plus()); res_col_offsets_data.push_back(res_data.size()); @@ -205,7 +205,7 @@ The number of data points in `series` should be at least twice the value of `per **Returned value** -- An array of four arrays where the first array include seasonal components, the second array - trend, the third array - residue component, and the fourth array - baseline component. +- An array of four arrays where the first array include seasonal components, the second array - trend, the third array - residue component, and the fourth array - baseline(seasoanl + trend) component. Type: [Array](../../sql-reference/data-types/array.md). From d9edd5a7f36491a8d86705e6c7221c1a74cd6ef5 Mon Sep 17 00:00:00 2001 From: Bhavna Jindal Date: Fri, 19 Jan 2024 10:20:01 -0800 Subject: [PATCH 083/245] fix spell check --- docs/en/sql-reference/functions/time-series-functions.md | 2 +- src/Functions/seriesDecomposeSTL.cpp | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/docs/en/sql-reference/functions/time-series-functions.md b/docs/en/sql-reference/functions/time-series-functions.md index 21e66302ad2..016c3410944 100644 --- a/docs/en/sql-reference/functions/time-series-functions.md +++ b/docs/en/sql-reference/functions/time-series-functions.md @@ -78,7 +78,7 @@ The number of data points in `series` should be at least twice the value of `per **Returned value** - An array of four arrays where the first array include seasonal components, the second array - trend, -the third array - residue component, and the fourth array - baseline(seasoanl + trend) component. +the third array - residue component, and the fourth array - baseline(seasonal + trend) component. Type: [Array](../../sql-reference/data-types/array.md). diff --git a/src/Functions/seriesDecomposeSTL.cpp b/src/Functions/seriesDecomposeSTL.cpp index 9a6a229e282..4376691868b 100644 --- a/src/Functions/seriesDecomposeSTL.cpp +++ b/src/Functions/seriesDecomposeSTL.cpp @@ -205,7 +205,7 @@ The number of data points in `series` should be at least twice the value of `per **Returned value** -- An array of four arrays where the first array include seasonal components, the second array - trend, the third array - residue component, and the fourth array - baseline(seasoanl + trend) component. 
+- An array of four arrays where the first array include seasonal components, the second array - trend, the third array - residue component, and the fourth array - baseline(seasonal + trend) component. Type: [Array](../../sql-reference/data-types/array.md). From 084ee74b6898214024feafcfa292ff8419bc0050 Mon Sep 17 00:00:00 2001 From: Bhavna Jindal Date: Mon, 22 Jan 2024 07:31:33 -0800 Subject: [PATCH 084/245] minor fix --- src/Functions/seriesDecomposeSTL.cpp | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/Functions/seriesDecomposeSTL.cpp b/src/Functions/seriesDecomposeSTL.cpp index 4376691868b..e9276c4aefb 100644 --- a/src/Functions/seriesDecomposeSTL.cpp +++ b/src/Functions/seriesDecomposeSTL.cpp @@ -129,7 +129,7 @@ public: res_col_offsets_data.push_back(res_data.size()); // Create Baseline = seasonal + trend - std::transform(seasonal.begin(), seasonal.end(), trend.begin(), std::back_inserter(res_data), std::plus()); + std::transform(seasonal.begin(), seasonal.end(), trend.begin(), std::back_inserter(res_data), std::plus<>()); res_col_offsets_data.push_back(res_data.size()); root_offsets_data.push_back(res_col_offsets->size()); From 78df07199bc57c8dac9a56fb8092eb1256ad8b56 Mon Sep 17 00:00:00 2001 From: Robert Schulze Date: Tue, 23 Jan 2024 07:16:38 +0000 Subject: [PATCH 085/245] More consistent tests for inverted index --- ...> 02346_inverted_index_bug47393.reference} | 0 .../02346_inverted_index_bug47393.sql | 25 +++++++++++++++++++ ...> 02346_inverted_index_bug52019.reference} | 0 ....sql => 02346_inverted_index_bug52019.sql} | 17 ++++++++++--- ...46_inverted_index_detach_attach.reference} | 0 ...=> 02346_inverted_index_detach_attach.sql} | 6 ++--- ...nverted_index_experimental_flag.reference} | 0 ...2346_inverted_index_experimental_flag.sql} | 3 +++ ..._inverted_index_match_predicate.reference} | 0 ... 02346_inverted_index_match_predicate.sql} | 2 ++ .../02346_inverted_index_mutation.sql | 25 ------------------- ... 
=> 02346_inverted_index_search.reference} | 0 ...ch.sql => 02346_inverted_index_search.sql} | 0 13 files changed, 47 insertions(+), 31 deletions(-) rename tests/queries/0_stateless/{02346_inverted_index_mutation.reference => 02346_inverted_index_bug47393.reference} (100%) create mode 100644 tests/queries/0_stateless/02346_inverted_index_bug47393.sql rename tests/queries/0_stateless/{02696_inverted_idx_checksums.reference => 02346_inverted_index_bug52019.reference} (100%) rename tests/queries/0_stateless/{02862_index_inverted_incorrect_args.sql => 02346_inverted_index_bug52019.sql} (62%) rename tests/queries/0_stateless/{02862_index_inverted_incorrect_args.reference => 02346_inverted_index_detach_attach.reference} (100%) rename tests/queries/0_stateless/{02696_inverted_idx_checksums.sql => 02346_inverted_index_detach_attach.sql} (75%) rename tests/queries/0_stateless/{02895_forbid_create_inverted_index.reference => 02346_inverted_index_experimental_flag.reference} (100%) rename tests/queries/0_stateless/{02895_forbid_create_inverted_index.sql => 02346_inverted_index_experimental_flag.sql} (72%) rename tests/queries/0_stateless/{02951_inverted_index_support_match.reference => 02346_inverted_index_match_predicate.reference} (100%) rename tests/queries/0_stateless/{02951_inverted_index_support_match.sql => 02346_inverted_index_match_predicate.sql} (97%) delete mode 100644 tests/queries/0_stateless/02346_inverted_index_mutation.sql rename tests/queries/0_stateless/{02346_full_text_search.reference => 02346_inverted_index_search.reference} (100%) rename tests/queries/0_stateless/{02346_full_text_search.sql => 02346_inverted_index_search.sql} (100%) diff --git a/tests/queries/0_stateless/02346_inverted_index_mutation.reference b/tests/queries/0_stateless/02346_inverted_index_bug47393.reference similarity index 100% rename from tests/queries/0_stateless/02346_inverted_index_mutation.reference rename to tests/queries/0_stateless/02346_inverted_index_bug47393.reference diff --git a/tests/queries/0_stateless/02346_inverted_index_bug47393.sql b/tests/queries/0_stateless/02346_inverted_index_bug47393.sql new file mode 100644 index 00000000000..166e051b120 --- /dev/null +++ b/tests/queries/0_stateless/02346_inverted_index_bug47393.sql @@ -0,0 +1,25 @@ +SET allow_experimental_inverted_index = 1; + +DROP TABLE IF EXISTS tab; +CREATE TABLE tab +( + id UInt64, + str String, + INDEX idx str TYPE inverted(3) GRANULARITY 1 +) +ENGINE = MergeTree +ORDER BY tuple() +SETTINGS min_rows_for_wide_part = 1, min_bytes_for_wide_part = 1; + +INSERT INTO tab (str) VALUES ('I am inverted'); + +SELECT data_version FROM system.parts WHERE database = currentDatabase() AND table = 'tab' AND active = 1; + +-- update column synchronously +ALTER TABLE tab UPDATE str = 'I am not inverted' WHERE 1 SETTINGS mutations_sync=1; + +SELECT data_version FROM system.parts WHERE database = currentDatabase() AND table = 'tab' AND active = 1; + +SELECT str FROM tab WHERE str LIKE '%inverted%' SETTINGS force_data_skipping_indices = 'idx'; + +DROP TABLE tab; diff --git a/tests/queries/0_stateless/02696_inverted_idx_checksums.reference b/tests/queries/0_stateless/02346_inverted_index_bug52019.reference similarity index 100% rename from tests/queries/0_stateless/02696_inverted_idx_checksums.reference rename to tests/queries/0_stateless/02346_inverted_index_bug52019.reference diff --git a/tests/queries/0_stateless/02862_index_inverted_incorrect_args.sql b/tests/queries/0_stateless/02346_inverted_index_bug52019.sql similarity index 62% rename from 
tests/queries/0_stateless/02862_index_inverted_incorrect_args.sql rename to tests/queries/0_stateless/02346_inverted_index_bug52019.sql index 7ba122a7155..c61e17d9cea 100644 --- a/tests/queries/0_stateless/02862_index_inverted_incorrect_args.sql +++ b/tests/queries/0_stateless/02346_inverted_index_bug52019.sql @@ -1,9 +1,20 @@ --- https://github.com/ClickHouse/ClickHouse/issues/52019 -DROP TABLE IF EXISTS tab; +-- Test for Bug 52019: Undefined behavior + SET allow_experimental_inverted_index=1; -CREATE TABLE tab (`k` UInt64, `s` Map(String, String), INDEX af mapKeys(s) TYPE inverted(2) GRANULARITY 1) ENGINE = MergeTree ORDER BY k SETTINGS index_granularity = 2, index_granularity_bytes = '10Mi'; + +DROP TABLE IF EXISTS tab; + +CREATE TABLE tab ( + k UInt64, + s Map(String, String), + INDEX idx mapKeys(s) TYPE inverted(2) GRANULARITY 1) +ENGINE = MergeTree +ORDER BY k +SETTINGS index_granularity = 2, index_granularity_bytes = '10Mi'; + INSERT INTO tab (k) VALUES (0); SELECT * FROM tab PREWHERE (s[NULL]) = 'Click a03' SETTINGS allow_experimental_analyzer=1; SELECT * FROM tab PREWHERE (s[1]) = 'Click a03' SETTINGS allow_experimental_analyzer=1; -- { serverError ILLEGAL_TYPE_OF_ARGUMENT } SELECT * FROM tab PREWHERE (s['foo']) = 'Click a03' SETTINGS allow_experimental_analyzer=1; + DROP TABLE tab; diff --git a/tests/queries/0_stateless/02862_index_inverted_incorrect_args.reference b/tests/queries/0_stateless/02346_inverted_index_detach_attach.reference similarity index 100% rename from tests/queries/0_stateless/02862_index_inverted_incorrect_args.reference rename to tests/queries/0_stateless/02346_inverted_index_detach_attach.reference diff --git a/tests/queries/0_stateless/02696_inverted_idx_checksums.sql b/tests/queries/0_stateless/02346_inverted_index_detach_attach.sql similarity index 75% rename from tests/queries/0_stateless/02696_inverted_idx_checksums.sql rename to tests/queries/0_stateless/02346_inverted_index_detach_attach.sql index 92ffa7a6196..762d78922fe 100644 --- a/tests/queries/0_stateless/02696_inverted_idx_checksums.sql +++ b/tests/queries/0_stateless/02346_inverted_index_detach_attach.sql @@ -2,8 +2,8 @@ SET allow_experimental_inverted_index = 1; CREATE TABLE t ( - `key` UInt64, - `str` String, + key UInt64, + str String, INDEX inv_idx str TYPE inverted(0) GRANULARITY 1 ) ENGINE = MergeTree @@ -13,4 +13,4 @@ INSERT INTO t VALUES (1, 'Hello World'); ALTER TABLE t DETACH PART 'all_1_1_0'; -ALTER TABLE t ATTACH PART 'all_1_1_0'; \ No newline at end of file +ALTER TABLE t ATTACH PART 'all_1_1_0'; diff --git a/tests/queries/0_stateless/02895_forbid_create_inverted_index.reference b/tests/queries/0_stateless/02346_inverted_index_experimental_flag.reference similarity index 100% rename from tests/queries/0_stateless/02895_forbid_create_inverted_index.reference rename to tests/queries/0_stateless/02346_inverted_index_experimental_flag.reference diff --git a/tests/queries/0_stateless/02895_forbid_create_inverted_index.sql b/tests/queries/0_stateless/02346_inverted_index_experimental_flag.sql similarity index 72% rename from tests/queries/0_stateless/02895_forbid_create_inverted_index.sql rename to tests/queries/0_stateless/02346_inverted_index_experimental_flag.sql index dc92d9198fb..bf89265372e 100644 --- a/tests/queries/0_stateless/02895_forbid_create_inverted_index.sql +++ b/tests/queries/0_stateless/02346_inverted_index_experimental_flag.sql @@ -1,4 +1,7 @@ +-- Tests that the inverted index can only be supported when allow_experimental_inverted_index = 1. 
+ SET allow_experimental_inverted_index = 0; + DROP TABLE IF EXISTS tab; CREATE TABLE tab ( diff --git a/tests/queries/0_stateless/02951_inverted_index_support_match.reference b/tests/queries/0_stateless/02346_inverted_index_match_predicate.reference similarity index 100% rename from tests/queries/0_stateless/02951_inverted_index_support_match.reference rename to tests/queries/0_stateless/02346_inverted_index_match_predicate.reference diff --git a/tests/queries/0_stateless/02951_inverted_index_support_match.sql b/tests/queries/0_stateless/02346_inverted_index_match_predicate.sql similarity index 97% rename from tests/queries/0_stateless/02951_inverted_index_support_match.sql rename to tests/queries/0_stateless/02346_inverted_index_match_predicate.sql index 9ebf10412d9..99405c0acf2 100644 --- a/tests/queries/0_stateless/02951_inverted_index_support_match.sql +++ b/tests/queries/0_stateless/02346_inverted_index_match_predicate.sql @@ -1,3 +1,5 @@ +-- Tests that match() utilizes the inverted index + SET allow_experimental_inverted_index = true; DROP TABLE IF EXISTS tab; diff --git a/tests/queries/0_stateless/02346_inverted_index_mutation.sql b/tests/queries/0_stateless/02346_inverted_index_mutation.sql deleted file mode 100644 index 83b73807cd7..00000000000 --- a/tests/queries/0_stateless/02346_inverted_index_mutation.sql +++ /dev/null @@ -1,25 +0,0 @@ -SET allow_experimental_inverted_index=1; - -DROP TABLE IF EXISTS t; -CREATE TABLE t -( - `timestamp` UInt64, - `s` String, - INDEX idx s TYPE inverted(3) GRANULARITY 1 -) -ENGINE = MergeTree -ORDER BY tuple() -SETTINGS min_rows_for_wide_part = 1, min_bytes_for_wide_part = 1; - -INSERT INTO t (s) VALUES ('I am inverted'); - -SELECT data_version FROM system.parts WHERE database=currentDatabase() AND table='t' AND active=1; - --- do update column synchronously -ALTER TABLE t UPDATE s='I am not inverted' WHERE 1 SETTINGS mutations_sync=1; - -SELECT data_version FROM system.parts WHERE database=currentDatabase() AND table='t' AND active=1; - -SELECT s FROM t WHERE s LIKE '%inverted%' SETTINGS force_data_skipping_indices='idx'; - -DROP TABLE t; diff --git a/tests/queries/0_stateless/02346_full_text_search.reference b/tests/queries/0_stateless/02346_inverted_index_search.reference similarity index 100% rename from tests/queries/0_stateless/02346_full_text_search.reference rename to tests/queries/0_stateless/02346_inverted_index_search.reference diff --git a/tests/queries/0_stateless/02346_full_text_search.sql b/tests/queries/0_stateless/02346_inverted_index_search.sql similarity index 100% rename from tests/queries/0_stateless/02346_full_text_search.sql rename to tests/queries/0_stateless/02346_inverted_index_search.sql From bfec324b2818a3764c09347508125051273dac25 Mon Sep 17 00:00:00 2001 From: Robert Schulze Date: Tue, 23 Jan 2024 10:06:25 +0000 Subject: [PATCH 086/245] Some fixups + test --- src/Storages/MergeTree/MutateTask.cpp | 18 ++++++++++------- .../02346_inverted_index_bug59039.reference | 0 .../02346_inverted_index_bug59039.sql | 20 +++++++++++++++++++ 3 files changed, 31 insertions(+), 7 deletions(-) create mode 100644 tests/queries/0_stateless/02346_inverted_index_bug59039.reference create mode 100644 tests/queries/0_stateless/02346_inverted_index_bug59039.sql diff --git a/src/Storages/MergeTree/MutateTask.cpp b/src/Storages/MergeTree/MutateTask.cpp index fccee6bd887..48aad368dd4 100644 --- a/src/Storages/MergeTree/MutateTask.cpp +++ b/src/Storages/MergeTree/MutateTask.cpp @@ -674,21 +674,25 @@ static NameToNameVector 
collectFilesForRenames(
 {
     if (command.type == MutationCommand::Type::DROP_INDEX)
     {
-        const std::vector<String> suffixes = {".idx2", ".idx", ".gin_dict", ".gin_post", ".gin_seg", ".gin_sid"};
+        static const std::array<String, 2> suffixes = {".idx2", ".idx"};
+        static const std::array<String, 4> gin_suffixes = {".gin_dict", ".gin_post", ".gin_seg", ".gin_sid"}; /// .gin_* is inverted index
 
         for (const auto & suffix : suffixes)
         {
-            String filename = INDEX_FILE_PREFIX + command.column_name + suffix;
+            const String filename = INDEX_FILE_PREFIX + command.column_name + suffix;
+            const String filename_mrk = INDEX_FILE_PREFIX + command.column_name + mrk_extension;
 
-            if ((suffix == ".idx2" || suffix == ".idx") && source_part->checksums.has(filename))
+            if (source_part->checksums.has(filename))
             {
                 add_rename(filename, "");
-                add_rename(INDEX_FILE_PREFIX + command.column_name + mrk_extension, "");
+                add_rename(filename_mrk, "");
             }
-            else if (source_part->checksums.has(filename))
-            {
+        }
+        for (const auto & gin_suffix : gin_suffixes)
+        {
+            const String filename = INDEX_FILE_PREFIX + command.column_name + gin_suffix;
+            if (source_part->checksums.has(filename))
                 add_rename(filename, "");
-            }
         }
     }
     else if (command.type == MutationCommand::Type::DROP_PROJECTION)

diff --git a/tests/queries/0_stateless/02346_inverted_index_bug59039.reference b/tests/queries/0_stateless/02346_inverted_index_bug59039.reference
new file mode 100644
index 00000000000..e69de29bb2d
diff --git a/tests/queries/0_stateless/02346_inverted_index_bug59039.sql b/tests/queries/0_stateless/02346_inverted_index_bug59039.sql
new file mode 100644
index 00000000000..0ef0cb0c733
--- /dev/null
+++ b/tests/queries/0_stateless/02346_inverted_index_bug59039.sql
@@ -0,0 +1,20 @@
+-- This is supposed to test that DROP INDEX removes all index related files. Can't test this directly but at least run the statement and
+-- check that no bad things happen.
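The split into `suffixes` and `gin_suffixes` reflects the on-disk layout: only the `.idx`/`.idx2` skip-index files have an accompanying mark file to rename, while the four `.gin_*` files of an inverted index stand alone. A hedged metadata-level companion check, kept outside the new test (table name illustrative):

```sql
SET allow_experimental_inverted_index = 1;

DROP TABLE IF EXISTS tab_drop;
CREATE TABLE tab_drop (id UInt64, doc String, INDEX text_idx doc TYPE inverted)
ENGINE = MergeTree ORDER BY id;

INSERT INTO tab_drop VALUES (1, 'a b c');

ALTER TABLE tab_drop DROP INDEX text_idx;

-- Expected: zero rows; the DROP INDEX mutation is what renames the
-- .gin_* files away on disk.
SELECT name FROM system.data_skipping_indices
WHERE database = currentDatabase() AND table = 'tab_drop';
```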
+ +SET allow_experimental_inverted_index = 1; + +DROP TABLE IF EXISTS tab; + +CREATE TABLE tab +( + id UInt64, + doc String, + INDEX text_idx doc TYPE inverted +) +ENGINE = MergeTree +ORDER BY id +SETTINGS index_granularity = 2, index_granularity_bytes = '10Mi', min_bytes_for_wide_part = 0, min_rows_for_wide_part = 0; + +ALTER TABLE tab DROP INDEX text_idx; + +DROP TABLE tab; From bef0fcb482c4b8782a1d2e485be9f6d8ffc2dfe9 Mon Sep 17 00:00:00 2001 From: Dmitry Novik Date: Tue, 23 Jan 2024 14:56:24 +0100 Subject: [PATCH 087/245] Fix is_order_by_all flag in QueryNode --- src/Analyzer/QueryNode.cpp | 28 +++++++++++++++++----------- 1 file changed, 17 insertions(+), 11 deletions(-) diff --git a/src/Analyzer/QueryNode.cpp b/src/Analyzer/QueryNode.cpp index 738b1ac62e8..a82fb4489b5 100644 --- a/src/Analyzer/QueryNode.cpp +++ b/src/Analyzer/QueryNode.cpp @@ -119,6 +119,9 @@ void QueryNode::dumpTreeImpl(WriteBuffer & buffer, FormatState & format_state, s if (is_group_by_all) buffer << ", is_group_by_all: " << is_group_by_all; + if (is_order_by_all) + buffer << ", is_order_by_all: " << is_order_by_all; + std::string group_by_type; if (is_group_by_with_rollup) group_by_type = "rollup"; @@ -257,6 +260,7 @@ bool QueryNode::isEqualImpl(const IQueryTreeNode & rhs) const is_group_by_with_cube == rhs_typed.is_group_by_with_cube && is_group_by_with_grouping_sets == rhs_typed.is_group_by_with_grouping_sets && is_group_by_all == rhs_typed.is_group_by_all && + is_order_by_all == rhs_typed.is_order_by_all && cte_name == rhs_typed.cte_name && projection_columns == rhs_typed.projection_columns && settings_changes == rhs_typed.settings_changes; @@ -288,6 +292,7 @@ void QueryNode::updateTreeHashImpl(HashState & state) const state.update(is_group_by_with_cube); state.update(is_group_by_with_grouping_sets); state.update(is_group_by_all); + state.update(is_order_by_all); state.update(settings_changes.size()); @@ -306,18 +311,19 @@ QueryTreeNodePtr QueryNode::cloneImpl() const { auto result_query_node = std::make_shared(context); - result_query_node->is_subquery = is_subquery; - result_query_node->is_cte = is_cte; - result_query_node->is_distinct = is_distinct; - result_query_node->is_limit_with_ties = is_limit_with_ties; - result_query_node->is_group_by_with_totals = is_group_by_with_totals; - result_query_node->is_group_by_with_rollup = is_group_by_with_rollup; - result_query_node->is_group_by_with_cube = is_group_by_with_cube; + result_query_node->is_subquery = is_subquery; + result_query_node->is_cte = is_cte; + result_query_node->is_distinct = is_distinct; + result_query_node->is_limit_with_ties = is_limit_with_ties; + result_query_node->is_group_by_with_totals = is_group_by_with_totals; + result_query_node->is_group_by_with_rollup = is_group_by_with_rollup; + result_query_node->is_group_by_with_cube = is_group_by_with_cube; result_query_node->is_group_by_with_grouping_sets = is_group_by_with_grouping_sets; - result_query_node->is_group_by_all = is_group_by_all; - result_query_node->cte_name = cte_name; - result_query_node->projection_columns = projection_columns; - result_query_node->settings_changes = settings_changes; + result_query_node->is_group_by_all = is_group_by_all; + result_query_node->is_order_by_all = is_order_by_all; + result_query_node->cte_name = cte_name; + result_query_node->projection_columns = projection_columns; + result_query_node->settings_changes = settings_changes; return result_query_node; } From 799a94081ba7587ec47d85554bdbb458ffb1436d Mon Sep 17 00:00:00 2001 From: Arthur Passos Date: 
Tue, 23 Jan 2024 11:40:42 -0300 Subject: [PATCH 088/245] Revert "Revert "Allow to attach partition from table with different partition expression when destination partition expression doesn't re-partition"" This reverts commit a1c83e2f51117a69d484c7ae7884c3bc5dd98129. --- .../statements/alter/partition.md | 2 +- src/Interpreters/MonotonicityCheckVisitor.h | 102 +++- src/Interpreters/applyFunction.cpp | 43 ++ src/Interpreters/applyFunction.h | 16 + src/Parsers/queryToString.cpp | 5 + src/Parsers/queryToString.h | 1 + src/Storages/MergeTree/IMergeTreeDataPart.cpp | 37 +- src/Storages/MergeTree/IMergeTreeDataPart.h | 1 + src/Storages/MergeTree/KeyCondition.cpp | 83 +-- src/Storages/MergeTree/MergeTreeData.cpp | 350 +++++-------- src/Storages/MergeTree/MergeTreeData.h | 18 + .../MergeTree/MergeTreeDataPartCloner.cpp | 320 ++++++++++++ .../MergeTree/MergeTreeDataPartCloner.h | 43 ++ src/Storages/MergeTree/MergeTreePartition.cpp | 39 ++ src/Storages/MergeTree/MergeTreePartition.h | 10 +- ...ergeTreePartitionCompatibilityVerifier.cpp | 91 ++++ .../MergeTreePartitionCompatibilityVerifier.h | 30 ++ ...TreePartitionGlobalMinMaxIdxCalculator.cpp | 25 + ...geTreePartitionGlobalMinMaxIdxCalculator.h | 24 + src/Storages/StorageMergeTree.cpp | 93 +++- src/Storages/StorageReplicatedMergeTree.cpp | 135 ++++- .../__init__.py | 0 .../configs/remote_servers.xml | 17 + .../test.py | 214 ++++++++ ...artition_different_partition_exp.reference | 467 +++++++++++++++++ ...tach_partition_different_partition_exp.sql | 485 ++++++++++++++++++ 26 files changed, 2310 insertions(+), 341 deletions(-) create mode 100644 src/Interpreters/applyFunction.cpp create mode 100644 src/Interpreters/applyFunction.h create mode 100644 src/Storages/MergeTree/MergeTreeDataPartCloner.cpp create mode 100644 src/Storages/MergeTree/MergeTreeDataPartCloner.h create mode 100644 src/Storages/MergeTree/MergeTreePartitionCompatibilityVerifier.cpp create mode 100644 src/Storages/MergeTree/MergeTreePartitionCompatibilityVerifier.h create mode 100644 src/Storages/MergeTree/MergeTreePartitionGlobalMinMaxIdxCalculator.cpp create mode 100644 src/Storages/MergeTree/MergeTreePartitionGlobalMinMaxIdxCalculator.h create mode 100644 tests/integration/test_attach_partition_distinct_expression_replicated/__init__.py create mode 100644 tests/integration/test_attach_partition_distinct_expression_replicated/configs/remote_servers.xml create mode 100644 tests/integration/test_attach_partition_distinct_expression_replicated/test.py create mode 100644 tests/queries/0_stateless/02456_test_attach_partition_different_partition_exp.reference create mode 100644 tests/queries/0_stateless/02456_test_attach_partition_different_partition_exp.sql diff --git a/docs/en/sql-reference/statements/alter/partition.md b/docs/en/sql-reference/statements/alter/partition.md index 114b8d5ffe3..5659a0565c5 100644 --- a/docs/en/sql-reference/statements/alter/partition.md +++ b/docs/en/sql-reference/statements/alter/partition.md @@ -112,7 +112,7 @@ Note that: For the query to run successfully, the following conditions must be met: - Both tables must have the same structure. -- Both tables must have the same partition key, the same order by key and the same primary key. +- Both tables must have the same order by key and the same primary key. - Both tables must have the same indices and projections. - Both tables must have the same storage policy. 
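In concrete terms, the relaxed conditions make an attach like the following legal, provided every column used by the destination's partition expression is also used by the source's and no single source part would need to be split across destination partitions. A sketch (schemas illustrative):

```sql
CREATE TABLE src (ts DateTime, key Int64, value String)
ENGINE = MergeTree PARTITION BY toYYYYMMDD(ts) ORDER BY key;

CREATE TABLE dst (ts DateTime, key Int64, value String)
ENGINE = MergeTree PARTITION BY toYYYYMM(ts) ORDER BY key;

INSERT INTO src VALUES ('2024-01-02 10:00:00', 1, 'a'), ('2024-01-02 11:00:00', 2, 'b');

-- The partition literal is resolved against the source table's expression;
-- the cloned part is then stamped with dst's own partition id (202401).
ALTER TABLE dst ATTACH PARTITION 20240102 FROM src;

SELECT partition, name FROM system.parts
WHERE database = currentDatabase() AND table = 'dst' AND active;
```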
diff --git a/src/Interpreters/MonotonicityCheckVisitor.h b/src/Interpreters/MonotonicityCheckVisitor.h index cc386825024..4e71bd56851 100644 --- a/src/Interpreters/MonotonicityCheckVisitor.h +++ b/src/Interpreters/MonotonicityCheckVisitor.h @@ -1,13 +1,17 @@ #pragma once #include +#include #include +#include #include #include -#include #include +#include +#include #include #include +#include #include #include #include @@ -33,6 +37,8 @@ public: ASTIdentifier * identifier = nullptr; DataTypePtr arg_data_type = {}; + Range range = Range::createWholeUniverse(); + void reject() { monotonicity.is_monotonic = false; } bool isRejected() const { return !monotonicity.is_monotonic; } @@ -97,13 +103,30 @@ public: if (data.isRejected()) return; - /// TODO: monotonicity for functions of several arguments - if (!ast_function.arguments || ast_function.arguments->children.size() != 1) + /// Monotonicity check only works for functions that contain at most two arguments and one of them must be a constant. + if (!ast_function.arguments) { data.reject(); return; } + auto arguments_size = ast_function.arguments->children.size(); + + if (arguments_size == 0 || arguments_size > 2) + { + data.reject(); + return; + } + else if (arguments_size == 2) + { + /// If the function has two arguments, then one of them must be a constant. + if (!ast_function.arguments->children[0]->as() && !ast_function.arguments->children[1]->as()) + { + data.reject(); + return; + } + } + if (!data.canOptimize(ast_function)) { data.reject(); @@ -124,14 +147,33 @@ public: return; } - ColumnsWithTypeAndName args; - args.emplace_back(data.arg_data_type, "tmp"); - auto function_base = function->build(args); + auto function_arguments = getFunctionArguments(ast_function, data); + + auto function_base = function->build(function_arguments); if (function_base && function_base->hasInformationAboutMonotonicity()) { bool is_positive = data.monotonicity.is_positive; - data.monotonicity = function_base->getMonotonicityForRange(*data.arg_data_type, Field(), Field()); + data.monotonicity = function_base->getMonotonicityForRange(*data.arg_data_type, data.range.left, data.range.right); + + auto & key_range = data.range; + + /// If we apply function to open interval, we can get empty intervals in result. + /// E.g. for ('2020-01-03', '2020-01-20') after applying 'toYYYYMM' we will get ('202001', '202001'). + /// To avoid this we make range left and right included. + /// Any function that treats NULL specially is not monotonic. + /// Thus we can safely use isNull() as an -Inf/+Inf indicator here. + if (!key_range.left.isNull()) + { + key_range.left = applyFunction(function_base, data.arg_data_type, key_range.left); + key_range.left_included = true; + } + + if (!key_range.right.isNull()) + { + key_range.right = applyFunction(function_base, data.arg_data_type, key_range.right); + key_range.right_included = true; + } if (!is_positive) data.monotonicity.is_positive = !data.monotonicity.is_positive; @@ -143,13 +185,53 @@ public: static bool needChildVisit(const ASTPtr & parent, const ASTPtr &) { - /// Currently we check monotonicity only for single-argument functions. - /// Although, multi-argument functions with all but one constant arguments can also be monotonic. + /// Multi-argument functions with all but one constant arguments can be monotonic. 
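In user-visible terms, this visitor backs the classic analyzer's `optimize_monotonous_functions_in_order_by` pass, so after this change a two-argument monotonic function with one constant argument can also be stripped from ORDER BY. A sketch under that assumption:

```sql
SET optimize_monotonous_functions_in_order_by = 1;

CREATE TABLE events (d Date, x UInt64) ENGINE = MergeTree ORDER BY d;
INSERT INTO events SELECT toDate('2024-01-01') + number, number FROM numbers(100);

-- `d + 42` is monotonic in `d` when the second argument is constant, so the
-- rewritten query should sort by plain `d` and read in primary-key order.
EXPLAIN SYNTAX SELECT * FROM events ORDER BY d + 42;
```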
if (const auto * func = typeid_cast<const ASTFunction *>(parent.get()))
-            return func->arguments->children.size() < 2;
+            return func->arguments->children.size() <= 2;
 
         return true;
     }
+
+    static ColumnWithTypeAndName extractLiteralColumnAndTypeFromAstLiteral(const ASTLiteral * literal)
+    {
+        ColumnWithTypeAndName result;
+
+        result.type = applyVisitor(FieldToDataType(), literal->value);
+        result.column = result.type->createColumnConst(0, literal->value);
+
+        return result;
+    }
+
+    static ColumnsWithTypeAndName getFunctionArguments(const ASTFunction & ast_function, const Data & data)
+    {
+        ColumnsWithTypeAndName args;
+
+        auto arguments_size = ast_function.arguments->children.size();
+
+        chassert(arguments_size == 1 || arguments_size == 2);
+
+        if (arguments_size == 2)
+        {
+            if (ast_function.arguments->children[0]->as<ASTLiteral>())
+            {
+                const auto * literal = ast_function.arguments->children[0]->as<ASTLiteral>();
+                args.push_back(extractLiteralColumnAndTypeFromAstLiteral(literal));
+                args.emplace_back(data.arg_data_type, "tmp");
+            }
+            else
+            {
+                const auto * literal = ast_function.arguments->children[1]->as<ASTLiteral>();
+                args.emplace_back(data.arg_data_type, "tmp");
+                args.push_back(extractLiteralColumnAndTypeFromAstLiteral(literal));
+            }
+        }
+        else
+        {
+            args.emplace_back(data.arg_data_type, "tmp");
+        }
+
+        return args;
+    }
 };
 
 using MonotonicityCheckVisitor = ConstInDepthNodeVisitor<MonotonicityCheckMatcher, false>;

diff --git a/src/Interpreters/applyFunction.cpp b/src/Interpreters/applyFunction.cpp
new file mode 100644
index 00000000000..a53f14f0381
--- /dev/null
+++ b/src/Interpreters/applyFunction.cpp
@@ -0,0 +1,43 @@
+#include <Interpreters/applyFunction.h>
+
+#include <Core/Range.h>
+#include <Functions/IFunction.h>
+
+namespace DB
+{
+
+static Field applyFunctionForField(const FunctionBasePtr & func, const DataTypePtr & arg_type, const Field & arg_value)
+{
+    ColumnsWithTypeAndName columns{
+        {arg_type->createColumnConst(1, arg_value), arg_type, "x"},
+    };
+
+    auto col = func->execute(columns, func->getResultType(), 1);
+    return (*col)[0];
+}
+
+FieldRef applyFunction(const FunctionBasePtr & func, const DataTypePtr & current_type, const FieldRef & field)
+{
+    /// Fallback for fields without block reference.
+ if (field.isExplicit()) + return applyFunctionForField(func, current_type, field); + + String result_name = "_" + func->getName() + "_" + toString(field.column_idx); + const auto & columns = field.columns; + size_t result_idx = columns->size(); + + for (size_t i = 0; i < result_idx; ++i) + if ((*columns)[i].name == result_name) + result_idx = i; + + if (result_idx == columns->size()) + { + ColumnsWithTypeAndName args{(*columns)[field.column_idx]}; + field.columns->emplace_back(ColumnWithTypeAndName{nullptr, func->getResultType(), result_name}); + (*columns)[result_idx].column = func->execute(args, (*columns)[result_idx].type, columns->front().column->size()); + } + + return {field.columns, field.row_idx, result_idx}; +} + +} diff --git a/src/Interpreters/applyFunction.h b/src/Interpreters/applyFunction.h new file mode 100644 index 00000000000..9b8ae43a53c --- /dev/null +++ b/src/Interpreters/applyFunction.h @@ -0,0 +1,16 @@ +#pragma once + +#include + +namespace DB +{ +struct FieldRef; + +class IFunctionBase; +class IDataType; + +using DataTypePtr = std::shared_ptr; +using FunctionBasePtr = std::shared_ptr; + +FieldRef applyFunction(const FunctionBasePtr & func, const DataTypePtr & current_type, const FieldRef & field); +} diff --git a/src/Parsers/queryToString.cpp b/src/Parsers/queryToString.cpp index 9721aa1f128..4a1903393f6 100644 --- a/src/Parsers/queryToString.cpp +++ b/src/Parsers/queryToString.cpp @@ -3,6 +3,11 @@ namespace DB { + String queryToStringNullable(const ASTPtr & query) + { + return query ? queryToString(query) : ""; + } + String queryToString(const ASTPtr & query) { return queryToString(*query); diff --git a/src/Parsers/queryToString.h b/src/Parsers/queryToString.h index 873de218293..3acd560b1e2 100644 --- a/src/Parsers/queryToString.h +++ b/src/Parsers/queryToString.h @@ -6,4 +6,5 @@ namespace DB { String queryToString(const ASTPtr & query); String queryToString(const IAST & query); + String queryToStringNullable(const ASTPtr & query); } diff --git a/src/Storages/MergeTree/IMergeTreeDataPart.cpp b/src/Storages/MergeTree/IMergeTreeDataPart.cpp index 87f23b0da2a..f3057a8254f 100644 --- a/src/Storages/MergeTree/IMergeTreeDataPart.cpp +++ b/src/Storages/MergeTree/IMergeTreeDataPart.cpp @@ -81,6 +81,7 @@ void IMergeTreeDataPart::MinMaxIndex::load(const MergeTreeData & data, const Par auto minmax_column_types = data.getMinMaxColumnsTypes(partition_key); size_t minmax_idx_size = minmax_column_types.size(); + hyperrectangle.clear(); hyperrectangle.reserve(minmax_idx_size); for (size_t i = 0; i < minmax_idx_size; ++i) { @@ -104,6 +105,39 @@ void IMergeTreeDataPart::MinMaxIndex::load(const MergeTreeData & data, const Par initialized = true; } +Block IMergeTreeDataPart::MinMaxIndex::getBlock(const MergeTreeData & data) const +{ + if (!initialized) + throw Exception(ErrorCodes::LOGICAL_ERROR, "Attempt to get block from uninitialized MinMax index."); + + Block block; + + const auto metadata_snapshot = data.getInMemoryMetadataPtr(); + const auto & partition_key = metadata_snapshot->getPartitionKey(); + + const auto minmax_column_names = data.getMinMaxColumnsNames(partition_key); + const auto minmax_column_types = data.getMinMaxColumnsTypes(partition_key); + const auto minmax_idx_size = minmax_column_types.size(); + + for (size_t i = 0; i < minmax_idx_size; ++i) + { + const auto & data_type = minmax_column_types[i]; + const auto & column_name = minmax_column_names[i]; + + const auto column = data_type->createColumn(); + + const auto min_val = hyperrectangle.at(i).left; + const auto 
max_val = hyperrectangle.at(i).right; + + column->insert(min_val); + column->insert(max_val); + + block.insert(ColumnWithTypeAndName(column->getPtr(), data_type, column_name)); + } + + return block; +} + IMergeTreeDataPart::MinMaxIndex::WrittenFiles IMergeTreeDataPart::MinMaxIndex::store( const MergeTreeData & data, IDataPartStorage & part_storage, Checksums & out_checksums) const { @@ -185,8 +219,7 @@ void IMergeTreeDataPart::MinMaxIndex::merge(const MinMaxIndex & other) if (!initialized) { - hyperrectangle = other.hyperrectangle; - initialized = true; + *this = other; } else { diff --git a/src/Storages/MergeTree/IMergeTreeDataPart.h b/src/Storages/MergeTree/IMergeTreeDataPart.h index 640a1f1d0a3..29f0f54d419 100644 --- a/src/Storages/MergeTree/IMergeTreeDataPart.h +++ b/src/Storages/MergeTree/IMergeTreeDataPart.h @@ -336,6 +336,7 @@ public: } void load(const MergeTreeData & data, const PartMetadataManagerPtr & manager); + Block getBlock(const MergeTreeData & data) const; using WrittenFiles = std::vector>; diff --git a/src/Storages/MergeTree/KeyCondition.cpp b/src/Storages/MergeTree/KeyCondition.cpp index d5922ae1bc2..e5bcb11091f 100644 --- a/src/Storages/MergeTree/KeyCondition.cpp +++ b/src/Storages/MergeTree/KeyCondition.cpp @@ -1,36 +1,37 @@ -#include -#include -#include +#include +#include #include #include #include #include +#include #include -#include #include -#include -#include -#include -#include -#include -#include -#include +#include #include +#include #include -#include -#include -#include -#include -#include -#include +#include +#include +#include +#include +#include #include -#include +#include +#include +#include +#include +#include #include #include #include -#include -#include +#include +#include +#include #include +#include +#include +#include #include #include @@ -836,21 +837,6 @@ bool KeyCondition::getConstant(const ASTPtr & expr, Block & block_with_constants return node.tryGetConstant(out_value, out_type); } - -static Field applyFunctionForField( - const FunctionBasePtr & func, - const DataTypePtr & arg_type, - const Field & arg_value) -{ - ColumnsWithTypeAndName columns - { - { arg_type->createColumnConst(1, arg_value), arg_type, "x" }, - }; - - auto col = func->execute(columns, func->getResultType(), 1); - return (*col)[0]; -} - /// The case when arguments may have types different than in the primary key. static std::pair applyFunctionForFieldOfUnknownType( const FunctionBasePtr & func, @@ -890,33 +876,6 @@ static std::pair applyBinaryFunctionForFieldOfUnknownType( return {std::move(result), std::move(return_type)}; } - -static FieldRef applyFunction(const FunctionBasePtr & func, const DataTypePtr & current_type, const FieldRef & field) -{ - /// Fallback for fields without block reference. 
- if (field.isExplicit()) - return applyFunctionForField(func, current_type, field); - - String result_name = "_" + func->getName() + "_" + toString(field.column_idx); - const auto & columns = field.columns; - size_t result_idx = columns->size(); - - for (size_t i = 0; i < result_idx; ++i) - { - if ((*columns)[i].name == result_name) - result_idx = i; - } - - if (result_idx == columns->size()) - { - ColumnsWithTypeAndName args{(*columns)[field.column_idx]}; - field.columns->emplace_back(ColumnWithTypeAndName {nullptr, func->getResultType(), result_name}); - (*columns)[result_idx].column = func->execute(args, (*columns)[result_idx].type, columns->front().column->size()); - } - - return {field.columns, field.row_idx, result_idx}; -} - /** When table's key has expression with these functions from a column, * and when a column in a query is compared with a constant, such as: * CREATE TABLE (x String) ORDER BY toDate(x) diff --git a/src/Storages/MergeTree/MergeTreeData.cpp b/src/Storages/MergeTree/MergeTreeData.cpp index 61332a4ff38..c3e348a549a 100644 --- a/src/Storages/MergeTree/MergeTreeData.cpp +++ b/src/Storages/MergeTree/MergeTreeData.cpp @@ -8,21 +8,6 @@ #include #include #include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include #include #include #include @@ -43,19 +28,20 @@ #include #include #include -#include -#include #include #include #include #include #include #include +#include +#include #include +#include #include -#include -#include #include +#include +#include #include #include #include @@ -64,26 +50,41 @@ #include #include #include -#include #include #include #include #include #include #include +#include #include #include +#include #include #include #include #include #include +#include #include #include -#include #include #include #include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include #include #include @@ -197,6 +198,50 @@ namespace ErrorCodes extern const int LIMIT_EXCEEDED; } +static size_t getPartitionAstFieldsCount(const ASTPartition & partition_ast, ASTPtr partition_value_ast) +{ + if (partition_ast.fields_count.has_value()) + return *partition_ast.fields_count; + + if (partition_value_ast->as()) + return 1; + + const auto * tuple_ast = partition_value_ast->as(); + + if (!tuple_ast) + { + throw Exception( + ErrorCodes::INVALID_PARTITION_VALUE, "Expected literal or tuple for partition key, got {}", partition_value_ast->getID()); + } + + if (tuple_ast->name != "tuple") + { + if (!isFunctionCast(tuple_ast)) + throw Exception(ErrorCodes::INVALID_PARTITION_VALUE, "Expected tuple for complex partition key, got {}", tuple_ast->name); + + if (tuple_ast->arguments->as()->children.empty()) + throw Exception(ErrorCodes::INVALID_PARTITION_VALUE, "Expected tuple for complex partition key, got {}", tuple_ast->name); + + auto first_arg = tuple_ast->arguments->as()->children.at(0); + if (const auto * inner_tuple = first_arg->as(); inner_tuple && inner_tuple->name == "tuple") + { + const auto * arguments_ast = tuple_ast->arguments->as(); + return arguments_ast ? arguments_ast->children.size() : 0; + } + else if (const auto * inner_literal_tuple = first_arg->as(); inner_literal_tuple) + { + return inner_literal_tuple->value.getType() == Field::Types::Tuple ? 
inner_literal_tuple->value.safeGet().size() : 1; + } + + throw Exception(ErrorCodes::INVALID_PARTITION_VALUE, "Expected tuple for complex partition key, got {}", tuple_ast->name); + } + else + { + const auto * arguments_ast = tuple_ast->arguments->as(); + return arguments_ast ? arguments_ast->children.size() : 0; + } +} + static void checkSuspiciousIndices(const ASTFunction * index_function) { std::unordered_set unique_index_expression_hashes; @@ -4854,7 +4899,7 @@ void MergeTreeData::removePartContributionToColumnAndSecondaryIndexSizes(const D } void MergeTreeData::checkAlterPartitionIsPossible( - const PartitionCommands & commands, const StorageMetadataPtr & /*metadata_snapshot*/, const Settings & settings, ContextPtr local_context) const + const PartitionCommands & commands, const StorageMetadataPtr & /*metadata_snapshot*/, const Settings & settings, ContextPtr) const { for (const auto & command : commands) { @@ -4882,7 +4927,15 @@ void MergeTreeData::checkAlterPartitionIsPossible( throw DB::Exception(ErrorCodes::SUPPORT_IS_DISABLED, "Only support DROP/DETACH PARTITION ALL currently"); } else - getPartitionIDFromQuery(command.partition, local_context); + { + // The below `getPartitionIDFromQuery` call will not work for attach / replace because it assumes the partition expressions + // are the same and deliberately uses this storage. Later on, `MergeTreeData::replaceFrom` is called, and it makes the right + // call to `getPartitionIDFromQuery` using source storage. + // Note: `PartitionCommand::REPLACE_PARTITION` is used both for `REPLACE PARTITION` and `ATTACH PARTITION FROM` queries. + // But not for `ATTACH PARTITION` queries. + if (command.type != PartitionCommand::REPLACE_PARTITION) + getPartitionIDFromQuery(command.partition, getContext()); + } } } } @@ -5616,69 +5669,8 @@ String MergeTreeData::getPartitionIDFromQuery(const ASTPtr & ast, ContextPtr loc MergeTreePartInfo::validatePartitionID(partition_ast.id->clone(), format_version); return partition_ast.id->as()->value.safeGet(); } - size_t partition_ast_fields_count = 0; ASTPtr partition_value_ast = partition_ast.value->clone(); - if (!partition_ast.fields_count.has_value()) - { - if (partition_value_ast->as()) - { - partition_ast_fields_count = 1; - } - else if (const auto * tuple_ast = partition_value_ast->as()) - { - if (tuple_ast->name != "tuple") - { - if (isFunctionCast(tuple_ast)) - { - if (tuple_ast->arguments->as()->children.empty()) - { - throw Exception( - ErrorCodes::INVALID_PARTITION_VALUE, "Expected tuple for complex partition key, got {}", tuple_ast->name); - } - auto first_arg = tuple_ast->arguments->as()->children.at(0); - if (const auto * inner_tuple = first_arg->as(); inner_tuple && inner_tuple->name == "tuple") - { - const auto * arguments_ast = tuple_ast->arguments->as(); - if (arguments_ast) - partition_ast_fields_count = arguments_ast->children.size(); - else - partition_ast_fields_count = 0; - } - else if (const auto * inner_literal_tuple = first_arg->as(); inner_literal_tuple) - { - if (inner_literal_tuple->value.getType() == Field::Types::Tuple) - partition_ast_fields_count = inner_literal_tuple->value.safeGet().size(); - else - partition_ast_fields_count = 1; - } - else - { - throw Exception( - ErrorCodes::INVALID_PARTITION_VALUE, "Expected tuple for complex partition key, got {}", tuple_ast->name); - } - } - else - throw Exception(ErrorCodes::INVALID_PARTITION_VALUE, "Expected tuple for complex partition key, got {}", tuple_ast->name); - } - else - { - const auto * arguments_ast = 
tuple_ast->arguments->as(); - if (arguments_ast) - partition_ast_fields_count = arguments_ast->children.size(); - else - partition_ast_fields_count = 0; - } - } - else - { - throw Exception( - ErrorCodes::INVALID_PARTITION_VALUE, "Expected literal or tuple for partition key, got {}", partition_value_ast->getID()); - } - } - else - { - partition_ast_fields_count = *partition_ast.fields_count; - } + auto partition_ast_fields_count = getPartitionAstFieldsCount(partition_ast, partition_value_ast); if (format_version < MERGE_TREE_DATA_MIN_FORMAT_VERSION_WITH_CUSTOM_PARTITIONING) { @@ -7014,23 +7006,35 @@ MergeTreeData & MergeTreeData::checkStructureAndGetMergeTreeData(IStorage & sour if (my_snapshot->getColumns().getAllPhysical().sizeOfDifference(src_snapshot->getColumns().getAllPhysical())) throw Exception(ErrorCodes::INCOMPATIBLE_COLUMNS, "Tables have different structure"); - auto query_to_string = [] (const ASTPtr & ast) - { - return ast ? queryToString(ast) : ""; - }; - - if (query_to_string(my_snapshot->getSortingKeyAST()) != query_to_string(src_snapshot->getSortingKeyAST())) + if (queryToStringNullable(my_snapshot->getSortingKeyAST()) != queryToStringNullable(src_snapshot->getSortingKeyAST())) throw Exception(ErrorCodes::BAD_ARGUMENTS, "Tables have different ordering"); - if (query_to_string(my_snapshot->getPartitionKeyAST()) != query_to_string(src_snapshot->getPartitionKeyAST())) - throw Exception(ErrorCodes::BAD_ARGUMENTS, "Tables have different partition key"); - if (format_version != src_data->format_version) throw Exception(ErrorCodes::BAD_ARGUMENTS, "Tables have different format_version"); - if (query_to_string(my_snapshot->getPrimaryKeyAST()) != query_to_string(src_snapshot->getPrimaryKeyAST())) + if (queryToStringNullable(my_snapshot->getPrimaryKeyAST()) != queryToStringNullable(src_snapshot->getPrimaryKeyAST())) throw Exception(ErrorCodes::BAD_ARGUMENTS, "Tables have different primary key"); + const auto is_a_subset_of = [](const auto & lhs, const auto & rhs) + { + if (lhs.size() > rhs.size()) + return false; + + const auto rhs_set = NameSet(rhs.begin(), rhs.end()); + for (const auto & lhs_element : lhs) + if (!rhs_set.contains(lhs_element)) + return false; + + return true; + }; + + if (!is_a_subset_of(my_snapshot->getColumnsRequiredForPartitionKey(), src_snapshot->getColumnsRequiredForPartitionKey())) + { + throw Exception( + ErrorCodes::BAD_ARGUMENTS, + "Destination table partition expression columns must be a subset of source table partition expression columns"); + } + const auto check_definitions = [](const auto & my_descriptions, const auto & src_descriptions) { if (my_descriptions.size() != src_descriptions.size()) @@ -7071,128 +7075,56 @@ std::pair MergeTreeData::cloneAn const ReadSettings & read_settings, const WriteSettings & write_settings) { - /// Check that the storage policy contains the disk where the src_part is located. 
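The subset check above also gives the rejection case a crisp shape: if the destination's partition expression needs a column the source's does not partition by, the attach fails before any part is cloned. A counter-sketch using the test suite's error annotation (schemas illustrative):

```sql
CREATE TABLE src_by_key (ts DateTime, key Int64)
ENGINE = MergeTree PARTITION BY key ORDER BY key;

CREATE TABLE dst_by_ts (ts DateTime, key Int64)
ENGINE = MergeTree PARTITION BY toYYYYMM(ts) ORDER BY key;

-- dst partitions by ts, which src's partition expression does not use.
ALTER TABLE dst_by_ts ATTACH PARTITION 1 FROM src_by_key; -- { serverError BAD_ARGUMENTS }
```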
- bool does_storage_policy_allow_same_disk = false; - for (const DiskPtr & disk : getStoragePolicy()->getDisks()) - { - if (disk->getName() == src_part->getDataPartStorage().getDiskName()) - { - does_storage_policy_allow_same_disk = true; - break; - } - } - if (!does_storage_policy_allow_same_disk) - throw Exception( - ErrorCodes::BAD_ARGUMENTS, - "Could not clone and load part {} because disk does not belong to storage policy", - quoteString(src_part->getDataPartStorage().getFullPath())); + return MergeTreeDataPartCloner::clone( + this, src_part, metadata_snapshot, dst_part_info, tmp_part_prefix, require_part_metadata, params, read_settings, write_settings); +} - String dst_part_name = src_part->getNewName(dst_part_info); - String tmp_dst_part_name = tmp_part_prefix + dst_part_name; - auto temporary_directory_lock = getTemporaryPartDirectoryHolder(tmp_dst_part_name); +std::pair MergeTreeData::cloneAndLoadPartOnSameDiskWithDifferentPartitionKey( + const MergeTreeData::DataPartPtr & src_part, + const MergeTreePartition & new_partition, + const String & partition_id, + const IMergeTreeDataPart::MinMaxIndex & min_max_index, + const String & tmp_part_prefix, + const StorageMetadataPtr & my_metadata_snapshot, + const IDataPartStorage::ClonePartParams & clone_params, + ContextPtr local_context, + Int64 min_block, + Int64 max_block +) +{ + MergeTreePartInfo dst_part_info(partition_id, min_block, max_block, src_part->info.level); - /// Why it is needed if we only hardlink files? - auto reservation = src_part->getDataPartStorage().reserve(src_part->getBytesOnDisk()); - auto src_part_storage = src_part->getDataPartStoragePtr(); + return MergeTreeDataPartCloner::cloneWithDistinctPartitionExpression( + this, + src_part, + my_metadata_snapshot, + dst_part_info, + tmp_part_prefix, + local_context->getReadSettings(), + local_context->getWriteSettings(), + new_partition, + min_max_index, + false, + clone_params); +} - scope_guard src_flushed_tmp_dir_lock; - MergeTreeData::MutableDataPartPtr src_flushed_tmp_part; +std::pair MergeTreeData::createPartitionAndMinMaxIndexFromSourcePart( + const MergeTreeData::DataPartPtr & src_part, + const StorageMetadataPtr & metadata_snapshot, + ContextPtr local_context) +{ + const auto & src_data = src_part->storage; - /// If source part is in memory, flush it to disk and clone it already in on-disk format - /// Protect tmp dir from removing by cleanup thread with src_flushed_tmp_dir_lock - /// Construct src_flushed_tmp_part in order to delete part with its directory at destructor - if (auto src_part_in_memory = asInMemoryPart(src_part)) - { - auto flushed_part_path = *src_part_in_memory->getRelativePathForPrefix(tmp_part_prefix); + auto metadata_manager = std::make_shared(src_part.get()); + IMergeTreeDataPart::MinMaxIndex min_max_index; - auto tmp_src_part_file_name = fs::path(tmp_dst_part_name).filename(); - src_flushed_tmp_dir_lock = src_part->storage.getTemporaryPartDirectoryHolder(tmp_src_part_file_name); + min_max_index.load(src_data, metadata_manager); - auto flushed_part_storage = src_part_in_memory->flushToDisk(flushed_part_path, metadata_snapshot); + MergeTreePartition new_partition; - src_flushed_tmp_part = MergeTreeDataPartBuilder(*this, src_part->name, flushed_part_storage) - .withPartInfo(src_part->info) - .withPartFormatFromDisk() - .build(); + new_partition.create(metadata_snapshot, min_max_index.getBlock(src_data), 0u, local_context); - src_flushed_tmp_part->is_temp = true; - src_part_storage = flushed_part_storage; - } - - String with_copy; - if 
(params.copy_instead_of_hardlink) - with_copy = " (copying data)"; - - auto dst_part_storage = src_part_storage->freeze( - relative_data_path, - tmp_dst_part_name, - read_settings, - write_settings, - /* save_metadata_callback= */ {}, - params); - - if (params.metadata_version_to_write.has_value()) - { - chassert(!params.keep_metadata_version); - auto out_metadata = dst_part_storage->writeFile(IMergeTreeDataPart::METADATA_VERSION_FILE_NAME, 4096, getContext()->getWriteSettings()); - writeText(metadata_snapshot->getMetadataVersion(), *out_metadata); - out_metadata->finalize(); - if (getSettings()->fsync_after_insert) - out_metadata->sync(); - } - - LOG_DEBUG(log, "Clone{} part {} to {}{}", - src_flushed_tmp_part ? " flushed" : "", - src_part_storage->getFullPath(), - std::string(fs::path(dst_part_storage->getFullRootPath()) / tmp_dst_part_name), - with_copy); - - auto dst_data_part = MergeTreeDataPartBuilder(*this, dst_part_name, dst_part_storage) - .withPartFormatFromDisk() - .build(); - - if (!params.copy_instead_of_hardlink && params.hardlinked_files) - { - params.hardlinked_files->source_part_name = src_part->name; - params.hardlinked_files->source_table_shared_id = src_part->storage.getTableSharedID(); - - for (auto it = src_part->getDataPartStorage().iterate(); it->isValid(); it->next()) - { - if (!params.files_to_copy_instead_of_hardlinks.contains(it->name()) - && it->name() != IMergeTreeDataPart::DELETE_ON_DESTROY_MARKER_FILE_NAME_DEPRECATED - && it->name() != IMergeTreeDataPart::TXN_VERSION_METADATA_FILE_NAME) - { - params.hardlinked_files->hardlinks_from_source_part.insert(it->name()); - } - } - - auto projections = src_part->getProjectionParts(); - for (const auto & [name, projection_part] : projections) - { - const auto & projection_storage = projection_part->getDataPartStorage(); - for (auto it = projection_storage.iterate(); it->isValid(); it->next()) - { - auto file_name_with_projection_prefix = fs::path(projection_storage.getPartDirectory()) / it->name(); - if (!params.files_to_copy_instead_of_hardlinks.contains(file_name_with_projection_prefix) - && it->name() != IMergeTreeDataPart::DELETE_ON_DESTROY_MARKER_FILE_NAME_DEPRECATED - && it->name() != IMergeTreeDataPart::TXN_VERSION_METADATA_FILE_NAME) - { - params.hardlinked_files->hardlinks_from_source_part.insert(file_name_with_projection_prefix); - } - } - } - } - - /// We should write version metadata on part creation to distinguish it from parts that were created without transaction. - TransactionID tid = params.txn ? 
params.txn->tid : Tx::PrehistoricTID; - dst_data_part->version.setCreationTID(tid, nullptr); - dst_data_part->storeVersionMetadata(); - - dst_data_part->is_temp = true; - - dst_data_part->loadColumnsChecksumsIndexes(require_part_metadata, true); - dst_data_part->modification_time = dst_part_storage->getLastModified().epochTime(); - return std::make_pair(dst_data_part, std::move(temporary_directory_lock)); + return {new_partition, min_max_index}; } String MergeTreeData::getFullPathOnDisk(const DiskPtr & disk) const diff --git a/src/Storages/MergeTree/MergeTreeData.h b/src/Storages/MergeTree/MergeTreeData.h index f0dbaf0e307..9c433e11b84 100644 --- a/src/Storages/MergeTree/MergeTreeData.h +++ b/src/Storages/MergeTree/MergeTreeData.h @@ -231,6 +231,7 @@ public: } }; + using DataParts = std::set; using MutableDataParts = std::set; using DataPartsVector = std::vector; @@ -848,6 +849,23 @@ public: const ReadSettings & read_settings, const WriteSettings & write_settings); + std::pair cloneAndLoadPartOnSameDiskWithDifferentPartitionKey( + const MergeTreeData::DataPartPtr & src_part, + const MergeTreePartition & new_partition, + const String & partition_id, + const IMergeTreeDataPart::MinMaxIndex & min_max_index, + const String & tmp_part_prefix, + const StorageMetadataPtr & my_metadata_snapshot, + const IDataPartStorage::ClonePartParams & clone_params, + ContextPtr local_context, + Int64 min_block, + Int64 max_block); + + static std::pair createPartitionAndMinMaxIndexFromSourcePart( + const MergeTreeData::DataPartPtr & src_part, + const StorageMetadataPtr & metadata_snapshot, + ContextPtr local_context); + virtual std::vector getMutationsStatus() const = 0; /// Returns true if table can create new parts with adaptive granularity diff --git a/src/Storages/MergeTree/MergeTreeDataPartCloner.cpp b/src/Storages/MergeTree/MergeTreeDataPartCloner.cpp new file mode 100644 index 00000000000..78cb9aa0624 --- /dev/null +++ b/src/Storages/MergeTree/MergeTreeDataPartCloner.cpp @@ -0,0 +1,320 @@ +#include +#include +#include +#include +#include +#include + +namespace DB +{ + +namespace ErrorCodes +{ +extern const int BAD_ARGUMENTS; +} + +static Poco::Logger * log = &Poco::Logger::get("MergeTreeDataPartCloner"); + +namespace DistinctPartitionExpression +{ +std::unique_ptr updatePartitionFile( + const MergeTreeData & merge_tree_data, + const MergeTreePartition & partition, + const MergeTreeData::MutableDataPartPtr & dst_part, + IDataPartStorage & storage) +{ + storage.removeFile("partition.dat"); + // Leverage already implemented MergeTreePartition::store to create & store partition.dat. + // Checksum is re-calculated later. 
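
In SQL terms, the cloneAndLoadPartOnSameDiskWithDifferentPartitionKey entry point declared above is reached whenever the two tables' partition expressions differ; a sketch taken from the first case of the 02456 reference below:

    CREATE TABLE source (timestamp DateTime) ENGINE = MergeTree ORDER BY tuple() PARTITION BY toYYYYMMDD(timestamp);
    CREATE TABLE destination (timestamp DateTime) ENGINE = MergeTree ORDER BY tuple() PARTITION BY toYYYYMM(timestamp);
    INSERT INTO source VALUES ('2010-03-02 02:01:01'), ('2010-03-02 02:01:03');
    ALTER TABLE destination ATTACH PARTITION ID '20100302' FROM source;
    -- partition.dat and the minmax files of the cloned part are rewritten, so it lands in partition 201003:
    SELECT partition_id FROM system.parts WHERE table = 'destination' AND database = currentDatabase() AND active;
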
+ return partition.store(merge_tree_data, storage, dst_part->checksums); +} + +IMergeTreeDataPart::MinMaxIndex::WrittenFiles updateMinMaxFiles( + const MergeTreeData & merge_tree_data, + const MergeTreeData::MutableDataPartPtr & dst_part, + IDataPartStorage & storage, + const StorageMetadataPtr & metadata_snapshot) +{ + for (const auto & column_name : MergeTreeData::getMinMaxColumnsNames(metadata_snapshot->partition_key)) + { + auto file = "minmax_" + escapeForFileName(column_name) + ".idx"; + storage.removeFile(file); + } + + return dst_part->minmax_idx->store(merge_tree_data, storage, dst_part->checksums); +} + +void finalizeNewFiles(const std::vector> & files, bool sync_new_files) +{ + for (const auto & file : files) + { + file->finalize(); + if (sync_new_files) + file->sync(); + } +} + +void updateNewPartFiles( + const MergeTreeData & merge_tree_data, + const MergeTreeData::MutableDataPartPtr & dst_part, + const MergeTreePartition & new_partition, + const IMergeTreeDataPart::MinMaxIndex & new_min_max_index, + const StorageMetadataPtr & src_metadata_snapshot, + bool sync_new_files) +{ + auto & storage = dst_part->getDataPartStorage(); + + *dst_part->minmax_idx = new_min_max_index; + + auto partition_file = updatePartitionFile(merge_tree_data, new_partition, dst_part, storage); + + auto min_max_files = updateMinMaxFiles(merge_tree_data, dst_part, storage, src_metadata_snapshot); + + IMergeTreeDataPart::MinMaxIndex::WrittenFiles written_files; + + if (partition_file) + written_files.emplace_back(std::move(partition_file)); + + written_files.insert(written_files.end(), std::make_move_iterator(min_max_files.begin()), std::make_move_iterator(min_max_files.end())); + + finalizeNewFiles(written_files, sync_new_files); + + // MergeTreeDataPartCloner::finalize_part calls IMergeTreeDataPart::loadColumnsChecksumsIndexes, which will re-create + // the checksum file if it doesn't exist. Relying on that is cumbersome, but this refactoring is simply a code extraction + // with small improvements. It can be further improved in the future. 
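
Because checksums.txt is removed just below and regenerated by loadColumnsChecksumsIndexes during finalization, a quick SQL-level sanity check after such an attach can be useful (illustrative, assuming a destination table named destination):

    -- Re-reads the parts and verifies file sizes and checksums, including the regenerated checksums.txt:
    CHECK TABLE destination;
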
+ storage.removeFile("checksums.txt"); +} +} + +namespace +{ +bool doesStoragePolicyAllowSameDisk(MergeTreeData * merge_tree_data, const MergeTreeData::DataPartPtr & src_part) +{ + for (const DiskPtr & disk : merge_tree_data->getStoragePolicy()->getDisks()) + if (disk->getName() == src_part->getDataPartStorage().getDiskName()) + return true; + return false; +} + +DataPartStoragePtr flushPartStorageToDiskIfInMemory( + MergeTreeData * merge_tree_data, + const MergeTreeData::DataPartPtr & src_part, + const StorageMetadataPtr & metadata_snapshot, + const String & tmp_part_prefix, + const String & tmp_dst_part_name, + scope_guard & src_flushed_tmp_dir_lock, + MergeTreeData::MutableDataPartPtr src_flushed_tmp_part) +{ + if (auto src_part_in_memory = asInMemoryPart(src_part)) + { + auto flushed_part_path = src_part_in_memory->getRelativePathForPrefix(tmp_part_prefix); + auto tmp_src_part_file_name = fs::path(tmp_dst_part_name).filename(); + + src_flushed_tmp_dir_lock = src_part->storage.getTemporaryPartDirectoryHolder(tmp_src_part_file_name); + + auto flushed_part_storage = src_part_in_memory->flushToDisk(*flushed_part_path, metadata_snapshot); + + src_flushed_tmp_part = MergeTreeDataPartBuilder(*merge_tree_data, src_part->name, flushed_part_storage) + .withPartInfo(src_part->info) + .withPartFormatFromDisk() + .build(); + + src_flushed_tmp_part->is_temp = true; + + return flushed_part_storage; + } + + return src_part->getDataPartStoragePtr(); +} + +std::shared_ptr hardlinkAllFiles( + MergeTreeData * merge_tree_data, + const DB::ReadSettings & read_settings, + const DB::WriteSettings & write_settings, + const DataPartStoragePtr & storage, + const String & path, + const DB::IDataPartStorage::ClonePartParams & params) +{ + return storage->freeze( + merge_tree_data->getRelativeDataPath(), + path, + read_settings, + write_settings, + /*save_metadata_callback=*/{}, + params); +} + +std::pair cloneSourcePart( + MergeTreeData * merge_tree_data, + const MergeTreeData::DataPartPtr & src_part, + const StorageMetadataPtr & metadata_snapshot, + const MergeTreePartInfo & dst_part_info, + const String & tmp_part_prefix, + const ReadSettings & read_settings, + const WriteSettings & write_settings, + const DB::IDataPartStorage::ClonePartParams & params) +{ + const auto dst_part_name = src_part->getNewName(dst_part_info); + + const auto tmp_dst_part_name = tmp_part_prefix + dst_part_name; + + auto temporary_directory_lock = merge_tree_data->getTemporaryPartDirectoryHolder(tmp_dst_part_name); + + src_part->getDataPartStorage().reserve(src_part->getBytesOnDisk()); + + scope_guard src_flushed_tmp_dir_lock; + MergeTreeData::MutableDataPartPtr src_flushed_tmp_part; + + auto src_part_storage = flushPartStorageToDiskIfInMemory( + merge_tree_data, src_part, metadata_snapshot, tmp_part_prefix, tmp_dst_part_name, src_flushed_tmp_dir_lock, src_flushed_tmp_part); + + auto dst_part_storage = hardlinkAllFiles(merge_tree_data, read_settings, write_settings, src_part_storage, tmp_dst_part_name, params); + + if (params.metadata_version_to_write.has_value()) + { + chassert(!params.keep_metadata_version); + auto out_metadata = dst_part_storage->writeFile( + IMergeTreeDataPart::METADATA_VERSION_FILE_NAME, 4096, merge_tree_data->getContext()->getWriteSettings()); + writeText(metadata_snapshot->getMetadataVersion(), *out_metadata); + out_metadata->finalize(); + if (merge_tree_data->getSettings()->fsync_after_insert) + out_metadata->sync(); + } + + LOG_DEBUG( + log, + "Clone {} part {} to {}{}", + src_flushed_tmp_part ? 
"flushed" : "", + src_part_storage->getFullPath(), + std::string(fs::path(dst_part_storage->getFullRootPath()) / tmp_dst_part_name), + false); + + + auto part = MergeTreeDataPartBuilder(*merge_tree_data, dst_part_name, dst_part_storage).withPartFormatFromDisk().build(); + + return std::make_pair(part, std::move(temporary_directory_lock)); +} + +void handleHardLinkedParameterFiles(const MergeTreeData::DataPartPtr & src_part, const DB::IDataPartStorage::ClonePartParams & params) +{ + const auto & hardlinked_files = params.hardlinked_files; + + hardlinked_files->source_part_name = src_part->name; + hardlinked_files->source_table_shared_id = src_part->storage.getTableSharedID(); + + for (auto it = src_part->getDataPartStorage().iterate(); it->isValid(); it->next()) + { + if (!params.files_to_copy_instead_of_hardlinks.contains(it->name()) + && it->name() != IMergeTreeDataPart::DELETE_ON_DESTROY_MARKER_FILE_NAME_DEPRECATED + && it->name() != IMergeTreeDataPart::TXN_VERSION_METADATA_FILE_NAME) + { + hardlinked_files->hardlinks_from_source_part.insert(it->name()); + } + } +} + +void handleProjections(const MergeTreeData::DataPartPtr & src_part, const DB::IDataPartStorage::ClonePartParams & params) +{ + auto projections = src_part->getProjectionParts(); + for (const auto & [name, projection_part] : projections) + { + const auto & projection_storage = projection_part->getDataPartStorage(); + for (auto it = projection_storage.iterate(); it->isValid(); it->next()) + { + auto file_name_with_projection_prefix = fs::path(projection_storage.getPartDirectory()) / it->name(); + if (!params.files_to_copy_instead_of_hardlinks.contains(file_name_with_projection_prefix) + && it->name() != IMergeTreeDataPart::DELETE_ON_DESTROY_MARKER_FILE_NAME_DEPRECATED + && it->name() != IMergeTreeDataPart::TXN_VERSION_METADATA_FILE_NAME) + { + params.hardlinked_files->hardlinks_from_source_part.insert(file_name_with_projection_prefix); + } + } + } +} + +MergeTreeData::MutableDataPartPtr finalizePart( + const MergeTreeData::MutableDataPartPtr & dst_part, const DB::IDataPartStorage::ClonePartParams & params, bool require_part_metadata) +{ + /// We should write version metadata on part creation to distinguish it from parts that were created without transaction. + TransactionID tid = params.txn ? 
params.txn->tid : Tx::PrehistoricTID; + dst_part->version.setCreationTID(tid, nullptr); + dst_part->storeVersionMetadata(); + + dst_part->is_temp = true; + + dst_part->loadColumnsChecksumsIndexes(require_part_metadata, true); + + dst_part->modification_time = dst_part->getDataPartStorage().getLastModified().epochTime(); + + return dst_part; +} + +std::pair cloneAndHandleHardlinksAndProjections( + MergeTreeData * merge_tree_data, + const DataPartPtr & src_part, + const StorageMetadataPtr & metadata_snapshot, + const MergeTreePartInfo & dst_part_info, + const String & tmp_part_prefix, + const ReadSettings & read_settings, + const WriteSettings & write_settings, + const IDataPartStorage::ClonePartParams & params) +{ + if (!doesStoragePolicyAllowSameDisk(merge_tree_data, src_part)) + throw Exception( + ErrorCodes::BAD_ARGUMENTS, + "Could not clone and load part {} because disk does not belong to storage policy", + quoteString(src_part->getDataPartStorage().getFullPath())); + + auto [destination_part, temporary_directory_lock] = cloneSourcePart( + merge_tree_data, src_part, metadata_snapshot, dst_part_info, tmp_part_prefix, read_settings, write_settings, params); + + if (!params.copy_instead_of_hardlink && params.hardlinked_files) + { + handleHardLinkedParameterFiles(src_part, params); + handleProjections(src_part, params); + } + + return std::make_pair(destination_part, std::move(temporary_directory_lock)); +} +} + +std::pair MergeTreeDataPartCloner::clone( + MergeTreeData * merge_tree_data, + const DataPartPtr & src_part, + const StorageMetadataPtr & metadata_snapshot, + const MergeTreePartInfo & dst_part_info, + const String & tmp_part_prefix, + bool require_part_metadata, + const IDataPartStorage::ClonePartParams & params, + const ReadSettings & read_settings, + const WriteSettings & write_settings) +{ + auto [destination_part, temporary_directory_lock] = cloneAndHandleHardlinksAndProjections( + merge_tree_data, src_part, metadata_snapshot, dst_part_info, tmp_part_prefix, read_settings, write_settings, params); + + return std::make_pair(finalizePart(destination_part, params, require_part_metadata), std::move(temporary_directory_lock)); +} + +std::pair MergeTreeDataPartCloner::cloneWithDistinctPartitionExpression( + MergeTreeData * merge_tree_data, + const DataPartPtr & src_part, + const StorageMetadataPtr & metadata_snapshot, + const MergeTreePartInfo & dst_part_info, + const String & tmp_part_prefix, + const ReadSettings & read_settings, + const WriteSettings & write_settings, + const MergeTreePartition & new_partition, + const IMergeTreeDataPart::MinMaxIndex & new_min_max_index, + bool sync_new_files, + const IDataPartStorage::ClonePartParams & params) +{ + auto [destination_part, temporary_directory_lock] = cloneAndHandleHardlinksAndProjections( + merge_tree_data, src_part, metadata_snapshot, dst_part_info, tmp_part_prefix, read_settings, write_settings, params); + + DistinctPartitionExpression::updateNewPartFiles( + *merge_tree_data, destination_part, new_partition, new_min_max_index, src_part->storage.getInMemoryMetadataPtr(), sync_new_files); + + return std::make_pair(finalizePart(destination_part, params, false), std::move(temporary_directory_lock)); +} + +} diff --git a/src/Storages/MergeTree/MergeTreeDataPartCloner.h b/src/Storages/MergeTree/MergeTreeDataPartCloner.h new file mode 100644 index 00000000000..53585f20b7f --- /dev/null +++ b/src/Storages/MergeTree/MergeTreeDataPartCloner.h @@ -0,0 +1,43 @@ +#pragma once + +namespace DB +{ + +struct StorageInMemoryMetadata; +using 
StorageMetadataPtr = std::shared_ptr; +struct MergeTreePartition; +class IMergeTreeDataPart; + +class MergeTreeDataPartCloner +{ +public: + using DataPart = IMergeTreeDataPart; + using MutableDataPartPtr = std::shared_ptr; + using DataPartPtr = std::shared_ptr; + + static std::pair clone( + MergeTreeData * merge_tree_data, + const DataPartPtr & src_part, + const StorageMetadataPtr & metadata_snapshot, + const MergeTreePartInfo & dst_part_info, + const String & tmp_part_prefix, + bool require_part_metadata, + const IDataPartStorage::ClonePartParams & params, + const ReadSettings & read_settings, + const WriteSettings & write_settings); + + static std::pair cloneWithDistinctPartitionExpression( + MergeTreeData * merge_tree_data, + const DataPartPtr & src_part, + const StorageMetadataPtr & metadata_snapshot, + const MergeTreePartInfo & dst_part_info, + const String & tmp_part_prefix, + const ReadSettings & read_settings, + const WriteSettings & write_settings, + const MergeTreePartition & new_partition, + const IMergeTreeDataPart::MinMaxIndex & new_min_max_index, + bool sync_new_files, + const IDataPartStorage::ClonePartParams & params); +}; + +} diff --git a/src/Storages/MergeTree/MergeTreePartition.cpp b/src/Storages/MergeTree/MergeTreePartition.cpp index ddeaf69136a..76ef3be25b3 100644 --- a/src/Storages/MergeTree/MergeTreePartition.cpp +++ b/src/Storages/MergeTree/MergeTreePartition.cpp @@ -467,6 +467,45 @@ void MergeTreePartition::create(const StorageMetadataPtr & metadata_snapshot, Bl } } +void MergeTreePartition::createAndValidateMinMaxPartitionIds( + const StorageMetadataPtr & metadata_snapshot, Block block_with_min_max_partition_ids, ContextPtr context) +{ + if (!metadata_snapshot->hasPartitionKey()) + return; + + auto partition_key_names_and_types = executePartitionByExpression(metadata_snapshot, block_with_min_max_partition_ids, context); + value.resize(partition_key_names_and_types.size()); + + /// Executing partition_by expression adds new columns to passed block according to partition functions. + /// The block is passed by reference and is used afterwards. `moduloLegacy` needs to be substituted back + /// with just `modulo`, because it was a temporary substitution. + static constexpr std::string_view modulo_legacy_function_name = "moduloLegacy"; + + size_t i = 0; + for (const auto & element : partition_key_names_and_types) + { + auto & partition_column = block_with_min_max_partition_ids.getByName(element.name); + + if (element.name.starts_with(modulo_legacy_function_name)) + partition_column.name.replace(0, modulo_legacy_function_name.size(), "modulo"); + + Field extracted_min_partition_id_field; + Field extracted_max_partition_id_field; + + partition_column.column->get(0, extracted_min_partition_id_field); + partition_column.column->get(1, extracted_max_partition_id_field); + + if (extracted_min_partition_id_field != extracted_max_partition_id_field) + { + throw Exception( + ErrorCodes::INVALID_PARTITION_VALUE, + "Can not create the partition. 
A partition can not contain values that have different partition ids"); + } + + partition_column.column->get(0u, value[i++]); + } +} + NamesAndTypesList MergeTreePartition::executePartitionByExpression(const StorageMetadataPtr & metadata_snapshot, Block & block, ContextPtr context) { auto adjusted_partition_key = adjustPartitionKey(metadata_snapshot, context); diff --git a/src/Storages/MergeTree/MergeTreePartition.h b/src/Storages/MergeTree/MergeTreePartition.h index 78b141f26ec..fd7ae02cde4 100644 --- a/src/Storages/MergeTree/MergeTreePartition.h +++ b/src/Storages/MergeTree/MergeTreePartition.h @@ -1,11 +1,12 @@ #pragma once -#include +#include #include #include #include #include -#include +#include +#include namespace DB { @@ -51,6 +52,11 @@ public: void create(const StorageMetadataPtr & metadata_snapshot, Block block, size_t row, ContextPtr context); + /// Copy of MergeTreePartition::create, but also validates if min max partition keys are equal. If they are different, + /// it means the partition can't be created because the data doesn't belong to the same partition. + void createAndValidateMinMaxPartitionIds( + const StorageMetadataPtr & metadata_snapshot, Block block_with_min_max_partition_ids, ContextPtr context); + static void appendFiles(const MergeTreeData & storage, Strings & files); /// Adjust partition key and execute its expression on block. Return sample block according to used expression. diff --git a/src/Storages/MergeTree/MergeTreePartitionCompatibilityVerifier.cpp b/src/Storages/MergeTree/MergeTreePartitionCompatibilityVerifier.cpp new file mode 100644 index 00000000000..21bcdb84a96 --- /dev/null +++ b/src/Storages/MergeTree/MergeTreePartitionCompatibilityVerifier.cpp @@ -0,0 +1,91 @@ +#include +#include +#include +#include +#include + +namespace DB +{ + +namespace ErrorCodes +{ +extern const int BAD_ARGUMENTS; +} + +namespace +{ +bool isDestinationPartitionExpressionMonotonicallyIncreasing( + const std::vector & hyperrectangle, const MergeTreeData & destination_storage) +{ + auto destination_table_metadata = destination_storage.getInMemoryMetadataPtr(); + + auto key_description = destination_table_metadata->getPartitionKey(); + auto definition_ast = key_description.definition_ast->clone(); + + auto table_identifier = std::make_shared(destination_storage.getStorageID().getTableName()); + auto table_with_columns + = TableWithColumnNamesAndTypes{DatabaseAndTableWithAlias(table_identifier), destination_table_metadata->getColumns().getOrdinary()}; + + auto expression_list = extractKeyExpressionList(definition_ast); + + MonotonicityCheckVisitor::Data data{{table_with_columns}, destination_storage.getContext(), /*group_by_function_hashes*/ {}}; + + for (auto i = 0u; i < expression_list->children.size(); i++) + { + data.range = hyperrectangle[i]; + + MonotonicityCheckVisitor(data).visit(expression_list->children[i]); + + if (!data.monotonicity.is_monotonic || !data.monotonicity.is_positive) + return false; + } + + return true; +} + +bool isExpressionDirectSubsetOf(const ASTPtr source, const ASTPtr destination) +{ + auto source_expression_list = extractKeyExpressionList(source); + auto destination_expression_list = extractKeyExpressionList(destination); + + std::unordered_set source_columns; + + for (auto i = 0u; i < source_expression_list->children.size(); ++i) + source_columns.insert(source_expression_list->children[i]->getColumnName()); + + for (auto i = 0u; i < destination_expression_list->children.size(); ++i) + if 
(!source_columns.contains(destination_expression_list->children[i]->getColumnName()))
+            return false;
+
+    return true;
+}
+}
+
+void MergeTreePartitionCompatibilityVerifier::verify(
+    const MergeTreeData & source_storage, const MergeTreeData & destination_storage, const DataPartsVector & source_parts)
+{
+    const auto source_metadata = source_storage.getInMemoryMetadataPtr();
+    const auto destination_metadata = destination_storage.getInMemoryMetadataPtr();
+
+    const auto source_partition_key_ast = source_metadata->getPartitionKeyAST();
+    const auto destination_partition_key_ast = destination_metadata->getPartitionKeyAST();
+
+    // If destination partition expression columns are a subset of source partition expression columns,
+    // there is no need to check for monotonicity.
+    if (isExpressionDirectSubsetOf(source_partition_key_ast, destination_partition_key_ast))
+        return;
+
+    const auto src_global_min_max_indexes = MergeTreePartitionGlobalMinMaxIdxCalculator::calculate(source_parts, destination_storage);
+
+    assert(!src_global_min_max_indexes.hyperrectangle.empty());
+
+    if (!isDestinationPartitionExpressionMonotonicallyIncreasing(src_global_min_max_indexes.hyperrectangle, destination_storage))
+        throw DB::Exception(ErrorCodes::BAD_ARGUMENTS, "Destination table partition expression is not monotonically increasing");
+
+    MergeTreePartition().createAndValidateMinMaxPartitionIds(
+        destination_storage.getInMemoryMetadataPtr(),
+        src_global_min_max_indexes.getBlock(destination_storage),
+        destination_storage.getContext());
+}
+
+}
diff --git a/src/Storages/MergeTree/MergeTreePartitionCompatibilityVerifier.h b/src/Storages/MergeTree/MergeTreePartitionCompatibilityVerifier.h
new file mode 100644
index 00000000000..1682add3ebd
--- /dev/null
+++ b/src/Storages/MergeTree/MergeTreePartitionCompatibilityVerifier.h
@@ -0,0 +1,30 @@
+#pragma once
+
+#include <Storages/MergeTree/IMergeTreeDataPart.h>
+#include <Storages/MergeTree/MergeTreeData.h>
+
+namespace DB
+{
+
+/*
+ * Verifies that source and destination partitions are compatible.
+ * To be compatible, one of the following criteria must be met:
+ * 1. Destination partition expression columns are a subset of source partition expression columns; or
+ * 2. Destination partition expression is monotonic on the source global min_max idx range AND the computed partition id for
+ *    the source global min_max idx range is the same.
+ *
+ * If not, an exception is thrown.
+ * */ + +class MergeTreePartitionCompatibilityVerifier +{ +public: + using DataPart = IMergeTreeDataPart; + using DataPartPtr = std::shared_ptr; + using DataPartsVector = std::vector; + + static void + verify(const MergeTreeData & source_storage, const MergeTreeData & destination_storage, const DataPartsVector & source_parts); +}; + +} diff --git a/src/Storages/MergeTree/MergeTreePartitionGlobalMinMaxIdxCalculator.cpp b/src/Storages/MergeTree/MergeTreePartitionGlobalMinMaxIdxCalculator.cpp new file mode 100644 index 00000000000..0871efadf0c --- /dev/null +++ b/src/Storages/MergeTree/MergeTreePartitionGlobalMinMaxIdxCalculator.cpp @@ -0,0 +1,25 @@ +#include + +namespace DB +{ + +IMergeTreeDataPart::MinMaxIndex +MergeTreePartitionGlobalMinMaxIdxCalculator::calculate(const DataPartsVector & parts, const MergeTreeData & storage) +{ + IMergeTreeDataPart::MinMaxIndex global_min_max_indexes; + + for (const auto & part : parts) + { + auto metadata_manager = std::make_shared(part.get()); + + auto local_min_max_index = MergeTreeData::DataPart::MinMaxIndex(); + + local_min_max_index.load(storage, metadata_manager); + + global_min_max_indexes.merge(local_min_max_index); + } + + return global_min_max_indexes; +} + +} diff --git a/src/Storages/MergeTree/MergeTreePartitionGlobalMinMaxIdxCalculator.h b/src/Storages/MergeTree/MergeTreePartitionGlobalMinMaxIdxCalculator.h new file mode 100644 index 00000000000..4f271177246 --- /dev/null +++ b/src/Storages/MergeTree/MergeTreePartitionGlobalMinMaxIdxCalculator.h @@ -0,0 +1,24 @@ +#pragma once + +#include + +#include +#include + +namespace DB +{ + +/* + * Calculates global min max indexes for a given set of parts on given storage. + * */ +class MergeTreePartitionGlobalMinMaxIdxCalculator +{ + using DataPart = IMergeTreeDataPart; + using DataPartPtr = std::shared_ptr; + using DataPartsVector = std::vector; + +public: + static IMergeTreeDataPart::MinMaxIndex calculate(const DataPartsVector & parts, const MergeTreeData & storage); +}; + +} diff --git a/src/Storages/StorageMergeTree.cpp b/src/Storages/StorageMergeTree.cpp index 4761ccd8b58..fd5354a00a9 100644 --- a/src/Storages/StorageMergeTree.cpp +++ b/src/Storages/StorageMergeTree.cpp @@ -5,9 +5,9 @@ #include #include -#include #include #include +#include #include "Common/Exception.h" #include #include @@ -20,25 +20,30 @@ #include #include #include +#include #include -#include #include #include #include #include #include -#include #include +#include +#include #include #include #include -#include -#include -#include -#include -#include #include +#include +#include +#include +#include +#include +#include #include +#include +#include +#include #include #include #include @@ -2039,41 +2044,73 @@ void StorageMergeTree::replacePartitionFrom(const StoragePtr & source_table, con ProfileEventsScope profile_events_scope; MergeTreeData & src_data = checkStructureAndGetMergeTreeData(source_table, source_metadata_snapshot, my_metadata_snapshot); - String partition_id = getPartitionIDFromQuery(partition, local_context); + String partition_id = src_data.getPartitionIDFromQuery(partition, local_context); DataPartsVector src_parts = src_data.getVisibleDataPartsVectorInPartition(local_context, partition_id); + + bool attach_empty_partition = !replace && src_parts.empty(); + if (attach_empty_partition) + return; + MutableDataPartsVector dst_parts; std::vector dst_parts_locks; static const String TMP_PREFIX = "tmp_replace_from_"; - for (const DataPartPtr & src_part : src_parts) + const auto my_partition_expression = 
my_metadata_snapshot->getPartitionKeyAST(); + const auto src_partition_expression = source_metadata_snapshot->getPartitionKeyAST(); + const auto is_partition_exp_different = queryToStringNullable(my_partition_expression) != queryToStringNullable(src_partition_expression); + + if (is_partition_exp_different && !src_parts.empty()) + MergeTreePartitionCompatibilityVerifier::verify(src_data, /* destination_storage */ *this, src_parts); + + for (DataPartPtr & src_part : src_parts) { if (!canReplacePartition(src_part)) throw Exception(ErrorCodes::BAD_ARGUMENTS, "Cannot replace partition '{}' because part '{}' has inconsistent granularity with table", partition_id, src_part->name); - /// This will generate unique name in scope of current server process. - Int64 temp_index = insert_increment.get(); - MergeTreePartInfo dst_part_info(partition_id, temp_index, temp_index, src_part->info.level); - IDataPartStorage::ClonePartParams clone_params{.txn = local_context->getCurrentTransaction()}; - auto [dst_part, part_lock] = cloneAndLoadDataPartOnSameDisk( - src_part, - TMP_PREFIX, - dst_part_info, - my_metadata_snapshot, - clone_params, - local_context->getReadSettings(), - local_context->getWriteSettings()); - dst_parts.emplace_back(std::move(dst_part)); - dst_parts_locks.emplace_back(std::move(part_lock)); - } + /// This will generate unique name in scope of current server process. + auto index = insert_increment.get(); - /// ATTACH empty part set - if (!replace && dst_parts.empty()) - return; + if (is_partition_exp_different) + { + auto [new_partition, new_min_max_index] = createPartitionAndMinMaxIndexFromSourcePart( + src_part, my_metadata_snapshot, local_context); + + auto [dst_part, part_lock] = cloneAndLoadPartOnSameDiskWithDifferentPartitionKey( + src_part, + new_partition, + new_partition.getID(*this), + new_min_max_index, + TMP_PREFIX, + my_metadata_snapshot, + clone_params, + local_context, + index, + index); + + dst_parts.emplace_back(std::move(dst_part)); + dst_parts_locks.emplace_back(std::move(part_lock)); + } + else + { + MergeTreePartInfo dst_part_info(partition_id, index, index, src_part->info.level); + + auto [dst_part, part_lock] = cloneAndLoadDataPartOnSameDisk( + src_part, + TMP_PREFIX, + dst_part_info, + my_metadata_snapshot, + clone_params, + local_context->getReadSettings(), + local_context->getWriteSettings()); + dst_parts.emplace_back(std::move(dst_part)); + dst_parts_locks.emplace_back(std::move(part_lock)); + } + } MergeTreePartInfo drop_range; if (replace) diff --git a/src/Storages/StorageReplicatedMergeTree.cpp b/src/Storages/StorageReplicatedMergeTree.cpp index f7e6783dbc2..512811e39d7 100644 --- a/src/Storages/StorageReplicatedMergeTree.cpp +++ b/src/Storages/StorageReplicatedMergeTree.cpp @@ -26,22 +26,21 @@ #include -#include #include #include #include #include #include -#include #include #include -#include #include #include #include #include #include +#include #include +#include #include #include #include @@ -53,9 +52,11 @@ #include #include #include +#include #include #include #include +#include #include #include @@ -2713,16 +2714,48 @@ bool StorageReplicatedMergeTree::executeReplaceRange(LogEntry & entry) .copy_instead_of_hardlink = storage_settings_ptr->always_use_copy_instead_of_hardlinks || ((our_zero_copy_enabled || source_zero_copy_enabled) && part_desc->src_table_part->isStoredOnRemoteDiskWithZeroCopySupport()), .metadata_version_to_write = metadata_snapshot->getMetadataVersion() }; - auto [res_part, temporary_part_lock] = cloneAndLoadDataPartOnSameDisk( - 
part_desc->src_table_part, - TMP_PREFIX + "clone_", - part_desc->new_part_info, - metadata_snapshot, - clone_params, - getContext()->getReadSettings(), - getContext()->getWriteSettings()); - part_desc->res_part = std::move(res_part); - part_desc->temporary_part_lock = std::move(temporary_part_lock); + + const auto my_partition_expression = metadata_snapshot->getPartitionKeyAST(); + const auto src_partition_expression = source_table->getInMemoryMetadataPtr()->getPartitionKeyAST(); + + const auto is_partition_exp_different = queryToStringNullable(my_partition_expression) != queryToStringNullable(src_partition_expression); + + if (is_partition_exp_different) + { + auto [new_partition, new_min_max_index] = createPartitionAndMinMaxIndexFromSourcePart( + part_desc->src_table_part, metadata_snapshot, getContext()); + + auto partition_id = new_partition.getID(*this); + + auto [res_part, temporary_part_lock] = cloneAndLoadPartOnSameDiskWithDifferentPartitionKey( + part_desc->src_table_part, + new_partition, + partition_id, + new_min_max_index, + TMP_PREFIX + "clone_", + metadata_snapshot, + clone_params, + getContext(), + part_desc->new_part_info.min_block, + part_desc->new_part_info.max_block); + + part_desc->res_part = std::move(res_part); + part_desc->temporary_part_lock = std::move(temporary_part_lock); + } + else + { + auto [res_part, temporary_part_lock] = cloneAndLoadDataPartOnSameDisk( + part_desc->src_table_part, + TMP_PREFIX + "clone_", + part_desc->new_part_info, + metadata_snapshot, + clone_params, + getContext()->getReadSettings(), + getContext()->getWriteSettings()); + + part_desc->res_part = std::move(res_part); + part_desc->temporary_part_lock = std::move(temporary_part_lock); + } } else if (!part_desc->replica.empty()) { @@ -7852,11 +7885,22 @@ void StorageReplicatedMergeTree::replacePartitionFrom( ProfileEventsScope profile_events_scope; MergeTreeData & src_data = checkStructureAndGetMergeTreeData(source_table, source_metadata_snapshot, metadata_snapshot); - String partition_id = getPartitionIDFromQuery(partition, query_context); + String partition_id = src_data.getPartitionIDFromQuery(partition, query_context); /// NOTE: Some covered parts may be missing in src_all_parts if corresponding log entries are not executed yet. 
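
The compatibility check invoked below rejects attaches whose data cannot stay in a single destination partition: a non-monotonic destination expression fails with BAD_ARGUMENTS, and split partition ids fail inside createAndValidateMinMaxPartitionIds. A sketch of the latter, assuming the INVALID_PARTITION_VALUE error shown earlier:

    CREATE TABLE source (timestamp DateTime, A Int64) ENGINE = MergeTree ORDER BY timestamp PARTITION BY intDiv(A, 6);
    CREATE TABLE destination (timestamp DateTime, A Int64) ENGINE = MergeTree ORDER BY timestamp PARTITION BY A;
    -- A = 1 and A = 2 share source partition 0 but map to different destination partitions:
    INSERT INTO source VALUES ('2010-03-02 02:01:01', 1), ('2010-03-02 02:01:03', 2);
    ALTER TABLE destination ATTACH PARTITION ID '0' FROM source; -- { serverError INVALID_PARTITION_VALUE }
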
DataPartsVector src_all_parts = src_data.getVisibleDataPartsVectorInPartition(query_context, partition_id); + bool attach_empty_partition = !replace && src_all_parts.empty(); + if (attach_empty_partition) + return; + + const auto my_partition_expression = metadata_snapshot->getPartitionKeyAST(); + const auto src_partition_expression = source_metadata_snapshot->getPartitionKeyAST(); + const auto is_partition_exp_different = queryToStringNullable(my_partition_expression) != queryToStringNullable(src_partition_expression); + + if (is_partition_exp_different && !src_all_parts.empty()) + MergeTreePartitionCompatibilityVerifier::verify(src_data, /* destination_storage */ *this, src_all_parts); + LOG_DEBUG(log, "Cloning {} parts", src_all_parts.size()); static const String TMP_PREFIX = "tmp_replace_from_"; @@ -7911,6 +7955,18 @@ void StorageReplicatedMergeTree::replacePartitionFrom( "Cannot replace partition '{}' because part '{}" "' has inconsistent granularity with table", partition_id, src_part->name); + IMergeTreeDataPart::MinMaxIndex min_max_index = *src_part->minmax_idx; + MergeTreePartition merge_tree_partition = src_part->partition; + + if (is_partition_exp_different) + { + auto [new_partition, new_min_max_index] = createPartitionAndMinMaxIndexFromSourcePart(src_part, metadata_snapshot, query_context); + + merge_tree_partition = new_partition; + min_max_index = new_min_max_index; + partition_id = merge_tree_partition.getID(*this); + } + String hash_hex = src_part->checksums.getTotalChecksumHex(); const bool is_duplicated_part = replaced_parts.contains(hash_hex); replaced_parts.insert(hash_hex); @@ -7929,27 +7985,52 @@ void StorageReplicatedMergeTree::replacePartitionFrom( continue; } - UInt64 index = lock->getNumber(); - MergeTreePartInfo dst_part_info(partition_id, index, index, src_part->info.level); - bool zero_copy_enabled = storage_settings_ptr->allow_remote_fs_zero_copy_replication || dynamic_cast(source_table.get())->getSettings()->allow_remote_fs_zero_copy_replication; + + UInt64 index = lock->getNumber(); + IDataPartStorage::ClonePartParams clone_params { .copy_instead_of_hardlink = storage_settings_ptr->always_use_copy_instead_of_hardlinks || (zero_copy_enabled && src_part->isStoredOnRemoteDiskWithZeroCopySupport()), .metadata_version_to_write = metadata_snapshot->getMetadataVersion() }; - auto [dst_part, part_lock] = cloneAndLoadDataPartOnSameDisk( - src_part, - TMP_PREFIX, - dst_part_info, - metadata_snapshot, - clone_params, - query_context->getReadSettings(), - query_context->getWriteSettings()); + + if (is_partition_exp_different) + { + auto [dst_part, part_lock] = cloneAndLoadPartOnSameDiskWithDifferentPartitionKey( + src_part, + merge_tree_partition, + partition_id, + min_max_index, + TMP_PREFIX, + metadata_snapshot, + clone_params, + query_context, + index, + index); + + dst_parts.emplace_back(dst_part); + dst_parts_locks.emplace_back(std::move(part_lock)); + } + else + { + MergeTreePartInfo dst_part_info(partition_id, index, index, src_part->info.level); + + auto [dst_part, part_lock] = cloneAndLoadDataPartOnSameDisk( + src_part, + TMP_PREFIX, + dst_part_info, + metadata_snapshot, + clone_params, + query_context->getReadSettings(), + query_context->getWriteSettings()); + + dst_parts.emplace_back(dst_part); + dst_parts_locks.emplace_back(std::move(part_lock)); + } + src_parts.emplace_back(src_part); - dst_parts.emplace_back(dst_part); - dst_parts_locks.emplace_back(std::move(part_lock)); ephemeral_locks.emplace_back(std::move(*lock)); 
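
The replicated flow above is exercised end-to-end by the integration test added below; in SQL terms, assuming the same DDL is run on every replica:

    CREATE TABLE source (timestamp DateTime)
    ENGINE = ReplicatedMergeTree('/clickhouse/tables/1/source', 'replica1')
    ORDER BY tuple() PARTITION BY toYYYYMMDD(timestamp);

    CREATE TABLE destination (timestamp DateTime)
    ENGINE = ReplicatedMergeTree('/clickhouse/tables/1/destination', 'replica1')
    ORDER BY tuple() PARTITION BY toYYYYMM(timestamp);

    INSERT INTO source VALUES ('2010-03-02 02:01:01');
    ALTER TABLE destination ATTACH PARTITION ID '20100302' FROM source;
    -- Other replicas replay the REPLACE_RANGE log entry; executeReplaceRange recomputes the
    -- partition and min-max index because the partition expressions differ.
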
block_id_paths.emplace_back(block_id_path); part_checksums.emplace_back(hash_hex); diff --git a/tests/integration/test_attach_partition_distinct_expression_replicated/__init__.py b/tests/integration/test_attach_partition_distinct_expression_replicated/__init__.py new file mode 100644 index 00000000000..e69de29bb2d diff --git a/tests/integration/test_attach_partition_distinct_expression_replicated/configs/remote_servers.xml b/tests/integration/test_attach_partition_distinct_expression_replicated/configs/remote_servers.xml new file mode 100644 index 00000000000..b40730e9f7d --- /dev/null +++ b/tests/integration/test_attach_partition_distinct_expression_replicated/configs/remote_servers.xml @@ -0,0 +1,17 @@ + + + + + true + + replica1 + 9000 + + + replica2 + 9000 + + + + + diff --git a/tests/integration/test_attach_partition_distinct_expression_replicated/test.py b/tests/integration/test_attach_partition_distinct_expression_replicated/test.py new file mode 100644 index 00000000000..1d8ac4e9e37 --- /dev/null +++ b/tests/integration/test_attach_partition_distinct_expression_replicated/test.py @@ -0,0 +1,214 @@ +import pytest +from helpers.cluster import ClickHouseCluster +from helpers.test_tools import assert_eq_with_retry + +cluster = ClickHouseCluster(__file__) + +replica1 = cluster.add_instance( + "replica1", with_zookeeper=True, main_configs=["configs/remote_servers.xml"] +) +replica2 = cluster.add_instance( + "replica2", with_zookeeper=True, main_configs=["configs/remote_servers.xml"] +) + + +@pytest.fixture(scope="module") +def start_cluster(): + try: + cluster.start() + yield cluster + except Exception as ex: + print(ex) + finally: + cluster.shutdown() + + +def cleanup(nodes): + for node in nodes: + node.query("DROP TABLE IF EXISTS source SYNC") + node.query("DROP TABLE IF EXISTS destination SYNC") + + +def create_table(node, table_name, replicated): + replica = node.name + engine = ( + f"ReplicatedMergeTree('/clickhouse/tables/1/{table_name}', '{replica}')" + if replicated + else "MergeTree()" + ) + partition_expression = ( + "toYYYYMMDD(timestamp)" if table_name == "source" else "toYYYYMM(timestamp)" + ) + node.query_with_retry( + """ + CREATE TABLE {table_name}(timestamp DateTime) + ENGINE = {engine} + ORDER BY tuple() PARTITION BY {partition_expression} + SETTINGS cleanup_delay_period=1, cleanup_delay_period_random_add=1, max_cleanup_delay_period=1; + """.format( + table_name=table_name, + engine=engine, + partition_expression=partition_expression, + ) + ) + + +def test_both_replicated(start_cluster): + for node in [replica1, replica2]: + create_table(node, "source", True) + create_table(node, "destination", True) + + replica1.query("INSERT INTO source VALUES ('2010-03-02 02:01:01')") + replica1.query("SYSTEM SYNC REPLICA source") + replica1.query("SYSTEM SYNC REPLICA destination") + replica1.query( + f"ALTER TABLE destination ATTACH PARTITION ID '20100302' FROM source" + ) + + assert_eq_with_retry( + replica1, f"SELECT * FROM destination", "2010-03-02 02:01:01\n" + ) + assert_eq_with_retry( + replica1, + f"SELECT * FROM destination", + replica2.query(f"SELECT * FROM destination"), + ) + + cleanup([replica1, replica2]) + + +def test_only_destination_replicated(start_cluster): + create_table(replica1, "source", False) + create_table(replica1, "destination", True) + create_table(replica2, "destination", True) + + replica1.query("INSERT INTO source VALUES ('2010-03-02 02:01:01')") + replica1.query("SYSTEM SYNC REPLICA destination") + replica1.query( + f"ALTER TABLE destination ATTACH 
PARTITION ID '20100302' FROM source" + ) + + assert_eq_with_retry( + replica1, f"SELECT * FROM destination", "2010-03-02 02:01:01\n" + ) + assert_eq_with_retry( + replica1, + f"SELECT * FROM destination", + replica2.query(f"SELECT * FROM destination"), + ) + + cleanup([replica1, replica2]) + + +def test_both_replicated_partitioned_to_unpartitioned(start_cluster): + def create_tables(nodes): + for node in nodes: + source_engine = ( + f"ReplicatedMergeTree('/clickhouse/tables/1/source', '{node.name}')" + ) + node.query( + """ + CREATE TABLE source(timestamp DateTime) + ENGINE = {engine} + ORDER BY tuple() PARTITION BY toYYYYMMDD(timestamp) + SETTINGS cleanup_delay_period=1, cleanup_delay_period_random_add=1, max_cleanup_delay_period=1; + """.format( + engine=source_engine, + ) + ) + + destination_engine = f"ReplicatedMergeTree('/clickhouse/tables/1/destination', '{node.name}')" + node.query( + """ + CREATE TABLE destination(timestamp DateTime) + ENGINE = {engine} + ORDER BY tuple() PARTITION BY tuple() + SETTINGS cleanup_delay_period=1, cleanup_delay_period_random_add=1, max_cleanup_delay_period=1; + """.format( + engine=destination_engine, + ) + ) + + create_tables([replica1, replica2]) + + replica1.query("INSERT INTO source VALUES ('2010-03-02 02:01:01')") + replica1.query("INSERT INTO source VALUES ('2010-03-03 02:01:01')") + replica1.query("SYSTEM SYNC REPLICA source") + replica1.query("SYSTEM SYNC REPLICA destination") + + replica1.query( + f"ALTER TABLE destination ATTACH PARTITION ID '20100302' FROM source" + ) + replica1.query( + f"ALTER TABLE destination ATTACH PARTITION ID '20100303' FROM source" + ) + + assert_eq_with_retry( + replica1, + f"SELECT * FROM destination ORDER BY timestamp", + "2010-03-02 02:01:01\n2010-03-03 02:01:01\n", + ) + assert_eq_with_retry( + replica1, + f"SELECT * FROM destination ORDER BY timestamp", + replica2.query(f"SELECT * FROM destination ORDER BY timestamp"), + ) + + cleanup([replica1, replica2]) + + +def test_both_replicated_different_exp_same_id(start_cluster): + def create_tables(nodes): + for node in nodes: + source_engine = ( + f"ReplicatedMergeTree('/clickhouse/tables/1/source', '{node.name}')" + ) + node.query( + """ + CREATE TABLE source(a UInt16,b UInt16,c UInt16,extra UInt64,Path String,Time DateTime,Value Float64,Timestamp Int64,sign Int8) + ENGINE = {engine} + ORDER BY tuple() PARTITION BY a % 3 + SETTINGS cleanup_delay_period=1, cleanup_delay_period_random_add=1, max_cleanup_delay_period=1; + """.format( + engine=source_engine, + ) + ) + + destination_engine = f"ReplicatedMergeTree('/clickhouse/tables/1/destination', '{node.name}')" + node.query( + """ + CREATE TABLE destination(a UInt16,b UInt16,c UInt16,extra UInt64,Path String,Time DateTime,Value Float64,Timestamp Int64,sign Int8) + ENGINE = {engine} + ORDER BY tuple() PARTITION BY a + SETTINGS cleanup_delay_period=1, cleanup_delay_period_random_add=1, max_cleanup_delay_period=1; + """.format( + engine=destination_engine, + ) + ) + + create_tables([replica1, replica2]) + + replica1.query( + "INSERT INTO source (a, b, c, extra, sign) VALUES (1, 5, 9, 1000, 1)" + ) + replica1.query( + "INSERT INTO source (a, b, c, extra, sign) VALUES (2, 6, 10, 1000, 1)" + ) + replica1.query("SYSTEM SYNC REPLICA source") + replica1.query("SYSTEM SYNC REPLICA destination") + + replica1.query(f"ALTER TABLE destination ATTACH PARTITION 1 FROM source") + replica1.query(f"ALTER TABLE destination ATTACH PARTITION 2 FROM source") + + assert_eq_with_retry( + replica1, + f"SELECT * FROM destination ORDER BY a", + 
"1\t5\t9\t1000\t\t1970-01-01 00:00:00\t0\t0\t1\n2\t6\t10\t1000\t\t1970-01-01 00:00:00\t0\t0\t1\n", + ) + assert_eq_with_retry( + replica1, + f"SELECT * FROM destination ORDER BY a", + replica2.query(f"SELECT * FROM destination ORDER BY a"), + ) + + cleanup([replica1, replica2]) diff --git a/tests/queries/0_stateless/02456_test_attach_partition_different_partition_exp.reference b/tests/queries/0_stateless/02456_test_attach_partition_different_partition_exp.reference new file mode 100644 index 00000000000..f1d036b08bf --- /dev/null +++ b/tests/queries/0_stateless/02456_test_attach_partition_different_partition_exp.reference @@ -0,0 +1,467 @@ +-- { echoOn } +-- Should be allowed since destination partition expr is monotonically increasing and compatible +DROP TABLE IF EXISTS source; +DROP TABLE IF EXISTS destination; +CREATE TABLE source (timestamp DateTime) engine=MergeTree ORDER BY tuple() PARTITION BY toYYYYMMDD(timestamp); +CREATE TABLE destination (timestamp DateTime) engine=MergeTree ORDER BY tuple() PARTITION BY toYYYYMM(timestamp); +INSERT INTO TABLE source VALUES ('2010-03-02 02:01:01'), ('2010-03-02 02:01:03'); +ALTER TABLE destination ATTACH PARTITION ID '20100302' FROM source; +SELECT * FROM source ORDER BY timestamp; +2010-03-02 02:01:01 +2010-03-02 02:01:03 +SELECT * FROM destination ORDER BY timestamp; +2010-03-02 02:01:01 +2010-03-02 02:01:03 +SELECT partition_id FROM system.parts where table='destination' AND database = currentDatabase() AND active = 1; +201003 +TRUNCATE TABLE destination; +ALTER TABLE destination ATTACH PARTITION '20100302' FROM source; +SELECT * FROM source ORDER BY timestamp; +2010-03-02 02:01:01 +2010-03-02 02:01:03 +SELECT * FROM destination ORDER BY timestamp; +2010-03-02 02:01:01 +2010-03-02 02:01:03 +SELECT partition_id FROM system.parts where table='destination' AND database = currentDatabase() AND active = 1; +201003 +-- Should be allowed since destination partition expr is monotonically increasing and compatible. Note that even though +-- the destination partition expression is more granular, the data would still fall in the same partition. 
Thus, it is valid +DROP TABLE IF EXISTS source; +DROP TABLE IF EXISTS destination; +CREATE TABLE source (timestamp DateTime) engine=MergeTree ORDER BY tuple() PARTITION BY toYYYYMM(timestamp); +CREATE TABLE destination (timestamp DateTime) engine=MergeTree ORDER BY tuple() PARTITION BY toYYYYMMDD(timestamp); +INSERT INTO TABLE source VALUES ('2010-03-02 02:01:01'), ('2010-03-02 02:01:03'); +ALTER TABLE destination ATTACH PARTITION ID '201003' FROM source; +SELECT * FROM source ORDER BY timestamp; +2010-03-02 02:01:01 +2010-03-02 02:01:03 +SELECT * FROM destination ORDER BY timestamp; +2010-03-02 02:01:01 +2010-03-02 02:01:03 +SELECT partition_id FROM system.parts where table='destination' AND database = currentDatabase() AND active = 1; +20100302 +TRUNCATE TABLE destination; +ALTER TABLE destination ATTACH PARTITION '201003' FROM source; +SELECT * FROM source ORDER BY timestamp; +2010-03-02 02:01:01 +2010-03-02 02:01:03 +SELECT * FROM destination ORDER BY timestamp; +2010-03-02 02:01:01 +2010-03-02 02:01:03 +SELECT partition_id FROM system.parts where table='destination' AND database = currentDatabase() AND active = 1; +20100302 +-- Should be allowed since destination partition expr is monotonically increasing and compatible for those specific values +DROP TABLE IF EXISTS source; +DROP TABLE IF EXISTS destination; +CREATE TABLE source (timestamp DateTime, A Int64) engine=MergeTree ORDER BY timestamp PARTITION BY intDiv(A, 6); +CREATE TABLE destination (timestamp DateTime, A Int64) engine=MergeTree ORDER BY timestamp PARTITION BY A; +INSERT INTO TABLE source VALUES ('2010-03-02 02:01:01', 1), ('2010-03-02 02:01:03', 1); +ALTER TABLE destination ATTACH PARTITION ID '0' FROM source; +SELECT * FROM source ORDER BY timestamp; +2010-03-02 02:01:01 1 +2010-03-02 02:01:03 1 +SELECT * FROM destination ORDER BY timestamp; +2010-03-02 02:01:01 1 +2010-03-02 02:01:03 1 +SELECT partition_id FROM system.parts where table='destination' AND database = currentDatabase() AND active = 1; +1 +TRUNCATE TABLE destination; +ALTER TABLE destination ATTACH PARTITION 0 FROM source; +SELECT * FROM source ORDER BY timestamp; +2010-03-02 02:01:01 1 +2010-03-02 02:01:03 1 +SELECT * FROM destination ORDER BY timestamp; +2010-03-02 02:01:01 1 +2010-03-02 02:01:03 1 +SELECT partition_id FROM system.parts where table='destination' AND database = currentDatabase() AND active = 1; +1 +-- Should be allowed because dst partition exp is monot inc and data is not split +DROP TABLE IF EXISTS source; +DROP TABLE IF EXISTS destination; +CREATE TABLE source (productName String, category String) engine=MergeTree ORDER BY tuple() PARTITION BY cityHash64(category); +CREATE TABLE destination (productName String, category String) engine=MergeTree ORDER BY tuple() PARTITION BY toString(category); +INSERT INTO TABLE source VALUES ('spaghetti', 'food'), ('mop', 'general'); +INSERT INTO TABLE source VALUES ('rice', 'food'); +ALTER TABLE destination ATTACH PARTITION ID '17908065610379824077' from source; +SELECT * FROM source ORDER BY productName; +mop general +rice food +spaghetti food +SELECT * FROM destination ORDER BY productName; +rice food +spaghetti food +SELECT partition_id FROM system.parts where table='destination' AND database = currentDatabase() AND active = 1; +59532f3c39a412a413f0f014c7750a9d +59532f3c39a412a413f0f014c7750a9d +TRUNCATE TABLE destination; +ALTER TABLE destination ATTACH PARTITION '17908065610379824077' from source; +SELECT * FROM source ORDER BY productName; +mop general +rice food +spaghetti food +SELECT * FROM 
destination ORDER BY productName; +rice food +spaghetti food +SELECT partition_id FROM system.parts where table='destination' AND database = currentDatabase() AND active = 1; +59532f3c39a412a413f0f014c7750a9d +59532f3c39a412a413f0f014c7750a9d +-- Should be allowed, extra test case to validate https://github.com/ClickHouse/ClickHouse/pull/39507#issuecomment-1747574133 + +DROP TABLE IF EXISTS source; +DROP TABLE IF EXISTS destination; +CREATE TABLE source (timestamp Int64) engine=MergeTree ORDER BY (timestamp) PARTITION BY intDiv(timestamp, 86400000); +CREATE TABLE destination (timestamp Int64) engine=MergeTree ORDER BY (timestamp) PARTITION BY toYear(toDateTime(intDiv(timestamp, 1000))); +INSERT INTO TABLE source VALUES (1267495261123); +ALTER TABLE destination ATTACH PARTITION ID '14670' FROM source; +SELECT * FROM source ORDER BY timestamp; +1267495261123 +SELECT * FROM destination ORDER BY timestamp; +1267495261123 +SELECT partition_id FROM system.parts where table='destination' AND database = currentDatabase() AND active = 1; +2010 +TRUNCATE TABLE destination; +ALTER TABLE destination ATTACH PARTITION '14670' from source; +SELECT * FROM source ORDER BY timestamp; +1267495261123 +SELECT * FROM destination ORDER BY timestamp; +1267495261123 +SELECT partition_id FROM system.parts where table='destination' AND database = currentDatabase() AND active = 1; +2010 +-- Should be allowed, extra test case to validate https://github.com/ClickHouse/ClickHouse/pull/39507#issuecomment-1747511726 + +DROP TABLE IF EXISTS source; +DROP TABLE IF EXISTS destination; +CREATE TABLE source (timestamp DateTime('UTC'), key Int64, f Float64) engine=MergeTree ORDER BY (key, timestamp) PARTITION BY toYear(timestamp); +CREATE TABLE destination (timestamp DateTime('UTC'), key Int64, f Float64) engine=MergeTree ORDER BY (key, timestamp) PARTITION BY (intDiv(toUInt32(timestamp),86400)); +INSERT INTO TABLE source VALUES ('2010-03-02 02:01:01',1,1),('2010-03-02 02:01:01',1,1),('2011-02-02 02:01:03',1,1); +ALTER TABLE destination ATTACH PARTITION ID '2010' FROM source; +SELECT * FROM source ORDER BY timestamp; +2010-03-02 02:01:01 1 1 +2010-03-02 02:01:01 1 1 +2011-02-02 02:01:03 1 1 +SELECT * FROM destination ORDER BY timestamp; +2010-03-02 02:01:01 1 1 +2010-03-02 02:01:01 1 1 +SELECT partition_id FROM system.parts where table='destination' AND database = currentDatabase() AND active = 1; +14670 +TRUNCATE TABLE destination; +ALTER TABLE destination ATTACH PARTITION '2010' from source; +SELECT * FROM source ORDER BY timestamp; +2010-03-02 02:01:01 1 1 +2010-03-02 02:01:01 1 1 +2011-02-02 02:01:03 1 1 +SELECT * FROM destination ORDER BY timestamp; +2010-03-02 02:01:01 1 1 +2010-03-02 02:01:01 1 1 +SELECT partition_id FROM system.parts where table='destination' AND database = currentDatabase() AND active = 1; +14670 +-- Should be allowed, partitioned table to unpartitioned. Since the destination is unpartitioned, parts would ultimately +-- fall into the same partition. +-- Destination partition by expression is omitted, which causes StorageMetadata::getPartitionKeyAST() to be nullptr. 
+DROP TABLE IF EXISTS source; +DROP TABLE IF EXISTS destination; +CREATE TABLE source (timestamp DateTime) engine=MergeTree ORDER BY tuple() PARTITION BY toYYYYMM(timestamp); +CREATE TABLE destination (timestamp DateTime) engine=MergeTree ORDER BY tuple(); +INSERT INTO TABLE source VALUES ('2010-03-02 02:01:01'), ('2010-03-02 02:01:03'); +ALTER TABLE destination ATTACH PARTITION ID '201003' FROM source; +SELECT * FROM source ORDER BY timestamp; +2010-03-02 02:01:01 +2010-03-02 02:01:03 +SELECT * FROM destination ORDER BY timestamp; +2010-03-02 02:01:01 +2010-03-02 02:01:03 +SELECT partition_id FROM system.parts where table='destination' AND database = currentDatabase() AND active = 1; +all +TRUNCATE TABLE destination; +ALTER TABLE destination ATTACH PARTITION '201003' from source; +SELECT * FROM source ORDER BY timestamp; +2010-03-02 02:01:01 +2010-03-02 02:01:03 +SELECT * FROM destination ORDER BY timestamp; +2010-03-02 02:01:01 +2010-03-02 02:01:03 +SELECT partition_id FROM system.parts where table='destination' AND database = currentDatabase() AND active = 1; +all +-- Same as above, but destination partition by expression is explicitly defined. Test case required to validate that +-- partition by tuple() is accepted. +DROP TABLE IF EXISTS source; +DROP TABLE IF EXISTS destination; +CREATE TABLE source (timestamp DateTime) engine=MergeTree ORDER BY tuple() PARTITION BY toYYYYMM(timestamp); +CREATE TABLE destination (timestamp DateTime) engine=MergeTree ORDER BY tuple() PARTITION BY tuple(); +INSERT INTO TABLE source VALUES ('2010-03-02 02:01:01'), ('2010-03-02 02:01:03'); +ALTER TABLE destination ATTACH PARTITION ID '201003' FROM source; +SELECT * FROM source ORDER BY timestamp; +2010-03-02 02:01:01 +2010-03-02 02:01:03 +SELECT * FROM destination ORDER BY timestamp; +2010-03-02 02:01:01 +2010-03-02 02:01:03 +SELECT partition_id FROM system.parts where table='destination' AND database = currentDatabase() AND active = 1; +all +TRUNCATE TABLE destination; +ALTER TABLE destination ATTACH PARTITION '201003' from source; +SELECT * FROM source ORDER BY timestamp; +2010-03-02 02:01:01 +2010-03-02 02:01:03 +SELECT * FROM destination ORDER BY timestamp; +2010-03-02 02:01:01 +2010-03-02 02:01:03 +SELECT partition_id FROM system.parts where table='destination' AND database = currentDatabase() AND active = 1; +all +-- Should be allowed because the destination partition expression columns are a subset of the source partition expression columns +-- Columns in this case refer to the expression elements, not to the actual table columns +DROP TABLE IF EXISTS source; +DROP TABLE IF EXISTS destination; +CREATE TABLE source (a Int, b Int, c Int) engine=MergeTree ORDER BY tuple() PARTITION BY (a, b, c); +CREATE TABLE destination (a Int, b Int, c Int) engine=MergeTree ORDER BY tuple() PARTITION BY (a, b); +INSERT INTO TABLE source VALUES (1, 2, 3), (1, 2, 4); +ALTER TABLE destination ATTACH PARTITION ID '1-2-3' FROM source; +SELECT * FROM source ORDER BY (a, b, c); +1 2 3 +1 2 4 +SELECT * FROM destination ORDER BY (a, b, c); +1 2 3 +SELECT partition_id FROM system.parts where table='destination' AND database = currentDatabase() AND active = 1; +1-2 +TRUNCATE TABLE destination; +ALTER TABLE destination ATTACH PARTITION (1, 2, 3) from source; +SELECT * FROM source ORDER BY (a, b, c); +1 2 3 +1 2 4 +SELECT * FROM destination ORDER BY (a, b, c); +1 2 3 +SELECT partition_id FROM system.parts where table='destination' AND database = currentDatabase() AND active = 1; +1-2 +-- Should be allowed because the destination 
partition expression columns are a subset of the source partition expression columns +-- Columns in this case refer to the expression elements, not to the actual table columns +DROP TABLE IF EXISTS source; +DROP TABLE IF EXISTS destination; +CREATE TABLE source (a Int, b Int, c Int) engine=MergeTree ORDER BY tuple() PARTITION BY (a, b, c); +CREATE TABLE destination (a Int, b Int, c Int) engine=MergeTree ORDER BY tuple() PARTITION BY a; +INSERT INTO TABLE source VALUES (1, 2, 3), (1, 2, 4); +ALTER TABLE destination ATTACH PARTITION ID '1-2-3' FROM source; +SELECT * FROM source ORDER BY (a, b, c); +1 2 3 +1 2 4 +SELECT * FROM destination ORDER BY (a, b, c); +1 2 3 +SELECT partition_id FROM system.parts where table='destination' AND database = currentDatabase() AND active = 1; +1 +TRUNCATE TABLE destination; +ALTER TABLE destination ATTACH PARTITION (1, 2, 3) from source; +SELECT * FROM source ORDER BY (a, b, c); +1 2 3 +1 2 4 +SELECT * FROM destination ORDER BY (a, b, c); +1 2 3 +SELECT partition_id FROM system.parts where table='destination' AND database = currentDatabase() AND active = 1; +1 +-- Should be allowed. Special test case, tricky to explain. First column of source partition expression is +-- timestamp, while first column of destination partition expression is `A`. One of the previous implementations +-- would not match the columns, which could lead to `timestamp` min max being used to calculate monotonicity of `A`. +DROP TABLE IF EXISTS source; +DROP TABLE IF EXISTS destination; +CREATE TABLE source (`timestamp` DateTime, `A` Int64) ENGINE = MergeTree PARTITION BY tuple(toYYYYMM(timestamp), intDiv(A, 6)) ORDER BY timestamp; +CREATE TABLE destination (`timestamp` DateTime, `A` Int64) ENGINE = MergeTree PARTITION BY A ORDER BY timestamp; +INSERT INTO TABLE source VALUES ('2010-03-02 02:01:01', 5); +ALTER TABLE destination ATTACH PARTITION ID '201003-0' FROM source; +SELECT * FROM source ORDER BY timestamp; +2010-03-02 02:01:01 5 +SELECT * FROM destination ORDER BY timestamp; +2010-03-02 02:01:01 5 +SELECT partition_id FROM system.parts where table='destination' AND database = currentDatabase() AND active = 1; +5 +TRUNCATE TABLE destination; +ALTER TABLE destination ATTACH PARTITION (201003, 0) from source; +SELECT * FROM source ORDER BY timestamp; +2010-03-02 02:01:01 5 +SELECT * FROM destination ORDER BY timestamp; +2010-03-02 02:01:01 5 +SELECT partition_id FROM system.parts where table='destination' AND database = currentDatabase() AND active = 1; +5 +-- Should be allowed. Destination partition expression contains multiple expressions, but all of them are monotonically +-- increasing in the source partition min max indexes. 
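(Editor's aside, illustrative only, not part of the patch: for the tuple test below, the expected mapping can be verified by hand. The source part for partition (6, 12) has min = max = 6 for `A` and min = max = 12 for `B`, so each destination expression element is evaluated over a constant range and remains monotonic.)

SELECT intDiv(6, 2) AS a, intDiv(12, 2) AS b; -- 3, 6: the part lands in destination partition '3-6'
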
+DROP TABLE IF EXISTS source; +DROP TABLE IF EXISTS destination; +CREATE TABLE source (A Int, B Int) ENGINE = MergeTree PARTITION BY tuple(A, B) ORDER BY tuple(); +CREATE TABLE destination (A Int, B Int) ENGINE = MergeTree PARTITION BY tuple(intDiv(A, 2), intDiv(B, 2)) ORDER BY tuple(); +INSERT INTO TABLE source VALUES (6, 12); +ALTER TABLE destination ATTACH PARTITION ID '6-12' FROM source; +SELECT * FROM source ORDER BY A; +6 12 +SELECT * FROM destination ORDER BY A; +6 12 +SELECT partition_id FROM system.parts where table='destination' AND database = currentDatabase() AND active = 1; +3-6 +TRUNCATE TABLE destination; +ALTER TABLE destination ATTACH PARTITION (6, 12) from source; +SELECT * FROM source ORDER BY A; +6 12 +SELECT * FROM destination ORDER BY A; +6 12 +SELECT partition_id FROM system.parts where table='destination' AND database = currentDatabase() AND active = 1; +3-6 +-- Should be allowed. The same scenario as above, but partition expressions inverted. +DROP TABLE IF EXISTS source; +DROP TABLE IF EXISTS destination; +CREATE TABLE source (A Int, B Int) ENGINE = MergeTree PARTITION BY tuple(intDiv(A, 2), intDiv(B, 2)) ORDER BY tuple(); +CREATE TABLE destination (A Int, B Int) ENGINE = MergeTree PARTITION BY tuple(A, B) ORDER BY tuple(); +INSERT INTO TABLE source VALUES (6, 12); +ALTER TABLE destination ATTACH PARTITION ID '3-6' FROM source; +SELECT * FROM source ORDER BY A; +6 12 +SELECT * FROM destination ORDER BY A; +6 12 +SELECT partition_id FROM system.parts where table='destination' AND database = currentDatabase() AND active = 1; +6-12 +TRUNCATE TABLE destination; +ALTER TABLE destination ATTACH PARTITION (3, 6) from source; +SELECT * FROM source ORDER BY A; +6 12 +SELECT * FROM destination ORDER BY A; +6 12 +SELECT partition_id FROM system.parts where table='destination' AND database = currentDatabase() AND active = 1; +6-12 +-- Should be allowed, it is a local operation, no different than regular attach. Replicated to replicated. +DROP TABLE IF EXISTS source; +DROP TABLE IF EXISTS destination; +CREATE TABLE + source(timestamp DateTime) + ENGINE = ReplicatedMergeTree('/clickhouse/tables/{database}/test/source_replicated_to_replicated_distinct_expression', '1') + PARTITION BY toYYYYMMDD(timestamp) + ORDER BY tuple(); +CREATE TABLE + destination(timestamp DateTime) + ENGINE = ReplicatedMergeTree('/clickhouse/tables/{database}/test/destination_replicated_to_replicated_distinct_expression', '1') + PARTITION BY toYYYYMM(timestamp) + ORDER BY tuple(); +INSERT INTO TABLE source VALUES ('2010-03-02 02:01:01'), ('2010-03-02 02:01:03'); +ALTER TABLE destination ATTACH PARTITION ID '20100302' FROM source; +SELECT * FROM source ORDER BY timestamp; +2010-03-02 02:01:01 +2010-03-02 02:01:03 +SELECT * FROM destination ORDER BY timestamp; +2010-03-02 02:01:01 +2010-03-02 02:01:03 +SELECT partition_id FROM system.parts where table='destination' AND database = currentDatabase() AND active = 1; +201003 +TRUNCATE TABLE destination; +ALTER TABLE destination ATTACH PARTITION '20100302' from source; +SELECT * FROM source ORDER BY timestamp; +2010-03-02 02:01:01 +2010-03-02 02:01:03 +SELECT * FROM destination ORDER BY timestamp; +2010-03-02 02:01:01 +2010-03-02 02:01:03 +SELECT partition_id FROM system.parts where table='destination' AND database = currentDatabase() AND active = 1; +201003 +-- Should be allowed, it is a local operation, no different than regular attach. 
Non replicated to replicated +DROP TABLE IF EXISTS source SYNC; +DROP TABLE IF EXISTS destination SYNC; +CREATE TABLE source(timestamp DateTime) ENGINE = MergeTree() PARTITION BY toYYYYMMDD(timestamp) ORDER BY tuple(); +CREATE TABLE + destination(timestamp DateTime) + ENGINE = ReplicatedMergeTree('/clickhouse/tables/{database}/test/destination_non_replicated_to_replicated_distinct_expression', '1') + PARTITION BY toYYYYMM(timestamp) + ORDER BY tuple(); +INSERT INTO TABLE source VALUES ('2010-03-02 02:01:01'), ('2010-03-02 02:01:03'); +ALTER TABLE destination ATTACH PARTITION ID '20100302' FROM source; +SELECT * FROM source ORDER BY timestamp; +2010-03-02 02:01:01 +2010-03-02 02:01:03 +SELECT * FROM destination ORDER BY timestamp; +2010-03-02 02:01:01 +2010-03-02 02:01:03 +SELECT partition_id FROM system.parts where table='destination' AND database = currentDatabase() AND active = 1; +201003 +TRUNCATE TABLE destination; +ALTER TABLE destination ATTACH PARTITION '20100302' from source; +SELECT * FROM source ORDER BY timestamp; +2010-03-02 02:01:01 +2010-03-02 02:01:03 +SELECT * FROM destination ORDER BY timestamp; +2010-03-02 02:01:01 +2010-03-02 02:01:03 +SELECT partition_id FROM system.parts where table='destination' AND database = currentDatabase() AND active = 1; +201003 +-- Should not be allowed because data would be split into two different partitions +DROP TABLE IF EXISTS source SYNC; +DROP TABLE IF EXISTS destination SYNC; +CREATE TABLE source (timestamp DateTime) engine=MergeTree ORDER BY tuple() PARTITION BY toYYYYMM(timestamp); +CREATE TABLE destination (timestamp DateTime) engine=MergeTree ORDER BY tuple() PARTITION BY toYYYYMMDD(timestamp); +INSERT INTO TABLE source VALUES ('2010-03-02 02:01:01'), ('2010-03-03 02:01:03'); +ALTER TABLE destination ATTACH PARTITION ID '201003' FROM source; -- { serverError 248 } +ALTER TABLE destination ATTACH PARTITION '201003' from source; -- { serverError 248 } +-- Should not be allowed because data would be split into two different partitions +DROP TABLE IF EXISTS source; +DROP TABLE IF EXISTS destination; +CREATE TABLE source (timestamp DateTime, A Int64) engine=MergeTree ORDER BY timestamp PARTITION BY intDiv(A, 6); +CREATE TABLE destination (timestamp DateTime, A Int64) engine=MergeTree ORDER BY timestamp PARTITION BY A; +INSERT INTO TABLE source VALUES ('2010-03-02 02:01:01', 1), ('2010-03-02 02:01:03', 2); +ALTER TABLE destination ATTACH PARTITION ID '0' FROM source; -- { serverError 248 } +ALTER TABLE destination ATTACH PARTITION 0 FROM source; -- { serverError 248 } +-- Should not be allowed because dst partition exp takes more than two arguments, so it's not considered monotonically inc +DROP TABLE IF EXISTS source; +DROP TABLE IF EXISTS destination; +CREATE TABLE source (productName String, category String) engine=MergeTree ORDER BY tuple() PARTITION BY toString(category); +CREATE TABLE destination (productName String, category String) engine=MergeTree ORDER BY tuple() PARTITION BY substring(category, 1, 2); +INSERT INTO TABLE source VALUES ('spaghetti', 'food'), ('mop', 'general'); +INSERT INTO TABLE source VALUES ('rice', 'food'); +ALTER TABLE destination ATTACH PARTITION ID '4590ba78048910b74a47d5bfb308abed' from source; -- { serverError 36 } +ALTER TABLE destination ATTACH PARTITION 'food' from source; -- { serverError 36 } +-- Should not be allowed because dst partition exp depends on a different set of columns +DROP TABLE IF EXISTS source; +DROP TABLE IF EXISTS destination; +CREATE TABLE source (productName String, category 
String) engine=MergeTree ORDER BY tuple() PARTITION BY toString(category); +CREATE TABLE destination (productName String, category String) engine=MergeTree ORDER BY tuple() PARTITION BY toString(productName); +INSERT INTO TABLE source VALUES ('spaghetti', 'food'), ('mop', 'general'); +INSERT INTO TABLE source VALUES ('rice', 'food'); +ALTER TABLE destination ATTACH PARTITION ID '4590ba78048910b74a47d5bfb308abed' from source; -- { serverError 36 } +ALTER TABLE destination ATTACH PARTITION 'food' from source; -- { serverError 36 } +-- Should not be allowed because dst partition exp is not monotonically increasing +DROP TABLE IF EXISTS source; +DROP TABLE IF EXISTS destination; +CREATE TABLE source (productName String) engine=MergeTree ORDER BY tuple() PARTITION BY left(productName, 2); +CREATE TABLE destination (productName String) engine=MergeTree ORDER BY tuple() PARTITION BY cityHash64(productName); +INSERT INTO TABLE source VALUES ('bread'), ('mop'); +INSERT INTO TABLE source VALUES ('broccoli'); +ALTER TABLE destination ATTACH PARTITION ID '4589453b7ee96ce9de1265bd57674496' from source; -- { serverError 36 } +ALTER TABLE destination ATTACH PARTITION 'br' from source; -- { serverError 36 } +-- Empty/ non-existent partition, same partition expression. Nothing should happen +DROP TABLE IF EXISTS source; +DROP TABLE IF EXISTS destination; +CREATE TABLE source (timestamp DateTime) engine=MergeTree ORDER BY tuple() PARTITION BY toYYYYMM(timestamp); +CREATE TABLE destination (timestamp DateTime) engine=MergeTree ORDER BY tuple() PARTITION BY toYYYYMM(timestamp); +ALTER TABLE destination ATTACH PARTITION ID '1' FROM source; +ALTER TABLE destination ATTACH PARTITION 1 FROM source; +SELECT * FROM destination; +SELECT partition_id FROM system.parts where table='destination' AND database = currentDatabase() AND active = 1; +-- Empty/ non-existent partition, different partition expression. Nothing should happen +-- https://github.com/ClickHouse/ClickHouse/pull/39507#discussion_r1399839045 +DROP TABLE IF EXISTS source; +DROP TABLE IF EXISTS destination; +CREATE TABLE source (timestamp DateTime) engine=MergeTree ORDER BY tuple() PARTITION BY toYYYYMMDD(timestamp); +CREATE TABLE destination (timestamp DateTime) engine=MergeTree ORDER BY tuple() PARTITION BY toYYYYMM(timestamp); +ALTER TABLE destination ATTACH PARTITION ID '1' FROM source; +ALTER TABLE destination ATTACH PARTITION 1 FROM source; +SELECT * FROM destination; +SELECT partition_id FROM system.parts where table='destination' AND database = currentDatabase() AND active = 1; +-- Replace instead of attach. Empty/ non-existent partition, same partition expression. Nothing should happen +-- https://github.com/ClickHouse/ClickHouse/pull/39507#discussion_r1399839045 +DROP TABLE IF EXISTS source; +DROP TABLE IF EXISTS destination; +CREATE TABLE source (timestamp DateTime) engine=MergeTree ORDER BY tuple() PARTITION BY toYYYYMM(timestamp); +CREATE TABLE destination (timestamp DateTime) engine=MergeTree ORDER BY tuple() PARTITION BY toYYYYMM(timestamp); +ALTER TABLE destination REPLACE PARTITION '1' FROM source; +SELECT * FROM destination; +SELECT partition_id FROM system.parts where table='destination' AND database = currentDatabase() AND active = 1; +-- Replace instead of attach. Empty/ non-existent partition to non-empty partition, same partition id. 
+-- https://github.com/ClickHouse/ClickHouse/pull/39507#discussion_r1399839045 +DROP TABLE IF EXISTS source; +DROP TABLE IF EXISTS destination; +CREATE TABLE source (A Int) engine=MergeTree ORDER BY tuple() PARTITION BY A; +CREATE TABLE destination (A Int) engine=MergeTree ORDER BY tuple() PARTITION BY A; +INSERT INTO TABLE destination VALUES (1); +ALTER TABLE destination REPLACE PARTITION '1' FROM source; +SELECT * FROM destination; +SELECT partition_id FROM system.parts where table='destination' AND database = currentDatabase() AND active = 1; diff --git a/tests/queries/0_stateless/02456_test_attach_partition_different_partition_exp.sql b/tests/queries/0_stateless/02456_test_attach_partition_different_partition_exp.sql new file mode 100644 index 00000000000..9547d6ae249 --- /dev/null +++ b/tests/queries/0_stateless/02456_test_attach_partition_different_partition_exp.sql @@ -0,0 +1,485 @@ +-- { echoOn } +-- Should be allowed since destination partition expr is monotonically increasing and compatible +DROP TABLE IF EXISTS source; +DROP TABLE IF EXISTS destination; + +CREATE TABLE source (timestamp DateTime) engine=MergeTree ORDER BY tuple() PARTITION BY toYYYYMMDD(timestamp); +CREATE TABLE destination (timestamp DateTime) engine=MergeTree ORDER BY tuple() PARTITION BY toYYYYMM(timestamp); + +INSERT INTO TABLE source VALUES ('2010-03-02 02:01:01'), ('2010-03-02 02:01:03'); + +ALTER TABLE destination ATTACH PARTITION ID '20100302' FROM source; + +SELECT * FROM source ORDER BY timestamp; +SELECT * FROM destination ORDER BY timestamp; +SELECT partition_id FROM system.parts where table='destination' AND database = currentDatabase() AND active = 1; + +TRUNCATE TABLE destination; + +ALTER TABLE destination ATTACH PARTITION '20100302' FROM source; + +SELECT * FROM source ORDER BY timestamp; +SELECT * FROM destination ORDER BY timestamp; +SELECT partition_id FROM system.parts where table='destination' AND database = currentDatabase() AND active = 1; + +-- Should be allowed since destination partition expr is monotonically increasing and compatible. Note that even though +-- the destination partition expression is more granular, the data would still fall in the same partition. 
Thus, it is valid +DROP TABLE IF EXISTS source; +DROP TABLE IF EXISTS destination; + +CREATE TABLE source (timestamp DateTime) engine=MergeTree ORDER BY tuple() PARTITION BY toYYYYMM(timestamp); +CREATE TABLE destination (timestamp DateTime) engine=MergeTree ORDER BY tuple() PARTITION BY toYYYYMMDD(timestamp); + +INSERT INTO TABLE source VALUES ('2010-03-02 02:01:01'), ('2010-03-02 02:01:03'); + +ALTER TABLE destination ATTACH PARTITION ID '201003' FROM source; + +SELECT * FROM source ORDER BY timestamp; +SELECT * FROM destination ORDER BY timestamp; +SELECT partition_id FROM system.parts where table='destination' AND database = currentDatabase() AND active = 1; + +TRUNCATE TABLE destination; + +ALTER TABLE destination ATTACH PARTITION '201003' FROM source; + +SELECT * FROM source ORDER BY timestamp; +SELECT * FROM destination ORDER BY timestamp; +SELECT partition_id FROM system.parts where table='destination' AND database = currentDatabase() AND active = 1; + +-- Should be allowed since destination partition expr is monotonically increasing and compatible for those specific values +DROP TABLE IF EXISTS source; +DROP TABLE IF EXISTS destination; + +CREATE TABLE source (timestamp DateTime, A Int64) engine=MergeTree ORDER BY timestamp PARTITION BY intDiv(A, 6); + +CREATE TABLE destination (timestamp DateTime, A Int64) engine=MergeTree ORDER BY timestamp PARTITION BY A; + +INSERT INTO TABLE source VALUES ('2010-03-02 02:01:01', 1), ('2010-03-02 02:01:03', 1); + +ALTER TABLE destination ATTACH PARTITION ID '0' FROM source; + +SELECT * FROM source ORDER BY timestamp; +SELECT * FROM destination ORDER BY timestamp; +SELECT partition_id FROM system.parts where table='destination' AND database = currentDatabase() AND active = 1; + +TRUNCATE TABLE destination; + +ALTER TABLE destination ATTACH PARTITION 0 FROM source; + +SELECT * FROM source ORDER BY timestamp; +SELECT * FROM destination ORDER BY timestamp; +SELECT partition_id FROM system.parts where table='destination' AND database = currentDatabase() AND active = 1; + +-- Should be allowed because dst partition exp is monot inc and data is not split +DROP TABLE IF EXISTS source; +DROP TABLE IF EXISTS destination; + +CREATE TABLE source (productName String, category String) engine=MergeTree ORDER BY tuple() PARTITION BY cityHash64(category); +CREATE TABLE destination (productName String, category String) engine=MergeTree ORDER BY tuple() PARTITION BY toString(category); + +INSERT INTO TABLE source VALUES ('spaghetti', 'food'), ('mop', 'general'); +INSERT INTO TABLE source VALUES ('rice', 'food'); + +ALTER TABLE destination ATTACH PARTITION ID '17908065610379824077' from source; + +SELECT * FROM source ORDER BY productName; +SELECT * FROM destination ORDER BY productName; +SELECT partition_id FROM system.parts where table='destination' AND database = currentDatabase() AND active = 1; + +TRUNCATE TABLE destination; + +ALTER TABLE destination ATTACH PARTITION '17908065610379824077' from source; + +SELECT * FROM source ORDER BY productName; +SELECT * FROM destination ORDER BY productName; +SELECT partition_id FROM system.parts where table='destination' AND database = currentDatabase() AND active = 1; + +-- Should be allowed, extra test case to validate https://github.com/ClickHouse/ClickHouse/pull/39507#issuecomment-1747574133 + +DROP TABLE IF EXISTS source; +DROP TABLE IF EXISTS destination; + +CREATE TABLE source (timestamp Int64) engine=MergeTree ORDER BY (timestamp) PARTITION BY intDiv(timestamp, 86400000); +CREATE TABLE destination (timestamp 
Int64) engine=MergeTree ORDER BY (timestamp) PARTITION BY toYear(toDateTime(intDiv(timestamp, 1000))); + +INSERT INTO TABLE source VALUES (1267495261123); + +ALTER TABLE destination ATTACH PARTITION ID '14670' FROM source; + +SELECT * FROM source ORDER BY timestamp; +SELECT * FROM destination ORDER BY timestamp; +SELECT partition_id FROM system.parts where table='destination' AND database = currentDatabase() AND active = 1; + +TRUNCATE TABLE destination; + +ALTER TABLE destination ATTACH PARTITION '14670' from source; + +SELECT * FROM source ORDER BY timestamp; +SELECT * FROM destination ORDER BY timestamp; +SELECT partition_id FROM system.parts where table='destination' AND database = currentDatabase() AND active = 1; + +-- Should be allowed, extra test case to validate https://github.com/ClickHouse/ClickHouse/pull/39507#issuecomment-1747511726 + +DROP TABLE IF EXISTS source; +DROP TABLE IF EXISTS destination; + +CREATE TABLE source (timestamp DateTime('UTC'), key Int64, f Float64) engine=MergeTree ORDER BY (key, timestamp) PARTITION BY toYear(timestamp); +CREATE TABLE destination (timestamp DateTime('UTC'), key Int64, f Float64) engine=MergeTree ORDER BY (key, timestamp) PARTITION BY (intDiv(toUInt32(timestamp),86400)); + +INSERT INTO TABLE source VALUES ('2010-03-02 02:01:01',1,1),('2010-03-02 02:01:01',1,1),('2011-02-02 02:01:03',1,1); + +ALTER TABLE destination ATTACH PARTITION ID '2010' FROM source; + +SELECT * FROM source ORDER BY timestamp; +SELECT * FROM destination ORDER BY timestamp; +SELECT partition_id FROM system.parts where table='destination' AND database = currentDatabase() AND active = 1; + +TRUNCATE TABLE destination; + +ALTER TABLE destination ATTACH PARTITION '2010' from source; + +SELECT * FROM source ORDER BY timestamp; +SELECT * FROM destination ORDER BY timestamp; +SELECT partition_id FROM system.parts where table='destination' AND database = currentDatabase() AND active = 1; + +-- Should be allowed, partitioned table to unpartitioned. Since the destination is unpartitioned, parts would ultimately +-- fall into the same partition. +-- Destination partition by expression is omitted, which causes StorageMetadata::getPartitionKeyAST() to be nullptr. +DROP TABLE IF EXISTS source; +DROP TABLE IF EXISTS destination; +CREATE TABLE source (timestamp DateTime) engine=MergeTree ORDER BY tuple() PARTITION BY toYYYYMM(timestamp); +CREATE TABLE destination (timestamp DateTime) engine=MergeTree ORDER BY tuple(); + +INSERT INTO TABLE source VALUES ('2010-03-02 02:01:01'), ('2010-03-02 02:01:03'); + +ALTER TABLE destination ATTACH PARTITION ID '201003' FROM source; + +SELECT * FROM source ORDER BY timestamp; +SELECT * FROM destination ORDER BY timestamp; +SELECT partition_id FROM system.parts where table='destination' AND database = currentDatabase() AND active = 1; + +TRUNCATE TABLE destination; + +ALTER TABLE destination ATTACH PARTITION '201003' from source; + +SELECT * FROM source ORDER BY timestamp; +SELECT * FROM destination ORDER BY timestamp; +SELECT partition_id FROM system.parts where table='destination' AND database = currentDatabase() AND active = 1; + +-- Same as above, but destination partition by expression is explicitly defined. Test case required to validate that +-- partition by tuple() is accepted. 
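(Editor's aside, illustrative only, not part of the patch: PARTITION BY tuple() is the explicit spelling of an unpartitioned table, so the attached part gets partition id 'all', exactly as when the partition key is omitted. The table name `tuple_probe` is hypothetical.)

CREATE TABLE tuple_probe (x Int) ENGINE = MergeTree ORDER BY tuple() PARTITION BY tuple();
INSERT INTO tuple_probe VALUES (1);
SELECT DISTINCT _partition_id FROM tuple_probe; -- expected: all
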
+DROP TABLE IF EXISTS source; +DROP TABLE IF EXISTS destination; +CREATE TABLE source (timestamp DateTime) engine=MergeTree ORDER BY tuple() PARTITION BY toYYYYMM(timestamp); +CREATE TABLE destination (timestamp DateTime) engine=MergeTree ORDER BY tuple() PARTITION BY tuple(); + +INSERT INTO TABLE source VALUES ('2010-03-02 02:01:01'), ('2010-03-02 02:01:03'); + +ALTER TABLE destination ATTACH PARTITION ID '201003' FROM source; + +SELECT * FROM source ORDER BY timestamp; +SELECT * FROM destination ORDER BY timestamp; +SELECT partition_id FROM system.parts where table='destination' AND database = currentDatabase() AND active = 1; + +TRUNCATE TABLE destination; + +ALTER TABLE destination ATTACH PARTITION '201003' from source; + +SELECT * FROM source ORDER BY timestamp; +SELECT * FROM destination ORDER BY timestamp; +SELECT partition_id FROM system.parts where table='destination' AND database = currentDatabase() AND active = 1; + +-- Should be allowed because the destination partition expression columns are a subset of the source partition expression columns +-- Columns in this case refer to the expression elements, not to the actual table columns +DROP TABLE IF EXISTS source; +DROP TABLE IF EXISTS destination; +CREATE TABLE source (a Int, b Int, c Int) engine=MergeTree ORDER BY tuple() PARTITION BY (a, b, c); +CREATE TABLE destination (a Int, b Int, c Int) engine=MergeTree ORDER BY tuple() PARTITION BY (a, b); + +INSERT INTO TABLE source VALUES (1, 2, 3), (1, 2, 4); + +ALTER TABLE destination ATTACH PARTITION ID '1-2-3' FROM source; + +SELECT * FROM source ORDER BY (a, b, c); +SELECT * FROM destination ORDER BY (a, b, c); +SELECT partition_id FROM system.parts where table='destination' AND database = currentDatabase() AND active = 1; + +TRUNCATE TABLE destination; + +ALTER TABLE destination ATTACH PARTITION (1, 2, 3) from source; + +SELECT * FROM source ORDER BY (a, b, c); +SELECT * FROM destination ORDER BY (a, b, c); +SELECT partition_id FROM system.parts where table='destination' AND database = currentDatabase() AND active = 1; + +-- Should be allowed because the destination partition expression columns are a subset of the source partition expression columns +-- Columns in this case refer to the expression elements, not to the actual table columns +DROP TABLE IF EXISTS source; +DROP TABLE IF EXISTS destination; +CREATE TABLE source (a Int, b Int, c Int) engine=MergeTree ORDER BY tuple() PARTITION BY (a, b, c); +CREATE TABLE destination (a Int, b Int, c Int) engine=MergeTree ORDER BY tuple() PARTITION BY a; + +INSERT INTO TABLE source VALUES (1, 2, 3), (1, 2, 4); + +ALTER TABLE destination ATTACH PARTITION ID '1-2-3' FROM source; + +SELECT * FROM source ORDER BY (a, b, c); +SELECT * FROM destination ORDER BY (a, b, c); +SELECT partition_id FROM system.parts where table='destination' AND database = currentDatabase() AND active = 1; + +TRUNCATE TABLE destination; + +ALTER TABLE destination ATTACH PARTITION (1, 2, 3) from source; + +SELECT * FROM source ORDER BY (a, b, c); +SELECT * FROM destination ORDER BY (a, b, c); +SELECT partition_id FROM system.parts where table='destination' AND database = currentDatabase() AND active = 1; + +-- Should be allowed. Special test case, tricky to explain. First column of source partition expression is +-- timestamp, while first column of destination partition expression is `A`. One of the previous implementations +-- would not match the columns, which could lead to `timestamp` min max being used to calculate monotonicity of `A`. 
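(Editor's aside, illustrative only, not part of the patch: in the test below, the source partition id is built from both expression elements, toYYYYMM(timestamp) and intDiv(A, 6), while the destination key depends on `A` alone. Matching the expression elements by column rather than by position is exactly what this case protects; a quick hand check of the id:)

SELECT toYYYYMM(toDate('2010-03-02')) AS ym, intDiv(5, 6) AS d; -- 201003, 0: source partition id '201003-0'
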
+DROP TABLE IF EXISTS source; +DROP TABLE IF EXISTS destination; + +CREATE TABLE source (`timestamp` DateTime, `A` Int64) ENGINE = MergeTree PARTITION BY tuple(toYYYYMM(timestamp), intDiv(A, 6)) ORDER BY timestamp; +CREATE TABLE destination (`timestamp` DateTime, `A` Int64) ENGINE = MergeTree PARTITION BY A ORDER BY timestamp; + +INSERT INTO TABLE source VALUES ('2010-03-02 02:01:01', 5); + +ALTER TABLE destination ATTACH PARTITION ID '201003-0' FROM source; + +SELECT * FROM source ORDER BY timestamp; +SELECT * FROM destination ORDER BY timestamp; +SELECT partition_id FROM system.parts where table='destination' AND database = currentDatabase() AND active = 1; + +TRUNCATE TABLE destination; + +ALTER TABLE destination ATTACH PARTITION (201003, 0) from source; + +SELECT * FROM source ORDER BY timestamp; +SELECT * FROM destination ORDER BY timestamp; +SELECT partition_id FROM system.parts where table='destination' AND database = currentDatabase() AND active = 1; + +-- Should be allowed. Destination partition expression contains multiple expressions, but all of them are monotonically +-- increasing in the source partition min max indexes. +DROP TABLE IF EXISTS source; +DROP TABLE IF EXISTS destination; + +CREATE TABLE source (A Int, B Int) ENGINE = MergeTree PARTITION BY tuple(A, B) ORDER BY tuple(); +CREATE TABLE destination (A Int, B Int) ENGINE = MergeTree PARTITION BY tuple(intDiv(A, 2), intDiv(B, 2)) ORDER BY tuple(); + +INSERT INTO TABLE source VALUES (6, 12); + +ALTER TABLE destination ATTACH PARTITION ID '6-12' FROM source; + +SELECT * FROM source ORDER BY A; +SELECT * FROM destination ORDER BY A; +SELECT partition_id FROM system.parts where table='destination' AND database = currentDatabase() AND active = 1; + +TRUNCATE TABLE destination; + +ALTER TABLE destination ATTACH PARTITION (6, 12) from source; + +SELECT * FROM source ORDER BY A; +SELECT * FROM destination ORDER BY A; +SELECT partition_id FROM system.parts where table='destination' AND database = currentDatabase() AND active = 1; + +-- Should be allowed. The same scenario as above, but partition expressions inverted. +DROP TABLE IF EXISTS source; +DROP TABLE IF EXISTS destination; + +CREATE TABLE source (A Int, B Int) ENGINE = MergeTree PARTITION BY tuple(intDiv(A, 2), intDiv(B, 2)) ORDER BY tuple(); +CREATE TABLE destination (A Int, B Int) ENGINE = MergeTree PARTITION BY tuple(A, B) ORDER BY tuple(); + +INSERT INTO TABLE source VALUES (6, 12); + +ALTER TABLE destination ATTACH PARTITION ID '3-6' FROM source; + +SELECT * FROM source ORDER BY A; +SELECT * FROM destination ORDER BY A; +SELECT partition_id FROM system.parts where table='destination' AND database = currentDatabase() AND active = 1; + +TRUNCATE TABLE destination; + +ALTER TABLE destination ATTACH PARTITION (3, 6) from source; + +SELECT * FROM source ORDER BY A; +SELECT * FROM destination ORDER BY A; +SELECT partition_id FROM system.parts where table='destination' AND database = currentDatabase() AND active = 1; + +-- Should be allowed, it is a local operation, no different than regular attach. Replicated to replicated. 
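(Editor's aside, illustrative only, not part of the patch: for ReplicatedMergeTree the cloned part is, to my understanding, committed locally and then propagated to the other replicas through the replication log, so immediately after the ATTACH the destination replica already exposes the new part. A sketch of how that could be inspected, assuming the `destination` table from the test below:)

SELECT name, partition_id FROM system.parts
WHERE table = 'destination' AND database = currentDatabase() AND active;
SELECT total_replicas, active_replicas FROM system.replicas
WHERE table = 'destination' AND database = currentDatabase();
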
+DROP TABLE IF EXISTS source; +DROP TABLE IF EXISTS destination; +CREATE TABLE + source(timestamp DateTime) + ENGINE = ReplicatedMergeTree('/clickhouse/tables/{database}/test/source_replicated_to_replicated_distinct_expression', '1') + PARTITION BY toYYYYMMDD(timestamp) + ORDER BY tuple(); + +CREATE TABLE + destination(timestamp DateTime) + ENGINE = ReplicatedMergeTree('/clickhouse/tables/{database}/test/destination_replicated_to_replicated_distinct_expression', '1') + PARTITION BY toYYYYMM(timestamp) + ORDER BY tuple(); + +INSERT INTO TABLE source VALUES ('2010-03-02 02:01:01'), ('2010-03-02 02:01:03'); + +ALTER TABLE destination ATTACH PARTITION ID '20100302' FROM source; + +SELECT * FROM source ORDER BY timestamp; +SELECT * FROM destination ORDER BY timestamp; +SELECT partition_id FROM system.parts where table='destination' AND database = currentDatabase() AND active = 1; + +TRUNCATE TABLE destination; + +ALTER TABLE destination ATTACH PARTITION '20100302' from source; + +SELECT * FROM source ORDER BY timestamp; +SELECT * FROM destination ORDER BY timestamp; +SELECT partition_id FROM system.parts where table='destination' AND database = currentDatabase() AND active = 1; + +-- Should be allowed, it is a local operation, no different than regular attach. Non replicated to replicated +DROP TABLE IF EXISTS source SYNC; +DROP TABLE IF EXISTS destination SYNC; +CREATE TABLE source(timestamp DateTime) ENGINE = MergeTree() PARTITION BY toYYYYMMDD(timestamp) ORDER BY tuple(); + +CREATE TABLE + destination(timestamp DateTime) + ENGINE = ReplicatedMergeTree('/clickhouse/tables/{database}/test/destination_non_replicated_to_replicated_distinct_expression', '1') + PARTITION BY toYYYYMM(timestamp) + ORDER BY tuple(); + +INSERT INTO TABLE source VALUES ('2010-03-02 02:01:01'), ('2010-03-02 02:01:03'); + +ALTER TABLE destination ATTACH PARTITION ID '20100302' FROM source; + +SELECT * FROM source ORDER BY timestamp; +SELECT * FROM destination ORDER BY timestamp; +SELECT partition_id FROM system.parts where table='destination' AND database = currentDatabase() AND active = 1; + +TRUNCATE TABLE destination; + +ALTER TABLE destination ATTACH PARTITION '20100302' from source; + +SELECT * FROM source ORDER BY timestamp; +SELECT * FROM destination ORDER BY timestamp; +SELECT partition_id FROM system.parts where table='destination' AND database = currentDatabase() AND active = 1; + +-- Should not be allowed because data would be split into two different partitions +DROP TABLE IF EXISTS source SYNC; +DROP TABLE IF EXISTS destination SYNC; + +CREATE TABLE source (timestamp DateTime) engine=MergeTree ORDER BY tuple() PARTITION BY toYYYYMM(timestamp); +CREATE TABLE destination (timestamp DateTime) engine=MergeTree ORDER BY tuple() PARTITION BY toYYYYMMDD(timestamp); + +INSERT INTO TABLE source VALUES ('2010-03-02 02:01:01'), ('2010-03-03 02:01:03'); + +ALTER TABLE destination ATTACH PARTITION ID '201003' FROM source; -- { serverError 248 } +ALTER TABLE destination ATTACH PARTITION '201003' from source; -- { serverError 248 } + +-- Should not be allowed because data would be split into two different partitions +DROP TABLE IF EXISTS source; +DROP TABLE IF EXISTS destination; + +CREATE TABLE source (timestamp DateTime, A Int64) engine=MergeTree ORDER BY timestamp PARTITION BY intDiv(A, 6); + +CREATE TABLE destination (timestamp DateTime, A Int64) engine=MergeTree ORDER BY timestamp PARTITION BY A; + +INSERT INTO TABLE source VALUES ('2010-03-02 02:01:01', 1), ('2010-03-02 02:01:03', 2); + +ALTER TABLE destination 
ATTACH PARTITION ID '0' FROM source; -- { serverError 248 } +ALTER TABLE destination ATTACH PARTITION 0 FROM source; -- { serverError 248 } + +-- Should not be allowed because dst partition exp takes more than two arguments, so it's not considered monotonically inc +DROP TABLE IF EXISTS source; +DROP TABLE IF EXISTS destination; + +CREATE TABLE source (productName String, category String) engine=MergeTree ORDER BY tuple() PARTITION BY toString(category); +CREATE TABLE destination (productName String, category String) engine=MergeTree ORDER BY tuple() PARTITION BY substring(category, 1, 2); + +INSERT INTO TABLE source VALUES ('spaghetti', 'food'), ('mop', 'general'); +INSERT INTO TABLE source VALUES ('rice', 'food'); + +ALTER TABLE destination ATTACH PARTITION ID '4590ba78048910b74a47d5bfb308abed' from source; -- { serverError 36 } +ALTER TABLE destination ATTACH PARTITION 'food' from source; -- { serverError 36 } + +-- Should not be allowed because dst partition exp depends on a different set of columns +DROP TABLE IF EXISTS source; +DROP TABLE IF EXISTS destination; + +CREATE TABLE source (productName String, category String) engine=MergeTree ORDER BY tuple() PARTITION BY toString(category); +CREATE TABLE destination (productName String, category String) engine=MergeTree ORDER BY tuple() PARTITION BY toString(productName); + +INSERT INTO TABLE source VALUES ('spaghetti', 'food'), ('mop', 'general'); +INSERT INTO TABLE source VALUES ('rice', 'food'); + +ALTER TABLE destination ATTACH PARTITION ID '4590ba78048910b74a47d5bfb308abed' from source; -- { serverError 36 } +ALTER TABLE destination ATTACH PARTITION 'food' from source; -- { serverError 36 } + +-- Should not be allowed because dst partition exp is not monotonically increasing +DROP TABLE IF EXISTS source; +DROP TABLE IF EXISTS destination; + +CREATE TABLE source (productName String) engine=MergeTree ORDER BY tuple() PARTITION BY left(productName, 2); +CREATE TABLE destination (productName String) engine=MergeTree ORDER BY tuple() PARTITION BY cityHash64(productName); + +INSERT INTO TABLE source VALUES ('bread'), ('mop'); +INSERT INTO TABLE source VALUES ('broccoli'); + +ALTER TABLE destination ATTACH PARTITION ID '4589453b7ee96ce9de1265bd57674496' from source; -- { serverError 36 } +ALTER TABLE destination ATTACH PARTITION 'br' from source; -- { serverError 36 } + +-- Empty/ non-existent partition, same partition expression. Nothing should happen +DROP TABLE IF EXISTS source; +DROP TABLE IF EXISTS destination; + +CREATE TABLE source (timestamp DateTime) engine=MergeTree ORDER BY tuple() PARTITION BY toYYYYMM(timestamp); +CREATE TABLE destination (timestamp DateTime) engine=MergeTree ORDER BY tuple() PARTITION BY toYYYYMM(timestamp); + +ALTER TABLE destination ATTACH PARTITION ID '1' FROM source; +ALTER TABLE destination ATTACH PARTITION 1 FROM source; + +SELECT * FROM destination; +SELECT partition_id FROM system.parts where table='destination' AND database = currentDatabase() AND active = 1; + +-- Empty/ non-existent partition, different partition expression. 
Nothing should happen +-- https://github.com/ClickHouse/ClickHouse/pull/39507#discussion_r1399839045 +DROP TABLE IF EXISTS source; +DROP TABLE IF EXISTS destination; + +CREATE TABLE source (timestamp DateTime) engine=MergeTree ORDER BY tuple() PARTITION BY toYYYYMMDD(timestamp); +CREATE TABLE destination (timestamp DateTime) engine=MergeTree ORDER BY tuple() PARTITION BY toYYYYMM(timestamp); + +ALTER TABLE destination ATTACH PARTITION ID '1' FROM source; +ALTER TABLE destination ATTACH PARTITION 1 FROM source; + +SELECT * FROM destination; +SELECT partition_id FROM system.parts where table='destination' AND database = currentDatabase() AND active = 1; + +-- Replace instead of attach. Empty/ non-existent partition, same partition expression. Nothing should happen +-- https://github.com/ClickHouse/ClickHouse/pull/39507#discussion_r1399839045 +DROP TABLE IF EXISTS source; +DROP TABLE IF EXISTS destination; + +CREATE TABLE source (timestamp DateTime) engine=MergeTree ORDER BY tuple() PARTITION BY toYYYYMM(timestamp); +CREATE TABLE destination (timestamp DateTime) engine=MergeTree ORDER BY tuple() PARTITION BY toYYYYMM(timestamp); + +ALTER TABLE destination REPLACE PARTITION '1' FROM source; + +SELECT * FROM destination; +SELECT partition_id FROM system.parts where table='destination' AND database = currentDatabase() AND active = 1; + +-- Replace instead of attach. Empty/ non-existent partition to non-empty partition, same partition id. +-- https://github.com/ClickHouse/ClickHouse/pull/39507#discussion_r1399839045 +DROP TABLE IF EXISTS source; +DROP TABLE IF EXISTS destination; + +CREATE TABLE source (A Int) engine=MergeTree ORDER BY tuple() PARTITION BY A; +CREATE TABLE destination (A Int) engine=MergeTree ORDER BY tuple() PARTITION BY A; + +INSERT INTO TABLE destination VALUES (1); + +ALTER TABLE destination REPLACE PARTITION '1' FROM source; + +SELECT * FROM destination; +SELECT partition_id FROM system.parts where table='destination' AND database = currentDatabase() AND active = 1; From 5179891aef9792366d948efd9f1a2454dfe8da69 Mon Sep 17 00:00:00 2001 From: Arthur Passos Date: Tue, 23 Jan 2024 11:43:08 -0300 Subject: [PATCH 089/245] remove static log --- src/Storages/MergeTree/MergeTreeDataPartCloner.cpp | 4 +--- 1 file changed, 1 insertion(+), 3 deletions(-) diff --git a/src/Storages/MergeTree/MergeTreeDataPartCloner.cpp b/src/Storages/MergeTree/MergeTreeDataPartCloner.cpp index 78cb9aa0624..e384e1b7066 100644 --- a/src/Storages/MergeTree/MergeTreeDataPartCloner.cpp +++ b/src/Storages/MergeTree/MergeTreeDataPartCloner.cpp @@ -13,8 +13,6 @@ namespace ErrorCodes extern const int BAD_ARGUMENTS; } -static Poco::Logger * log = &Poco::Logger::get("MergeTreeDataPartCloner"); - namespace DistinctPartitionExpression { std::unique_ptr updatePartitionFile( @@ -182,7 +180,7 @@ std::pair cloneSourcePart( } LOG_DEBUG( - log, + &Poco::Logger::get("MergeTreeDataPartCloner"), "Clone {} part {} to {}{}", src_flushed_tmp_part ? 
"flushed" : "", src_part_storage->getFullPath(), From 7e86c0e9280bb6e46183c2c358474bfd283e2554 Mon Sep 17 00:00:00 2001 From: Amos Bird Date: Tue, 23 Jan 2024 23:03:15 +0800 Subject: [PATCH 090/245] Compress state of dashboard --- programs/server/dashboard.html | 6 +++--- programs/server/js/lz-string.js | 1 + src/Server/WebUIRequestHandler.cpp | 9 +++++++++ 3 files changed, 13 insertions(+), 3 deletions(-) create mode 100644 programs/server/js/lz-string.js diff --git a/programs/server/dashboard.html b/programs/server/dashboard.html index 04fdfb2d3ca..1f32048da79 100644 --- a/programs/server/dashboard.html +++ b/programs/server/dashboard.html @@ -5,6 +5,7 @@ ClickHouse Dashboard +