diff --git a/base/daemon/BaseDaemon.cpp b/base/daemon/BaseDaemon.cpp index 311349a2ba7..b27a904b31a 100644 --- a/base/daemon/BaseDaemon.cpp +++ b/base/daemon/BaseDaemon.cpp @@ -828,7 +828,6 @@ void BaseDaemon::initializeTerminationAndSignalProcessing() /// Setup signal handlers. /// SIGTSTP is added for debugging purposes. To output a stack trace of any running thread at anytime. - addSignalHandler({SIGABRT, SIGSEGV, SIGILL, SIGBUS, SIGSYS, SIGFPE, SIGPIPE, SIGTSTP, SIGTRAP}, signalHandler, &handled_signals); addSignalHandler({SIGHUP}, closeLogsSignalHandler, &handled_signals); addSignalHandler({SIGINT, SIGQUIT, SIGTERM}, terminateRequestedSignalHandler, &handled_signals); diff --git a/docker/test/stateless/run.sh b/docker/test/stateless/run.sh index f8b73791388..63750b90b5a 100755 --- a/docker/test/stateless/run.sh +++ b/docker/test/stateless/run.sh @@ -139,7 +139,7 @@ pigz < /var/log/clickhouse-server/clickhouse-server.log > /test_output/clickhous # directly # - even though ci auto-compress some files (but not *.tsv) it does this only # for files >64MB, we want this files to be compressed explicitly -for table in query_log zookeeper_log trace_log +for table in query_log zookeeper_log trace_log transactions_info_log do clickhouse-client -q "select * from system.$table format TSVWithNamesAndTypes" | pigz > /test_output/$table.tsv.gz & if [[ -n "$USE_DATABASE_REPLICATED" ]] && [[ "$USE_DATABASE_REPLICATED" -eq 1 ]]; then diff --git a/programs/server/Server.cpp b/programs/server/Server.cpp index 0b5a7724fe5..fc9187cb622 100644 --- a/programs/server/Server.cpp +++ b/programs/server/Server.cpp @@ -1639,6 +1639,8 @@ int Server::main(const std::vector & /*args*/) server.start(); LOG_INFO(log, "Listening for {}", server.getDescription()); } + + global_context->setServerCompletelyStarted(); LOG_INFO(log, "Ready for connections."); } diff --git a/src/Access/Common/AccessType.h b/src/Access/Common/AccessType.h index fb5eafbe679..accfa0ad33d 100644 --- a/src/Access/Common/AccessType.h +++ b/src/Access/Common/AccessType.h @@ -102,6 +102,7 @@ enum class AccessType \ M(KILL_QUERY, "", GLOBAL, ALL) /* allows to kill a query started by another user (anyone can kill his own queries) */\ + M(KILL_TRANSACTION, "", GLOBAL, ALL) \ \ M(MOVE_PARTITION_BETWEEN_SHARDS, "", GLOBAL, ALL) /* required to be able to move a part/partition to a table identified by its ZooKeeper path */\ diff --git a/src/Client/ClientBase.cpp b/src/Client/ClientBase.cpp index 8d8ee19f8ba..e77d683d26f 100644 --- a/src/Client/ClientBase.cpp +++ b/src/Client/ClientBase.cpp @@ -1129,7 +1129,7 @@ void ClientBase::sendData(Block & sample, const ColumnsDescription & columns_des sendDataFromPipe( storage->read( sample.getNames(), - storage->getStorageSnapshot(metadata), + storage->getStorageSnapshot(metadata, global_context), query_info, global_context, {}, diff --git a/src/Common/ErrorCodes.cpp b/src/Common/ErrorCodes.cpp index 2e60e125d73..3097af6207c 100644 --- a/src/Common/ErrorCodes.cpp +++ b/src/Common/ErrorCodes.cpp @@ -617,6 +617,8 @@ M(646, CANNOT_BACKUP_DATABASE) \ M(647, CANNOT_BACKUP_TABLE) \ M(648, WRONG_DDL_RENAMING_SETTINGS) \ + M(649, INVALID_TRANSACTION) \ + M(650, SERIALIZATION_ERROR) \ \ M(999, KEEPER_EXCEPTION) \ M(1000, POCO_EXCEPTION) \ diff --git a/src/Common/SystemLogBase.cpp b/src/Common/SystemLogBase.cpp index e5991421633..88e6e8327b8 100644 --- a/src/Common/SystemLogBase.cpp +++ b/src/Common/SystemLogBase.cpp @@ -11,6 +11,7 @@ #include #include #include +#include #include #include diff --git a/src/Common/SystemLogBase.h 
b/src/Common/SystemLogBase.h index cfb4821691c..da3d6c24562 100644 --- a/src/Common/SystemLogBase.h +++ b/src/Common/SystemLogBase.h @@ -23,6 +23,7 @@ M(QueryViewsLogElement) \ M(SessionLogElement) \ M(TraceLogElement) \ + M(TransactionsInfoLogElement) \ M(ZooKeeperLogElement) \ M(ProcessorProfileLogElement) \ M(TextLogElement) diff --git a/src/Common/TransactionID.cpp b/src/Common/TransactionID.cpp new file mode 100644 index 00000000000..8a9894fbe53 --- /dev/null +++ b/src/Common/TransactionID.cpp @@ -0,0 +1,43 @@ +#include +#include +#include +#include + +namespace DB +{ + +TIDHash TransactionID::getHash() const +{ + SipHash hash; + hash.update(start_csn); + hash.update(local_tid); + hash.update(host_id); + return hash.get64(); +} + + +void TransactionID::write(const TransactionID & tid, WriteBuffer & buf) +{ + writeChar('(', buf); + writeText(tid.start_csn, buf); + writeCString(", ", buf); + writeText(tid.local_tid, buf); + writeCString(", ", buf); + writeText(tid.host_id, buf); + writeChar(')', buf); +} + +TransactionID TransactionID::read(ReadBuffer & buf) +{ + TransactionID tid = Tx::EmptyTID; + assertChar('(', buf); + readText(tid.start_csn, buf); + assertString(", ", buf); + readText(tid.local_tid, buf); + assertString(", ", buf); + readText(tid.host_id, buf); + assertChar(')', buf); + return tid; +} + +} diff --git a/src/Common/TransactionID.h b/src/Common/TransactionID.h new file mode 100644 index 00000000000..3ab86f7589c --- /dev/null +++ b/src/Common/TransactionID.h @@ -0,0 +1,115 @@ +#pragma once +#include +#include +#include +#include + +namespace DB +{ + +class IDataType; +using DataTypePtr = std::shared_ptr; +class MergeTreeTransaction; + +/// This macro is useful for places where a pointer to current transaction should be passed, +/// but transactions are not supported yet (e.g. when calling MergeTreeData's methods from StorageReplicatedMergeTree) +/// or transaction object is not needed and not passed intentionally. +#ifndef NO_TRANSACTION_PTR +#define NO_TRANSACTION_PTR std::shared_ptr(nullptr) +#define NO_TRANSACTION_RAW static_cast(nullptr) +#endif + +/// Commit Sequence Number +using CSN = UInt64; +/// Local part of TransactionID +using LocalTID = UInt64; +/// Hash of TransactionID that fits into 64-bit atomic +using TIDHash = UInt64; + +namespace Tx +{ + /// For transactions that are probably not committed (yet) + const CSN UnknownCSN = 0; + /// For changes that were made without creating a transaction + const CSN PrehistoricCSN = 1; + /// Special reserved values + const CSN CommittingCSN = 2; + const CSN EverythingVisibleCSN = 3; + const CSN MaxReservedCSN = 32; + + /// Means that the changes will never become visible + const CSN RolledBackCSN = std::numeric_limits::max(); + + const LocalTID PrehistoricLocalTID = 1; + const LocalTID DummyLocalTID = 2; + const LocalTID MaxReservedLocalTID = 32; +} + +struct TransactionID +{ + /// Global sequential number, the newest commit timestamp that we saw when this transaction began + CSN start_csn = 0; + /// Local sequential number that is unique for each transaction started by this host within a specific start_csn + LocalTID local_tid = 0; + /// UUID of the host that started this transaction + UUID host_id = UUIDHelpers::Nil; + + /// NOTE Maybe we could just generate UUIDv4 for each transaction, but it would be harder to debug.
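+ /// Illustrative example (editorial, not part of the patch): the third transaction started on a host while the newest known CSN was 100 would get TID (100, 3, <host UUID>); the special Tx::PrehistoricTID defined below is (1, 1, Nil).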
+ /// Partial order is defined for this TransactionID structure: + /// (tid1.start_csn <= tid2.start_csn) <==> (tid1 <= tid2) + /// (tid1.start_csn == tid2.start_csn && tid1.host_id == tid2.host_id && tid1.local_tid < tid2.local_tid) ==> (tid1 < tid2) + /// If two transaction have the same start_csn, but were started by different hosts, then order is undefined. + + bool operator == (const TransactionID & rhs) const + { + return start_csn == rhs.start_csn && local_tid == rhs.local_tid && host_id == rhs.host_id; + } + + bool operator != (const TransactionID & rhs) const + { + return !(*this == rhs); + } + + TIDHash getHash() const; + + bool isEmpty() const + { + assert((local_tid == 0) == (start_csn == 0 && host_id == UUIDHelpers::Nil)); + return local_tid == 0; + } + + bool isPrehistoric() const + { + assert((local_tid == Tx::PrehistoricLocalTID) == (start_csn == Tx::PrehistoricCSN)); + return local_tid == Tx::PrehistoricLocalTID; + } + + + static void write(const TransactionID & tid, WriteBuffer & buf); + static TransactionID read(ReadBuffer & buf); +}; + +namespace Tx +{ + const TransactionID EmptyTID = {0, 0, UUIDHelpers::Nil}; + const TransactionID PrehistoricTID = {PrehistoricCSN, PrehistoricLocalTID, UUIDHelpers::Nil}; + const TransactionID DummyTID = {PrehistoricCSN, DummyLocalTID, UUIDHelpers::Nil}; +} + +} + +template<> +struct fmt::formatter +{ + template + constexpr auto parse(ParseContext & context) + { + return context.begin(); + } + + template + auto format(const DB::TransactionID & tid, FormatContext & context) + { + return fmt::format_to(context.out(), "({}, {}, {})", tid.start_csn, tid.local_tid, tid.host_id); + } +}; diff --git a/src/Common/ZooKeeper/ZooKeeper.cpp b/src/Common/ZooKeeper/ZooKeeper.cpp index aae3b6d4191..0f4b141d058 100644 --- a/src/Common/ZooKeeper/ZooKeeper.cpp +++ b/src/Common/ZooKeeper/ZooKeeper.cpp @@ -1270,4 +1270,14 @@ String extractZooKeeperPath(const String & path, bool check_starts_with_slash, P return normalizeZooKeeperPath(path, check_starts_with_slash, log); } +String getSequentialNodeName(const String & prefix, UInt64 number) +{ + /// NOTE Sequential counter in ZooKeeper is Int32. 
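+ /// For example (illustrative): getSequentialNodeName("query-", 42) returns "query-0000000042", matching the zero-padded suffix that ZooKeeper itself appends to sequential nodes.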
+ assert(number < std::numeric_limits::max()); + constexpr size_t seq_node_digits = 10; + String num_str = std::to_string(number); + String name = prefix + String(seq_node_digits - num_str.size(), '0') + num_str; + return name; +} + } diff --git a/src/Common/ZooKeeper/ZooKeeper.h b/src/Common/ZooKeeper/ZooKeeper.h index 0f7eccd2547..4d5bd039a55 100644 --- a/src/Common/ZooKeeper/ZooKeeper.h +++ b/src/Common/ZooKeeper/ZooKeeper.h @@ -417,4 +417,6 @@ String extractZooKeeperName(const String & path); String extractZooKeeperPath(const String & path, bool check_starts_with_slash, Poco::Logger * log = nullptr); +String getSequentialNodeName(const String & prefix, UInt64 number); + } diff --git a/src/Core/Settings.h b/src/Core/Settings.h index 980d0428d27..aa78456702c 100644 --- a/src/Core/Settings.h +++ b/src/Core/Settings.h @@ -581,6 +581,7 @@ class IColumn; M(Bool, allow_experimental_nlp_functions, false, "Enable experimental functions for natural language processing.", 0) \ M(Bool, allow_experimental_object_type, false, "Allow Object and JSON data types", 0) \ M(String, insert_deduplication_token, "", "If not empty, used for duplicate detection instead of data digest", 0) \ + M(Bool, throw_on_unsupported_query_inside_transaction, true, "Throw exception if unsupported query is used inside transaction", 0) \ // End of COMMON_SETTINGS // Please add settings related to formats into the FORMAT_FACTORY_SETTINGS and move obsolete settings to OBSOLETE_SETTINGS. diff --git a/src/Databases/DatabaseReplicated.cpp b/src/Databases/DatabaseReplicated.cpp index d94eceb7dec..2a07ba8375d 100644 --- a/src/Databases/DatabaseReplicated.cpp +++ b/src/Databases/DatabaseReplicated.cpp @@ -461,6 +461,10 @@ void DatabaseReplicated::checkQueryValid(const ASTPtr & query, ContextPtr query_ BlockIO DatabaseReplicated::tryEnqueueReplicatedDDL(const ASTPtr & query, ContextPtr query_context) { + + if (query_context->getCurrentTransaction() && query_context->getSettingsRef().throw_on_unsupported_query_inside_transaction) + throw Exception(ErrorCodes::NOT_IMPLEMENTED, "Distributed DDL queries inside transactions are not supported"); + if (is_readonly) throw Exception(ErrorCodes::NO_ZOOKEEPER, "Database is in readonly mode, because it cannot connect to ZooKeeper"); diff --git a/src/Functions/FunctionConstantBase.h b/src/Functions/FunctionConstantBase.h index 2d237c77256..c178b3a256e 100644 --- a/src/Functions/FunctionConstantBase.h +++ b/src/Functions/FunctionConstantBase.h @@ -41,9 +41,9 @@ public: bool isSuitableForShortCircuitArgumentsExecution(const DataTypesWithConstInfo & /*arguments*/) const override { return false; } - ColumnPtr executeImpl(const ColumnsWithTypeAndName &, const DataTypePtr &, size_t input_rows_count) const override + ColumnPtr executeImpl(const ColumnsWithTypeAndName &, const DataTypePtr & result_type, size_t input_rows_count) const override { - return ColumnT().createColumnConst(input_rows_count, constant_value); + return result_type->createColumnConst(input_rows_count, constant_value); } private: diff --git a/src/Functions/FunctionsTransactionCounters.cpp b/src/Functions/FunctionsTransactionCounters.cpp new file mode 100644 index 00000000000..f2e9d3aa84b --- /dev/null +++ b/src/Functions/FunctionsTransactionCounters.cpp @@ -0,0 +1,71 @@ +#include +#include +#include +#include +#include +#include +#include + + +namespace DB +{ + +namespace +{ + +class FunctionTransactionID : public FunctionConstantBase +{ +public: + static constexpr auto name = "transactionID"; + static Tuple getValue(const 
MergeTreeTransactionPtr & txn) + { + Tuple res; + if (txn) + res = {txn->tid.start_csn, txn->tid.local_tid, txn->tid.host_id}; + else + res = {UInt64(0), UInt64(0), UUIDHelpers::Nil}; + return res; + } + + DataTypePtr getReturnTypeImpl(const DataTypes & /*arguments*/) const override { return getTransactionIDDataType(); } + + static FunctionPtr create(ContextPtr context) { return std::make_shared(context); } + explicit FunctionTransactionID(ContextPtr context) : FunctionConstantBase(getValue(context->getCurrentTransaction()), context->isDistributed()) {} +}; + +class FunctionTransactionLatestSnapshot : public FunctionConstantBase +{ + static UInt64 getLatestSnapshot(ContextPtr context) + { + context->checkTransactionsAreAllowed(/* explicit_tcl_query */ true); + return TransactionLog::instance().getLatestSnapshot(); + } +public: + static constexpr auto name = "transactionLatestSnapshot"; + static FunctionPtr create(ContextPtr context) { return std::make_shared(context); } + explicit FunctionTransactionLatestSnapshot(ContextPtr context) : FunctionConstantBase(getLatestSnapshot(context), context->isDistributed()) {} +}; + +class FunctionTransactionOldestSnapshot : public FunctionConstantBase +{ + static UInt64 getOldestSnapshot(ContextPtr context) + { + context->checkTransactionsAreAllowed(/* explicit_tcl_query */ true); + return TransactionLog::instance().getOldestSnapshot(); + } +public: + static constexpr auto name = "transactionOldestSnapshot"; + static FunctionPtr create(ContextPtr context) { return std::make_shared(context); } + explicit FunctionTransactionOldestSnapshot(ContextPtr context) : FunctionConstantBase(getOldestSnapshot(context), context->isDistributed()) {} +}; + +} + +void registerFunctionsTransactionCounters(FunctionFactory & factory) +{ + factory.registerFunction(); + factory.registerFunction(); + factory.registerFunction(); +} + +} diff --git a/src/Functions/registerFunctionsMiscellaneous.cpp b/src/Functions/registerFunctionsMiscellaneous.cpp index d18c73cc8b5..9cd9c70da16 100644 --- a/src/Functions/registerFunctionsMiscellaneous.cpp +++ b/src/Functions/registerFunctionsMiscellaneous.cpp @@ -75,6 +75,7 @@ void registerFunctionFile(FunctionFactory &); void registerFunctionConnectionId(FunctionFactory &); void registerFunctionPartitionId(FunctionFactory &); void registerFunctionIsIPAddressContainedIn(FunctionFactory &); +void registerFunctionsTransactionCounters(FunctionFactory & factory); void registerFunctionQueryID(FunctionFactory &); void registerFunctionInitialQueryID(FunctionFactory &); void registerFunctionServerUUID(FunctionFactory &); @@ -163,6 +164,7 @@ void registerFunctionsMiscellaneous(FunctionFactory & factory) registerFunctionConnectionId(factory); registerFunctionPartitionId(factory); registerFunctionIsIPAddressContainedIn(factory); + registerFunctionsTransactionCounters(factory); registerFunctionQueryID(factory); registerFunctionInitialQueryID(factory); registerFunctionServerUUID(factory); diff --git a/src/IO/ReadHelpers.cpp b/src/IO/ReadHelpers.cpp index 98a33612aa8..bf3cccccab8 100644 --- a/src/IO/ReadHelpers.cpp +++ b/src/IO/ReadHelpers.cpp @@ -26,6 +26,7 @@ namespace ErrorCodes extern const int CANNOT_PARSE_DATETIME; extern const int CANNOT_PARSE_DATE; extern const int INCORRECT_DATA; + extern const int ATTEMPT_TO_READ_AFTER_EOF; } template @@ -137,6 +138,12 @@ void assertEOF(ReadBuffer & buf) throwAtAssertionFailed("eof", buf); } +void assertNotEOF(ReadBuffer & buf) +{ + if (buf.eof()) + throw Exception("Attempt to read after EOF", 
ErrorCodes::ATTEMPT_TO_READ_AFTER_EOF); +} + void assertStringCaseInsensitive(const char * s, ReadBuffer & buf) { diff --git a/src/IO/ReadHelpers.h b/src/IO/ReadHelpers.h index e68da3a1c7d..13228853ff3 100644 --- a/src/IO/ReadHelpers.h +++ b/src/IO/ReadHelpers.h @@ -163,6 +163,7 @@ void readVectorBinary(std::vector & v, ReadBuffer & buf, size_t MAX_VECTOR_SI void assertString(const char * s, ReadBuffer & buf); void assertEOF(ReadBuffer & buf); +void assertNotEOF(ReadBuffer & buf); [[noreturn]] void throwAtAssertionFailed(const char * s, ReadBuffer & buf); diff --git a/src/IO/WriteBufferFromFileDescriptor.cpp b/src/IO/WriteBufferFromFileDescriptor.cpp index b91114995e8..d3ca4a9fc32 100644 --- a/src/IO/WriteBufferFromFileDescriptor.cpp +++ b/src/IO/WriteBufferFromFileDescriptor.cpp @@ -133,7 +133,6 @@ off_t WriteBufferFromFileDescriptor::seek(off_t offset, int whence) // NOLINT return res; } - void WriteBufferFromFileDescriptor::truncate(off_t length) // NOLINT { int res = ftruncate(fd, length); diff --git a/src/IO/WriteHelpers.h b/src/IO/WriteHelpers.h index 7c6abf2aec7..f72213f0d11 100644 --- a/src/IO/WriteHelpers.h +++ b/src/IO/WriteHelpers.h @@ -1163,3 +1163,19 @@ struct PcgSerializer void writePointerHex(const void * ptr, WriteBuffer & buf); } + +template<> +struct fmt::formatter +{ + template + constexpr auto parse(ParseContext & context) + { + return context.begin(); + } + + template + auto format(const DB::UUID & uuid, FormatContext & context) + { + return fmt::format_to(context.out(), "{}", toString(uuid)); + } +}; diff --git a/src/Interpreters/Context.cpp b/src/Interpreters/Context.cpp index 2ac80014d26..40ed4d58993 100644 --- a/src/Interpreters/Context.cpp +++ b/src/Interpreters/Context.cpp @@ -90,6 +90,7 @@ #include #include #include +#include #include #if USE_ROCKSDB @@ -133,6 +134,7 @@ namespace ErrorCodes extern const int LOGICAL_ERROR; extern const int INVALID_SETTING_VALUE; extern const int UNKNOWN_READ_METHOD; + extern const int NOT_IMPLEMENTED; } @@ -281,6 +283,8 @@ struct ContextSharedPart Context::ConfigReloadCallback config_reload_callback; + bool is_server_completely_started = false; + #if USE_ROCKSDB /// Global merge tree metadata cache, stored in rocksdb. 
MergeTreeMetadataCachePtr merge_tree_metadata_cache; @@ -365,6 +369,8 @@ struct ContextSharedPart if (common_executor) common_executor->wait(); + TransactionLog::shutdownIfAny(); + std::unique_ptr delete_system_logs; std::unique_ptr delete_embedded_dictionaries; std::unique_ptr delete_external_dictionaries_loader; @@ -492,6 +498,8 @@ ContextMutablePtr Context::createGlobal(ContextSharedPart * shared) void Context::initGlobal() { + assert(!global_context_instance); + global_context_instance = shared_from_this(); DatabaseCatalog::init(shared_from_this()); } @@ -2475,6 +2483,17 @@ std::shared_ptr Context::getZooKeeperLog() const } +std::shared_ptr Context::getTransactionsInfoLog() const +{ + auto lock = getLock(); + + if (!shared->system_logs) + return {}; + + return shared->system_logs->transactions_info_log; +} + + std::shared_ptr Context::getProcessorsProfileLog() const { auto lock = getLock(); @@ -3078,6 +3097,56 @@ void Context::resetZooKeeperMetadataTransaction() metadata_transaction = nullptr; } + +void Context::checkTransactionsAreAllowed(bool explicit_tcl_query /* = false */) const +{ + if (getConfigRef().getInt("allow_experimental_transactions", 0)) + return; + + if (explicit_tcl_query) + throw Exception(ErrorCodes::NOT_IMPLEMENTED, "Transactions are not supported"); + + throw Exception(ErrorCodes::LOGICAL_ERROR, "Experimental support for transactions is disabled, " + "however, some query or background task tried to access TransactionLog. " + "If you have not enabled this feature explicitly, then it's a bug."); +} + +void Context::initCurrentTransaction(MergeTreeTransactionPtr txn) +{ + merge_tree_transaction_holder = MergeTreeTransactionHolder(txn, false, this); + setCurrentTransaction(std::move(txn)); +} + +void Context::setCurrentTransaction(MergeTreeTransactionPtr txn) +{ + assert(!merge_tree_transaction || !txn); + assert(this == session_context.lock().get() || this == query_context.lock().get()); + merge_tree_transaction = std::move(txn); + if (!merge_tree_transaction) + merge_tree_transaction_holder = {}; +} + +MergeTreeTransactionPtr Context::getCurrentTransaction() const +{ + return merge_tree_transaction; +} + +bool Context::isServerCompletelyStarted() const +{ + auto lock = getLock(); + assert(getApplicationType() == ApplicationType::SERVER); + return shared->is_server_completely_started; +} + +void Context::setServerCompletelyStarted() +{ + auto lock = getLock(); + assert(global_context.lock().get() == this); + assert(!shared->is_server_completely_started); + assert(getApplicationType() == ApplicationType::SERVER); + shared->is_server_completely_started = true; +} + PartUUIDsPtr Context::getPartUUIDs() const { auto lock = getLock(); diff --git a/src/Interpreters/Context.h b/src/Interpreters/Context.h index 2d1ce023eca..b53e3945188 100644 --- a/src/Interpreters/Context.h +++ b/src/Interpreters/Context.h @@ -7,6 +7,7 @@ #include #include #include +#include #include #include #include @@ -25,6 +26,8 @@ #include #include #include + +#include #include @@ -80,6 +83,7 @@ class AsynchronousMetricLog; class OpenTelemetrySpanLog; class ZooKeeperLog; class SessionLog; +class TransactionsInfoLog; class ProcessorsProfileLog; struct MergeTreeSettings; class StorageS3Settings; @@ -313,6 +317,7 @@ private: /// A flag, used to distinguish between user query and internal query to a database engine (MaterializedPostgreSQL). bool is_internal_query = false; + inline static ContextPtr global_context_instance; public: // Top-level OpenTelemetry trace context for the query. 
Makes sense only for a query context. @@ -340,6 +345,11 @@ private: /// thousands of signatures. /// And I hope it will be replaced with more common Transaction sometime. + MergeTreeTransactionPtr merge_tree_transaction; /// Current transaction context. Can be inside session or query context. + /// It's shared with all children contexts. + MergeTreeTransactionHolder merge_tree_transaction_holder; /// It will rollback or commit transaction on Context destruction. + + /// Use copy constructor or createGlobal() instead Context(); Context(const Context &); Context & operator=(const Context &); @@ -634,6 +644,8 @@ public: ContextMutablePtr getGlobalContext() const; + static ContextPtr getGlobalContextInstance() { return global_context_instance; } + bool hasGlobalContext() const { return !global_context.expired(); } bool isGlobalContext() const { @@ -803,6 +815,7 @@ public: std::shared_ptr getOpenTelemetrySpanLog() const; std::shared_ptr getZooKeeperLog() const; std::shared_ptr getSessionLog() const; + std::shared_ptr getTransactionsInfoLog() const; std::shared_ptr getProcessorsProfileLog() const; /// Returns an object used to log operations with parts if it possible. @@ -891,6 +904,14 @@ public: /// Removes context of current distributed DDL. void resetZooKeeperMetadataTransaction(); + void checkTransactionsAreAllowed(bool explicit_tcl_query = false) const; + void initCurrentTransaction(MergeTreeTransactionPtr txn); + void setCurrentTransaction(MergeTreeTransactionPtr txn); + MergeTreeTransactionPtr getCurrentTransaction() const; + + bool isServerCompletelyStarted() const; + void setServerCompletelyStarted(); + PartUUIDsPtr getPartUUIDs() const; PartUUIDsPtr getIgnoredPartUUIDs() const; diff --git a/src/Interpreters/DDLTask.cpp b/src/Interpreters/DDLTask.cpp index 476da294789..3f43c5eb412 100644 --- a/src/Interpreters/DDLTask.cpp +++ b/src/Interpreters/DDLTask.cpp @@ -391,12 +391,7 @@ ContextMutablePtr DatabaseReplicatedTask::makeQueryContext(ContextPtr from_conte String DDLTaskBase::getLogEntryName(UInt32 log_entry_number) { - /// Sequential counter in ZooKeeper is Int32. 
- assert(log_entry_number < std::numeric_limits::max()); - constexpr size_t seq_node_digits = 10; - String number = toString(log_entry_number); - String name = "query-" + String(seq_node_digits - number.size(), '0') + number; - return name; + return zkutil::getSequentialNodeName("query-", log_entry_number); } UInt32 DDLTaskBase::getLogEntryNumber(const String & log_entry_name) diff --git a/src/Interpreters/IInterpreter.cpp index af0c06e7503..84fbfee7905 100644 --- a/src/Interpreters/IInterpreter.cpp +++ b/src/Interpreters/IInterpreter.cpp @@ -1,9 +1,16 @@ #include #include #include +#include namespace DB { + +namespace ErrorCodes +{ + extern const int NOT_IMPLEMENTED; +} + void IInterpreter::extendQueryLogElem( QueryLogElement & elem, const ASTPtr & ast, ContextPtr context, const String & query_database, const String & query_table) const { @@ -21,4 +28,18 @@ void IInterpreter::extendQueryLogElem( extendQueryLogElemImpl(elem, ast, context); } + +void IInterpreter::checkStorageSupportsTransactionsIfNeeded(const StoragePtr & storage, ContextPtr context) +{ + if (!context->getCurrentTransaction()) + return; + + if (storage->supportsTransactions()) + return; + + if (context->getSettingsRef().throw_on_unsupported_query_inside_transaction) + throw Exception(ErrorCodes::NOT_IMPLEMENTED, "Storage {} (table {}) does not support transactions", + storage->getName(), storage->getStorageID().getNameForLogs()); +} + } diff --git a/src/Interpreters/IInterpreter.h index 665a46190fd..74a568c5cba 100644 --- a/src/Interpreters/IInterpreter.h +++ b/src/Interpreters/IInterpreter.h @@ -3,6 +3,7 @@ #include #include #include +#include namespace DB { @@ -33,6 +34,13 @@ public: virtual void extendQueryLogElemImpl(QueryLogElement &, const ASTPtr &, ContextPtr) const {} + /// Returns true if transactions may be supported for this type of query. + /// If an Interpreter returns true, then it is responsible to check that the specific query with the specific Storage is supported. + virtual bool supportsTransactions() const { return false; } + + /// Helper function for some Interpreters.
+ static void checkStorageSupportsTransactionsIfNeeded(const StoragePtr & storage, ContextPtr context); + virtual ~IInterpreter() = default; }; diff --git a/src/Interpreters/InterpreterAlterQuery.cpp b/src/Interpreters/InterpreterAlterQuery.cpp index d01f2b05567..df1d6b8c92c 100644 --- a/src/Interpreters/InterpreterAlterQuery.cpp +++ b/src/Interpreters/InterpreterAlterQuery.cpp @@ -86,6 +86,7 @@ BlockIO InterpreterAlterQuery::executeToTable(const ASTAlterQuery & alter) } StoragePtr table = DatabaseCatalog::instance().getTable(table_id, getContext()); + checkStorageSupportsTransactionsIfNeeded(table, getContext()); if (table->isStaticStorage()) throw Exception(ErrorCodes::TABLE_IS_READ_ONLY, "Table is read-only"); auto table_lock = table->lockForShare(getContext()->getCurrentQueryId(), getContext()->getSettingsRef().lock_acquire_timeout); diff --git a/src/Interpreters/InterpreterAlterQuery.h b/src/Interpreters/InterpreterAlterQuery.h index 9494a400e7b..c6648ff9e7e 100644 --- a/src/Interpreters/InterpreterAlterQuery.h +++ b/src/Interpreters/InterpreterAlterQuery.h @@ -26,6 +26,8 @@ public: void extendQueryLogElemImpl(QueryLogElement & elem, const ASTPtr & ast, ContextPtr context) const override; + bool supportsTransactions() const override { return true; } + private: AccessRightsElements getRequiredAccess() const; diff --git a/src/Interpreters/InterpreterDescribeQuery.cpp b/src/Interpreters/InterpreterDescribeQuery.cpp index da5fcedd469..9919b1272bd 100644 --- a/src/Interpreters/InterpreterDescribeQuery.cpp +++ b/src/Interpreters/InterpreterDescribeQuery.cpp @@ -89,7 +89,7 @@ BlockIO InterpreterDescribeQuery::execute() auto table_lock = table->lockForShare(getContext()->getInitialQueryId(), settings.lock_acquire_timeout); auto metadata_snapshot = table->getInMemoryMetadataPtr(); - storage_snapshot = table->getStorageSnapshot(metadata_snapshot); + storage_snapshot = table->getStorageSnapshot(metadata_snapshot, getContext()); columns = metadata_snapshot->getColumns(); } diff --git a/src/Interpreters/InterpreterExplainQuery.cpp b/src/Interpreters/InterpreterExplainQuery.cpp index edca48d3600..529ff806180 100644 --- a/src/Interpreters/InterpreterExplainQuery.cpp +++ b/src/Interpreters/InterpreterExplainQuery.cpp @@ -9,6 +9,7 @@ #include #include #include +#include #include #include #include @@ -400,6 +401,23 @@ QueryPipeline InterpreterExplainQuery::executeImpl() override_info.appendTo(buf); break; } + case ASTExplainQuery::CurrentTransaction: + { + if (ast.getSettings()) + throw Exception("Settings are not supported for EXPLAIN CURRENT TRANSACTION query.", ErrorCodes::UNKNOWN_SETTING); + + if (auto txn = getContext()->getCurrentTransaction()) + { + String dump = txn->dumpDescription(); + buf.write(dump.data(), dump.size()); + } + else + { + writeCString("", buf); + } + + break; + } } if (insert_buf) { diff --git a/src/Interpreters/InterpreterExplainQuery.h b/src/Interpreters/InterpreterExplainQuery.h index a640b1c977c..ccfe8ec88a5 100644 --- a/src/Interpreters/InterpreterExplainQuery.h +++ b/src/Interpreters/InterpreterExplainQuery.h @@ -17,6 +17,8 @@ public: static Block getSampleBlock(ASTExplainQuery::ExplainKind kind); + bool supportsTransactions() const override { return true; } + private: ASTPtr query; diff --git a/src/Interpreters/InterpreterFactory.cpp b/src/Interpreters/InterpreterFactory.cpp index e9ee2b0910a..5dcee1eae05 100644 --- a/src/Interpreters/InterpreterFactory.cpp +++ b/src/Interpreters/InterpreterFactory.cpp @@ -19,6 +19,7 @@ #include #include #include +#include #include 
#include @@ -62,6 +63,7 @@ #include #include #include +#include #include #include @@ -278,6 +280,10 @@ std::unique_ptr InterpreterFactory::get(ASTPtr & query, ContextMut { return std::make_unique(query, context); } + else if (query->as()) + { + return std::make_unique(query, context); + } else if (query->as()) { return std::make_unique(query, context, true /*persist_function*/); diff --git a/src/Interpreters/InterpreterInsertQuery.cpp b/src/Interpreters/InterpreterInsertQuery.cpp index 49e63a91721..58ca9ccc978 100644 --- a/src/Interpreters/InterpreterInsertQuery.cpp +++ b/src/Interpreters/InterpreterInsertQuery.cpp @@ -287,6 +287,8 @@ BlockIO InterpreterInsertQuery::execute() QueryPipelineBuilder pipeline; StoragePtr table = getTable(query); + checkStorageSupportsTransactionsIfNeeded(table, getContext()); + StoragePtr inner_table; if (const auto * mv = dynamic_cast(table.get())) inner_table = mv->getTargetTable(); diff --git a/src/Interpreters/InterpreterInsertQuery.h b/src/Interpreters/InterpreterInsertQuery.h index 93de92a0680..51a3f0384aa 100644 --- a/src/Interpreters/InterpreterInsertQuery.h +++ b/src/Interpreters/InterpreterInsertQuery.h @@ -46,6 +46,8 @@ public: StoragePtr getTable(ASTInsertQuery & query); Block getSampleBlock(const ASTInsertQuery & query, const StoragePtr & table, const StorageMetadataPtr & metadata_snapshot) const; + bool supportsTransactions() const override { return true; } + private: Block getSampleBlock(const Names & names, const StoragePtr & table, const StorageMetadataPtr & metadata_snapshot) const; diff --git a/src/Interpreters/InterpreterKillQueryQuery.cpp b/src/Interpreters/InterpreterKillQueryQuery.cpp index 5ec6abb08a7..481355878aa 100644 --- a/src/Interpreters/InterpreterKillQueryQuery.cpp +++ b/src/Interpreters/InterpreterKillQueryQuery.cpp @@ -7,6 +7,7 @@ #include #include #include +#include #include #include #include @@ -358,6 +359,49 @@ BlockIO InterpreterKillQueryQuery::execute() break; } + case ASTKillQueryQuery::Type::Transaction: + { + getContext()->checkAccess(AccessType::KILL_TRANSACTION); + + Block transactions_block = getSelectResult("tid, tid_hash, elapsed, is_readonly, state", "system.transactions"); + + if (!transactions_block) + return res_io; + + const ColumnUInt64 & tid_hash_col = typeid_cast(*transactions_block.getByName("tid_hash").column); + + auto header = transactions_block.cloneEmpty(); + header.insert(0, {ColumnString::create(), std::make_shared(), "kill_status"}); + MutableColumns res_columns = header.cloneEmptyColumns(); + + for (size_t i = 0; i < transactions_block.rows(); ++i) + { + UInt64 tid_hash = tid_hash_col.getUInt(i); + + CancellationCode code = CancellationCode::Unknown; + if (!query.test) + { + auto txn = TransactionLog::instance().tryGetRunningTransaction(tid_hash); + if (txn) + { + txn->onException(); + if (txn->getState() == MergeTreeTransaction::ROLLED_BACK) + code = CancellationCode::CancelSent; + else + code = CancellationCode::CancelCannotBeSent; + } + else + { + code = CancellationCode::NotFound; + } + } + + insertResultRow(i, code, transactions_block, header, res_columns); + } + + res_io.pipeline = QueryPipeline(Pipe(std::make_shared(header.cloneWithColumns(std::move(res_columns))))); + break; + } } return res_io; diff --git a/src/Interpreters/InterpreterOptimizeQuery.cpp b/src/Interpreters/InterpreterOptimizeQuery.cpp index d4fe7604ced..83bf23ab4ad 100644 --- a/src/Interpreters/InterpreterOptimizeQuery.cpp +++ b/src/Interpreters/InterpreterOptimizeQuery.cpp @@ -31,8 +31,9 @@ BlockIO 
InterpreterOptimizeQuery::execute() auto table_id = getContext()->resolveStorageID(ast, Context::ResolveOrdinary); StoragePtr table = DatabaseCatalog::instance().getTable(table_id, getContext()); + checkStorageSupportsTransactionsIfNeeded(table, getContext()); auto metadata_snapshot = table->getInMemoryMetadataPtr(); - auto storage_snapshot = table->getStorageSnapshot(metadata_snapshot); + auto storage_snapshot = table->getStorageSnapshot(metadata_snapshot, getContext()); // Empty list of names means we deduplicate by all columns, but user can explicitly state which columns to use. Names column_names; diff --git a/src/Interpreters/InterpreterOptimizeQuery.h b/src/Interpreters/InterpreterOptimizeQuery.h index 8491fe8df49..932700e51b5 100644 --- a/src/Interpreters/InterpreterOptimizeQuery.h +++ b/src/Interpreters/InterpreterOptimizeQuery.h @@ -18,6 +18,8 @@ public: BlockIO execute() override; + bool supportsTransactions() const override { return true; } + private: AccessRightsElements getRequiredAccess() const; diff --git a/src/Interpreters/InterpreterSelectQuery.cpp b/src/Interpreters/InterpreterSelectQuery.cpp index 5091debbe72..270c7502ecd 100644 --- a/src/Interpreters/InterpreterSelectQuery.cpp +++ b/src/Interpreters/InterpreterSelectQuery.cpp @@ -329,12 +329,28 @@ InterpreterSelectQuery::InterpreterSelectQuery( if (!metadata_snapshot) metadata_snapshot = storage->getInMemoryMetadataPtr(); - storage_snapshot = storage->getStorageSnapshotForQuery(metadata_snapshot, query_ptr); + storage_snapshot = storage->getStorageSnapshotForQuery(metadata_snapshot, query_ptr, context); } if (has_input || !joined_tables.resolveTables()) joined_tables.makeFakeTable(storage, metadata_snapshot, source_header); + + if (context->getCurrentTransaction() && context->getSettingsRef().throw_on_unsupported_query_inside_transaction) + { + if (storage) + checkStorageSupportsTransactionsIfNeeded(storage, context); + for (const auto & table : joined_tables.tablesWithColumns()) + { + if (table.table.table.empty()) + continue; + auto maybe_storage = DatabaseCatalog::instance().tryGetTable({table.table.database, table.table.table}, context); + if (!maybe_storage) + continue; + checkStorageSupportsTransactionsIfNeeded(maybe_storage, context); + } + } + /// Rewrite JOINs if (!has_input && joined_tables.tablesCount() > 1) { @@ -1791,7 +1807,7 @@ void InterpreterSelectQuery::executeFetchColumns(QueryProcessingStage::Enum proc const auto & func = desc.function; std::optional num_rows{}; - if (!query.prewhere() && !query.where()) + if (!query.prewhere() && !query.where() && !context->getCurrentTransaction()) { num_rows = storage->totalRows(settings); } diff --git a/src/Interpreters/InterpreterSelectQuery.h b/src/Interpreters/InterpreterSelectQuery.h index 6bb12caff7d..ff3e8a1f706 100644 --- a/src/Interpreters/InterpreterSelectQuery.h +++ b/src/Interpreters/InterpreterSelectQuery.h @@ -104,6 +104,8 @@ public: Names getRequiredColumns() { return required_columns; } + bool supportsTransactions() const override { return true; } + private: InterpreterSelectQuery( const ASTPtr & query_ptr_, diff --git a/src/Interpreters/InterpreterSelectWithUnionQuery.h b/src/Interpreters/InterpreterSelectWithUnionQuery.h index 720632e7be5..adf8540d626 100644 --- a/src/Interpreters/InterpreterSelectWithUnionQuery.h +++ b/src/Interpreters/InterpreterSelectWithUnionQuery.h @@ -39,6 +39,8 @@ public: virtual void ignoreWithTotals() override; + bool supportsTransactions() const override { return true; } + private: std::vector> nested_interpreters; diff --git
a/src/Interpreters/InterpreterSetQuery.h b/src/Interpreters/InterpreterSetQuery.h index 9bd49708421..39d331100d6 100644 --- a/src/Interpreters/InterpreterSetQuery.h +++ b/src/Interpreters/InterpreterSetQuery.h @@ -25,6 +25,8 @@ public: */ void executeForCurrentContext(); + bool supportsTransactions() const override { return true; } + private: ASTPtr query_ptr; }; diff --git a/src/Interpreters/InterpreterSystemQuery.cpp b/src/Interpreters/InterpreterSystemQuery.cpp index b4664587a44..30f0f892ca4 100644 --- a/src/Interpreters/InterpreterSystemQuery.cpp +++ b/src/Interpreters/InterpreterSystemQuery.cpp @@ -31,6 +31,7 @@ #include #include #include +#include #include #include #include @@ -462,6 +463,7 @@ BlockIO InterpreterSystemQuery::execute() [&] { if (auto query_views_log = getContext()->getQueryViewsLog()) query_views_log->flush(true); }, [&] { if (auto zookeeper_log = getContext()->getZooKeeperLog()) zookeeper_log->flush(true); }, [&] { if (auto session_log = getContext()->getSessionLog()) session_log->flush(true); }, + [&] { if (auto transactions_info_log = getContext()->getTransactionsInfoLog()) transactions_info_log->flush(true); }, [&] { if (auto processors_profile_log = getContext()->getProcessorsProfileLog()) processors_profile_log->flush(true); } ); break; diff --git a/src/Interpreters/InterpreterTransactionControlQuery.cpp b/src/Interpreters/InterpreterTransactionControlQuery.cpp new file mode 100644 index 00000000000..61b2a4e865f --- /dev/null +++ b/src/Interpreters/InterpreterTransactionControlQuery.cpp @@ -0,0 +1,90 @@ +#include +#include +#include +#include + +namespace DB +{ + +namespace ErrorCodes +{ + extern const int LOGICAL_ERROR; + extern const int INVALID_TRANSACTION; +} + +BlockIO InterpreterTransactionControlQuery::execute() +{ + if (!query_context->hasSessionContext()) + throw Exception(ErrorCodes::INVALID_TRANSACTION, "Transaction Control Language queries are allowed only inside session"); + + ContextMutablePtr session_context = query_context->getSessionContext(); + const auto & tcl = query_ptr->as(); + + switch (tcl.action) + { + case ASTTransactionControl::BEGIN: + return executeBegin(session_context); + case ASTTransactionControl::COMMIT: + return executeCommit(session_context); + case ASTTransactionControl::ROLLBACK: + return executeRollback(session_context); + case ASTTransactionControl::SET_SNAPSHOT: + return executeSetSnapshot(session_context, tcl.snapshot); + } + assert(false); + __builtin_unreachable(); +} + +BlockIO InterpreterTransactionControlQuery::executeBegin(ContextMutablePtr session_context) +{ + if (session_context->getCurrentTransaction()) + throw Exception(ErrorCodes::INVALID_TRANSACTION, "Nested transactions are not supported"); + + session_context->checkTransactionsAreAllowed(/* explicit_tcl_query = */ true); + auto txn = TransactionLog::instance().beginTransaction(); + session_context->initCurrentTransaction(txn); + query_context->setCurrentTransaction(txn); + return {}; +} + +BlockIO InterpreterTransactionControlQuery::executeCommit(ContextMutablePtr session_context) +{ + auto txn = session_context->getCurrentTransaction(); + if (!txn) + throw Exception(ErrorCodes::INVALID_TRANSACTION, "There is no current transaction"); + if (txn->getState() != MergeTreeTransaction::RUNNING) + throw Exception(ErrorCodes::INVALID_TRANSACTION, "Transaction is not in RUNNING state"); + + TransactionLog::instance().commitTransaction(txn); + session_context->setCurrentTransaction(NO_TRANSACTION_PTR); + return {}; +} + +BlockIO 
InterpreterTransactionControlQuery::executeRollback(ContextMutablePtr session_context) +{ + auto txn = session_context->getCurrentTransaction(); + if (!txn) + throw Exception(ErrorCodes::INVALID_TRANSACTION, "There is no current transaction"); + if (txn->getState() == MergeTreeTransaction::COMMITTED) + throw Exception(ErrorCodes::LOGICAL_ERROR, "Transaction is in COMMITTED state"); + + if (txn->getState() == MergeTreeTransaction::RUNNING) + TransactionLog::instance().rollbackTransaction(txn); + session_context->setCurrentTransaction(NO_TRANSACTION_PTR); + return {}; +} + +BlockIO InterpreterTransactionControlQuery::executeSetSnapshot(ContextMutablePtr session_context, UInt64 snapshot) +{ + auto txn = session_context->getCurrentTransaction(); + if (!txn) + throw Exception(ErrorCodes::INVALID_TRANSACTION, "There is no current transaction"); + + if (snapshot <= Tx::MaxReservedCSN && snapshot != Tx::PrehistoricCSN && snapshot != Tx::EverythingVisibleCSN) + throw Exception(ErrorCodes::INVALID_TRANSACTION, "Cannot set snapshot to reserved CSN"); + + txn->setSnapshot(snapshot); + return {}; +} + +} diff --git a/src/Interpreters/InterpreterTransactionControlQuery.h b/src/Interpreters/InterpreterTransactionControlQuery.h new file mode 100644 index 00000000000..05d3068e095 --- /dev/null +++ b/src/Interpreters/InterpreterTransactionControlQuery.h @@ -0,0 +1,34 @@ +#pragma once +#include +#include + +namespace DB +{ + +class InterpreterTransactionControlQuery : public IInterpreter +{ +public: + InterpreterTransactionControlQuery(const ASTPtr & query_ptr_, ContextMutablePtr context_) + : query_context(context_) + , query_ptr(query_ptr_) + { + } + + BlockIO execute() override; + + bool ignoreQuota() const override { return true; } + bool ignoreLimits() const override { return true; } + bool supportsTransactions() const override { return true; } + +private: + BlockIO executeBegin(ContextMutablePtr session_context); + static BlockIO executeCommit(ContextMutablePtr session_context); + static BlockIO executeRollback(ContextMutablePtr session_context); + static BlockIO executeSetSnapshot(ContextMutablePtr session_context, UInt64 snapshot); + +private: + ContextMutablePtr query_context; + ASTPtr query_ptr; +}; + +} diff --git a/src/Interpreters/MergeTreeTransaction.cpp b/src/Interpreters/MergeTreeTransaction.cpp new file mode 100644 index 00000000000..7c1feb579e2 --- /dev/null +++ b/src/Interpreters/MergeTreeTransaction.cpp @@ -0,0 +1,351 @@ +#include +#include +#include +#include +#include + +namespace DB +{ + +namespace ErrorCodes +{ + extern const int INVALID_TRANSACTION; + extern const int LOGICAL_ERROR; +} + +static TableLockHolder getLockForOrdinary(const StoragePtr & storage) +{ + if (storage->getStorageID().uuid != UUIDHelpers::Nil) + return {}; + + /// Maybe we should just throw an exception and do not support Ordinary database? 
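+ /// Editorial note (assumption): a Nil UUID here means the table belongs to an Ordinary database, so the storage cannot be pinned by UUID; the transaction therefore holds a shared table lock until it finishes to keep the storage object alive.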
+ auto default_timeout = std::chrono::milliseconds(10 * 1000); + return storage->lockForShare(RWLockImpl::NO_QUERY, default_timeout); +} + +MergeTreeTransaction::MergeTreeTransaction(CSN snapshot_, LocalTID local_tid_, UUID host_id) + : tid({snapshot_, local_tid_, host_id}) + , snapshot(snapshot_) + , csn(Tx::UnknownCSN) +{ +} + +void MergeTreeTransaction::setSnapshot(CSN new_snapshot) +{ + snapshot = new_snapshot; +} + +MergeTreeTransaction::State MergeTreeTransaction::getState() const +{ + CSN c = csn.load(); + if (c == Tx::UnknownCSN || c == Tx::CommittingCSN) + return RUNNING; + if (c == Tx::RolledBackCSN) + return ROLLED_BACK; + return COMMITTED; +} + +void MergeTreeTransaction::checkIsNotCancelled() const +{ + CSN c = csn.load(); + if (c == Tx::RolledBackCSN) + throw Exception(ErrorCodes::INVALID_TRANSACTION, "Transaction was cancelled"); + else if (c != Tx::UnknownCSN) + throw Exception(ErrorCodes::LOGICAL_ERROR, "Unexpected CSN state: {}", c); +} + +void MergeTreeTransaction::addNewPart(const StoragePtr & storage, const DataPartPtr & new_part, MergeTreeTransaction * txn) +{ + /// Creation TID was written to data part earlier on part creation. + /// We only need to ensure that it's written and add part to in-memory set of new parts. + new_part->assertHasVersionMetadata(txn); + if (txn) + { + txn->addNewPart(storage, new_part); + /// Now we know actual part name and can write it to system log table. + tryWriteEventToSystemLog(new_part->version.log, TransactionsInfoLogElement::ADD_PART, txn->tid, TransactionInfoContext{storage->getStorageID(), new_part->name}); + } +} + +void MergeTreeTransaction::removeOldPart(const StoragePtr & storage, const DataPartPtr & part_to_remove, MergeTreeTransaction * txn) +{ + TransactionInfoContext transaction_context{storage->getStorageID(), part_to_remove->name}; + if (txn) + { + /// Lock part for removal and write current TID into version metadata file. + /// If server crash just after committing transactions + /// we will find this TID in version metadata and will finally remove part. + txn->removeOldPart(storage, part_to_remove, transaction_context); + } + else + { + /// Lock part for removal with special TID, so transactions will not try to remove it concurrently. + /// We lock it only in memory if part was not involved in any transactions. + part_to_remove->version.lockRemovalTID(Tx::PrehistoricTID, transaction_context); + if (part_to_remove->wasInvolvedInTransaction()) + part_to_remove->appendRemovalTIDToVersionMetadata(); + } +} + +void MergeTreeTransaction::addNewPartAndRemoveCovered(const StoragePtr & storage, const DataPartPtr & new_part, const DataPartsVector & covered_parts, MergeTreeTransaction * txn) +{ + TransactionID tid = txn ? 
txn->tid : Tx::PrehistoricTID; + TransactionInfoContext transaction_context{storage->getStorageID(), new_part->name}; + tryWriteEventToSystemLog(new_part->version.log, TransactionsInfoLogElement::ADD_PART, tid, transaction_context); + transaction_context.covering_part = std::move(transaction_context.part_name); + new_part->assertHasVersionMetadata(txn); + + if (txn) + { + txn->addNewPart(storage, new_part); + for (const auto & covered : covered_parts) + { + transaction_context.part_name = covered->name; + txn->removeOldPart(storage, covered, transaction_context); + } + } + else + { + for (const auto & covered : covered_parts) + { + transaction_context.part_name = covered->name; + covered->version.lockRemovalTID(tid, transaction_context); + } + } +} + +void MergeTreeTransaction::addNewPart(const StoragePtr & storage, const DataPartPtr & new_part) +{ + auto maybe_lock = getLockForOrdinary(storage); + std::lock_guard lock{mutex}; + checkIsNotCancelled(); + storages.insert(storage); + if (maybe_lock) + table_read_locks_for_ordinary_db.emplace_back(std::move(maybe_lock)); + creating_parts.push_back(new_part); +} + +void MergeTreeTransaction::removeOldPart(const StoragePtr & storage, const DataPartPtr & part_to_remove, const TransactionInfoContext & context) +{ + auto maybe_lock = getLockForOrdinary(storage); + + { + std::lock_guard lock{mutex}; + checkIsNotCancelled(); + + LockMemoryExceptionInThread lock_memory_tracker(VariableContext::Global); + part_to_remove->version.lockRemovalTID(tid, context); + storages.insert(storage); + if (maybe_lock) + table_read_locks_for_ordinary_db.emplace_back(std::move(maybe_lock)); + removing_parts.push_back(part_to_remove); + } + + part_to_remove->appendRemovalTIDToVersionMetadata(); +} + +void MergeTreeTransaction::addMutation(const StoragePtr & table, const String & mutation_id) +{ + auto maybe_lock = getLockForOrdinary(table); + std::lock_guard lock{mutex}; + checkIsNotCancelled(); + storages.insert(table); + if (maybe_lock) + table_read_locks_for_ordinary_db.emplace_back(std::move(maybe_lock)); + mutations.emplace_back(table, mutation_id); +} + +bool MergeTreeTransaction::isReadOnly() const +{ + std::lock_guard lock{mutex}; + assert((creating_parts.empty() && removing_parts.empty() && mutations.empty()) == storages.empty()); + return storages.empty(); +} + +scope_guard MergeTreeTransaction::beforeCommit() +{ + RunningMutationsList mutations_to_wait; + { + std::lock_guard lock{mutex}; + mutations_to_wait = mutations; + } + + /// We should wait for mutations to finish before committing transaction, because some mutation may fail and cause rollback. 
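+ /// Note that the list is copied under the mutex and the (possibly long) wait happens without holding it; the assert below verifies that no new mutations were added to this transaction in the meantime.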
+ for (const auto & table_and_mutation : mutations_to_wait) + table_and_mutation.first->waitForMutation(table_and_mutation.second); + + assert([&]() + { + std::lock_guard lock{mutex}; + return mutations == mutations_to_wait; + }()); + + CSN expected = Tx::UnknownCSN; + bool can_commit = csn.compare_exchange_strong(expected, Tx::CommittingCSN); + if (!can_commit) + { + /// Transaction was concurrently cancelled by KILL TRANSACTION or KILL MUTATION + if (expected == Tx::RolledBackCSN) + throw Exception(ErrorCodes::INVALID_TRANSACTION, "Transaction was cancelled"); + throw Exception(ErrorCodes::LOGICAL_ERROR, "Unexpected CSN state: {}", expected); + } + + /// We should set CSN back to Unknown if we will fail to commit transaction for some reason (connection loss, etc) + return [this]() + { + CSN expected_value = Tx::CommittingCSN; + csn.compare_exchange_strong(expected_value, Tx::UnknownCSN); + }; +} + +void MergeTreeTransaction::afterCommit(CSN assigned_csn) noexcept +{ + /// Write allocated CSN into version metadata, so we will know CSN without reading it from transaction log + /// and we will be able to remove old entries from transaction log in ZK. + /// It's not a problem if server crash before CSN is written, because we already have TID in data part and entry in the log. + [[maybe_unused]] CSN prev_value = csn.exchange(assigned_csn); + assert(prev_value == Tx::CommittingCSN); + for (const auto & part : creating_parts) + { + part->version.creation_csn.store(csn); + part->appendCSNToVersionMetadata(VersionMetadata::WhichCSN::CREATION); + } + + for (const auto & part : removing_parts) + { + part->version.removal_csn.store(csn); + part->appendCSNToVersionMetadata(VersionMetadata::WhichCSN::REMOVAL); + } + + for (const auto & storage_and_mutation : mutations) + storage_and_mutation.first->setMutationCSN(storage_and_mutation.second, csn); +} + +bool MergeTreeTransaction::rollback() noexcept +{ + CSN expected = Tx::UnknownCSN; + bool need_rollback = csn.compare_exchange_strong(expected, Tx::RolledBackCSN); + + /// Check that it was not rolled back concurrently + if (!need_rollback) + return false; + + /// It's not a problem if server crash at this point + /// because on startup we will see that TID is not committed and will simply discard these changes. 
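+ /// Summary of the steps below (added for clarity): snapshot the change lists under the mutex, kill the transaction's running mutations, remove the parts it created from the working set, and restore the parts it had removed.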
+ + RunningMutationsList mutations_to_kill; + DataPartsVector parts_to_remove; + DataPartsVector parts_to_activate; + + { + std::lock_guard lock{mutex}; + mutations_to_kill = mutations; + parts_to_remove = creating_parts; + parts_to_activate = removing_parts; + } + + /// Forcefully stop related mutations if any + for (const auto & table_and_mutation : mutations_to_kill) + table_and_mutation.first->killMutation(table_and_mutation.second); + + /// Discard changes in active parts set + /// Remove parts that were created, restore parts that were removed (except parts that were created by this transaction too) + for (const auto & part : parts_to_remove) + { + if (part->version.isRemovalTIDLocked()) + { + /// Don't need to remove part from working set if it was created and removed by this transaction + assert(part->version.removal_tid_lock == tid.getHash()); + continue; + } + /// FIXME do not lock removal_tid when rolling back part creation, it's ugly + const_cast(part->storage).removePartsFromWorkingSet(NO_TRANSACTION_RAW, {part}, true); + } + + for (const auto & part : parts_to_activate) + if (part->version.getCreationTID() != tid) + const_cast(part->storage).restoreAndActivatePart(part); + + /// Kind of optimization: cleanup thread can remove these parts immediately + for (const auto & part : parts_to_remove) + { + part->version.creation_csn.store(Tx::RolledBackCSN); + /// Write special RolledBackCSN, so we will be able to cleanup transaction log + part->appendCSNToVersionMetadata(VersionMetadata::CREATION); + } + + for (const auto & part : parts_to_activate) + { + /// Clear removal_tid from version metadata file, so we will not need to distinguish TIDs that were not committed + /// and TIDs that were committed long time ago and were removed from the log on log cleanup. 
+ part->appendRemovalTIDToVersionMetadata(/* clear */ true); + part->version.unlockRemovalTID(tid, TransactionInfoContext{part->storage.getStorageID(), part->name}); + } + + + assert([&]() + { + std::lock_guard lock{mutex}; + assert(mutations_to_kill == mutations); + assert(parts_to_remove == creating_parts); + assert(parts_to_activate == removing_parts); + return csn == Tx::RolledBackCSN; + }()); + + return true; +} + +void MergeTreeTransaction::onException() +{ + TransactionLog::instance().rollbackTransaction(shared_from_this()); +} + +String MergeTreeTransaction::dumpDescription() const +{ + String res = fmt::format("{} state: {}, snapshot: {}", tid, getState(), snapshot); + + if (isReadOnly()) + { + res += ", readonly"; + return res; + } + + std::lock_guard lock{mutex}; + + res += fmt::format(", affects {} tables:", storages.size()); + + using ChangesInTable = std::tuple; + std::unordered_map storage_to_changes; + + for (const auto & part : creating_parts) + std::get<0>(storage_to_changes[&(part->storage)]).push_back(part->name); + + for (const auto & part : removing_parts) + { + String info = fmt::format("{} (created by {}, {})", part->name, part->version.getCreationTID(), part->version.creation_csn); + std::get<1>(storage_to_changes[&(part->storage)]).push_back(std::move(info)); + assert(!part->version.creation_csn || part->version.creation_csn <= snapshot); + } + + for (const auto & mutation : mutations) + std::get<2>(storage_to_changes[mutation.first.get()]).push_back(mutation.second); + + for (const auto & storage_changes : storage_to_changes) + { + res += fmt::format("\n\t{}:", storage_changes.first->getStorageID().getNameForLogs()); + const auto & creating_info = std::get<0>(storage_changes.second); + const auto & removing_info = std::get<1>(storage_changes.second); + const auto & mutations_info = std::get<2>(storage_changes.second); + + if (!creating_info.empty()) + res += fmt::format("\n\t\tcreating parts:\n\t\t\t{}", fmt::join(creating_info, "\n\t\t\t")); + if (!removing_info.empty()) + res += fmt::format("\n\t\tremoving parts:\n\t\t\t{}", fmt::join(removing_info, "\n\t\t\t")); + if (!mutations_info.empty()) + res += fmt::format("\n\t\tmutations:\n\t\t\t{}", fmt::join(mutations_info, "\n\t\t\t")); + } + + return res; +} + +} diff --git a/src/Interpreters/MergeTreeTransaction.h b/src/Interpreters/MergeTreeTransaction.h new file mode 100644 index 00000000000..7ebea450dd0 --- /dev/null +++ b/src/Interpreters/MergeTreeTransaction.h @@ -0,0 +1,84 @@ +#pragma once +#include +#include +#include +#include +#include +#include + +#include +#include + +namespace DB +{ + +class IMergeTreeDataPart; +using DataPartPtr = std::shared_ptr; +using DataPartsVector = std::vector; + +/// This object is responsible for tracking all changes that some transaction is making in MergeTree tables. +/// It collects all changes that queries of the current transaction made in data part sets of all MergeTree tables +/// to either make them visible when the transaction commits or undo them when the transaction rolls back.
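+/// Rough lifecycle sketch (illustrative, based on the code in this patch): TransactionLog::instance().beginTransaction() creates the object, queries register their changes via addNewPart() / removeOldPart() / addMutation(), and the transaction finishes either with TransactionLog::instance().commitTransaction(txn), which assigns a CSN and makes the changes visible, or with rollbackTransaction(txn), which undoes them (BEGIN TRANSACTION / COMMIT / ROLLBACK on the SQL side).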
+class MergeTreeTransaction : public std::enable_shared_from_this, private boost::noncopyable +{ + friend class TransactionLog; +public: + enum State + { + RUNNING, + COMMITTED, + ROLLED_BACK, + }; + + CSN getSnapshot() const { return snapshot; } + void setSnapshot(CSN new_snapshot); + State getState() const; + + const TransactionID tid; + + MergeTreeTransaction(CSN snapshot_, LocalTID local_tid_, UUID host_id); + + void addNewPart(const StoragePtr & storage, const DataPartPtr & new_part); + void removeOldPart(const StoragePtr & storage, const DataPartPtr & part_to_remove, const TransactionInfoContext & context); + + void addMutation(const StoragePtr & table, const String & mutation_id); + + static void addNewPart(const StoragePtr & storage, const DataPartPtr & new_part, MergeTreeTransaction * txn); + static void removeOldPart(const StoragePtr & storage, const DataPartPtr & part_to_remove, MergeTreeTransaction * txn); + static void addNewPartAndRemoveCovered(const StoragePtr & storage, const DataPartPtr & new_part, const DataPartsVector & covered_parts, MergeTreeTransaction * txn); + + bool isReadOnly() const; + + void onException(); + + String dumpDescription() const; + + Float64 elapsedSeconds() const { return elapsed.elapsedSeconds(); } + +private: + scope_guard beforeCommit(); + void afterCommit(CSN assigned_csn) noexcept; + bool rollback() noexcept; + void checkIsNotCancelled() const; + + mutable std::mutex mutex; + Stopwatch elapsed; + + /// Usually it's equal to tid.start_csn, but can be changed by SET SNAPSHOT query (for introspection purposes and time-traveling) + CSN snapshot; + std::list::iterator snapshot_in_use_it; + + /// Lists of changes made by transaction + std::unordered_set storages; + std::vector table_read_locks_for_ordinary_db; + DataPartsVector creating_parts; + DataPartsVector removing_parts; + using RunningMutationsList = std::vector>; + RunningMutationsList mutations; + + std::atomic csn; +}; + +using MergeTreeTransactionPtr = std::shared_ptr; + +} diff --git a/src/Interpreters/MergeTreeTransactionHolder.cpp b/src/Interpreters/MergeTreeTransactionHolder.cpp new file mode 100644 index 00000000000..bf63a471282 --- /dev/null +++ b/src/Interpreters/MergeTreeTransactionHolder.cpp @@ -0,0 +1,84 @@ +#include +#include +#include +#include + +namespace DB +{ + +namespace ErrorCodes +{ + extern const int LOGICAL_ERROR; +} + +MergeTreeTransactionHolder::MergeTreeTransactionHolder(const MergeTreeTransactionPtr & txn_, bool autocommit_ = false, const Context * owned_by_session_context_) + : txn(txn_) + , autocommit(autocommit_) + , owned_by_session_context(owned_by_session_context_) +{ + assert(!txn || txn->getState() == MergeTreeTransaction::RUNNING); + assert(!owned_by_session_context || owned_by_session_context == owned_by_session_context->getSessionContext().get()); +} + +MergeTreeTransactionHolder::MergeTreeTransactionHolder(MergeTreeTransactionHolder && rhs) noexcept +{ + *this = std::move(rhs); +} + +MergeTreeTransactionHolder & MergeTreeTransactionHolder::operator=(MergeTreeTransactionHolder && rhs) noexcept +{ + onDestroy(); + txn = NO_TRANSACTION_PTR; + autocommit = false; + owned_by_session_context = nullptr; + std::swap(txn, rhs.txn); + std::swap(autocommit, rhs.autocommit); + std::swap(owned_by_session_context, rhs.owned_by_session_context); + return *this; +} + +MergeTreeTransactionHolder::~MergeTreeTransactionHolder() +{ + onDestroy(); +} + +void MergeTreeTransactionHolder::onDestroy() noexcept +{ + if (!txn) + return; + if (txn->getState() != 
+        return;
+
+    if (autocommit && std::uncaught_exceptions() == 0)
+    {
+        try
+        {
+            TransactionLog::instance().commitTransaction(txn);
+            return;
+        }
+        catch (...)
+        {
+            tryLogCurrentException(__PRETTY_FUNCTION__);
+        }
+    }
+
+    TransactionLog::instance().rollbackTransaction(txn);
+}
+
+MergeTreeTransactionHolder::MergeTreeTransactionHolder(const MergeTreeTransactionHolder & rhs)
+{
+    *this = rhs;
+}
+
+MergeTreeTransactionHolder & MergeTreeTransactionHolder::operator=(const MergeTreeTransactionHolder & rhs) // NOLINT
+{
+    if (rhs.txn && !rhs.owned_by_session_context)
+        throw Exception(ErrorCodes::LOGICAL_ERROR,
+                        "Tried to copy non-empty MergeTreeTransactionHolder that is not owned by session context. It's a bug");
+    assert(!txn);
+    assert(!autocommit);
+    assert(!owned_by_session_context);
+    return *this;
+}
+
+}
diff --git a/src/Interpreters/MergeTreeTransactionHolder.h b/src/Interpreters/MergeTreeTransactionHolder.h
new file mode 100644
index 00000000000..4e8a196f4d2
--- /dev/null
+++ b/src/Interpreters/MergeTreeTransactionHolder.h
@@ -0,0 +1,42 @@
+#pragma once
+#include
+
+namespace DB
+{
+
+class Context;
+
+class MergeTreeTransaction;
+/// TODO maybe replace with raw pointer? It should not be shared, only MergeTreeTransactionHolder can own a transaction object
+using MergeTreeTransactionPtr = std::shared_ptr<MergeTreeTransaction>;
+
+/// Owns a MergeTreeTransaction object.
+/// Rolls back a transaction in dtor if it was not committed.
+/// If `autocommit` flag is true, then it commits transaction if dtor is called normally
+/// or rolls it back if dtor was called due to an exception.
+class MergeTreeTransactionHolder
+{
+public:
+    MergeTreeTransactionHolder() = default;
+    MergeTreeTransactionHolder(const MergeTreeTransactionPtr & txn_, bool autocommit_, const Context * owned_by_session_context_ = nullptr);
+    MergeTreeTransactionHolder(MergeTreeTransactionHolder && rhs) noexcept;
+    MergeTreeTransactionHolder & operator=(MergeTreeTransactionHolder && rhs) noexcept;
+    ~MergeTreeTransactionHolder();
+
+    /// NOTE: We cannot make it noncopyable, because we use it as a field of Context.
+    /// So the following copy constructor and operator do not copy anything,
+    /// they just leave txn nullptr.
+    MergeTreeTransactionHolder(const MergeTreeTransactionHolder & rhs);
+    MergeTreeTransactionHolder & operator=(const MergeTreeTransactionHolder & rhs);
+
+    MergeTreeTransactionPtr getTransaction() const { return txn; }
+
+private:
+    void onDestroy() noexcept;
+
+    MergeTreeTransactionPtr txn;
+    bool autocommit = false;
+    const Context * owned_by_session_context = nullptr;
+};
+
+}
diff --git a/src/Interpreters/MutationsInterpreter.cpp b/src/Interpreters/MutationsInterpreter.cpp
index f46333dc00a..2c03f109fe4 100644
--- a/src/Interpreters/MutationsInterpreter.cpp
+++ b/src/Interpreters/MutationsInterpreter.cpp
@@ -802,7 +802,7 @@ ASTPtr MutationsInterpreter::prepareInterpreterSelectQuery(std::vector &
 /// e.g.
ALTER referencing the same table in scalar subquery bool execute_scalar_subqueries = !dry_run; auto syntax_result = TreeRewriter(context).analyze( - all_asts, all_columns, storage, storage->getStorageSnapshot(metadata_snapshot), + all_asts, all_columns, storage, storage->getStorageSnapshot(metadata_snapshot, context), false, true, execute_scalar_subqueries); if (execute_scalar_subqueries && context->hasQueryContext()) diff --git a/src/Interpreters/QueryLog.cpp b/src/Interpreters/QueryLog.cpp index 5adca8f0e79..03eeb81f14d 100644 --- a/src/Interpreters/QueryLog.cpp +++ b/src/Interpreters/QueryLog.cpp @@ -117,7 +117,9 @@ NamesAndTypesList QueryLogElement::getNamesAndTypes() {"used_formats", std::make_shared(std::make_shared())}, {"used_functions", std::make_shared(std::make_shared())}, {"used_storages", std::make_shared(std::make_shared())}, - {"used_table_functions", std::make_shared(std::make_shared())} + {"used_table_functions", std::make_shared(std::make_shared())}, + + {"transaction_id", getTransactionIDDataType()}, }; } @@ -257,6 +259,8 @@ void QueryLogElement::appendToBlock(MutableColumns & columns) const fill_column(used_storages, column_storage_factory_objects); fill_column(used_table_functions, column_table_function_factory_objects); } + + columns[i++]->insert(Tuple{tid.start_csn, tid.local_tid, tid.host_id}); } void QueryLogElement::appendClientInfo(const ClientInfo & client_info, MutableColumns & columns, size_t & i) diff --git a/src/Interpreters/QueryLog.h b/src/Interpreters/QueryLog.h index f015afb9249..651769cbab6 100644 --- a/src/Interpreters/QueryLog.h +++ b/src/Interpreters/QueryLog.h @@ -5,6 +5,7 @@ #include #include #include +#include namespace ProfileEvents { @@ -85,6 +86,8 @@ struct QueryLogElement std::shared_ptr profile_counters; std::shared_ptr query_settings; + TransactionID tid; + static std::string name() { return "QueryLog"; } static NamesAndTypesList getNamesAndTypes(); diff --git a/src/Interpreters/SystemLog.cpp b/src/Interpreters/SystemLog.cpp index 0f8c782463b..f079e41851a 100644 --- a/src/Interpreters/SystemLog.cpp +++ b/src/Interpreters/SystemLog.cpp @@ -11,6 +11,7 @@ #include #include #include +#include #include #include #include @@ -202,6 +203,8 @@ SystemLogs::SystemLogs(ContextPtr global_context, const Poco::Util::AbstractConf query_views_log = createSystemLog(global_context, "system", "query_views_log", config, "query_views_log"); zookeeper_log = createSystemLog(global_context, "system", "zookeeper_log", config, "zookeeper_log"); session_log = createSystemLog(global_context, "system", "session_log", config, "session_log"); + transactions_info_log = createSystemLog( + global_context, "system", "transactions_info_log", config, "transactions_info_log"); processors_profile_log = createSystemLog(global_context, "system", "processors_profile_log", config, "processors_profile_log"); if (query_log) @@ -228,6 +231,8 @@ SystemLogs::SystemLogs(ContextPtr global_context, const Poco::Util::AbstractConf logs.emplace_back(zookeeper_log.get()); if (session_log) logs.emplace_back(session_log.get()); + if (transactions_info_log) + logs.emplace_back(transactions_info_log.get()); if (processors_profile_log) logs.emplace_back(processors_profile_log.get()); @@ -544,6 +549,7 @@ ASTPtr SystemLog::getCreateTableQuery() return create; } + #define INSTANTIATE_SYSTEM_LOG(ELEMENT) template class SystemLog; SYSTEM_LOG_ELEMENTS(INSTANTIATE_SYSTEM_LOG) diff --git a/src/Interpreters/SystemLog.h b/src/Interpreters/SystemLog.h index 4ad6a0666bb..b5135e8a73a 100644 --- 
a/src/Interpreters/SystemLog.h
+++ b/src/Interpreters/SystemLog.h
@@ -43,6 +43,7 @@ class OpenTelemetrySpanLog;
 class QueryViewsLog;
 class ZooKeeperLog;
 class SessionLog;
+class TransactionsInfoLog;
 class ProcessorsProfileLog;
 
 /// System logs should be destroyed in destructor of the last Context and before tables,
@@ -71,6 +72,8 @@ struct SystemLogs
     std::shared_ptr<ZooKeeperLog> zookeeper_log;
     /// Login, LogOut and Login failure events
     std::shared_ptr<SessionLog> session_log;
+    /// Events related to transactions
+    std::shared_ptr<TransactionsInfoLog> transactions_info_log;
     /// Used to log processors profiling
     std::shared_ptr<ProcessorsProfileLog> processors_profile_log;
 
diff --git a/src/Interpreters/TransactionLog.cpp b/src/Interpreters/TransactionLog.cpp
new file mode 100644
index 00000000000..393a8aa848b
--- /dev/null
+++ b/src/Interpreters/TransactionLog.cpp
@@ -0,0 +1,484 @@
+#include
+#include
+#include
+#include
+#include
+#include
+#include
+#include
+#include
+#include
+#include
+
+
+/// It's used in critical places to exit on unexpected exceptions.
+/// SIGABRT is usually better than a broken state in memory with unpredictable consequences.
+#define NOEXCEPT_SCOPE SCOPE_EXIT({ if (std::uncaught_exceptions()) { tryLogCurrentException("NOEXCEPT_SCOPE"); abort(); } })
+
+namespace DB
+{
+
+namespace ErrorCodes
+{
+    extern const int LOGICAL_ERROR;
+}
+
+static void tryWriteEventToSystemLog(Poco::Logger * log, ContextPtr context,
+    TransactionsInfoLogElement::Type type, const TransactionID & tid, CSN csn = Tx::UnknownCSN)
+try
+{
+    auto system_log = context->getTransactionsInfoLog();
+    if (!system_log)
+        return;
+
+    TransactionsInfoLogElement elem;
+    elem.type = type;
+    elem.tid = tid;
+    elem.csn = csn;
+    elem.fillCommonFields(nullptr);
+    system_log->add(elem);
+}
+catch (...)
+{
+    tryLogCurrentException(log);
+}
+
+
+TransactionLog::TransactionLog()
+    : log(&Poco::Logger::get("TransactionLog"))
+{
+    global_context = Context::getGlobalContextInstance();
+    global_context->checkTransactionsAreAllowed();
+
+    zookeeper_path = global_context->getConfigRef().getString("transaction_log.zookeeper_path", "/clickhouse/txn");
+    zookeeper_path_log = zookeeper_path + "/log";
+
+    loadLogFromZooKeeper();
+
+    updating_thread = ThreadFromGlobalPool(&TransactionLog::runUpdatingThread, this);
+}
+
+TransactionLog::~TransactionLog()
+{
+    shutdown();
+}
+
+void TransactionLog::shutdown()
+{
+    if (stop_flag.exchange(true))
+        return;
+    log_updated_event->set();
+    latest_snapshot.notify_all();
+    updating_thread.join();
+
+    std::lock_guard lock{mutex};
+    /// This is required to... you'll never guess - avoid race condition inside Poco::Logger (Coordination::ZooKeeper::log)
+    zookeeper.reset();
+}
+
+ZooKeeperPtr TransactionLog::getZooKeeper() const
+{
+    std::lock_guard lock{mutex};
+    return zookeeper;
+}
+
+UInt64 TransactionLog::deserializeCSN(const String & csn_node_name)
+{
+    ReadBufferFromString buf{csn_node_name};
+    assertString("csn-", buf);
+    UInt64 res;
+    readText(res, buf);
+    assertEOF(buf);
+    return res;
+}
+
+String TransactionLog::serializeCSN(CSN csn)
+{
+    return zkutil::getSequentialNodeName("csn-", csn);
+}
+
+TransactionID TransactionLog::deserializeTID(const String & csn_node_content)
+{
+    TransactionID tid = Tx::EmptyTID;
+    if (csn_node_content.empty())
+        return tid;
+
+    ReadBufferFromString buf{csn_node_content};
+    tid = TransactionID::read(buf);
+    assertEOF(buf);
+    return tid;
+}
+
+String TransactionLog::serializeTID(const TransactionID & tid)
+{
+    WriteBufferFromOwnString buf;
+    TransactionID::write(tid, buf);
+    return buf.str();
+}
+
+
+void TransactionLog::loadEntries(Strings::const_iterator beg, Strings::const_iterator end)
+{
+    std::vector<std::future<Coordination::GetResponse>> futures;
+    size_t entries_count = std::distance(beg, end);
+    if (!entries_count)
+        return;
+
+    String last_entry = *std::prev(end);
+    LOG_TRACE(log, "Loading {} entries from {}: {}..{}", entries_count, zookeeper_path_log, *beg, last_entry);
+    futures.reserve(entries_count);
+    for (auto it = beg; it != end; ++it)
+        futures.emplace_back(zookeeper->asyncGet(fs::path(zookeeper_path_log) / *it));
+
+    std::vector<std::pair<TIDHash, CSNEntry>> loaded;
+    loaded.reserve(entries_count);
+    auto it = beg;
+    for (size_t i = 0; i < entries_count; ++i, ++it)
+    {
+        auto res = futures[i].get();
+        CSN csn = deserializeCSN(*it);
+        TransactionID tid = deserializeTID(res.data);
+        loaded.emplace_back(tid.getHash(), CSNEntry{csn, tid});
+        LOG_TEST(log, "Got entry {} -> {}", tid, csn);
+    }
+    futures.clear();
+
+    NOEXCEPT_SCOPE;
+    LockMemoryExceptionInThread lock_memory_tracker(VariableContext::Global);
+    std::lock_guard lock{mutex};
+    for (const auto & entry : loaded)
+    {
+        if (entry.first == Tx::EmptyTID.getHash())
+            continue;
+
+        tid_to_csn.emplace(entry.first, entry.second);
+    }
+    last_loaded_entry = last_entry;
+    latest_snapshot = loaded.back().second.csn;
+    local_tid_counter = Tx::MaxReservedLocalTID;
+}
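Node names and CSNs must round-trip through the two helpers above, and zkutil::getSequentialNodeName produces zero-padded suffixes. A standalone restatement of that pairing (plain C++ stand-ins, not the real functions; the 10-digit padding is an assumption matching ZooKeeper's sequential-node format):

    // Self-contained model of the "csn-<seq>" naming used by the log.
    #include <cassert>
    #include <cstdint>
    #include <cstdio>
    #include <string>

    static std::string serializeCSNModel(uint64_t csn)
    {
        char buf[32];
        std::snprintf(buf, sizeof(buf), "csn-%010llu", static_cast<unsigned long long>(csn));
        return buf;
    }

    static uint64_t deserializeCSNModel(const std::string & node_name)
    {
        assert(node_name.rfind("csn-", 0) == 0);    /// name must start with "csn-"
        return std::stoull(node_name.substr(4));    /// numeric suffix is the CSN
    }

    int main()
    {
        assert(serializeCSNModel(123) == "csn-0000000123");
        assert(deserializeCSNModel(serializeCSNModel(123)) == 123);
    }
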
+void TransactionLog::loadLogFromZooKeeper()
+{
+    assert(!zookeeper);
+    assert(tid_to_csn.empty());
+    assert(last_loaded_entry.empty());
+    zookeeper = global_context->getZooKeeper();
+
+    /// We do not write local_tid_counter to disk or zk and maintain it only in memory.
+    /// Create empty entry to allocate new CSN to safely start counting from the beginning and avoid TID duplication.
+    /// TODO It's possible to skip this step in some cases (especially for multi-host configuration).
+    Coordination::Error code = zookeeper->tryCreate(zookeeper_path_log + "/csn-", "", zkutil::CreateMode::PersistentSequential);
+    if (code != Coordination::Error::ZOK)
+    {
+        /// Log probably does not exist, create it
+        assert(code == Coordination::Error::ZNONODE);
+        zookeeper->createAncestors(zookeeper_path_log);
+        Coordination::Requests ops;
+        ops.emplace_back(zkutil::makeCreateRequest(zookeeper_path + "/tail_ptr", serializeCSN(Tx::MaxReservedCSN), zkutil::CreateMode::Persistent));
+        ops.emplace_back(zkutil::makeCreateRequest(zookeeper_path_log, "", zkutil::CreateMode::Persistent));
+
+        /// Fast-forward sequential counter to skip reserved CSNs
+        for (size_t i = 0; i <= Tx::MaxReservedCSN; ++i)
+            ops.emplace_back(zkutil::makeCreateRequest(zookeeper_path_log + "/csn-", "", zkutil::CreateMode::PersistentSequential));
+        Coordination::Responses res;
+        code = zookeeper->tryMulti(ops, res);
+        if (code != Coordination::Error::ZNODEEXISTS)
+            zkutil::KeeperMultiException::check(code, ops, res);
+    }
+
+    /// TODO Split log into "subdirectories" to:
+    /// 1. fetch it in a more optimal way (avoid listing all CSNs on further incremental updates)
+    /// 2. simplify log rotation
+    /// 3. support 64-bit CSNs on top of Apache ZooKeeper (it uses Int32 for sequential numbers)
+    Strings entries_list = zookeeper->getChildren(zookeeper_path_log, nullptr, log_updated_event);
+    assert(!entries_list.empty());
+    std::sort(entries_list.begin(), entries_list.end());
+    loadEntries(entries_list.begin(), entries_list.end());
+    assert(!last_loaded_entry.empty());
+    assert(latest_snapshot == deserializeCSN(last_loaded_entry));
+    local_tid_counter = Tx::MaxReservedLocalTID;
+
+    tail_ptr = deserializeCSN(zookeeper->get(zookeeper_path + "/tail_ptr"));
+}
+
+void TransactionLog::runUpdatingThread()
+{
+    while (true)
+    {
+        try
+        {
+            log_updated_event->wait();
+            if (stop_flag.load())
+                return;
+
+            if (!zookeeper)
+            {
+                auto new_zookeeper = global_context->getZooKeeper();
+                std::lock_guard lock{mutex};
+                zookeeper = new_zookeeper;
+            }
+
+            loadNewEntries();
+            removeOldEntries();
+        }
+        catch (const Coordination::Exception & e)
+        {
+            tryLogCurrentException(log);
+            /// TODO better backoff
+            std::this_thread::sleep_for(std::chrono::milliseconds(1000));
+            if (Coordination::isHardwareError(e.code))
+            {
+                std::lock_guard lock{mutex};
+                zookeeper.reset();
+            }
+            log_updated_event->set();
+        }
+        catch (...)
+        {
+            tryLogCurrentException(log);
+            std::this_thread::sleep_for(std::chrono::milliseconds(1000));
+            log_updated_event->set();
+        }
+    }
+}
+
+void TransactionLog::loadNewEntries()
+{
+    Strings entries_list = zookeeper->getChildren(zookeeper_path_log, nullptr, log_updated_event);
+    assert(!entries_list.empty());
+    std::sort(entries_list.begin(), entries_list.end());
+    auto it = std::upper_bound(entries_list.begin(), entries_list.end(), last_loaded_entry);
+    loadEntries(it, entries_list.end());
+    assert(last_loaded_entry == entries_list.back());
+    assert(latest_snapshot == deserializeCSN(last_loaded_entry));
+    latest_snapshot.notify_all();
+}
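loadNewEntries() above leans on one property of these names: zero-padding makes lexicographic order coincide with numeric order, so std::upper_bound over the sorted child list yields exactly the entries newer than last_loaded_entry. A minimal standalone check of that invariant (illustrative names only):

    // Why upper_bound over sorted zero-padded names finds only new entries.
    #include <algorithm>
    #include <cassert>
    #include <iterator>
    #include <string>
    #include <vector>

    int main()
    {
        std::vector<std::string> entries{"csn-0000000003", "csn-0000000001", "csn-0000000002"};
        std::sort(entries.begin(), entries.end());    /// lexicographic == numeric here

        const std::string last_loaded = "csn-0000000002";
        auto first_new = std::upper_bound(entries.begin(), entries.end(), last_loaded);

        assert(first_new != entries.end() && *first_new == "csn-0000000003");
        assert(std::distance(first_new, entries.end()) == 1);    /// exactly one new entry
    }
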
+
+void TransactionLog::removeOldEntries()
+{
+    /// Try to update tail pointer. It's (almost) safe to set it to the oldest snapshot
+    /// because if a transaction released snapshot, then CSN is already written into metadata.
+    /// Why almost? Because on server startup we do not have the oldest snapshot (it's simply equal to the latest one),
+    /// but it's possible that some CSNs are not written into data parts (and we will write them during startup).
+    if (!global_context->isServerCompletelyStarted())
+        return;
+
+    /// A similar problem is possible if some table was not attached during startup (for example, if table is detached permanently).
+    /// Also we write CSNs into data parts without fsync, so it's theoretically possible that we wrote CSN, finished transaction,
+    /// removed its entry from the log, but after that server restarts and CSN is not actually saved to metadata on disk.
+    /// We should store a bit more entries in ZK and keep outdated entries for a while.
+
+    /// TODO we will need a bit more complex logic for multiple hosts
+    Coordination::Stat stat;
+    CSN old_tail_ptr = deserializeCSN(zookeeper->get(zookeeper_path + "/tail_ptr", &stat));
+    CSN new_tail_ptr = getOldestSnapshot();
+    if (new_tail_ptr < old_tail_ptr)
+        throw Exception(ErrorCodes::LOGICAL_ERROR, "Got unexpected tail_ptr {}, oldest snapshot is {}, it's a bug", old_tail_ptr, new_tail_ptr);
+    else if (new_tail_ptr == old_tail_ptr)
+        return;
+
+    /// (it's not supposed to fail with ZBADVERSION while there is only one host)
+    LOG_TRACE(log, "Updating tail_ptr from {} to {}", old_tail_ptr, new_tail_ptr);
+    zookeeper->set(zookeeper_path + "/tail_ptr", serializeCSN(new_tail_ptr), stat.version);
+    tail_ptr.store(new_tail_ptr);
+
+    /// Now we can find and remove old entries
+    TIDMap tids;
+    {
+        std::lock_guard lock{mutex};
+        tids = tid_to_csn;
+    }
+
+    /// TODO support batching
+    std::vector<TIDHash> removed_entries;
+    CSN latest_entry_csn = latest_snapshot.load();
+    for (const auto & elem : tids)
+    {
+        /// Definitely not safe to remove
+        if (new_tail_ptr <= elem.second.tid.start_csn)
+            continue;
+
+        /// Keep at least one node (the latest one we fetched)
+        if (elem.second.csn == latest_entry_csn)
+            continue;
+
+        LOG_TEST(log, "Removing entry {} -> {}", elem.second.tid, elem.second.csn);
+        auto code = zookeeper->tryRemove(zookeeper_path_log + "/" + serializeCSN(elem.second.csn));
+        if (code == Coordination::Error::ZOK || code == Coordination::Error::ZNONODE)
+            removed_entries.push_back(elem.first);
+    }
+
+    std::lock_guard lock{mutex};
+    for (const auto & tid_hash : removed_entries)
+        tid_to_csn.erase(tid_hash);
+}
+
+CSN TransactionLog::getLatestSnapshot() const
+{
+    return latest_snapshot.load();
+}
+
+MergeTreeTransactionPtr TransactionLog::beginTransaction()
+{
+    MergeTreeTransactionPtr txn;
+    {
+        std::lock_guard lock{running_list_mutex};
+        CSN snapshot = latest_snapshot.load();
+        LocalTID ltid = 1 + local_tid_counter.fetch_add(1);
+        txn = std::make_shared<MergeTreeTransaction>(snapshot, ltid, ServerUUID::get());
+        bool inserted = running_list.try_emplace(txn->tid.getHash(), txn).second;
+        if (!inserted)
+            throw Exception(ErrorCodes::LOGICAL_ERROR, "It's a bug: TID {} {} exists", txn->tid.getHash(), txn->tid);
+        txn->snapshot_in_use_it = snapshots_in_use.insert(snapshots_in_use.end(), snapshot);
+    }
+
+    LOG_TEST(log, "Beginning transaction {} ({})", txn->tid, txn->tid.getHash());
+    tryWriteEventToSystemLog(log, global_context, TransactionsInfoLogElement::BEGIN, txn->tid);
+
+    return txn;
+}
+
+CSN TransactionLog::commitTransaction(const MergeTreeTransactionPtr & txn)
+{
+    /// Some precommit checks, may throw
+    auto committing_lock = txn->beforeCommit();
+
+    CSN new_csn;
+    if (txn->isReadOnly())
+    {
+        /// Don't need to allocate CSN in ZK for readonly transactions, it's safe to use snapshot/start_csn as "commit" timestamp
+        LOG_TEST(log, "Closing readonly transaction {}", txn->tid);
+        new_csn = txn->snapshot;
+        tryWriteEventToSystemLog(log, global_context, TransactionsInfoLogElement::COMMIT, txn->tid, new_csn);
+    }
+    else
+    {
+        LOG_TEST(log, "Committing transaction {}", txn->dumpDescription());
+        /// TODO handle connection loss
+        /// TODO support batching
+        auto current_zookeeper = getZooKeeper();
+        String path_created = current_zookeeper->create(zookeeper_path_log + "/csn-", serializeTID(txn->tid), zkutil::CreateMode::PersistentSequential);    /// Commit point
+        NOEXCEPT_SCOPE;
+
+        /// FIXME Transactions: Sequential node numbers in ZooKeeper are Int32, but 31 bits are not enough for production use
+        /// (overflow is possible within several weeks/months of active usage)
+        new_csn = deserializeCSN(path_created.substr(zookeeper_path_log.size() + 1));
+
+        LOG_INFO(log, "Transaction {} committed with CSN={}", txn->tid, new_csn);
+        tryWriteEventToSystemLog(log, global_context, TransactionsInfoLogElement::COMMIT, txn->tid, new_csn);
+
+        /// Wait for committed changes to become actually visible, so the next transaction in this session will see the changes
+        /// TODO it's optional, add a setting for this
+        auto current_latest_snapshot = latest_snapshot.load();
+        while (current_latest_snapshot < new_csn && !stop_flag)
+        {
+            latest_snapshot.wait(current_latest_snapshot);
+            current_latest_snapshot = latest_snapshot.load();
+        }
+    }
+
+    /// Write the allocated CSN, so we will be able to clean up the log in ZK. This method is noexcept.
+    txn->afterCommit(new_csn);
+
+    {
+        /// Finally we can remove transaction from the list and release the snapshot
+        std::lock_guard lock{running_list_mutex};
+        bool removed = running_list.erase(txn->tid.getHash());
+        if (!removed)
+            throw Exception(ErrorCodes::LOGICAL_ERROR, "It's a bug: TID {} {} doesn't exist", txn->tid.getHash(), txn->tid);
+        snapshots_in_use.erase(txn->snapshot_in_use_it);
+    }
+
+    return new_csn;
+}
+
+void TransactionLog::rollbackTransaction(const MergeTreeTransactionPtr & txn) noexcept
+{
+    LOG_TRACE(log, "Rolling back transaction {}{}", txn->tid,
+              std::uncaught_exceptions() ? fmt::format(" due to uncaught exception (code: {})", getCurrentExceptionCode()) : "");
+
+    if (!txn->rollback())
+    {
+        /// Transaction was cancelled concurrently, it's already rolled back.
+ assert(txn->csn == Tx::RolledBackCSN); + return; + } + + { + std::lock_guard lock{running_list_mutex}; + bool removed = running_list.erase(txn->tid.getHash()); + if (!removed) + abort(); + snapshots_in_use.erase(txn->snapshot_in_use_it); + } + + tryWriteEventToSystemLog(log, global_context, TransactionsInfoLogElement::ROLLBACK, txn->tid); +} + +MergeTreeTransactionPtr TransactionLog::tryGetRunningTransaction(const TIDHash & tid) +{ + std::lock_guard lock{running_list_mutex}; + auto it = running_list.find(tid); + if (it == running_list.end()) + return NO_TRANSACTION_PTR; + return it->second; +} + +CSN TransactionLog::getCSN(const TransactionID & tid) +{ + /// Avoid creation of the instance if transactions are not actually involved + if (tid == Tx::PrehistoricTID) + return Tx::PrehistoricCSN; + return instance().getCSNImpl(tid.getHash()); +} + +CSN TransactionLog::getCSN(const TIDHash & tid) +{ + /// Avoid creation of the instance if transactions are not actually involved + if (tid == Tx::PrehistoricTID.getHash()) + return Tx::PrehistoricCSN; + return instance().getCSNImpl(tid); +} + +CSN TransactionLog::getCSNImpl(const TIDHash & tid_hash) const +{ + assert(tid_hash); + assert(tid_hash != Tx::EmptyTID.getHash()); + + std::lock_guard lock{mutex}; + auto it = tid_to_csn.find(tid_hash); + if (it != tid_to_csn.end()) + return it->second.csn; + + return Tx::UnknownCSN; +} + +void TransactionLog::assertTIDIsNotOutdated(const TransactionID & tid) +{ + if (tid == Tx::PrehistoricTID) + return; + + /// Ensure that we are not trying to get CSN for TID that was already removed from the log + CSN tail = instance().tail_ptr.load(); + if (tail <= tid.start_csn) + return; + + throw Exception(ErrorCodes::LOGICAL_ERROR, "Trying to get CSN for too old TID {}, current tail_ptr is {}, probably it's a bug", tid, tail); +} + +CSN TransactionLog::getOldestSnapshot() const +{ + std::lock_guard lock{running_list_mutex}; + if (snapshots_in_use.empty()) + return getLatestSnapshot(); + return snapshots_in_use.front(); +} + +TransactionLog::TransactionsList TransactionLog::getTransactionsList() const +{ + std::lock_guard lock{running_list_mutex}; + return running_list; +} + +} diff --git a/src/Interpreters/TransactionLog.h b/src/Interpreters/TransactionLog.h new file mode 100644 index 00000000000..86584a74c68 --- /dev/null +++ b/src/Interpreters/TransactionLog.h @@ -0,0 +1,192 @@ +#pragma once +#include +#include +#include +#include +#include +#include +#include + +namespace DB +{ + +/// We want to create a TransactionLog object lazily and avoid creation if it's not needed. +/// But we also want to call shutdown() in a specific place to avoid race conditions. +/// We cannot simply use return-static-variable pattern, +/// because a call to shutdown() may construct unnecessary object in this case. 
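To make the constraint described above concrete: with a plain function-local static ("Meyers singleton"), a shutdown hook that runs when the object was never used would construct it just to destroy it. A simplified, hypothetical illustration of that failure mode (the class below is a stand-in, not ClickHouse code):

    // The naive pattern this helper avoids.
    struct ExpensiveSingletonModel
    {
        ExpensiveSingletonModel() { /* would connect to ZooKeeper, spawn threads... */ }
        void shutdown() { /* stop threads, close connections */ }
    };

    ExpensiveSingletonModel & naiveInstance()
    {
        static ExpensiveSingletonModel instance;    /// constructed on first call, whoever calls
        return instance;
    }

    void naiveShutdownIfAny()
    {
        /// Bug: if nothing used the singleton yet, this constructs it anyway.
        naiveInstance().shutdown();
    }

    int main() { naiveShutdownIfAny(); }

SingletonHelper below instead keeps an explicit holder behind a mutex, so shutdownIfAny() can test whether construction ever happened.
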
+template <typename Derived>
+class SingletonHelper : private boost::noncopyable
+{
+public:
+    static Derived & instance()
+    {
+        Derived * ptr = instance_raw_ptr.load();
+        if (likely(ptr))
+            return *ptr;
+
+        return createInstanceOrThrow();
+    }
+
+    static void shutdownIfAny()
+    {
+        std::lock_guard lock{instance_mutex};
+        if (instance_holder)
+            instance_holder->shutdown();
+    }
+
+private:
+    static Derived & createInstanceOrThrow();
+
+    static inline std::atomic<Derived *> instance_raw_ptr;
+    /// It was supposed to be std::optional, but gcc fails to compile it for some reason
+    static inline std::shared_ptr<Derived> instance_holder;
+    static inline std::mutex instance_mutex;
+};
+
+class TransactionsInfoLog;
+using TransactionsInfoLogPtr = std::shared_ptr<TransactionsInfoLog>;
+using ZooKeeperPtr = std::shared_ptr<zkutil::ZooKeeper>;
+
+/// This class maintains transaction log in ZooKeeper and a list of currently running transactions in memory.
+///
+/// Each transaction has a unique ID (TID, see details below).
+/// TransactionID is allocated when transaction begins.
+///
+/// We use TransactionID to associate changes (created/removed data parts) with the transaction that has made/is going to make these changes.
+/// To commit a transaction we create sequential node "/path_to_log/log/csn-" in ZK and write TID into this node.
+/// Allocated sequential number is a commit timestamp or Commit Sequence Number (CSN). It indicates a (logical) point in time
+/// when transaction is committed and all its changes became visible. So we have total order of all changes.
+///
+/// Also CSNs are used as snapshots: all changes that were made by a transaction that was committed with a CSN less or equal than some_csn
+/// are visible in some_csn snapshot.
+///
+/// TransactionID consists of three parts: (start_csn, local_tid, host_id)
+/// - start_csn is the newest CSN that existed when the transaction was started and also it's the snapshot that is visible for this transaction
+/// - local_tid is the local sequential number of the transaction, each server allocates local_tids independently without requests to ZK
+/// - host_id is the persistent UUID of the host that has started the transaction, it's a kind of tie-breaker that makes IDs unique across all servers
+///
+/// To check if some transaction is committed or not we fetch "csn-xxxxxx" nodes from ZK and construct TID -> CSN mapping,
+/// so for committed transactions we know commit timestamps.
+/// However, if we did not find a mapping for some TID, it means one of the following cases:
+///  1. Transaction is not committed (yet)
+///  2. Transaction is rolled back (quite similar to the first case, but it will never be committed)
+///  3. Transaction was committed a long time ago and we removed its entry from the log
+/// To distinguish the third case we store a "tail pointer" in "/path_to_log/tail_ptr". It's a CSN such that it's safe to remove from log
+/// entries with tid.start_csn < tail_ptr, because CSNs for those TIDs are already written into data parts
+/// and we will not do a CSN lookup for those TIDs anymore.
+///
+/// (however, transactions involving multiple hosts and/or ReplicatedMergeTree tables are currently not supported)
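A toy model of the lookup rules this comment describes, with a plain map standing in for the fetched csn- nodes and an integer for the tail pointer (hypothetical standalone code, mirroring the behavior of getCSN() plus assertTIDIsNotOutdated() declared below):

    #include <cassert>
    #include <cstdint>
    #include <unordered_map>

    enum class LookupResult { Committed, NotCommittedOrRolledBack, PossiblyCleanedUp };

    LookupResult lookupModel(const std::unordered_map<uint64_t, uint64_t> & tid_to_csn,
                             uint64_t tid_hash, uint64_t tid_start_csn, uint64_t tail_ptr)
    {
        auto it = tid_to_csn.find(tid_hash);
        if (it != tid_to_csn.end())
            return LookupResult::Committed;                 /// commit timestamp known
        if (tid_start_csn < tail_ptr)
            return LookupResult::PossiblyCleanedUp;         /// case 3: entry may be rotated out
        return LookupResult::NotCommittedOrRolledBack;      /// cases 1 and 2
    }

    int main()
    {
        std::unordered_map<uint64_t, uint64_t> log{{42, 1007}};
        assert(lookupModel(log, 42, 1000, 900) == LookupResult::Committed);
        assert(lookupModel(log, 7, 800, 900) == LookupResult::PossiblyCleanedUp);
        assert(lookupModel(log, 7, 950, 900) == LookupResult::NotCommittedOrRolledBack);
    }
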
+class TransactionLog final : public SingletonHelper<TransactionLog>
+{
+public:
+
+    TransactionLog();
+
+    ~TransactionLog();
+
+    void shutdown();
+
+    /// Returns the newest snapshot available for reading
+    CSN getLatestSnapshot() const;
+    /// Returns the oldest snapshot that is visible for some running transaction
+    CSN getOldestSnapshot() const;
+
+    /// Allocates TID, returns new transaction object
+    MergeTreeTransactionPtr beginTransaction();
+
+    /// Tries to commit transaction. Returns Commit Sequence Number.
+    /// Throws if transaction was concurrently killed or if some precommit check failed.
+    /// May throw if ZK connection is lost. Transaction status is unknown in this case.
+    CSN commitTransaction(const MergeTreeTransactionPtr & txn);
+
+    /// Releases locks that were acquired by transaction, releases snapshot, removes transaction from the list of active transactions.
+    /// Normally it should not throw, but if it does for some reason (global memory limit exceeded, disk failure, etc)
+    /// then we should terminate server and reinitialize it to avoid corruption of data structures. That's why it's noexcept.
+    void rollbackTransaction(const MergeTreeTransactionPtr & txn) noexcept;
+
+    /// Returns CSN if transaction with specified ID was committed and UnknownCSN if it was not.
+    /// Returns PrehistoricCSN for PrehistoricTID without creating a TransactionLog instance as a special case.
+    static CSN getCSN(const TransactionID & tid);
+    static CSN getCSN(const TIDHash & tid);
+
+    /// Ensures that getCSN returned UnknownCSN because transaction is not committed and not because entry was removed from the log.
+    static void assertTIDIsNotOutdated(const TransactionID & tid);
+
+    /// Returns a pointer to transaction object if it's running or nullptr.
+    MergeTreeTransactionPtr tryGetRunningTransaction(const TIDHash & tid);
+
+    using TransactionsList = std::unordered_map<TIDHash, MergeTreeTransactionPtr>;
+    /// Returns copy of list of running transactions.
+    TransactionsList getTransactionsList() const;
+
+private:
+    void loadLogFromZooKeeper();
+    void runUpdatingThread();
+
+    void loadEntries(Strings::const_iterator beg, Strings::const_iterator end);
+    void loadNewEntries();
+    void removeOldEntries();
+
+    static UInt64 deserializeCSN(const String & csn_node_name);
+    static String serializeCSN(CSN csn);
+    static TransactionID deserializeTID(const String & csn_node_content);
+    static String serializeTID(const TransactionID & tid);
+
+    ZooKeeperPtr getZooKeeper() const;
+
+    CSN getCSNImpl(const TIDHash & tid_hash) const;
+
+    ContextPtr global_context;
+    Poco::Logger * log;
+
+    /// The newest snapshot available for reading
+    std::atomic<CSN> latest_snapshot;
+
+    /// Local part of TransactionID number. We reset this counter for each new snapshot.
+    std::atomic<LocalTID> local_tid_counter;
+
+    mutable std::mutex mutex;
+    /// Mapping from TransactionID to CSN for recently committed transactions.
+    /// Allows to check if some transaction is committed.
+    struct CSNEntry
+    {
+        CSN csn;
+        TransactionID tid;
+    };
+    using TIDMap = std::unordered_map<TIDHash, CSNEntry>;
+    TIDMap tid_to_csn;
+
+    mutable std::mutex running_list_mutex;
+    /// Transactions that are currently processed
+    TransactionsList running_list;
+    /// Ordered list of snapshots that are currently used by some transactions. Needed for background cleanup.
+    std::list<CSN> snapshots_in_use;
+
+    ZooKeeperPtr zookeeper;
+    String zookeeper_path;
+
+    String zookeeper_path_log;
+    /// Name of the newest entry that was loaded from the log in ZK
+    String last_loaded_entry;
+    /// The oldest CSN for which we still store log entries with TransactionIDs containing it.
+    std::atomic<CSN> tail_ptr = Tx::UnknownCSN;
+
+    zkutil::EventPtr log_updated_event = std::make_shared<Poco::Event>();
+
+    std::atomic_bool stop_flag = false;
+    ThreadFromGlobalPool updating_thread;
+};
+
+template <typename Derived>
+Derived & SingletonHelper<Derived>::createInstanceOrThrow()
+{
+    std::lock_guard lock{instance_mutex};
+    if (!instance_holder)
+    {
+        instance_holder = std::make_shared<Derived>();
+        instance_raw_ptr = instance_holder.get();
+    }
+    return *instance_holder;
+}
+
+}
diff --git a/src/Interpreters/TransactionVersionMetadata.cpp b/src/Interpreters/TransactionVersionMetadata.cpp
new file mode 100644
index 00000000000..ac02f29661d
--- /dev/null
+++ b/src/Interpreters/TransactionVersionMetadata.cpp
@@ -0,0 +1,431 @@
+#include
+#include
+#include
+#include
+#include
+#include
+#include
+#include
+#include
+#include
+#include
+
+namespace DB
+{
+
+namespace ErrorCodes
+{
+    extern const int SERIALIZATION_ERROR;
+    extern const int LOGICAL_ERROR;
+    extern const int CANNOT_PARSE_TEXT;
+}
+
+inline static CSN getCSNAndAssert(TIDHash tid_hash, std::atomic<CSN> & csn, const TransactionID * tid = nullptr)
+{
+    CSN maybe_csn = TransactionLog::getCSN(tid_hash);
+    if (maybe_csn)
+        return maybe_csn;
+
+    /// Either transaction is not committed (yet) or it was committed and then the CSN entry was cleaned up from the log.
+    /// We should load CSN again to distinguish the second case.
+    /// If entry was cleaned up, then CSN is already stored in VersionMetadata and we will get it.
+    /// And for the first case we will get UnknownCSN again.
+    maybe_csn = csn.load();
+    if (maybe_csn)
+        return maybe_csn;
+
+    if (tid)
+        TransactionLog::assertTIDIsNotOutdated(*tid);
+
+    return Tx::UnknownCSN;
+}
+
+VersionMetadata::VersionMetadata()
+{
+    /// It would be better to make it static, but static loggers do not work for some reason (initialization order?)
+ log = &Poco::Logger::get("VersionMetadata"); +} + +/// It can be used for introspection purposes only +TransactionID VersionMetadata::getRemovalTID() const +{ + TIDHash removal_lock = removal_tid_lock.load(); + if (removal_lock) + { + if (removal_lock == Tx::PrehistoricTID.getHash()) + return Tx::PrehistoricTID; + if (auto txn = TransactionLog::instance().tryGetRunningTransaction(removal_lock)) + return txn->tid; + } + + if (removal_csn.load(std::memory_order_relaxed)) + { + /// removal_tid cannot be changed since we have removal_csn, so it's readonly + return removal_tid; + } + + return Tx::EmptyTID; +} + +void VersionMetadata::lockRemovalTID(const TransactionID & tid, const TransactionInfoContext & context) +{ + LOG_TEST(log, "Trying to lock removal_tid by {}, table: {}, part: {}", tid, context.table.getNameForLogs(), context.part_name); + TIDHash locked_by = 0; + if (tryLockRemovalTID(tid, context, &locked_by)) + return; + + String part_desc; + if (context.covering_part.empty()) + part_desc = context.part_name; + else + part_desc = fmt::format("{} (covered by {})", context.part_name, context.covering_part); + throw Exception(ErrorCodes::SERIALIZATION_ERROR, + "Serialization error: " + "Transaction {} tried to remove data part {} from {}, " + "but it's locked by another transaction (TID: {}, TIDH: {}) which is currently removing this part.", + tid, part_desc, context.table.getNameForLogs(), getRemovalTID(), locked_by); +} + +bool VersionMetadata::tryLockRemovalTID(const TransactionID & tid, const TransactionInfoContext & context, TIDHash * locked_by_id) +{ + assert(!tid.isEmpty()); + assert(!creation_tid.isEmpty()); + TIDHash removal_lock_value = tid.getHash(); + TIDHash expected_removal_lock_value = 0; + bool locked = removal_tid_lock.compare_exchange_strong(expected_removal_lock_value, removal_lock_value); + if (!locked) + { + if (tid == Tx::PrehistoricTID && expected_removal_lock_value == Tx::PrehistoricTID.getHash()) + { + /// Don't need to lock part for queries without transaction + LOG_TEST(log, "Assuming removal_tid is locked by {}, table: {}, part: {}", tid, context.table.getNameForLogs(), context.part_name); + return true; + } + + if (locked_by_id) + *locked_by_id = expected_removal_lock_value; + return false; + } + + removal_tid = tid; + tryWriteEventToSystemLog(log, TransactionsInfoLogElement::LOCK_PART, tid, context); + return true; +} + +void VersionMetadata::unlockRemovalTID(const TransactionID & tid, const TransactionInfoContext & context) +{ + LOG_TEST(log, "Unlocking removal_tid by {}, table: {}, part: {}", tid, context.table.getNameForLogs(), context.part_name); + assert(!tid.isEmpty()); + TIDHash removal_lock_value = tid.getHash(); + TIDHash locked_by = removal_tid_lock.load(); + + auto throw_cannot_unlock = [&]() + { + auto locked_by_txn = TransactionLog::instance().tryGetRunningTransaction(locked_by); + throw Exception(ErrorCodes::LOGICAL_ERROR, "Cannot unlock removal_tid, it's a bug. Current: {} {}, actual: {} {}", + removal_lock_value, tid, locked_by, locked_by_txn ? 
locked_by_txn->tid : Tx::EmptyTID); + }; + + if (locked_by != removal_lock_value) + throw_cannot_unlock(); + + removal_tid = Tx::EmptyTID; + bool unlocked = removal_tid_lock.compare_exchange_strong(locked_by, 0); + if (!unlocked) + throw_cannot_unlock(); + + tryWriteEventToSystemLog(log, TransactionsInfoLogElement::UNLOCK_PART, tid, context); +} + +bool VersionMetadata::isRemovalTIDLocked() const +{ + return removal_tid_lock.load() != 0; +} + +void VersionMetadata::setCreationTID(const TransactionID & tid, TransactionInfoContext * context) +{ + /// NOTE ReplicatedMergeTreeBlockOutputStream may add one part multiple times + assert(creation_tid.isEmpty() || creation_tid == tid); + creation_tid = tid; + if (context) + tryWriteEventToSystemLog(log, TransactionsInfoLogElement::ADD_PART, tid, *context); +} + +bool VersionMetadata::isVisible(const MergeTreeTransaction & txn) +{ + return isVisible(txn.getSnapshot(), txn.tid); +} + +bool VersionMetadata::isVisible(CSN snapshot_version, TransactionID current_tid) +{ + assert(!creation_tid.isEmpty()); + CSN creation = creation_csn.load(std::memory_order_relaxed); + TIDHash removal_lock = removal_tid_lock.load(std::memory_order_relaxed); + CSN removal = removal_csn.load(std::memory_order_relaxed); + + [[maybe_unused]] bool had_creation_csn = creation; + [[maybe_unused]] bool had_removal_tid = removal_lock; + [[maybe_unused]] bool had_removal_csn = removal; + assert(!had_removal_csn || had_removal_tid); + assert(!had_removal_csn || had_creation_csn); + assert(creation == Tx::UnknownCSN || creation == Tx::PrehistoricCSN || Tx::MaxReservedCSN < creation); + assert(removal == Tx::UnknownCSN || removal == Tx::PrehistoricCSN || Tx::MaxReservedCSN < removal); + + /// Special snapshot for introspection purposes + if (unlikely(snapshot_version == Tx::EverythingVisibleCSN)) + return true; + + /// Fast path: + + /// Part is definitely not visible if: + /// - creation was committed after we took the snapshot + /// - removal was committed before we took the snapshot + /// - current transaction is removing it + if (creation && snapshot_version < creation) + return false; + if (removal && removal <= snapshot_version) + return false; + if (!current_tid.isEmpty() && removal_lock && removal_lock == current_tid.getHash()) + return false; + + /// Otherwise, part is definitely visible if: + /// - creation was committed before we took the snapshot and nobody tried to remove the part + /// - creation was committed before and removal was committed after + /// - current transaction is creating it + if (creation && creation <= snapshot_version && !removal_lock) + return true; + if (creation && creation <= snapshot_version && removal && snapshot_version < removal) + return true; + if (!current_tid.isEmpty() && creation_tid == current_tid) + return true; + + /// End of fast path. + + /// Data part has creation_tid/removal_tid, but does not have creation_csn/removal_csn. + /// It means that some transaction is creating/removing the part right now or has done it recently + /// and we don't know if it was already committed or not. + assert(!had_creation_csn || (had_removal_tid && !had_removal_csn)); + assert(current_tid.isEmpty() || (creation_tid != current_tid && removal_lock != current_tid.getHash())); + + /// Before doing CSN lookup, let's check some extra conditions. 
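Once both CSNs are known, the fast-path rules enumerated above collapse to a single half-open interval check: a part is visible in a snapshot iff creation_csn <= snapshot < removal_csn, with a missing removal treated as infinity. A condensed standalone restatement (plain integers stand in for CSNs, 0 meaning "no CSN yet"):

    #include <cassert>
    #include <cstdint>

    /// Visible iff created at or before the snapshot and removed, if at all, after it.
    bool isVisibleModel(uint64_t creation_csn, uint64_t removal_csn, uint64_t snapshot)
    {
        if (!creation_csn || snapshot < creation_csn)
            return false;                                   /// not yet created in this snapshot
        return !removal_csn || snapshot < removal_csn;      /// not yet removed in this snapshot
    }

    int main()
    {
        assert(isVisibleModel(100, 0, 150));      /// created, never removed
        assert(!isVisibleModel(100, 140, 150));   /// removed before the snapshot
        assert(isVisibleModel(100, 200, 150));    /// removed only after the snapshot
        assert(!isVisibleModel(200, 0, 150));     /// created after the snapshot
    }

The code that follows handles the harder case: parts whose creation or removal TID has no CSN yet, which is where the CSN lookups come in.
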
+    /// If snapshot_version <= some_tid.start_csn, then changes of the transaction with some_tid
+    /// are definitely not visible for us (because the transaction can be committed with greater CSN only),
+    /// so we don't need to check if it was committed.
+    if (snapshot_version <= creation_tid.start_csn)
+        return false;
+
+    /// Check if creation_tid/removal_tid transactions are committed and write CSNs
+    /// TODO Transactions: we probably need more optimizations here
+    /// to avoid some CSN lookups or make the lookups cheaper.
+    /// NOTE: Old enough committed parts always have written CSNs,
+    /// so we can determine their visibility through fast path.
+    /// But for long-running writing transactions we will always do
+    /// CSN lookup and get 0 (UnknownCSN) until the transaction is committed/rolled back.
+    creation = getCSNAndAssert(creation_tid.getHash(), creation_csn, &creation_tid);
+    if (!creation)
+    {
+        return false;    /// Part creation is not committed yet
+    }
+
+    /// We don't need to check if CSNs are already written or not,
+    /// because once written CSN cannot be changed, so it's safe to overwrite it (with the same value).
+    creation_csn.store(creation, std::memory_order_relaxed);
+
+    if (removal_lock)
+    {
+        removal = getCSNAndAssert(removal_lock, removal_csn);
+        if (removal)
+            removal_csn.store(removal, std::memory_order_relaxed);
+    }
+
+    return creation <= snapshot_version && (!removal || snapshot_version < removal);
+}
+
+bool VersionMetadata::canBeRemoved()
+{
+    if (creation_tid == Tx::PrehistoricTID)
+    {
+        /// Avoid access to Transaction log if transactions are not involved
+
+        TIDHash removal_lock = removal_tid_lock.load(std::memory_order_relaxed);
+        if (!removal_lock)
+            return false;
+
+        if (removal_lock == Tx::PrehistoricTID.getHash())
+            return true;
+    }
+
+    return canBeRemovedImpl(TransactionLog::instance().getOldestSnapshot());
+}
+
+bool VersionMetadata::canBeRemovedImpl(CSN oldest_snapshot_version)
+{
+    CSN creation = creation_csn.load(std::memory_order_relaxed);
+    /// We can safely remove part if its creation was rolled back
+    if (creation == Tx::RolledBackCSN)
+        return true;
+
+    if (!creation)
+    {
+        /// Cannot remove part if its creation is not committed yet
+        creation = getCSNAndAssert(creation_tid.getHash(), creation_csn, &creation_tid);
+        if (creation)
+            creation_csn.store(creation, std::memory_order_relaxed);
+        else
+            return false;
+    }
+
+    /// Part is probably visible for some transactions (part is too new or the oldest snapshot is too old)
+    if (oldest_snapshot_version < creation)
+        return false;
+
+    TIDHash removal_lock = removal_tid_lock.load(std::memory_order_relaxed);
+    /// Part is active
+    if (!removal_lock)
+        return false;
+
+    CSN removal = removal_csn.load(std::memory_order_relaxed);
+    if (!removal)
+    {
+        /// Part removal is not committed yet
+        removal = getCSNAndAssert(removal_lock, removal_csn);
+        if (removal)
+            removal_csn.store(removal, std::memory_order_relaxed);
+        else
+            return false;
+    }
+
+    /// We can safely remove part if all running transactions were started after part removal was committed
+    return removal <= oldest_snapshot_version;
+}
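Everything below serializes this state into a small per-part text file. Under the format implemented by write()/read(), a part created by one transaction and later removed by another would look roughly like this (illustrative values; the exact TID rendering comes from TransactionID::write, which is not shown in this diff, so the tuple-like form is an assumption):

    // Shape of the on-disk version metadata (illustrative only).
    const char * example_version_metadata =
        "version: 1\n"
        "creation_tid: (1000, 5, 00000000-0000-0000-0000-000000000000)\n"
        "creation_csn: 1007\n"
        "removal_tid: (1042, 9, 00000000-0000-0000-0000-000000000000)\n"
        "removal_csn: 1050";    /// note: fields are \n-prefixed, no trailing newline
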
+#define CREATION_TID_STR "creation_tid: "
+#define CREATION_CSN_STR "creation_csn: "
+#define REMOVAL_TID_STR "removal_tid: "
+#define REMOVAL_CSN_STR "removal_csn: "
+
+
+void VersionMetadata::writeCSN(WriteBuffer & buf, WhichCSN which_csn, bool internal /* = false*/) const
+{
+    if (which_csn == CREATION)
+    {
+        if (CSN creation = creation_csn.load())
+        {
+            writeCString("\n" CREATION_CSN_STR, buf);
+            writeText(creation, buf);
+        }
+        else if (!internal)
+            throw Exception(ErrorCodes::LOGICAL_ERROR, "writeCSN called for creation_csn = 0, it's a bug");
+    }
+    else /// if (which_csn == REMOVAL)
+    {
+        if (CSN removal = removal_csn.load())
+        {
+            writeCString("\n" REMOVAL_CSN_STR, buf);
+            writeText(removal, buf);
+        }
+        else if (!internal)
+            throw Exception(ErrorCodes::LOGICAL_ERROR, "writeCSN called for removal_csn = 0, it's a bug");
+    }
+}
+
+void VersionMetadata::writeRemovalTID(WriteBuffer & buf, bool clear) const
+{
+    writeCString("\n" REMOVAL_TID_STR, buf);
+    if (clear)
+        TransactionID::write(Tx::EmptyTID, buf);
+    else
+        TransactionID::write(removal_tid, buf);
+}
+
+void VersionMetadata::write(WriteBuffer & buf) const
+{
+    writeCString("version: 1", buf);
+    writeCString("\n" CREATION_TID_STR, buf);
+    TransactionID::write(creation_tid, buf);
+    writeCSN(buf, CREATION, /* internal */ true);
+
+    if (removal_tid_lock)
+    {
+        assert(!removal_tid.isEmpty());
+        assert(removal_tid.getHash() == removal_tid_lock);
+        writeRemovalTID(buf);
+        writeCSN(buf, REMOVAL, /* internal */ true);
+    }
+}
+
+void VersionMetadata::read(ReadBuffer & buf)
+{
+    constexpr size_t size = sizeof(CREATION_TID_STR) - 1;
+    static_assert(sizeof(CREATION_CSN_STR) - 1 == size);
+    static_assert(sizeof(REMOVAL_TID_STR) - 1 == size);
+    static_assert(sizeof(REMOVAL_CSN_STR) - 1 == size);
+
+    assertString("version: 1", buf);
+    assertString("\n" CREATION_TID_STR, buf);
+    creation_tid = TransactionID::read(buf);
+    if (buf.eof())
+        return;
+
+    String name;
+    name.resize(size);
+
+    auto read_csn = [&]()
+    {
+        UInt64 val;
+        readText(val, buf);
+        return val;
+    };
+
+    while (!buf.eof())
+    {
+        assertChar('\n', buf);
+        buf.readStrict(name.data(), size);
+
+        if (name == CREATION_CSN_STR)
+        {
+            assert(!creation_csn);
+            creation_csn = read_csn();
+        }
+        else if (name == REMOVAL_TID_STR)
+        {
+            /// NOTE Metadata file may actually contain multiple removal TIDs, we need the last one.
+            removal_tid = TransactionID::read(buf);
+            if (!removal_tid.isEmpty())
+                removal_tid_lock = removal_tid.getHash();
+        }
+        else if (name == REMOVAL_CSN_STR)
+        {
+            if (removal_tid.isEmpty())
+                throw Exception(ErrorCodes::CANNOT_PARSE_TEXT, "Found removal_csn in metadata file, but removal_tid is {}", removal_tid);
+            assert(!removal_csn);
+            removal_csn = read_csn();
+        }
+        else
+        {
+            throw Exception(ErrorCodes::CANNOT_PARSE_TEXT, "Got unexpected content: {}", name);
+        }
+    }
+}
+
+String VersionMetadata::toString(bool one_line) const
+{
+    WriteBufferFromOwnString buf;
+    write(buf);
+    String res = buf.str();
+    if (one_line)
+        std::replace(res.begin(), res.end(), '\n', ' ');
+    return res;
+}
+
+
+DataTypePtr getTransactionIDDataType()
+{
+    DataTypes types;
+    types.push_back(std::make_shared<DataTypeUInt64>());
+    types.push_back(std::make_shared<DataTypeUInt64>());
+    types.push_back(std::make_shared<DataTypeUUID>());
+    return std::make_shared<DataTypeTuple>(std::move(types));
+}
+
+}
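getTransactionIDDataType() above fixes the order in which a TID is exposed to system tables: (start_csn, local_tid, host_id), matching the Tuple{...} insertions in QueryLog and TransactionsInfoLog elsewhere in this patch. A standalone restatement of that field-order invariant (plain C++ stand-ins, not ClickHouse types):

    #include <cassert>
    #include <cstdint>
    #include <string>
    #include <tuple>

    struct TransactionIDModel
    {
        uint64_t start_csn;
        uint64_t local_tid;
        std::string host_id;    /// stands in for the UUID column
    };

    /// Field order must match the Tuple(UInt64, UInt64, UUID) column type.
    std::tuple<uint64_t, uint64_t, std::string> toTupleModel(const TransactionIDModel & tid)
    {
        return {tid.start_csn, tid.local_tid, tid.host_id};
    }

    int main()
    {
        TransactionIDModel tid{1000, 5, "00000000-0000-0000-0000-000000000000"};
        assert(std::get<0>(toTupleModel(tid)) == 1000);
        assert(std::get<1>(toTupleModel(tid)) == 5);
    }
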
diff --git a/src/Interpreters/TransactionVersionMetadata.h b/src/Interpreters/TransactionVersionMetadata.h
new file mode 100644
index 00000000000..18ac445cc29
--- /dev/null
+++ b/src/Interpreters/TransactionVersionMetadata.h
@@ -0,0 +1,81 @@
+#pragma once
+#include
+#include
+
+namespace Poco
+{
+class Logger;
+}
+
+namespace DB
+{
+
+/// This structure allows to pass more information about a part that a transaction is trying to create/remove.
+/// It's useful for logging and for exception messages.
+struct TransactionInfoContext
+{
+    /// To which table a part belongs
+    StorageID table = StorageID::createEmpty();
+    /// Name of a part that transaction is trying to create/remove
+    String part_name;
+    /// Optional: name of part that covers `part_name` if transaction is trying to remove `part_name`
+    String covering_part;
+
+    TransactionInfoContext(StorageID id, String part) : table(std::move(id)), part_name(std::move(part)) {}
+};
+
+/// This structure contains metadata of an object (currently it's used for data parts in MergeTree only)
+/// that allows to determine when and by which transaction it has been created/removed
+struct VersionMetadata
+{
+    /// ID of transaction that has created/is trying to create this object
+    TransactionID creation_tid = Tx::EmptyTID;
+    /// ID of transaction that has removed/is trying to remove this object
+    TransactionID removal_tid = Tx::EmptyTID;
+
+    /// Hash of removal_tid, used to lock an object for removal
+    std::atomic<TIDHash> removal_tid_lock = 0;
+
+    /// CSN of transaction that has created this object
+    std::atomic<CSN> creation_csn = Tx::UnknownCSN;
+    /// CSN of transaction that has removed this object
+    std::atomic<CSN> removal_csn = Tx::UnknownCSN;
+
+    /// Checks if an object is visible for a transaction or not.
+    bool isVisible(const MergeTreeTransaction & txn);
+    bool isVisible(CSN snapshot_version, TransactionID current_tid = Tx::EmptyTID);
+
+    TransactionID getCreationTID() const { return creation_tid; }
+    TransactionID getRemovalTID() const;
+
+    /// Locks an object for removal, throws if it's already locked by a concurrent transaction
+    bool tryLockRemovalTID(const TransactionID & tid, const TransactionInfoContext & context, TIDHash * locked_by_id = nullptr);
+    void lockRemovalTID(const TransactionID & tid, const TransactionInfoContext & context);
+    /// Unlocks an object for removal (when transaction is rolling back)
+    void unlockRemovalTID(const TransactionID & tid, const TransactionInfoContext & context);
+
+    bool isRemovalTIDLocked() const;
+
+    /// It can be called only from MergeTreeTransaction or on server startup
+    void setCreationTID(const TransactionID & tid, TransactionInfoContext * context);
+
+    /// Checks if it's safe to remove outdated version of an object
+    bool canBeRemoved();
+    bool canBeRemovedImpl(CSN oldest_snapshot_version);
+
+    void write(WriteBuffer & buf) const;
+    void read(ReadBuffer & buf);
+
+    enum WhichCSN { CREATION, REMOVAL };
+    void writeCSN(WriteBuffer & buf, WhichCSN which_csn, bool internal = false) const;
+    void writeRemovalTID(WriteBuffer & buf, bool clear = false) const;
+
+    String toString(bool one_line = true) const;
+
+    Poco::Logger * log;
+    VersionMetadata();
+};
+
+DataTypePtr getTransactionIDDataType();
+
+}
diff --git a/src/Interpreters/TransactionsInfoLog.cpp b/src/Interpreters/TransactionsInfoLog.cpp
new file mode 100644
index 00000000000..0498ee00e9e
--- /dev/null
+++ b/src/Interpreters/TransactionsInfoLog.cpp
@@ -0,0 +1,111 @@
+#include
+#include
+#include
+#include
+#include
+#include
+#include
+#include
+#include
+#include
+#include
+#include
+#include
+#include
+
+namespace DB
+{
+
+NamesAndTypesList TransactionsInfoLogElement::getNamesAndTypes()
+{
+    auto type_enum = std::make_shared<DataTypeEnum8>(
+        DataTypeEnum8::Values
+        {
+            {"Begin",      static_cast<Int8>(BEGIN)},
+            {"Commit",     static_cast<Int8>(COMMIT)},
+            {"Rollback",   static_cast<Int8>(ROLLBACK)},
+
+            {"AddPart",    static_cast<Int8>(ADD_PART)},
+            {"LockPart",   static_cast<Int8>(LOCK_PART)},
+            {"UnlockPart", static_cast<Int8>(UNLOCK_PART)},
+        });
+
+    return
+    {
+        {"type", std::move(type_enum)},
+        {"event_date", std::make_shared<DataTypeDate>()},
+        {"event_time", std::make_shared<DataTypeDateTime64>(6)},
+        {"thread_id", std::make_shared<DataTypeUInt64>()},
+
+        {"query_id", std::make_shared<DataTypeString>()},
+        {"tid", getTransactionIDDataType()},
+        {"tid_hash", std::make_shared<DataTypeUInt64>()},
+
+        {"csn", std::make_shared<DataTypeUInt64>()},
+
+        {"database", std::make_shared<DataTypeString>()},
+        {"table", std::make_shared<DataTypeString>()},
+        {"uuid", std::make_shared<DataTypeUUID>()},
+        {"part", std::make_shared<DataTypeString>()},
+    };
+}
+
+void TransactionsInfoLogElement::fillCommonFields(const TransactionInfoContext * context)
+{
+    event_time = std::chrono::duration_cast<std::chrono::microseconds>(std::chrono::system_clock::now().time_since_epoch()).count();
+    thread_id = getThreadId();
+
+    query_id = CurrentThread::getQueryId().toString();
+
+    if (!context)
+        return;
+
+    table = context->table;
+    part_name = context->part_name;
+}
+
+void TransactionsInfoLogElement::appendToBlock(MutableColumns & columns) const
+{
+    assert(type != UNKNOWN);
+    size_t i = 0;
+
+    columns[i++]->insert(type);
+    auto event_time_seconds = event_time / 1000000;
+    columns[i++]->insert(DateLUT::instance().toDayNum(event_time_seconds).toUnderType());
+    columns[i++]->insert(event_time);
+    columns[i++]->insert(thread_id);
+
+    columns[i++]->insert(query_id);
+    columns[i++]->insert(Tuple{tid.start_csn, tid.local_tid, tid.host_id});
+    columns[i++]->insert(tid.getHash());
+
+    columns[i++]->insert(csn);
+
+    columns[i++]->insert(table.database_name);
+    columns[i++]->insert(table.table_name);
+    columns[i++]->insert(table.uuid);
+    columns[i++]->insert(part_name);
+}
+
+
+void tryWriteEventToSystemLog(Poco::Logger * log,
+                              TransactionsInfoLogElement::Type type, const TransactionID & tid,
+                              const TransactionInfoContext & context)
+try
+{
+    auto system_log = Context::getGlobalContextInstance()->getTransactionsInfoLog();
+    if (!system_log)
+        return;
+
+    TransactionsInfoLogElement elem;
+    elem.type = type;
+    elem.tid = tid;
+    elem.fillCommonFields(&context);
+    system_log->add(elem);
+}
+catch (...)
+{ + tryLogCurrentException(log); +} + +} diff --git a/src/Interpreters/TransactionsInfoLog.h b/src/Interpreters/TransactionsInfoLog.h new file mode 100644 index 00000000000..f595413a729 --- /dev/null +++ b/src/Interpreters/TransactionsInfoLog.h @@ -0,0 +1,58 @@ +#pragma once + +#include +#include +#include + +namespace DB +{ + +struct TransactionInfoContext; + +struct TransactionsInfoLogElement +{ + enum Type + { + UNKNOWN = 0, + + BEGIN = 1, + COMMIT = 2, + ROLLBACK = 3, + + ADD_PART = 10, + LOCK_PART = 11, + UNLOCK_PART = 12, + }; + + Type type = UNKNOWN; + Decimal64 event_time = 0; + UInt64 thread_id; + + String query_id; + TransactionID tid = Tx::EmptyTID; + + /// For COMMIT events + CSN csn = Tx::UnknownCSN; + + /// For *_PART events + StorageID table = StorageID::createEmpty(); + String part_name; + + static std::string name() { return "TransactionsInfoLog"; } + static NamesAndTypesList getNamesAndTypes(); + static NamesAndAliases getNamesAndAliases() { return {}; } + void appendToBlock(MutableColumns & columns) const; + + void fillCommonFields(const TransactionInfoContext * context = nullptr); +}; + +class TransactionsInfoLog : public SystemLog +{ + using SystemLog::SystemLog; +}; + + +void tryWriteEventToSystemLog(Poco::Logger * log, TransactionsInfoLogElement::Type type, + const TransactionID & tid, const TransactionInfoContext & context); + +} diff --git a/src/Interpreters/executeDDLQueryOnCluster.cpp b/src/Interpreters/executeDDLQueryOnCluster.cpp index f0279bafca2..28bcbcf06dc 100644 --- a/src/Interpreters/executeDDLQueryOnCluster.cpp +++ b/src/Interpreters/executeDDLQueryOnCluster.cpp @@ -63,6 +63,9 @@ BlockIO executeDDLQueryOnCluster(const ASTPtr & query_ptr, ContextPtr context, c BlockIO executeDDLQueryOnCluster(const ASTPtr & query_ptr_, ContextPtr context, AccessRightsElements && query_requires_access) { + if (context->getCurrentTransaction() && context->getSettingsRef().throw_on_unsupported_query_inside_transaction) + throw Exception(ErrorCodes::NOT_IMPLEMENTED, "ON CLUSTER queries inside transactions are not supported"); + /// Remove FORMAT and INTO OUTFILE if exists ASTPtr query_ptr = query_ptr_->clone(); ASTQueryWithOutput::resetOutputASTIfExist(*query_ptr); diff --git a/src/Interpreters/executeQuery.cpp b/src/Interpreters/executeQuery.cpp index dbd7063f3b3..a3232b798e5 100644 --- a/src/Interpreters/executeQuery.cpp +++ b/src/Interpreters/executeQuery.cpp @@ -25,6 +25,8 @@ #include #include #include +#include +#include #include #include #include @@ -49,6 +51,7 @@ #include #include #include +#include #include #include @@ -85,6 +88,7 @@ namespace ErrorCodes { extern const int INTO_OUTFILE_NOT_ALLOWED; extern const int QUERY_WAS_CANCELLED; + extern const int INVALID_TRANSACTION; extern const int LOGICAL_ERROR; extern const int NOT_IMPLEMENTED; } @@ -176,10 +180,15 @@ static void logQuery(const String & query, ContextPtr context, bool internal) if (!comment.empty()) comment = fmt::format(" (comment: {})", comment); - LOG_DEBUG(&Poco::Logger::get("executeQuery"), "(from {}{}{}){} {}", + String transaction_info; + if (auto txn = context->getCurrentTransaction()) + transaction_info = fmt::format(" (TID: {}, TIDH: {})", txn->tid, txn->tid.getHash()); + + LOG_DEBUG(&Poco::Logger::get("executeQuery"), "(from {}{}{}){}{} {}", client_info.current_address.toString(), (current_user != "default" ? ", user: " + current_user : ""), (!initial_query_id.empty() && current_query_id != initial_query_id ? 
", initial_query_id: " + initial_query_id : std::string()), + transaction_info, comment, joinLines(query)); @@ -294,6 +303,9 @@ static void onExceptionBeforeStart(const String & query_for_logging, ContextPtr if (elem.log_comment.size() > settings.max_query_size) elem.log_comment.resize(settings.max_query_size); + if (auto txn = context->getCurrentTransaction()) + elem.tid = txn->tid; + if (settings.calculate_text_stack_trace) setExceptionStackTrace(elem); logException(context, elem); @@ -428,6 +440,13 @@ static std::tuple executeQueryImpl( /// TODO: parser should fail early when max_query_size limit is reached. ast = parseQuery(parser, begin, end, "", max_query_size, settings.max_parser_depth); + if (auto txn = context->getCurrentTransaction()) + { + assert(txn->getState() != MergeTreeTransaction::COMMITTED); + if (txn->getState() == MergeTreeTransaction::ROLLED_BACK && !ast->as() && !ast->as()) + throw Exception(ErrorCodes::INVALID_TRANSACTION, "Cannot execute query: transaction is rolled back"); + } + /// Interpret SETTINGS clauses as early as possible (before invoking the corresponding interpreter), /// to allow settings to take effect. if (const auto * select_query = ast->as()) @@ -629,11 +648,18 @@ static std::tuple executeQueryImpl( const auto & table_id = insert_query->table_id; if (!table_id.empty()) context->setInsertionTable(table_id); + + if (context->getCurrentTransaction() && context->getSettingsRef().throw_on_unsupported_query_inside_transaction) + throw Exception(ErrorCodes::NOT_IMPLEMENTED, "Async inserts inside transactions are not supported"); } else { interpreter = InterpreterFactory::get(ast, context, SelectQueryOptions(stage).setInternal(internal)); + if (context->getCurrentTransaction() && !interpreter->supportsTransactions() && + context->getSettingsRef().throw_on_unsupported_query_inside_transaction) + throw Exception(ErrorCodes::NOT_IMPLEMENTED, "Transactions are not supported for this type of query ({})", ast->getID()); + if (!interpreter->ignoreQuota()) { quota = context->getQuota(); @@ -724,6 +750,9 @@ static std::tuple executeQueryImpl( elem.client_info = client_info; + if (auto txn = context->getCurrentTransaction()) + elem.tid = txn->tid; + bool log_queries = settings.log_queries && !internal; /// Log into system table start of query execution, if need. @@ -945,6 +974,9 @@ static std::tuple executeQueryImpl( log_queries_min_query_duration_ms = settings.log_queries_min_query_duration_ms.totalMilliseconds(), quota(quota), status_info_to_query_log] () mutable { + if (auto txn = context->getCurrentTransaction()) + txn->onException(); + if (quota) quota->used(QuotaType::ERRORS, 1, /* check_exceeded = */ false); @@ -1001,6 +1033,9 @@ static std::tuple executeQueryImpl( } catch (...) { + if (auto txn = context->getCurrentTransaction()) + txn->onException(); + if (!internal) { if (query_for_logging.empty()) diff --git a/src/Parsers/ASTExplainQuery.h b/src/Parsers/ASTExplainQuery.h index abed9803a7b..3f169a93bad 100644 --- a/src/Parsers/ASTExplainQuery.h +++ b/src/Parsers/ASTExplainQuery.h @@ -19,6 +19,7 @@ public: QueryPipeline, /// 'EXPLAIN PIPELINE ...' QueryEstimates, /// 'EXPLAIN ESTIMATE ...' TableOverride, /// 'EXPLAIN TABLE OVERRIDE ...' 
+ CurrentTransaction, /// 'EXPLAIN CURRENT TRANSACTION' }; explicit ASTExplainQuery(ExplainKind kind_) : kind(kind_) {} @@ -111,6 +112,7 @@ private: case QueryPipeline: return "EXPLAIN PIPELINE"; case QueryEstimates: return "EXPLAIN ESTIMATE"; case TableOverride: return "EXPLAIN TABLE OVERRIDE"; + case CurrentTransaction: return "EXPLAIN CURRENT TRANSACTION"; } __builtin_unreachable(); diff --git a/src/Parsers/ASTKillQueryQuery.cpp b/src/Parsers/ASTKillQueryQuery.cpp index 71c3011dd2c..8bf99312544 100644 --- a/src/Parsers/ASTKillQueryQuery.cpp +++ b/src/Parsers/ASTKillQueryQuery.cpp @@ -24,6 +24,9 @@ void ASTKillQueryQuery::formatQueryImpl(const FormatSettings & settings, FormatS case Type::PartMoveToShard: settings.ostr << "PART_MOVE_TO_SHARD"; break; + case Type::Transaction: + settings.ostr << "TRANSACTION"; + break; } formatOnCluster(settings); diff --git a/src/Parsers/ASTKillQueryQuery.h b/src/Parsers/ASTKillQueryQuery.h index 6ff12bcba93..95be3ec6309 100644 --- a/src/Parsers/ASTKillQueryQuery.h +++ b/src/Parsers/ASTKillQueryQuery.h @@ -14,6 +14,7 @@ public: Query, /// KILL QUERY Mutation, /// KILL MUTATION PartMoveToShard, /// KILL PART_MOVE_TO_SHARD + Transaction, /// KILL TRANSACTION }; Type type = Type::Query; diff --git a/src/Parsers/ASTTransactionControl.cpp b/src/Parsers/ASTTransactionControl.cpp new file mode 100644 index 00000000000..3ff29d9e43e --- /dev/null +++ b/src/Parsers/ASTTransactionControl.cpp @@ -0,0 +1,32 @@ +#include +#include +#include + +namespace DB +{ + +void ASTTransactionControl::formatImpl(const FormatSettings & format /*state*/, FormatState &, FormatStateStacked /*frame*/) const +{ + switch (action) + { + case BEGIN: + format.ostr << (format.hilite ? hilite_keyword : "") << "BEGIN TRANSACTION" << (format.hilite ? hilite_none : ""); + break; + case COMMIT: + format.ostr << (format.hilite ? hilite_keyword : "") << "COMMIT" << (format.hilite ? hilite_none : ""); + break; + case ROLLBACK: + format.ostr << (format.hilite ? hilite_keyword : "") << "ROLLBACK" << (format.hilite ? hilite_none : ""); + break; + case SET_SNAPSHOT: + format.ostr << (format.hilite ? hilite_keyword : "") << "SET TRANSACTION SNAPSHOT " << (format.hilite ? hilite_none : "") << snapshot; + break; + } +} + +void ASTTransactionControl::updateTreeHashImpl(SipHash & hash_state) const +{ + hash_state.update(action); +} + +} diff --git a/src/Parsers/ASTTransactionControl.h b/src/Parsers/ASTTransactionControl.h new file mode 100644 index 00000000000..06f578ff138 --- /dev/null +++ b/src/Parsers/ASTTransactionControl.h @@ -0,0 +1,32 @@ +#pragma once +#include + +namespace DB +{ + +/// Common AST for TCL queries +class ASTTransactionControl : public IAST +{ +public: + enum QueryType + { + BEGIN, + COMMIT, + ROLLBACK, + SET_SNAPSHOT, + }; + + QueryType action; + + UInt64 snapshot; /// For SET TRANSACTION SNAPSHOT ... 
+ + ASTTransactionControl(QueryType action_) : action(action_) {} + + String getID(char /*delimiter*/) const override { return "ASTTransactionControl"; } + ASTPtr clone() const override { return std::make_shared(*this); } + + void formatImpl(const FormatSettings & format, FormatState & /*state*/, FormatStateStacked /*frame*/) const override; + void updateTreeHashImpl(SipHash & hash_state) const override; +}; + +} diff --git a/src/Parsers/ParserExplainQuery.cpp b/src/Parsers/ParserExplainQuery.cpp index 63314452447..71c49a020cc 100644 --- a/src/Parsers/ParserExplainQuery.cpp +++ b/src/Parsers/ParserExplainQuery.cpp @@ -22,6 +22,7 @@ bool ParserExplainQuery::parseImpl(Pos & pos, ASTPtr & node, Expected & expected ParserKeyword s_plan("PLAN"); ParserKeyword s_estimates("ESTIMATE"); ParserKeyword s_table_override("TABLE OVERRIDE"); + ParserKeyword s_current_transaction("CURRENT TRANSACTION"); if (s_explain.ignore(pos, expected)) { @@ -39,6 +40,8 @@ bool ParserExplainQuery::parseImpl(Pos & pos, ASTPtr & node, Expected & expected kind = ASTExplainQuery::ExplainKind::QueryEstimates; //-V1048 else if (s_table_override.ignore(pos, expected)) kind = ASTExplainQuery::ExplainKind::TableOverride; + else if (s_current_transaction.ignore(pos, expected)) + kind = ASTExplainQuery::ExplainKind::CurrentTransaction; } else return false; @@ -79,6 +82,10 @@ bool ParserExplainQuery::parseImpl(Pos & pos, ASTPtr & node, Expected & expected explain_query->setTableFunction(table_function); explain_query->setTableOverride(table_override); } + else if (kind == ASTExplainQuery::ExplainKind::CurrentTransaction) + { + /// Nothing to parse + } else if (select_p.parse(pos, query, expected) || create_p.parse(pos, query, expected) || insert_p.parse(pos, query, expected)) diff --git a/src/Parsers/ParserKillQueryQuery.cpp b/src/Parsers/ParserKillQueryQuery.cpp index bc895406c9f..0b1b37e61bf 100644 --- a/src/Parsers/ParserKillQueryQuery.cpp +++ b/src/Parsers/ParserKillQueryQuery.cpp @@ -18,6 +18,7 @@ bool ParserKillQueryQuery::parseImpl(Pos & pos, ASTPtr & node, Expected & expect ParserKeyword p_query{"QUERY"}; ParserKeyword p_mutation{"MUTATION"}; ParserKeyword p_part_move_to_shard{"PART_MOVE_TO_SHARD"}; + ParserKeyword p_transaction{"TRANSACTION"}; ParserKeyword p_on{"ON"}; ParserKeyword p_test{"TEST"}; ParserKeyword p_sync{"SYNC"}; @@ -34,6 +35,8 @@ bool ParserKillQueryQuery::parseImpl(Pos & pos, ASTPtr & node, Expected & expect query->type = ASTKillQueryQuery::Type::Mutation; else if (p_part_move_to_shard.ignore(pos, expected)) query->type = ASTKillQueryQuery::Type::PartMoveToShard; + else if (p_transaction.ignore(pos, expected)) + query->type = ASTKillQueryQuery::Type::Transaction; else return false; diff --git a/src/Parsers/ParserQuery.cpp b/src/Parsers/ParserQuery.cpp index 78d8854f298..eaea5dd0f5f 100644 --- a/src/Parsers/ParserQuery.cpp +++ b/src/Parsers/ParserQuery.cpp @@ -13,6 +13,7 @@ #include #include #include +#include #include #include @@ -46,6 +47,7 @@ bool ParserQuery::parseImpl(Pos & pos, ASTPtr & node, Expected & expected) ParserGrantQuery grant_p; ParserSetRoleQuery set_role_p; ParserExternalDDLQuery external_ddl_p; + ParserTransactionControl transaction_control_p; ParserBackupQuery backup_p; bool res = query_with_output_p.parse(pos, node, expected) @@ -64,6 +66,7 @@ bool ParserQuery::parseImpl(Pos & pos, ASTPtr & node, Expected & expected) || drop_access_entity_p.parse(pos, node, expected) || grant_p.parse(pos, node, expected) || external_ddl_p.parse(pos, node, expected) + || 
transaction_control_p.parse(pos, node, expected) || backup_p.parse(pos, node, expected); return res; diff --git a/src/Parsers/ParserSetQuery.cpp b/src/Parsers/ParserSetQuery.cpp index d50de5f700d..76d6a299857 100644 --- a/src/Parsers/ParserSetQuery.cpp +++ b/src/Parsers/ParserSetQuery.cpp @@ -53,6 +53,10 @@ bool ParserSetQuery::parseImpl(Pos & pos, ASTPtr & node, Expected & expected) if (!s_set.ignore(pos, expected)) return false; + + /// Parse SET TRANSACTION ... queries using ParserTransactionControl + if (ParserKeyword{"TRANSACTION"}.check(pos, expected)) + return false; } SettingsChanges changes; diff --git a/src/Parsers/ParserTransactionControl.cpp b/src/Parsers/ParserTransactionControl.cpp new file mode 100644 index 00000000000..da593170002 --- /dev/null +++ b/src/Parsers/ParserTransactionControl.cpp @@ -0,0 +1,41 @@ +#include +#include +#include +#include +#include + +namespace DB +{ + +bool ParserTransactionControl::parseImpl(Pos & pos, ASTPtr & node, Expected & expected) +{ + ASTTransactionControl::QueryType action; + UInt64 snapshot = 0; + + if (ParserKeyword("BEGIN TRANSACTION").ignore(pos, expected)) + action = ASTTransactionControl::BEGIN; + else if (ParserKeyword("COMMIT").ignore(pos, expected)) + action = ASTTransactionControl::COMMIT; + else if (ParserKeyword("ROLLBACK").ignore(pos, expected)) + action = ASTTransactionControl::ROLLBACK; + else if (ParserKeyword("SET TRANSACTION SNAPSHOT").ignore(pos, expected)) + { + action = ASTTransactionControl::SET_SNAPSHOT; + ASTPtr ast; + if (!ParserNumber{}.parse(pos, ast, expected)) + return false; + + const auto & snapshot_num = ast->as()->value; + if (!snapshot_num.tryGet(snapshot)) + return false; + } + else + return false; + + auto ast = std::make_shared(action); + ast->snapshot = snapshot; + node = ast; + return true; +} + +} diff --git a/src/Parsers/ParserTransactionControl.h b/src/Parsers/ParserTransactionControl.h new file mode 100644 index 00000000000..157c088624c --- /dev/null +++ b/src/Parsers/ParserTransactionControl.h @@ -0,0 +1,14 @@ +#pragma once +#include + +namespace DB +{ + +class ParserTransactionControl : public IParserBase +{ +public: + const char * getName() const override { return "TCL query"; } + bool parseImpl(Pos & pos, ASTPtr & node, Expected & expected) override; +}; + +} diff --git a/src/QueryPipeline/RemoteQueryExecutor.cpp b/src/QueryPipeline/RemoteQueryExecutor.cpp index 110d4308236..f4a30a9fee7 100644 --- a/src/QueryPipeline/RemoteQueryExecutor.cpp +++ b/src/QueryPipeline/RemoteQueryExecutor.cpp @@ -563,7 +563,7 @@ void RemoteQueryExecutor::sendExternalTables() { SelectQueryInfo query_info; auto metadata_snapshot = cur->getInMemoryMetadataPtr(); - auto storage_snapshot = cur->getStorageSnapshot(metadata_snapshot); + auto storage_snapshot = cur->getStorageSnapshot(metadata_snapshot, context); QueryProcessingStage::Enum read_from_table_stage = cur->getQueryProcessingStage( context, QueryProcessingStage::Complete, storage_snapshot, query_info); diff --git a/src/Storages/FileLog/StorageFileLog.cpp b/src/Storages/FileLog/StorageFileLog.cpp index d7c732aee02..47490aae75b 100644 --- a/src/Storages/FileLog/StorageFileLog.cpp +++ b/src/Storages/FileLog/StorageFileLog.cpp @@ -679,7 +679,7 @@ bool StorageFileLog::streamToViews() throw Exception("Engine table " + table_id.getNameForLogs() + " doesn't exist", ErrorCodes::LOGICAL_ERROR); auto metadata_snapshot = getInMemoryMetadataPtr(); - auto storage_snapshot = getStorageSnapshot(metadata_snapshot); + auto storage_snapshot = getStorageSnapshot(metadata_snapshot, 
getContext()); auto max_streams_number = std::min(filelog_settings->max_threads.value, file_infos.file_names.size()); /// No files to parse diff --git a/src/Storages/IStorage.h b/src/Storages/IStorage.h index 17e9e55455c..013f7e97682 100644 --- a/src/Storages/IStorage.h +++ b/src/Storages/IStorage.h @@ -161,6 +161,11 @@ public: /// Returns true if the storage supports reading of subcolumns of complex types. virtual bool supportsSubcolumns() const { return false; } + /// Returns true if the storage supports transactions for SELECT, INSERT and ALTER queries. + /// Storage may throw an exception later if some query kind is not fully supported. + /// This method can return true for readonly engines that return the same rows for reading (such as SystemNumbers) + virtual bool supportsTransactions() const { return false; } + /// Returns true if the storage supports storing of dynamic subcolumns. /// For now it makes sense only for data type Object. virtual bool supportsDynamicSubcolumns() const { return false; } @@ -483,6 +488,16 @@ public: throw Exception("Mutations are not supported by storage " + getName(), ErrorCodes::NOT_IMPLEMENTED); } + virtual void waitForMutation(const String & /*mutation_id*/) + { + throw Exception("Mutations are not supported by storage " + getName(), ErrorCodes::NOT_IMPLEMENTED); + } + + virtual void setMutationCSN(const String & /*mutation_id*/, UInt64 /*csn*/) + { + throw Exception("Mutations are not supported by storage " + getName(), ErrorCodes::NOT_IMPLEMENTED); + } + /// Cancel a part move to shard. virtual CancellationCode killPartMoveToShard(const UUID & /*task_uuid*/) { @@ -553,11 +568,6 @@ public: /// Similar to above but checks for DETACH. It's only used for DICTIONARIES. virtual void checkTableCanBeDetached() const {} - /// Checks that Partition could be dropped right now - /// Otherwise - throws an exception with detailed information. - /// We do not use mutex because it is not very important that the size could change during the operation. - virtual void checkPartitionCanBeDropped(const ASTPtr & /*partition*/) {} - /// Returns true if Storage may store some data on disk. /// NOTE: may not be equivalent to !getDataPaths().empty() virtual bool storesDataOnDisk() const { return false; } @@ -610,15 +620,15 @@ public: virtual std::optional lifetimeBytes() const { return {}; } /// Creates a storage snapshot from given metadata. - virtual StorageSnapshotPtr getStorageSnapshot(const StorageMetadataPtr & metadata_snapshot) const + virtual StorageSnapshotPtr getStorageSnapshot(const StorageMetadataPtr & metadata_snapshot, ContextPtr /*query_context*/) const { return std::make_shared(*this, metadata_snapshot); } /// Creates a storage snapshot from given metadata and columns, which are used in query. 
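+ /// The query context is passed so that storages that support transactions can build the snapshot from the set of data parts visible to the caller's transaction (a descriptive note; see filterVisibleDataParts() in MergeTreeData below).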
- virtual StorageSnapshotPtr getStorageSnapshotForQuery(const StorageMetadataPtr & metadata_snapshot, const ASTPtr & /*query*/) const + virtual StorageSnapshotPtr getStorageSnapshotForQuery(const StorageMetadataPtr & metadata_snapshot, const ASTPtr & /*query*/, ContextPtr query_context) const { - return getStorageSnapshot(metadata_snapshot); + return getStorageSnapshot(metadata_snapshot, query_context); } private: diff --git a/src/Storages/Kafka/StorageKafka.cpp b/src/Storages/Kafka/StorageKafka.cpp index 71a494c93fa..542eb392d01 100644 --- a/src/Storages/Kafka/StorageKafka.cpp +++ b/src/Storages/Kafka/StorageKafka.cpp @@ -647,7 +647,7 @@ bool StorageKafka::streamToViews() CurrentMetrics::Increment metric_increment{CurrentMetrics::KafkaBackgroundReads}; ProfileEvents::increment(ProfileEvents::KafkaBackgroundReads); - auto storage_snapshot = getStorageSnapshot(getInMemoryMetadataPtr()); + auto storage_snapshot = getStorageSnapshot(getInMemoryMetadataPtr(), getContext()); // Create an INSERT query for streaming data auto insert = std::make_shared(); diff --git a/src/Storages/MergeTree/DataPartsExchange.cpp b/src/Storages/MergeTree/DataPartsExchange.cpp index 4e7dcc60696..0dcccc33266 100644 --- a/src/Storages/MergeTree/DataPartsExchange.cpp +++ b/src/Storages/MergeTree/DataPartsExchange.cpp @@ -573,6 +573,7 @@ MergeTreeData::MutableDataPartPtr Fetcher::downloadPartToMemory( auto volume = std::make_shared("volume_" + part_name, disk, 0); MergeTreeData::MutableDataPartPtr new_data_part = std::make_shared(data, part_name, volume); + new_data_part->version.setCreationTID(Tx::PrehistoricTID, nullptr); for (auto i = 0ul; i < projections; ++i) { @@ -601,7 +602,8 @@ MergeTreeData::MutableDataPartPtr Fetcher::downloadPartToMemory( metadata_snapshot->projections.get(projection_name).metadata, block.getNamesAndTypesList(), {}, - CompressionCodecFactory::instance().get("NONE", {})); + CompressionCodecFactory::instance().get("NONE", {}), + NO_TRANSACTION_PTR); part_out.write(block); part_out.finalizePart(new_projection_part, false); @@ -625,7 +627,7 @@ MergeTreeData::MutableDataPartPtr Fetcher::downloadPartToMemory( MergedBlockOutputStream part_out( new_data_part, metadata_snapshot, block.getNamesAndTypesList(), {}, - CompressionCodecFactory::instance().get("NONE", {})); + CompressionCodecFactory::instance().get("NONE", {}), NO_TRANSACTION_PTR); part_out.write(block); part_out.finalizePart(new_data_part, false); @@ -753,6 +755,7 @@ MergeTreeData::MutableDataPartPtr Fetcher::downloadPartToDisk( assertEOF(in); auto volume = std::make_shared("volume_" + part_name, disk, 0); MergeTreeData::MutableDataPartPtr new_data_part = data.createPart(part_name, volume, part_relative_path); + new_data_part->version.setCreationTID(Tx::PrehistoricTID, nullptr); new_data_part->is_temp = true; new_data_part->modification_time = time(nullptr); new_data_part->loadColumnsChecksumsIndexes(true, false); @@ -842,6 +845,7 @@ MergeTreeData::MutableDataPartPtr Fetcher::downloadPartToDiskRemoteMeta( assertEOF(in); MergeTreeData::MutableDataPartPtr new_data_part = data.createPart(part_name, volume, part_relative_path); + new_data_part->version.setCreationTID(Tx::PrehistoricTID, nullptr); new_data_part->is_temp = true; new_data_part->modification_time = time(nullptr); new_data_part->loadColumnsChecksumsIndexes(true, false); diff --git a/src/Storages/MergeTree/IMergeTreeDataPart.cpp b/src/Storages/MergeTree/IMergeTreeDataPart.cpp index 0ed96f5dda4..d704d8fc435 100644 --- a/src/Storages/MergeTree/IMergeTreeDataPart.cpp +++ 
b/src/Storages/MergeTree/IMergeTreeDataPart.cpp @@ -17,7 +17,6 @@ #include #include #include -#include #include #include #include @@ -29,6 +28,8 @@ #include #include #include +#include +#include namespace CurrentMetrics @@ -65,6 +66,12 @@ namespace ErrorCodes extern const int NOT_IMPLEMENTED; } +static std::unique_ptr openForReading(const DiskPtr & disk, const String & path) +{ + size_t file_size = disk->getFileSize(path); + return disk->readFile(path, ReadSettings().adjustBufferSize(file_size), file_size); +} + void IMergeTreeDataPart::MinMaxIndex::load(const MergeTreeData & data, const PartMetadataManagerPtr & manager) { auto metadata_snapshot = data.getInMemoryMetadataPtr(); @@ -466,6 +473,7 @@ SerializationPtr IMergeTreeDataPart::getSerialization(const NameAndTypePair & co void IMergeTreeDataPart::removeIfNeeded() { + assert(assertHasValidVersionMetadata()); if (!is_temp && state != State::DeleteOnDestroy) return; @@ -790,10 +798,14 @@ NameSet IMergeTreeDataPart::getFileNamesWithoutChecksums() const NameSet result = {"checksums.txt", "columns.txt"}; String default_codec_path = fs::path(getFullRelativePath()) / DEFAULT_COMPRESSION_CODEC_FILE_NAME; + String txn_version_path = fs::path(getFullRelativePath()) / TXN_VERSION_METADATA_FILE_NAME; if (volume->getDisk()->exists(default_codec_path)) result.emplace(DEFAULT_COMPRESSION_CODEC_FILE_NAME); + if (volume->getDisk()->exists(txn_version_path)) + result.emplace(TXN_VERSION_METADATA_FILE_NAME); + return result; } @@ -1223,6 +1235,218 @@ void IMergeTreeDataPart::loadColumns(bool require) setSerializationInfos(infos); } +void IMergeTreeDataPart::assertHasVersionMetadata(MergeTreeTransaction * txn) const +{ + TransactionID expected_tid = txn ? txn->tid : Tx::PrehistoricTID; + if (version.creation_tid != expected_tid) + throw Exception(ErrorCodes::LOGICAL_ERROR, + "CreationTID of part {} (table {}) is set to unexpected value {}, it's a bug. Current transaction: {}", + name, storage.getStorageID().getNameForLogs(), version.creation_tid, txn ? txn->dumpDescription() : ""); + + assert(!txn || storage.supportsTransactions()); + assert(!txn || volume->getDisk()->exists(fs::path(getFullRelativePath()) / TXN_VERSION_METADATA_FILE_NAME)); +} + +void IMergeTreeDataPart::storeVersionMetadata() const +{ + if (!wasInvolvedInTransaction()) + return; + + LOG_TEST(storage.log, "Writing version for {} (creation: {}, removal {})", name, version.creation_tid, version.removal_tid); + assert(storage.supportsTransactions()); + + if (!isStoredOnDisk()) + throw Exception(ErrorCodes::NOT_IMPLEMENTED, "Transactions are not supported for in-memory parts (table: {}, part: {})", + storage.getStorageID().getNameForLogs(), name); + + String version_file_name = fs::path(getFullRelativePath()) / TXN_VERSION_METADATA_FILE_NAME; + String tmp_version_file_name = version_file_name + ".tmp"; + DiskPtr disk = volume->getDisk(); + { + /// TODO IDisk interface does not allow to open file with O_EXCL flag (for DiskLocal), + /// so we create empty file at first (expecting that createFile throws if file already exists) + /// and then overwrite it. 
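+ /// Note: if we crash between createFile() and replaceFile() below, only the *.tmp file survives; + /// loadVersionMetadata() treats a leftover *.tmp file as an uncommitted write and removes it.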
+ disk->createFile(tmp_version_file_name); + auto out = disk->writeFile(tmp_version_file_name, 256, WriteMode::Rewrite); + version.write(*out); + out->finalize(); + out->sync(); + } + + SyncGuardPtr sync_guard; + if (storage.getSettings()->fsync_part_directory) + sync_guard = disk->getDirectorySyncGuard(getFullRelativePath()); + disk->replaceFile(tmp_version_file_name, version_file_name); +} + +void IMergeTreeDataPart::appendCSNToVersionMetadata(VersionMetadata::WhichCSN which_csn) const +{ + assert(!version.creation_tid.isEmpty()); + assert(!(which_csn == VersionMetadata::WhichCSN::CREATION && version.creation_tid.isPrehistoric())); + assert(!(which_csn == VersionMetadata::WhichCSN::CREATION && version.creation_csn == 0)); + assert(!(which_csn == VersionMetadata::WhichCSN::REMOVAL && (version.removal_tid.isPrehistoric() || version.removal_tid.isEmpty()))); + assert(!(which_csn == VersionMetadata::WhichCSN::REMOVAL && version.removal_csn == 0)); + assert(isStoredOnDisk()); + + /// Small enough appends to file are usually atomic, + /// so we append new metadata instead of rewriting file to reduce number of fsyncs. + /// We don't need to do fsync when writing CSN, because in case of hard restart + /// we will be able to restore CSN from transaction log in Keeper. + + String version_file_name = fs::path(getFullRelativePath()) / TXN_VERSION_METADATA_FILE_NAME; + DiskPtr disk = volume->getDisk(); + auto out = disk->writeFile(version_file_name, 256, WriteMode::Append); + version.writeCSN(*out, which_csn); + out->finalize(); +} + +void IMergeTreeDataPart::appendRemovalTIDToVersionMetadata(bool clear) const +{ + assert(!version.creation_tid.isEmpty()); + assert(version.removal_csn == 0); + assert(!version.removal_tid.isEmpty()); + assert(isStoredOnDisk()); + + if (version.creation_tid.isPrehistoric() && !clear) + { + /// Metadata file probably does not exist, because it was not written on part creation, because it was created without a transaction. + /// Let's create it (if needed). Concurrent writes are not possible, because creation_csn is prehistoric and we own removal_tid_lock. 
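+ /// In this case storeVersionMetadata() persists the whole file at once, including the removal TID we have just locked, so no separate append is needed.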
+ storeVersionMetadata(); + return; + } + + if (clear) + LOG_TEST(storage.log, "Clearing removal TID for {} (creation: {}, removal {})", name, version.creation_tid, version.removal_tid); + else + LOG_TEST(storage.log, "Appending removal TID for {} (creation: {}, removal {})", name, version.creation_tid, version.removal_tid); + + String version_file_name = fs::path(getFullRelativePath()) / TXN_VERSION_METADATA_FILE_NAME; + DiskPtr disk = volume->getDisk(); + auto out = disk->writeFile(version_file_name, 256, WriteMode::Append); + version.writeRemovalTID(*out, clear); + out->finalize(); + + /// fsync is not required when we are clearing the removal TID, because after a hard restart we will fix the metadata + if (!clear) + out->sync(); +} + +void IMergeTreeDataPart::loadVersionMetadata() const +try +{ + String version_file_name = fs::path(getFullRelativePath()) / TXN_VERSION_METADATA_FILE_NAME; + String tmp_version_file_name = version_file_name + ".tmp"; + DiskPtr disk = volume->getDisk(); + + auto remove_tmp_file = [&]() + { + auto last_modified = disk->getLastModified(tmp_version_file_name); + auto buf = openForReading(disk, tmp_version_file_name); + String content; + readStringUntilEOF(content, *buf); + LOG_WARNING(storage.log, "Found file {} that was last modified on {}, has size {} and the following content: {}", + tmp_version_file_name, last_modified.epochTime(), content.size(), content); + disk->removeFile(tmp_version_file_name); + }; + + if (disk->exists(version_file_name)) + { + auto buf = openForReading(disk, version_file_name); + version.read(*buf); + if (disk->exists(tmp_version_file_name)) + remove_tmp_file(); + return; + } + + /// Four (?) cases are possible: + /// 1. Part was created without transactions. + /// 2. Version metadata file was not renamed from *.tmp on part creation. + /// 3. Version metadata was written to the *.tmp file, but a hard restart happened before fsync. + /// 4. Fsyncs in storeVersionMetadata() work incorrectly. + + if (!disk->exists(tmp_version_file_name)) + { + /// Case 1. + /// We do not have version metadata and transaction history for old parts, + /// so let's consider that such parts were created by some ancient transaction + /// and were committed with some prehistoric CSN. + /// NOTE It might be Case 3, but the version metadata file is written on part creation before other files, + /// so it's not Case 3 if the part is not broken. + version.setCreationTID(Tx::PrehistoricTID, nullptr); + version.creation_csn = Tx::PrehistoricCSN; + return; + } + + /// Case 2. + /// The content of the *.tmp file may be broken, so just use a fake TID. + /// The transaction was not committed if the *.tmp file was not renamed, so we should complete the rollback by removing the part.
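+ /// Assigning DummyTID with creation_csn = RolledBackCSN makes the part invisible to every snapshot, + /// so cleanup (see grabOldParts() and the remove_rolled_back_parts_immediately setting) can collect it.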
+ version.setCreationTID(Tx::DummyTID, nullptr); + version.creation_csn = Tx::RolledBackCSN; + remove_tmp_file(); +} +catch (Exception & e) +{ + e.addMessage("While loading version metadata from table {} part {}", storage.getStorageID().getNameForLogs(), name); + throw; +} + +bool IMergeTreeDataPart::wasInvolvedInTransaction() const +{ + assert(!version.creation_tid.isEmpty() || (state == State::Temporary /* && std::uncaught_exceptions() */)); + bool created_by_transaction = !version.creation_tid.isPrehistoric(); + bool removed_by_transaction = version.isRemovalTIDLocked() && version.removal_tid_lock != Tx::PrehistoricTID.getHash(); + return created_by_transaction || removed_by_transaction; +} + +bool IMergeTreeDataPart::assertHasValidVersionMetadata() const +{ + /// We don't have many tests with server restarts and it's really inconvenient to write such tests. + /// So we use debug assertions to ensure that part version is written correctly. + /// This method is not supposed to be called in release builds. + + if (isProjectionPart()) + return true; + + if (!wasInvolvedInTransaction()) + return true; + + if (!isStoredOnDisk()) + return false; + + if (part_is_probably_removed_from_disk) + return true; + + DiskPtr disk = volume->getDisk(); + if (!disk->exists(getFullRelativePath())) + return true; + + String content; + String version_file_name = fs::path(getFullRelativePath()) / TXN_VERSION_METADATA_FILE_NAME; + try + { + auto buf = openForReading(disk, version_file_name); + readStringUntilEOF(content, *buf); + ReadBufferFromString str_buf{content}; + VersionMetadata file; + file.read(str_buf); + bool valid_creation_tid = version.creation_tid == file.creation_tid; + bool valid_removal_tid = version.removal_tid == file.removal_tid || version.removal_tid == Tx::PrehistoricTID; + bool valid_creation_csn = version.creation_csn == file.creation_csn || version.creation_csn == Tx::RolledBackCSN; + bool valid_removal_csn = version.removal_csn == file.removal_csn || version.removal_csn == Tx::PrehistoricCSN; + if (!valid_creation_tid || !valid_removal_tid || !valid_creation_csn || !valid_removal_csn) + throw Exception(ErrorCodes::CORRUPTED_DATA, "Invalid version metadata file"); + return true; + } + catch (...) 
+ { + WriteBufferFromOwnString expected; + version.write(expected); + tryLogCurrentException(storage.log, fmt::format("File {} contains:\n{}\nexpected:\n{}", version_file_name, content, expected.str())); + return false; + } +} + + void IMergeTreeDataPart::appendFilesOfColumns(Strings & files) { files.push_back("columns.txt"); @@ -1337,6 +1561,9 @@ void IMergeTreeDataPart::initializePartMetadataManager() void IMergeTreeDataPart::remove() const { + assert(assertHasValidVersionMetadata()); + part_is_probably_removed_from_disk = true; + std::optional keep_shared_data = keepSharedDataInDecoupledStorage(); if (!keep_shared_data.has_value()) return; @@ -1444,6 +1671,7 @@ void IMergeTreeDataPart::remove() const request.emplace_back(fs::path(to) / DEFAULT_COMPRESSION_CODEC_FILE_NAME, true); request.emplace_back(fs::path(to) / DELETE_ON_DESTROY_MARKER_FILE_NAME, true); + request.emplace_back(fs::path(to) / TXN_VERSION_METADATA_FILE_NAME, true); disk->removeSharedFiles(request, *keep_shared_data); disk->removeDirectory(to); diff --git a/src/Storages/MergeTree/IMergeTreeDataPart.h b/src/Storages/MergeTree/IMergeTreeDataPart.h index 694be50d168..19df88c5466 100644 --- a/src/Storages/MergeTree/IMergeTreeDataPart.h +++ b/src/Storages/MergeTree/IMergeTreeDataPart.h @@ -13,6 +13,7 @@ #include #include #include +#include #include #include @@ -40,6 +41,7 @@ class IMergeTreeReader; class IMergeTreeDataPartWriter; class MarkCache; class UncompressedCache; +class MergeTreeTransaction; /// Description of the data part. class IMergeTreeDataPart : public std::enable_shared_from_this @@ -327,6 +329,8 @@ public: CompressionCodecPtr default_codec; + mutable VersionMetadata version; + /// For data in RAM ('index') UInt64 getIndexSizeInBytes() const; UInt64 getIndexSizeInAllocatedBytes() const; @@ -414,6 +418,8 @@ public: /// (number of rows, number of rows with default values, etc). static inline constexpr auto SERIALIZATION_FILE_NAME = "serialization.json"; + static inline constexpr auto TXN_VERSION_METADATA_FILE_NAME = "txn_version.txt"; + /// One of part files which is used to check how many references (I'd like /// to say hardlinks, but it will confuse even more) we have for the part /// for zero copy replication. Sadly it's very complex. @@ -435,12 +441,38 @@ public: /// Required for distinguish different copies of the same part on remote FS. String getUniqueId() const; + /// Ensures that creation_tid was correctly set after part creation. + void assertHasVersionMetadata(MergeTreeTransaction * txn) const; + + /// [Re]writes file with transactional metadata on disk + void storeVersionMetadata() const; + + /// Appends the corresponding CSN to file on disk (without fsync) + void appendCSNToVersionMetadata(VersionMetadata::WhichCSN which_csn) const; + + /// Appends removal TID to file on disk (with fsync) + void appendRemovalTIDToVersionMetadata(bool clear = false) const; + + /// Loads transactional metadata from disk + void loadVersionMetadata() const; + + /// Returns true if part was created or removed by a transaction + bool wasInvolvedInTransaction() const; + + /// Moar hardening: this method is supposed to be used for debug assertions + bool assertHasValidVersionMetadata() const; + + /// Return hardlink count for part. + /// Required to keep data on remote FS when part has shadow copies.
+ UInt32 getNumberOfRefereneces() const; + /// Get checksums of metadata file in part directory IMergeTreeDataPart::uint128 getActualChecksumByFile(const String & file_path) const; /// Check metadata in cache is consistent with actual metadata on disk(if use_metadata_cache is true) std::unordered_map checkMetadata() const; + protected: /// Total size of all columns, calculated once in calcuateColumnSizesOnDisk @@ -551,6 +583,9 @@ private: CompressionCodecPtr detectDefaultCompressionCodec() const; mutable State state{State::Temporary}; + + /// This ugly flag is needed for debug assertions only + mutable bool part_is_probably_removed_from_disk = false; }; using MergeTreeDataPartState = IMergeTreeDataPart::State; diff --git a/src/Storages/MergeTree/MergeFromLogEntryTask.cpp b/src/Storages/MergeTree/MergeFromLogEntryTask.cpp index 68ffb42a90a..9459849b90a 100644 --- a/src/Storages/MergeTree/MergeFromLogEntryTask.cpp +++ b/src/Storages/MergeTree/MergeFromLogEntryTask.cpp @@ -227,7 +227,7 @@ ReplicatedMergeMutateTaskBase::PrepareResult MergeFromLogEntryTask::prepare() future_merged_part, settings); - transaction_ptr = std::make_unique(storage); + transaction_ptr = std::make_unique(storage, NO_TRANSACTION_RAW); stopwatch_ptr = std::make_unique(); merge_task = storage.merger_mutator.mergePartsToTemporaryPart( @@ -241,7 +241,8 @@ ReplicatedMergeMutateTaskBase::PrepareResult MergeFromLogEntryTask::prepare() reserved_space, entry.deduplicate, entry.deduplicate_by_columns, - storage.merging_params); + storage.merging_params, + NO_TRANSACTION_PTR); /// Adjust priority @@ -264,7 +265,7 @@ bool MergeFromLogEntryTask::finalize(ReplicatedMergeMutateTaskBase::PartLogWrite /// Task is not needed merge_task.reset(); - storage.merger_mutator.renameMergedTemporaryPart(part, parts, transaction_ptr.get()); + storage.merger_mutator.renameMergedTemporaryPart(part, parts, NO_TRANSACTION_PTR, transaction_ptr.get()); try { diff --git a/src/Storages/MergeTree/MergeMutateSelectedEntry.h b/src/Storages/MergeTree/MergeMutateSelectedEntry.h index 64136205157..c420cbca12b 100644 --- a/src/Storages/MergeTree/MergeMutateSelectedEntry.h +++ b/src/Storages/MergeTree/MergeMutateSelectedEntry.h @@ -39,10 +39,13 @@ struct MergeMutateSelectedEntry FutureMergedMutatedPartPtr future_part; CurrentlyMergingPartsTaggerPtr tagger; MutationCommandsConstPtr commands; - MergeMutateSelectedEntry(FutureMergedMutatedPartPtr future_part_, CurrentlyMergingPartsTaggerPtr tagger_, MutationCommandsConstPtr commands_) + MergeTreeTransactionPtr txn; + MergeMutateSelectedEntry(FutureMergedMutatedPartPtr future_part_, CurrentlyMergingPartsTaggerPtr tagger_, + MutationCommandsConstPtr commands_, const MergeTreeTransactionPtr & txn_ = NO_TRANSACTION_PTR) : future_part(future_part_) , tagger(std::move(tagger_)) , commands(commands_) + , txn(txn_) {} }; diff --git a/src/Storages/MergeTree/MergePlainMergeTreeTask.cpp b/src/Storages/MergeTree/MergePlainMergeTreeTask.cpp index 14e43b2897e..0146ce4c7b3 100644 --- a/src/Storages/MergeTree/MergePlainMergeTreeTask.cpp +++ b/src/Storages/MergeTree/MergePlainMergeTreeTask.cpp @@ -107,14 +107,15 @@ void MergePlainMergeTreeTask::prepare() merge_mutate_entry->tagger->reserved_space, deduplicate, deduplicate_by_columns, - storage.merging_params); + storage.merging_params, + txn); } void MergePlainMergeTreeTask::finish() { new_part = merge_task->getFuture().get(); - storage.merger_mutator.renameMergedTemporaryPart(new_part, future_part->parts, nullptr); + storage.merger_mutator.renameMergedTemporaryPart(new_part, 
future_part->parts, txn, nullptr); write_part_log({}); } diff --git a/src/Storages/MergeTree/MergePlainMergeTreeTask.h b/src/Storages/MergeTree/MergePlainMergeTreeTask.h index 2c36386d32d..0f6d38d2cbf 100644 --- a/src/Storages/MergeTree/MergePlainMergeTreeTask.h +++ b/src/Storages/MergeTree/MergePlainMergeTreeTask.h @@ -39,6 +39,12 @@ public: StorageID getStorageID() override; UInt64 getPriority() override { return priority; } + void setCurrentTransaction(MergeTreeTransactionHolder && txn_holder_, MergeTreeTransactionPtr && txn_) + { + txn_holder = std::move(txn_holder_); + txn = std::move(txn_); + } + private: void prepare(); @@ -73,6 +79,9 @@ private: std::function write_part_log; IExecutableTask::TaskResultCallback task_result_callback; MergeTaskPtr merge_task{nullptr}; + + MergeTreeTransactionHolder txn_holder; + MergeTreeTransactionPtr txn; }; diff --git a/src/Storages/MergeTree/MergeTask.cpp b/src/Storages/MergeTree/MergeTask.cpp index 1bb77fd27ef..e3146f169a7 100644 --- a/src/Storages/MergeTree/MergeTask.cpp +++ b/src/Storages/MergeTree/MergeTask.cpp @@ -260,6 +260,7 @@ bool MergeTask::ExecuteAndFinalizeHorizontalPart::prepare() global_ctx->merging_columns, MergeTreeIndexFactory::instance().getMany(global_ctx->metadata_snapshot->getSecondaryIndices()), ctx->compression_codec, + global_ctx->txn, /*reset_columns=*/ true, ctx->blocks_are_granules_size, global_ctx->context->getWriteSettings()); @@ -593,6 +594,7 @@ bool MergeTask::MergeProjectionsStage::mergeMinMaxIndexAndPrepareProjections() c projection_merging_params, global_ctx->new_data_part.get(), ".proj", + NO_TRANSACTION_PTR, global_ctx->data, global_ctx->mutator, global_ctx->merges_blocker, diff --git a/src/Storages/MergeTree/MergeTask.h b/src/Storages/MergeTree/MergeTask.h index 04da9ad77c4..efab102bfe6 100644 --- a/src/Storages/MergeTree/MergeTask.h +++ b/src/Storages/MergeTree/MergeTask.h @@ -60,6 +60,7 @@ public: MergeTreeData::MergingParams merging_params_, const IMergeTreeDataPart * parent_part_, String suffix_, + MergeTreeTransactionPtr txn, MergeTreeData * data_, MergeTreeDataMergerMutator * mutator_, ActionBlocker * merges_blocker_, @@ -83,6 +84,7 @@ public: global_ctx->mutator = std::move(mutator_); global_ctx->merges_blocker = std::move(merges_blocker_); global_ctx->ttl_merges_blocker = std::move(ttl_merges_blocker_); + global_ctx->txn = std::move(txn); auto prepare_stage_ctx = std::make_shared(); @@ -164,6 +166,8 @@ private: std::promise promise{}; IMergedBlockOutputStream::WrittenOffsetColumns written_offset_columns{}; + + MergeTreeTransactionPtr txn; }; using GlobalRuntimeContextPtr = std::shared_ptr; diff --git a/src/Storages/MergeTree/MergeTreeData.cpp b/src/Storages/MergeTree/MergeTreeData.cpp index 90ccba9046d..a541822d6c1 100644 --- a/src/Storages/MergeTree/MergeTreeData.cpp +++ b/src/Storages/MergeTree/MergeTreeData.cpp @@ -30,8 +30,10 @@ #include #include #include +#include #include #include +#include #include #include #include @@ -49,8 +51,6 @@ #include #include #include -#include -#include #include #include #include @@ -1320,51 +1320,155 @@ void MergeTreeData::loadDataParts(bool skip_sanity_checks) for (auto & part : duplicate_parts_to_remove) part->remove(); + auto deactivate_part = [&] (DataPartIteratorByStateAndInfo it) + { + + (*it)->remove_time.store((*it)->modification_time, std::memory_order_relaxed); + auto creation_csn = (*it)->version.creation_csn.load(std::memory_order_relaxed); + if (creation_csn != Tx::RolledBackCSN && creation_csn != Tx::PrehistoricCSN && !(*it)->version.isRemovalTIDLocked()) 
+ { + /// It's possible that the covering part was created without a transaction, + /// but if the covered part was created with a transaction (i.e. creation_tid is not prehistoric), + /// then it must have a removal TID in its metadata file. + throw Exception(ErrorCodes::LOGICAL_ERROR, "Data part {} is Outdated and has creation TID {} and CSN {}, " + "but does not have removal tid. It's a bug or a result of manual intervention.", + (*it)->name, (*it)->version.creation_tid, creation_csn); + } + modifyPartState(it, DataPartState::Outdated); + removePartContributionToDataVolume(*it); + }; + + /// All parts are in "Active" state after loading + assert(std::find_if(data_parts_by_state_and_info.begin(), data_parts_by_state_and_info.end(), + [](const auto & part) + { + return part->getState() != DataPartState::Active; + }) == data_parts_by_state_and_info.end()); + + bool have_parts_with_version_metadata = false; + auto iter = data_parts_by_state_and_info.begin(); + while (iter != data_parts_by_state_and_info.end() && (*iter)->getState() == DataPartState::Active) + { + const DataPartPtr & part = *iter; + part->loadVersionMetadata(); + VersionMetadata & version = part->version; + if (part->wasInvolvedInTransaction()) + { + have_parts_with_version_metadata = true; + } + else + { + ++iter; + continue; + } + + /// Check if CSNs were written after committing the transaction; update and write if needed. + bool version_updated = false; + assert(!version.creation_tid.isEmpty()); + if (!part->version.creation_csn) + { + auto min = TransactionLog::getCSN(version.creation_tid); + if (!min) + { + /// Transaction that created this part was not committed. Remove part. + TransactionLog::assertTIDIsNotOutdated(version.creation_tid); + min = Tx::RolledBackCSN; + } + LOG_TRACE(log, "Will fix version metadata of {} after unclean restart: part has creation_tid={}, setting creation_csn={}", + part->name, version.creation_tid, min); + version.creation_csn = min; + version_updated = true; + } + if (!version.removal_tid.isEmpty() && !part->version.removal_csn) + { + auto max = TransactionLog::getCSN(version.removal_tid); + if (max) + { + LOG_TRACE(log, "Will fix version metadata of {} after unclean restart: part has removal_tid={}, setting removal_csn={}", + part->name, version.removal_tid, max); + version.removal_csn = max; + } + else + { + TransactionLog::assertTIDIsNotOutdated(version.removal_tid); + /// Transaction that tried to remove this part was not committed. Clear removal_tid. + LOG_TRACE(log, "Will fix version metadata of {} after unclean restart: clearing removal_tid={}", + part->name, version.removal_tid); + version.unlockRemovalTID(version.removal_tid, TransactionInfoContext{getStorageID(), part->name}); + } + version_updated = true; + } + + /// Sanity checks + bool csn_order = !version.removal_csn || version.creation_csn <= version.removal_csn || version.removal_csn == Tx::PrehistoricCSN; + bool min_start_csn_order = version.creation_tid.start_csn <= version.creation_csn; + bool max_start_csn_order = version.removal_tid.start_csn <= version.removal_csn; + bool creation_csn_known = version.creation_csn; + if (!csn_order || !min_start_csn_order || !max_start_csn_order || !creation_csn_known) + throw Exception(ErrorCodes::LOGICAL_ERROR, "Part {} has invalid version metadata: {}", part->name, version.toString()); + + if (version_updated) + part->storeVersionMetadata(); + + /// Deactivate part if creation was not committed or if removal was.
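+ /// (such a part is invisible to all future snapshots, so it can go straight to Outdated)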
+ if (version.creation_csn == Tx::RolledBackCSN || version.removal_csn) + { + auto next_it = std::next(iter); + deactivate_part(iter); + iter = next_it; + } + else + { + ++iter; + } + } + + if (have_parts_with_version_metadata) + transactions_enabled.store(true); + /// Delete from the set of current parts those parts that are covered by another part (those parts that /// were merged), but that for some reason are still not deleted from the filesystem. /// Deletion of files will be performed later in the clearOldParts() method. - if (data_parts_indexes.size() >= 2) + auto active_parts_range = getDataPartsStateRange(DataPartState::Active); + auto prev_it = active_parts_range.begin(); + auto end_it = active_parts_range.end(); + + bool less_than_two_active_parts = prev_it == end_it || std::next(prev_it) == end_it; + + if (!less_than_two_active_parts) { - /// Now all parts are committed, so data_parts_by_state_and_info == committed_parts_range - auto prev_jt = data_parts_by_state_and_info.begin(); - auto curr_jt = std::next(prev_jt); + (*prev_it)->assertState({DataPartState::Active}); + auto curr_it = std::next(prev_it); - auto deactivate_part = [&] (DataPartIteratorByStateAndInfo it) + while (curr_it != data_parts_by_state_and_info.end() && (*curr_it)->getState() == DataPartState::Active) { - (*it)->remove_time.store((*it)->modification_time, std::memory_order_relaxed); - modifyPartState(it, DataPartState::Outdated); - removePartContributionToDataVolume(*it); - }; + (*curr_it)->assertState({DataPartState::Active}); - (*prev_jt)->assertState({DataPartState::Active}); - - while (curr_jt != data_parts_by_state_and_info.end() && (*curr_jt)->getState() == DataPartState::Active) - { /// Don't consider data parts belonging to different partitions. - if ((*curr_jt)->info.partition_id != (*prev_jt)->info.partition_id) + if ((*curr_it)->info.partition_id != (*prev_it)->info.partition_id) { - ++prev_jt; - ++curr_jt; + ++prev_it; + ++curr_it; continue; } - if ((*curr_jt)->contains(**prev_jt)) + if ((*curr_it)->contains(**prev_it)) { - deactivate_part(prev_jt); - prev_jt = curr_jt; - ++curr_jt; + deactivate_part(prev_it); + prev_it = curr_it; + ++curr_it; } - else if ((*prev_jt)->contains(**curr_jt)) + else if ((*prev_it)->contains(**curr_it)) { - auto next = std::next(curr_jt); - deactivate_part(curr_jt); - curr_jt = next; + auto next = std::next(curr_it); + deactivate_part(curr_it); + curr_it = next; } else { - ++prev_jt; - ++curr_jt; + ++prev_it; + ++curr_it; } } } @@ -1482,12 +1586,20 @@ MergeTreeData::DataPartsVector MergeTreeData::grabOldParts(bool force) { const DataPartPtr & part = *it; + /// Do not remove outdated part if it may be visible for some transaction + if (!part->version.canBeRemoved()) + continue; + auto part_remove_time = part->remove_time.load(std::memory_order_relaxed); - if (part.unique() && /// Grab only parts that are not used by anyone (SELECTs for example). - ((part_remove_time < now && - now - part_remove_time > getSettings()->old_parts_lifetime.totalSeconds()) || force - || isInMemoryPart(part))) /// Remove in-memory parts immediately to not store excessive data in RAM + /// Grab only parts that are not used by anyone (SELECTs for example). 
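+ /// part.unique() means this DataPartPtr is the only remaining reference, i.e. no running query still holds the part.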
+ if (!part.unique()) + continue; + + if ((part_remove_time < now && now - part_remove_time > getSettings()->old_parts_lifetime.totalSeconds()) + || force + || isInMemoryPart(part) /// Remove in-memory parts immediately to not store excessive data in RAM + || (part->version.creation_csn == Tx::RolledBackCSN && getSettings()->remove_rolled_back_parts_immediately)) { parts_to_delete.emplace_back(it); } @@ -1578,7 +1690,7 @@ void MergeTreeData::flushAllInMemoryPartsIfNeeded() return; auto metadata_snapshot = getInMemoryMetadataPtr(); - DataPartsVector parts = getDataPartsVector(); + DataPartsVector parts = getDataPartsVectorForInternalUsage(); for (const auto & part : parts) { if (auto part_in_memory = asInMemoryPart(part)) @@ -1641,7 +1753,7 @@ void MergeTreeData::clearPartsFromFilesystem(const DataPartsVector & parts_to_re size_t MergeTreeData::clearOldWriteAheadLogs() { - DataPartsVector parts = getDataPartsVector(); + DataPartsVector parts = getDataPartsVectorForInternalUsage(); std::vector> all_block_numbers_on_disk; std::vector> block_numbers_on_disk; @@ -1706,14 +1818,19 @@ size_t MergeTreeData::clearEmptyParts() return 0; size_t cleared_count = 0; - auto parts = getDataPartsVector(); + auto parts = getDataPartsVectorForInternalUsage(); for (const auto & part : parts) { - if (part->rows_count == 0) - { - dropPartNoWaitNoThrow(part->name); - ++cleared_count; - } + if (part->rows_count != 0) + continue; + + /// Do not try to drop uncommitted parts. + if (!part->version.getCreationTID().isPrehistoric() && !part->version.isVisible(TransactionLog::instance().getLatestSnapshot())) + continue; + + LOG_TRACE(log, "Will drop empty part {}", part->name); + dropPartNoWaitNoThrow(part->name); + ++cleared_count; } return cleared_count; } @@ -2215,7 +2332,7 @@ void MergeTreeData::checkAlterIsPossible(const AlterCommands & commands, Context } } - for (const auto & part : getDataPartsVector()) + for (const auto & part : getDataPartsVectorForInternalUsage()) { bool at_least_one_column_rest = false; for (const auto & column : part->getColumns()) @@ -2500,6 +2617,7 @@ MergeTreeData::DataPartsVector MergeTreeData::getActivePartsToReplace( bool MergeTreeData::renameTempPartAndAdd( MutableDataPartPtr & part, + MergeTreeTransaction * txn, SimpleIncrement * increment, Transaction * out_transaction, MergeTreeDeduplicationLog * deduplication_log, @@ -2512,7 +2630,7 @@ bool MergeTreeData::renameTempPartAndAdd( DataPartsVector covered_parts; { auto lock = lockParts(); - if (!renameTempPartAndReplace(part, increment, out_transaction, lock, &covered_parts, deduplication_log, deduplication_token)) + if (!renameTempPartAndReplace(part, txn, increment, out_transaction, lock, &covered_parts, deduplication_log, deduplication_token)) return false; } if (!covered_parts.empty()) @@ -2525,6 +2643,7 @@ bool MergeTreeData::renameTempPartAndAdd( bool MergeTreeData::renameTempPartAndReplace( MutableDataPartPtr & part, + MergeTreeTransaction * txn, SimpleIncrement * increment, Transaction * out_transaction, std::unique_lock & lock, @@ -2536,6 +2655,9 @@ bool MergeTreeData::renameTempPartAndReplace( throw Exception("MergeTreeData::Transaction for one table cannot be used with another. 
It is a bug.", ErrorCodes::LOGICAL_ERROR); + if (txn) + transactions_enabled.store(true); + part->assertState({DataPartState::Temporary}); MergeTreePartInfo part_info = part->info; @@ -2578,7 +2700,6 @@ bool MergeTreeData::renameTempPartAndReplace( DataPartPtr covering_part; DataPartsVector covered_parts = getActivePartsToReplace(part_info, part_name, covering_part, lock); - DataPartsVector covered_parts_in_memory; if (covering_part) { @@ -2614,6 +2735,9 @@ bool MergeTreeData::renameTempPartAndReplace( part->renameTo(part_name, true); auto part_it = data_parts_indexes.insert(part).first; + /// FIXME Transactions: it's not the best place for checking and setting removal_tid, + /// because it's too optimistic. We should lock removal_tid of covered parts at the beginning of operation. + MergeTreeTransaction::addNewPartAndRemoveCovered(shared_from_this(), part, covered_parts, txn); if (out_transaction) { @@ -2669,7 +2793,8 @@ bool MergeTreeData::renameTempPartAndReplace( } MergeTreeData::DataPartsVector MergeTreeData::renameTempPartAndReplace( - MutableDataPartPtr & part, SimpleIncrement * increment, Transaction * out_transaction, MergeTreeDeduplicationLog * deduplication_log) + MutableDataPartPtr & part, MergeTreeTransaction * txn, SimpleIncrement * increment, + Transaction * out_transaction, MergeTreeDeduplicationLog * deduplication_log) { if (out_transaction && &out_transaction->data != this) throw Exception("MergeTreeData::Transaction for one table cannot be used with another. It is a bug.", @@ -2678,18 +2803,25 @@ MergeTreeData::DataPartsVector MergeTreeData::renameTempPartAndReplace( DataPartsVector covered_parts; { auto lock = lockParts(); - renameTempPartAndReplace(part, increment, out_transaction, lock, &covered_parts, deduplication_log); + renameTempPartAndReplace(part, txn, increment, out_transaction, lock, &covered_parts, deduplication_log); } return covered_parts; } -void MergeTreeData::removePartsFromWorkingSet(const MergeTreeData::DataPartsVector & remove, bool clear_without_timeout, DataPartsLock & acquired_lock) +void MergeTreeData::removePartsFromWorkingSet(MergeTreeTransaction * txn, const MergeTreeData::DataPartsVector & remove, bool clear_without_timeout, DataPartsLock & acquired_lock) + { + if (txn) + transactions_enabled.store(true); + auto remove_time = clear_without_timeout ? 
0 : time(nullptr); bool removed_active_part = false; for (const DataPartPtr & part : remove) { + if (part->version.creation_csn != Tx::RolledBackCSN) + MergeTreeTransaction::removeOldPart(shared_from_this(), part, txn); + if (part->getState() == IMergeTreeDataPart::State::Active) { removePartContributionToColumnAndSecondaryIndexSizes(part); @@ -2714,7 +2846,6 @@ void MergeTreeData::removePartsFromWorkingSet(const MergeTreeData::DataPartsVect void MergeTreeData::removePartsFromWorkingSetImmediatelyAndSetTemporaryState(const DataPartsVector & remove) { auto lock = lockParts(); - bool removed_active_part = false; for (const auto & part : remove) { @@ -2722,19 +2853,16 @@ void MergeTreeData::removePartsFromWorkingSetImmediatelyAndSetTemporaryState(con if (it_part == data_parts_by_info.end()) throw Exception("Part " + part->getNameWithState() + " not found in data_parts", ErrorCodes::LOGICAL_ERROR); - if (part->getState() == IMergeTreeDataPart::State::Active) - removed_active_part = true; + assert(part->getState() == IMergeTreeDataPart::State::PreActive); modifyPartState(part, IMergeTreeDataPart::State::Temporary); /// Erase immediately data_parts_indexes.erase(it_part); } - - if (removed_active_part) - resetObjectColumnsFromActiveParts(lock); } -void MergeTreeData::removePartsFromWorkingSet(const DataPartsVector & remove, bool clear_without_timeout, DataPartsLock * acquired_lock) +void MergeTreeData::removePartsFromWorkingSet( + MergeTreeTransaction * txn, const DataPartsVector & remove, bool clear_without_timeout, DataPartsLock * acquired_lock) { auto lock = (acquired_lock) ? DataPartsLock() : lockParts(); @@ -2746,11 +2874,12 @@ void MergeTreeData::removePartsFromWorkingSet(const DataPartsVector & remove, bo part->assertState({DataPartState::PreActive, DataPartState::Active, DataPartState::Outdated}); } - removePartsFromWorkingSet(remove, clear_without_timeout, lock); + removePartsFromWorkingSet(txn, remove, clear_without_timeout, lock); } -MergeTreeData::DataPartsVector MergeTreeData::removePartsInRangeFromWorkingSet(const MergeTreePartInfo & drop_range, bool clear_without_timeout, - DataPartsLock & lock) +MergeTreeData::DataPartsVector MergeTreeData::removePartsInRangeFromWorkingSet( + MergeTreeTransaction * txn, const MergeTreePartInfo & drop_range, + bool clear_without_timeout, DataPartsLock & lock) { DataPartsVector parts_to_remove; @@ -2815,15 +2944,34 @@ MergeTreeData::DataPartsVector MergeTreeData::removePartsInRangeFromWorkingSet(c part->name, drop_range.getPartName()); } - if (part->getState() != DataPartState::Deleting) - parts_to_remove.emplace_back(part); + if (part->getState() == DataPartState::Deleting) + continue; + + /// FIXME refactor removePartsFromWorkingSet(...), do not remove parts twice + if (txn) + { + if (!part->version.isVisible(*txn)) + continue; + } + + parts_to_remove.emplace_back(part); } - removePartsFromWorkingSet(parts_to_remove, clear_without_timeout, lock); + removePartsFromWorkingSet(txn, parts_to_remove, clear_without_timeout, lock); return parts_to_remove; } +void MergeTreeData::restoreAndActivatePart(const DataPartPtr & part, DataPartsLock * acquired_lock) +{ + auto lock = (acquired_lock) ? 
DataPartsLock() : lockParts(); //-V1018 + if (part->getState() == DataPartState::Active) + return; + addPartContributionToColumnAndSecondaryIndexSizes(part); + addPartContributionToDataVolume(part); + modifyPartState(part, DataPartState::Active); +} + void MergeTreeData::forgetPartAndMoveToDetached(const MergeTreeData::DataPartPtr & part_to_detach, const String & prefix, bool restore_covered) { if (prefix.empty()) @@ -3238,9 +3386,23 @@ MergeTreeData::DataPartPtr MergeTreeData::getActiveContainingPart(const String & return getActiveContainingPart(part_info); } -MergeTreeData::DataPartsVector MergeTreeData::getDataPartsVectorInPartition(MergeTreeData::DataPartState state, const String & partition_id) const +MergeTreeData::DataPartsVector MergeTreeData::getVisibleDataPartsVectorInPartition(ContextPtr local_context, const String & partition_id) const { - DataPartStateAndPartitionID state_with_partition{state, partition_id}; + if (const auto * txn = local_context->getCurrentTransaction().get()) + { + DataPartStateAndPartitionID active_parts{MergeTreeDataPartState::Active, partition_id}; + DataPartStateAndPartitionID outdated_parts{MergeTreeDataPartState::Outdated, partition_id}; + DataPartsVector res; + { + auto lock = lockParts(); + res.insert(res.end(), data_parts_by_state_and_info.lower_bound(active_parts), data_parts_by_state_and_info.upper_bound(active_parts)); + res.insert(res.end(), data_parts_by_state_and_info.lower_bound(outdated_parts), data_parts_by_state_and_info.upper_bound(outdated_parts)); + } + filterVisibleDataParts(res, txn->getSnapshot(), txn->tid); + return res; + } + + DataPartStateAndPartitionID state_with_partition{MergeTreeDataPartState::Active, partition_id}; auto lock = lockParts(); return DataPartsVector( @@ -3248,19 +3410,37 @@ MergeTreeData::DataPartsVector MergeTreeData::getDataPartsVectorInPartition(Merg data_parts_by_state_and_info.upper_bound(state_with_partition)); } -MergeTreeData::DataPartsVector MergeTreeData::getDataPartsVectorInPartitions(MergeTreeData::DataPartState state, const std::unordered_set & partition_ids) const +MergeTreeData::DataPartsVector MergeTreeData::getVisibleDataPartsVectorInPartitions(ContextPtr local_context, const std::unordered_set & partition_ids) const { - auto lock = lockParts(); + auto txn = local_context->getCurrentTransaction(); DataPartsVector res; - for (const auto & partition_id : partition_ids) { - DataPartStateAndPartitionID state_with_partition{state, partition_id}; - insertAtEnd( - res, - DataPartsVector( - data_parts_by_state_and_info.lower_bound(state_with_partition), - data_parts_by_state_and_info.upper_bound(state_with_partition))); + auto lock = lockParts(); + for (const auto & partition_id : partition_ids) + { + DataPartStateAndPartitionID active_parts{MergeTreeDataPartState::Active, partition_id}; + insertAtEnd( + res, + DataPartsVector( + data_parts_by_state_and_info.lower_bound(active_parts), + data_parts_by_state_and_info.upper_bound(active_parts))); + + if (txn) + { + DataPartStateAndPartitionID outdated_parts{MergeTreeDataPartState::Outdated, partition_id}; + + insertAtEnd( + res, + DataPartsVector( + data_parts_by_state_and_info.lower_bound(outdated_parts), + data_parts_by_state_and_info.upper_bound(outdated_parts))); + } + } } + + if (txn) + filterVisibleDataParts(res, txn->getSnapshot(), txn->tid); + return res; } @@ -3292,6 +3472,8 @@ static void loadPartAndFixMetadataImpl(MergeTreeData::MutableDataPartPtr part) part->loadColumnsChecksumsIndexes(false, true); part->modification_time =
disk->getLastModified(full_part_path).epochTime(); + disk->removeFileIfExists(fs::path(full_part_path) / IMergeTreeDataPart::DELETE_ON_DESTROY_MARKER_FILE_NAME); + disk->removeFileIfExists(fs::path(full_part_path) / IMergeTreeDataPart::TXN_VERSION_METADATA_FILE_NAME); } void MergeTreeData::calculateColumnAndSecondaryIndexSizesImpl() @@ -3398,16 +3580,16 @@ void MergeTreeData::checkAlterPartitionIsPossible( } } -void MergeTreeData::checkPartitionCanBeDropped(const ASTPtr & partition) +void MergeTreeData::checkPartitionCanBeDropped(const ASTPtr & partition, ContextPtr local_context) { DataPartsVector parts_to_remove; const auto * partition_ast = partition->as(); if (partition_ast && partition_ast->all) - parts_to_remove = getDataPartsVector(); + parts_to_remove = getVisibleDataPartsVector(local_context); else { - const String partition_id = getPartitionIDFromQuery(partition, getContext()); - parts_to_remove = getDataPartsVectorInPartition(MergeTreeDataPartState::Active, partition_id); + const String partition_id = getPartitionIDFromQuery(partition, local_context); + parts_to_remove = getVisibleDataPartsVectorInPartition(local_context, partition_id); } UInt64 partition_size = 0; @@ -3446,7 +3628,7 @@ void MergeTreeData::movePartitionToDisk(const ASTPtr & partition, const String & throw Exception("Part " + partition_id + " is not exists or not active", ErrorCodes::NO_SUCH_DATA_PART); } else - parts = getDataPartsVectorInPartition(MergeTreeDataPartState::Active, partition_id); + parts = getVisibleDataPartsVectorInPartition(local_context, partition_id); auto disk = getStoragePolicy()->getDiskByName(name); parts.erase(std::remove_if(parts.begin(), parts.end(), [&](auto part_ptr) @@ -3488,7 +3670,7 @@ void MergeTreeData::movePartitionToVolume(const ASTPtr & partition, const String throw Exception("Part " + partition_id + " is not exists or not active", ErrorCodes::NO_SUCH_DATA_PART); } else - parts = getDataPartsVectorInPartition(MergeTreeDataPartState::Active, partition_id); + parts = getVisibleDataPartsVectorInPartition(local_context, partition_id); auto volume = getStoragePolicy()->getVolumeByName(name); if (!volume) @@ -3560,7 +3742,7 @@ Pipe MergeTreeData::alterPartition( } else { - checkPartitionCanBeDropped(command.partition); + checkPartitionCanBeDropped(command.partition, query_context); dropPartition(command.partition, command.detach, query_context); } } @@ -3609,7 +3791,7 @@ Pipe MergeTreeData::alterPartition( case PartitionCommand::REPLACE_PARTITION: { if (command.replace) - checkPartitionCanBeDropped(command.partition); + checkPartitionCanBeDropped(command.partition, query_context); String from_database = query_context->resolveDatabase(command.from_database); auto from_storage = DatabaseCatalog::instance().getTable({from_database, command.from_table}, query_context); replacePartitionFrom(from_storage, command.partition, command.replace, query_context); @@ -3668,9 +3850,9 @@ BackupEntries MergeTreeData::backupData(ContextPtr local_context, const ASTs & p { DataPartsVector data_parts; if (partitions.empty()) - data_parts = getDataPartsVector(); + data_parts = getVisibleDataPartsVector(local_context); else - data_parts = getDataPartsVectorInPartitions(MergeTreeDataPartState::Active, getPartitionIDsFromQuery(partitions, local_context)); + data_parts = getVisibleDataPartsVectorInPartitions(local_context, getPartitionIDsFromQuery(partitions, local_context)); return backupDataParts(data_parts); } @@ -3807,8 +3989,11 @@ private: auto single_disk_volume = std::make_shared(disk->getName(), 
disk, 0); auto part = storage->createPart(part_name, part_info, single_disk_volume, relative_temp_part_dir); + /// TODO Transactions: Decide what to do with version metadata (if any). Let's just remove it for now. + disk->removeFileIfExists(fs::path(temp_part_dir) / IMergeTreeDataPart::TXN_VERSION_METADATA_FILE_NAME); + part->version.setCreationTID(Tx::PrehistoricTID, nullptr); part->loadColumnsChecksumsIndexes(false, true); - storage->renameTempPartAndAdd(part, increment); + storage->renameTempPartAndAdd(part, NO_TRANSACTION_RAW, increment); return {}; } @@ -3928,6 +4113,86 @@ String MergeTreeData::getPartitionIDFromQuery(const ASTPtr & ast, ContextPtr loc return partition_id; } + +DataPartsVector MergeTreeData::getVisibleDataPartsVector(ContextPtr local_context) const +{ + DataPartsVector res; + if (const auto * txn = local_context->getCurrentTransaction().get()) + { + res = getDataPartsVectorForInternalUsage({DataPartState::Active, DataPartState::Outdated}); + filterVisibleDataParts(res, txn->getSnapshot(), txn->tid); + } + else + { + res = getDataPartsVectorForInternalUsage(); + } + return res; +} + +DataPartsVector MergeTreeData::getVisibleDataPartsVectorUnlocked(ContextPtr local_context, const DataPartsLock & lock) const +{ + DataPartsVector res; + if (const auto * txn = local_context->getCurrentTransaction().get()) + { + res = getDataPartsVectorForInternalUsage({DataPartState::Active, DataPartState::Outdated}, lock); + filterVisibleDataParts(res, txn->getSnapshot(), txn->tid); + } + else + { + res = getDataPartsVectorForInternalUsage({DataPartState::Active}, lock); + } + return res; +} + +MergeTreeData::DataPartsVector MergeTreeData::getVisibleDataPartsVector(const MergeTreeTransactionPtr & txn) const +{ + DataPartsVector res; + if (txn) + { + res = getDataPartsVectorForInternalUsage({DataPartState::Active, DataPartState::Outdated}); + filterVisibleDataParts(res, txn->getSnapshot(), txn->tid); + } + else + { + res = getDataPartsVectorForInternalUsage(); + } + return res; +} + +MergeTreeData::DataPartsVector MergeTreeData::getVisibleDataPartsVector(CSN snapshot_version, TransactionID current_tid) const +{ + auto res = getDataPartsVectorForInternalUsage({DataPartState::Active, DataPartState::Outdated}); + filterVisibleDataParts(res, snapshot_version, current_tid); + return res; +} + +void MergeTreeData::filterVisibleDataParts(DataPartsVector & maybe_visible_parts, CSN snapshot_version, TransactionID current_tid) const +{ + [[maybe_unused]] size_t total_size = maybe_visible_parts.size(); + + auto need_remove_pred = [snapshot_version, &current_tid] (const DataPartPtr & part) -> bool + { + return !part->version.isVisible(snapshot_version, current_tid); + }; + + auto new_end_it = std::remove_if(maybe_visible_parts.begin(), maybe_visible_parts.end(), need_remove_pred); + maybe_visible_parts.erase(new_end_it, maybe_visible_parts.end()); + [[maybe_unused]] size_t visible_size = maybe_visible_parts.size(); + + + auto get_part_names = [&maybe_visible_parts]() -> Strings + { + Strings visible_part_names; + for (const auto & p : maybe_visible_parts) + visible_part_names.push_back(p->name); + return visible_part_names; + }; + + LOG_TEST(log, "Got {} parts (of {}) visible in snapshot {} (TID {}): {}", + visible_size, total_size, snapshot_version, current_tid, fmt::join(get_part_names(), ", ")); +} + + std::unordered_set MergeTreeData::getPartitionIDsFromQuery(const ASTs & asts, ContextPtr local_context) const { std::unordered_set partition_ids; @@ -3958,7 +4223,7 @@ std::set
MergeTreeData::getPartitionIdsAffectedByCommands( } -MergeTreeData::DataPartsVector MergeTreeData::getDataPartsVectorUnlocked( +MergeTreeData::DataPartsVector MergeTreeData::getDataPartsVectorForInternalUsage( const DataPartStates & affordable_states, const DataPartsLock & /*lock*/, DataPartStateVector * out_states, @@ -4005,13 +4270,13 @@ MergeTreeData::DataPartsVector MergeTreeData::getDataPartsVectorUnlocked( return res; } -MergeTreeData::DataPartsVector MergeTreeData::getDataPartsVector( +MergeTreeData::DataPartsVector MergeTreeData::getDataPartsVectorForInternalUsage( const DataPartStates & affordable_states, DataPartStateVector * out_states, bool require_projection_parts) const { auto lock = lockParts(); - return getDataPartsVectorUnlocked(affordable_states, lock, out_states, require_projection_parts); + return getDataPartsVectorForInternalUsage(affordable_states, lock, out_states, require_projection_parts); } MergeTreeData::DataPartsVector @@ -4371,14 +4636,14 @@ MergeTreeData::DataParts MergeTreeData::getDataParts(const DataPartStates & affo return res; } -MergeTreeData::DataParts MergeTreeData::getDataParts() const +MergeTreeData::DataParts MergeTreeData::getDataPartsForInternalUsage() const { return getDataParts({DataPartState::Active}); } -MergeTreeData::DataPartsVector MergeTreeData::getDataPartsVector() const +MergeTreeData::DataPartsVector MergeTreeData::getDataPartsVectorForInternalUsage() const { - return getDataPartsVector({DataPartState::Active}); + return getDataPartsVectorForInternalUsage({DataPartState::Active}); } MergeTreeData::DataPartPtr MergeTreeData::getAnyPartInPartition( @@ -4422,7 +4687,19 @@ void MergeTreeData::Transaction::rollback() buf << "."; LOG_DEBUG(data.log, "Undoing transaction.{}", buf.str()); - data.removePartsFromWorkingSet( + if (!txn) + { + auto lock = data.lockParts(); + for (const auto & part : precommitted_parts) + { + DataPartPtr covering_part; + DataPartsVector covered_parts = data.getActivePartsToReplace(part->info, part->name, covering_part, lock); + for (auto & covered : covered_parts) + covered->version.unlockRemovalTID(Tx::PrehistoricTID, TransactionInfoContext{data.getStorageID(), covered->name}); + } + } + + data.removePartsFromWorkingSet(txn, DataPartsVector(precommitted_parts.begin(), precommitted_parts.end()), /* clear_without_timeout = */ true); } @@ -5413,7 +5690,8 @@ MergeTreeData::MutableDataPartPtr MergeTreeData::cloneAndLoadDataPartOnSameDisk( const MergeTreeData::DataPartPtr & src_part, const String & tmp_part_prefix, const MergeTreePartInfo & dst_part_info, - const StorageMetadataPtr & metadata_snapshot) + const StorageMetadataPtr & metadata_snapshot, + const MergeTreeTransactionPtr & txn) { /// Check that the storage policy contains the disk where the src_part is located. 
bool does_storage_policy_allow_same_disk = false; @@ -5431,6 +5709,7 @@ MergeTreeData::MutableDataPartPtr MergeTreeData::cloneAndLoadDataPartOnSameDisk( ErrorCodes::BAD_ARGUMENTS); String dst_part_name = src_part->getNewName(dst_part_info); + assert(!tmp_part_prefix.empty()); String tmp_dst_part_name = tmp_part_prefix + dst_part_name; auto reservation = reserveSpace(src_part->getBytesOnDisk(), src_part->volume->getDisk()); @@ -5451,12 +5730,18 @@ MergeTreeData::MutableDataPartPtr MergeTreeData::cloneAndLoadDataPartOnSameDisk( } LOG_DEBUG(log, "Cloning part {} to {}", fullPath(disk, src_part_path), fullPath(disk, dst_part_path)); - localBackup(disk, src_part_path, dst_part_path); + localBackup(disk, src_part_path, dst_part_path, /* make_source_readonly */ false); disk->removeFileIfExists(fs::path(dst_part_path) / IMergeTreeDataPart::DELETE_ON_DESTROY_MARKER_FILE_NAME); + disk->removeFileIfExists(fs::path(dst_part_path) / IMergeTreeDataPart::TXN_VERSION_METADATA_FILE_NAME); auto single_disk_volume = std::make_shared(disk->getName(), disk, 0); auto dst_data_part = createPart(dst_part_name, dst_part_info, single_disk_volume, tmp_dst_part_name); + /// We should write version metadata on part creation to distinguish it from parts that were created without transaction. + TransactionID tid = txn ? txn->tid : Tx::PrehistoricTID; + dst_data_part->version.setCreationTID(tid, nullptr); + dst_data_part->storeVersionMetadata(); + dst_data_part->is_temp = true; dst_data_part->loadColumnsChecksumsIndexes(require_part_metadata, true); @@ -5505,7 +5790,7 @@ void MergeTreeData::reportBrokenPart(MergeTreeData::DataPartPtr & data_part) con if (data_part->volume && data_part->volume->getDisk()->isBroken()) { auto disk = data_part->volume->getDisk(); - auto parts = getDataParts(); + auto parts = getDataPartsForInternalUsage(); LOG_WARNING(log, "Scanning parts to recover on broken disk {}.", disk->getName() + "@" + disk->getPath()); for (const auto & part : parts) { @@ -5580,7 +5865,7 @@ PartitionCommandsResultInfo MergeTreeData::freezePartitionsByMatcher( const String shadow_path = "shadow/"; /// Acquire a snapshot of active data parts to prevent removing while doing backup. - const auto data_parts = getDataParts(); + const auto data_parts = getVisibleDataPartsVector(local_context); String backup_name = (!with_name.empty() ? 
escapeForFileName(with_name) : toString(increment)); String backup_path = fs::path(shadow_path) / backup_name / ""; @@ -6323,12 +6608,12 @@ void MergeTreeData::updateObjectColumns(const DataPartPtr & part, const DataPart DB::updateObjectColumns(object_columns, part->getColumns()); } -StorageSnapshotPtr MergeTreeData::getStorageSnapshot(const StorageMetadataPtr & metadata_snapshot) const +StorageSnapshotPtr MergeTreeData::getStorageSnapshot(const StorageMetadataPtr & metadata_snapshot, ContextPtr query_context) const { auto snapshot_data = std::make_unique(); auto lock = lockParts(); - snapshot_data->parts = getDataPartsVectorUnlocked({DataPartState::Active}, lock); + snapshot_data->parts = getVisibleDataPartsVectorUnlocked(query_context, lock); return std::make_shared(*this, metadata_snapshot, object_columns, std::move(snapshot_data)); } diff --git a/src/Storages/MergeTree/MergeTreeData.h b/src/Storages/MergeTree/MergeTreeData.h index 44736fe2cc5..5ce5f30f0dc 100644 --- a/src/Storages/MergeTree/MergeTreeData.h +++ b/src/Storages/MergeTree/MergeTreeData.h @@ -49,6 +49,7 @@ class MutationCommands; class Context; using PartitionIdToMaxBlock = std::unordered_map; struct JobAndPool; +class MergeTreeTransaction; struct ZeroCopyLock; /// Auxiliary struct holding information about the future merged or mutated part. @@ -247,7 +248,7 @@ public: class Transaction : private boost::noncopyable { public: - explicit Transaction(MergeTreeData & data_) : data(data_) {} + Transaction(MergeTreeData & data_, MergeTreeTransaction * txn_) : data(data_), txn(txn_) {} DataPartsVector commit(MergeTreeData::DataPartsLock * acquired_parts_lock = nullptr); @@ -276,6 +277,7 @@ public: friend class MergeTreeData; MergeTreeData & data; + MergeTreeTransaction * txn; DataParts precommitted_parts; void clear() { precommitted_parts.clear(); } @@ -436,7 +438,7 @@ public: DataPartsVector parts; }; - StorageSnapshotPtr getStorageSnapshot(const StorageMetadataPtr & metadata_snapshot) const override; + StorageSnapshotPtr getStorageSnapshot(const StorageMetadataPtr & metadata_snapshot, ContextPtr query_context) const override; /// Load the set of data parts from disk. Call once - immediately after the object is created. void loadDataParts(bool skip_sanity_checks); @@ -445,10 +447,11 @@ public: Int64 getMaxBlockNumber() const; + /// Returns a copy of the list so that the caller shouldn't worry about locks. 
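(Aside: the renames in the next hunks draw a deliberate line between two families of part getters. As a rough sketch of the intended contract, with illustrative variable names only:

    // User-facing reads (SELECT, ALTER ... PARTITION, backups, freeze): respect the transaction snapshot.
    auto parts_for_query = table.getVisibleDataPartsVector(query_context);

    // Background housekeeping (merges, moves, metrics, cleanup): raw part states, no snapshot filtering.
    auto parts_for_background = table.getDataPartsVectorForInternalUsage();

Callers that already hold a DataPartsLock use the matching *Unlocked variants.)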
DataParts getDataParts(const DataPartStates & affordable_states) const; - DataPartsVector getDataPartsVectorUnlocked( + DataPartsVector getDataPartsVectorForInternalUsage( const DataPartStates & affordable_states, const DataPartsLock & lock, DataPartStateVector * out_states = nullptr, @@ -456,31 +459,28 @@ public: /// Returns sorted list of the parts with specified states /// out_states will contain snapshot of each part state - DataPartsVector getDataPartsVector( - const DataPartStates & affordable_states, - DataPartStateVector * out_states = nullptr, - bool require_projection_parts = false) const; + DataPartsVector getDataPartsVectorForInternalUsage( + const DataPartStates & affordable_states, DataPartStateVector * out_states = nullptr, bool require_projection_parts = false) const; + /// Returns absolutely all parts (and snapshot of their states) DataPartsVector getAllDataPartsVector( DataPartStateVector * out_states = nullptr, bool require_projection_parts = false) const; - /// Returns all detached parts - DetachedPartsInfo getDetachedParts() const; + /// Returns parts in Active state + DataParts getDataPartsForInternalUsage() const; + DataPartsVector getDataPartsVectorForInternalUsage() const; - static void validateDetachedPartName(const String & name); + void filterVisibleDataParts(DataPartsVector & maybe_visible_parts, CSN snapshot_version, TransactionID current_tid) const; - void dropDetached(const ASTPtr & partition, bool part, ContextPtr local_context); + /// Returns parts that are visible in the current snapshot + DataPartsVector getVisibleDataPartsVector(ContextPtr local_context) const; + DataPartsVector getVisibleDataPartsVectorUnlocked(ContextPtr local_context, const DataPartsLock & lock) const; + DataPartsVector getVisibleDataPartsVector(const MergeTreeTransactionPtr & txn) const; + DataPartsVector getVisibleDataPartsVector(CSN snapshot_version, TransactionID current_tid) const; - MutableDataPartsVector tryLoadPartsToAttach(const ASTPtr & partition, bool attach_part, - ContextPtr context, PartsTemporaryRename & renamed_parts); - - /// Returns Active parts - DataParts getDataParts() const; - DataPartsVector getDataPartsVector() const; - - /// Returns a committed part with the given name or a part containing it. If there is no such part, returns nullptr. + /// Returns a part in Active state with the given name or a part containing it. If there is no such part, returns nullptr. DataPartPtr getActiveContainingPart(const String & part_name) const; DataPartPtr getActiveContainingPart(const MergeTreePartInfo & part_info) const; DataPartPtr getActiveContainingPart(const MergeTreePartInfo & part_info, DataPartState state, DataPartsLock & lock) const; @@ -490,8 +490,8 @@ public: void swapActivePart(MergeTreeData::DataPartPtr part_copy); /// Returns all parts in specified partition - DataPartsVector getDataPartsVectorInPartition(DataPartState state, const String & partition_id) const; - DataPartsVector getDataPartsVectorInPartitions(DataPartState state, const std::unordered_set & partition_ids) const; + DataPartsVector getVisibleDataPartsVectorInPartition(ContextPtr local_context, const String & partition_id) const; + DataPartsVector getVisibleDataPartsVectorInPartitions(ContextPtr local_context, const std::unordered_set & partition_ids) const; /// Returns the part with the given name and state or nullptr if no such part.
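These declarations all funnel into filterVisibleDataParts, which delegates the per-part decision to part->version.isVisible(snapshot_version, current_tid). The MVCC rule behind it can be sketched roughly as follows; this is a simplified model with assumed field semantics, not the exact implementation:

    #include <cstdint>

    using CSN = uint64_t;  /// commit sequence number; 0 is assumed to mean "not committed yet"

    struct TransactionID
    {
        uint64_t start_csn = 0;
        uint64_t local_id = 0;
        bool operator==(const TransactionID &) const = default;
    };

    /// A part is visible in a snapshot if it was created by a transaction that committed
    /// at or before the snapshot (or by the current transaction itself), and was not
    /// removed under the same conditions.
    bool isVisibleSketch(CSN creation_csn, const TransactionID & creation_tid,
                         CSN removal_csn, const TransactionID & removal_tid,
                         CSN snapshot_version, const TransactionID & current_tid)
    {
        bool created = (creation_csn != 0 && creation_csn <= snapshot_version) || creation_tid == current_tid;
        bool removed = (removal_csn != 0 && removal_csn <= snapshot_version) || removal_tid == current_tid;
        return created && !removed;
    }

This is also why the visible getters scan Outdated parts in addition to Active ones: a part that is already Outdated in the working set may still be visible to an older snapshot.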
DataPartPtr getPartIfExists(const String & part_name, const DataPartStates & valid_states); @@ -511,6 +511,18 @@ public: /// Makes sense only for ordinary MergeTree engines because for them block numbering doesn't depend on partition. std::optional getMinPartDataVersion() const; + + /// Returns all detached parts + DetachedPartsInfo getDetachedParts() const; + + static void validateDetachedPartName(const String & name); + + void dropDetached(const ASTPtr & partition, bool part, ContextPtr context); + + MutableDataPartsVector tryLoadPartsToAttach(const ASTPtr & partition, bool attach_part, + ContextPtr context, PartsTemporaryRename & renamed_parts); + + /// If the table contains too many active parts, sleep for a while to give them time to merge. /// If until is non-null, wake up from the sleep earlier if the event happened. void delayInsertOrThrowIfNeeded(Poco::Event * until = nullptr) const; @@ -524,6 +536,7 @@ public: /// Returns true if part was added. Returns false if part is covered by bigger part. bool renameTempPartAndAdd( MutableDataPartPtr & part, + MergeTreeTransaction * txn, SimpleIncrement * increment = nullptr, Transaction * out_transaction = nullptr, MergeTreeDeduplicationLog * deduplication_log = nullptr, @@ -533,11 +546,14 @@ public: /// Returns all parts covered by the added part (in ascending order). /// If out_transaction == nullptr, marks covered parts as Outdated. DataPartsVector renameTempPartAndReplace( - MutableDataPartPtr & part, SimpleIncrement * increment = nullptr, Transaction * out_transaction = nullptr, MergeTreeDeduplicationLog * deduplication_log = nullptr); + MutableDataPartPtr & part, MergeTreeTransaction * txn, SimpleIncrement * increment = nullptr, + Transaction * out_transaction = nullptr, MergeTreeDeduplicationLog * deduplication_log = nullptr); /// Low-level version of previous one, doesn't lock mutex + /// FIXME Transactions: remove add_to_txn flag, maybe merge MergeTreeTransaction and Transaction bool renameTempPartAndReplace( MutableDataPartPtr & part, + MergeTreeTransaction * txn, SimpleIncrement * increment, Transaction * out_transaction, DataPartsLock & lock, @@ -554,15 +570,18 @@ public: /// Parts in add must already be in data_parts with PreActive, Active, or Outdated states. /// If clear_without_timeout is true, the parts will be deleted at once, or during the next call to /// clearOldParts (ignoring old_parts_lifetime). - void removePartsFromWorkingSet(const DataPartsVector & remove, bool clear_without_timeout, DataPartsLock * acquired_lock = nullptr); - void removePartsFromWorkingSet(const DataPartsVector & remove, bool clear_without_timeout, DataPartsLock & acquired_lock); + void removePartsFromWorkingSet(MergeTreeTransaction * txn, const DataPartsVector & remove, bool clear_without_timeout, DataPartsLock * acquired_lock = nullptr); + void removePartsFromWorkingSet(MergeTreeTransaction * txn, const DataPartsVector & remove, bool clear_without_timeout, DataPartsLock & acquired_lock); /// Removes all parts from the working set parts /// for which (partition_id = drop_range.partition_id && min_block >= drop_range.min_block && max_block <= drop_range.max_block). 
/// Used in REPLACE PARTITION command; - DataPartsVector removePartsInRangeFromWorkingSet(const MergeTreePartInfo & drop_range, bool clear_without_timeout, + DataPartsVector removePartsInRangeFromWorkingSet(MergeTreeTransaction * txn, const MergeTreePartInfo & drop_range, bool clear_without_timeout, DataPartsLock & lock); + /// Restores an Outdated part and adds it to the working set + void restoreAndActivatePart(const DataPartPtr & part, DataPartsLock * acquired_lock = nullptr); + /// Renames the part to detached/_ and removes it from data_parts, //// so it will not be deleted in clearOldParts. /// If restore_covered is true, adds to the working set inactive parts, which were merged into the deleted part. @@ -697,7 +716,10 @@ public: /// Moves partition to specified Volume void movePartitionToVolume(const ASTPtr & partition, const String & name, bool moving_part, ContextPtr context); - void checkPartitionCanBeDropped(const ASTPtr & partition) override; + /// Checks that the partition can be dropped right now; + /// otherwise throws an exception with detailed information. + /// We do not take a mutex because it is not critical if the size changes during the operation. + void checkPartitionCanBeDropped(const ASTPtr & partition, ContextPtr local_context); void checkPartCanBeDropped(const String & part_name); @@ -743,7 +765,8 @@ public: MergeTreeData & checkStructureAndGetMergeTreeData(IStorage & source_table, const StorageMetadataPtr & src_snapshot, const StorageMetadataPtr & my_snapshot) const; MergeTreeData::MutableDataPartPtr cloneAndLoadDataPartOnSameDisk( - const MergeTreeData::DataPartPtr & src_part, const String & tmp_part_prefix, const MergeTreePartInfo & dst_part_info, const StorageMetadataPtr & metadata_snapshot); + const MergeTreeData::DataPartPtr & src_part, const String & tmp_part_prefix, const MergeTreePartInfo & dst_part_info, + const StorageMetadataPtr & metadata_snapshot, const MergeTreeTransactionPtr & txn); virtual std::vector getMutationsStatus() const = 0; @@ -978,6 +1001,9 @@ protected: mutable std::shared_mutex pinned_part_uuids_mutex; PinnedPartUUIDsPtr pinned_part_uuids; + /// True if at least one part was created/removed with a transaction. + mutable std::atomic_bool transactions_enabled = false; + /// Work with data parts struct TagByInfo{}; diff --git a/src/Storages/MergeTree/MergeTreeDataMergerMutator.cpp b/src/Storages/MergeTree/MergeTreeDataMergerMutator.cpp index 22a868f218e..31d52cfa8ff 100644 --- a/src/Storages/MergeTree/MergeTreeDataMergerMutator.cpp +++ b/src/Storages/MergeTree/MergeTreeDataMergerMutator.cpp @@ -14,6 +14,7 @@ #include #include #include +#include #include #include @@ -29,6 +30,7 @@ #include #include #include +#include #include #include #include @@ -52,6 +54,7 @@ namespace DB namespace ErrorCodes { extern const int LOGICAL_ERROR; + extern const int ABORTED; } /// Do not start to merge parts, if free space is less than sum size of parts times specified coefficient. @@ -124,9 +127,70 @@ SelectPartsDecision MergeTreeDataMergerMutator::selectPartsToMerge( size_t max_total_size_to_merge, const AllowedMergingPredicate & can_merge_callback, bool merge_with_ttl_allowed, + const MergeTreeTransactionPtr & txn, String * out_disable_reason) { - MergeTreeData::DataPartsVector data_parts = data.getDataPartsVector(); + MergeTreeData::DataPartsVector data_parts; + if (txn) + { + /// The merge predicate (for simple MergeTree) allows merging two parts only if both parts are visible to the merge transaction.
+ /// So at first glance we could just take all active parts. + /// Active parts include uncommitted parts, but that's fine: the merge predicate handles it. + /// However, it's possible that some transaction is removing a part in the middle, for example, all_2_2_0. + /// If parts all_1_1_0 and all_3_3_0 are active and visible to the merge transaction, we would try to merge them. + /// But that would be wrong, because all_2_2_0 may become active again if the transaction rolls back. + /// That's why we must also include into `data_parts` some outdated parts, more precisely, parts whose removal is not committed yet. + MergeTreeData::DataPartsVector active_parts; + MergeTreeData::DataPartsVector outdated_parts; + + { + auto lock = data.lockParts(); + active_parts = data.getDataPartsVectorForInternalUsage({MergeTreeData::DataPartState::Active}, lock); + outdated_parts = data.getDataPartsVectorForInternalUsage({MergeTreeData::DataPartState::Outdated}, lock); + } + + ActiveDataPartSet active_parts_set{data.format_version}; + for (const auto & part : active_parts) + active_parts_set.add(part->name); + + for (const auto & part : outdated_parts) + { + /// We don't need rolled back parts. + /// NOTE When rolling back a transaction we set creation_csn to RolledBackCSN first + /// and only then remove the part from the working set, so there's no race condition + if (part->version.creation_csn == Tx::RolledBackCSN) + continue; + + /// We don't need parts that are finally removed. + /// NOTE There's a minor race condition: we may get UnknownCSN if a transaction has just been committed concurrently. + /// But it's not a problem if we add such a part to `data_parts`. + if (part->version.removal_csn != Tx::UnknownCSN) + continue; + + active_parts_set.add(part->name); + } + + /// Restore "active" parts set from selected active and outdated parts + auto remove_pred = [&](const MergeTreeData::DataPartPtr & part) -> bool + { + return active_parts_set.getContainingPart(part->info) != part->name; + }; + + auto new_end_it = std::remove_if(active_parts.begin(), active_parts.end(), remove_pred); + active_parts.erase(new_end_it, active_parts.end()); + + new_end_it = std::remove_if(outdated_parts.begin(), outdated_parts.end(), remove_pred); + outdated_parts.erase(new_end_it, outdated_parts.end()); + + std::merge(active_parts.begin(), active_parts.end(), + outdated_parts.begin(), outdated_parts.end(), + std::back_inserter(data_parts), MergeTreeData::LessDataPart()); + } + else + { + /// Simply get all active parts + data_parts = data.getDataPartsVectorForInternalUsage(); + } const auto data_settings = data.getSettings(); auto metadata_snapshot = data.getInMemoryMetadataPtr(); @@ -172,7 +236,7 @@ SelectPartsDecision MergeTreeDataMergerMutator::selectPartsToMerge( * So we have to check if this part is currently being inserted with quorum and so on and so forth. * Obviously we have to check it manually only for the first part * of each partition because it will be automatically checked for a pair of parts.
*/ - if (!can_merge_callback(nullptr, part, nullptr)) + if (!can_merge_callback(nullptr, part, txn.get(), nullptr)) continue; /// This part can be merged only with next parts (no prev part exists), so start @@ -184,7 +248,7 @@ SelectPartsDecision MergeTreeDataMergerMutator::selectPartsToMerge( { /// If we cannot merge with previous part we had to start new parts /// interval (in the same partition) - if (!can_merge_callback(*prev_part, part, nullptr)) + if (!can_merge_callback(*prev_part, part, txn.get(), nullptr)) { /// Now we have no previous part prev_part = nullptr; @@ -196,7 +260,7 @@ SelectPartsDecision MergeTreeDataMergerMutator::selectPartsToMerge( /// for example, merge is already assigned for such parts, or they participate in quorum inserts /// and so on. /// Also we don't start new interval here (maybe all next parts cannot be merged and we don't want to have empty interval) - if (!can_merge_callback(nullptr, part, nullptr)) + if (!can_merge_callback(nullptr, part, txn.get(), nullptr)) continue; /// Starting new interval in the same partition @@ -307,6 +371,7 @@ SelectPartsDecision MergeTreeDataMergerMutator::selectAllPartsToMergeWithinParti const String & partition_id, bool final, const StorageMetadataPtr & metadata_snapshot, + const MergeTreeTransactionPtr & txn, String * out_disable_reason, bool optimize_skip_merged_partitions) { @@ -343,7 +408,7 @@ SelectPartsDecision MergeTreeDataMergerMutator::selectAllPartsToMergeWithinParti while (it != parts.end()) { /// For the case of one part, we check that it can be merged "with itself". - if ((it != parts.begin() || parts.size() == 1) && !can_merge(*prev_it, *it, out_disable_reason)) + if ((it != parts.begin() || parts.size() == 1) && !can_merge(*prev_it, *it, txn.get(), out_disable_reason)) { return SelectPartsDecision::CANNOT_SELECT; } @@ -390,7 +455,7 @@ MergeTreeData::DataPartsVector MergeTreeDataMergerMutator::selectAllPartsFromPar { MergeTreeData::DataPartsVector parts_from_partition; - MergeTreeData::DataParts data_parts = data.getDataParts(); + MergeTreeData::DataParts data_parts = data.getDataPartsForInternalUsage(); for (const auto & current_part : data_parts) { @@ -416,6 +481,7 @@ MergeTaskPtr MergeTreeDataMergerMutator::mergePartsToTemporaryPart( bool deduplicate, const Names & deduplicate_by_columns, const MergeTreeData::MergingParams & merging_params, + const MergeTreeTransactionPtr & txn, const IMergeTreeDataPart * parent_part, const String & suffix) { @@ -432,6 +498,7 @@ MergeTaskPtr MergeTreeDataMergerMutator::mergePartsToTemporaryPart( merging_params, parent_part, suffix, + txn, &data, this, &merges_blocker, @@ -446,6 +513,7 @@ MutateTaskPtr MergeTreeDataMergerMutator::mutatePartToTemporaryPart( MergeListEntry * merge_entry, time_t time_of_mutation, ContextPtr context, + const MergeTreeTransactionPtr & txn, ReservationSharedPtr space_reservation, TableLockHolder & holder) { @@ -458,6 +526,7 @@ MutateTaskPtr MergeTreeDataMergerMutator::mutatePartToTemporaryPart( context, space_reservation, holder, + txn, data, *this, merges_blocker @@ -508,10 +577,16 @@ MergeAlgorithm MergeTreeDataMergerMutator::chooseMergeAlgorithm( MergeTreeData::DataPartPtr MergeTreeDataMergerMutator::renameMergedTemporaryPart( MergeTreeData::MutableDataPartPtr & new_data_part, const MergeTreeData::DataPartsVector & parts, + const MergeTreeTransactionPtr & txn, MergeTreeData::Transaction * out_transaction) { + /// Some of the source parts may have been created in a transaction, so a non-transactional merge may break isolation.
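The predicate signature change in these hunks (and in MergeTreeDataMergerMutator.h below) threads the transaction into every can-merge decision. A hypothetical predicate, only to illustrate the shape of the widened AllowedMergingPredicate, reusing the isVisible overload this patch calls in filterVisibleDataParts:

    auto can_merge = [](const MergeTreeData::DataPartPtr & left,
                        const MergeTreeData::DataPartPtr & right,
                        const MergeTreeTransaction * txn,
                        String * out_reason) -> bool
    {
        /// left may be nullptr: the single-part form checks whether a part may participate at all.
        if (txn)
        {
            bool visible = (!left || left->version.isVisible(txn->getSnapshot(), txn->tid))
                && right->version.isVisible(txn->getSnapshot(), txn->tid);
            if (!visible)
            {
                if (out_reason)
                    *out_reason = "part is not visible in merge transaction";
                return false;
            }
        }
        return true;  /// real predicates add the usual non-transactional checks here
    };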
+ if (data.transactions_enabled.load(std::memory_order_relaxed) && !txn) + throw Exception(ErrorCodes::ABORTED, "Cancelling merge, because it was done without starting a transaction, " + "but transactions were enabled for this table"); + /// Rename new part, add to the set and remove original parts. - auto replaced_parts = data.renameTempPartAndReplace(new_data_part, nullptr, out_transaction); + auto replaced_parts = data.renameTempPartAndReplace(new_data_part, txn.get(), nullptr, out_transaction); /// Let's check that all original parts have been deleted and only them. if (replaced_parts.size() != parts.size()) diff --git a/src/Storages/MergeTree/MergeTreeDataMergerMutator.h b/src/Storages/MergeTree/MergeTreeDataMergerMutator.h index e64c13ca6c3..9a60e4c6078 100644 --- a/src/Storages/MergeTree/MergeTreeDataMergerMutator.h +++ b/src/Storages/MergeTree/MergeTreeDataMergerMutator.h @@ -40,7 +40,10 @@ enum class ExecuteTTLType class MergeTreeDataMergerMutator { public: - using AllowedMergingPredicate = std::function; + using AllowedMergingPredicate = std::function; MergeTreeDataMergerMutator(MergeTreeData & data_, size_t max_tasks_count_); @@ -72,6 +75,7 @@ public: size_t max_total_size_to_merge, const AllowedMergingPredicate & can_merge, bool merge_with_ttl_allowed, + const MergeTreeTransactionPtr & txn, String * out_disable_reason = nullptr); /** Select all the parts in the specified partition for merge, if possible. @@ -85,6 +89,7 @@ public: const String & partition_id, bool final, const StorageMetadataPtr & metadata_snapshot, + const MergeTreeTransactionPtr & txn, String * out_disable_reason = nullptr, bool optimize_skip_merged_partitions = false); @@ -107,6 +112,7 @@ public: bool deduplicate, const Names & deduplicate_by_columns, const MergeTreeData::MergingParams & merging_params, + const MergeTreeTransactionPtr & txn, const IMergeTreeDataPart * parent_part = nullptr, const String & suffix = ""); @@ -118,12 +124,14 @@ public: MergeListEntry * merge_entry, time_t time_of_mutation, ContextPtr context, + const MergeTreeTransactionPtr & txn, ReservationSharedPtr space_reservation, TableLockHolder & table_lock_holder); MergeTreeData::DataPartPtr renameMergedTemporaryPart( MergeTreeData::MutableDataPartPtr & new_data_part, const MergeTreeData::DataPartsVector & parts, + const MergeTreeTransactionPtr & txn, MergeTreeData::Transaction * out_transaction = nullptr); diff --git a/src/Storages/MergeTree/MergeTreeDataPartInMemory.cpp b/src/Storages/MergeTree/MergeTreeDataPartInMemory.cpp index e4a174a7d29..b63d46ee463 100644 --- a/src/Storages/MergeTree/MergeTreeDataPartInMemory.cpp +++ b/src/Storages/MergeTree/MergeTreeDataPartInMemory.cpp @@ -91,7 +91,7 @@ void MergeTreeDataPartInMemory::flushToDisk(const String & base_path, const Stri auto compression_codec = storage.getContext()->chooseCompressionCodec(0, 0); auto indices = MergeTreeIndexFactory::instance().getMany(metadata_snapshot->getSecondaryIndices()); - MergedBlockOutputStream out(new_data_part, metadata_snapshot, columns, indices, compression_codec); + MergedBlockOutputStream out(new_data_part, metadata_snapshot, columns, indices, compression_codec, NO_TRANSACTION_PTR); out.write(block); const auto & projections = metadata_snapshot->getProjections(); for (const auto & [projection_name, projection] : projection_parts) @@ -122,7 +122,7 @@ void MergeTreeDataPartInMemory::flushToDisk(const String & base_path, const Stri auto projection_indices = MergeTreeIndexFactory::instance().getMany(desc.metadata->getSecondaryIndices());
MergedBlockOutputStream projection_out( projection_data_part, desc.metadata, projection_part->columns, projection_indices, - projection_compression_codec); + projection_compression_codec, NO_TRANSACTION_PTR); projection_out.write(projection_part->block); projection_out.finalizePart(projection_data_part, false); diff --git a/src/Storages/MergeTree/MergeTreeDataSelectExecutor.cpp b/src/Storages/MergeTree/MergeTreeDataSelectExecutor.cpp index f8f69b19458..60eb11a4fc0 100644 --- a/src/Storages/MergeTree/MergeTreeDataSelectExecutor.cpp +++ b/src/Storages/MergeTree/MergeTreeDataSelectExecutor.cpp @@ -130,9 +130,11 @@ QueryPlanPtr MergeTreeDataSelectExecutor::read( return std::make_unique(); const auto & settings = context->getSettingsRef(); + const auto & metadata_for_reading = storage_snapshot->getMetadataForQuery(); const auto & snapshot_data = assert_cast(*storage_snapshot->data); + const auto & parts = snapshot_data.parts; if (!query_info.projection) diff --git a/src/Storages/MergeTree/MergeTreeDataWriter.cpp b/src/Storages/MergeTree/MergeTreeDataWriter.cpp index 9aa446c5f2f..e39ae7a4037 100644 --- a/src/Storages/MergeTree/MergeTreeDataWriter.cpp +++ b/src/Storages/MergeTree/MergeTreeDataWriter.cpp @@ -7,6 +7,7 @@ #include #include #include +#include #include #include #include @@ -433,7 +434,9 @@ MergeTreeDataWriter::TemporaryPart MergeTreeDataWriter::writeTempPart( const auto & index_factory = MergeTreeIndexFactory::instance(); auto out = std::make_unique(new_data_part, metadata_snapshot, columns, - index_factory.getMany(metadata_snapshot->getSecondaryIndices()), compression_codec, false, false, context->getWriteSettings()); + index_factory.getMany(metadata_snapshot->getSecondaryIndices()), compression_codec, + context->getCurrentTransaction(), false, false, context->getWriteSettings()); + out->writeWithPermutation(block, perm_ptr); @@ -565,7 +568,8 @@ MergeTreeDataWriter::TemporaryPart MergeTreeDataWriter::writeProjectionPartImpl( metadata_snapshot, columns, MergeTreeIndices{}, - compression_codec); + compression_codec, + NO_TRANSACTION_PTR); out->writeWithPermutation(block, perm_ptr); auto finalizer = out->finalizePartAsync(new_data_part, false); diff --git a/src/Storages/MergeTree/MergeTreeMutationEntry.cpp b/src/Storages/MergeTree/MergeTreeMutationEntry.cpp index 2147575f1d5..bfa9129bd53 100644 --- a/src/Storages/MergeTree/MergeTreeMutationEntry.cpp +++ b/src/Storages/MergeTree/MergeTreeMutationEntry.cpp @@ -4,6 +4,7 @@ #include #include #include +#include #include @@ -43,13 +44,15 @@ UInt64 MergeTreeMutationEntry::parseFileName(const String & file_name_) throw Exception(ErrorCodes::BAD_ARGUMENTS, "Cannot parse mutation version from file name, expected 'mutation_.txt', got '{}'", file_name_); } -MergeTreeMutationEntry::MergeTreeMutationEntry(MutationCommands commands_, DiskPtr disk_, const String & path_prefix_, UInt64 tmp_number, const WriteSettings & settings) +MergeTreeMutationEntry::MergeTreeMutationEntry(MutationCommands commands_, DiskPtr disk_, const String & path_prefix_, UInt64 tmp_number, + const TransactionID & tid_, const WriteSettings & settings) : create_time(time(nullptr)) , commands(std::move(commands_)) , disk(std::move(disk_)) , path_prefix(path_prefix_) , file_name("tmp_mutation_" + toString(tmp_number) + ".txt") , is_temp(true) + , tid(tid_) { try { @@ -59,6 +62,16 @@ MergeTreeMutationEntry::MergeTreeMutationEntry(MutationCommands commands_, DiskP *out << "commands: "; commands.writeText(*out); *out << "\n"; + if (tid.isPrehistoric()) + { + csn = Tx::PrehistoricCSN; 
+ } + else + { + *out << "tid: "; + TransactionID::write(tid, *out); + *out << "\n"; + } out->sync(); } catch (...) @@ -90,6 +103,14 @@ void MergeTreeMutationEntry::removeFile() } } +void MergeTreeMutationEntry::writeCSN(CSN csn_) +{ + csn = csn_; + auto out = disk->writeFile(path_prefix + file_name, 256, WriteMode::Append); + *out << "csn: " << csn << "\n"; + out->finalize(); +} + MergeTreeMutationEntry::MergeTreeMutationEntry(DiskPtr disk_, const String & path_prefix_, const String & file_name_) : disk(std::move(disk_)) , path_prefix(path_prefix_) @@ -111,6 +132,23 @@ MergeTreeMutationEntry::MergeTreeMutationEntry(DiskPtr disk_, const String & pat commands.readText(*buf); *buf >> "\n"; + if (buf->eof()) + { + tid = Tx::PrehistoricTID; + csn = Tx::PrehistoricCSN; + } + else + { + *buf >> "tid: "; + tid = TransactionID::read(*buf); + *buf >> "\n"; + + if (!buf->eof()) + { + *buf >> "csn: " >> csn >> "\n"; + } + } + assertEOF(*buf); } diff --git a/src/Storages/MergeTree/MergeTreeMutationEntry.h b/src/Storages/MergeTree/MergeTreeMutationEntry.h index fa3a4058ae6..3d4c4d0c4a1 100644 --- a/src/Storages/MergeTree/MergeTreeMutationEntry.h +++ b/src/Storages/MergeTree/MergeTreeMutationEntry.h @@ -4,6 +4,7 @@ #include #include #include +#include namespace DB @@ -28,8 +29,15 @@ struct MergeTreeMutationEntry time_t latest_fail_time = 0; String latest_fail_reason; + /// ID of the transaction that created the mutation. + TransactionID tid = Tx::PrehistoricTID; + /// CSN of the transaction that created the mutation, + /// or UnknownCSN if it's not committed (yet), RolledBackCSN if it was rolled back, or PrehistoricCSN if there is no transaction. + CSN csn = Tx::UnknownCSN; + /// Create a new entry and write it to a temporary file. - MergeTreeMutationEntry(MutationCommands commands_, DiskPtr disk, const String & path_prefix_, UInt64 tmp_number, const WriteSettings & settings); + MergeTreeMutationEntry(MutationCommands commands_, DiskPtr disk, const String & path_prefix_, UInt64 tmp_number, + const TransactionID & tid_, const WriteSettings & settings); MergeTreeMutationEntry(const MergeTreeMutationEntry &) = delete; MergeTreeMutationEntry(MergeTreeMutationEntry &&) = default; @@ -38,6 +46,8 @@ struct MergeTreeMutationEntry void removeFile(); + void writeCSN(CSN csn_); + static String versionToFileName(UInt64 block_number_); static UInt64 tryParseFileName(const String & file_name_); static UInt64 parseFileName(const String & file_name_); diff --git a/src/Storages/MergeTree/MergeTreePartsMover.cpp b/src/Storages/MergeTree/MergeTreePartsMover.cpp index 9cc3ffe6e9e..83b58960ad1 100644 --- a/src/Storages/MergeTree/MergeTreePartsMover.cpp +++ b/src/Storages/MergeTree/MergeTreePartsMover.cpp @@ -94,7 +94,7 @@ bool MergeTreePartsMover::selectPartsForMove( unsigned parts_to_move_by_ttl_rules = 0; double parts_to_move_total_size_bytes = 0.0; - MergeTreeData::DataPartsVector data_parts = data->getDataPartsVector(); + MergeTreeData::DataPartsVector data_parts = data->getDataPartsVectorForInternalUsage(); if (data_parts.empty()) return false; @@ -231,6 +231,7 @@ MergeTreeData::DataPartPtr MergeTreePartsMover::clonePart(const MergeTreeMoveEnt LOG_TRACE(log, "Part {} was cloned to {}", part->name, cloned_part->getFullPath()); cloned_part->loadColumnsChecksumsIndexes(true, true); + cloned_part->loadVersionMetadata(); cloned_part->modification_time = disk->getLastModified(cloned_part->getFullRelativePath()).epochTime(); return cloned_part; diff --git a/src/Storages/MergeTree/MergeTreeSettings.h
b/src/Storages/MergeTree/MergeTreeSettings.h index a3d2d607873..bf8accc0f47 100644 --- a/src/Storages/MergeTree/MergeTreeSettings.h +++ b/src/Storages/MergeTree/MergeTreeSettings.h @@ -61,6 +61,7 @@ struct Settings; M(UInt64, merge_selecting_sleep_ms, 5000, "Sleep time for merge selecting when no part selected, a lower setting will trigger selecting tasks in background_schedule_pool frequently which result in large amount of requests to zookeeper in large-scale clusters", 0) \ M(UInt64, merge_tree_clear_old_temporary_directories_interval_seconds, 60, "The period of executing the clear old temporary directories operation in background.", 0) \ M(UInt64, merge_tree_clear_old_parts_interval_seconds, 1, "The period of executing the clear old parts operation in background.", 0) \ + M(Bool, remove_rolled_back_parts_immediately, 1, "Setting for an incomplete experimental feature.", 0) \ \ /** Inserts settings. */ \ M(UInt64, parts_to_delay_insert, 150, "If table contains at least that many active parts in single partition, artificially slow down insert into table.", 0) \ diff --git a/src/Storages/MergeTree/MergeTreeSink.cpp b/src/Storages/MergeTree/MergeTreeSink.cpp index 7a4ecae24b3..7e8ee3dcbef 100644 --- a/src/Storages/MergeTree/MergeTreeSink.cpp +++ b/src/Storages/MergeTree/MergeTreeSink.cpp @@ -50,7 +50,7 @@ struct MergeTreeSink::DelayedChunk void MergeTreeSink::consume(Chunk chunk) { auto block = getHeader().cloneWithColumns(chunk.detachColumns()); - auto storage_snapshot = storage.getStorageSnapshot(metadata_snapshot); + auto storage_snapshot = storage.getStorageSnapshot(metadata_snapshot, context); storage.writer.deduceTypesOfObjectColumns(storage_snapshot, block); auto part_blocks = storage.writer.splitBlockIntoParts(block, max_parts_per_block, metadata_snapshot, context); @@ -134,7 +134,7 @@ void MergeTreeSink::finishDelayedChunk() auto & part = partition.temp_part.part; /// Part can be deduplicated, so increment counters and add to part log only if it's really added - if (storage.renameTempPartAndAdd(part, &storage.increment, nullptr, storage.getDeduplicationLog(), partition.block_dedup_token)) + if (storage.renameTempPartAndAdd(part, context->getCurrentTransaction().get(), &storage.increment, nullptr, storage.getDeduplicationLog(), partition.block_dedup_token)) { PartLog::addNewPart(storage.getContext(), part, partition.elapsed_ns); diff --git a/src/Storages/MergeTree/MergeTreeWriteAheadLog.cpp b/src/Storages/MergeTree/MergeTreeWriteAheadLog.cpp index d7cddfe9c14..bab0947a8ff 100644 --- a/src/Storages/MergeTree/MergeTreeWriteAheadLog.cpp +++ b/src/Storages/MergeTree/MergeTreeWriteAheadLog.cpp @@ -197,7 +197,8 @@ MergeTreeData::MutableDataPartsVector MergeTreeWriteAheadLog::restore(const Stor metadata_snapshot, block.getNamesAndTypesList(), {}, - CompressionCodecFactory::instance().get("NONE", {})); + CompressionCodecFactory::instance().get("NONE", {}), + NO_TRANSACTION_PTR); part->minmax_idx->update(block, storage.getMinMaxColumnsNames(metadata_snapshot->getPartitionKey())); part->partition.create(metadata_snapshot, block, 0, context); diff --git a/src/Storages/MergeTree/MergedBlockOutputStream.cpp b/src/Storages/MergeTree/MergedBlockOutputStream.cpp index 4fce24fae74..6acbfacd4c1 100644 --- a/src/Storages/MergeTree/MergedBlockOutputStream.cpp +++ b/src/Storages/MergeTree/MergedBlockOutputStream.cpp @@ -1,5 +1,6 @@ #include #include +#include #include @@ -18,6 +19,7 @@ MergedBlockOutputStream::MergedBlockOutputStream( const NamesAndTypesList & columns_list_, const MergeTreeIndices & 
skip_indices, CompressionCodecPtr default_codec_, + const MergeTreeTransactionPtr & txn, bool reset_columns_, bool blocks_are_granules_size, const WriteSettings & write_settings) @@ -36,6 +38,13 @@ MergedBlockOutputStream::MergedBlockOutputStream( if (!part_path.empty()) volume->getDisk()->createDirectories(part_path); + /// We should write version metadata on part creation to distinguish it from parts that were created without transaction. + TransactionID tid = txn ? txn->tid : Tx::PrehistoricTID; + /// NOTE do not pass context for writing to system.transactions_info_log, + /// because part may have temporary name (with temporary block numbers). Will write it later. + data_part->version.setCreationTID(tid, nullptr); + data_part->storeVersionMetadata(); + writer = data_part->getWriter(columns_list, metadata_snapshot, skip_indices, default_codec, writer_settings); } diff --git a/src/Storages/MergeTree/MergedBlockOutputStream.h b/src/Storages/MergeTree/MergedBlockOutputStream.h index 7beb9c65ca5..67dec1923e8 100644 --- a/src/Storages/MergeTree/MergedBlockOutputStream.h +++ b/src/Storages/MergeTree/MergedBlockOutputStream.h @@ -20,6 +20,7 @@ public: const NamesAndTypesList & columns_list_, const MergeTreeIndices & skip_indices, CompressionCodecPtr default_codec_, + const MergeTreeTransactionPtr & txn, bool reset_columns_ = false, bool blocks_are_granules_size = false, const WriteSettings & write_settings = {}); diff --git a/src/Storages/MergeTree/MutateFromLogEntryTask.cpp b/src/Storages/MergeTree/MutateFromLogEntryTask.cpp index 309432e4675..de31fbe3c56 100644 --- a/src/Storages/MergeTree/MutateFromLogEntryTask.cpp +++ b/src/Storages/MergeTree/MutateFromLogEntryTask.cpp @@ -98,7 +98,7 @@ ReplicatedMergeMutateTaskBase::PrepareResult MutateFromLogEntryTask::prepare() RWLockImpl::NO_QUERY, storage_settings_ptr->lock_acquire_timeout_for_background_operations); StorageMetadataPtr metadata_snapshot = storage.getInMemoryMetadataPtr(); - transaction_ptr = std::make_unique(storage); + transaction_ptr = std::make_unique(storage, NO_TRANSACTION_RAW); future_mutated_part = std::make_shared(); future_mutated_part->name = entry.new_part_name; @@ -152,7 +152,7 @@ ReplicatedMergeMutateTaskBase::PrepareResult MutateFromLogEntryTask::prepare() mutate_task = storage.merger_mutator.mutatePartToTemporaryPart( future_mutated_part, metadata_snapshot, commands, merge_mutate_entry.get(), - entry.create_time, fake_query_context, reserved_space, table_lock_holder); + entry.create_time, fake_query_context, NO_TRANSACTION_PTR, reserved_space, table_lock_holder); /// Adjust priority for (auto & item : future_mutated_part->parts) @@ -171,7 +171,7 @@ bool MutateFromLogEntryTask::finalize(ReplicatedMergeMutateTaskBase::PartLogWrit { new_part = mutate_task->getFuture().get(); - storage.renameTempPartAndReplace(new_part, nullptr, transaction_ptr.get()); + storage.renameTempPartAndReplace(new_part, NO_TRANSACTION_RAW, nullptr, transaction_ptr.get()); try { diff --git a/src/Storages/MergeTree/MutatePlainMergeTreeTask.cpp b/src/Storages/MergeTree/MutatePlainMergeTreeTask.cpp index e3fa07dd0c0..80a33bfe0e3 100644 --- a/src/Storages/MergeTree/MutatePlainMergeTreeTask.cpp +++ b/src/Storages/MergeTree/MutatePlainMergeTreeTask.cpp @@ -1,7 +1,7 @@ #include #include - +#include namespace DB { @@ -55,7 +55,7 @@ void MutatePlainMergeTreeTask::prepare() mutate_task = storage.merger_mutator.mutatePartToTemporaryPart( future_part, metadata_snapshot, merge_mutate_entry->commands, merge_list_entry.get(), - time(nullptr), fake_query_context, 
merge_mutate_entry->tagger->reserved_space, table_lock_holder); + time(nullptr), fake_query_context, merge_mutate_entry->txn, merge_mutate_entry->tagger->reserved_space, table_lock_holder); } bool MutatePlainMergeTreeTask::executeStep() @@ -83,7 +83,8 @@ bool MutatePlainMergeTreeTask::executeStep() new_part = mutate_task->getFuture().get(); - storage.renameTempPartAndReplace(new_part); + /// FIXME Transactions: it's too optimistic, better to lock parts before starting transaction + storage.renameTempPartAndReplace(new_part, merge_mutate_entry->txn.get()); storage.updateMutationEntriesErrors(future_part, true, ""); write_part_log({}); @@ -92,7 +93,11 @@ bool MutatePlainMergeTreeTask::executeStep() } catch (...) { - storage.updateMutationEntriesErrors(future_part, false, getCurrentExceptionMessage(false)); + if (merge_mutate_entry->txn) + merge_mutate_entry->txn->onException(); + String exception_message = getCurrentExceptionMessage(false); + LOG_ERROR(&Poco::Logger::get("MutatePlainMergeTreeTask"), "{}", exception_message); + storage.updateMutationEntriesErrors(future_part, false, exception_message); write_part_log(ExecutionStatus::fromCurrentException()); tryLogCurrentException(__PRETTY_FUNCTION__); return false; diff --git a/src/Storages/MergeTree/MutateTask.cpp b/src/Storages/MergeTree/MutateTask.cpp index 9641299f1f8..b9bebc665b2 100644 --- a/src/Storages/MergeTree/MutateTask.cpp +++ b/src/Storages/MergeTree/MutateTask.cpp @@ -4,6 +4,7 @@ #include #include #include +#include #include #include #include @@ -546,6 +547,8 @@ struct MutationContext bool need_sync; ExecuteTTLType execute_ttl_type{ExecuteTTLType::NONE}; + + MergeTreeTransactionPtr txn; }; using MutationContextPtr = std::shared_ptr; @@ -651,6 +654,7 @@ public: false, // TODO Do we need deduplicate for projections {}, projection_merging_params, + NO_TRANSACTION_PTR, ctx->new_data_part.get(), ".tmp_proj"); @@ -972,7 +976,8 @@ private: ctx->metadata_snapshot, ctx->new_data_part->getColumns(), skip_part_indices, - ctx->compression_codec); + ctx->compression_codec, + ctx->txn); ctx->mutating_pipeline = QueryPipelineBuilder::getPipeline(std::move(builder)); ctx->mutating_executor = std::make_unique(ctx->mutating_pipeline); @@ -1059,6 +1064,13 @@ private: ctx->disk->createDirectories(ctx->new_part_tmp_path); + /// We should write version metadata on part creation to distinguish it from parts that were created without transaction. + TransactionID tid = ctx->txn ? ctx->txn->tid : Tx::PrehistoricTID; + /// NOTE do not pass context for writing to system.transactions_info_log, + /// because part may have temporary name (with temporary block numbers). Will write it later. 
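The setCreationTID / storeVersionMetadata pair that follows is the same pattern used in MergedBlockOutputStream and cloneAndLoadDataPartOnSameDisk above: every freshly materialized part records its creating transaction before being renamed into place. Schematically (a condensed restatement of calls from this patch; the commit step is an assumption about the wider patch series):

    /// On part creation (temporary name, so no transactions_info_log context yet):
    TransactionID tid = txn ? txn->tid : Tx::PrehistoricTID;
    new_data_part->version.setCreationTID(tid, nullptr);
    new_data_part->storeVersionMetadata();
    /// On COMMIT the creation CSN becomes known and is persisted as well, which is
    /// what makes the part visible to snapshots taken after the commit.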
+ ctx->new_data_part->version.setCreationTID(tid, nullptr); + ctx->new_data_part->storeVersionMetadata(); + /// Create hardlinks for unchanged files for (auto it = ctx->disk->iterateDirectory(ctx->source_part->getFullRelativePath()); it->isValid(); it->next()) { @@ -1193,6 +1205,7 @@ MutateTask::MutateTask( ContextPtr context_, ReservationSharedPtr space_reservation_, TableLockHolder & table_lock_holder_, + const MergeTreeTransactionPtr & txn, MergeTreeData & data_, MergeTreeDataMergerMutator & mutator_, ActionBlocker & merges_blocker_) @@ -1210,6 +1223,7 @@ MutateTask::MutateTask( ctx->metadata_snapshot = metadata_snapshot_; ctx->space_reservation = space_reservation_; ctx->storage_columns = metadata_snapshot_->getColumns().getAllPhysical(); + ctx->txn = txn; } @@ -1269,7 +1283,7 @@ bool MutateTask::prepare() storage_from_source_part, ctx->metadata_snapshot, ctx->commands_for_part, Context::createCopy(context_for_reading))) { LOG_TRACE(ctx->log, "Part {} doesn't change up to mutation version {}", ctx->source_part->name, ctx->future_part->part_info.mutation); - promise.set_value(ctx->data->cloneAndLoadDataPartOnSameDisk(ctx->source_part, "tmp_clone_", ctx->future_part->part_info, ctx->metadata_snapshot)); + promise.set_value(ctx->data->cloneAndLoadDataPartOnSameDisk(ctx->source_part, "tmp_clone_", ctx->future_part->part_info, ctx->metadata_snapshot, ctx->txn)); return false; } else @@ -1294,6 +1308,8 @@ bool MutateTask::prepare() } ctx->single_disk_volume = std::make_shared("volume_" + ctx->future_part->name, ctx->space_reservation->getDisk(), 0); + /// FIXME new_data_part is not used in the case when we clone part with cloneAndLoadDataPartOnSameDisk and return false + /// Is it possible to handle this case earlier? ctx->new_data_part = ctx->data->createPart( ctx->future_part->name, ctx->future_part->type, ctx->future_part->part_info, ctx->single_disk_volume, "tmp_mut_" + ctx->future_part->name); @@ -1358,7 +1374,7 @@ bool MutateTask::prepare() && ctx->files_to_rename.empty()) { LOG_TRACE(ctx->log, "Part {} doesn't change up to mutation version {} (optimized)", ctx->source_part->name, ctx->future_part->part_info.mutation); - promise.set_value(ctx->data->cloneAndLoadDataPartOnSameDisk(ctx->source_part, "tmp_clone_", ctx->future_part->part_info, ctx->metadata_snapshot)); + promise.set_value(ctx->data->cloneAndLoadDataPartOnSameDisk(ctx->source_part, "tmp_mut_", ctx->future_part->part_info, ctx->metadata_snapshot, ctx->txn)); return false; } diff --git a/src/Storages/MergeTree/MutateTask.h b/src/Storages/MergeTree/MutateTask.h index 79c3bff213a..aa38ee34b4a 100644 --- a/src/Storages/MergeTree/MutateTask.h +++ b/src/Storages/MergeTree/MutateTask.h @@ -32,6 +32,7 @@ public: ContextPtr context_, ReservationSharedPtr space_reservation_, TableLockHolder & table_lock_holder_, + const MergeTreeTransactionPtr & txn, MergeTreeData & data_, MergeTreeDataMergerMutator & mutator_, ActionBlocker & merges_blocker_); diff --git a/src/Storages/MergeTree/ReplicatedMergeTreeQueue.cpp b/src/Storages/MergeTree/ReplicatedMergeTreeQueue.cpp index 5f805c39ae2..c5798aaefe5 100644 --- a/src/Storages/MergeTree/ReplicatedMergeTreeQueue.cpp +++ b/src/Storages/MergeTree/ReplicatedMergeTreeQueue.cpp @@ -1981,6 +1981,7 @@ ReplicatedMergeTreeMergePredicate::ReplicatedMergeTreeMergePredicate( bool ReplicatedMergeTreeMergePredicate::operator()( const MergeTreeData::DataPartPtr & left, const MergeTreeData::DataPartPtr & right, + const MergeTreeTransaction *, String * out_reason) const { if (left) diff --git 
a/src/Storages/MergeTree/ReplicatedMergeTreeQueue.h b/src/Storages/MergeTree/ReplicatedMergeTreeQueue.h index 1d10c504b3c..ae0ca806344 100644 --- a/src/Storages/MergeTree/ReplicatedMergeTreeQueue.h +++ b/src/Storages/MergeTree/ReplicatedMergeTreeQueue.h @@ -483,6 +483,7 @@ public: /// Depending on the existence of left part checks a merge predicate for two parts or for single part. bool operator()(const MergeTreeData::DataPartPtr & left, const MergeTreeData::DataPartPtr & right, + const MergeTreeTransaction * txn, String * out_reason = nullptr) const; /// Can we assign a merge with these two parts? diff --git a/src/Storages/MergeTree/ReplicatedMergeTreeSink.cpp b/src/Storages/MergeTree/ReplicatedMergeTreeSink.cpp index 63fa2071056..187e4eb96c5 100644 --- a/src/Storages/MergeTree/ReplicatedMergeTreeSink.cpp +++ b/src/Storages/MergeTree/ReplicatedMergeTreeSink.cpp @@ -150,7 +150,7 @@ void ReplicatedMergeTreeSink::consume(Chunk chunk) if (quorum) checkQuorumPrecondition(zookeeper); - auto storage_snapshot = storage.getStorageSnapshot(metadata_snapshot); + auto storage_snapshot = storage.getStorageSnapshot(metadata_snapshot, context); storage.writer.deduceTypesOfObjectColumns(storage_snapshot, block); auto part_blocks = storage.writer.splitBlockIntoParts(block, max_parts_per_block, metadata_snapshot, context); @@ -287,6 +287,7 @@ void ReplicatedMergeTreeSink::writeExistingPart(MergeTreeData::MutableDataPartPt try { + part->version.setCreationTID(Tx::PrehistoricTID, nullptr); commitPart(zookeeper, part, ""); PartLog::addNewPart(storage.getContext(), part, watch.elapsed()); } @@ -471,12 +472,12 @@ void ReplicatedMergeTreeSink::commitPart( /// Information about the part. storage.getCommitPartOps(ops, part, block_id_path); - MergeTreeData::Transaction transaction(storage); /// If you can not add a part to ZK, we'll remove it back from the working set. + MergeTreeData::Transaction transaction(storage, NO_TRANSACTION_RAW); /// If you can not add a part to ZK, we'll remove it back from the working set. 
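NO_TRANSACTION_RAW here (and NO_TRANSACTION_PTR elsewhere in this section) mark code paths that deliberately run outside any transaction; ReplicatedMergeTree always takes this path for now, so replicated parts get the prehistoric TID. A rough model of the sentinels referenced throughout the patch; the exact values and definitions are assumptions for illustration:

    using CSN = UInt64;
    namespace Tx
    {
        inline constexpr CSN UnknownCSN = 0;      /// transaction exists but has not committed yet
        inline constexpr CSN PrehistoricCSN = 1;  /// written before/without transactions, visible to everyone
        /// RolledBackCSN marks parts whose creating transaction was rolled back.
    }
    /// NO_TRANSACTION_PTR is an empty MergeTreeTransactionPtr and NO_TRANSACTION_RAW its
    /// raw-pointer counterpart; parts written through them get Tx::PrehistoricTID.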
bool renamed = false; try { - renamed = storage.renameTempPartAndAdd(part, nullptr, &transaction); + renamed = storage.renameTempPartAndAdd(part, NO_TRANSACTION_RAW, nullptr, &transaction); } catch (const Exception & e) { diff --git a/src/Storages/MergeTree/localBackup.cpp b/src/Storages/MergeTree/localBackup.cpp index 1a04aa4b678..8bb3e4cf78a 100644 --- a/src/Storages/MergeTree/localBackup.cpp +++ b/src/Storages/MergeTree/localBackup.cpp @@ -14,7 +14,7 @@ namespace ErrorCodes } -static void localBackupImpl(const DiskPtr & disk, const String & source_path, const String & destination_path, size_t level, +static void localBackupImpl(const DiskPtr & disk, const String & source_path, const String & destination_path, bool make_source_readonly, size_t level, std::optional max_level) { if (max_level && level > *max_level) @@ -32,12 +32,13 @@ static void localBackupImpl(const DiskPtr & disk, const String & source_path, co if (!disk->isDirectory(source)) { - disk->setReadOnly(source); + if (make_source_readonly) + disk->setReadOnly(source); disk->createHardLink(source, destination); } else { - localBackupImpl(disk, source, destination, level + 1, max_level); + localBackupImpl(disk, source, destination, make_source_readonly, level + 1, max_level); } } } @@ -80,7 +81,7 @@ private: }; } -void localBackup(const DiskPtr & disk, const String & source_path, const String & destination_path, std::optional max_level) +void localBackup(const DiskPtr & disk, const String & source_path, const String & destination_path, bool make_source_readonly, std::optional max_level) { if (disk->exists(destination_path) && !disk->isDirectoryEmpty(destination_path)) { @@ -100,7 +101,7 @@ void localBackup(const DiskPtr & disk, const String & source_path, const String { try { - localBackupImpl(disk, source_path, destination_path, 0, max_level); + localBackupImpl(disk, source_path, destination_path, make_source_readonly, 0, max_level); } catch (const DB::ErrnoException & e) { diff --git a/src/Storages/MergeTree/localBackup.h b/src/Storages/MergeTree/localBackup.h index 066ba8f7489..c6a46620447 100644 --- a/src/Storages/MergeTree/localBackup.h +++ b/src/Storages/MergeTree/localBackup.h @@ -20,6 +20,6 @@ namespace DB * If max_level is specified, than only files which depth relative source_path less or equal max_level will be copied. * So, if max_level=0 than only direct file child are copied. 
*/ -void localBackup(const DiskPtr & disk, const String & source_path, const String & destination_path, std::optional max_level = {}); +void localBackup(const DiskPtr & disk, const String & source_path, const String & destination_path, bool make_source_readonly = true, std::optional max_level = {}); } diff --git a/src/Storages/RabbitMQ/StorageRabbitMQ.cpp b/src/Storages/RabbitMQ/StorageRabbitMQ.cpp index b5e120d9405..54a71f18ce1 100644 --- a/src/Storages/RabbitMQ/StorageRabbitMQ.cpp +++ b/src/Storages/RabbitMQ/StorageRabbitMQ.cpp @@ -1026,7 +1026,7 @@ bool StorageRabbitMQ::streamToViews() InterpreterInsertQuery interpreter(insert, rabbitmq_context, false, true, true); auto block_io = interpreter.execute(); - auto storage_snapshot = getStorageSnapshot(getInMemoryMetadataPtr()); + auto storage_snapshot = getStorageSnapshot(getInMemoryMetadataPtr(), getContext()); auto column_names = block_io.pipeline.getHeader().getNames(); auto sample_block = storage_snapshot->getSampleBlockForColumns(column_names); diff --git a/src/Storages/ReadFinalForExternalReplicaStorage.cpp b/src/Storages/ReadFinalForExternalReplicaStorage.cpp index cf1c5c35629..a03ccb5cf43 100644 --- a/src/Storages/ReadFinalForExternalReplicaStorage.cpp +++ b/src/Storages/ReadFinalForExternalReplicaStorage.cpp @@ -54,7 +54,7 @@ Pipe readFinalFromNestedStorage( filter_column_name = expressions->children.back()->getColumnName(); } - auto nested_snapshot = nested_storage->getStorageSnapshot(nested_metadata); + auto nested_snapshot = nested_storage->getStorageSnapshot(nested_metadata, context); Pipe pipe = nested_storage->read(require_columns_name, nested_snapshot, query_info, context, processed_stage, max_block_size, num_streams); pipe.addTableLock(lock); pipe.addStorageHolder(nested_storage); diff --git a/src/Storages/StorageBuffer.cpp b/src/Storages/StorageBuffer.cpp index 801e1b80a20..a503e79dc2c 100644 --- a/src/Storages/StorageBuffer.cpp +++ b/src/Storages/StorageBuffer.cpp @@ -203,7 +203,7 @@ QueryProcessingStage::Enum StorageBuffer::getQueryProcessingStage( /// TODO: Find a way to support projections for StorageBuffer query_info.ignore_projections = true; const auto & destination_metadata = destination->getInMemoryMetadataPtr(); - return destination->getQueryProcessingStage(local_context, to_stage, destination->getStorageSnapshot(destination_metadata), query_info); + return destination->getQueryProcessingStage(local_context, to_stage, destination->getStorageSnapshot(destination_metadata, local_context), query_info); } return QueryProcessingStage::FetchColumns; @@ -248,7 +248,7 @@ void StorageBuffer::read( auto destination_lock = destination->lockForShare(local_context->getCurrentQueryId(), local_context->getSettingsRef().lock_acquire_timeout); auto destination_metadata_snapshot = destination->getInMemoryMetadataPtr(); - auto destination_snapshot = destination->getStorageSnapshot(destination_metadata_snapshot); + auto destination_snapshot = destination->getStorageSnapshot(destination_metadata_snapshot, local_context); const bool dst_has_same_structure = std::all_of(column_names.begin(), column_names.end(), [metadata_snapshot, destination_metadata_snapshot](const String& column_name) { diff --git a/src/Storages/StorageDistributed.cpp b/src/Storages/StorageDistributed.cpp index 1a390f784a2..62ec2524a32 100644 --- a/src/Storages/StorageDistributed.cpp +++ b/src/Storages/StorageDistributed.cpp @@ -617,13 +617,13 @@ static bool requiresObjectColumns(const ColumnsDescription & all_columns, ASTPtr return false; } -StorageSnapshotPtr 
StorageDistributed::getStorageSnapshot(const StorageMetadataPtr & metadata_snapshot) const +StorageSnapshotPtr StorageDistributed::getStorageSnapshot(const StorageMetadataPtr & metadata_snapshot, ContextPtr query_context) const { - return getStorageSnapshotForQuery(metadata_snapshot, nullptr); + return getStorageSnapshotForQuery(metadata_snapshot, nullptr, query_context); } StorageSnapshotPtr StorageDistributed::getStorageSnapshotForQuery( - const StorageMetadataPtr & metadata_snapshot, const ASTPtr & query) const + const StorageMetadataPtr & metadata_snapshot, const ASTPtr & query, ContextPtr /*query_context*/) const { /// If query doesn't use columns of type Object, don't deduce /// concrete types for them, because it required extra round trip. diff --git a/src/Storages/StorageDistributed.h b/src/Storages/StorageDistributed.h index 317463783ee..a890cabd8b1 100644 --- a/src/Storages/StorageDistributed.h +++ b/src/Storages/StorageDistributed.h @@ -69,9 +69,9 @@ public: ColumnsDescriptionByShardNum objects_by_shard; }; - StorageSnapshotPtr getStorageSnapshot(const StorageMetadataPtr & metadata_snapshot) const override; + StorageSnapshotPtr getStorageSnapshot(const StorageMetadataPtr & metadata_snapshot, ContextPtr query_context) const override; StorageSnapshotPtr getStorageSnapshotForQuery( - const StorageMetadataPtr & metadata_snapshot, const ASTPtr & query) const override; + const StorageMetadataPtr & metadata_snapshot, const ASTPtr & query, ContextPtr query_context) const override; QueryProcessingStage::Enum getQueryProcessingStage(ContextPtr, QueryProcessingStage::Enum, const StorageSnapshotPtr &, SelectQueryInfo &) const override; diff --git a/src/Storages/StorageGenerateRandom.h b/src/Storages/StorageGenerateRandom.h index 2894b17d409..ca12d9c2841 100644 --- a/src/Storages/StorageGenerateRandom.h +++ b/src/Storages/StorageGenerateRandom.h @@ -24,6 +24,7 @@ public: size_t max_block_size, unsigned num_streams) override; + bool supportsTransactions() const override { return true; } private: UInt64 max_array_length = 10; UInt64 max_string_length = 10; diff --git a/src/Storages/StorageMaterializedView.cpp b/src/Storages/StorageMaterializedView.cpp index 0c79c31eb7a..610d16c8ea8 100644 --- a/src/Storages/StorageMaterializedView.cpp +++ b/src/Storages/StorageMaterializedView.cpp @@ -143,7 +143,7 @@ QueryProcessingStage::Enum StorageMaterializedView::getQueryProcessingStage( /// converting and use it just like a normal view. 
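
The mechanical change running through this section is that getStorageSnapshot() now takes the query context in addition to the metadata snapshot: a snapshot is no longer a function of metadata alone, because the context carries the current transaction, and MVCC-aware storages need it to decide which parts the reader may see. A minimal sketch of the reshaped interface (IStorageLike and the empty structs are illustrative stand-ins; the names in the signature itself match the diff):

    #include <memory>

    struct Context;                                   // opaque here
    using ContextPtr = std::shared_ptr<const Context>;

    struct StorageInMemoryMetadata {};
    using StorageMetadataPtr = std::shared_ptr<const StorageInMemoryMetadata>;

    struct StorageSnapshot {};                        // frozen view of data + metadata
    using StorageSnapshotPtr = std::shared_ptr<const StorageSnapshot>;

    struct IStorageLike
    {
        virtual ~IStorageLike() = default;

        /// Old shape: getStorageSnapshot(metadata_snapshot).
        /// New shape: the query context rides along, because only it knows
        /// the reader's transaction and therefore the visibility snapshot.
        virtual StorageSnapshotPtr getStorageSnapshot(
            const StorageMetadataPtr & metadata_snapshot, ContextPtr query_context) const = 0;
    };

Storages that have no use for it, like StorageDistributed above, simply ignore the new parameter, which keeps the change mechanical for every caller.
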
query_info.ignore_projections = true; const auto & target_metadata = getTargetTable()->getInMemoryMetadataPtr(); - return getTargetTable()->getQueryProcessingStage(local_context, to_stage, getTargetTable()->getStorageSnapshot(target_metadata), query_info); + return getTargetTable()->getQueryProcessingStage(local_context, to_stage, getTargetTable()->getStorageSnapshot(target_metadata, local_context), query_info); } Pipe StorageMaterializedView::read( @@ -175,7 +175,7 @@ void StorageMaterializedView::read( auto storage = getTargetTable(); auto lock = storage->lockForShare(local_context->getCurrentQueryId(), local_context->getSettingsRef().lock_acquire_timeout); auto target_metadata_snapshot = storage->getInMemoryMetadataPtr(); - auto target_storage_snapshot = storage->getStorageSnapshot(target_metadata_snapshot); + auto target_storage_snapshot = storage->getStorageSnapshot(target_metadata_snapshot, local_context); if (query_info.order_optimizer) query_info.input_order_info = query_info.order_optimizer->getInputOrder(target_metadata_snapshot, local_context); diff --git a/src/Storages/StorageMaterializedView.h b/src/Storages/StorageMaterializedView.h index 41c97fbc4d8..35fe38058de 100644 --- a/src/Storages/StorageMaterializedView.h +++ b/src/Storages/StorageMaterializedView.h @@ -26,6 +26,7 @@ public: bool supportsIndexForIn() const override { return getTargetTable()->supportsIndexForIn(); } bool supportsParallelInsert() const override { return getTargetTable()->supportsParallelInsert(); } bool supportsSubcolumns() const override { return getTargetTable()->supportsSubcolumns(); } + bool supportsTransactions() const override { return getTargetTable()->supportsTransactions(); } bool mayBenefitFromIndexForIn(const ASTPtr & left_in_operand, ContextPtr query_context, const StorageMetadataPtr & /* metadata_snapshot */) const override { auto target_table = getTargetTable(); diff --git a/src/Storages/StorageMemory.cpp b/src/Storages/StorageMemory.cpp index a371ac1ccf8..3e2fe996fe8 100644 --- a/src/Storages/StorageMemory.cpp +++ b/src/Storages/StorageMemory.cpp @@ -122,10 +122,11 @@ class MemorySink : public SinkToStorage public: MemorySink( StorageMemory & storage_, - const StorageMetadataPtr & metadata_snapshot_) + const StorageMetadataPtr & metadata_snapshot_, + ContextPtr context) : SinkToStorage(metadata_snapshot_->getSampleBlock()) , storage(storage_) - , storage_snapshot(storage_.getStorageSnapshot(metadata_snapshot_)) + , storage_snapshot(storage_.getStorageSnapshot(metadata_snapshot_, context)) { } @@ -201,7 +202,7 @@ StorageMemory::StorageMemory( setInMemoryMetadata(storage_metadata); } -StorageSnapshotPtr StorageMemory::getStorageSnapshot(const StorageMetadataPtr & metadata_snapshot) const +StorageSnapshotPtr StorageMemory::getStorageSnapshot(const StorageMetadataPtr & metadata_snapshot, ContextPtr /*query_context*/) const { auto snapshot_data = std::make_unique(); snapshot_data->blocks = data.get(); @@ -271,9 +272,9 @@ Pipe StorageMemory::read( } -SinkToStoragePtr StorageMemory::write(const ASTPtr & /*query*/, const StorageMetadataPtr & metadata_snapshot, ContextPtr /*context*/) +SinkToStoragePtr StorageMemory::write(const ASTPtr & /*query*/, const StorageMetadataPtr & metadata_snapshot, ContextPtr context) { - return std::make_shared(*this, metadata_snapshot); + return std::make_shared(*this, metadata_snapshot, context); } diff --git a/src/Storages/StorageMemory.h b/src/Storages/StorageMemory.h index 20f47828846..d4e82ccb4fc 100644 --- a/src/Storages/StorageMemory.h +++ 
b/src/Storages/StorageMemory.h @@ -37,7 +37,7 @@ public: std::shared_ptr<const Blocks> blocks; }; - StorageSnapshotPtr getStorageSnapshot(const StorageMetadataPtr & metadata_snapshot) const override; + StorageSnapshotPtr getStorageSnapshot(const StorageMetadataPtr & metadata_snapshot, ContextPtr query_context) const override; Pipe read( const Names & column_names, diff --git a/src/Storages/StorageMerge.cpp b/src/Storages/StorageMerge.cpp index 96e6070e09e..8b71cfdb102 100644 --- a/src/Storages/StorageMerge.cpp +++ b/src/Storages/StorageMerge.cpp @@ -201,7 +201,7 @@ QueryProcessingStage::Enum StorageMerge::getQueryProcessingStage( stage_in_source_tables = std::max( stage_in_source_tables, table->getQueryProcessingStage(local_context, to_stage, - table->getStorageSnapshot(table->getInMemoryMetadataPtr()), query_info)); + table->getStorageSnapshot(table->getInMemoryMetadataPtr(), local_context), query_info)); } iterator->next(); @@ -338,7 +338,7 @@ Pipe StorageMerge::read( Aliases aliases; auto storage_metadata_snapshot = storage->getInMemoryMetadataPtr(); auto storage_columns = storage_metadata_snapshot->getColumns(); - auto nested_storage_snaphsot = storage->getStorageSnapshot(storage_metadata_snapshot); + auto nested_storage_snaphsot = storage->getStorageSnapshot(storage_metadata_snapshot, local_context); auto modified_query_info = getModifiedQueryInfo(query_info, modified_context, storage->getStorageID(), storage->as<StorageMerge>()); auto syntax_result = TreeRewriter(local_context).analyzeSelect( @@ -377,7 +377,7 @@ Pipe StorageMerge::read( } syntax_result = TreeRewriter(local_context).analyze( - required_columns_expr_list, storage_columns.getAllPhysical(), storage, storage->getStorageSnapshot(storage_metadata_snapshot)); + required_columns_expr_list, storage_columns.getAllPhysical(), storage, storage->getStorageSnapshot(storage_metadata_snapshot, local_context)); auto alias_actions = ExpressionAnalyzer(required_columns_expr_list, syntax_result, local_context).getActionsDAG(true); diff --git a/src/Storages/StorageMergeTree.cpp b/src/Storages/StorageMergeTree.cpp index 6c11bb84dc7..b25b47ac772 100644 --- a/src/Storages/StorageMergeTree.cpp +++ b/src/Storages/StorageMergeTree.cpp @@ -12,6 +12,7 @@ #include #include #include +#include #include #include #include @@ -60,6 +61,22 @@ namespace ActionLocks extern const StorageActionBlockType PartsMove; } +static MergeTreeTransactionPtr tryGetTransactionForMutation(const MergeTreeMutationEntry & mutation, Poco::Logger * log = nullptr) +{ + assert(!mutation.tid.isEmpty()); + if (mutation.tid.isPrehistoric()) + return {}; + + auto txn = TransactionLog::instance().tryGetRunningTransaction(mutation.tid.getHash()); + if (txn) + return txn; + + if (log) + LOG_WARNING(log, "Cannot find transaction {} which had started mutation {}, probably it finished", mutation.tid, mutation.file_name); + + return {}; +} + StorageMergeTree::StorageMergeTree( const StorageID & table_id_, @@ -89,7 +106,7 @@ StorageMergeTree::StorageMergeTree( { loadDataParts(has_force_restore_data_flag); - if (!attach && !getDataParts().empty()) + if (!attach && !getDataPartsForInternalUsage().empty()) throw Exception("Data directory for table already contains data parts - probably it was unclean DROP table or manual intervention.
You must either clear directory by hand or use ATTACH TABLE instead of CREATE TABLE if you need to use those parts.", ErrorCodes::INCORRECT_DATA); increment.set(getMaxBlockNumber()); @@ -234,7 +251,7 @@ std::optional<UInt64> StorageMergeTree::totalRows(const Settings &) const std::optional<UInt64> StorageMergeTree::totalRowsByPartitionPredicate(const SelectQueryInfo & query_info, ContextPtr local_context) const { - auto parts = getDataPartsVector({DataPartState::Active}); + auto parts = getVisibleDataPartsVector(local_context); return totalRowsByPartitionPredicateImpl(query_info, local_context, parts); } @@ -266,15 +283,15 @@ void StorageMergeTree::drop() dropAllData(); } -void StorageMergeTree::truncate(const ASTPtr &, const StorageMetadataPtr &, ContextPtr, TableExclusiveLockHolder &) +void StorageMergeTree::truncate(const ASTPtr &, const StorageMetadataPtr &, ContextPtr local_context, TableExclusiveLockHolder &) { { /// Asks to complete merges and does not allow them to start. /// This protects against "revival" of data for a removed partition after completion of merge. auto merge_blocker = stopMergesAndWait(); - auto parts_to_remove = getDataPartsVector(); - removePartsFromWorkingSet(parts_to_remove, true); + auto parts_to_remove = getVisibleDataPartsVector(local_context); + removePartsFromWorkingSet(local_context->getCurrentTransaction().get(), parts_to_remove, true); LOG_INFO(log, "Removed {} parts.", parts_to_remove.size()); } @@ -289,13 +306,15 @@ void StorageMergeTree::alter( ContextPtr local_context, AlterLockHolder & table_lock_holder) { + if (local_context->getCurrentTransaction() && local_context->getSettingsRef().throw_on_unsupported_query_inside_transaction) + throw Exception(ErrorCodes::NOT_IMPLEMENTED, "ALTER METADATA is not supported inside transactions"); + auto table_id = getStorageID(); auto old_storage_settings = getSettings(); StorageInMemoryMetadata new_metadata = getInMemoryMetadata(); StorageInMemoryMetadata old_metadata = getInMemoryMetadata(); auto maybe_mutation_commands = commands.getMutationCommands(new_metadata, local_context->getSettingsRef().materialize_ttl_after_modify, local_context); - String mutation_file_name; Int64 mutation_version = -1; commands.apply(new_metadata, local_context); @@ -317,13 +336,13 @@ void StorageMergeTree::alter( DatabaseCatalog::instance().getDatabase(table_id.database_name)->alterTable(local_context, table_id, new_metadata); if (!maybe_mutation_commands.empty()) - mutation_version = startMutation(maybe_mutation_commands, mutation_file_name); + mutation_version = startMutation(maybe_mutation_commands, local_context); } /// Always execute required mutations synchronously, because alters /// should be executed in sequential order. if (!maybe_mutation_commands.empty()) - waitForMutation(mutation_version, mutation_file_name); + waitForMutation(mutation_version); } { @@ -414,24 +433,35 @@ CurrentlyMergingPartsTagger::~CurrentlyMergingPartsTagger() storage.currently_processing_in_background_condition.notify_all(); } -Int64 StorageMergeTree::startMutation(const MutationCommands & commands, String & mutation_file_name) +Int64 StorageMergeTree::startMutation(const MutationCommands & commands, ContextPtr query_context) { /// Choose any disk, because when we load mutations we search them on each disk /// where storage can be placed. See loadMutations().
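
Before startMutation() continues below, note what it now records: every mutation entry is stamped with the TransactionID of the transaction that created it, or the prehistoric TID for non-transactional mutations. The (start_csn, local_tid, host_id) layout is taken from the system.parts tuple later in this diff; the sentinel value and the hash are assumptions of this sketch, and host_id is simplified to a string (a server UUID in the real code):

    #include <cstdint>
    #include <functional>
    #include <string>

    using CSN = uint64_t;

    struct TransactionID
    {
        CSN start_csn = 0;        // CSN of the snapshot the transaction started from
        uint64_t local_tid = 0;   // per-host counter
        std::string host_id;      // originating server

        /// "Prehistoric" marks entries created outside any transaction;
        /// the real sentinel values differ.
        bool isPrehistoric() const { return start_csn == 0 && local_tid == 0; }

        /// Cheap lookup key; this is what the "TIDH" in the log line below is.
        uint64_t getHash() const
        {
            return std::hash<std::string>{}(host_id)
                ^ (local_tid * 0x9e3779b97f4a7c15ULL) ^ start_csn;
        }
    };

    struct MutationEntryLike
    {
        TransactionID tid;  // who created the mutation
        CSN csn = 0;        // filled in once that transaction commits
    };

Keeping the TID in the entry is what lets loadMutations() further down decide, after a restart, whether an entry belongs to a committed transaction (write its CSN) or to a rolled-back one (remove the file).
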
auto disk = getStoragePolicy()->getAnyDisk(); + TransactionID current_tid = Tx::PrehistoricTID; + String additional_info; + auto txn = query_context->getCurrentTransaction(); + if (txn) + { + current_tid = txn->tid; + additional_info = fmt::format(" (TID: {}; TIDH: {})", current_tid, current_tid.getHash()); + } + Int64 version; { std::lock_guard lock(currently_processing_in_background_mutex); - MergeTreeMutationEntry entry(commands, disk, relative_data_path, insert_increment.get(), getContext()->getWriteSettings()); + MergeTreeMutationEntry entry(commands, disk, relative_data_path, insert_increment.get(), current_tid, getContext()->getWriteSettings()); version = increment.get(); entry.commit(version); - mutation_file_name = entry.file_name; + String mutation_id = entry.file_name; + if (txn) + txn->addMutation(shared_from_this(), mutation_id); bool inserted = current_mutations_by_version.try_emplace(version, std::move(entry)).second; if (!inserted) throw Exception(ErrorCodes::LOGICAL_ERROR, "Mutation {} already exists, it's a bug", version); - LOG_INFO(log, "Added mutation: {}", mutation_file_name); + LOG_INFO(log, "Added mutation: {}{}", mutation_id, additional_info); } background_operations_assignee.trigger(); return version; @@ -477,9 +507,15 @@ void StorageMergeTree::updateMutationEntriesErrors(FutureMergedMutatedPartPtr re mutation_wait_event.notify_all(); } -void StorageMergeTree::waitForMutation(Int64 version, const String & file_name) +void StorageMergeTree::waitForMutation(Int64 version) { - LOG_INFO(log, "Waiting mutation: {}", file_name); + waitForMutation(MergeTreeMutationEntry::versionToFileName(version)); +} + +void StorageMergeTree::waitForMutation(const String & mutation_id) +{ + UInt64 version = MergeTreeMutationEntry::parseFileName(mutation_id); + LOG_INFO(log, "Waiting mutation: {}", mutation_id); { auto check = [version, this]() { @@ -495,20 +531,24 @@ void StorageMergeTree::waitForMutation(Int64 version, const String & file_name) /// At least we have our current mutation std::set mutation_ids; - mutation_ids.insert(file_name); + mutation_ids.insert(mutation_id); auto mutation_status = getIncompleteMutationsStatus(version, &mutation_ids); - try - { - checkMutationStatus(mutation_status, mutation_ids); - } - catch (...) 
- { - tryLogCurrentException(__PRETTY_FUNCTION__); - throw; - } + checkMutationStatus(mutation_status, mutation_ids); - LOG_INFO(log, "Mutation {} done", file_name); + LOG_INFO(log, "Mutation {} done", mutation_id); +} + +void StorageMergeTree::setMutationCSN(const String & mutation_id, CSN csn) +{ + LOG_INFO(log, "Writing CSN {} for mutation {}", csn, mutation_id); + UInt64 version = MergeTreeMutationEntry::parseFileName(mutation_id); + + std::lock_guard lock(currently_processing_in_background_mutex); + auto it = current_mutations_by_version.find(version); + if (it == current_mutations_by_version.end()) + throw Exception(ErrorCodes::LOGICAL_ERROR, "Cannot find mutation {}", mutation_id); + it->second.writeCSN(csn); } void StorageMergeTree::mutate(const MutationCommands & commands, ContextPtr query_context) @@ -516,11 +556,10 @@ void StorageMergeTree::mutate(const MutationCommands & commands, ContextPtr quer /// Validate partition IDs (if any) before starting mutation getPartitionIdsAffectedByCommands(commands, query_context); - String mutation_file_name; - Int64 version = startMutation(commands, mutation_file_name); + Int64 version = startMutation(commands, query_context); - if (query_context->getSettingsRef().mutations_sync > 0) - waitForMutation(version, mutation_file_name); + if (query_context->getSettingsRef().mutations_sync > 0 || query_context->getCurrentTransaction()) + waitForMutation(version); } std::optional StorageMergeTree::getIncompleteMutationsStatus(Int64 mutation_version, std::set * mutation_ids) const @@ -536,7 +575,9 @@ std::optional StorageMergeTree::getIncompleteMutationsS const auto & mutation_entry = current_mutation_it->second; - auto data_parts = getDataPartsVector(); + auto txn = tryGetTransactionForMutation(mutation_entry, log); + assert(txn || mutation_entry.tid.isPrehistoric()); + auto data_parts = getVisibleDataPartsVector(txn); for (const auto & data_part : data_parts) { Int64 data_version = getUpdatedDataVersion(data_part, lock); @@ -560,6 +601,17 @@ std::optional StorageMergeTree::getIncompleteMutationsS mutation_ids->insert(it->second.file_name); } } + else if (txn) + { + /// Part is locked by concurrent transaction, most likely it will never be mutated + TIDHash part_locked = data_part->version.removal_tid_lock.load(); + if (part_locked && part_locked != mutation_entry.tid.getHash()) + { + result.latest_failed_part = data_part->name; + result.latest_fail_reason = fmt::format("Serialization error: part {} is locked by transaction {}", data_part->name, part_locked); + result.latest_fail_time = time(nullptr); + } + } return result; } @@ -635,6 +687,12 @@ CancellationCode StorageMergeTree::killMutation(const String & mutation_id) if (!to_kill) return CancellationCode::NotFound; + if (auto txn = tryGetTransactionForMutation(*to_kill, log)) + { + LOG_TRACE(log, "Cancelling transaction {} which had started mutation {}", to_kill->tid, mutation_id); + TransactionLog::instance().rollbackTransaction(txn); + } + getContext()->getMergeList().cancelPartMutations(getStorageID(), {}, to_kill->block_number); to_kill->removeFile(); LOG_TRACE(log, "Cancelled part mutations and removed mutation file {}", mutation_id); @@ -671,6 +729,24 @@ void StorageMergeTree::loadMutations() MergeTreeMutationEntry entry(disk, relative_data_path, it->name()); UInt64 block_number = entry.block_number; LOG_DEBUG(log, "Loading mutation: {} entry, commands size: {}", it->name(), entry.commands.size()); + + if (!entry.tid.isPrehistoric() && !entry.csn) + { + if (auto csn = 
TransactionLog::getCSN(entry.tid)) + { + /// Transaction is committed => mutation is finished, but let's load it anyway (so it will be shown in system.mutations) + entry.writeCSN(csn); + } + else + { + TransactionLog::assertTIDIsNotOutdated(entry.tid); + LOG_DEBUG(log, "Mutation entry {} was created by transaction {}, but it was not committed. Removing mutation entry", + it->name(), entry.tid); + disk->removeFile(it->path()); + continue; + } + } + auto inserted = current_mutations_by_version.try_emplace(block_number, std::move(entry)).second; if (!inserted) throw Exception(ErrorCodes::LOGICAL_ERROR, "Mutation {} already exists, it's a bug", block_number); @@ -694,6 +770,7 @@ std::shared_ptr StorageMergeTree::selectPartsToMerge( String * out_disable_reason, TableLockHolder & /* table_lock_holder */, std::unique_lock & lock, + const MergeTreeTransactionPtr & txn, bool optimize_skip_merged_partitions, SelectPartsDecision * select_decision_out) { @@ -708,8 +785,24 @@ std::shared_ptr StorageMergeTree::selectPartsToMerge( CurrentlyMergingPartsTaggerPtr merging_tagger; MergeList::EntryPtr merge_entry; - auto can_merge = [this, &lock](const DataPartPtr & left, const DataPartPtr & right, String *) -> bool + auto can_merge = [this, &lock](const DataPartPtr & left, const DataPartPtr & right, const MergeTreeTransaction * tx, String *) -> bool { + if (tx) + { + /// Cannot merge parts if some of them are not visible in current snapshot + /// TODO Transactions: We can use simplified visibility rules (without CSN lookup) here + if (left && !left->version.isVisible(tx->getSnapshot(), Tx::EmptyTID)) + return false; + if (right && !right->version.isVisible(tx->getSnapshot(), Tx::EmptyTID)) + return false; + + /// Do not try to merge parts that are locked for removal (merge will probably fail) + if (left && left->version.isRemovalTIDLocked()) + return false; + if (right && right->version.isRemovalTIDLocked()) + return false; + } + /// This predicate is checked for the first part of each range. /// (left = nullptr, right = "first part of partition") if (!left) @@ -736,6 +829,7 @@ std::shared_ptr StorageMergeTree::selectPartsToMerge( max_source_parts_size, can_merge, merge_with_ttl_allowed, + txn, out_disable_reason); } else if (out_disable_reason) @@ -746,7 +840,7 @@ std::shared_ptr StorageMergeTree::selectPartsToMerge( while (true) { select_decision = merger_mutator.selectAllPartsToMergeWithinPartition( - future_part, can_merge, partition_id, final, metadata_snapshot, out_disable_reason, optimize_skip_merged_partitions); + future_part, can_merge, partition_id, final, metadata_snapshot, txn, out_disable_reason, optimize_skip_merged_partitions); auto timeout_ms = getSettings()->lock_acquire_timeout_for_background_operations.totalMilliseconds(); auto timeout = std::chrono::milliseconds(timeout_ms); @@ -804,6 +898,7 @@ bool StorageMergeTree::merge( bool final, bool deduplicate, const Names & deduplicate_by_columns, + const MergeTreeTransactionPtr & txn, String * out_disable_reason, bool optimize_skip_merged_partitions) { @@ -827,6 +922,7 @@ bool StorageMergeTree::merge( out_disable_reason, table_lock_holder, lock, + txn, optimize_skip_merged_partitions, &select_decision); } @@ -838,11 +934,12 @@ bool StorageMergeTree::merge( if (!merge_mutate_entry) return false; - /// Copying a vector of columns `deduplicate bu columns. 
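
The can_merge lambda above is where transactions constrain merge selection: under a transaction, only parts that are visible in its snapshot and not locked for removal are eligible, since merging an uncommitted or to-be-removed part would have to be undone if the owning transaction rolls back. A self-contained miniature of that filter, with a deliberately simplified isVisible (the real check also treats parts created by the inquiring transaction itself as visible):

    #include <atomic>
    #include <cstdint>

    using CSN = uint64_t;
    using TIDHash = uint64_t;

    struct VersionMetadataLike
    {
        CSN creation_csn = 0;                      // 0 = creating tx not committed yet
        CSN removal_csn = 0;                       // 0 = not removed
        std::atomic<TIDHash> removal_tid_lock{0};  // nonzero = some tx intends to drop it

        bool isVisible(CSN snapshot) const
        {
            return creation_csn && creation_csn <= snapshot
                && (removal_csn == 0 || snapshot < removal_csn);
        }

        bool isRemovalTIDLocked() const { return removal_tid_lock.load() != 0; }
    };

    /// Mirrors the transactional branch of the lambda: reject parts the merge's
    /// transaction cannot see, and parts another transaction wants to remove.
    bool canMergeUnderTx(const VersionMetadataLike & left, const VersionMetadataLike & right, CSN snapshot)
    {
        return left.isVisible(snapshot) && right.isVisible(snapshot)
            && !left.isRemovalTIDLocked() && !right.isRemovalTIDLocked();
    }
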
auto task = std::make_shared( *this, metadata_snapshot, deduplicate, deduplicate_by_columns, merge_mutate_entry, table_lock_holder, [](bool){}); + task->setCurrentTransaction(MergeTreeTransactionHolder{}, MergeTreeTransactionPtr{txn}); + executeHere(task); return true; @@ -882,7 +979,7 @@ std::shared_ptr StorageMergeTree::selectPartsToMutate( } auto mutations_end_it = current_mutations_by_version.end(); - for (const auto & part : getDataPartsVector()) + for (const auto & part : getDataPartsVectorForInternalUsage()) { if (currently_merging_mutating_parts.count(part)) continue; @@ -902,12 +999,28 @@ std::shared_ptr StorageMergeTree::selectPartsToMutate( continue; } - auto commands = MutationCommands::create(); + TransactionID first_mutation_tid = mutations_begin_it->second.tid; + MergeTreeTransactionPtr txn = tryGetTransactionForMutation(mutations_begin_it->second, log); + assert(txn || first_mutation_tid.isPrehistoric()); + if (txn) + { + /// Mutate visible parts only + /// NOTE Do not mutate visible parts in Outdated state, because it does not make sense: + /// mutation will fail anyway due to serialization error. + if (!part->version.isVisible(*txn)) + continue; + } + + auto commands = MutationCommands::create(); size_t current_ast_elements = 0; auto last_mutation_to_apply = mutations_end_it; for (auto it = mutations_begin_it; it != mutations_end_it; ++it) { + /// Do not squash mutations from different transactions to be able to commit/rollback them independently. + if (first_mutation_tid != it->second.tid) + break; + size_t commands_size = 0; MutationCommands commands_for_size_validation; for (const auto & command : it->second.commands) @@ -994,13 +1107,14 @@ std::shared_ptr StorageMergeTree::selectPartsToMutate( future_part->type = part->getType(); tagger = std::make_unique(future_part, MergeTreeDataMergerMutator::estimateNeededDiskSpace({part}), *this, metadata_snapshot, true); - return std::make_shared(future_part, std::move(tagger), commands); + return std::make_shared(future_part, std::move(tagger), commands, txn); } } return {}; } + bool StorageMergeTree::scheduleDataProcessingJob(BackgroundJobsAssignee & assignee) //-V657 { if (shutdown_called) @@ -1014,13 +1128,22 @@ bool StorageMergeTree::scheduleDataProcessingJob(BackgroundJobsAssignee & assign auto share_lock = lockForShare(RWLockImpl::NO_QUERY, getSettings()->lock_acquire_timeout_for_background_operations); + MergeTreeTransactionHolder transaction_for_merge; + MergeTreeTransactionPtr txn; + if (transactions_enabled.load(std::memory_order_relaxed)) + { + /// TODO Transactions: avoid beginning transaction if there is nothing to merge. 
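
As that TODO admits, scheduleDataProcessingJob() begins a transaction before it knows whether a merge will even be selected. The holder it passes to the task is an RAII wrapper; the sketch below captures only the autocommit idea (the real MergeTreeTransactionHolder also supports transactions owned by a query context, and its commit path can fail):

    #include <memory>
    #include <utility>

    struct MergeTreeTransaction { /* tid, snapshot, changed parts ... */ };
    using MergeTreeTransactionPtr = std::shared_ptr<MergeTreeTransaction>;

    struct TransactionLogLike
    {
        static void commit(const MergeTreeTransactionPtr &) { /* assign CSN, persist */ }
        static void rollback(const MergeTreeTransactionPtr &) { /* undo part changes */ }
    };

    class TransactionHolderLike
    {
    public:
        TransactionHolderLike() = default;
        TransactionHolderLike(MergeTreeTransactionPtr txn_, bool autocommit_)
            : txn(std::move(txn_)), autocommit(autocommit_) {}

        TransactionHolderLike(TransactionHolderLike && rhs) noexcept
            : txn(std::move(rhs.txn)), autocommit(rhs.autocommit) {}

        ~TransactionHolderLike()
        {
            if (!txn)
                return;
            /// Autocommit: a background task owns its transaction outright, so
            /// the last holder standing commits it; a holder abandoned before
            /// completion rolls the transaction back instead.
            autocommit ? TransactionLogLike::commit(txn)
                       : TransactionLogLike::rollback(txn);
        }

    private:
        MergeTreeTransactionPtr txn;
        bool autocommit = false;
    };

Moving the holder into the task (setCurrentTransaction below) ties the transaction's lifetime to the merge itself rather than to the scheduling call.
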
+ txn = TransactionLog::instance().beginTransaction(); + transaction_for_merge = MergeTreeTransactionHolder{txn, /* autocommit = */ true}; + } + bool has_mutations = false; { std::unique_lock lock(currently_processing_in_background_mutex); if (merger_mutator.merges_blocker.isCancelled()) return false; - merge_entry = selectPartsToMerge(metadata_snapshot, false, {}, false, nullptr, share_lock, lock); + merge_entry = selectPartsToMerge(metadata_snapshot, false, {}, false, nullptr, share_lock, lock, txn); if (!merge_entry) mutate_entry = selectPartsToMutate(metadata_snapshot, nullptr, share_lock, lock, were_some_mutations_skipped); @@ -1038,6 +1161,7 @@ bool StorageMergeTree::scheduleDataProcessingJob(BackgroundJobsAssignee & assign if (merge_entry) { auto task = std::make_shared(*this, metadata_snapshot, false, Names{}, merge_entry, share_lock, common_assignee_trigger); + task->setCurrentTransaction(std::move(transaction_for_merge), std::move(txn)); assignee.scheduleMergeMutateTask(task); return true; } @@ -1112,54 +1236,52 @@ UInt64 StorageMergeTree::getCurrentMutationVersion( size_t StorageMergeTree::clearOldMutations(bool truncate) { - const auto settings = getSettings(); - if (!truncate && !settings->finished_mutations_to_keep) - return 0; + size_t finished_mutations_to_keep = truncate ? 0 : getSettings()->finished_mutations_to_keep; std::vector mutations_to_delete; { std::unique_lock lock(currently_processing_in_background_mutex); - if (!truncate && current_mutations_by_version.size() <= settings->finished_mutations_to_keep) + if (current_mutations_by_version.size() <= finished_mutations_to_keep) return 0; auto end_it = current_mutations_by_version.end(); auto begin_it = current_mutations_by_version.begin(); - size_t to_delete_count = std::distance(begin_it, end_it); - if (!truncate) + if (std::optional min_version = getMinPartDataVersion()) + end_it = current_mutations_by_version.upper_bound(*min_version); + + size_t done_count = std::distance(begin_it, end_it); + if (done_count <= finished_mutations_to_keep) + return 0; + + auto part_versions_with_names = getSortedPartVersionsWithNames(lock); + + for (auto it = begin_it; it != end_it; ++it) { - if (std::optional min_version = getMinPartDataVersion()) - end_it = current_mutations_by_version.upper_bound(*min_version); + const PartVersionWithName needle{static_cast(it->first), ""}; + auto versions_it = std::lower_bound( + part_versions_with_names.begin(), part_versions_with_names.end(), needle); - size_t done_count = std::distance(begin_it, end_it); - if (done_count <= settings->finished_mutations_to_keep) - return 0; - - auto part_versions_with_names = getSortedPartVersionsWithNames(lock); - - for (auto it = begin_it; it != end_it; ++it) + if (versions_it != part_versions_with_names.begin() || !it->second.tid.isPrehistoric()) { - const PartVersionWithName needle{static_cast(it->first), ""}; - auto versions_it = std::lower_bound( - part_versions_with_names.begin(), part_versions_with_names.end(), needle); - - if (versions_it != part_versions_with_names.begin()) - { - done_count = std::distance(begin_it, it); - break; - } + done_count = std::distance(begin_it, it); + break; } - - if (done_count <= settings->finished_mutations_to_keep) - return 0; - - to_delete_count = done_count - settings->finished_mutations_to_keep; } + if (done_count <= finished_mutations_to_keep) + return 0; + + size_t to_delete_count = done_count - finished_mutations_to_keep; + auto it = begin_it; for (size_t i = 0; i < to_delete_count; ++i) { + const auto & tid = 
it->second.tid; + if (!tid.isPrehistoric() && !TransactionLog::getCSN(tid)) + throw Exception(ErrorCodes::LOGICAL_ERROR, "Cannot remove mutation {}, because transaction {} is not committed. It's a bug", + it->first, tid); mutations_to_delete.push_back(std::move(it->second)); it = current_mutations_by_version.erase(it); } @@ -1178,7 +1300,7 @@ std::vector StorageMergeTree::getSortedPa std::unique_lock & currently_processing_in_background_mutex_lock) const { std::vector part_versions_with_names; - auto data_parts = getDataPartsVector(); + auto data_parts = getDataPartsVectorForInternalUsage(); part_versions_with_names.reserve(data_parts.size()); for (const auto & part : data_parts) part_versions_with_names.emplace_back(PartVersionWithName{ @@ -1206,10 +1328,12 @@ bool StorageMergeTree::optimize( LOG_DEBUG(log, "DEDUPLICATE BY ('{}')", fmt::join(deduplicate_by_columns, "', '")); } + auto txn = local_context->getCurrentTransaction(); + String disable_reason; if (!partition && final) { - DataPartsVector data_parts = getDataPartsVector(); + DataPartsVector data_parts = getVisibleDataPartsVector(local_context); std::unordered_set partition_ids; for (const DataPartPtr & part : data_parts) @@ -1223,6 +1347,7 @@ bool StorageMergeTree::optimize( true, deduplicate, deduplicate_by_columns, + txn, &disable_reason, local_context->getSettingsRef().optimize_skip_merged_partitions)) { @@ -1249,6 +1374,7 @@ bool StorageMergeTree::optimize( final, deduplicate, deduplicate_by_columns, + txn, &disable_reason, local_context->getSettingsRef().optimize_skip_merged_partitions)) { @@ -1291,7 +1417,7 @@ ActionLock StorageMergeTree::stopMergesAndWait() } -MergeTreeDataPartPtr StorageMergeTree::outdatePart(const String & part_name, bool force) +MergeTreeDataPartPtr StorageMergeTree::outdatePart(MergeTreeTransaction * txn, const String & part_name, bool force) { if (force) @@ -1301,7 +1427,7 @@ MergeTreeDataPartPtr StorageMergeTree::outdatePart(const String & part_name, boo auto part = getPartIfExists(part_name, {MergeTreeDataPartState::Active}); if (!part) throw Exception("Part " + part_name + " not found, won't try to drop it.", ErrorCodes::NO_SUCH_DATA_PART); - removePartsFromWorkingSet({part}, true); + removePartsFromWorkingSet(txn, {part}, true); return part; } else @@ -1320,22 +1446,22 @@ MergeTreeDataPartPtr StorageMergeTree::outdatePart(const String & part_name, boo if (currently_merging_mutating_parts.count(part)) return nullptr; - removePartsFromWorkingSet({part}, true); + removePartsFromWorkingSet(txn, {part}, true); return part; } } void StorageMergeTree::dropPartNoWaitNoThrow(const String & part_name) { - if (auto part = outdatePart(part_name, /*force=*/ false)) + if (auto part = outdatePart(NO_TRANSACTION_RAW, part_name, /*force=*/ false)) dropPartsImpl({part}, /*detach=*/ false); /// Else nothing to do, part was removed in some different way } -void StorageMergeTree::dropPart(const String & part_name, bool detach, ContextPtr /*query_context*/) +void StorageMergeTree::dropPart(const String & part_name, bool detach, ContextPtr query_context) { - if (auto part = outdatePart(part_name, /*force=*/ true)) + if (auto part = outdatePart(query_context->getCurrentTransaction().get(), part_name, /*force=*/ true)) dropPartsImpl({part}, detach); } @@ -1349,14 +1475,14 @@ void StorageMergeTree::dropPartition(const ASTPtr & partition, bool detach, Cont auto merge_blocker = stopMergesAndWait(); const auto * partition_ast = partition->as(); if (partition_ast && partition_ast->all) - parts_to_remove = getDataPartsVector(); 
+ parts_to_remove = getVisibleDataPartsVector(local_context); else { String partition_id = getPartitionIDFromQuery(partition, local_context); - parts_to_remove = getDataPartsVectorInPartition(MergeTreeDataPartState::Active, partition_id); + parts_to_remove = getVisibleDataPartsVectorInPartition(local_context, partition_id); } /// TODO should we throw an exception if parts_to_remove is empty? - removePartsFromWorkingSet(parts_to_remove, true); + removePartsFromWorkingSet(local_context->getCurrentTransaction().get(), parts_to_remove, true); } dropPartsImpl(std::move(parts_to_remove), detach); @@ -1405,8 +1531,14 @@ PartitionCommandsResultInfo StorageMergeTree::attachPartition( for (size_t i = 0; i < loaded_parts.size(); ++i) { LOG_INFO(log, "Attaching part {} from {}", loaded_parts[i]->name, renamed_parts.old_and_new_names[i].new_name); + /// We should write version metadata on part creation to distinguish it from parts that were created without a transaction. + auto txn = local_context->getCurrentTransaction(); + TransactionID tid = txn ? txn->tid : Tx::PrehistoricTID; + loaded_parts[i]->version.setCreationTID(tid, nullptr); + loaded_parts[i]->storeVersionMetadata(); + String old_name = renamed_parts.old_and_new_names[i].old_name; - renameTempPartAndAdd(loaded_parts[i], &increment); + renameTempPartAndAdd(loaded_parts[i], local_context->getCurrentTransaction().get(), &increment); renamed_parts.old_and_new_names[i].old_name.clear(); results.push_back(PartitionCommandResultInfo{ @@ -1435,7 +1567,7 @@ void StorageMergeTree::replacePartitionFrom(const StoragePtr & source_table, con MergeTreeData & src_data = checkStructureAndGetMergeTreeData(source_table, source_metadata_snapshot, my_metadata_snapshot); String partition_id = getPartitionIDFromQuery(partition, local_context); - DataPartsVector src_parts = src_data.getDataPartsVectorInPartition(MergeTreeDataPartState::Active, partition_id); + DataPartsVector src_parts = src_data.getVisibleDataPartsVectorInPartition(local_context, partition_id); MutableDataPartsVector dst_parts; static const String TMP_PREFIX = "tmp_replace_from_"; @@ -1451,7 +1583,8 @@ void StorageMergeTree::replacePartitionFrom(const StoragePtr & source_table, con Int64 temp_index = insert_increment.get(); MergeTreePartInfo dst_part_info(partition_id, temp_index, temp_index, src_part->info.level); - dst_parts.emplace_back(cloneAndLoadDataPartOnSameDisk(src_part, TMP_PREFIX, dst_part_info, my_metadata_snapshot)); + auto dst_part = cloneAndLoadDataPartOnSameDisk(src_part, TMP_PREFIX, dst_part_info, my_metadata_snapshot, local_context->getCurrentTransaction()); + dst_parts.emplace_back(std::move(dst_part)); } /// ATTACH empty part set @@ -1473,19 +1606,19 @@ void StorageMergeTree::replacePartitionFrom(const StoragePtr & source_table, con { /// Here we use the transaction just like RAII since rare errors in renameTempPartAndReplace() are possible /// and we should be able to roll back already added (Precommitted) parts - Transaction transaction(*this); + Transaction transaction(*this, local_context->getCurrentTransaction().get()); auto data_parts_lock = lockParts(); /// Populate transaction for (MutableDataPartPtr & part : dst_parts) - renameTempPartAndReplace(part, &increment, &transaction, data_parts_lock); + renameTempPartAndReplace(part, local_context->getCurrentTransaction().get(), &increment, &transaction, data_parts_lock); transaction.commit(&data_parts_lock); /// If it is REPLACE (not ATTACH), remove all parts whose max_block_number is less than the min_block_number of the first new
block if (replace) - removePartsInRangeFromWorkingSet(drop_range, true, data_parts_lock); + removePartsInRangeFromWorkingSet(local_context->getCurrentTransaction().get(), drop_range, true, data_parts_lock); } PartLog::addNewParts(getContext(), dst_parts, watch.elapsed()); @@ -1520,7 +1653,7 @@ void StorageMergeTree::movePartitionToTable(const StoragePtr & dest_table, const MergeTreeData & src_data = dest_table_storage->checkStructureAndGetMergeTreeData(*this, metadata_snapshot, dest_metadata_snapshot); String partition_id = getPartitionIDFromQuery(partition, local_context); - DataPartsVector src_parts = src_data.getDataPartsVectorInPartition(MergeTreeDataPartState::Active, partition_id); + DataPartsVector src_parts = src_data.getVisibleDataPartsVectorInPartition(local_context, partition_id); MutableDataPartsVector dst_parts; static const String TMP_PREFIX = "tmp_move_from_"; @@ -1536,7 +1669,8 @@ void StorageMergeTree::movePartitionToTable(const StoragePtr & dest_table, const Int64 temp_index = insert_increment.get(); MergeTreePartInfo dst_part_info(partition_id, temp_index, temp_index, src_part->info.level); - dst_parts.emplace_back(dest_table_storage->cloneAndLoadDataPartOnSameDisk(src_part, TMP_PREFIX, dst_part_info, dest_metadata_snapshot)); + auto dst_part = dest_table_storage->cloneAndLoadDataPartOnSameDisk(src_part, TMP_PREFIX, dst_part_info, dest_metadata_snapshot, local_context->getCurrentTransaction()); + dst_parts.emplace_back(std::move(dst_part)); } /// empty part set @@ -1547,7 +1681,7 @@ void StorageMergeTree::movePartitionToTable(const StoragePtr & dest_table, const try { { - Transaction transaction(*dest_table_storage); + Transaction transaction(*dest_table_storage, local_context->getCurrentTransaction().get()); auto src_data_parts_lock = lockParts(); auto dest_data_parts_lock = dest_table_storage->lockParts(); @@ -1556,9 +1690,9 @@ void StorageMergeTree::movePartitionToTable(const StoragePtr & dest_table, const DataPartsLock lock(mutex); for (MutableDataPartPtr & part : dst_parts) - dest_table_storage->renameTempPartAndReplace(part, &dest_table_storage->increment, &transaction, lock); + dest_table_storage->renameTempPartAndReplace(part, local_context->getCurrentTransaction().get(), &dest_table_storage->increment, &transaction, lock); - removePartsFromWorkingSet(src_parts, true, lock); + removePartsFromWorkingSet(local_context->getCurrentTransaction().get(), src_parts, true, lock); transaction.commit(&lock); } @@ -1600,10 +1734,10 @@ CheckResults StorageMergeTree::checkData(const ASTPtr & query, ContextPtr local_ if (const auto & check_query = query->as(); check_query.partition) { String partition_id = getPartitionIDFromQuery(check_query.partition, local_context); - data_parts = getDataPartsVectorInPartition(MergeTreeDataPartState::Active, partition_id); + data_parts = getVisibleDataPartsVectorInPartition(local_context, partition_id); } else - data_parts = getDataPartsVector(); + data_parts = getVisibleDataPartsVector(local_context); for (auto & part : data_parts) { diff --git a/src/Storages/StorageMergeTree.h b/src/Storages/StorageMergeTree.h index a1fc310d912..74fb954bb6d 100644 --- a/src/Storages/StorageMergeTree.h +++ b/src/Storages/StorageMergeTree.h @@ -41,6 +41,8 @@ public: bool supportsIndexForIn() const override { return true; } + bool supportsTransactions() const override { return true; } + Pipe read( const Names & column_names, const StorageSnapshotPtr & storage_snapshot, @@ -152,19 +154,29 @@ private: * If aggressive - when selecting parts, don't take into account
their size ratio and novelty (used for the OPTIMIZE query). * Returns true if merge is finished successfully. */ - bool merge(bool aggressive, const String & partition_id, bool final, bool deduplicate, const Names & deduplicate_by_columns, String * out_disable_reason = nullptr, bool optimize_skip_merged_partitions = false); + bool merge( + bool aggressive, + const String & partition_id, + bool final, bool deduplicate, + const Names & deduplicate_by_columns, + const MergeTreeTransactionPtr & txn, + String * out_disable_reason = nullptr, + bool optimize_skip_merged_partitions = false); /// Make part state outdated and queue it to remove without timeout /// If force, then stop merges and block them until part state becomes outdated. Throw an exception if the part doesn't exist /// If not force, then take merges selector and check that part is not participating in background operations. - MergeTreeDataPartPtr outdatePart(const String & part_name, bool force); + MergeTreeDataPartPtr outdatePart(MergeTreeTransaction * txn, const String & part_name, bool force); ActionLock stopMergesAndWait(); /// Allocate block number for new mutation, write mutation to disk /// and into in-memory structures. Wake up merge-mutation task. - Int64 startMutation(const MutationCommands & commands, String & mutation_file_name); + Int64 startMutation(const MutationCommands & commands, ContextPtr query_context); /// Wait until the mutation with the given version finishes for all parts - void waitForMutation(Int64 version, const String & file_name); + void waitForMutation(Int64 version); + void waitForMutation(const String & mutation_id) override; + void setMutationCSN(const String & mutation_id, CSN csn) override; + friend struct CurrentlyMergingPartsTagger; @@ -187,6 +199,7 @@ private: String * disable_reason, TableLockHolder & table_lock_holder, std::unique_lock<std::mutex> & lock, + const MergeTreeTransactionPtr & txn, bool optimize_skip_merged_partitions = false, SelectPartsDecision * select_decision_out = nullptr); @@ -236,7 +249,6 @@ private: std::unique_ptr<MergeTreeSettings> getDefaultSettings() const override; - friend class MergeTreeProjectionBlockOutputStream; friend class MergeTreeSink; friend class MergeTreeData; friend class MergePlainMergeTreeTask; diff --git a/src/Storages/StorageProxy.h b/src/Storages/StorageProxy.h index d5af81ced3d..b1eb190bd1d 100644 --- a/src/Storages/StorageProxy.h +++ b/src/Storages/StorageProxy.h @@ -41,7 +41,7 @@ public: /// TODO: Find a way to support projections for StorageProxy info.ignore_projections = true; const auto & nested_metadata = getNested()->getInMemoryMetadataPtr(); - return getNested()->getQueryProcessingStage(context, to_stage, getNested()->getStorageSnapshot(nested_metadata), info); + return getNested()->getQueryProcessingStage(context, to_stage, getNested()->getStorageSnapshot(nested_metadata, context), info); } Pipe watch( @@ -149,7 +149,6 @@ public: CheckResults checkData(const ASTPtr & query , ContextPtr context) override { return getNested()->checkData(query, context); } void checkTableCanBeDropped() const override { getNested()->checkTableCanBeDropped(); } - void checkPartitionCanBeDropped(const ASTPtr & partition) override { getNested()->checkPartitionCanBeDropped(partition); } bool storesDataOnDisk() const override { return getNested()->storesDataOnDisk(); } Strings getDataPaths() const override { return getNested()->getDataPaths(); } StoragePolicyPtr getStoragePolicy() const override { return getNested()->getStoragePolicy(); } diff --git a/src/Storages/StorageReplicatedMergeTree.cpp
b/src/Storages/StorageReplicatedMergeTree.cpp index 39840f91325..66a5baf555b 100644 --- a/src/Storages/StorageReplicatedMergeTree.cpp +++ b/src/Storages/StorageReplicatedMergeTree.cpp @@ -375,7 +375,7 @@ StorageReplicatedMergeTree::StorageReplicatedMergeTree( if (!attach) { - if (!getDataParts().empty()) + if (!getDataPartsForInternalUsage().empty()) throw Exception("Data directory for table already contains data parts" " - probably it was unclean DROP table or manual intervention." " You must either clear directory by hand or use ATTACH TABLE" @@ -1589,9 +1589,10 @@ bool StorageReplicatedMergeTree::executeLogEntry(LogEntry & entry) { LOG_TRACE(log, "Found valid local part for {}, preparing the transaction", part->name); - Transaction transaction(*this); + Transaction transaction(*this, NO_TRANSACTION_RAW); - renameTempPartAndReplace(part, nullptr, &transaction); + part->version.setCreationTID(Tx::PrehistoricTID, nullptr); + renameTempPartAndReplace(part, NO_TRANSACTION_RAW, nullptr, &transaction); checkPartChecksumsAndCommit(transaction, part); writePartLog(PartLogElement::Type::NEW_PART, {}, 0 /** log entry is fake so we don't measure the time */, @@ -1885,7 +1886,7 @@ void StorageReplicatedMergeTree::executeDropRange(const LogEntry & entry) DataPartsVector parts_to_remove; { auto data_parts_lock = lockParts(); - parts_to_remove = removePartsInRangeFromWorkingSet(drop_range_info, true, data_parts_lock); + parts_to_remove = removePartsInRangeFromWorkingSet(NO_TRANSACTION_RAW, drop_range_info, true, data_parts_lock); if (parts_to_remove.empty()) { if (!drop_range_info.isFakeDropRangePart()) @@ -2018,7 +2019,7 @@ bool StorageReplicatedMergeTree::executeReplaceRange(const LogEntry & entry) if (parts_to_add.empty() && replace) { - parts_to_remove = removePartsInRangeFromWorkingSet(drop_range, true, data_parts_lock); + parts_to_remove = removePartsInRangeFromWorkingSet(NO_TRANSACTION_RAW, drop_range, true, data_parts_lock); String parts_to_remove_str; for (const auto & part : parts_to_remove) { @@ -2198,7 +2199,7 @@ bool StorageReplicatedMergeTree::executeReplaceRange(const LogEntry & entry) throw Exception("Checksums of " + part_desc->src_table_part->name + " is suddenly changed", ErrorCodes::UNFINISHED); part_desc->res_part = cloneAndLoadDataPartOnSameDisk( - part_desc->src_table_part, TMP_PREFIX + "clone_", part_desc->new_part_info, metadata_snapshot); + part_desc->src_table_part, TMP_PREFIX + "clone_", part_desc->new_part_info, metadata_snapshot, NO_TRANSACTION_PTR); } else if (!part_desc->replica.empty()) { @@ -2238,12 +2239,12 @@ bool StorageReplicatedMergeTree::executeReplaceRange(const LogEntry & entry) { /// Commit parts auto zookeeper = getZooKeeper(); - Transaction transaction(*this); + Transaction transaction(*this, NO_TRANSACTION_RAW); Coordination::Requests ops; for (PartDescriptionPtr & part_desc : final_parts) { - renameTempPartAndReplace(part_desc->res_part, nullptr, &transaction); + renameTempPartAndReplace(part_desc->res_part, NO_TRANSACTION_RAW, nullptr, &transaction); getCommitPartOps(ops, part_desc->res_part); } @@ -2256,7 +2257,7 @@ bool StorageReplicatedMergeTree::executeReplaceRange(const LogEntry & entry) transaction.commit(&data_parts_lock); if (replace) { - parts_to_remove = removePartsInRangeFromWorkingSet(drop_range, true, data_parts_lock); + parts_to_remove = removePartsInRangeFromWorkingSet(NO_TRANSACTION_RAW, drop_range, true, data_parts_lock); String parts_to_remove_str; for (const auto & part : parts_to_remove) { @@ -2510,7 +2511,7 @@ void 
StorageReplicatedMergeTree::cloneReplica(const String & source_replica, Coo removePartsFromZooKeeperWithRetries(parts_to_remove_from_zk); - auto local_active_parts = getDataParts(); + auto local_active_parts = getDataPartsForInternalUsage(); DataPartsVector parts_to_remove_from_working_set; @@ -2534,7 +2535,7 @@ void StorageReplicatedMergeTree::cloneReplica(const String & source_replica, Coo } } - removePartsFromWorkingSet(parts_to_remove_from_working_set, true); + removePartsFromWorkingSet(NO_TRANSACTION_RAW, parts_to_remove_from_working_set, true); std::unordered_set created_get_parts; @@ -3119,7 +3120,7 @@ void StorageReplicatedMergeTree::mergeSelectingTask() future_merged_part->uuid = UUIDHelpers::generateV4(); if (max_source_parts_size_for_merge > 0 && - merger_mutator.selectPartsToMerge(future_merged_part, false, max_source_parts_size_for_merge, merge_pred, merge_with_ttl_allowed, nullptr) == SelectPartsDecision::SELECTED) + merger_mutator.selectPartsToMerge(future_merged_part, false, max_source_parts_size_for_merge, merge_pred, merge_with_ttl_allowed, NO_TRANSACTION_PTR, nullptr) == SelectPartsDecision::SELECTED) { create_result = createLogEntryToMergeParts( zookeeper, @@ -3138,7 +3139,7 @@ void StorageReplicatedMergeTree::mergeSelectingTask() && merges_and_mutations_queued.mutations < storage_settings_ptr->max_replicated_mutations_in_queue) { /// Choose a part to mutate. - DataPartsVector data_parts = getDataPartsVector(); + DataPartsVector data_parts = getDataPartsVectorForInternalUsage(); for (const auto & part : data_parts) { if (part->getBytesOnDisk() > max_source_part_size_for_mutation) @@ -3397,7 +3398,7 @@ void StorageReplicatedMergeTree::removePartAndEnqueueFetch(const String & part_n /// It's quite dangerous, so clone covered parts to detached. 
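
One pattern worth calling out in these StorageReplicatedMergeTree hunks: every call site that gained a transaction parameter passes NO_TRANSACTION_RAW or NO_TRANSACTION_PTR. Replication does not participate in the new transactions yet, so replicated parts stay on the prehistoric path, and the sentinels let the shared MergeTreeData signatures stay uniform. A plausible shape for them (the actual definitions live in a part of the PR not shown here):

    #include <memory>

    struct MergeTreeTransaction;
    using MergeTreeTransactionPtr = std::shared_ptr<MergeTreeTransaction>;

    /// Null objects: "no transaction" flows through the same signatures as a
    /// real transaction, so callers never need a parallel set of overloads.
    static const MergeTreeTransactionPtr NO_TRANSACTION_PTR = nullptr;
    static MergeTreeTransaction * const NO_TRANSACTION_RAW = nullptr;

Callees then treat a null transaction as the prehistoric TID, which is what setCreationTID(Tx::PrehistoricTID, nullptr) makes explicit above.
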
auto broken_part_info = MergeTreePartInfo::fromPartName(part_name, format_version); - auto partition_range = getDataPartsVectorInPartition(MergeTreeDataPartState::Active, broken_part_info.partition_id); + auto partition_range = getVisibleDataPartsVectorInPartition(getContext(), broken_part_info.partition_id); for (const auto & part : partition_range) { if (!broken_part_info.contains(part->info)) @@ -3930,7 +3931,7 @@ bool StorageReplicatedMergeTree::fetchPart(const String & part_name, const Stora { get_part = [&, part_to_clone]() { - return cloneAndLoadDataPartOnSameDisk(part_to_clone, "tmp_clone_", part_info, metadata_snapshot); + return cloneAndLoadDataPartOnSameDisk(part_to_clone, "tmp_clone_", part_info, metadata_snapshot, NO_TRANSACTION_PTR); }; } else @@ -3973,8 +3974,8 @@ bool StorageReplicatedMergeTree::fetchPart(const String & part_name, const Stora if (!to_detached) { - Transaction transaction(*this); - renameTempPartAndReplace(part, nullptr, &transaction); + Transaction transaction(*this, NO_TRANSACTION_RAW); + renameTempPartAndReplace(part, NO_TRANSACTION_RAW, nullptr, &transaction); replaced_parts = checkPartChecksumsAndCommit(transaction, part); @@ -4254,7 +4255,7 @@ ReplicatedMergeTreeQuorumAddedParts::PartitionIdToMaxBlock StorageReplicatedMerg { ReplicatedMergeTreeQuorumAddedParts::PartitionIdToMaxBlock max_added_blocks; - for (const auto & data_part : getDataParts()) + for (const auto & data_part : getDataPartsForInternalUsage()) { max_added_blocks[data_part->info.partition_id] = std::max(max_added_blocks[data_part->info.partition_id], data_part->info.max_block); @@ -4368,6 +4369,7 @@ void StorageReplicatedMergeTree::foreachActiveParts(Func && func, bool select_se max_added_blocks = getMaxAddedBlocks(); auto lock = lockParts(); + /// TODO Transactions: should we count visible parts only? for (const auto & part : getDataPartsStateRange(DataPartState::Active)) { if (part->isEmpty()) @@ -4485,12 +4487,12 @@ bool StorageReplicatedMergeTree::optimize( { select_decision = merger_mutator.selectPartsToMerge( future_merged_part, /* aggressive */ true, storage_settings_ptr->max_bytes_to_merge_at_max_space_in_pool, - can_merge, /* merge_with_ttl_allowed */ false, &disable_reason); + can_merge, /* merge_with_ttl_allowed */ false, NO_TRANSACTION_PTR, &disable_reason); } else { select_decision = merger_mutator.selectAllPartsToMergeWithinPartition( - future_merged_part, can_merge, partition_id, final, metadata_snapshot, + future_merged_part, can_merge, partition_id, final, metadata_snapshot, NO_TRANSACTION_PTR, &disable_reason, query_context->getSettingsRef().optimize_skip_merged_partitions); } @@ -4539,7 +4541,7 @@ bool StorageReplicatedMergeTree::optimize( bool assigned = false; if (!partition && final) { - DataPartsVector data_parts = getDataPartsVector(); + DataPartsVector data_parts = getVisibleDataPartsVector(query_context); std::unordered_set partition_ids; for (const DataPartPtr & part : data_parts) @@ -6330,7 +6332,7 @@ void StorageReplicatedMergeTree::replacePartitionFrom( String partition_id = getPartitionIDFromQuery(partition, query_context); /// NOTE: Some covered parts may be missing in src_all_parts if corresponding log entries are not executed yet. 
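
The renames in this file and in StorageMergeTree encode a policy split: user-facing operations (SELECT, OPTIMIZE, ALTER ... PARTITION, CHECK TABLE) now go through getVisibleDataPartsVector(query_context) and friends, which filter by the reader's snapshot, while internal bookkeeping (mutation selection, replica cloning, broken-disk scans) uses getDataPartsVectorForInternalUsage() and keeps seeing every Active part. Roughly, under the simplified visibility rule from the earlier sketch:

    #include <algorithm>
    #include <cstdint>
    #include <iterator>
    #include <memory>
    #include <vector>

    struct PartLike
    {
        bool active = true;
        uint64_t creation_csn = 0;  // 0 = creating transaction not committed yet
        uint64_t removal_csn = 0;   // 0 = not removed

        bool visibleAt(uint64_t snapshot) const
        {
            return creation_csn && creation_csn <= snapshot
                && (removal_csn == 0 || snapshot < removal_csn);
        }
    };

    using PartPtr = std::shared_ptr<const PartLike>;
    using Parts = std::vector<PartPtr>;

    /// Internal usage: every Active part, transactions ignored.
    Parts partsForInternalUsage(const Parts & all)
    {
        Parts res;
        std::copy_if(all.begin(), all.end(), std::back_inserter(res),
                     [](const PartPtr & p) { return p->active; });
        return res;
    }

    /// User-facing reads additionally apply the reader's snapshot.
    Parts visibleParts(const Parts & all, uint64_t snapshot)
    {
        Parts res;
        std::copy_if(all.begin(), all.end(), std::back_inserter(res),
                     [snapshot](const PartPtr & p) { return p->active && p->visibleAt(snapshot); });
        return res;
    }
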
- DataPartsVector src_all_parts = src_data.getDataPartsVectorInPartition(MergeTreeDataPartState::Active, partition_id); + DataPartsVector src_all_parts = src_data.getVisibleDataPartsVectorInPartition(query_context, partition_id); LOG_DEBUG(log, "Cloning {} parts", src_all_parts.size()); @@ -6402,7 +6404,7 @@ void StorageReplicatedMergeTree::replacePartitionFrom( UInt64 index = lock->getNumber(); MergeTreePartInfo dst_part_info(partition_id, index, index, src_part->info.level); - auto dst_part = cloneAndLoadDataPartOnSameDisk(src_part, TMP_PREFIX, dst_part_info, metadata_snapshot); + auto dst_part = cloneAndLoadDataPartOnSameDisk(src_part, TMP_PREFIX, dst_part_info, metadata_snapshot, NO_TRANSACTION_PTR); src_parts.emplace_back(src_part); dst_parts.emplace_back(dst_part); @@ -6458,12 +6460,12 @@ void StorageReplicatedMergeTree::replacePartitionFrom( ops.emplace_back(zkutil::makeSetRequest(fs::path(zookeeper_path) / "log", "", -1)); ops.emplace_back(zkutil::makeCreateRequest(fs::path(zookeeper_path) / "log/log-", entry.toString(), zkutil::CreateMode::PersistentSequential)); - Transaction transaction(*this); + Transaction transaction(*this, NO_TRANSACTION_RAW); { auto data_parts_lock = lockParts(); for (MutableDataPartPtr & part : dst_parts) - renameTempPartAndReplace(part, nullptr, &transaction, data_parts_lock); + renameTempPartAndReplace(part, query_context->getCurrentTransaction().get(), nullptr, &transaction, data_parts_lock); } Coordination::Error code = zookeeper->tryMulti(ops, op_results); @@ -6483,10 +6485,9 @@ void StorageReplicatedMergeTree::replacePartitionFrom( { auto data_parts_lock = lockParts(); - transaction.commit(&data_parts_lock); if (replace) - parts_to_remove = removePartsInRangeFromWorkingSet(drop_range, true, data_parts_lock); + parts_to_remove = removePartsInRangeFromWorkingSet(NO_TRANSACTION_RAW, drop_range, true, data_parts_lock); } PartLog::addNewParts(getContext(), dst_parts, watch.elapsed()); @@ -6614,7 +6615,7 @@ void StorageReplicatedMergeTree::movePartitionToTable(const StoragePtr & dest_ta UInt64 index = lock->getNumber(); MergeTreePartInfo dst_part_info(partition_id, index, index, src_part->info.level); - auto dst_part = dest_table_storage->cloneAndLoadDataPartOnSameDisk(src_part, TMP_PREFIX, dst_part_info, dest_metadata_snapshot); + auto dst_part = dest_table_storage->cloneAndLoadDataPartOnSameDisk(src_part, TMP_PREFIX, dst_part_info, dest_metadata_snapshot, NO_TRANSACTION_PTR); src_parts.emplace_back(src_part); dst_parts.emplace_back(dst_part); @@ -6676,7 +6677,7 @@ void StorageReplicatedMergeTree::movePartitionToTable(const StoragePtr & dest_ta entry.toString(), zkutil::CreateMode::PersistentSequential)); { - Transaction transaction(*dest_table_storage); + Transaction transaction(*dest_table_storage, NO_TRANSACTION_RAW); auto src_data_parts_lock = lockParts(); auto dest_data_parts_lock = dest_table_storage->lockParts(); @@ -6685,7 +6686,7 @@ void StorageReplicatedMergeTree::movePartitionToTable(const StoragePtr & dest_ta DataPartsLock lock(mutex); for (MutableDataPartPtr & part : dst_parts) - dest_table_storage->renameTempPartAndReplace(part, nullptr, &transaction, lock); + dest_table_storage->renameTempPartAndReplace(part, query_context->getCurrentTransaction().get(), nullptr, &transaction, lock); Coordination::Error code = zookeeper->tryMulti(ops, op_results); if (code == Coordination::Error::ZBADVERSION) @@ -6693,7 +6694,7 @@ void StorageReplicatedMergeTree::movePartitionToTable(const StoragePtr & dest_ta else zkutil::KeeperMultiException::check(code, 
ops, op_results); - parts_to_remove = removePartsInRangeFromWorkingSet(drop_range, true, lock); + parts_to_remove = removePartsInRangeFromWorkingSet(NO_TRANSACTION_RAW, drop_range, true, lock); transaction.commit(&lock); } @@ -7153,10 +7154,10 @@ CheckResults StorageReplicatedMergeTree::checkData(const ASTPtr & query, Context if (const auto & check_query = query->as(); check_query.partition) { String partition_id = getPartitionIDFromQuery(check_query.partition, local_context); - data_parts = getDataPartsVectorInPartition(MergeTreeDataPartState::Active, partition_id); + data_parts = getVisibleDataPartsVectorInPartition(local_context, partition_id); } else - data_parts = getDataPartsVector(); + data_parts = getVisibleDataPartsVector(local_context); for (auto & part : data_parts) { @@ -7192,7 +7193,7 @@ void StorageReplicatedMergeTree::checkBrokenDisks() LOG_INFO(log, "Scanning parts to recover on broken disk {} with path {}", disk_ptr->getName(), disk_ptr->getPath()); if (!parts) - parts = std::make_unique(getDataPartsVector()); + parts = std::make_unique(getDataPartsVectorForInternalUsage()); for (auto & part : *parts) { @@ -7730,7 +7731,7 @@ bool StorageReplicatedMergeTree::createEmptyPartInsteadOfLost(zkutil::ZooKeeperP const auto & index_factory = MergeTreeIndexFactory::instance(); MergedBlockOutputStream out(new_data_part, metadata_snapshot, columns, - index_factory.getMany(metadata_snapshot->getSecondaryIndices()), compression_codec); + index_factory.getMany(metadata_snapshot->getSecondaryIndices()), compression_codec, NO_TRANSACTION_PTR); bool sync_on_insert = settings->fsync_after_insert; @@ -7742,8 +7743,8 @@ bool StorageReplicatedMergeTree::createEmptyPartInsteadOfLost(zkutil::ZooKeeperP try { - MergeTreeData::Transaction transaction(*this); - auto replaced_parts = renameTempPartAndReplace(new_data_part, nullptr, &transaction); + MergeTreeData::Transaction transaction(*this, NO_TRANSACTION_RAW); + auto replaced_parts = renameTempPartAndReplace(new_data_part, NO_TRANSACTION_RAW, nullptr, &transaction); if (!replaced_parts.empty()) { diff --git a/src/Storages/StorageTableFunction.h b/src/Storages/StorageTableFunction.h index 4616421b24a..8bc1b160e77 100644 --- a/src/Storages/StorageTableFunction.h +++ b/src/Storages/StorageTableFunction.h @@ -104,7 +104,7 @@ public: for (const auto & c : column_names) cnames += c + " "; auto storage = getNested(); - auto nested_snapshot = storage->getStorageSnapshot(storage->getInMemoryMetadataPtr()); + auto nested_snapshot = storage->getStorageSnapshot(storage->getInMemoryMetadataPtr(), context); auto pipe = storage->read(column_names, nested_snapshot, query_info, context, processed_stage, max_block_size, num_streams); if (!pipe.empty() && add_conversion) diff --git a/src/Storages/StorageValues.h b/src/Storages/StorageValues.h index 21156ec27cc..a4e18657f9f 100644 --- a/src/Storages/StorageValues.h +++ b/src/Storages/StorageValues.h @@ -31,6 +31,10 @@ public: { return virtuals; } + + /// FIXME probably it should return false, but StorageValues is used in ExecutingInnerQueryFromViewTransform (whatever it is) + bool supportsTransactions() const override { return true; } + private: Block res_block; NamesAndTypesList virtuals; diff --git a/src/Storages/System/StorageSystemNumbers.h b/src/Storages/System/StorageSystemNumbers.h index 5f3a12c530d..043a0e7c0c2 100644 --- a/src/Storages/System/StorageSystemNumbers.h +++ b/src/Storages/System/StorageSystemNumbers.h @@ -40,6 +40,7 @@ public: bool hasEvenlyDistributedRead() const override { return true; } bool 
isSystemStorage() const override { return true; } + bool supportsTransactions() const override { return true; } private: bool multithreaded; diff --git a/src/Storages/System/StorageSystemOne.h b/src/Storages/System/StorageSystemOne.h index b0ca389b76f..1c3d5c9ab80 100644 --- a/src/Storages/System/StorageSystemOne.h +++ b/src/Storages/System/StorageSystemOne.h @@ -32,6 +32,8 @@ public: bool isSystemStorage() const override { return true; } + bool supportsTransactions() const override { return true; } + protected: explicit StorageSystemOne(const StorageID & table_id_); }; diff --git a/src/Storages/System/StorageSystemParts.cpp b/src/Storages/System/StorageSystemParts.cpp index 2efb337b302..6674de06c07 100644 --- a/src/Storages/System/StorageSystemParts.cpp +++ b/src/Storages/System/StorageSystemParts.cpp @@ -12,6 +12,7 @@ #include #include #include +#include namespace DB { @@ -81,13 +82,19 @@ StorageSystemParts::StorageSystemParts(const StorageID & table_id_) {"rows_where_ttl_info.max", std::make_shared(std::make_shared())}, {"projections", std::make_shared(std::make_shared())}, + + {"visible", std::make_shared()}, + {"creation_tid", getTransactionIDDataType()}, + {"removal_tid", getTransactionIDDataType()}, + {"creation_csn", std::make_shared()}, + {"removal_csn", std::make_shared()}, } ) { } void StorageSystemParts::processNextStorage( - MutableColumns & columns, std::vector & columns_mask, const StoragesInfo & info, bool has_state_column) + ContextPtr context, MutableColumns & columns, std::vector & columns_mask, const StoragesInfo & info, bool has_state_column) { using State = IMergeTreeDataPart::State; MergeTreeData::DataPartStateVector all_parts_state; @@ -272,6 +279,29 @@ void StorageSystemParts::processNextStorage( if (columns_mask[src_index++]) columns[res_index++]->insert(projections); + if (columns_mask[src_index++]) + { + auto txn = context->getCurrentTransaction(); + if (txn) + columns[res_index++]->insert(part->version.isVisible(*txn)); + else + columns[res_index++]->insert(part_state == State::Active); + } + + auto get_tid_as_field = [](const TransactionID & tid) -> Field + { + return Tuple{tid.start_csn, tid.local_tid, tid.host_id}; + }; + + if (columns_mask[src_index++]) + columns[res_index++]->insert(get_tid_as_field(part->version.creation_tid)); + if (columns_mask[src_index++]) + columns[res_index++]->insert(get_tid_as_field(part->version.getRemovalTID())); + if (columns_mask[src_index++]) + columns[res_index++]->insert(part->version.creation_csn.load(std::memory_order_relaxed)); + if (columns_mask[src_index++]) + columns[res_index++]->insert(part->version.removal_csn.load(std::memory_order_relaxed)); + /// _state column should be the latest. 
/// Do not use part->getState*, it can be changed from different thread if (has_state_column) diff --git a/src/Storages/System/StorageSystemParts.h b/src/Storages/System/StorageSystemParts.h index 69e957c5a1e..f7b069c9516 100644 --- a/src/Storages/System/StorageSystemParts.h +++ b/src/Storages/System/StorageSystemParts.h @@ -21,7 +21,7 @@ public: protected: explicit StorageSystemParts(const StorageID & table_id_); void processNextStorage( - MutableColumns & columns, std::vector & columns_mask, const StoragesInfo & info, bool has_state_column) override; + ContextPtr context, MutableColumns & columns, std::vector & columns_mask, const StoragesInfo & info, bool has_state_column) override; }; } diff --git a/src/Storages/System/StorageSystemPartsBase.cpp b/src/Storages/System/StorageSystemPartsBase.cpp index 26b1b151073..1462cc58a42 100644 --- a/src/Storages/System/StorageSystemPartsBase.cpp +++ b/src/Storages/System/StorageSystemPartsBase.cpp @@ -57,12 +57,12 @@ StoragesInfo::getParts(MergeTreeData::DataPartStateVector & state, bool has_stat { /// If has_state_column is requested, return all states. if (!has_state_column) - return data->getDataPartsVector({State::Active, State::Outdated}, &state, require_projection_parts); + return data->getDataPartsVectorForInternalUsage({State::Active, State::Outdated}, &state, require_projection_parts); return data->getAllDataPartsVector(&state, require_projection_parts); } - return data->getDataPartsVector({State::Active}, &state, require_projection_parts); + return data->getDataPartsVectorForInternalUsage({State::Active}, &state, require_projection_parts); } StoragesInfoStream::StoragesInfoStream(const SelectQueryInfo & query_info, ContextPtr context) @@ -268,7 +268,7 @@ Pipe StorageSystemPartsBase::read( while (StoragesInfo info = stream.next()) { - processNextStorage(res_columns, columns_mask, info, has_state_column); + processNextStorage(context, res_columns, columns_mask, info, has_state_column); } if (has_state_column) diff --git a/src/Storages/System/StorageSystemPartsBase.h b/src/Storages/System/StorageSystemPartsBase.h index 0daa01a6b99..3eb8c7c8711 100644 --- a/src/Storages/System/StorageSystemPartsBase.h +++ b/src/Storages/System/StorageSystemPartsBase.h @@ -78,7 +78,7 @@ protected: StorageSystemPartsBase(const StorageID & table_id_, NamesAndTypesList && columns_); virtual void - processNextStorage(MutableColumns & columns, std::vector & columns_mask, const StoragesInfo & info, bool has_state_column) = 0; + processNextStorage(ContextPtr context, MutableColumns & columns, std::vector & columns_mask, const StoragesInfo & info, bool has_state_column) = 0; }; } diff --git a/src/Storages/System/StorageSystemPartsColumns.cpp b/src/Storages/System/StorageSystemPartsColumns.cpp index a9341abb9cd..7f648054da2 100644 --- a/src/Storages/System/StorageSystemPartsColumns.cpp +++ b/src/Storages/System/StorageSystemPartsColumns.cpp @@ -76,7 +76,7 @@ StorageSystemPartsColumns::StorageSystemPartsColumns(const StorageID & table_id_ } void StorageSystemPartsColumns::processNextStorage( - MutableColumns & columns, std::vector & columns_mask, const StoragesInfo & info, bool has_state_column) + ContextPtr, MutableColumns & columns, std::vector & columns_mask, const StoragesInfo & info, bool has_state_column) { /// Prepare information about columns in storage. 
struct ColumnInfo diff --git a/src/Storages/System/StorageSystemPartsColumns.h b/src/Storages/System/StorageSystemPartsColumns.h index b8c52ca16ef..9cdd2befb40 100644 --- a/src/Storages/System/StorageSystemPartsColumns.h +++ b/src/Storages/System/StorageSystemPartsColumns.h @@ -23,7 +23,7 @@ public: protected: explicit StorageSystemPartsColumns(const StorageID & table_id_); void processNextStorage( - MutableColumns & columns, std::vector & columns_mask, const StoragesInfo & info, bool has_state_column) override; + ContextPtr context, MutableColumns & columns, std::vector & columns_mask, const StoragesInfo & info, bool has_state_column) override; }; } diff --git a/src/Storages/System/StorageSystemProjectionParts.cpp b/src/Storages/System/StorageSystemProjectionParts.cpp index d15acc97cb1..591277c1a66 100644 --- a/src/Storages/System/StorageSystemProjectionParts.cpp +++ b/src/Storages/System/StorageSystemProjectionParts.cpp @@ -90,7 +90,7 @@ StorageSystemProjectionParts::StorageSystemProjectionParts(const StorageID & tab } void StorageSystemProjectionParts::processNextStorage( - MutableColumns & columns, std::vector & columns_mask, const StoragesInfo & info, bool has_state_column) + ContextPtr, MutableColumns & columns, std::vector & columns_mask, const StoragesInfo & info, bool has_state_column) { using State = IMergeTreeDataPart::State; MergeTreeData::DataPartStateVector all_parts_state; diff --git a/src/Storages/System/StorageSystemProjectionParts.h b/src/Storages/System/StorageSystemProjectionParts.h index a8db87fbba4..be31d08b24e 100644 --- a/src/Storages/System/StorageSystemProjectionParts.h +++ b/src/Storages/System/StorageSystemProjectionParts.h @@ -21,6 +21,6 @@ public: protected: explicit StorageSystemProjectionParts(const StorageID & table_id_); void processNextStorage( - MutableColumns & columns, std::vector & columns_mask, const StoragesInfo & info, bool has_state_column) override; + ContextPtr context, MutableColumns & columns, std::vector & columns_mask, const StoragesInfo & info, bool has_state_column) override; }; } diff --git a/src/Storages/System/StorageSystemProjectionPartsColumns.cpp b/src/Storages/System/StorageSystemProjectionPartsColumns.cpp index 29c877733d8..8f6db9fcbe8 100644 --- a/src/Storages/System/StorageSystemProjectionPartsColumns.cpp +++ b/src/Storages/System/StorageSystemProjectionPartsColumns.cpp @@ -73,7 +73,7 @@ StorageSystemProjectionPartsColumns::StorageSystemProjectionPartsColumns(const S } void StorageSystemProjectionPartsColumns::processNextStorage( - MutableColumns & columns, std::vector & columns_mask, const StoragesInfo & info, bool has_state_column) + ContextPtr, MutableColumns & columns, std::vector & columns_mask, const StoragesInfo & info, bool has_state_column) { /// Prepare information about columns in storage. 
struct ColumnInfo diff --git a/src/Storages/System/StorageSystemProjectionPartsColumns.h b/src/Storages/System/StorageSystemProjectionPartsColumns.h index 5679f5e9093..ade07b70a23 100644 --- a/src/Storages/System/StorageSystemProjectionPartsColumns.h +++ b/src/Storages/System/StorageSystemProjectionPartsColumns.h @@ -23,6 +23,6 @@ public: protected: explicit StorageSystemProjectionPartsColumns(const StorageID & table_id_); void processNextStorage( - MutableColumns & columns, std::vector & columns_mask, const StoragesInfo & info, bool has_state_column) override; + ContextPtr context, MutableColumns & columns, std::vector & columns_mask, const StoragesInfo & info, bool has_state_column) override; }; } diff --git a/src/Storages/System/StorageSystemTransactions.cpp b/src/Storages/System/StorageSystemTransactions.cpp new file mode 100644 index 00000000000..396fc875f74 --- /dev/null +++ b/src/Storages/System/StorageSystemTransactions.cpp @@ -0,0 +1,49 @@ +#include +#include +#include +#include +#include +#include + + +namespace DB +{ + +static DataTypePtr getStateEnumType() +{ + return std::make_shared<DataTypeEnum8>( + DataTypeEnum8::Values + { + {"RUNNING", static_cast<Int8>(MergeTreeTransaction::State::RUNNING)}, + {"COMMITTED", static_cast<Int8>(MergeTreeTransaction::State::COMMITTED)}, + {"ROLLED_BACK", static_cast<Int8>(MergeTreeTransaction::State::ROLLED_BACK)}, + }); +} + +NamesAndTypesList StorageSystemTransactions::getNamesAndTypes() +{ + return { + {"tid", getTransactionIDDataType()}, + {"tid_hash", std::make_shared<DataTypeUInt64>()}, + {"elapsed", std::make_shared<DataTypeFloat64>()}, + {"is_readonly", std::make_shared<DataTypeUInt8>()}, + {"state", getStateEnumType()}, + }; +} + +void StorageSystemTransactions::fillData(MutableColumns & res_columns, ContextPtr, const SelectQueryInfo &) const +{ + auto list = TransactionLog::instance().getTransactionsList(); + for (const auto & elem : list) + { + auto txn = elem.second; + size_t i = 0; + res_columns[i++]->insert(Tuple{txn->tid.start_csn, txn->tid.local_tid, txn->tid.host_id}); + res_columns[i++]->insert(txn->tid.getHash()); + res_columns[i++]->insert(txn->elapsedSeconds()); + res_columns[i++]->insert(txn->isReadOnly()); + res_columns[i++]->insert(txn->getState()); + } +} + +} diff --git a/src/Storages/System/StorageSystemTransactions.h b/src/Storages/System/StorageSystemTransactions.h new file mode 100644 index 00000000000..38244815549 --- /dev/null +++ b/src/Storages/System/StorageSystemTransactions.h @@ -0,0 +1,27 @@ +#pragma once +#include +#include + + +namespace DB +{ + +class Context; + +class StorageSystemTransactions final : public shared_ptr_helper<StorageSystemTransactions>, public IStorageSystemOneBlock<StorageSystemTransactions> +{ + friend struct shared_ptr_helper<StorageSystemTransactions>; +public: + String getName() const override { return "SystemTransactions"; } + + static NamesAndTypesList getNamesAndTypes(); + + static NamesAndAliases getNamesAndAliases() { return {}; } + +protected: + using IStorageSystemOneBlock<StorageSystemTransactions>::IStorageSystemOneBlock; + + void fillData(MutableColumns & res_columns, ContextPtr context, const SelectQueryInfo & query_info) const override; +}; + +} diff --git a/src/Storages/System/StorageSystemZeros.h b/src/Storages/System/StorageSystemZeros.h index bf72352b7be..067e6c7217d 100644 --- a/src/Storages/System/StorageSystemZeros.h +++ b/src/Storages/System/StorageSystemZeros.h @@ -31,6 +31,7 @@ public: bool hasEvenlyDistributedRead() const override { return true; } bool isSystemStorage() const override { return true; } + bool supportsTransactions() const override { return true; } private: bool multithreaded; diff --git
a/src/Storages/System/attachSystemTables.cpp b/src/Storages/System/attachSystemTables.cpp index 43be415ff41..6558890b8c4 100644 --- a/src/Storages/System/attachSystemTables.cpp +++ b/src/Storages/System/attachSystemTables.cpp @@ -68,6 +68,7 @@ #include #include #include +#include <Storages/System/StorageSystemTransactions.h> #include #include @@ -168,6 +169,9 @@ void attachSystemTablesServer(ContextPtr context, IDatabase & system_database, b if (has_zookeeper) attach(context, system_database, "zookeeper"); + + if (context->getConfigRef().getInt("allow_experimental_transactions", 0)) + attach<StorageSystemTransactions>(context, system_database, "transactions"); } void attachSystemTablesAsync(ContextPtr context, IDatabase & system_database, AsynchronousMetrics & async_metrics) diff --git a/src/Storages/tests/gtest_storage_log.cpp b/src/Storages/tests/gtest_storage_log.cpp index 4cda9d6c9f5..66922afdd9c 100644 --- a/src/Storages/tests/gtest_storage_log.cpp +++ b/src/Storages/tests/gtest_storage_log.cpp @@ -117,7 +117,7 @@ std::string readData(DB::StoragePtr & table, const DB::ContextPtr context) { using namespace DB; auto metadata_snapshot = table->getInMemoryMetadataPtr(); - auto storage_snapshot = table->getStorageSnapshot(metadata_snapshot); + auto storage_snapshot = table->getStorageSnapshot(metadata_snapshot, context); Names column_names; column_names.push_back("a"); diff --git a/tests/clickhouse-test b/tests/clickhouse-test index 3efb37cc27d..b93416beda6 100755 --- a/tests/clickhouse-test +++ b/tests/clickhouse-test @@ -204,6 +204,17 @@ def get_processlist(args): else: return clickhouse_execute_json(args, 'SHOW PROCESSLIST') +def get_transactions_list(args): + try: + if args.replicated_database: + return clickhouse_execute_json(args, """ + SELECT materialize((hostName(), tcpPort())) as host, * + FROM clusterAllReplicas('test_cluster_database_replicated', system.transactions) + """) + else: + return clickhouse_execute_json(args, 'select * from system.transactions') + except Exception as e: + return f"Cannot get list of transactions: {e}" def get_processlist_after_test(args): log_comment = args.testcase_basename @@ -1395,6 +1406,7 @@ def main(args): if processlist: print(colored("\nFound hung queries in processlist:", args, "red", attrs=["bold"])) print(json.dumps(processlist, indent=4)) + print(get_transactions_list(args)) print_stacktraces() exit_code.value = 1 diff --git a/tests/config/config.d/transactions.xml b/tests/config/config.d/transactions.xml new file mode 100644 index 00000000000..19810986ea1 --- /dev/null +++ b/tests/config/config.d/transactions.xml @@ -0,0 +1,13 @@ +<clickhouse> + <allow_experimental_transactions>42</allow_experimental_transactions> + + <transactions_info_log> + <database>system</database> + <table>transactions_info_log</table> + <flush_interval_milliseconds>7500</flush_interval_milliseconds> + </transactions_info_log> +</clickhouse>
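With allow_experimental_transactions set as above, the flow the stateless tests below drive looks like this (a minimal sketch; the table name t is illustrative, while the statements themselves are the ones the tests in this patch use):

    BEGIN TRANSACTION;
    INSERT INTO t VALUES (1);
    SELECT transactionID();   -- a (start_csn, local_tid, host_id) tuple
    COMMIT;                   -- or ROLLBACK to discard parts created by the transaction
    SYSTEM FLUSH LOGS;
    SELECT type, tid, csn FROM system.transactions_info_log WHERE type = 'Commit';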
diff --git a/tests/config/install.sh b/tests/config/install.sh index ff92f01e53f..8f28fb386ff 100755 --- a/tests/config/install.sh +++ b/tests/config/install.sh @@ -34,6 +34,7 @@ ln -sf $SRC_PATH/config.d/merge_tree.xml $DEST_SERVER_PATH/config.d/ ln -sf $SRC_PATH/config.d/tcp_with_proxy.xml $DEST_SERVER_PATH/config.d/ ln -sf $SRC_PATH/config.d/top_level_domains_lists.xml $DEST_SERVER_PATH/config.d/ ln -sf $SRC_PATH/config.d/top_level_domains_path.xml $DEST_SERVER_PATH/config.d/ +ln -sf $SRC_PATH/config.d/transactions.xml $DEST_SERVER_PATH/config.d/ ln -sf $SRC_PATH/config.d/encryption.xml $DEST_SERVER_PATH/config.d/ ln -sf $SRC_PATH/config.d/CORS.xml $DEST_SERVER_PATH/config.d/ ln -sf $SRC_PATH/config.d/zookeeper_log.xml $DEST_SERVER_PATH/config.d/ diff --git a/tests/integration/helpers/cluster.py b/tests/integration/helpers/cluster.py index 50c9e0d894a..78274e0232e 100644 --- a/tests/integration/helpers/cluster.py +++ b/tests/integration/helpers/cluster.py @@ -2919,7 +2919,8 @@ class ClickHouseInstance: else: params = params.copy() - params["query"] = sql + if sql is not None: + params["query"] = sql auth = None if user and password: diff --git a/tests/integration/test_MemoryTracking/configs/no_system_log.xml b/tests/integration/test_MemoryTracking/configs/no_system_log.xml index bd1b9f9a49e..3218dae4dc7 100644 --- a/tests/integration/test_MemoryTracking/configs/no_system_log.xml +++ b/tests/integration/test_MemoryTracking/configs/no_system_log.xml @@ -14,4 +14,5 @@ +    <transactions_info_log remove="remove"/> diff --git a/tests/integration/test_transactions/__init__.py b/tests/integration/test_transactions/__init__.py new file mode 100644 index 00000000000..e69de29bb2d diff --git a/tests/integration/test_transactions/configs/transactions.xml b/tests/integration/test_transactions/configs/transactions.xml new file mode 100644 index 00000000000..a8d3e8fbf6d --- /dev/null +++ b/tests/integration/test_transactions/configs/transactions.xml @@ -0,0 +1,14 @@ +<clickhouse> + <allow_experimental_transactions>42</allow_experimental_transactions> + + <merge_tree> + <old_parts_lifetime>100500</old_parts_lifetime> + <remove_rolled_back_parts_immediately>0</remove_rolled_back_parts_immediately> + </merge_tree> + + <transactions_info_log> + <database>system</database> + <table>transactions_info_log</table> + <flush_interval_milliseconds>7500</flush_interval_milliseconds> + </transactions_info_log> +</clickhouse>
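The integration test below drives each transaction through a separate HTTP session; the same state is visible directly in SQL (a sketch: the tid literal is illustrative, the column names are those declared for system.transactions above, and KILL TRANSACTION is gated by the new KILL TRANSACTION access type):

    SELECT tid, tid_hash, elapsed, is_readonly, state FROM system.transactions;  -- state: RUNNING, COMMITTED or ROLLED_BACK
    KILL TRANSACTION WHERE tid = (1,1,'00000000-0000-0000-0000-000000000000');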
diff --git a/tests/integration/test_transactions/test.py b/tests/integration/test_transactions/test.py new file mode 100644 index 00000000000..8983e70b4cb --- /dev/null +++ b/tests/integration/test_transactions/test.py @@ -0,0 +1,120 @@ +import pytest +from helpers.cluster import ClickHouseCluster + +cluster = ClickHouseCluster(__file__) +node = cluster.add_instance( + "node", + main_configs=["configs/transactions.xml"], + stay_alive=True, + with_zookeeper=True, +) + + +@pytest.fixture(scope="module") +def start_cluster(): + try: + cluster.start() + yield cluster + finally: + cluster.shutdown() + + +def tx(session, query): + params = {"session_id": "session_{}".format(session)} + return node.http_query(None, data=query, params=params) + + +def test_rollback_unfinished_on_restart(start_cluster): + node.query( + "create table mt (n int, m int) engine=MergeTree order by n partition by n % 2" + ) + node.query("insert into mt values (1, 10), (2, 20)") + tid0 = "(1,1,'00000000-0000-0000-0000-000000000000')" + + # it will hold a snapshot and avoid parts cleanup + tx(0, "begin transaction") + + tx(4, "begin transaction") + + tx(1, "begin transaction") + tid1 = tx(1, "select transactionID()").strip() + tx(1, "alter table mt drop partition id '1'") + tx(1, "commit") + + tx(1, "begin transaction") + tid2 = tx(1, "select transactionID()").strip() + tx(1, "insert into mt values (3, 30), (4, 40)") + tx(1, "commit") + + node.query("system flush logs") + csn1 = node.query( + "select csn from system.transactions_info_log where type='Commit' and tid={}".format( + tid1 + ) + ).strip() + csn2 = node.query( + "select csn from system.transactions_info_log where type='Commit' and tid={}".format( + tid2 + ) + ).strip() + + # insert a part before starting mutation and check that it will not be mutated + tx(4, "insert into mt values (9, 90)") + + # check that uncommitted mutation will be rolled back on restart + tx(1, "begin transaction") + tid3 = tx(1, "select transactionID()").strip() + tx(1, "insert into mt values (5, 50)") + tx(1, "alter table mt update m = m+n in partition id '1' where 1") + + # check that uncommitted merge will be rolled back on restart + tx(2, "begin transaction") + tid4 = tx(2, "select transactionID()").strip() + tx( + 2, + "optimize table mt partition id '0' final settings optimize_throw_if_noop = 1", + ) + + # check that uncommitted insert will be rolled back on restart + tx(3, "begin transaction") + tid5 = tx(3, "select transactionID()").strip() + tx(3, "insert into mt values (6, 70)") + + tid6 = tx(4, "select transactionID()").strip() + tx(4, "commit") + node.query("system flush logs") + csn6 = node.query( + "select csn from system.transactions_info_log where type='Commit' and tid={}".format( + tid6 + ) + ).strip() + + node.restart_clickhouse(kill=True) + + assert ( + node.query("select *, _part from mt order by n") + == "2\t20\t0_2_2_0\n3\t30\t1_3_3_0\n4\t40\t0_4_4_0\n9\t90\t1_5_5_0\n" + ) + res = node.query( + "select name, active, creation_tid, 'csn' || toString(creation_csn) || '_', removal_tid, 'csn' || toString(removal_csn) || '_' from system.parts where table='mt' order by name" + ) + res = res.replace(tid0, "tid0") + res = res.replace(tid1, "tid1").replace("csn" + csn1 + "_", "csn_1") + res = res.replace(tid2, "tid2").replace("csn" + csn2 + "_", "csn_2") + res = res.replace(tid3, "tid3") + res = res.replace(tid4, "tid4") + res = res.replace(tid5, "tid5") + res = res.replace(tid6, "tid6").replace("csn" + csn6 + "_", "csn_6") + assert ( + res + == 
"0_2_2_0\t1\ttid0\tcsn1_\t(0,0,'00000000-0000-0000-0000-000000000000')\tcsn0_\n" + "0_2_4_1\t0\ttid4\tcsn18446744073709551615_\t(0,0,'00000000-0000-0000-0000-000000000000')\tcsn0_\n" + "0_4_4_0\t1\ttid2\tcsn_2\t(0,0,'00000000-0000-0000-0000-000000000000')\tcsn0_\n" + "0_8_8_0\t0\ttid5\tcsn18446744073709551615_\t(0,0,'00000000-0000-0000-0000-000000000000')\tcsn0_\n" + "1_1_1_0\t0\ttid0\tcsn1_\ttid1\tcsn_1\n" + "1_3_3_0\t1\ttid2\tcsn_2\t(0,0,'00000000-0000-0000-0000-000000000000')\tcsn0_\n" + "1_3_3_0_7\t0\ttid3\tcsn18446744073709551615_\t(0,0,'00000000-0000-0000-0000-000000000000')\tcsn0_\n" + "1_5_5_0\t1\ttid6\tcsn_6\t(0,0,'00000000-0000-0000-0000-000000000000')\tcsn0_\n" + "1_6_6_0\t0\ttid3\tcsn18446744073709551615_\t(0,0,'00000000-0000-0000-0000-000000000000')\tcsn0_\n" + "1_6_6_0_7\t0\ttid3\tcsn18446744073709551615_\t(0,0,'00000000-0000-0000-0000-000000000000')\tcsn0_\n" + ) diff --git a/tests/queries/0_stateless/01167_isolation_hermitage.reference b/tests/queries/0_stateless/01167_isolation_hermitage.reference new file mode 100644 index 00000000000..4488809f3ed --- /dev/null +++ b/tests/queries/0_stateless/01167_isolation_hermitage.reference @@ -0,0 +1,59 @@ +Serialization error +INVALID_TRANSACTION +INVALID_TRANSACTION +1 1 11 +1 2 21 +tx4 2 1 10 +tx4 2 2 20 +tx4 3 1 10 +tx4 3 2 20 +4 1 10 +4 2 20 +tx6 5 1 10 +tx6 5 2 20 +tx6 6 1 10 +tx6 6 2 20 +7 1 11 +7 2 20 +Serialization error +tx7 8 1 11 +tx7 8 2 20 +INVALID_TRANSACTION +INVALID_TRANSACTION +10 1 11 +10 2 20 +Serialization error +tx11 11 1 10 +tx11 11 2 20 +INVALID_TRANSACTION +tx11 12 1 10 +tx11 12 2 20 +INVALID_TRANSACTION +13 1 11 +13 2 19 +16 1 10 +16 2 20 +16 3 30 +Serialization error +INVALID_TRANSACTION +INVALID_TRANSACTION +18 1 20 +18 2 30 +tx16 19 1 10 +tx16 19 2 20 +tx17 20 1 10 +tx17 20 2 20 +Serialization error +INVALID_TRANSACTION +21 1 11 +21 2 20 +tx18 22 1 10 +tx19 23 1 10 +tx19 24 2 20 +tx18 25 2 20 +26 1 12 +26 2 18 +29 1 10 +29 2 20 +29 3 30 +29 4 42 diff --git a/tests/queries/0_stateless/01167_isolation_hermitage.sh b/tests/queries/0_stateless/01167_isolation_hermitage.sh new file mode 100755 index 00000000000..7f495801dd0 --- /dev/null +++ b/tests/queries/0_stateless/01167_isolation_hermitage.sh @@ -0,0 +1,166 @@ +#!/usr/bin/env bash +# Tags: long, no-fasttest, no-replicated-database +# Looks like server does not listen https port in fasttest +# FIXME Replicated database executes ALTERs in separate context, so transaction info is lost + +CURDIR=$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd) +# shellcheck source=../shell_config.sh +. "$CURDIR"/../shell_config.sh +# shellcheck source=./transactions.lib +. 
"$CURDIR"/transactions.lib +set -e + +# https://github.com/ept/hermitage + +$CLICKHOUSE_CLIENT -q "drop table if exists test" +$CLICKHOUSE_CLIENT -q "create table test (id int, value int) engine=MergeTree order by id" + +function reset_table() +{ + $CLICKHOUSE_CLIENT -q "truncate table test;" + $CLICKHOUSE_CLIENT -q "insert into test (id, value) values (1, 10);" + $CLICKHOUSE_CLIENT -q "insert into test (id, value) values (2, 20);" +} + +# TODO update test after implementing Read Committed + +# G0 +reset_table +tx 1 "begin transaction" +tx 2 "begin transaction" +tx 1 "alter table test update value=11 where id=1" +tx 2 "alter table test update value=12 where id=1" | grep -Eo "Serialization error" | uniq +tx 1 "alter table test update value=21 where id=2" +tx 1 "commit" +tx 2 "alter table test update value=22 where id=2" | grep -Eo "INVALID_TRANSACTION" | uniq +tx 2 "commit" | grep -Eo "INVALID_TRANSACTION" | uniq +tx 2 "rollback" +$CLICKHOUSE_CLIENT -q "select 1, * from test order by id" + +# G1a +reset_table +tx_async 3 "begin transaction" +tx_async 4 "begin transaction" +tx_async 3 "alter table test update value=101 where id=1" +tx_async 4 "select 2, * from test order by id" +tx_async 3 "alter table test update value=11 where id=1" +tx_async 3 "rollback" +tx_async 4 "select 3, * from test order by id" +tx_async 4 "commit" +tx_wait 3 +tx_wait 4 +$CLICKHOUSE_CLIENT -q "select 4, * from test order by id" + +# G1b +reset_table +tx_async 5 "begin transaction" +tx_async 6 "begin transaction" +tx_async 5 "alter table test update value=101 where id=1" +tx_async 6 "select 5, * from test order by id" +tx_async 5 "alter table test update value=11 where id=1" +tx_async 5 "commit" +tx_async 6 "select 6, * from test order by id" +tx_async 6 "commit" +tx_wait 5 +tx_wait 6 +$CLICKHOUSE_CLIENT -q "select 7, * from test order by id" + +# G1c +# NOTE both transactions will succeed if we implement skipping of unaffected partitions/parts +reset_table +tx 7 "begin transaction" +tx 8 "begin transaction" +tx 7 "alter table test update value=11 where id=1" +tx 8 "alter table test update value=22 where id=2" | grep -Eo "Serialization error" | uniq +tx 7 "select 8, * from test order by id" +tx 8 "select 9, * from test order by id" | grep -Eo "INVALID_TRANSACTION" | uniq +tx 7 "commit" +tx 8 "commit" | grep -Eo "INVALID_TRANSACTION" | uniq +tx 8 "rollback" +$CLICKHOUSE_CLIENT -q "select 10, * from test order by id" + +# OTV +reset_table +tx 9 "begin transaction" +tx 10 "begin transaction" +tx 11 "begin transaction" +tx 9 "alter table test update value = 11 where id = 1" +tx 9 "alter table test update value = 19 where id = 2" +tx 10 "alter table test update value = 12 where id = 1" | grep -Eo "Serialization error" | uniq +tx 9 "commit" +tx 11 "select 11, * from test order by id" +tx 10 "alter table test update value = 18 where id = 2" | grep -Eo "INVALID_TRANSACTION" | uniq +tx 11 "select 12, * from test order by id" +tx 10 "commit" | grep -Eo "INVALID_TRANSACTION" | uniq +tx 10 "rollback" +tx 11 "commit" +$CLICKHOUSE_CLIENT -q "select 13, * from test order by id" + +# PMP +reset_table +tx_async 12 "begin transaction" +tx_async 13 "begin transaction" +tx_async 12 "select 14, * from test where value = 30" +tx_async 13 "insert into test (id, value) values (3, 30)" +tx_async 13 "commit" +tx_async 12 "select 15, * from test where value = 30" +tx_async 12 "commit" +tx_wait 12 +tx_wait 13 +$CLICKHOUSE_CLIENT -q "select 16, * from test order by id" + +# PMP write +reset_table +tx 14 "begin transaction" +tx 15 "begin 
transaction" +tx 14 "alter table test update value = value + 10 where 1" +tx 15 "alter table test delete where value = 20" | grep -Eo "Serialization error" | uniq +tx 14 "commit" +tx 15 "select 17, * from test order by id" | grep -Eo "INVALID_TRANSACTION" | uniq +tx 15 "commit" | grep -Eo "INVALID_TRANSACTION" | uniq +tx 15 "rollback" +$CLICKHOUSE_CLIENT -q "select 18, * from test order by id" + +# P4 +reset_table +tx 16 "begin transaction" +tx 17 "begin transaction" +tx 16 "select 19, * from test order by id" +tx 17 "select 20, * from test order by id" +tx 16 "alter table test update value = 11 where id = 1" +tx 17 "alter table test update value = 11 where id = 1" | grep -Eo "Serialization error" | uniq +tx 16 "commit" +tx 17 "commit" | grep -Eo "INVALID_TRANSACTION" | uniq +tx 17 "rollback" +$CLICKHOUSE_CLIENT -q "select 21, * from test order by id" + +# G-single +reset_table +tx_async 18 "begin transaction" +tx_async 19 "begin transaction" +tx_sync 18 "select 22, * from test where id = 1" +tx_async 19 "select 23, * from test where id = 1" +tx_async 19 "select 24, * from test where id = 2" +tx_async 19 "alter table test update value = 12 where id = 1" +tx_async 19 "alter table test update value = 18 where id = 2" +tx_async 19 "commit" +tx_async 18 "select 25, * from test where id = 2" +tx_async 18 "commit" +tx_wait 18 +tx_wait 19 +$CLICKHOUSE_CLIENT -q "select 26, * from test order by id" + +# G2 +reset_table +tx_async 20 "begin transaction" +tx_async 21 "begin transaction" +tx_sync 20 "select 27, * from test where value % 3 = 0" +tx_async 21 "select 28, * from test where value % 3 = 0" +tx_async 20 "insert into test (id, value) values (3, 30)" +tx_async 21 "insert into test (id, value) values (4, 42)" +tx_async 20 "commit" +tx_async 21 "commit" +tx_wait 20 +tx_wait 21 +$CLICKHOUSE_CLIENT -q "select 29, * from test order by id" + diff --git a/tests/queries/0_stateless/01168_mutations_isolation.reference b/tests/queries/0_stateless/01168_mutations_isolation.reference new file mode 100644 index 00000000000..1b3e3f145b1 --- /dev/null +++ b/tests/queries/0_stateless/01168_mutations_isolation.reference @@ -0,0 +1,38 @@ +tx2 1 10 all_1_1_0_4 +tx2 1 30 all_3_3_0_4 +tx1 2 1 all_1_1_0 +tx1 2 2 all_2_2_0 +Serialization error +INVALID_TRANSACTION +tx3 3 1 all_1_1_0 +Serialization error +INVALID_TRANSACTION +INVALID_TRANSACTION +tx5 4 2 all_1_1_0_8 +tx5 4 5 all_10_10_0 +tx5 4 6 all_7_7_0_8 +tx5 5 2 all_1_1_0_8 +tx5 5 5 all_10_10_0 +tx5 5 6 all_7_7_0_8 +SERIALIZATION_ERROR +tx6 6 2 all_1_1_0_11 +tx6 6 6 all_7_7_0_11 +tx7 7 20 all_1_1_0_13 +tx7 7 40 all_14_14_0 +tx7 7 60 all_7_7_0_13 +tx7 7 80 all_12_12_0_13 +tx7 8 20 all_1_14_1_13 +tx7 8 40 all_1_14_1_13 +tx7 8 60 all_1_14_1_13 +tx7 8 80 all_1_14_1_13 +Serialization error +INVALID_TRANSACTION +tx11 9 21 all_1_14_1_17 +tx11 9 41 all_1_14_1_17 +tx11 9 61 all_1_14_1_17 +tx11 9 81 all_1_14_1_17 +1 1 RUNNING +tx14 10 22 all_1_14_1_18 +tx14 10 42 all_1_14_1_18 +tx14 10 62 all_1_14_1_18 +tx14 10 82 all_1_14_1_18 diff --git a/tests/queries/0_stateless/01168_mutations_isolation.sh b/tests/queries/0_stateless/01168_mutations_isolation.sh new file mode 100755 index 00000000000..888858edf32 --- /dev/null +++ b/tests/queries/0_stateless/01168_mutations_isolation.sh @@ -0,0 +1,93 @@ +#!/usr/bin/env bash +# Tags: no-fasttest, no-replicated-database +# Looks like server does not listen https port in fasttest +# FIXME Replicated database executes ALTERs in separate context, so transaction info is lost + +CURDIR=$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd) +# 
shellcheck source=../shell_config.sh +. "$CURDIR"/../shell_config.sh +# shellcheck source=./transactions.lib +. "$CURDIR"/transactions.lib + +$CLICKHOUSE_CLIENT -q "drop table if exists mt" +$CLICKHOUSE_CLIENT -q "create table mt (n int) engine=MergeTree order by tuple()" + +$CLICKHOUSE_CLIENT -q "insert into mt values (1)" + +tx 1 "begin transaction" +tx 2 "begin transaction" +tx 1 "insert into mt values (2)" +tx 2 "insert into mt values (3)" +tx 2 "alter table mt update n=n*10 where 1" +tx 2 "select 1, n, _part from mt order by n" +tx 1 "select 2, n, _part from mt order by n" +tx 1 "alter table mt update n=n+1 where 1" | grep -Eo "Serialization error" | uniq +tx 1 "commit" | grep -Eo "INVALID_TRANSACTION" | uniq +tx 2 "rollback" + + +tx 3 "begin transaction" +tx 3 "select 3, n, _part from mt order by n" +tx 4 "begin transaction" +tx 3 "insert into mt values (2)" +tx 4 "insert into mt values (3)" +tx 4 "alter table mt update n=n*2 where 1" +tx 3 "alter table mt update n=n+42 where 1" | grep -Eo "Serialization error" | uniq +tx 3 "insert into mt values (4)" | grep -Eo "INVALID_TRANSACTION" | uniq +tx 4 "insert into mt values (5)" +tx 3 "commit" | grep -Eo "INVALID_TRANSACTION" | uniq +tx 4 "commit" + + +tx 5 "begin transaction" +tx 5 "select 4, n, _part from mt order by n" +tx 6 "begin transaction" +tx 6 "alter table mt delete where n%2=1" +tx 6 "alter table mt drop part 'all_10_10_0_11'" +tx 5 "select 5, n, _part from mt order by n" +tx 5 "alter table mt drop partition id 'all'" | grep -Eo "SERIALIZATION_ERROR" | uniq +tx 6 "select 6, n, _part from mt order by n" +tx 5 "rollback" +tx 6 "insert into mt values (8)" +tx 6 "alter table mt update n=n*10 where 1" +tx 6 "insert into mt values (40)" +tx 6 "commit" + + +tx 7 "begin transaction" +tx 7 "select 7, n, _part from mt order by n" +tx 8 "begin transaction" +tx_async 8 "alter table mt update n = 0 where 1" >/dev/null +$CLICKHOUSE_CLIENT -q "kill mutation where database=currentDatabase() and mutation_id='mutation_15.txt' format Null" 2>&1| grep -Fv "probably it finished" +tx_sync 8 "rollback" +tx 7 "optimize table mt final" +tx 7 "select 8, n, _part from mt order by n" +tx 10 "begin transaction" +tx 10 "alter table mt update n = 0 where 1" | grep -Eo "Serialization error" | uniq +tx 7 "alter table mt update n=n+1 where 1" +tx 10 "commit" | grep -Eo "INVALID_TRANSACTION" | uniq +tx 10 "rollback" +tx 7 "commit" + + +tx_async 11 "begin transaction" +tx_async 11 "select 9, n, _part from mt order by n" +tx_async 12 "begin transaction" +tx_async 11 "alter table mt update n=n+1 where 1" >/dev/null +tx_async 12 "alter table mt update n=n+1 where 1" >/dev/null +tx_async 11 "commit" >/dev/null +tx_async 12 "commit" >/dev/null +tx_wait 11 +tx_wait 12 + +tx 13 "begin transaction" +tid_to_kill=$(tx 13 "select transactionID()" | grep -Po "\(.*") +$CLICKHOUSE_CLIENT -q "select count(), any(is_readonly), any(state) from system.transactions where tid=$tid_to_kill" +tx_async 13 "alter table mt update n = 0 where 1" >/dev/null +$CLICKHOUSE_CLIENT -q "kill transaction where tid=$tid_to_kill format Null" +tx_sync 13 "rollback" + +tx 14 "begin transaction" +tx 14 "select 10, n, _part from mt order by n" + +$CLICKHOUSE_CLIENT --database_atomic_wait_for_drop_and_detach_synchronously=0 -q "drop table mt" diff --git a/tests/queries/0_stateless/01169_alter_partition_isolation_stress.reference b/tests/queries/0_stateless/01169_alter_partition_isolation_stress.reference new file mode 100644 index 00000000000..12b941eab50 --- /dev/null +++ 
b/tests/queries/0_stateless/01169_alter_partition_isolation_stress.reference @@ -0,0 +1,8 @@ +1 1 +2 1 +3 1 +4 1 +1 +10 100 +1 1 1 +2 1 1 diff --git a/tests/queries/0_stateless/01169_alter_partition_isolation_stress.sh b/tests/queries/0_stateless/01169_alter_partition_isolation_stress.sh new file mode 100755 index 00000000000..ab348fd31fb --- /dev/null +++ b/tests/queries/0_stateless/01169_alter_partition_isolation_stress.sh @@ -0,0 +1,133 @@ +#!/usr/bin/env bash +# Tags: long, no-replicated-database + +# shellcheck disable=SC2015 + +CURDIR=$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd) +# shellcheck source=../shell_config.sh +. "$CURDIR"/../shell_config.sh + +set -e + +$CLICKHOUSE_CLIENT --query "DROP TABLE IF EXISTS src"; +$CLICKHOUSE_CLIENT --query "DROP TABLE IF EXISTS dst"; +$CLICKHOUSE_CLIENT --query "CREATE TABLE src (n UInt64, type UInt8) ENGINE=MergeTree ORDER BY type SETTINGS old_parts_lifetime=0"; +$CLICKHOUSE_CLIENT --query "CREATE TABLE dst (n UInt64, type UInt8) ENGINE=MergeTree ORDER BY type SETTINGS old_parts_lifetime=0"; + +function thread_insert() +{ + set -e + trap "exit 0" INT + val=1 + while true; do + $CLICKHOUSE_CLIENT --multiquery --query " + BEGIN TRANSACTION; + INSERT INTO src VALUES /* ($val, 1) */ ($val, 1); + INSERT INTO src VALUES /* ($val, 2) */ ($val, 2); + COMMIT;" + val=$((val+1)) + sleep 0.$RANDOM; + done +} + + +# NOTE +# ALTER PARTITION query stops merges, +# but serialization error is still possible if some merge was assigned (and committed) between BEGIN and ALTER. +function thread_partition_src_to_dst() +{ + set -e + count=0 + sum=0 + for i in {1..20}; do + out=$( + $CLICKHOUSE_CLIENT --multiquery --query " + BEGIN TRANSACTION; + INSERT INTO src VALUES /* ($i, 3) */ ($i, 3); + INSERT INTO dst SELECT * FROM src; + ALTER TABLE src DROP PARTITION ID 'all'; + SET throw_on_unsupported_query_inside_transaction=0; + SELECT throwIf((SELECT (count(), sum(n)) FROM merge(currentDatabase(), '') WHERE type=3) != ($count + 1, $sum + $i)) FORMAT Null; + COMMIT;" 2>&1) ||: + + echo "$out" | grep -Fv "SERIALIZATION_ERROR" | grep -F "Received from " && $CLICKHOUSE_CLIENT --multiquery --query " + begin transaction; + set transaction snapshot 3; + select $i, 'src', type, n, _part from src order by type, n; + select $i, 'dst', type, n, _part from dst order by type, n; + rollback" ||: + echo "$out" | grep -Fa "SERIALIZATION_ERROR" >/dev/null || count=$((count+1)) + echo "$out" | grep -Fa "SERIALIZATION_ERROR" >/dev/null || sum=$((sum+i)) + done +} + +function thread_partition_dst_to_src() +{ + set -e + for i in {1..20}; do + action="ROLLBACK" + if (( i % 2 )); then + action="COMMIT" + fi + $CLICKHOUSE_CLIENT --multiquery --query " + SYSTEM STOP MERGES dst; + ALTER TABLE dst DROP PARTITION ID 'nonexistent'; -- STOP MERGES doesn't wait for started merges to finish, so we use this trick + BEGIN TRANSACTION; + INSERT INTO dst VALUES /* ($i, 4) */ ($i, 4); + INSERT INTO src SELECT * FROM dst; + ALTER TABLE dst DROP PARTITION ID 'all'; + SET throw_on_unsupported_query_inside_transaction=0; + SYSTEM START MERGES dst; + SELECT throwIf((SELECT (count(), sum(n)) FROM merge(currentDatabase(), '') WHERE type=4) != (toUInt8($i/2 + 1), (select sum(number) from numbers(1, $i) where number % 2 or number=$i))) FORMAT Null; + $action;" || $CLICKHOUSE_CLIENT --multiquery --query " + begin transaction; + set transaction snapshot 3; + select $i, 'src', type, n, _part from src order by type, n; + select $i, 'dst', type, n, _part from dst order by type, n; + rollback" ||: + done +} + +function 
thread_select() +{ + set -e + trap "exit 0" INT + while true; do + $CLICKHOUSE_CLIENT --multiquery --query " + BEGIN TRANSACTION; + -- no duplicates + SELECT type, throwIf(count(n) != countDistinct(n)) FROM src GROUP BY type FORMAT Null; + SELECT type, throwIf(count(n) != countDistinct(n)) FROM dst GROUP BY type FORMAT Null; + -- rows inserted by thread_insert moved together + SET throw_on_unsupported_query_inside_transaction=0; + SELECT _table, throwIf(arraySort(groupArrayIf(n, type=1)) != arraySort(groupArrayIf(n, type=2))) FROM merge(currentDatabase(), '') GROUP BY _table FORMAT Null; + -- all rows are inserted by thread_insert + SELECT type, throwIf(count(n) != max(n)), throwIf(sum(n) != max(n)*(max(n)+1)/2) FROM merge(currentDatabase(), '') WHERE type IN (1, 2) GROUP BY type ORDER BY type FORMAT Null; + COMMIT;" || $CLICKHOUSE_CLIENT --multiquery --query " + begin transaction; + set transaction snapshot 3; + select 'src', type, n, _part from src order by type, n; + select 'dst', type, n, _part from dst order by type, n; + rollback" ||: + done +} + +thread_insert & PID_1=$! +thread_select & PID_2=$! + +thread_partition_src_to_dst & PID_3=$! +thread_partition_dst_to_src & PID_4=$! +wait $PID_3 && wait $PID_4 + +kill -INT $PID_1 +kill -INT $PID_2 +wait + +$CLICKHOUSE_CLIENT -q "SELECT type, count(n) = countDistinct(n) FROM merge(currentDatabase(), '') GROUP BY type ORDER BY type" +$CLICKHOUSE_CLIENT -q "SELECT DISTINCT arraySort(groupArrayIf(n, type=1)) = arraySort(groupArrayIf(n, type=2)) FROM merge(currentDatabase(), '') GROUP BY _table ORDER BY _table" +$CLICKHOUSE_CLIENT -q "SELECT count(n), sum(n) FROM merge(currentDatabase(), '') WHERE type=4" +$CLICKHOUSE_CLIENT -q "SELECT type, count(n) == max(n), sum(n) == max(n)*(max(n)+1)/2 FROM merge(currentDatabase(), '') WHERE type IN (1, 2) GROUP BY type ORDER BY type" + + +$CLICKHOUSE_CLIENT --query "DROP TABLE src"; +$CLICKHOUSE_CLIENT --query "DROP TABLE dst"; diff --git a/tests/queries/0_stateless/01170_alter_partition_isolation.reference b/tests/queries/0_stateless/01170_alter_partition_isolation.reference new file mode 100644 index 00000000000..f384fc748d4 --- /dev/null +++ b/tests/queries/0_stateless/01170_alter_partition_isolation.reference @@ -0,0 +1,30 @@ +tx1 1 1 +tx1 2 3 +tx2 3 2 +tx2 3 4 +tx1 4 3 + +5 3 +5 5 + +tx4 6 3 +tx4 6 5 +tx4 6 6 +tx4 7 8 +tx3 8 3 +tx3 8 5 +tx3 8 7 +tx3 8 9 +SERIALIZATION_ERROR +INVALID_TRANSACTION +tx4 9 8 + +10 8 + +11 8 +11 11 +11 12 +12 8 +12 8 +12 11 +12 12 diff --git a/tests/queries/0_stateless/01170_alter_partition_isolation.sh b/tests/queries/0_stateless/01170_alter_partition_isolation.sh new file mode 100755 index 00000000000..2db178fb6d1 --- /dev/null +++ b/tests/queries/0_stateless/01170_alter_partition_isolation.sh @@ -0,0 +1,73 @@ +#!/usr/bin/env bash +# Tags: no-fasttest, no-replicated-database +# Looks like server does not listen https port in fasttest +# FIXME Replicated database executes ALTERs in separate context, so transaction info is lost + +CURDIR=$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd) +# shellcheck source=../shell_config.sh +. "$CURDIR"/../shell_config.sh +# shellcheck source=./transactions.lib +. 
"$CURDIR"/transactions.lib + +$CLICKHOUSE_CLIENT -q "drop table if exists mt" +$CLICKHOUSE_CLIENT -q "create table mt (n int) engine=MergeTree order by n" + +tx 1 "begin transaction" +tx 1 "insert into mt values (1)" +tx 2 "begin transaction" +tx 2 "insert into mt values (2)" +tx 1 "select 1, n from mt order by n" +tx 1 "alter table mt drop partition id 'all'" +tx 2 "insert into mt values (4)" +tx 1 "insert into mt values (3)" +tx 1 "select 2, n from mt order by n" +tx 2 "select 3, n from mt order by n" +tx 2 "alter table mt drop partition id 'all'" +tx 2 "insert into mt values (5)" +tx 1 "select 4, n from mt order by n" +tx 2 "commit" +tx 1 "commit" + +echo '' +$CLICKHOUSE_CLIENT -q "select 5, n from mt order by n" +echo '' + +tx 4 "begin transaction" +tx 4 "insert into mt values (6)" +tx 3 "begin transaction" +tx 3 "insert into mt values (7)" +tx 4 "select 6, n from mt order by n" +tx 4 "alter table mt drop partition id 'all'" +tx 3 "insert into mt values (9)" +tx 4 "insert into mt values (8)" +tx 4 "select 7, n from mt order by n" +tx 3 "select 8, n from mt order by n" +tx 3 "alter table mt drop partition id 'all'" | grep -Eo "SERIALIZATION_ERROR" | uniq +tx 3 "insert into mt values (10)" | grep -Eo "INVALID_TRANSACTION" | uniq +tx 4 "select 9, n from mt order by n" +tx 3 "rollback" +tx 4 "commit" + +echo '' +$CLICKHOUSE_CLIENT -q "select 10, n from mt order by n" +echo '' + +$CLICKHOUSE_CLIENT -q "drop table if exists another_mt" +$CLICKHOUSE_CLIENT -q "create table another_mt (n int) engine=MergeTree order by n" + +tx 5 "begin transaction" +tx 5 "insert into another_mt values (11)" +tx 6 "begin transaction" +tx 6 "insert into mt values (12)" +tx 6 "insert into another_mt values (13)" +tx 5 "alter table another_mt move partition id 'all' to table mt" +tx 6 "alter table another_mt replace partition id 'all' from mt" +tx 5 "alter table another_mt attach partition id 'all' from mt" +tx 5 "commit" +tx 6 "commit" + +$CLICKHOUSE_CLIENT -q "select 11, n from mt order by n" +$CLICKHOUSE_CLIENT -q "select 12, n from another_mt order by n" + +$CLICKHOUSE_CLIENT -q "drop table another_mt" +$CLICKHOUSE_CLIENT -q "drop table mt" diff --git a/tests/queries/0_stateless/01171_mv_select_insert_isolation_long.reference b/tests/queries/0_stateless/01171_mv_select_insert_isolation_long.reference new file mode 100644 index 00000000000..d8bb9e310e6 --- /dev/null +++ b/tests/queries/0_stateless/01171_mv_select_insert_isolation_long.reference @@ -0,0 +1,4 @@ +275 0 138 136 0 +275 0 +275 0 138 136 0 +275 0 diff --git a/tests/queries/0_stateless/01171_mv_select_insert_isolation_long.sh b/tests/queries/0_stateless/01171_mv_select_insert_isolation_long.sh new file mode 100755 index 00000000000..3de63615bc4 --- /dev/null +++ b/tests/queries/0_stateless/01171_mv_select_insert_isolation_long.sh @@ -0,0 +1,156 @@ +#!/usr/bin/env bash +# Tags: long, no-parallel +# Test is too heavy, avoid parallel run in Flaky Check + +CURDIR=$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd) +# shellcheck source=../shell_config.sh +. 
"$CURDIR"/../shell_config.sh + +set -e + +$CLICKHOUSE_CLIENT --query "DROP TABLE IF EXISTS src"; +$CLICKHOUSE_CLIENT --query "DROP TABLE IF EXISTS dst"; +$CLICKHOUSE_CLIENT --query "DROP TABLE IF EXISTS mv"; +$CLICKHOUSE_CLIENT --query "CREATE TABLE src (n Int8, m Int8, CONSTRAINT c CHECK xxHash32(n+m) % 8 != 0) ENGINE=MergeTree ORDER BY n PARTITION BY 0 < n SETTINGS old_parts_lifetime=0"; +$CLICKHOUSE_CLIENT --query "CREATE TABLE dst (nm Int16, CONSTRAINT c CHECK xxHash32(nm) % 8 != 0) ENGINE=MergeTree ORDER BY nm SETTINGS old_parts_lifetime=0"; +$CLICKHOUSE_CLIENT --query "CREATE MATERIALIZED VIEW mv TO dst (nm Int16) AS SELECT n*m AS nm FROM src"; + +$CLICKHOUSE_CLIENT --query "CREATE TABLE tmp (x UInt8, nm Int16) ENGINE=MergeTree ORDER BY (x, nm) SETTINGS old_parts_lifetime=0" + +$CLICKHOUSE_CLIENT --query "INSERT INTO src VALUES (0, 0)" + +# some transactions will fail due to constraint +function thread_insert_commit() +{ + set -e + for i in {1..100}; do + $CLICKHOUSE_CLIENT --multiquery --query " + BEGIN TRANSACTION; + INSERT INTO src VALUES /* ($i, $1) */ ($i, $1); + SELECT throwIf((SELECT sum(nm) FROM mv) != $(($i * $1))) FORMAT Null; + INSERT INTO src VALUES /* (-$i, $1) */ (-$i, $1); + COMMIT;" 2>&1| grep -Fv "is violated at row" | grep -Fv "Transaction is not in RUNNING state" | grep -F "Received from " ||: + done +} + +function thread_insert_rollback() +{ + set -e + for _ in {1..100}; do + $CLICKHOUSE_CLIENT --multiquery --query " + BEGIN TRANSACTION; + INSERT INTO src VALUES /* (42, $1) */ (42, $1); + SELECT throwIf((SELECT count() FROM src WHERE n=42 AND m=$1) != 1) FORMAT Null; + ROLLBACK;" + done +} + +# make merges more aggressive +function thread_optimize() +{ + set -e + trap "exit 0" INT + while true; do + optimize_query="OPTIMIZE TABLE src" + partition_id=$(( RANDOM % 2 )) + if (( RANDOM % 2 )); then + optimize_query="OPTIMIZE TABLE dst" + partition_id="all" + fi + if (( RANDOM % 2 )); then + optimize_query="$optimize_query PARTITION ID '$partition_id'" + fi + if (( RANDOM % 2 )); then + optimize_query="$optimize_query FINAL" + fi + action="COMMIT" + if (( RANDOM % 4 )); then + action="ROLLBACK" + fi + + $CLICKHOUSE_CLIENT --multiquery --query " + BEGIN TRANSACTION; + $optimize_query; + $action; + " 2>&1| grep -Fv "already exists, but it will be deleted soon" | grep -F "Received from " ||: + sleep 0.$RANDOM; + done +} + +function thread_select() +{ + set -e + trap "exit 0" INT + while true; do + $CLICKHOUSE_CLIENT --multiquery --query " + BEGIN TRANSACTION; + SELECT throwIf((SELECT (sum(n), count() % 2) FROM src) != (0, 1)) FORMAT Null; + SELECT throwIf((SELECT (sum(nm), count() % 2) FROM mv) != (0, 1)) FORMAT Null; + SELECT throwIf((SELECT (sum(nm), count() % 2) FROM dst) != (0, 1)) FORMAT Null; + SELECT throwIf((SELECT arraySort(groupArray(nm)) FROM mv) != (SELECT arraySort(groupArray(nm)) FROM dst)) FORMAT Null; + SELECT throwIf((SELECT arraySort(groupArray(nm)) FROM mv) != (SELECT arraySort(groupArray(n*m)) FROM src)) FORMAT Null; + COMMIT;" || $CLICKHOUSE_CLIENT --multiquery --query " + begin transaction; + set transaction snapshot 3; + select 'src', n, m, _part from src order by n, m; + select 'dst', nm, _part from dst order by nm; + rollback" ||: + done +} + +function thread_select_insert() +{ + set -e + trap "exit 0" INT + while true; do + $CLICKHOUSE_CLIENT --multiquery --query " + BEGIN TRANSACTION; + SELECT throwIf((SELECT count() FROM tmp) != 0) FORMAT Null; + INSERT INTO tmp SELECT 1, n*m FROM src; + INSERT INTO tmp SELECT 2, nm FROM mv; + INSERT INTO tmp 
SELECT 3, nm FROM dst; + INSERT INTO tmp SELECT 4, (*,).1 FROM (SELECT n*m FROM src UNION ALL SELECT nm FROM mv UNION ALL SELECT nm FROM dst); + SELECT throwIf((SELECT countDistinct(x) FROM tmp) != 4) FORMAT Null; + + -- now check that all results are the same + SELECT throwIf(1 != (SELECT countDistinct(arr) FROM (SELECT x, arraySort(groupArray(nm)) AS arr FROM tmp WHERE x!=4 GROUP BY x))) FORMAT Null; + SELECT throwIf((SELECT count(), sum(nm) FROM tmp WHERE x=4) != (SELECT count(), sum(nm) FROM tmp WHERE x!=4)) FORMAT Null; + ROLLBACK;" || $CLICKHOUSE_CLIENT --multiquery --query " + begin transaction; + set transaction snapshot 3; + select 'src', n, m, _part from src order by n, m; + select 'dst', nm, _part from dst order by nm; + rollback" ||: + done +} + +thread_insert_commit 1 & PID_1=$! +thread_insert_commit 2 & PID_2=$! +thread_insert_rollback 3 & PID_3=$! + +thread_optimize & PID_4=$! +thread_select & PID_5=$! +thread_select_insert & PID_6=$! +sleep 0.$RANDOM; +thread_select & PID_7=$! +thread_select_insert & PID_8=$! + +wait $PID_1 && wait $PID_2 && wait $PID_3 +kill -INT $PID_4 +kill -INT $PID_5 +kill -INT $PID_6 +kill -INT $PID_7 +kill -INT $PID_8 +wait + +$CLICKHOUSE_CLIENT --multiquery --query " +BEGIN TRANSACTION; +SELECT count(), sum(n), sum(m=1), sum(m=2), sum(m=3) FROM src; +SELECT count(), sum(nm) FROM mv"; + +$CLICKHOUSE_CLIENT --query "SELECT count(), sum(n), sum(m=1), sum(m=2), sum(m=3) FROM src" +$CLICKHOUSE_CLIENT --query "SELECT count(), sum(nm) FROM mv" + +$CLICKHOUSE_CLIENT --query "DROP TABLE src"; +$CLICKHOUSE_CLIENT --query "DROP TABLE dst"; +$CLICKHOUSE_CLIENT --query "DROP TABLE mv"; diff --git a/tests/queries/0_stateless/01172_transaction_counters.reference b/tests/queries/0_stateless/01172_transaction_counters.reference new file mode 100644 index 00000000000..1aabf8a2a38 --- /dev/null +++ b/tests/queries/0_stateless/01172_transaction_counters.reference @@ -0,0 +1,40 @@ +(0,0,'00000000-0000-0000-0000-000000000000') +1 all_1_1_0 0 +1 all_2_2_0 1 +2 all_1_1_0 1 (0,0,'00000000-0000-0000-0000-000000000000') 0 +2 all_2_2_0 0 (0,0,'00000000-0000-0000-0000-000000000000') 0 +3 all_1_1_0 0 +3 all_3_3_0 1 +4 all_1_1_0 1 (0,0,'00000000-0000-0000-0000-000000000000') 0 +4 all_2_2_0 18446744073709551615 (1,1,'00000000-0000-0000-0000-000000000000') 0 +4 all_3_3_0 0 (0,0,'00000000-0000-0000-0000-000000000000') 0 +5 1 +6 all_1_1_0 0 +6 all_3_3_0 1 +6 all_4_4_0 1 +7 all_1_1_0 (0,0,'00000000-0000-0000-0000-000000000000') 0 +7 all_3_3_0 (0,0,'00000000-0000-0000-0000-000000000000') 0 +7 all_4_4_0 (0,0,'00000000-0000-0000-0000-000000000000') 0 +8 1 +1 1 AddPart 1 1 1 1 all_1_1_0 +2 1 Begin 1 1 1 1 +2 1 AddPart 1 1 1 1 all_2_2_0 +1 1 LockPart 1 1 1 1 all_2_2_0 +2 1 Rollback 1 1 1 1 +3 1 Begin 1 1 1 1 +3 1 AddPart 1 1 1 1 all_3_3_0 +3 1 Commit 1 1 1 0 +4 1 Begin 1 1 1 1 +4 1 AddPart 1 1 1 1 all_4_4_0 +4 1 Commit 1 1 1 0 +5 1 Begin 1 1 1 1 +5 1 AddPart 1 1 1 1 all_5_5_0 +5 1 LockPart 1 1 1 1 all_1_1_0 +5 1 LockPart 1 1 1 1 all_3_3_0 +5 1 LockPart 1 1 1 1 all_4_4_0 +5 1 LockPart 1 1 1 1 all_5_5_0 +5 1 UnlockPart 1 1 1 1 all_1_1_0 +5 1 UnlockPart 1 1 1 1 all_3_3_0 +5 1 UnlockPart 1 1 1 1 all_4_4_0 +5 1 UnlockPart 1 1 1 1 all_5_5_0 +5 1 Rollback 1 1 1 1 diff --git a/tests/queries/0_stateless/01172_transaction_counters.sql b/tests/queries/0_stateless/01172_transaction_counters.sql new file mode 100644 index 00000000000..5431673fd62 --- /dev/null +++ b/tests/queries/0_stateless/01172_transaction_counters.sql @@ -0,0 +1,50 @@ +-- Tags: no-s3-storage +-- FIXME this test fails with S3 due to 
a bug in DiskCacheWrapper +drop table if exists txn_counters; + +create table txn_counters (n Int64, creation_tid DEFAULT transactionID()) engine=MergeTree order by n; + +insert into txn_counters(n) values (1); +select transactionID(); + +-- stop background cleanup +system stop merges txn_counters; + +set throw_on_unsupported_query_inside_transaction=0; + +begin transaction; +insert into txn_counters(n) values (2); +select 1, system.parts.name, txn_counters.creation_tid = system.parts.creation_tid from txn_counters join system.parts on txn_counters._part = system.parts.name where database=currentDatabase() and table='txn_counters' order by system.parts.name; +select 2, name, creation_csn, removal_tid, removal_csn from system.parts where database=currentDatabase() and table='txn_counters' order by system.parts.name; +rollback; + +begin transaction; +insert into txn_counters(n) values (3); +select 3, system.parts.name, txn_counters.creation_tid = system.parts.creation_tid from txn_counters join system.parts on txn_counters._part = system.parts.name where database=currentDatabase() and table='txn_counters' order by system.parts.name; +select 4, name, creation_csn, removal_tid, removal_csn from system.parts where database=currentDatabase() and table='txn_counters' order by system.parts.name; +select 5, transactionID().3 == serverUUID(); +commit; + +detach table txn_counters; +attach table txn_counters; + +begin transaction; +insert into txn_counters(n) values (4); +select 6, system.parts.name, txn_counters.creation_tid = system.parts.creation_tid from txn_counters join system.parts on txn_counters._part = system.parts.name where database=currentDatabase() and table='txn_counters' order by system.parts.name; +select 7, name, removal_tid, removal_csn from system.parts where database=currentDatabase() and table='txn_counters' order by system.parts.name; +select 8, transactionID().3 == serverUUID(); +commit; + +begin transaction; +insert into txn_counters(n) values (5); +alter table txn_counters drop partition id 'all'; +rollback; + +system flush logs; +select indexOf((select arraySort(groupUniqArray(tid)) from system.transactions_info_log where database=currentDatabase() and table='txn_counters'), tid), + (toDecimal64(now64(6), 6) - toDecimal64(event_time, 6)) < 100, type, thread_id!=0, length(query_id)=length(queryID()), tid_hash!=0, csn=0, part +from system.transactions_info_log +where tid in (select tid from system.transactions_info_log where database=currentDatabase() and table='txn_counters' and not (tid.1=1 and tid.2=1)) +or (database=currentDatabase() and table='txn_counters') order by event_time; + +drop table txn_counters; diff --git a/tests/queries/0_stateless/01173_transaction_control_queries.reference b/tests/queries/0_stateless/01173_transaction_control_queries.reference new file mode 100644 index 00000000000..01acdffc581 --- /dev/null +++ b/tests/queries/0_stateless/01173_transaction_control_queries.reference @@ -0,0 +1,12 @@ +commit [1,10] +rollback [1,2,10,20] +no nested [1,10] +on exception before start [1,3,10,30] +on exception while processing [1,4,10,40] +on session close [1,6,10,60] +commit [1,7,10,70] +readonly [1,7,10,70] +snapshot 2 8 +snapshot1 0 0 +snapshot3 1 +snapshot100500 2 8 diff --git a/tests/queries/0_stateless/01173_transaction_control_queries.sql b/tests/queries/0_stateless/01173_transaction_control_queries.sql new file mode 100644 index 00000000000..930a2909f7a --- /dev/null +++ b/tests/queries/0_stateless/01173_transaction_control_queries.sql @@ -0,0 +1,102 @@ 
+drop table if exists mt1;
+drop table if exists mt2;
+
+create table mt1 (n Int64) engine=MergeTree order by n;
+create table mt2 (n Int64) engine=MergeTree order by n;
+
+commit; -- { serverError INVALID_TRANSACTION }
+rollback; -- { serverError INVALID_TRANSACTION }
+
+begin transaction;
+insert into mt1 values (1);
+insert into mt2 values (10);
+select 'commit', arraySort(groupArray(n)) from (select n from mt1 union all select * from mt2);
+commit;
+
+begin transaction;
+insert into mt1 values (2);
+insert into mt2 values (20);
+select 'rollback', arraySort(groupArray(n)) from (select n from mt1 union all select * from mt2);
+rollback;
+
+begin transaction;
+select 'no nested', arraySort(groupArray(n)) from (select n from mt1 union all select * from mt2);
+begin transaction; -- { serverError INVALID_TRANSACTION }
+rollback;
+
+begin transaction;
+insert into mt1 values (3);
+insert into mt2 values (30);
+select 'on exception before start', arraySort(groupArray(n)) from (select n from mt1 union all select * from mt2);
+-- rollback on exception before start
+select functionThatDoesNotExist(); -- { serverError 46 }
+-- cannot commit after exception
+commit; -- { serverError INVALID_TRANSACTION }
+begin transaction; -- { serverError INVALID_TRANSACTION }
+rollback;
+
+begin transaction;
+insert into mt1 values (4);
+insert into mt2 values (40);
+select 'on exception while processing', arraySort(groupArray(n)) from (select n from mt1 union all select * from mt2);
+-- rollback on exception while processing
+select throwIf(100 < number) from numbers(1000); -- { serverError 395 }
+-- cannot commit after exception
+commit; -- { serverError INVALID_TRANSACTION }
+insert into mt1 values (5); -- { serverError INVALID_TRANSACTION }
+insert into mt2 values (50); -- { serverError INVALID_TRANSACTION }
+select 1; -- { serverError INVALID_TRANSACTION }
+rollback;
+
+begin transaction;
+insert into mt1 values (6);
+insert into mt2 values (60);
+select 'on session close', arraySort(groupArray(n)) from (select n from mt1 union all select * from mt2);
+-- trigger reconnection by error on client, check rollback on session close
+insert into mt1 values ([1]); -- { clientError 43 }
+commit; -- { serverError INVALID_TRANSACTION }
+rollback; -- { serverError INVALID_TRANSACTION }
+
+begin transaction;
+insert into mt1 values (7);
+insert into mt2 values (70);
+select 'commit', arraySort(groupArray(n)) from (select n from mt1 union all select * from mt2);
+commit;
+
+begin transaction;
+select 'readonly', arraySort(groupArray(n)) from (select n from mt1 union all select * from mt2);
+commit;
+
+begin transaction;
+select 'snapshot', count(), sum(n) from mt1;
+set transaction snapshot 1;
+select 'snapshot1', count(), sum(n) from mt1;
+set transaction snapshot 3;
+set throw_on_unsupported_query_inside_transaction=0;
+select 'snapshot3', count() = (select count() from system.parts where database=currentDatabase() and table='mt1' and _state in ('Active', 'Outdated')) from mt1;
+set throw_on_unsupported_query_inside_transaction=1;
+set transaction snapshot 1000000000000000;
+select 'snapshot100500', count(), sum(n) from mt1;
+set transaction snapshot 5; -- { serverError INVALID_TRANSACTION }
+rollback;
+
+begin transaction;
+create table m (n int) engine=Memory; -- { serverError 48 }
+commit; -- { serverError INVALID_TRANSACTION }
+rollback;
+
+create table m (n int) engine=Memory;
+begin transaction;
+insert into m values (1); -- { serverError 48 }
+select * from m; -- { serverError INVALID_TRANSACTION }
+commit; -- { serverError INVALID_TRANSACTION }
+rollback;
+
+begin transaction;
+select * from m; -- { serverError 48 }
+commit; -- { serverError INVALID_TRANSACTION }
+rollback;
+
+drop table m;
+drop table mt1;
+drop table mt2;
diff --git a/tests/queries/0_stateless/01174_select_insert_isolation.reference b/tests/queries/0_stateless/01174_select_insert_isolation.reference
new file mode 100644
index 00000000000..ba5f4de36ac
--- /dev/null
+++ b/tests/queries/0_stateless/01174_select_insert_isolation.reference
@@ -0,0 +1,2 @@
+200 0 100 100 0
+200 0 100 100 0
diff --git a/tests/queries/0_stateless/01174_select_insert_isolation.sh b/tests/queries/0_stateless/01174_select_insert_isolation.sh
new file mode 100755
index 00000000000..8872ab82c03
--- /dev/null
+++ b/tests/queries/0_stateless/01174_select_insert_isolation.sh
@@ -0,0 +1,66 @@
+#!/usr/bin/env bash
+# Tags: long
+
+# shellcheck disable=SC2015
+
+CURDIR=$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd)
+# shellcheck source=../shell_config.sh
+. "$CURDIR"/../shell_config.sh
+
+set -e
+
+$CLICKHOUSE_CLIENT --query "DROP TABLE IF EXISTS mt";
+$CLICKHOUSE_CLIENT --query "CREATE TABLE mt (n Int8, m Int8) ENGINE=MergeTree ORDER BY n PARTITION BY 0 < n SETTINGS old_parts_lifetime=0";
+
+function thread_insert_commit()
+{
+    for i in {1..50}; do
+        $CLICKHOUSE_CLIENT --multiquery --query "
+        BEGIN TRANSACTION;
+        INSERT INTO mt VALUES /* ($i, $1) */ ($i, $1);
+        INSERT INTO mt VALUES /* (-$i, $1) */ (-$i, $1);
+        COMMIT;";
+    done
+}
+
+function thread_insert_rollback()
+{
+    for _ in {1..50}; do
+        $CLICKHOUSE_CLIENT --multiquery --query "
+        BEGIN TRANSACTION;
+        INSERT INTO mt VALUES /* (42, $1) */ (42, $1);
+        ROLLBACK;";
+    done
+}
+
+function thread_select()
+{
+    trap "exit 0" INT
+    while true; do
+        # Result of `uniq | wc -l` must be 1 if the first and the last queries got the same result
+        $CLICKHOUSE_CLIENT --multiquery --query "
+        BEGIN TRANSACTION;
+        SELECT arraySort(groupArray(n)), arraySort(groupArray(m)), arraySort(groupArray(_part)) FROM mt;
+        SELECT throwIf((SELECT sum(n) FROM mt) != 0) FORMAT Null;
+        SELECT throwIf((SELECT count() FROM mt) % 2 != 0) FORMAT Null;
+        SELECT arraySort(groupArray(n)), arraySort(groupArray(m)), arraySort(groupArray(_part)) FROM mt;
+        COMMIT;" | uniq | wc -l | grep -v "^1$" && $CLICKHOUSE_CLIENT -q "SELECT * FROM system.parts
+        WHERE database='$CLICKHOUSE_DATABASE' AND table='mt'" ||:;
+    done
+}
+
+thread_insert_commit 1 & PID_1=$!
+thread_insert_commit 2 & PID_2=$!
+thread_insert_rollback 3 & PID_3=$!
+thread_select & PID_4=$!
+wait $PID_1 && wait $PID_2 && wait $PID_3
+kill -INT $PID_4
+wait
+
+$CLICKHOUSE_CLIENT --multiquery --query "
+BEGIN TRANSACTION;
+SELECT count(), sum(n), sum(m=1), sum(m=2), sum(m=3) FROM mt;";
+
+$CLICKHOUSE_CLIENT --query "SELECT count(), sum(n), sum(m=1), sum(m=2), sum(m=3) FROM mt;"
+
+$CLICKHOUSE_CLIENT --query "DROP TABLE mt";
diff --git a/tests/queries/0_stateless/01271_show_privileges.reference b/tests/queries/0_stateless/01271_show_privileges.reference
index b27c0d10d3b..039e438dc0a 100644
--- a/tests/queries/0_stateless/01271_show_privileges.reference
+++ b/tests/queries/0_stateless/01271_show_privileges.reference
@@ -60,6 +60,7 @@ DROP [] \N ALL
 TRUNCATE ['TRUNCATE TABLE'] TABLE ALL
 OPTIMIZE ['OPTIMIZE TABLE'] TABLE ALL
 KILL QUERY [] GLOBAL ALL
+KILL TRANSACTION [] GLOBAL ALL
 MOVE PARTITION BETWEEN SHARDS [] GLOBAL ALL
 CREATE USER [] GLOBAL ACCESS MANAGEMENT
 ALTER USER [] GLOBAL ACCESS MANAGEMENT
diff --git a/tests/queries/0_stateless/02117_show_create_table_system.reference b/tests/queries/0_stateless/02117_show_create_table_system.reference
index 9f6c0b91a30..246b8ef6d3b 100644
--- a/tests/queries/0_stateless/02117_show_create_table_system.reference
+++ b/tests/queries/0_stateless/02117_show_create_table_system.reference
@@ -20,7 +20,7 @@ CREATE TABLE system.errors\n(\n `name` String,\n `code` Int32,\n `value
 CREATE TABLE system.events\n(\n `event` String,\n `value` UInt64,\n `description` String\n)\nENGINE = SystemEvents()\nCOMMENT \'SYSTEM TABLE is built on the fly.\'
 CREATE TABLE system.formats\n(\n `name` String,\n `is_input` UInt8,\n `is_output` UInt8\n)\nENGINE = SystemFormats()\nCOMMENT \'SYSTEM TABLE is built on the fly.\'
 CREATE TABLE system.functions\n(\n `name` String,\n `is_aggregate` UInt8,\n `case_insensitive` UInt8,\n `alias_to` String,\n `create_query` String,\n `origin` Enum8(\'System\' = 0, \'SQLUserDefined\' = 1, \'ExecutableUserDefined\' = 2)\n)\nENGINE = SystemFunctions()\nCOMMENT \'SYSTEM TABLE is built on the fly.\'
-CREATE TABLE system.grants\n(\n `user_name` Nullable(String),\n `role_name` Nullable(String),\n `access_type` Enum16(\'SHOW DATABASES\' = 0, \'SHOW TABLES\' = 1, \'SHOW COLUMNS\' = 2, \'SHOW DICTIONARIES\' = 3, \'SHOW\' = 4, \'SELECT\' = 5, \'INSERT\' = 6, \'ALTER UPDATE\' = 7, \'ALTER DELETE\' = 8, \'ALTER ADD COLUMN\' = 9, \'ALTER MODIFY COLUMN\' = 10, \'ALTER DROP COLUMN\' = 11, \'ALTER COMMENT COLUMN\' = 12, \'ALTER CLEAR COLUMN\' = 13, \'ALTER RENAME COLUMN\' = 14, \'ALTER MATERIALIZE COLUMN\' = 15, \'ALTER COLUMN\' = 16, \'ALTER MODIFY COMMENT\' = 17, \'ALTER ORDER BY\' = 18, \'ALTER SAMPLE BY\' = 19, \'ALTER ADD INDEX\' = 20, \'ALTER DROP INDEX\' = 21, \'ALTER MATERIALIZE INDEX\' = 22, \'ALTER CLEAR INDEX\' = 23, \'ALTER INDEX\' = 24, \'ALTER ADD PROJECTION\' = 25, \'ALTER DROP PROJECTION\' = 26, \'ALTER MATERIALIZE PROJECTION\' = 27, \'ALTER CLEAR PROJECTION\' = 28, \'ALTER PROJECTION\' = 29, \'ALTER ADD CONSTRAINT\' = 30, \'ALTER DROP CONSTRAINT\' = 31, \'ALTER CONSTRAINT\' = 32, \'ALTER TTL\' = 33, \'ALTER MATERIALIZE TTL\' = 34, \'ALTER SETTINGS\' = 35, \'ALTER MOVE PARTITION\' = 36, \'ALTER FETCH PARTITION\' = 37, \'ALTER FREEZE PARTITION\' = 38, \'ALTER DATABASE SETTINGS\' = 39, \'ALTER TABLE\' = 40, \'ALTER DATABASE\' = 41, \'ALTER VIEW REFRESH\' = 42, \'ALTER VIEW MODIFY QUERY\' = 43, \'ALTER VIEW\' = 44, \'ALTER\' = 45, \'CREATE DATABASE\' = 46, \'CREATE TABLE\' = 47, \'CREATE VIEW\' = 48, \'CREATE DICTIONARY\' = 49, \'CREATE TEMPORARY TABLE\' = 50, \'CREATE FUNCTION\' = 51, \'CREATE\' = 52, \'DROP DATABASE\' = 53, \'DROP
TABLE\' = 54, \'DROP VIEW\' = 55, \'DROP DICTIONARY\' = 56, \'DROP FUNCTION\' = 57, \'DROP\' = 58, \'TRUNCATE\' = 59, \'OPTIMIZE\' = 60, \'KILL QUERY\' = 61, \'MOVE PARTITION BETWEEN SHARDS\' = 62, \'CREATE USER\' = 63, \'ALTER USER\' = 64, \'DROP USER\' = 65, \'CREATE ROLE\' = 66, \'ALTER ROLE\' = 67, \'DROP ROLE\' = 68, \'ROLE ADMIN\' = 69, \'CREATE ROW POLICY\' = 70, \'ALTER ROW POLICY\' = 71, \'DROP ROW POLICY\' = 72, \'CREATE QUOTA\' = 73, \'ALTER QUOTA\' = 74, \'DROP QUOTA\' = 75, \'CREATE SETTINGS PROFILE\' = 76, \'ALTER SETTINGS PROFILE\' = 77, \'DROP SETTINGS PROFILE\' = 78, \'SHOW USERS\' = 79, \'SHOW ROLES\' = 80, \'SHOW ROW POLICIES\' = 81, \'SHOW QUOTAS\' = 82, \'SHOW SETTINGS PROFILES\' = 83, \'SHOW ACCESS\' = 84, \'ACCESS MANAGEMENT\' = 85, \'SYSTEM SHUTDOWN\' = 86, \'SYSTEM DROP DNS CACHE\' = 87, \'SYSTEM DROP MARK CACHE\' = 88, \'SYSTEM DROP UNCOMPRESSED CACHE\' = 89, \'SYSTEM DROP MMAP CACHE\' = 90, \'SYSTEM DROP COMPILED EXPRESSION CACHE\' = 91, \'SYSTEM DROP CACHE\' = 92, \'SYSTEM RELOAD CONFIG\' = 93, \'SYSTEM RELOAD SYMBOLS\' = 94, \'SYSTEM RELOAD DICTIONARY\' = 95, \'SYSTEM RELOAD MODEL\' = 96, \'SYSTEM RELOAD FUNCTION\' = 97, \'SYSTEM RELOAD EMBEDDED DICTIONARIES\' = 98, \'SYSTEM RELOAD\' = 99, \'SYSTEM RESTART DISK\' = 100, \'SYSTEM MERGES\' = 101, \'SYSTEM TTL MERGES\' = 102, \'SYSTEM FETCHES\' = 103, \'SYSTEM MOVES\' = 104, \'SYSTEM DISTRIBUTED SENDS\' = 105, \'SYSTEM REPLICATED SENDS\' = 106, \'SYSTEM SENDS\' = 107, \'SYSTEM REPLICATION QUEUES\' = 108, \'SYSTEM DROP REPLICA\' = 109, \'SYSTEM SYNC REPLICA\' = 110, \'SYSTEM RESTART REPLICA\' = 111, \'SYSTEM RESTORE REPLICA\' = 112, \'SYSTEM FLUSH DISTRIBUTED\' = 113, \'SYSTEM FLUSH LOGS\' = 114, \'SYSTEM FLUSH\' = 115, \'SYSTEM THREAD FUZZER\' = 116, \'SYSTEM\' = 117, \'dictGet\' = 118, \'addressToLine\' = 119, \'addressToLineWithInlines\' = 120, \'addressToSymbol\' = 121, \'demangle\' = 122, \'INTROSPECTION\' = 123, \'FILE\' = 124, \'URL\' = 125, \'REMOTE\' = 126, \'MONGO\' = 127, \'MYSQL\' = 128, \'POSTGRES\' = 129, \'SQLITE\' = 130, \'ODBC\' = 131, \'JDBC\' = 132, \'HDFS\' = 133, \'S3\' = 134, \'HIVE\' = 135, \'SOURCES\' = 136, \'ALL\' = 137, \'NONE\' = 138),\n `database` Nullable(String),\n `table` Nullable(String),\n `column` Nullable(String),\n `is_partial_revoke` UInt8,\n `grant_option` UInt8\n)\nENGINE = SystemGrants()\nCOMMENT \'SYSTEM TABLE is built on the fly.\' +CREATE TABLE system.grants\n(\n `user_name` Nullable(String),\n `role_name` Nullable(String),\n `access_type` Enum16(\'SHOW DATABASES\' = 0, \'SHOW TABLES\' = 1, \'SHOW COLUMNS\' = 2, \'SHOW DICTIONARIES\' = 3, \'SHOW\' = 4, \'SELECT\' = 5, \'INSERT\' = 6, \'ALTER UPDATE\' = 7, \'ALTER DELETE\' = 8, \'ALTER ADD COLUMN\' = 9, \'ALTER MODIFY COLUMN\' = 10, \'ALTER DROP COLUMN\' = 11, \'ALTER COMMENT COLUMN\' = 12, \'ALTER CLEAR COLUMN\' = 13, \'ALTER RENAME COLUMN\' = 14, \'ALTER MATERIALIZE COLUMN\' = 15, \'ALTER COLUMN\' = 16, \'ALTER MODIFY COMMENT\' = 17, \'ALTER ORDER BY\' = 18, \'ALTER SAMPLE BY\' = 19, \'ALTER ADD INDEX\' = 20, \'ALTER DROP INDEX\' = 21, \'ALTER MATERIALIZE INDEX\' = 22, \'ALTER CLEAR INDEX\' = 23, \'ALTER INDEX\' = 24, \'ALTER ADD PROJECTION\' = 25, \'ALTER DROP PROJECTION\' = 26, \'ALTER MATERIALIZE PROJECTION\' = 27, \'ALTER CLEAR PROJECTION\' = 28, \'ALTER PROJECTION\' = 29, \'ALTER ADD CONSTRAINT\' = 30, \'ALTER DROP CONSTRAINT\' = 31, \'ALTER CONSTRAINT\' = 32, \'ALTER TTL\' = 33, \'ALTER MATERIALIZE TTL\' = 34, \'ALTER SETTINGS\' = 35, \'ALTER MOVE PARTITION\' = 36, \'ALTER FETCH PARTITION\' = 37, \'ALTER FREEZE 
PARTITION\' = 38, \'ALTER DATABASE SETTINGS\' = 39, \'ALTER TABLE\' = 40, \'ALTER DATABASE\' = 41, \'ALTER VIEW REFRESH\' = 42, \'ALTER VIEW MODIFY QUERY\' = 43, \'ALTER VIEW\' = 44, \'ALTER\' = 45, \'CREATE DATABASE\' = 46, \'CREATE TABLE\' = 47, \'CREATE VIEW\' = 48, \'CREATE DICTIONARY\' = 49, \'CREATE TEMPORARY TABLE\' = 50, \'CREATE FUNCTION\' = 51, \'CREATE\' = 52, \'DROP DATABASE\' = 53, \'DROP TABLE\' = 54, \'DROP VIEW\' = 55, \'DROP DICTIONARY\' = 56, \'DROP FUNCTION\' = 57, \'DROP\' = 58, \'TRUNCATE\' = 59, \'OPTIMIZE\' = 60, \'KILL QUERY\' = 61, \'KILL TRANSACTION\' = 62, \'MOVE PARTITION BETWEEN SHARDS\' = 63, \'CREATE USER\' = 64, \'ALTER USER\' = 65, \'DROP USER\' = 66, \'CREATE ROLE\' = 67, \'ALTER ROLE\' = 68, \'DROP ROLE\' = 69, \'ROLE ADMIN\' = 70, \'CREATE ROW POLICY\' = 71, \'ALTER ROW POLICY\' = 72, \'DROP ROW POLICY\' = 73, \'CREATE QUOTA\' = 74, \'ALTER QUOTA\' = 75, \'DROP QUOTA\' = 76, \'CREATE SETTINGS PROFILE\' = 77, \'ALTER SETTINGS PROFILE\' = 78, \'DROP SETTINGS PROFILE\' = 79, \'SHOW USERS\' = 80, \'SHOW ROLES\' = 81, \'SHOW ROW POLICIES\' = 82, \'SHOW QUOTAS\' = 83, \'SHOW SETTINGS PROFILES\' = 84, \'SHOW ACCESS\' = 85, \'ACCESS MANAGEMENT\' = 86, \'SYSTEM SHUTDOWN\' = 87, \'SYSTEM DROP DNS CACHE\' = 88, \'SYSTEM DROP MARK CACHE\' = 89, \'SYSTEM DROP UNCOMPRESSED CACHE\' = 90, \'SYSTEM DROP MMAP CACHE\' = 91, \'SYSTEM DROP COMPILED EXPRESSION CACHE\' = 92, \'SYSTEM DROP CACHE\' = 93, \'SYSTEM RELOAD CONFIG\' = 94, \'SYSTEM RELOAD SYMBOLS\' = 95, \'SYSTEM RELOAD DICTIONARY\' = 96, \'SYSTEM RELOAD MODEL\' = 97, \'SYSTEM RELOAD FUNCTION\' = 98, \'SYSTEM RELOAD EMBEDDED DICTIONARIES\' = 99, \'SYSTEM RELOAD\' = 100, \'SYSTEM RESTART DISK\' = 101, \'SYSTEM MERGES\' = 102, \'SYSTEM TTL MERGES\' = 103, \'SYSTEM FETCHES\' = 104, \'SYSTEM MOVES\' = 105, \'SYSTEM DISTRIBUTED SENDS\' = 106, \'SYSTEM REPLICATED SENDS\' = 107, \'SYSTEM SENDS\' = 108, \'SYSTEM REPLICATION QUEUES\' = 109, \'SYSTEM DROP REPLICA\' = 110, \'SYSTEM SYNC REPLICA\' = 111, \'SYSTEM RESTART REPLICA\' = 112, \'SYSTEM RESTORE REPLICA\' = 113, \'SYSTEM FLUSH DISTRIBUTED\' = 114, \'SYSTEM FLUSH LOGS\' = 115, \'SYSTEM FLUSH\' = 116, \'SYSTEM THREAD FUZZER\' = 117, \'SYSTEM\' = 118, \'dictGet\' = 119, \'addressToLine\' = 120, \'addressToLineWithInlines\' = 121, \'addressToSymbol\' = 122, \'demangle\' = 123, \'INTROSPECTION\' = 124, \'FILE\' = 125, \'URL\' = 126, \'REMOTE\' = 127, \'MONGO\' = 128, \'MYSQL\' = 129, \'POSTGRES\' = 130, \'SQLITE\' = 131, \'ODBC\' = 132, \'JDBC\' = 133, \'HDFS\' = 134, \'S3\' = 135, \'HIVE\' = 136, \'SOURCES\' = 137, \'ALL\' = 138, \'NONE\' = 139),\n `database` Nullable(String),\n `table` Nullable(String),\n `column` Nullable(String),\n `is_partial_revoke` UInt8,\n `grant_option` UInt8\n)\nENGINE = SystemGrants()\nCOMMENT \'SYSTEM TABLE is built on the fly.\' CREATE TABLE system.graphite_retentions\n(\n `config_name` String,\n `rule_type` String,\n `regexp` String,\n `function` String,\n `age` UInt64,\n `precision` UInt64,\n `priority` UInt16,\n `is_default` UInt8,\n `Tables.database` Array(String),\n `Tables.table` Array(String)\n)\nENGINE = SystemGraphite()\nCOMMENT \'SYSTEM TABLE is built on the fly.\' CREATE TABLE system.licenses\n(\n `library_name` String,\n `license_type` String,\n `license_path` String,\n `license_text` String\n)\nENGINE = SystemLicenses()\nCOMMENT \'SYSTEM TABLE is built on the fly.\' CREATE TABLE system.macros\n(\n `macro` String,\n `substitution` String\n)\nENGINE = SystemMacros()\nCOMMENT \'SYSTEM TABLE is built on the fly.\' @@ -33,9 +33,9 @@ 
CREATE TABLE system.numbers\n(\n `number` UInt64\n)\nENGINE = SystemNumbers() CREATE TABLE system.numbers_mt\n(\n `number` UInt64\n)\nENGINE = SystemNumbers()\nCOMMENT \'SYSTEM TABLE is built on the fly.\' CREATE TABLE system.one\n(\n `dummy` UInt8\n)\nENGINE = SystemOne()\nCOMMENT \'SYSTEM TABLE is built on the fly.\' CREATE TABLE system.part_moves_between_shards\n(\n `database` String,\n `table` String,\n `task_name` String,\n `task_uuid` UUID,\n `create_time` DateTime,\n `part_name` String,\n `part_uuid` UUID,\n `to_shard` String,\n `dst_part_name` String,\n `update_time` DateTime,\n `state` String,\n `rollback` UInt8,\n `num_tries` UInt32,\n `last_exception` String\n)\nENGINE = SystemShardMoves()\nCOMMENT \'SYSTEM TABLE is built on the fly.\' -CREATE TABLE system.parts\n(\n `partition` String,\n `name` String,\n `uuid` UUID,\n `part_type` String,\n `active` UInt8,\n `marks` UInt64,\n `rows` UInt64,\n `bytes_on_disk` UInt64,\n `data_compressed_bytes` UInt64,\n `data_uncompressed_bytes` UInt64,\n `marks_bytes` UInt64,\n `secondary_indices_compressed_bytes` UInt64,\n `secondary_indices_uncompressed_bytes` UInt64,\n `secondary_indices_marks_bytes` UInt64,\n `modification_time` DateTime,\n `remove_time` DateTime,\n `refcount` UInt32,\n `min_date` Date,\n `max_date` Date,\n `min_time` DateTime,\n `max_time` DateTime,\n `partition_id` String,\n `min_block_number` Int64,\n `max_block_number` Int64,\n `level` UInt32,\n `data_version` UInt64,\n `primary_key_bytes_in_memory` UInt64,\n `primary_key_bytes_in_memory_allocated` UInt64,\n `is_frozen` UInt8,\n `database` String,\n `table` String,\n `engine` String,\n `disk_name` String,\n `path` String,\n `hash_of_all_files` String,\n `hash_of_uncompressed_files` String,\n `uncompressed_hash_of_compressed_files` String,\n `delete_ttl_info_min` DateTime,\n `delete_ttl_info_max` DateTime,\n `move_ttl_info.expression` Array(String),\n `move_ttl_info.min` Array(DateTime),\n `move_ttl_info.max` Array(DateTime),\n `default_compression_codec` String,\n `recompression_ttl_info.expression` Array(String),\n `recompression_ttl_info.min` Array(DateTime),\n `recompression_ttl_info.max` Array(DateTime),\n `group_by_ttl_info.expression` Array(String),\n `group_by_ttl_info.min` Array(DateTime),\n `group_by_ttl_info.max` Array(DateTime),\n `rows_where_ttl_info.expression` Array(String),\n `rows_where_ttl_info.min` Array(DateTime),\n `rows_where_ttl_info.max` Array(DateTime),\n `projections` Array(String),\n `bytes` UInt64,\n `marks_size` UInt64\n)\nENGINE = SystemParts()\nCOMMENT \'SYSTEM TABLE is built on the fly.\' +CREATE TABLE system.parts\n(\n `partition` String,\n `name` String,\n `uuid` UUID,\n `part_type` String,\n `active` UInt8,\n `marks` UInt64,\n `rows` UInt64,\n `bytes_on_disk` UInt64,\n `data_compressed_bytes` UInt64,\n `data_uncompressed_bytes` UInt64,\n `marks_bytes` UInt64,\n `secondary_indices_compressed_bytes` UInt64,\n `secondary_indices_uncompressed_bytes` UInt64,\n `secondary_indices_marks_bytes` UInt64,\n `modification_time` DateTime,\n `remove_time` DateTime,\n `refcount` UInt32,\n `min_date` Date,\n `max_date` Date,\n `min_time` DateTime,\n `max_time` DateTime,\n `partition_id` String,\n `min_block_number` Int64,\n `max_block_number` Int64,\n `level` UInt32,\n `data_version` UInt64,\n `primary_key_bytes_in_memory` UInt64,\n `primary_key_bytes_in_memory_allocated` UInt64,\n `is_frozen` UInt8,\n `database` String,\n `table` String,\n `engine` String,\n `disk_name` String,\n `path` String,\n `hash_of_all_files` String,\n 
`hash_of_uncompressed_files` String,\n `uncompressed_hash_of_compressed_files` String,\n `delete_ttl_info_min` DateTime,\n `delete_ttl_info_max` DateTime,\n `move_ttl_info.expression` Array(String),\n `move_ttl_info.min` Array(DateTime),\n `move_ttl_info.max` Array(DateTime),\n `default_compression_codec` String,\n `recompression_ttl_info.expression` Array(String),\n `recompression_ttl_info.min` Array(DateTime),\n `recompression_ttl_info.max` Array(DateTime),\n `group_by_ttl_info.expression` Array(String),\n `group_by_ttl_info.min` Array(DateTime),\n `group_by_ttl_info.max` Array(DateTime),\n `rows_where_ttl_info.expression` Array(String),\n `rows_where_ttl_info.min` Array(DateTime),\n `rows_where_ttl_info.max` Array(DateTime),\n `projections` Array(String),\n `visible` UInt8,\n `creation_tid` Tuple(UInt64, UInt64, UUID),\n `removal_tid` Tuple(UInt64, UInt64, UUID),\n `creation_csn` UInt64,\n `removal_csn` UInt64,\n `bytes` UInt64,\n `marks_size` UInt64\n)\nENGINE = SystemParts()\nCOMMENT \'SYSTEM TABLE is built on the fly.\' CREATE TABLE system.parts_columns\n(\n `partition` String,\n `name` String,\n `uuid` UUID,\n `part_type` String,\n `active` UInt8,\n `marks` UInt64,\n `rows` UInt64,\n `bytes_on_disk` UInt64,\n `data_compressed_bytes` UInt64,\n `data_uncompressed_bytes` UInt64,\n `marks_bytes` UInt64,\n `modification_time` DateTime,\n `remove_time` DateTime,\n `refcount` UInt32,\n `min_date` Date,\n `max_date` Date,\n `min_time` DateTime,\n `max_time` DateTime,\n `partition_id` String,\n `min_block_number` Int64,\n `max_block_number` Int64,\n `level` UInt32,\n `data_version` UInt64,\n `primary_key_bytes_in_memory` UInt64,\n `primary_key_bytes_in_memory_allocated` UInt64,\n `database` String,\n `table` String,\n `engine` String,\n `disk_name` String,\n `path` String,\n `column` String,\n `type` String,\n `column_position` UInt64,\n `default_kind` String,\n `default_expression` String,\n `column_bytes_on_disk` UInt64,\n `column_data_compressed_bytes` UInt64,\n `column_data_uncompressed_bytes` UInt64,\n `column_marks_bytes` UInt64,\n `serialization_kind` String,\n `subcolumns.names` Array(String),\n `subcolumns.types` Array(String),\n `subcolumns.serializations` Array(String),\n `subcolumns.bytes_on_disk` Array(UInt64),\n `subcolumns.data_compressed_bytes` Array(UInt64),\n `subcolumns.data_uncompressed_bytes` Array(UInt64),\n `subcolumns.marks_bytes` Array(UInt64),\n `bytes` UInt64,\n `marks_size` UInt64\n)\nENGINE = SystemPartsColumns()\nCOMMENT \'SYSTEM TABLE is built on the fly.\' -CREATE TABLE system.privileges\n(\n `privilege` Enum16(\'SHOW DATABASES\' = 0, \'SHOW TABLES\' = 1, \'SHOW COLUMNS\' = 2, \'SHOW DICTIONARIES\' = 3, \'SHOW\' = 4, \'SELECT\' = 5, \'INSERT\' = 6, \'ALTER UPDATE\' = 7, \'ALTER DELETE\' = 8, \'ALTER ADD COLUMN\' = 9, \'ALTER MODIFY COLUMN\' = 10, \'ALTER DROP COLUMN\' = 11, \'ALTER COMMENT COLUMN\' = 12, \'ALTER CLEAR COLUMN\' = 13, \'ALTER RENAME COLUMN\' = 14, \'ALTER MATERIALIZE COLUMN\' = 15, \'ALTER COLUMN\' = 16, \'ALTER MODIFY COMMENT\' = 17, \'ALTER ORDER BY\' = 18, \'ALTER SAMPLE BY\' = 19, \'ALTER ADD INDEX\' = 20, \'ALTER DROP INDEX\' = 21, \'ALTER MATERIALIZE INDEX\' = 22, \'ALTER CLEAR INDEX\' = 23, \'ALTER INDEX\' = 24, \'ALTER ADD PROJECTION\' = 25, \'ALTER DROP PROJECTION\' = 26, \'ALTER MATERIALIZE PROJECTION\' = 27, \'ALTER CLEAR PROJECTION\' = 28, \'ALTER PROJECTION\' = 29, \'ALTER ADD CONSTRAINT\' = 30, \'ALTER DROP CONSTRAINT\' = 31, \'ALTER CONSTRAINT\' = 32, \'ALTER TTL\' = 33, \'ALTER MATERIALIZE TTL\' = 34, \'ALTER SETTINGS\' = 35, 
\'ALTER MOVE PARTITION\' = 36, \'ALTER FETCH PARTITION\' = 37, \'ALTER FREEZE PARTITION\' = 38, \'ALTER DATABASE SETTINGS\' = 39, \'ALTER TABLE\' = 40, \'ALTER DATABASE\' = 41, \'ALTER VIEW REFRESH\' = 42, \'ALTER VIEW MODIFY QUERY\' = 43, \'ALTER VIEW\' = 44, \'ALTER\' = 45, \'CREATE DATABASE\' = 46, \'CREATE TABLE\' = 47, \'CREATE VIEW\' = 48, \'CREATE DICTIONARY\' = 49, \'CREATE TEMPORARY TABLE\' = 50, \'CREATE FUNCTION\' = 51, \'CREATE\' = 52, \'DROP DATABASE\' = 53, \'DROP TABLE\' = 54, \'DROP VIEW\' = 55, \'DROP DICTIONARY\' = 56, \'DROP FUNCTION\' = 57, \'DROP\' = 58, \'TRUNCATE\' = 59, \'OPTIMIZE\' = 60, \'KILL QUERY\' = 61, \'MOVE PARTITION BETWEEN SHARDS\' = 62, \'CREATE USER\' = 63, \'ALTER USER\' = 64, \'DROP USER\' = 65, \'CREATE ROLE\' = 66, \'ALTER ROLE\' = 67, \'DROP ROLE\' = 68, \'ROLE ADMIN\' = 69, \'CREATE ROW POLICY\' = 70, \'ALTER ROW POLICY\' = 71, \'DROP ROW POLICY\' = 72, \'CREATE QUOTA\' = 73, \'ALTER QUOTA\' = 74, \'DROP QUOTA\' = 75, \'CREATE SETTINGS PROFILE\' = 76, \'ALTER SETTINGS PROFILE\' = 77, \'DROP SETTINGS PROFILE\' = 78, \'SHOW USERS\' = 79, \'SHOW ROLES\' = 80, \'SHOW ROW POLICIES\' = 81, \'SHOW QUOTAS\' = 82, \'SHOW SETTINGS PROFILES\' = 83, \'SHOW ACCESS\' = 84, \'ACCESS MANAGEMENT\' = 85, \'SYSTEM SHUTDOWN\' = 86, \'SYSTEM DROP DNS CACHE\' = 87, \'SYSTEM DROP MARK CACHE\' = 88, \'SYSTEM DROP UNCOMPRESSED CACHE\' = 89, \'SYSTEM DROP MMAP CACHE\' = 90, \'SYSTEM DROP COMPILED EXPRESSION CACHE\' = 91, \'SYSTEM DROP CACHE\' = 92, \'SYSTEM RELOAD CONFIG\' = 93, \'SYSTEM RELOAD SYMBOLS\' = 94, \'SYSTEM RELOAD DICTIONARY\' = 95, \'SYSTEM RELOAD MODEL\' = 96, \'SYSTEM RELOAD FUNCTION\' = 97, \'SYSTEM RELOAD EMBEDDED DICTIONARIES\' = 98, \'SYSTEM RELOAD\' = 99, \'SYSTEM RESTART DISK\' = 100, \'SYSTEM MERGES\' = 101, \'SYSTEM TTL MERGES\' = 102, \'SYSTEM FETCHES\' = 103, \'SYSTEM MOVES\' = 104, \'SYSTEM DISTRIBUTED SENDS\' = 105, \'SYSTEM REPLICATED SENDS\' = 106, \'SYSTEM SENDS\' = 107, \'SYSTEM REPLICATION QUEUES\' = 108, \'SYSTEM DROP REPLICA\' = 109, \'SYSTEM SYNC REPLICA\' = 110, \'SYSTEM RESTART REPLICA\' = 111, \'SYSTEM RESTORE REPLICA\' = 112, \'SYSTEM FLUSH DISTRIBUTED\' = 113, \'SYSTEM FLUSH LOGS\' = 114, \'SYSTEM FLUSH\' = 115, \'SYSTEM THREAD FUZZER\' = 116, \'SYSTEM\' = 117, \'dictGet\' = 118, \'addressToLine\' = 119, \'addressToLineWithInlines\' = 120, \'addressToSymbol\' = 121, \'demangle\' = 122, \'INTROSPECTION\' = 123, \'FILE\' = 124, \'URL\' = 125, \'REMOTE\' = 126, \'MONGO\' = 127, \'MYSQL\' = 128, \'POSTGRES\' = 129, \'SQLITE\' = 130, \'ODBC\' = 131, \'JDBC\' = 132, \'HDFS\' = 133, \'S3\' = 134, \'HIVE\' = 135, \'SOURCES\' = 136, \'ALL\' = 137, \'NONE\' = 138),\n `aliases` Array(String),\n `level` Nullable(Enum8(\'GLOBAL\' = 0, \'DATABASE\' = 1, \'TABLE\' = 2, \'DICTIONARY\' = 3, \'VIEW\' = 4, \'COLUMN\' = 5)),\n `parent_group` Nullable(Enum16(\'SHOW DATABASES\' = 0, \'SHOW TABLES\' = 1, \'SHOW COLUMNS\' = 2, \'SHOW DICTIONARIES\' = 3, \'SHOW\' = 4, \'SELECT\' = 5, \'INSERT\' = 6, \'ALTER UPDATE\' = 7, \'ALTER DELETE\' = 8, \'ALTER ADD COLUMN\' = 9, \'ALTER MODIFY COLUMN\' = 10, \'ALTER DROP COLUMN\' = 11, \'ALTER COMMENT COLUMN\' = 12, \'ALTER CLEAR COLUMN\' = 13, \'ALTER RENAME COLUMN\' = 14, \'ALTER MATERIALIZE COLUMN\' = 15, \'ALTER COLUMN\' = 16, \'ALTER MODIFY COMMENT\' = 17, \'ALTER ORDER BY\' = 18, \'ALTER SAMPLE BY\' = 19, \'ALTER ADD INDEX\' = 20, \'ALTER DROP INDEX\' = 21, \'ALTER MATERIALIZE INDEX\' = 22, \'ALTER CLEAR INDEX\' = 23, \'ALTER INDEX\' = 24, \'ALTER ADD PROJECTION\' = 25, \'ALTER DROP PROJECTION\' = 26, \'ALTER 
MATERIALIZE PROJECTION\' = 27, \'ALTER CLEAR PROJECTION\' = 28, \'ALTER PROJECTION\' = 29, \'ALTER ADD CONSTRAINT\' = 30, \'ALTER DROP CONSTRAINT\' = 31, \'ALTER CONSTRAINT\' = 32, \'ALTER TTL\' = 33, \'ALTER MATERIALIZE TTL\' = 34, \'ALTER SETTINGS\' = 35, \'ALTER MOVE PARTITION\' = 36, \'ALTER FETCH PARTITION\' = 37, \'ALTER FREEZE PARTITION\' = 38, \'ALTER DATABASE SETTINGS\' = 39, \'ALTER TABLE\' = 40, \'ALTER DATABASE\' = 41, \'ALTER VIEW REFRESH\' = 42, \'ALTER VIEW MODIFY QUERY\' = 43, \'ALTER VIEW\' = 44, \'ALTER\' = 45, \'CREATE DATABASE\' = 46, \'CREATE TABLE\' = 47, \'CREATE VIEW\' = 48, \'CREATE DICTIONARY\' = 49, \'CREATE TEMPORARY TABLE\' = 50, \'CREATE FUNCTION\' = 51, \'CREATE\' = 52, \'DROP DATABASE\' = 53, \'DROP TABLE\' = 54, \'DROP VIEW\' = 55, \'DROP DICTIONARY\' = 56, \'DROP FUNCTION\' = 57, \'DROP\' = 58, \'TRUNCATE\' = 59, \'OPTIMIZE\' = 60, \'KILL QUERY\' = 61, \'MOVE PARTITION BETWEEN SHARDS\' = 62, \'CREATE USER\' = 63, \'ALTER USER\' = 64, \'DROP USER\' = 65, \'CREATE ROLE\' = 66, \'ALTER ROLE\' = 67, \'DROP ROLE\' = 68, \'ROLE ADMIN\' = 69, \'CREATE ROW POLICY\' = 70, \'ALTER ROW POLICY\' = 71, \'DROP ROW POLICY\' = 72, \'CREATE QUOTA\' = 73, \'ALTER QUOTA\' = 74, \'DROP QUOTA\' = 75, \'CREATE SETTINGS PROFILE\' = 76, \'ALTER SETTINGS PROFILE\' = 77, \'DROP SETTINGS PROFILE\' = 78, \'SHOW USERS\' = 79, \'SHOW ROLES\' = 80, \'SHOW ROW POLICIES\' = 81, \'SHOW QUOTAS\' = 82, \'SHOW SETTINGS PROFILES\' = 83, \'SHOW ACCESS\' = 84, \'ACCESS MANAGEMENT\' = 85, \'SYSTEM SHUTDOWN\' = 86, \'SYSTEM DROP DNS CACHE\' = 87, \'SYSTEM DROP MARK CACHE\' = 88, \'SYSTEM DROP UNCOMPRESSED CACHE\' = 89, \'SYSTEM DROP MMAP CACHE\' = 90, \'SYSTEM DROP COMPILED EXPRESSION CACHE\' = 91, \'SYSTEM DROP CACHE\' = 92, \'SYSTEM RELOAD CONFIG\' = 93, \'SYSTEM RELOAD SYMBOLS\' = 94, \'SYSTEM RELOAD DICTIONARY\' = 95, \'SYSTEM RELOAD MODEL\' = 96, \'SYSTEM RELOAD FUNCTION\' = 97, \'SYSTEM RELOAD EMBEDDED DICTIONARIES\' = 98, \'SYSTEM RELOAD\' = 99, \'SYSTEM RESTART DISK\' = 100, \'SYSTEM MERGES\' = 101, \'SYSTEM TTL MERGES\' = 102, \'SYSTEM FETCHES\' = 103, \'SYSTEM MOVES\' = 104, \'SYSTEM DISTRIBUTED SENDS\' = 105, \'SYSTEM REPLICATED SENDS\' = 106, \'SYSTEM SENDS\' = 107, \'SYSTEM REPLICATION QUEUES\' = 108, \'SYSTEM DROP REPLICA\' = 109, \'SYSTEM SYNC REPLICA\' = 110, \'SYSTEM RESTART REPLICA\' = 111, \'SYSTEM RESTORE REPLICA\' = 112, \'SYSTEM FLUSH DISTRIBUTED\' = 113, \'SYSTEM FLUSH LOGS\' = 114, \'SYSTEM FLUSH\' = 115, \'SYSTEM THREAD FUZZER\' = 116, \'SYSTEM\' = 117, \'dictGet\' = 118, \'addressToLine\' = 119, \'addressToLineWithInlines\' = 120, \'addressToSymbol\' = 121, \'demangle\' = 122, \'INTROSPECTION\' = 123, \'FILE\' = 124, \'URL\' = 125, \'REMOTE\' = 126, \'MONGO\' = 127, \'MYSQL\' = 128, \'POSTGRES\' = 129, \'SQLITE\' = 130, \'ODBC\' = 131, \'JDBC\' = 132, \'HDFS\' = 133, \'S3\' = 134, \'HIVE\' = 135, \'SOURCES\' = 136, \'ALL\' = 137, \'NONE\' = 138))\n)\nENGINE = SystemPrivileges()\nCOMMENT \'SYSTEM TABLE is built on the fly.\' +CREATE TABLE system.privileges\n(\n `privilege` Enum16(\'SHOW DATABASES\' = 0, \'SHOW TABLES\' = 1, \'SHOW COLUMNS\' = 2, \'SHOW DICTIONARIES\' = 3, \'SHOW\' = 4, \'SELECT\' = 5, \'INSERT\' = 6, \'ALTER UPDATE\' = 7, \'ALTER DELETE\' = 8, \'ALTER ADD COLUMN\' = 9, \'ALTER MODIFY COLUMN\' = 10, \'ALTER DROP COLUMN\' = 11, \'ALTER COMMENT COLUMN\' = 12, \'ALTER CLEAR COLUMN\' = 13, \'ALTER RENAME COLUMN\' = 14, \'ALTER MATERIALIZE COLUMN\' = 15, \'ALTER COLUMN\' = 16, \'ALTER MODIFY COMMENT\' = 17, \'ALTER ORDER BY\' = 18, \'ALTER SAMPLE BY\' = 19, 
\'ALTER ADD INDEX\' = 20, \'ALTER DROP INDEX\' = 21, \'ALTER MATERIALIZE INDEX\' = 22, \'ALTER CLEAR INDEX\' = 23, \'ALTER INDEX\' = 24, \'ALTER ADD PROJECTION\' = 25, \'ALTER DROP PROJECTION\' = 26, \'ALTER MATERIALIZE PROJECTION\' = 27, \'ALTER CLEAR PROJECTION\' = 28, \'ALTER PROJECTION\' = 29, \'ALTER ADD CONSTRAINT\' = 30, \'ALTER DROP CONSTRAINT\' = 31, \'ALTER CONSTRAINT\' = 32, \'ALTER TTL\' = 33, \'ALTER MATERIALIZE TTL\' = 34, \'ALTER SETTINGS\' = 35, \'ALTER MOVE PARTITION\' = 36, \'ALTER FETCH PARTITION\' = 37, \'ALTER FREEZE PARTITION\' = 38, \'ALTER DATABASE SETTINGS\' = 39, \'ALTER TABLE\' = 40, \'ALTER DATABASE\' = 41, \'ALTER VIEW REFRESH\' = 42, \'ALTER VIEW MODIFY QUERY\' = 43, \'ALTER VIEW\' = 44, \'ALTER\' = 45, \'CREATE DATABASE\' = 46, \'CREATE TABLE\' = 47, \'CREATE VIEW\' = 48, \'CREATE DICTIONARY\' = 49, \'CREATE TEMPORARY TABLE\' = 50, \'CREATE FUNCTION\' = 51, \'CREATE\' = 52, \'DROP DATABASE\' = 53, \'DROP TABLE\' = 54, \'DROP VIEW\' = 55, \'DROP DICTIONARY\' = 56, \'DROP FUNCTION\' = 57, \'DROP\' = 58, \'TRUNCATE\' = 59, \'OPTIMIZE\' = 60, \'KILL QUERY\' = 61, \'KILL TRANSACTION\' = 62, \'MOVE PARTITION BETWEEN SHARDS\' = 63, \'CREATE USER\' = 64, \'ALTER USER\' = 65, \'DROP USER\' = 66, \'CREATE ROLE\' = 67, \'ALTER ROLE\' = 68, \'DROP ROLE\' = 69, \'ROLE ADMIN\' = 70, \'CREATE ROW POLICY\' = 71, \'ALTER ROW POLICY\' = 72, \'DROP ROW POLICY\' = 73, \'CREATE QUOTA\' = 74, \'ALTER QUOTA\' = 75, \'DROP QUOTA\' = 76, \'CREATE SETTINGS PROFILE\' = 77, \'ALTER SETTINGS PROFILE\' = 78, \'DROP SETTINGS PROFILE\' = 79, \'SHOW USERS\' = 80, \'SHOW ROLES\' = 81, \'SHOW ROW POLICIES\' = 82, \'SHOW QUOTAS\' = 83, \'SHOW SETTINGS PROFILES\' = 84, \'SHOW ACCESS\' = 85, \'ACCESS MANAGEMENT\' = 86, \'SYSTEM SHUTDOWN\' = 87, \'SYSTEM DROP DNS CACHE\' = 88, \'SYSTEM DROP MARK CACHE\' = 89, \'SYSTEM DROP UNCOMPRESSED CACHE\' = 90, \'SYSTEM DROP MMAP CACHE\' = 91, \'SYSTEM DROP COMPILED EXPRESSION CACHE\' = 92, \'SYSTEM DROP CACHE\' = 93, \'SYSTEM RELOAD CONFIG\' = 94, \'SYSTEM RELOAD SYMBOLS\' = 95, \'SYSTEM RELOAD DICTIONARY\' = 96, \'SYSTEM RELOAD MODEL\' = 97, \'SYSTEM RELOAD FUNCTION\' = 98, \'SYSTEM RELOAD EMBEDDED DICTIONARIES\' = 99, \'SYSTEM RELOAD\' = 100, \'SYSTEM RESTART DISK\' = 101, \'SYSTEM MERGES\' = 102, \'SYSTEM TTL MERGES\' = 103, \'SYSTEM FETCHES\' = 104, \'SYSTEM MOVES\' = 105, \'SYSTEM DISTRIBUTED SENDS\' = 106, \'SYSTEM REPLICATED SENDS\' = 107, \'SYSTEM SENDS\' = 108, \'SYSTEM REPLICATION QUEUES\' = 109, \'SYSTEM DROP REPLICA\' = 110, \'SYSTEM SYNC REPLICA\' = 111, \'SYSTEM RESTART REPLICA\' = 112, \'SYSTEM RESTORE REPLICA\' = 113, \'SYSTEM FLUSH DISTRIBUTED\' = 114, \'SYSTEM FLUSH LOGS\' = 115, \'SYSTEM FLUSH\' = 116, \'SYSTEM THREAD FUZZER\' = 117, \'SYSTEM\' = 118, \'dictGet\' = 119, \'addressToLine\' = 120, \'addressToLineWithInlines\' = 121, \'addressToSymbol\' = 122, \'demangle\' = 123, \'INTROSPECTION\' = 124, \'FILE\' = 125, \'URL\' = 126, \'REMOTE\' = 127, \'MONGO\' = 128, \'MYSQL\' = 129, \'POSTGRES\' = 130, \'SQLITE\' = 131, \'ODBC\' = 132, \'JDBC\' = 133, \'HDFS\' = 134, \'S3\' = 135, \'HIVE\' = 136, \'SOURCES\' = 137, \'ALL\' = 138, \'NONE\' = 139),\n `aliases` Array(String),\n `level` Nullable(Enum8(\'GLOBAL\' = 0, \'DATABASE\' = 1, \'TABLE\' = 2, \'DICTIONARY\' = 3, \'VIEW\' = 4, \'COLUMN\' = 5)),\n `parent_group` Nullable(Enum16(\'SHOW DATABASES\' = 0, \'SHOW TABLES\' = 1, \'SHOW COLUMNS\' = 2, \'SHOW DICTIONARIES\' = 3, \'SHOW\' = 4, \'SELECT\' = 5, \'INSERT\' = 6, \'ALTER UPDATE\' = 7, \'ALTER DELETE\' = 8, \'ALTER ADD COLUMN\' = 9, 
\'ALTER MODIFY COLUMN\' = 10, \'ALTER DROP COLUMN\' = 11, \'ALTER COMMENT COLUMN\' = 12, \'ALTER CLEAR COLUMN\' = 13, \'ALTER RENAME COLUMN\' = 14, \'ALTER MATERIALIZE COLUMN\' = 15, \'ALTER COLUMN\' = 16, \'ALTER MODIFY COMMENT\' = 17, \'ALTER ORDER BY\' = 18, \'ALTER SAMPLE BY\' = 19, \'ALTER ADD INDEX\' = 20, \'ALTER DROP INDEX\' = 21, \'ALTER MATERIALIZE INDEX\' = 22, \'ALTER CLEAR INDEX\' = 23, \'ALTER INDEX\' = 24, \'ALTER ADD PROJECTION\' = 25, \'ALTER DROP PROJECTION\' = 26, \'ALTER MATERIALIZE PROJECTION\' = 27, \'ALTER CLEAR PROJECTION\' = 28, \'ALTER PROJECTION\' = 29, \'ALTER ADD CONSTRAINT\' = 30, \'ALTER DROP CONSTRAINT\' = 31, \'ALTER CONSTRAINT\' = 32, \'ALTER TTL\' = 33, \'ALTER MATERIALIZE TTL\' = 34, \'ALTER SETTINGS\' = 35, \'ALTER MOVE PARTITION\' = 36, \'ALTER FETCH PARTITION\' = 37, \'ALTER FREEZE PARTITION\' = 38, \'ALTER DATABASE SETTINGS\' = 39, \'ALTER TABLE\' = 40, \'ALTER DATABASE\' = 41, \'ALTER VIEW REFRESH\' = 42, \'ALTER VIEW MODIFY QUERY\' = 43, \'ALTER VIEW\' = 44, \'ALTER\' = 45, \'CREATE DATABASE\' = 46, \'CREATE TABLE\' = 47, \'CREATE VIEW\' = 48, \'CREATE DICTIONARY\' = 49, \'CREATE TEMPORARY TABLE\' = 50, \'CREATE FUNCTION\' = 51, \'CREATE\' = 52, \'DROP DATABASE\' = 53, \'DROP TABLE\' = 54, \'DROP VIEW\' = 55, \'DROP DICTIONARY\' = 56, \'DROP FUNCTION\' = 57, \'DROP\' = 58, \'TRUNCATE\' = 59, \'OPTIMIZE\' = 60, \'KILL QUERY\' = 61, \'KILL TRANSACTION\' = 62, \'MOVE PARTITION BETWEEN SHARDS\' = 63, \'CREATE USER\' = 64, \'ALTER USER\' = 65, \'DROP USER\' = 66, \'CREATE ROLE\' = 67, \'ALTER ROLE\' = 68, \'DROP ROLE\' = 69, \'ROLE ADMIN\' = 70, \'CREATE ROW POLICY\' = 71, \'ALTER ROW POLICY\' = 72, \'DROP ROW POLICY\' = 73, \'CREATE QUOTA\' = 74, \'ALTER QUOTA\' = 75, \'DROP QUOTA\' = 76, \'CREATE SETTINGS PROFILE\' = 77, \'ALTER SETTINGS PROFILE\' = 78, \'DROP SETTINGS PROFILE\' = 79, \'SHOW USERS\' = 80, \'SHOW ROLES\' = 81, \'SHOW ROW POLICIES\' = 82, \'SHOW QUOTAS\' = 83, \'SHOW SETTINGS PROFILES\' = 84, \'SHOW ACCESS\' = 85, \'ACCESS MANAGEMENT\' = 86, \'SYSTEM SHUTDOWN\' = 87, \'SYSTEM DROP DNS CACHE\' = 88, \'SYSTEM DROP MARK CACHE\' = 89, \'SYSTEM DROP UNCOMPRESSED CACHE\' = 90, \'SYSTEM DROP MMAP CACHE\' = 91, \'SYSTEM DROP COMPILED EXPRESSION CACHE\' = 92, \'SYSTEM DROP CACHE\' = 93, \'SYSTEM RELOAD CONFIG\' = 94, \'SYSTEM RELOAD SYMBOLS\' = 95, \'SYSTEM RELOAD DICTIONARY\' = 96, \'SYSTEM RELOAD MODEL\' = 97, \'SYSTEM RELOAD FUNCTION\' = 98, \'SYSTEM RELOAD EMBEDDED DICTIONARIES\' = 99, \'SYSTEM RELOAD\' = 100, \'SYSTEM RESTART DISK\' = 101, \'SYSTEM MERGES\' = 102, \'SYSTEM TTL MERGES\' = 103, \'SYSTEM FETCHES\' = 104, \'SYSTEM MOVES\' = 105, \'SYSTEM DISTRIBUTED SENDS\' = 106, \'SYSTEM REPLICATED SENDS\' = 107, \'SYSTEM SENDS\' = 108, \'SYSTEM REPLICATION QUEUES\' = 109, \'SYSTEM DROP REPLICA\' = 110, \'SYSTEM SYNC REPLICA\' = 111, \'SYSTEM RESTART REPLICA\' = 112, \'SYSTEM RESTORE REPLICA\' = 113, \'SYSTEM FLUSH DISTRIBUTED\' = 114, \'SYSTEM FLUSH LOGS\' = 115, \'SYSTEM FLUSH\' = 116, \'SYSTEM THREAD FUZZER\' = 117, \'SYSTEM\' = 118, \'dictGet\' = 119, \'addressToLine\' = 120, \'addressToLineWithInlines\' = 121, \'addressToSymbol\' = 122, \'demangle\' = 123, \'INTROSPECTION\' = 124, \'FILE\' = 125, \'URL\' = 126, \'REMOTE\' = 127, \'MONGO\' = 128, \'MYSQL\' = 129, \'POSTGRES\' = 130, \'SQLITE\' = 131, \'ODBC\' = 132, \'JDBC\' = 133, \'HDFS\' = 134, \'S3\' = 135, \'HIVE\' = 136, \'SOURCES\' = 137, \'ALL\' = 138, \'NONE\' = 139))\n)\nENGINE = SystemPrivileges()\nCOMMENT \'SYSTEM TABLE is built on the fly.\' CREATE TABLE system.processes\n(\n 
`is_initial_query` UInt8,\n `user` String,\n `query_id` String,\n `address` IPv6,\n `port` UInt16,\n `initial_user` String,\n `initial_query_id` String,\n `initial_address` IPv6,\n `initial_port` UInt16,\n `interface` UInt8,\n `os_user` String,\n `client_hostname` String,\n `client_name` String,\n `client_revision` UInt64,\n `client_version_major` UInt64,\n `client_version_minor` UInt64,\n `client_version_patch` UInt64,\n `http_method` UInt8,\n `http_user_agent` String,\n `http_referer` String,\n `forwarded_for` String,\n `quota_key` String,\n `distributed_depth` UInt64,\n `elapsed` Float64,\n `is_cancelled` UInt8,\n `read_rows` UInt64,\n `read_bytes` UInt64,\n `total_rows_approx` UInt64,\n `written_rows` UInt64,\n `written_bytes` UInt64,\n `memory_usage` Int64,\n `peak_memory_usage` Int64,\n `query` String,\n `thread_ids` Array(UInt64),\n `ProfileEvents` Map(String, UInt64),\n `Settings` Map(String, String),\n `current_database` String,\n `ProfileEvents.Names` Array(String),\n `ProfileEvents.Values` Array(UInt64),\n `Settings.Names` Array(String),\n `Settings.Values` Array(String)\n)\nENGINE = SystemProcesses()\nCOMMENT \'SYSTEM TABLE is built on the fly.\' CREATE TABLE system.projection_parts\n(\n `partition` String,\n `name` String,\n `part_type` String,\n `parent_name` String,\n `parent_uuid` UUID,\n `parent_part_type` String,\n `active` UInt8,\n `marks` UInt64,\n `rows` UInt64,\n `bytes_on_disk` UInt64,\n `data_compressed_bytes` UInt64,\n `data_uncompressed_bytes` UInt64,\n `marks_bytes` UInt64,\n `parent_marks` UInt64,\n `parent_rows` UInt64,\n `parent_bytes_on_disk` UInt64,\n `parent_data_compressed_bytes` UInt64,\n `parent_data_uncompressed_bytes` UInt64,\n `parent_marks_bytes` UInt64,\n `modification_time` DateTime,\n `remove_time` DateTime,\n `refcount` UInt32,\n `min_date` Date,\n `max_date` Date,\n `min_time` DateTime,\n `max_time` DateTime,\n `partition_id` String,\n `min_block_number` Int64,\n `max_block_number` Int64,\n `level` UInt32,\n `data_version` UInt64,\n `primary_key_bytes_in_memory` UInt64,\n `primary_key_bytes_in_memory_allocated` UInt64,\n `is_frozen` UInt8,\n `database` String,\n `table` String,\n `engine` String,\n `disk_name` String,\n `path` String,\n `hash_of_all_files` String,\n `hash_of_uncompressed_files` String,\n `uncompressed_hash_of_compressed_files` String,\n `delete_ttl_info_min` DateTime,\n `delete_ttl_info_max` DateTime,\n `move_ttl_info.expression` Array(String),\n `move_ttl_info.min` Array(DateTime),\n `move_ttl_info.max` Array(DateTime),\n `default_compression_codec` String,\n `recompression_ttl_info.expression` Array(String),\n `recompression_ttl_info.min` Array(DateTime),\n `recompression_ttl_info.max` Array(DateTime),\n `group_by_ttl_info.expression` Array(String),\n `group_by_ttl_info.min` Array(DateTime),\n `group_by_ttl_info.max` Array(DateTime),\n `rows_where_ttl_info.expression` Array(String),\n `rows_where_ttl_info.min` Array(DateTime),\n `rows_where_ttl_info.max` Array(DateTime),\n `bytes` UInt64,\n `marks_size` UInt64\n)\nENGINE = SystemProjectionParts()\nCOMMENT \'SYSTEM TABLE is built on the fly.\' CREATE TABLE system.projection_parts_columns\n(\n `partition` String,\n `name` String,\n `part_type` String,\n `parent_name` String,\n `parent_uuid` UUID,\n `parent_part_type` String,\n `active` UInt8,\n `marks` UInt64,\n `rows` UInt64,\n `bytes_on_disk` UInt64,\n `data_compressed_bytes` UInt64,\n `data_uncompressed_bytes` UInt64,\n `marks_bytes` UInt64,\n `parent_marks` UInt64,\n `parent_rows` UInt64,\n `parent_bytes_on_disk` UInt64,\n 
`parent_data_compressed_bytes` UInt64,\n `parent_data_uncompressed_bytes` UInt64,\n `parent_marks_bytes` UInt64,\n `modification_time` DateTime,\n `remove_time` DateTime,\n `refcount` UInt32,\n `min_date` Date,\n `max_date` Date,\n `min_time` DateTime,\n `max_time` DateTime,\n `partition_id` String,\n `min_block_number` Int64,\n `max_block_number` Int64,\n `level` UInt32,\n `data_version` UInt64,\n `primary_key_bytes_in_memory` UInt64,\n `primary_key_bytes_in_memory_allocated` UInt64,\n `database` String,\n `table` String,\n `engine` String,\n `disk_name` String,\n `path` String,\n `column` String,\n `type` String,\n `column_position` UInt64,\n `default_kind` String,\n `default_expression` String,\n `column_bytes_on_disk` UInt64,\n `column_data_compressed_bytes` UInt64,\n `column_data_uncompressed_bytes` UInt64,\n `column_marks_bytes` UInt64,\n `bytes` UInt64,\n `marks_size` UInt64\n)\nENGINE = SystemProjectionPartsColumns()\nCOMMENT \'SYSTEM TABLE is built on the fly.\'
diff --git a/tests/queries/0_stateless/transactions.lib b/tests/queries/0_stateless/transactions.lib
new file mode 100755
index 00000000000..521c56754bc
--- /dev/null
+++ b/tests/queries/0_stateless/transactions.lib
@@ -0,0 +1,78 @@
+#!/usr/bin/env bash
+
+# shellcheck disable=SC2015
+
+# Useful for running queries in parallel sessions
+function tx()
+{
+    tx_num=$1
+    query=$2
+
+    session="${CLICKHOUSE_TEST_ZOOKEEPER_PREFIX}_tx$tx_num"
+    query_id="${session}_${RANDOM}"
+    url_without_session="https://${CLICKHOUSE_HOST}:${CLICKHOUSE_PORT_HTTPS}/?"
+    url="${url_without_session}session_id=$session&query_id=$query_id&database=$CLICKHOUSE_DATABASE"
+
+    ${CLICKHOUSE_CURL} -m 60 -sSk "$url" --data "$query" | sed "s/^/tx$tx_num\t/"
+}
+
+# Waits for the last query in the session to finish
+function tx_wait() {
+    tx_num=$1
+
+    session="${CLICKHOUSE_TEST_ZOOKEEPER_PREFIX}_tx$tx_num"
+
+    # try to get the pid of the previous query
+    query_pid=""
+    tmp_file_name="${CLICKHOUSE_TMP}/tmp_tx_${CLICKHOUSE_TEST_ZOOKEEPER_PREFIX}"
+    query_id_and_pid=$(grep -F "$session" "$tmp_file_name" 2>/dev/null | tail -1) ||:
+    read -r query_id query_pid <<< "$query_id_and_pid" ||:
+
+    # wait for the previous query in the transaction
+    if [ -n "$query_pid" ]; then
+        timeout 5 tail --pid=$query_pid -f /dev/null && return ||:
+    fi
+
+    # there is no pid (or maybe we got the wrong one), so wait using system.processes (it's less reliable)
+    count=0
+    while [[ $($CLICKHOUSE_CLIENT -q "SELECT count() FROM system.processes WHERE query_id LIKE '$session%'") -gt 0 ]]; do
+        sleep 0.5
+        count=$((count+1))
+        if [ "$count" -gt 120 ]; then
+            echo "timeout while waiting for $tx_num"
+            break
+        fi
+    done;
+}
+
+# Waits for the previous query in the session to finish, then starts a new one asynchronously
+function tx_async()
+{
+    tx_num=$1
+    query=$2
+
+    tx_wait "$tx_num"
+
+    session="${CLICKHOUSE_TEST_ZOOKEEPER_PREFIX}_tx$tx_num"
+    query_id="${session}_${RANDOM}"
+    url_without_session="https://${CLICKHOUSE_HOST}:${CLICKHOUSE_PORT_HTTPS}/?"
+    url="${url_without_session}session_id=$session&query_id=$query_id&database=$CLICKHOUSE_DATABASE"
+
+    # We cannot be sure that the query will actually start executing and appear in system.processes before the next call to tx_wait.
+    # Also, we cannot keep a global map in bash to store the last query_id for each tx_num, so we use a tmp file...
+    tmp_file_name="${CLICKHOUSE_TMP}/tmp_tx_${CLICKHOUSE_TEST_ZOOKEEPER_PREFIX}"
+
+    # run the query asynchronously
+    ${CLICKHOUSE_CURL} -m 60 -sSk "$url" --data "$query" | sed "s/^/tx$tx_num\t/" &
+    query_pid=$!
+ echo -e "$query_id\t$query_pid" >> "$tmp_file_name" +} + +# Wait for previous query in session to finish, execute the next one synchronously +function tx_sync() +{ + tx_num=$1 + query=$2 + tx_wait "$tx_num" + tx "$tx_num" "$query" +}