From d4210d04c16861ed6cbccd589e9c19fd4511c97d Mon Sep 17 00:00:00 2001 From: Val Date: Sun, 5 Apr 2020 15:18:51 +0300 Subject: [PATCH 001/381] databasereplicated constructor scratch --- src/Databases/DatabaseReplicated.cpp | 215 +++++++++++++++++++++++++++ src/Databases/DatabaseReplicated.h | 61 ++++++++ 2 files changed, 276 insertions(+) create mode 100644 src/Databases/DatabaseReplicated.cpp create mode 100644 src/Databases/DatabaseReplicated.h diff --git a/src/Databases/DatabaseReplicated.cpp b/src/Databases/DatabaseReplicated.cpp new file mode 100644 index 00000000000..fd5f53a596c --- /dev/null +++ b/src/Databases/DatabaseReplicated.cpp @@ -0,0 +1,215 @@ +#include + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#include + +#include +#include +#include +#include +#include +#include +#include +#include + +#include +#include +#include + +#include + +namespace DB +{ + + +namespace ErrorCodes +{ + extern const int NO_ZOOKEEPER; +} + +void DatabaseReplicated::setZooKeeper(zkutil::ZooKeeperPtr zookeeper) +{ + std::lock_guard lock(current_zookeeper_mutex); + current_zookeeper = zookeeper; +} + +zkutil::ZooKeeperPtr DatabaseReplicated::tryGetZooKeeper() const +{ + std::lock_guard lock(current_zookeeper_mutex); + return current_zookeeper; +} + +zkutil::ZooKeeperPtr DatabaseReplicated::getZooKeeper() const +{ + auto res = tryGetZooKeeper(); + if (!res) + throw Exception("Cannot get ZooKeeper", ErrorCodes::NO_ZOOKEEPER); + return res; +} + + +DatabaseReplicated::DatabaseReplicated( + const String & name_, + const String & metadata_path_, + const String & zookeeper_path_, + const String & replica_name_, + const Context & context_) + : DatabaseOrdinary(name_, metadata_path_, context_) + , zookeeper_path(zookeeper_path_) + , replica_name(replica_name_) +{ + + if (!zookeeper_path.empty() && zookeeper_path.back() == '/') + zookeeper_path.resize(zookeeper_path.size() - 1); + /// If zookeeper chroot prefix is used, path should start with '/', because chroot concatenates without it. + if (!zookeeper_path.empty() && zookeeper_path.front() != '/') + zookeeper_path = "/" + zookeeper_path; + replica_path = zookeeper_path + "/replicas/" + replica_name; + + if (context_.hasZooKeeper()) { + current_zookeeper = context_.getZooKeeper(); + } + + if (!current_zookeeper) + { + // TODO wtf is attach + // if (!attach) + throw Exception("Can't create replicated table without ZooKeeper", ErrorCodes::NO_ZOOKEEPER); + + /// Do not activate the replica. It will be readonly. + // TODO is it relevant for engines? + // LOG_ERROR(log, "No ZooKeeper: database will be in readonly mode."); + // TODO is_readonly = true; + // return; + } + + // can the zk path exist and no metadata on disk be available at the same moment? if so, in such a case, the db instance must be restored. + + current_zookeeper->createIfNotExists(zookeeper_path, String()); + current_zookeeper->createIfNotExists(replica_path, String()); + // TODO what to do? + // TODO createDatabaseIfNotExists ? + // TODO check database structure ? +} + +void DatabaseReplicated::createTable( + const Context & context, + const String & table_name, + const StoragePtr & table, + const ASTPtr & query) +{ + // try + DatabaseOnDisk::createTable(context, table_name, table, query); + + // replicated stuff + String statement = getObjectDefinitionFromCreateQuery(query); + auto zookeeper = getZooKeeper(); + // TODO в чем прикол именно так создавать зиноды? 
+ Coordination::Requests ops; + ops.emplace_back(zkutil::makeCreateRequest(zookeeper_path, "", + zkutil::CreateMode::Persistent)); + ops.emplace_back(zkutil::makeCreateRequest(zookeeper_path + "/metadata", metadata, + zkutil::CreateMode::Persistent)); +// ops.emplace_back(zkutil::makeCreateRequest(zookeeper_path + "/columns", getColumns().toString(), +// zkutil::CreateMode::Persistent)); + ops.emplace_back(zkutil::makeCreateRequest(zookeeper_path + "/log", "", + zkutil::CreateMode::Persistent)); +// ops.emplace_back(zkutil::makeCreateRequest(zookeeper_path + "/blocks", "", +// zkutil::CreateMode::Persistent)); +// ops.emplace_back(zkutil::makeCreateRequest(zookeeper_path + "/block_numbers", "", +// zkutil::CreateMode::Persistent)); +// ops.emplace_back(zkutil::makeCreateRequest(zookeeper_path + "/nonincrement_block_numbers", "", +// zkutil::CreateMode::Persistent)); /// /nonincrement_block_numbers dir is unused, but is created nonetheless for backwards compatibility. + // TODO do we need a leader here? (probably yes) what is it gonna do? + ops.emplace_back(zkutil::makeCreateRequest(zookeeper_path + "/leader_election", "", + zkutil::CreateMode::Persistent)); + ops.emplace_back(zkutil::makeCreateRequest(zookeeper_path + "/temp", "", + zkutil::CreateMode::Persistent)); + ops.emplace_back(zkutil::makeCreateRequest(zookeeper_path + "/replicas", "", + zkutil::CreateMode::Persistent)); + + Coordination::Responses responses; + auto code = zookeeper->tryMulti(ops, responses); + if (code && code != Coordination::ZNODEEXISTS) + throw Coordination::Exception(code); + + // ... + +} + + +void DatabaseReplicated::renameTable( + const Context & context, + const String & table_name, + IDatabase & to_database, + const String & to_table_name, + TableStructureWriteLockHolder & lock) +{ + // try + DatabaseOnDisk::renameTable(context, table_name, to_database, to_table_name, lock); + // replicated stuff + String statement = getObjectDefinitionFromCreateQuery(query); + // this one is fairly more complex +} + +void DatabaseReplicated::removeTable( + const Context & context, + const String & table_name) +{ + // try + DatabaseOnDisk::removeTable(context, table_name); + // replicated stuff + String statement = getObjectDefinitionFromCreateQuery(query); + // ... +} + +void DatabaseReplicated::drop(const Context & context) +{ + DatabaseOnDisk::drop(context); + // replicated stuff + String statement = getObjectDefinitionFromCreateQuery(query); + // should it be possible to recover after a drop. + // if not, we can just delete all the zookeeper nodes starting from + // zookeeper path. does it work recursively? hope so... +} + +void DatabaseOrdinary::loadStoredObjects( + Context & context, + bool has_force_restore_data_flag) +{ + syncReplicaState(context); + updateMetadata(context); + + DatabaseOrdinary::loadStoredObjects(context, has_force_restore_data_flag); + +} + +// sync replica's zookeeper metadata +void syncReplicaState(Context & context) { + +} + +// get the up to date metadata from zookeeper to local metadata dir +// for replicated (only?) tables +void updateMetadata(Context & context) { + +} + +} diff --git a/src/Databases/DatabaseReplicated.h b/src/Databases/DatabaseReplicated.h new file mode 100644 index 00000000000..51f7763bb5a --- /dev/null +++ b/src/Databases/DatabaseReplicated.h @@ -0,0 +1,61 @@ +#pragma once + +#include +#include +#include + +namespace DB +{ +/** Replicated database engine. 
+ * It stores tables list using list of .sql files, + * that contain declaration of table represented by SQL ATTACH TABLE query + * and operation log in zookeeper + */ +class DatabaseReplicated : public DatabaseOrdinary +{ +public: + DatabaseReplicated(const String & name_, const String & metadata_path_, const String & zookeeper_path_, const String & replica_name_, const Context & context); + + String getEngineName() const override { return "Replicated"; } + + void createTable( + const Context & context, + const String & table_name, + const StoragePtr & table, + const ASTPtr & query) override; + + void removeTable( + const Context & context, + const String & table_name) override; + + void renameTable( + const Context & context, + const String & table_name, + IDatabase & to_database, + const String & to_table_name, + TableStructureWriteLockHolder & lock) override; + + void drop(const Context & context) override; + + void loadStoredObjects( + Context & context, + bool has_force_restore_data_flag) override; + +private: + String zookeeper_path; + String replica_name; + String replica_path; + + zkutil::ZooKeeperPtr current_zookeeper; /// Use only the methods below. + mutable std::mutex current_zookeeper_mutex; /// To recreate the session in the background thread. + + zkutil::ZooKeeperPtr tryGetZooKeeper() const; + zkutil::ZooKeeperPtr getZooKeeper() const; + void setZooKeeper(zkutil::ZooKeeperPtr zookeeper); + + void syncReplicaState(Context & context); + + void updateMetadata(Context & context); +}; + +} From 272e31188d9b76bc4680fccf3502e459c89d5956 Mon Sep 17 00:00:00 2001 From: Val Date: Sun, 5 Apr 2020 16:06:21 +0300 Subject: [PATCH 002/381] databasereplicated add table functions prototype --- dbms/src/Databases/DatabaseReplicated.cpp | 156 ++++++++++++++++++++++ 1 file changed, 156 insertions(+) create mode 100644 dbms/src/Databases/DatabaseReplicated.cpp diff --git a/dbms/src/Databases/DatabaseReplicated.cpp b/dbms/src/Databases/DatabaseReplicated.cpp new file mode 100644 index 00000000000..704c678f366 --- /dev/null +++ b/dbms/src/Databases/DatabaseReplicated.cpp @@ -0,0 +1,156 @@ +#include + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#include + +#include +#include +#include +#include +#include +#include +#include +#include + +#include +#include +#include + +#include + +namespace DB +{ + + +namespace ErrorCodes +{ + extern const int NO_ZOOKEEPER; +} + +void DatabaseReplicated::setZooKeeper(zkutil::ZooKeeperPtr zookeeper) +{ + std::lock_guard lock(current_zookeeper_mutex); + current_zookeeper = zookeeper; +} + +zkutil::ZooKeeperPtr DatabaseReplicated::tryGetZooKeeper() const +{ + std::lock_guard lock(current_zookeeper_mutex); + return current_zookeeper; +} + +zkutil::ZooKeeperPtr DatabaseReplicated::getZooKeeper() const +{ + auto res = tryGetZooKeeper(); + if (!res) + throw Exception("Cannot get ZooKeeper", ErrorCodes::NO_ZOOKEEPER); + return res; +} + + +DatabaseReplicated::DatabaseReplicated( + const String & name_, + const String & metadata_path_, + const String & zookeeper_path_, + const String & replica_name_, + const Context & context_) + : DatabaseOrdinary(name_, metadata_path_, context_) + , zookeeper_path(zookeeper_path_) + , replica_name(replica_name_) +{ + + if (!zookeeper_path.empty() && zookeeper_path.back() == '/') + zookeeper_path.resize(zookeeper_path.size() - 1); + /// If zookeeper chroot prefix is used, path should start with '/', because 
chroot concatenates without it. + if (!zookeeper_path.empty() && zookeeper_path.front() != '/') + zookeeper_path = "/" + zookeeper_path; + replica_path = zookeeper_path + "/replicas/" + replica_name; + + if (context_.hasZooKeeper()) { + current_zookeeper = context_.getZooKeeper(); + } + + if (!current_zookeeper) + { + // TODO wtf is attach + // if (!attach) + throw Exception("Can't create replicated table without ZooKeeper", ErrorCodes::NO_ZOOKEEPER); + + /// Do not activate the replica. It will be readonly. + // TODO is it relevant for engines? + // LOG_ERROR(log, "No ZooKeeper: database will be in readonly mode."); + // TODO is_readonly = true; + // return; + } + // getObjectDefinitionFromCreateQuery + // TODO what to do? + // TODO createDatabaseIfNotExists ? + // TODO check database structure ? +} + +void DatabaseReplicated::createTable( + const Context & context, + const String & table_name, + const StoragePtr & table, + const ASTPtr & query) +{ + // try + DatabaseOnDisk::createTable(context, table_name, table, query); + // replicated stuff + String statement = getObjectDefinitionFromCreateQuery(query); + // ... + +} + + +void DatabaseReplicated::renameTable( + const Context & context, + const String & table_name, + IDatabase & to_database, + const String & to_table_name, + TableStructureWriteLockHolder & lock) +{ + // try + DatabaseOnDisk::renameTable(context, table_name, to_database, to_table_name, lock); + // replicated stuff + String statement = getObjectDefinitionFromCreateQuery(query); + // ... +} + +void DatabaseReplicated::removeTable( + const Context & context, + const String & table_name) +{ + // try + DatabaseOnDisk::removeTable(context, table_name); + // replicated stuff + String statement = getObjectDefinitionFromCreateQuery(query); + // ... +} + +void DatabaseReplicated::drop(const Context & context) +{ + DatabaseOnDisk::drop(context); + // replicated stuff + String statement = getObjectDefinitionFromCreateQuery(query); + // ... +} + +} From edb871979a66ecd5d07346003360344e5fb51ff0 Mon Sep 17 00:00:00 2001 From: Val Date: Mon, 6 Apr 2020 14:29:45 +0300 Subject: [PATCH 003/381] add some zookeeper into the logic --- dbms/src/Databases/DatabaseReplicated.cpp | 40 +++++++++++++++++++++-- 1 file changed, 37 insertions(+), 3 deletions(-) diff --git a/dbms/src/Databases/DatabaseReplicated.cpp b/dbms/src/Databases/DatabaseReplicated.cpp index 704c678f366..31e28c320cb 100644 --- a/dbms/src/Databases/DatabaseReplicated.cpp +++ b/dbms/src/Databases/DatabaseReplicated.cpp @@ -99,7 +99,9 @@ DatabaseReplicated::DatabaseReplicated( // TODO is_readonly = true; // return; } - // getObjectDefinitionFromCreateQuery + + current_zookeeper->createIfNotExists(zookeeper_path, String()); + current_zookeeper->createIfNotExists(replica_path, String()); // TODO what to do? // TODO createDatabaseIfNotExists ? // TODO check database structure ? @@ -115,6 +117,36 @@ void DatabaseReplicated::createTable( DatabaseOnDisk::createTable(context, table_name, table, query); // replicated stuff String statement = getObjectDefinitionFromCreateQuery(query); + auto zookeeper = getZooKeeper(); + // TODO в чем прикол именно так создавать зиноды? 
+ Coordination::Requests ops; + ops.emplace_back(zkutil::makeCreateRequest(zookeeper_path, "", + zkutil::CreateMode::Persistent)); + ops.emplace_back(zkutil::makeCreateRequest(zookeeper_path + "/metadata", metadata, + zkutil::CreateMode::Persistent)); +// ops.emplace_back(zkutil::makeCreateRequest(zookeeper_path + "/columns", getColumns().toString(), +// zkutil::CreateMode::Persistent)); + ops.emplace_back(zkutil::makeCreateRequest(zookeeper_path + "/log", "", + zkutil::CreateMode::Persistent)); +// ops.emplace_back(zkutil::makeCreateRequest(zookeeper_path + "/blocks", "", +// zkutil::CreateMode::Persistent)); +// ops.emplace_back(zkutil::makeCreateRequest(zookeeper_path + "/block_numbers", "", +// zkutil::CreateMode::Persistent)); +// ops.emplace_back(zkutil::makeCreateRequest(zookeeper_path + "/nonincrement_block_numbers", "", +// zkutil::CreateMode::Persistent)); /// /nonincrement_block_numbers dir is unused, but is created nonetheless for backwards compatibility. + // TODO do we need a leader here? (probably yes) what is it gonna do? + ops.emplace_back(zkutil::makeCreateRequest(zookeeper_path + "/leader_election", "", + zkutil::CreateMode::Persistent)); + ops.emplace_back(zkutil::makeCreateRequest(zookeeper_path + "/temp", "", + zkutil::CreateMode::Persistent)); + ops.emplace_back(zkutil::makeCreateRequest(zookeeper_path + "/replicas", "", + zkutil::CreateMode::Persistent)); + + Coordination::Responses responses; + auto code = zookeeper->tryMulti(ops, responses); + if (code && code != Coordination::ZNODEEXISTS) + throw Coordination::Exception(code); + // ... } @@ -131,7 +163,7 @@ void DatabaseReplicated::renameTable( DatabaseOnDisk::renameTable(context, table_name, to_database, to_table_name, lock); // replicated stuff String statement = getObjectDefinitionFromCreateQuery(query); - // ... + // this one is fairly more complex } void DatabaseReplicated::removeTable( @@ -150,7 +182,9 @@ void DatabaseReplicated::drop(const Context & context) DatabaseOnDisk::drop(context); // replicated stuff String statement = getObjectDefinitionFromCreateQuery(query); - // ... + // should it be possible to recover after a drop. + // if not, we can just delete all the zookeeper nodes starting from + // zookeeper path. does it work recursively? hope so... } } From e0f52965e5ebfbb01e7a502190bea17918e22754 Mon Sep 17 00:00:00 2001 From: Val Date: Fri, 24 Apr 2020 16:49:14 +0300 Subject: [PATCH 004/381] Add a comment with some thoughts --- dbms/src/Databases/DatabaseReplicated.cpp | 3 +++ 1 file changed, 3 insertions(+) diff --git a/dbms/src/Databases/DatabaseReplicated.cpp b/dbms/src/Databases/DatabaseReplicated.cpp index 31e28c320cb..e18fc1db5f4 100644 --- a/dbms/src/Databases/DatabaseReplicated.cpp +++ b/dbms/src/Databases/DatabaseReplicated.cpp @@ -100,6 +100,8 @@ DatabaseReplicated::DatabaseReplicated( // return; } + // can the zk path exist and no metadata on disk be available at the same moment? if so, in such a case, the db instance must be restored. + current_zookeeper->createIfNotExists(zookeeper_path, String()); current_zookeeper->createIfNotExists(replica_path, String()); // TODO what to do? 
@@ -115,6 +117,7 @@ void DatabaseReplicated::createTable( { // try DatabaseOnDisk::createTable(context, table_name, table, query); + // replicated stuff String statement = getObjectDefinitionFromCreateQuery(query); auto zookeeper = getZooKeeper(); From c1c132502c64d52e5867e3cc4ed6e3b2523567d8 Mon Sep 17 00:00:00 2001 From: Val Date: Fri, 24 Apr 2020 17:12:54 +0300 Subject: [PATCH 005/381] add prototypes of loadStoredObject and some relevant helpers in replicateddb --- dbms/src/Databases/DatabaseReplicated.cpp | 22 ++++++++ dbms/src/Databases/DatabaseReplicated.h | 61 +++++++++++++++++++++++ 2 files changed, 83 insertions(+) create mode 100644 dbms/src/Databases/DatabaseReplicated.h diff --git a/dbms/src/Databases/DatabaseReplicated.cpp b/dbms/src/Databases/DatabaseReplicated.cpp index e18fc1db5f4..fd5f53a596c 100644 --- a/dbms/src/Databases/DatabaseReplicated.cpp +++ b/dbms/src/Databases/DatabaseReplicated.cpp @@ -190,4 +190,26 @@ void DatabaseReplicated::drop(const Context & context) // zookeeper path. does it work recursively? hope so... } +void DatabaseOrdinary::loadStoredObjects( + Context & context, + bool has_force_restore_data_flag) +{ + syncReplicaState(context); + updateMetadata(context); + + DatabaseOrdinary::loadStoredObjects(context, has_force_restore_data_flag); + +} + +// sync replica's zookeeper metadata +void syncReplicaState(Context & context) { + +} + +// get the up to date metadata from zookeeper to local metadata dir +// for replicated (only?) tables +void updateMetadata(Context & context) { + +} + } diff --git a/dbms/src/Databases/DatabaseReplicated.h b/dbms/src/Databases/DatabaseReplicated.h new file mode 100644 index 00000000000..51f7763bb5a --- /dev/null +++ b/dbms/src/Databases/DatabaseReplicated.h @@ -0,0 +1,61 @@ +#pragma once + +#include +#include +#include + +namespace DB +{ +/** Replicated database engine. + * It stores tables list using list of .sql files, + * that contain declaration of table represented by SQL ATTACH TABLE query + * and operation log in zookeeper + */ +class DatabaseReplicated : public DatabaseOrdinary +{ +public: + DatabaseReplicated(const String & name_, const String & metadata_path_, const String & zookeeper_path_, const String & replica_name_, const Context & context); + + String getEngineName() const override { return "Replicated"; } + + void createTable( + const Context & context, + const String & table_name, + const StoragePtr & table, + const ASTPtr & query) override; + + void removeTable( + const Context & context, + const String & table_name) override; + + void renameTable( + const Context & context, + const String & table_name, + IDatabase & to_database, + const String & to_table_name, + TableStructureWriteLockHolder & lock) override; + + void drop(const Context & context) override; + + void loadStoredObjects( + Context & context, + bool has_force_restore_data_flag) override; + +private: + String zookeeper_path; + String replica_name; + String replica_path; + + zkutil::ZooKeeperPtr current_zookeeper; /// Use only the methods below. + mutable std::mutex current_zookeeper_mutex; /// To recreate the session in the background thread. 
+ + zkutil::ZooKeeperPtr tryGetZooKeeper() const; + zkutil::ZooKeeperPtr getZooKeeper() const; + void setZooKeeper(zkutil::ZooKeeperPtr zookeeper); + + void syncReplicaState(Context & context); + + void updateMetadata(Context & context); +}; + +} From 0d392bbb34c142f6871a2bd2ab699f5baa768780 Mon Sep 17 00:00:00 2001 From: Val Date: Wed, 29 Apr 2020 14:19:16 +0300 Subject: [PATCH 006/381] fix after rebase --- src/Databases/DatabaseFactory.cpp | 17 +++++++++- src/Databases/DatabaseReplicated.cpp | 49 +++++++++++++++------------- src/Databases/DatabaseReplicated.h | 7 ++-- 3 files changed, 47 insertions(+), 26 deletions(-) diff --git a/src/Databases/DatabaseFactory.cpp b/src/Databases/DatabaseFactory.cpp index f27bc509ebe..0d7a711b530 100644 --- a/src/Databases/DatabaseFactory.cpp +++ b/src/Databases/DatabaseFactory.cpp @@ -1,4 +1,5 @@ #include +#include #include #include #include @@ -69,7 +70,7 @@ DatabasePtr DatabaseFactory::getImpl( { String engine_name = engine_define->engine->name; - if (engine_name != "MySQL" && engine_name != "Lazy" && engine_define->engine->arguments) + if (engine_name != "MySQL" && engine_name != "Lazy" && engine_name != "Replicated" && engine_define->engine->arguments) throw Exception("Database engine " + engine_name + " cannot have arguments", ErrorCodes::BAD_ARGUMENTS); if (engine_define->engine->parameters || engine_define->partition_by || engine_define->primary_key || engine_define->order_by || @@ -138,6 +139,20 @@ DatabasePtr DatabaseFactory::getImpl( return std::make_shared(database_name, metadata_path, cache_expiration_time_seconds, context); } + else if (engine_name == "Replicated") + { + const ASTFunction * engine = engine_define->engine; + + if (!engine->arguments || engine->arguments->children.size() != 2) + throw Exception("Replicated database requires zoo_path and replica_name arguments", ErrorCodes::BAD_ARGUMENTS); + + const auto & arguments = engine->arguments->children; + + const auto zoo_path = arguments[0]->as()->value.safeGet(); + const auto replica_name = arguments[1]->as()->value.safeGet(); + return std::make_shared(database_name, metadata_path, zoo_path, replica_name, context); + } + throw Exception("Unknown database engine: " + engine_name, ErrorCodes::UNKNOWN_DATABASE_ENGINE); } diff --git a/src/Databases/DatabaseReplicated.cpp b/src/Databases/DatabaseReplicated.cpp index fd5f53a596c..92af1c890c2 100644 --- a/src/Databases/DatabaseReplicated.cpp +++ b/src/Databases/DatabaseReplicated.cpp @@ -125,8 +125,8 @@ void DatabaseReplicated::createTable( Coordination::Requests ops; ops.emplace_back(zkutil::makeCreateRequest(zookeeper_path, "", zkutil::CreateMode::Persistent)); - ops.emplace_back(zkutil::makeCreateRequest(zookeeper_path + "/metadata", metadata, - zkutil::CreateMode::Persistent)); + //ops.emplace_back(zkutil::makeCreateRequest(zookeeper_path + "/metadata", metadata, + //zkutil::CreateMode::Persistent)); // ops.emplace_back(zkutil::makeCreateRequest(zookeeper_path + "/columns", getColumns().toString(), // zkutil::CreateMode::Persistent)); ops.emplace_back(zkutil::makeCreateRequest(zookeeper_path + "/log", "", @@ -160,23 +160,24 @@ void DatabaseReplicated::renameTable( const String & table_name, IDatabase & to_database, const String & to_table_name, - TableStructureWriteLockHolder & lock) + bool exchange) { // try - DatabaseOnDisk::renameTable(context, table_name, to_database, to_table_name, lock); - // replicated stuff - String statement = getObjectDefinitionFromCreateQuery(query); + DatabaseOnDisk::renameTable(context, table_name, 
to_database, to_table_name, exchange); + // replicated stuff; what to put to a znode + // String statement = getObjectDefinitionFromCreateQuery(query); // this one is fairly more complex } -void DatabaseReplicated::removeTable( +void DatabaseReplicated::dropTable( const Context & context, - const String & table_name) + const String & table_name, + bool no_delay) { // try - DatabaseOnDisk::removeTable(context, table_name); + DatabaseOnDisk::dropTable(context, table_name, no_delay); // replicated stuff - String statement = getObjectDefinitionFromCreateQuery(query); + //String statement = getObjectDefinitionFromCreateQuery(query); // ... } @@ -184,13 +185,26 @@ void DatabaseReplicated::drop(const Context & context) { DatabaseOnDisk::drop(context); // replicated stuff - String statement = getObjectDefinitionFromCreateQuery(query); + //String statement = getObjectDefinitionFromCreateQuery(query); // should it be possible to recover after a drop. // if not, we can just delete all the zookeeper nodes starting from // zookeeper path. does it work recursively? hope so... } -void DatabaseOrdinary::loadStoredObjects( +// sync replica's zookeeper metadata +void DatabaseReplicated::syncReplicaState(Context & context) { + auto c = context; // fixes unuser parameter error + return; +} + +// get the up to date metadata from zookeeper to local metadata dir +// for replicated (only?) tables +void DatabaseReplicated::updateMetadata(Context & context) { + auto c = context; // fixes unuser parameter error + return; +} + +void DatabaseReplicated::loadStoredObjects( Context & context, bool has_force_restore_data_flag) { @@ -201,15 +215,6 @@ void DatabaseOrdinary::loadStoredObjects( } -// sync replica's zookeeper metadata -void syncReplicaState(Context & context) { - -} - -// get the up to date metadata from zookeeper to local metadata dir -// for replicated (only?) 
tables -void updateMetadata(Context & context) { - -} + } diff --git a/src/Databases/DatabaseReplicated.h b/src/Databases/DatabaseReplicated.h index 51f7763bb5a..bc1af923277 100644 --- a/src/Databases/DatabaseReplicated.h +++ b/src/Databases/DatabaseReplicated.h @@ -24,16 +24,17 @@ public: const StoragePtr & table, const ASTPtr & query) override; - void removeTable( + void dropTable( const Context & context, - const String & table_name) override; + const String & table_name, + bool no_delay) override; void renameTable( const Context & context, const String & table_name, IDatabase & to_database, const String & to_table_name, - TableStructureWriteLockHolder & lock) override; + bool exchange) override; void drop(const Context & context) override; From 1cb96bf1762cc8b111f0cb58ed651059156442e2 Mon Sep 17 00:00:00 2001 From: Val Date: Wed, 29 Apr 2020 14:21:12 +0300 Subject: [PATCH 007/381] rm old files from nonexistant dir since the rebase --- dbms/src/Databases/DatabaseReplicated.cpp | 215 ---------------------- dbms/src/Databases/DatabaseReplicated.h | 61 ------ 2 files changed, 276 deletions(-) delete mode 100644 dbms/src/Databases/DatabaseReplicated.cpp delete mode 100644 dbms/src/Databases/DatabaseReplicated.h diff --git a/dbms/src/Databases/DatabaseReplicated.cpp b/dbms/src/Databases/DatabaseReplicated.cpp deleted file mode 100644 index fd5f53a596c..00000000000 --- a/dbms/src/Databases/DatabaseReplicated.cpp +++ /dev/null @@ -1,215 +0,0 @@ -#include - -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include - -#include - -#include -#include -#include -#include -#include -#include -#include -#include - -#include -#include -#include - -#include - -namespace DB -{ - - -namespace ErrorCodes -{ - extern const int NO_ZOOKEEPER; -} - -void DatabaseReplicated::setZooKeeper(zkutil::ZooKeeperPtr zookeeper) -{ - std::lock_guard lock(current_zookeeper_mutex); - current_zookeeper = zookeeper; -} - -zkutil::ZooKeeperPtr DatabaseReplicated::tryGetZooKeeper() const -{ - std::lock_guard lock(current_zookeeper_mutex); - return current_zookeeper; -} - -zkutil::ZooKeeperPtr DatabaseReplicated::getZooKeeper() const -{ - auto res = tryGetZooKeeper(); - if (!res) - throw Exception("Cannot get ZooKeeper", ErrorCodes::NO_ZOOKEEPER); - return res; -} - - -DatabaseReplicated::DatabaseReplicated( - const String & name_, - const String & metadata_path_, - const String & zookeeper_path_, - const String & replica_name_, - const Context & context_) - : DatabaseOrdinary(name_, metadata_path_, context_) - , zookeeper_path(zookeeper_path_) - , replica_name(replica_name_) -{ - - if (!zookeeper_path.empty() && zookeeper_path.back() == '/') - zookeeper_path.resize(zookeeper_path.size() - 1); - /// If zookeeper chroot prefix is used, path should start with '/', because chroot concatenates without it. - if (!zookeeper_path.empty() && zookeeper_path.front() != '/') - zookeeper_path = "/" + zookeeper_path; - replica_path = zookeeper_path + "/replicas/" + replica_name; - - if (context_.hasZooKeeper()) { - current_zookeeper = context_.getZooKeeper(); - } - - if (!current_zookeeper) - { - // TODO wtf is attach - // if (!attach) - throw Exception("Can't create replicated table without ZooKeeper", ErrorCodes::NO_ZOOKEEPER); - - /// Do not activate the replica. It will be readonly. - // TODO is it relevant for engines? 
- // LOG_ERROR(log, "No ZooKeeper: database will be in readonly mode."); - // TODO is_readonly = true; - // return; - } - - // can the zk path exist and no metadata on disk be available at the same moment? if so, in such a case, the db instance must be restored. - - current_zookeeper->createIfNotExists(zookeeper_path, String()); - current_zookeeper->createIfNotExists(replica_path, String()); - // TODO what to do? - // TODO createDatabaseIfNotExists ? - // TODO check database structure ? -} - -void DatabaseReplicated::createTable( - const Context & context, - const String & table_name, - const StoragePtr & table, - const ASTPtr & query) -{ - // try - DatabaseOnDisk::createTable(context, table_name, table, query); - - // replicated stuff - String statement = getObjectDefinitionFromCreateQuery(query); - auto zookeeper = getZooKeeper(); - // TODO в чем прикол именно так создавать зиноды? - Coordination::Requests ops; - ops.emplace_back(zkutil::makeCreateRequest(zookeeper_path, "", - zkutil::CreateMode::Persistent)); - ops.emplace_back(zkutil::makeCreateRequest(zookeeper_path + "/metadata", metadata, - zkutil::CreateMode::Persistent)); -// ops.emplace_back(zkutil::makeCreateRequest(zookeeper_path + "/columns", getColumns().toString(), -// zkutil::CreateMode::Persistent)); - ops.emplace_back(zkutil::makeCreateRequest(zookeeper_path + "/log", "", - zkutil::CreateMode::Persistent)); -// ops.emplace_back(zkutil::makeCreateRequest(zookeeper_path + "/blocks", "", -// zkutil::CreateMode::Persistent)); -// ops.emplace_back(zkutil::makeCreateRequest(zookeeper_path + "/block_numbers", "", -// zkutil::CreateMode::Persistent)); -// ops.emplace_back(zkutil::makeCreateRequest(zookeeper_path + "/nonincrement_block_numbers", "", -// zkutil::CreateMode::Persistent)); /// /nonincrement_block_numbers dir is unused, but is created nonetheless for backwards compatibility. - // TODO do we need a leader here? (probably yes) what is it gonna do? - ops.emplace_back(zkutil::makeCreateRequest(zookeeper_path + "/leader_election", "", - zkutil::CreateMode::Persistent)); - ops.emplace_back(zkutil::makeCreateRequest(zookeeper_path + "/temp", "", - zkutil::CreateMode::Persistent)); - ops.emplace_back(zkutil::makeCreateRequest(zookeeper_path + "/replicas", "", - zkutil::CreateMode::Persistent)); - - Coordination::Responses responses; - auto code = zookeeper->tryMulti(ops, responses); - if (code && code != Coordination::ZNODEEXISTS) - throw Coordination::Exception(code); - - // ... - -} - - -void DatabaseReplicated::renameTable( - const Context & context, - const String & table_name, - IDatabase & to_database, - const String & to_table_name, - TableStructureWriteLockHolder & lock) -{ - // try - DatabaseOnDisk::renameTable(context, table_name, to_database, to_table_name, lock); - // replicated stuff - String statement = getObjectDefinitionFromCreateQuery(query); - // this one is fairly more complex -} - -void DatabaseReplicated::removeTable( - const Context & context, - const String & table_name) -{ - // try - DatabaseOnDisk::removeTable(context, table_name); - // replicated stuff - String statement = getObjectDefinitionFromCreateQuery(query); - // ... -} - -void DatabaseReplicated::drop(const Context & context) -{ - DatabaseOnDisk::drop(context); - // replicated stuff - String statement = getObjectDefinitionFromCreateQuery(query); - // should it be possible to recover after a drop. - // if not, we can just delete all the zookeeper nodes starting from - // zookeeper path. does it work recursively? hope so... 
-} - -void DatabaseOrdinary::loadStoredObjects( - Context & context, - bool has_force_restore_data_flag) -{ - syncReplicaState(context); - updateMetadata(context); - - DatabaseOrdinary::loadStoredObjects(context, has_force_restore_data_flag); - -} - -// sync replica's zookeeper metadata -void syncReplicaState(Context & context) { - -} - -// get the up to date metadata from zookeeper to local metadata dir -// for replicated (only?) tables -void updateMetadata(Context & context) { - -} - -} diff --git a/dbms/src/Databases/DatabaseReplicated.h b/dbms/src/Databases/DatabaseReplicated.h deleted file mode 100644 index 51f7763bb5a..00000000000 --- a/dbms/src/Databases/DatabaseReplicated.h +++ /dev/null @@ -1,61 +0,0 @@ -#pragma once - -#include -#include -#include - -namespace DB -{ -/** Replicated database engine. - * It stores tables list using list of .sql files, - * that contain declaration of table represented by SQL ATTACH TABLE query - * and operation log in zookeeper - */ -class DatabaseReplicated : public DatabaseOrdinary -{ -public: - DatabaseReplicated(const String & name_, const String & metadata_path_, const String & zookeeper_path_, const String & replica_name_, const Context & context); - - String getEngineName() const override { return "Replicated"; } - - void createTable( - const Context & context, - const String & table_name, - const StoragePtr & table, - const ASTPtr & query) override; - - void removeTable( - const Context & context, - const String & table_name) override; - - void renameTable( - const Context & context, - const String & table_name, - IDatabase & to_database, - const String & to_table_name, - TableStructureWriteLockHolder & lock) override; - - void drop(const Context & context) override; - - void loadStoredObjects( - Context & context, - bool has_force_restore_data_flag) override; - -private: - String zookeeper_path; - String replica_name; - String replica_path; - - zkutil::ZooKeeperPtr current_zookeeper; /// Use only the methods below. - mutable std::mutex current_zookeeper_mutex; /// To recreate the session in the background thread. 
- - zkutil::ZooKeeperPtr tryGetZooKeeper() const; - zkutil::ZooKeeperPtr getZooKeeper() const; - void setZooKeeper(zkutil::ZooKeeperPtr zookeeper); - - void syncReplicaState(Context & context); - - void updateMetadata(Context & context); -}; - -} From 8b0366ff4ff08d47b9ca7451ce33ca07683b0012 Mon Sep 17 00:00:00 2001 From: Val Date: Thu, 30 Apr 2020 19:15:27 +0300 Subject: [PATCH 008/381] an attempt to make something meaningful --- src/Databases/DatabaseReplicated.cpp | 91 ++++++++++++---------------- 1 file changed, 40 insertions(+), 51 deletions(-) diff --git a/src/Databases/DatabaseReplicated.cpp b/src/Databases/DatabaseReplicated.cpp index 92af1c890c2..d6bbec24791 100644 --- a/src/Databases/DatabaseReplicated.cpp +++ b/src/Databases/DatabaseReplicated.cpp @@ -71,7 +71,7 @@ DatabaseReplicated::DatabaseReplicated( const String & zookeeper_path_, const String & replica_name_, const Context & context_) - : DatabaseOrdinary(name_, metadata_path_, context_) + : DatabaseOrdinary(name_, metadata_path_, "data/", "DatabaseReplicated (" + name_ + ")", context_) , zookeeper_path(zookeeper_path_) , replica_name(replica_name_) { @@ -89,24 +89,31 @@ DatabaseReplicated::DatabaseReplicated( if (!current_zookeeper) { - // TODO wtf is attach - // if (!attach) - throw Exception("Can't create replicated table without ZooKeeper", ErrorCodes::NO_ZOOKEEPER); + throw Exception("Can't create replicated database without ZooKeeper", ErrorCodes::NO_ZOOKEEPER); + - /// Do not activate the replica. It will be readonly. - // TODO is it relevant for engines? - // LOG_ERROR(log, "No ZooKeeper: database will be in readonly mode."); - // TODO is_readonly = true; - // return; } - // can the zk path exist and no metadata on disk be available at the same moment? if so, in such a case, the db instance must be restored. + // test without this fancy mess (prob wont work) + current_zookeeper->createAncestors(replica_path); + current_zookeeper->createOrUpdate(replica_path, String(), zkutil::CreateMode::Persistent); - current_zookeeper->createIfNotExists(zookeeper_path, String()); - current_zookeeper->createIfNotExists(replica_path, String()); - // TODO what to do? - // TODO createDatabaseIfNotExists ? - // TODO check database structure ? +// if (!current_zookeeper->exists(zookeeper_path)) { +// +// LOG_DEBUG(log, "Creating database " << zookeeper_path); +// current_zookeeper->createAncestors(zookeeper_path); + + // Coordination::Requests ops; + // ops.emplace_back(zkutil::makeCreateRequest(zookeeper_path, "", + // zkutil::CreateMode::Persistent)); + // ops.emplace_back(zkutil::makeCreateRequest(zookeeper_path + "/replicas", "", + // zkutil::CreateMode::Persistent)); + + // Coordination::Responses responses; + // auto code = current_zookeeper->tryMulti(ops, responses); + // if (code && code != Coordination::ZNODEEXISTS) + // throw Coordination::Exception(code); + // } } void DatabaseReplicated::createTable( @@ -115,43 +122,16 @@ void DatabaseReplicated::createTable( const StoragePtr & table, const ASTPtr & query) { - // try + // try? DatabaseOnDisk::createTable(context, table_name, table, query); - // replicated stuff + // suppose it worked String statement = getObjectDefinitionFromCreateQuery(query); - auto zookeeper = getZooKeeper(); - // TODO в чем прикол именно так создавать зиноды? 
- Coordination::Requests ops; - ops.emplace_back(zkutil::makeCreateRequest(zookeeper_path, "", - zkutil::CreateMode::Persistent)); - //ops.emplace_back(zkutil::makeCreateRequest(zookeeper_path + "/metadata", metadata, - //zkutil::CreateMode::Persistent)); -// ops.emplace_back(zkutil::makeCreateRequest(zookeeper_path + "/columns", getColumns().toString(), -// zkutil::CreateMode::Persistent)); - ops.emplace_back(zkutil::makeCreateRequest(zookeeper_path + "/log", "", - zkutil::CreateMode::Persistent)); -// ops.emplace_back(zkutil::makeCreateRequest(zookeeper_path + "/blocks", "", -// zkutil::CreateMode::Persistent)); -// ops.emplace_back(zkutil::makeCreateRequest(zookeeper_path + "/block_numbers", "", -// zkutil::CreateMode::Persistent)); -// ops.emplace_back(zkutil::makeCreateRequest(zookeeper_path + "/nonincrement_block_numbers", "", -// zkutil::CreateMode::Persistent)); /// /nonincrement_block_numbers dir is unused, but is created nonetheless for backwards compatibility. - // TODO do we need a leader here? (probably yes) what is it gonna do? - ops.emplace_back(zkutil::makeCreateRequest(zookeeper_path + "/leader_election", "", - zkutil::CreateMode::Persistent)); - ops.emplace_back(zkutil::makeCreateRequest(zookeeper_path + "/temp", "", - zkutil::CreateMode::Persistent)); - ops.emplace_back(zkutil::makeCreateRequest(zookeeper_path + "/replicas", "", - zkutil::CreateMode::Persistent)); - - Coordination::Responses responses; - auto code = zookeeper->tryMulti(ops, responses); - if (code && code != Coordination::ZNODEEXISTS) - throw Coordination::Exception(code); - - // ... + LOG_DEBUG(log, "CREATE TABLE STATEMENT " << statement); + // let's do dumb write to zk at the first iteration + current_zookeeper = getZooKeeper(); + current_zookeeper->createOrUpdate(replica_path + "/" + table_name, statement, zkutil::CreateMode::Persistent); } @@ -167,6 +147,14 @@ void DatabaseReplicated::renameTable( // replicated stuff; what to put to a znode // String statement = getObjectDefinitionFromCreateQuery(query); // this one is fairly more complex + current_zookeeper = getZooKeeper(); + + // no need for now to have stat + Coordination::Stat metadata_stat; + auto statement = current_zookeeper->get(replica_path + "/" + table_name, &metadata_stat); + current_zookeeper->createOrUpdate(replica_path + "/" + to_table_name, statement, zkutil::CreateMode::Persistent); + current_zookeeper->remove(replica_path + "/" + table_name); + // TODO add rename statement to the log } void DatabaseReplicated::dropTable( @@ -176,9 +164,10 @@ void DatabaseReplicated::dropTable( { // try DatabaseOnDisk::dropTable(context, table_name, no_delay); - // replicated stuff - //String statement = getObjectDefinitionFromCreateQuery(query); - // ... 
+ + // let's do dumb remove from zk at the first iteration + current_zookeeper = getZooKeeper(); + current_zookeeper->remove(replica_path + "/" + table_name); } void DatabaseReplicated::drop(const Context & context) From 948bd1c5cc3f069aa621055611b81f484de49dad Mon Sep 17 00:00:00 2001 From: Val Date: Thu, 30 Apr 2020 19:16:53 +0300 Subject: [PATCH 009/381] database replicated basic test (create and drop) --- .../01267_replicated_database_engine_zookeeper.sql | 12 ++++++++++++ 1 file changed, 12 insertions(+) create mode 100644 tests/queries/0_stateless/01267_replicated_database_engine_zookeeper.sql diff --git a/tests/queries/0_stateless/01267_replicated_database_engine_zookeeper.sql b/tests/queries/0_stateless/01267_replicated_database_engine_zookeeper.sql new file mode 100644 index 00000000000..94b461e2f93 --- /dev/null +++ b/tests/queries/0_stateless/01267_replicated_database_engine_zookeeper.sql @@ -0,0 +1,12 @@ +DROP DATABASE IF EXISTS test_db1; +DROP DATABASE IF EXISTS test_db2; +DROP TABLE IF EXISTS test_table1; +DROP TABLE IF EXISTS test_table2; + +CREATE DATABASE test_db1 ENGINE = Replicated('/clickhouse/databases/test1', 'id1'); +USE test_db1; +CREATE TABLE test_table1 (d Date, k UInt64, i32 Int32) ENGINE=ReplicatedMergeTree('/clickhouse/tables/test1', 'id1', d, k, 8192); + +CREATE DATABASE test_db2 ENGINE = Replicated('/clickhouse/databases/test1', 'id2'); +USE test_db2; +CREATE TABLE test_table2 (d Date, k UInt64, i32 Int32) ENGINE=ReplicatedMergeTree('/clickhouse/tables/test1', 'id2', d, k, 8192); From 0a4c1783a1ef45edc189e1cf19e2fdef1712e140 Mon Sep 17 00:00:00 2001 From: Val Date: Fri, 1 May 2020 16:16:02 +0300 Subject: [PATCH 010/381] Make drop work by fixing namespace bug data dir wasn't set right. now it's fixed. add non-replicated table to test sql --- src/Databases/DatabaseReplicated.cpp | 19 ++++++++++--------- ...7_replicated_database_engine_zookeeper.sql | 10 ++++------ 2 files changed, 14 insertions(+), 15 deletions(-) diff --git a/src/Databases/DatabaseReplicated.cpp b/src/Databases/DatabaseReplicated.cpp index d6bbec24791..61bcfc8d5a9 100644 --- a/src/Databases/DatabaseReplicated.cpp +++ b/src/Databases/DatabaseReplicated.cpp @@ -71,22 +71,24 @@ DatabaseReplicated::DatabaseReplicated( const String & zookeeper_path_, const String & replica_name_, const Context & context_) - : DatabaseOrdinary(name_, metadata_path_, "data/", "DatabaseReplicated (" + name_ + ")", context_) + : DatabaseOrdinary(name_, metadata_path_, "data/" + escapeForFileName(name_) + "/", "DatabaseReplicated (" + name_ + ")", context_) , zookeeper_path(zookeeper_path_) , replica_name(replica_name_) { + LOG_DEBUG(log, "METADATA PATH ARGUMENT " << metadata_path_); + LOG_DEBUG(log, "METADATA PATH ACTUAL " << getMetadataPath()); if (!zookeeper_path.empty() && zookeeper_path.back() == '/') zookeeper_path.resize(zookeeper_path.size() - 1); - /// If zookeeper chroot prefix is used, path should start with '/', because chroot concatenates without it. + // If zookeeper chroot prefix is used, path should start with '/', because chroot concatenates without it. 
if (!zookeeper_path.empty() && zookeeper_path.front() != '/') zookeeper_path = "/" + zookeeper_path; + replica_path = zookeeper_path + "/replicas/" + replica_name; if (context_.hasZooKeeper()) { current_zookeeper = context_.getZooKeeper(); } - if (!current_zookeeper) { throw Exception("Can't create replicated database without ZooKeeper", ErrorCodes::NO_ZOOKEEPER); @@ -95,6 +97,7 @@ DatabaseReplicated::DatabaseReplicated( } // test without this fancy mess (prob wont work) + // it works current_zookeeper->createAncestors(replica_path); current_zookeeper->createOrUpdate(replica_path, String(), zkutil::CreateMode::Persistent); @@ -172,12 +175,10 @@ void DatabaseReplicated::dropTable( void DatabaseReplicated::drop(const Context & context) { - DatabaseOnDisk::drop(context); - // replicated stuff - //String statement = getObjectDefinitionFromCreateQuery(query); - // should it be possible to recover after a drop. - // if not, we can just delete all the zookeeper nodes starting from - // zookeeper path. does it work recursively? hope so... + current_zookeeper = getZooKeeper(); + current_zookeeper->remove(replica_path); + + DatabaseOnDisk::drop(context); // no throw } // sync replica's zookeeper metadata diff --git a/tests/queries/0_stateless/01267_replicated_database_engine_zookeeper.sql b/tests/queries/0_stateless/01267_replicated_database_engine_zookeeper.sql index 94b461e2f93..c70de9a50d2 100644 --- a/tests/queries/0_stateless/01267_replicated_database_engine_zookeeper.sql +++ b/tests/queries/0_stateless/01267_replicated_database_engine_zookeeper.sql @@ -1,12 +1,10 @@ DROP DATABASE IF EXISTS test_db1; DROP DATABASE IF EXISTS test_db2; -DROP TABLE IF EXISTS test_table1; -DROP TABLE IF EXISTS test_table2; CREATE DATABASE test_db1 ENGINE = Replicated('/clickhouse/databases/test1', 'id1'); -USE test_db1; -CREATE TABLE test_table1 (d Date, k UInt64, i32 Int32) ENGINE=ReplicatedMergeTree('/clickhouse/tables/test1', 'id1', d, k, 8192); +CREATE TABLE test_db1.replicated_table (d Date, k UInt64, i32 Int32) ENGINE=ReplicatedMergeTree('/clickhouse/tables/test1', 'id1', d, k, 8192); +CREATE TABLE test_db1.basic_table (EventDate Date, CounterID Int) engine=MergeTree(EventDate, (CounterID, EventDate), 8192); CREATE DATABASE test_db2 ENGINE = Replicated('/clickhouse/databases/test1', 'id2'); -USE test_db2; -CREATE TABLE test_table2 (d Date, k UInt64, i32 Int32) ENGINE=ReplicatedMergeTree('/clickhouse/tables/test1', 'id2', d, k, 8192); +CREATE TABLE test_db2.replicated_table (d Date, k UInt64, i32 Int32) ENGINE=ReplicatedMergeTree('/clickhouse/tables/test1', 'id2', d, k, 8192); +CREATE TABLE test_db2.basic_table (EventDate Date, CounterID Int) engine=MergeTree(EventDate, (CounterID, EventDate), 8192); From 319256ef4f29b0e4d4d0f5034874961fbb64813d Mon Sep 17 00:00:00 2001 From: Val Date: Tue, 5 May 2020 17:16:59 +0300 Subject: [PATCH 011/381] an attempt to replicated create query from create query --- src/Databases/DatabaseReplicated.cpp | 198 +++++++++----------- src/Databases/DatabaseReplicated.h | 60 +++--- src/Databases/IDatabase.h | 4 + src/Interpreters/InterpreterCreateQuery.cpp | 15 +- 4 files changed, 143 insertions(+), 134 deletions(-) diff --git a/src/Databases/DatabaseReplicated.cpp b/src/Databases/DatabaseReplicated.cpp index 61bcfc8d5a9..a1eb910dedf 100644 --- a/src/Databases/DatabaseReplicated.cpp +++ b/src/Databases/DatabaseReplicated.cpp @@ -2,6 +2,7 @@ #include #include +#include #include #include #include @@ -70,8 +71,11 @@ DatabaseReplicated::DatabaseReplicated( const String & metadata_path_, 
const String & zookeeper_path_, const String & replica_name_, - const Context & context_) - : DatabaseOrdinary(name_, metadata_path_, "data/" + escapeForFileName(name_) + "/", "DatabaseReplicated (" + name_ + ")", context_) + Context & context_) +// : DatabaseOrdinary(name_, metadata_path_, "data/" + escapeForFileName(name_) + "/", "DatabaseReplicated (" + name_ + ")", context_) + // TODO add constructor to Atomic and call it here with path and logger name specification + // TODO ask why const and & are ommited in Atomic + : DatabaseAtomic(name_, metadata_path_, context_) , zookeeper_path(zookeeper_path_) , replica_name(replica_name_) { @@ -96,115 +100,97 @@ DatabaseReplicated::DatabaseReplicated( } - // test without this fancy mess (prob wont work) - // it works - current_zookeeper->createAncestors(replica_path); - current_zookeeper->createOrUpdate(replica_path, String(), zkutil::CreateMode::Persistent); + current_zookeeper->createAncestors(zookeeper_path); + current_zookeeper->createOrUpdate(zookeeper_path, String(), zkutil::CreateMode::Persistent); -// if (!current_zookeeper->exists(zookeeper_path)) { -// -// LOG_DEBUG(log, "Creating database " << zookeeper_path); -// current_zookeeper->createAncestors(zookeeper_path); - - // Coordination::Requests ops; - // ops.emplace_back(zkutil::makeCreateRequest(zookeeper_path, "", - // zkutil::CreateMode::Persistent)); - // ops.emplace_back(zkutil::makeCreateRequest(zookeeper_path + "/replicas", "", - // zkutil::CreateMode::Persistent)); - - // Coordination::Responses responses; - // auto code = current_zookeeper->tryMulti(ops, responses); - // if (code && code != Coordination::ZNODEEXISTS) - // throw Coordination::Exception(code); - // } -} - -void DatabaseReplicated::createTable( - const Context & context, - const String & table_name, - const StoragePtr & table, - const ASTPtr & query) -{ - // try? 
- DatabaseOnDisk::createTable(context, table_name, table, query); - - // suppose it worked - String statement = getObjectDefinitionFromCreateQuery(query); - LOG_DEBUG(log, "CREATE TABLE STATEMENT " << statement); - - // let's do dumb write to zk at the first iteration - current_zookeeper = getZooKeeper(); - current_zookeeper->createOrUpdate(replica_path + "/" + table_name, statement, zkutil::CreateMode::Persistent); + // TODO launch a worker here } -void DatabaseReplicated::renameTable( - const Context & context, - const String & table_name, - IDatabase & to_database, - const String & to_table_name, - bool exchange) -{ - // try - DatabaseOnDisk::renameTable(context, table_name, to_database, to_table_name, exchange); - // replicated stuff; what to put to a znode - // String statement = getObjectDefinitionFromCreateQuery(query); - // this one is fairly more complex - current_zookeeper = getZooKeeper(); - - // no need for now to have stat - Coordination::Stat metadata_stat; - auto statement = current_zookeeper->get(replica_path + "/" + table_name, &metadata_stat); - current_zookeeper->createOrUpdate(replica_path + "/" + to_table_name, statement, zkutil::CreateMode::Persistent); - current_zookeeper->remove(replica_path + "/" + table_name); - // TODO add rename statement to the log +void DatabaseReplicated::propose(const ASTPtr & query) { + LOG_DEBUG(log, "PROPOSING\n" << queryToString(query)); } -void DatabaseReplicated::dropTable( - const Context & context, - const String & table_name, - bool no_delay) -{ - // try - DatabaseOnDisk::dropTable(context, table_name, no_delay); - - // let's do dumb remove from zk at the first iteration - current_zookeeper = getZooKeeper(); - current_zookeeper->remove(replica_path + "/" + table_name); -} - -void DatabaseReplicated::drop(const Context & context) -{ - current_zookeeper = getZooKeeper(); - current_zookeeper->remove(replica_path); - - DatabaseOnDisk::drop(context); // no throw -} - -// sync replica's zookeeper metadata -void DatabaseReplicated::syncReplicaState(Context & context) { - auto c = context; // fixes unuser parameter error - return; -} - -// get the up to date metadata from zookeeper to local metadata dir -// for replicated (only?) 
tables -void DatabaseReplicated::updateMetadata(Context & context) { - auto c = context; // fixes unuser parameter error - return; -} - -void DatabaseReplicated::loadStoredObjects( - Context & context, - bool has_force_restore_data_flag) -{ - syncReplicaState(context); - updateMetadata(context); - - DatabaseOrdinary::loadStoredObjects(context, has_force_restore_data_flag); - -} - - +// void DatabaseReplicated::createTable( +// const Context & context, +// const String & table_name, +// const StoragePtr & table, +// const ASTPtr & query) +// { +// LOG_DEBUG(log, "CREATE TABLE"); +// +// +// DatabaseOnDisk::createTable(context, table_name, table, query); +// +// // String statement = getObjectDefinitionFromCreateQuery(query); +// +// // current_zookeeper = getZooKeeper(); +// // current_zookeeper->createOrUpdate(replica_path + "/" + table_name + ".sql", statement, zkutil::CreateMode::Persistent); +// return; +// } +// +// +// void DatabaseReplicated::renameTable( +// const Context & context, +// const String & table_name, +// IDatabase & to_database, +// const String & to_table_name, +// bool exchange) +// { +// LOG_DEBUG(log, "RENAME TABLE"); +// DatabaseAtomic::renameTable(context, table_name, to_database, to_table_name, exchange); +// // try +// // DatabaseOnDisk::renameTable(context, table_name, to_database, to_table_name, exchange); +// // replicated stuff; what to put to a znode +// // String statement = getObjectDefinitionFromCreateQuery(query); +// // this one is fairly more complex +// // current_zookeeper = getZooKeeper(); +// +// // no need for now to have stat +// // Coordination::Stat metadata_stat; +// // auto statement = current_zookeeper->get(replica_path + "/" + table_name, &metadata_stat); +// // current_zookeeper->createOrUpdate(replica_path + "/" + to_table_name, statement, zkutil::CreateMode::Persistent); +// // current_zookeeper->remove(replica_path + "/" + table_name); +// // TODO add rename statement to the log +// return; +// } +// +// void DatabaseReplicated::dropTable( +// const Context & context, +// const String & table_name, +// bool no_delay) +// { +// LOG_DEBUG(log, "DROP TABLE"); +// DatabaseAtomic::dropTable(context, table_name, no_delay); +// // try +// // DatabaseOnDisk::dropTable(context, table_name, no_delay); +// +// // let's do dumb remove from zk at the first iteration +// // current_zookeeper = getZooKeeper(); +// // current_zookeeper->remove(replica_path + "/" + table_name); +// return; +// } +// +// void DatabaseReplicated::drop(const Context & context) +// { +// LOG_DEBUG(log, "DROP"); +// DatabaseAtomic::drop(context); +// // current_zookeeper = getZooKeeper(); +// // current_zookeeper->remove(replica_path); +// +// // DatabaseOnDisk::drop(context); // no throw +// return; +// } +// +// void DatabaseReplicated::loadStoredObjects( +// Context & context, +// bool has_force_restore_data_flag) +// { +// DatabaseOrdinary::loadStoredObjects(context, has_force_restore_data_flag); +// // launch a worker maybe. 
i don't know +// // DatabaseAtomic::loadStoredObjects(context, has_force_restore_data_flag); +// +// return; +// } } diff --git a/src/Databases/DatabaseReplicated.h b/src/Databases/DatabaseReplicated.h index bc1af923277..df6f86c1491 100644 --- a/src/Databases/DatabaseReplicated.h +++ b/src/Databases/DatabaseReplicated.h @@ -1,6 +1,6 @@ #pragma once -#include +#include #include #include @@ -11,36 +11,47 @@ namespace DB * that contain declaration of table represented by SQL ATTACH TABLE query * and operation log in zookeeper */ -class DatabaseReplicated : public DatabaseOrdinary +class DatabaseReplicated : public DatabaseAtomic { public: - DatabaseReplicated(const String & name_, const String & metadata_path_, const String & zookeeper_path_, const String & replica_name_, const Context & context); + DatabaseReplicated(const String & name_, const String & metadata_path_, const String & zookeeper_path_, const String & replica_name_, Context & context); + +// void drop(const Context & context) override; String getEngineName() const override { return "Replicated"; } - void createTable( - const Context & context, - const String & table_name, - const StoragePtr & table, - const ASTPtr & query) override; + void propose(const ASTPtr & query) override; - void dropTable( - const Context & context, - const String & table_name, - bool no_delay) override; +// void createTable( +// const Context & context, +// const String & table_name, +// const StoragePtr & table, +// const ASTPtr & query) override; +// +// void dropTable( +// const Context & context, +// const String & table_name, +// bool no_delay) override; +// +// void renameTable( +// const Context & context, +// const String & table_name, +// IDatabase & to_database, +// const String & to_table_name, +// bool exchange) override; +// +// void alterTable( +// const Context & context, +// const StorageID & table_id, +// const StorageInMemoryMetadata & metadata) override; - void renameTable( - const Context & context, - const String & table_name, - IDatabase & to_database, - const String & to_table_name, - bool exchange) override; +// void attachTable(const String & name, const StoragePtr & table, const String & relative_table_path) override; +// +// StoragePtr detachTable(const String & name) override; - void drop(const Context & context) override; - - void loadStoredObjects( - Context & context, - bool has_force_restore_data_flag) override; +// void loadStoredObjects( +// Context & context, +// bool has_force_restore_data_flag) override; private: String zookeeper_path; @@ -54,9 +65,6 @@ private: zkutil::ZooKeeperPtr getZooKeeper() const; void setZooKeeper(zkutil::ZooKeeperPtr zookeeper); - void syncReplicaState(Context & context); - - void updateMetadata(Context & context); }; } diff --git a/src/Databases/IDatabase.h b/src/Databases/IDatabase.h index 26b27045be6..18265b153cf 100644 --- a/src/Databases/IDatabase.h +++ b/src/Databases/IDatabase.h @@ -161,6 +161,10 @@ public: /// Is the database empty. virtual bool empty() const = 0; + virtual void propose(const ASTPtr & /*query*/) { + throw Exception("There is no propose query method for Database" + getEngineName(), ErrorCodes::NOT_IMPLEMENTED); + } + /// Add the table to the database. Record its presence in the metadata. 
virtual void createTable( const Context & /*context*/, diff --git a/src/Interpreters/InterpreterCreateQuery.cpp b/src/Interpreters/InterpreterCreateQuery.cpp index 3e09d728c4c..99c021a72fa 100644 --- a/src/Interpreters/InterpreterCreateQuery.cpp +++ b/src/Interpreters/InterpreterCreateQuery.cpp @@ -622,7 +622,7 @@ bool InterpreterCreateQuery::doCreateTable(ASTCreateQuery & create, if (need_add_to_database) { database = DatabaseCatalog::instance().getDatabase(create.database); - if (database->getEngineName() == "Atomic") + if (database->getEngineName() == "Atomic" || database->getEngineName() == "Replicated") { /// TODO implement ATTACH FROM 'path/to/data': generate UUID and move table data to store/ if (create.attach && create.uuid == UUIDHelpers::Nil) @@ -696,7 +696,18 @@ bool InterpreterCreateQuery::doCreateTable(ASTCreateQuery & create, false); } - database->createTable(context, table_name, res, query_ptr); + + if (database->getEngineName() == "Replicated") { + // propose + // try to + database->propose(query_ptr); + database->createTable(context, table_name, res, query_ptr); + // catch + // throw and remove proposal + // otherwise + // proceed (commit to zk) + } else + database->createTable(context, table_name, res, query_ptr); /// We must call "startup" and "shutdown" while holding DDLGuard. /// Because otherwise method "shutdown" (from InterpreterDropQuery) can be called before startup From 0a860c0c2ba760bf8c6ea45378acc0f00cb2bcff Mon Sep 17 00:00:00 2001 From: Val Date: Mon, 11 May 2020 15:55:17 +0300 Subject: [PATCH 012/381] log based replicated --- src/Databases/DatabaseReplicated.cpp | 177 ++++++++++---------- src/Databases/DatabaseReplicated.h | 57 +++---- src/Interpreters/ClientInfo.h | 1 + src/Interpreters/Context.h | 3 + src/Interpreters/DDLWorker.cpp | 3 +- src/Interpreters/InterpreterAlterQuery.cpp | 9 + src/Interpreters/InterpreterCreateQuery.cpp | 4 +- src/Interpreters/InterpreterDropQuery.cpp | 6 + src/Interpreters/InterpreterRenameQuery.cpp | 6 +- 9 files changed, 142 insertions(+), 124 deletions(-) diff --git a/src/Databases/DatabaseReplicated.cpp b/src/Databases/DatabaseReplicated.cpp index a1eb910dedf..1bc954bfb76 100644 --- a/src/Databases/DatabaseReplicated.cpp +++ b/src/Databases/DatabaseReplicated.cpp @@ -6,11 +6,13 @@ #include #include #include +#include #include #include #include #include #include +#include #include #include #include @@ -24,6 +26,7 @@ #include #include #include +#include #include #include #include @@ -33,8 +36,10 @@ #include #include #include +#include #include +#include namespace DB { @@ -75,13 +80,11 @@ DatabaseReplicated::DatabaseReplicated( // : DatabaseOrdinary(name_, metadata_path_, "data/" + escapeForFileName(name_) + "/", "DatabaseReplicated (" + name_ + ")", context_) // TODO add constructor to Atomic and call it here with path and logger name specification // TODO ask why const and & are ommited in Atomic - : DatabaseAtomic(name_, metadata_path_, context_) + : DatabaseOrdinary(name_, metadata_path_, context_) + , context(context_) , zookeeper_path(zookeeper_path_) , replica_name(replica_name_) { - LOG_DEBUG(log, "METADATA PATH ARGUMENT " << metadata_path_); - LOG_DEBUG(log, "METADATA PATH ACTUAL " << getMetadataPath()); - if (!zookeeper_path.empty() && zookeeper_path.back() == '/') zookeeper_path.resize(zookeeper_path.size() - 1); // If zookeeper chroot prefix is used, path should start with '/', because chroot concatenates without it. 
@@ -103,94 +106,96 @@ DatabaseReplicated::DatabaseReplicated( current_zookeeper->createAncestors(zookeeper_path); current_zookeeper->createOrUpdate(zookeeper_path, String(), zkutil::CreateMode::Persistent); + // TODO if no last_entry then make it equal to 0 in zk; + // TODO launch a worker here + + main_thread = ThreadFromGlobalPool(&DatabaseReplicated::runMainThread, this); +} + +DatabaseReplicated::~DatabaseReplicated() +{ + stop_flag = true; + main_thread.join(); +} + +void DatabaseReplicated::runMainThread() { + setThreadName("ReplctdWorker"); // ok whatever. 15 bytes // + database_name); + LOG_DEBUG(log, "Started " << database_name << " database worker thread\n Replica: " << replica_name); + + while (!stop_flag) { + attachToThreadGroup(); + + sleepForSeconds(10); + current_zookeeper = getZooKeeper(); + String last_n = current_zookeeper->get(zookeeper_path + "/last_entry", {}, NULL); + size_t last_n_parsed = parse(last_n); + while (current_log_entry_n < last_n_parsed) { + current_log_entry_n++; + executeLog(current_log_entry_n); + } + break; // debug purpose + } +} + +void DatabaseReplicated::executeLog(size_t n) { + + LOG_DEBUG(log, "EXECUTING LOG! DB: " << database_name << "\n Replica: " << replica_name << "LOG N" << n); + current_context = std::make_unique(context); + current_context->from_replicated_log = true; + current_context->setCurrentQueryId(""); // generate random query_id + current_zookeeper = getZooKeeper(); + + String query_to_execute = current_zookeeper->get(zookeeper_path + "/log." + std::to_string(n), {}, NULL); + ReadBufferFromString istr(query_to_execute); + String dummy_string; + WriteBufferFromString ostr(dummy_string); + executeQuery(istr, ostr, false, context, {}); +} + +// TODO we might not need it here at all +void DatabaseReplicated::attachToThreadGroup() { + if (thread_group) + { + /// Put all threads to one thread pool + CurrentThread::attachToIfDetached(thread_group); + } + else + { + CurrentThread::initializeQuery(); + thread_group = CurrentThread::getGroup(); + } +} + +// taken from ddlworker +static std::unique_ptr createSimpleZooKeeperLock( + const std::shared_ptr & zookeeper, const String & lock_prefix, const String & lock_name, const String & lock_message) +{ + auto zookeeper_holder = std::make_shared(); + zookeeper_holder->initFromInstance(zookeeper); + return std::make_unique(std::move(zookeeper_holder), lock_prefix, lock_name, lock_message); } void DatabaseReplicated::propose(const ASTPtr & query) { + // TODO if source is zk then omit propose. Throw? + + // TODO remove that log message i think LOG_DEBUG(log, "PROPOSING\n" << queryToString(query)); + + current_zookeeper = getZooKeeper(); + auto lock = createSimpleZooKeeperLock(current_zookeeper, zookeeper_path, "lock", replica_name); + + // TODO check that last_entry is the same as current_log_entry_n for the replica + + current_log_entry_n++; // starting from 1 + String log_entry = zookeeper_path + "/log." + std::to_string(current_log_entry_n); + current_zookeeper->createOrUpdate(log_entry, queryToString(query), zkutil::CreateMode::Persistent); + + current_zookeeper->createOrUpdate(zookeeper_path + "/last_entry", std::to_string(current_log_entry_n), zkutil::CreateMode::Persistent); + + lock->unlock(); + // write to metastore the last entry? 
} -// void DatabaseReplicated::createTable( -// const Context & context, -// const String & table_name, -// const StoragePtr & table, -// const ASTPtr & query) -// { -// LOG_DEBUG(log, "CREATE TABLE"); -// -// -// DatabaseOnDisk::createTable(context, table_name, table, query); -// -// // String statement = getObjectDefinitionFromCreateQuery(query); -// -// // current_zookeeper = getZooKeeper(); -// // current_zookeeper->createOrUpdate(replica_path + "/" + table_name + ".sql", statement, zkutil::CreateMode::Persistent); -// return; -// } -// -// -// void DatabaseReplicated::renameTable( -// const Context & context, -// const String & table_name, -// IDatabase & to_database, -// const String & to_table_name, -// bool exchange) -// { -// LOG_DEBUG(log, "RENAME TABLE"); -// DatabaseAtomic::renameTable(context, table_name, to_database, to_table_name, exchange); -// // try -// // DatabaseOnDisk::renameTable(context, table_name, to_database, to_table_name, exchange); -// // replicated stuff; what to put to a znode -// // String statement = getObjectDefinitionFromCreateQuery(query); -// // this one is fairly more complex -// // current_zookeeper = getZooKeeper(); -// -// // no need for now to have stat -// // Coordination::Stat metadata_stat; -// // auto statement = current_zookeeper->get(replica_path + "/" + table_name, &metadata_stat); -// // current_zookeeper->createOrUpdate(replica_path + "/" + to_table_name, statement, zkutil::CreateMode::Persistent); -// // current_zookeeper->remove(replica_path + "/" + table_name); -// // TODO add rename statement to the log -// return; -// } -// -// void DatabaseReplicated::dropTable( -// const Context & context, -// const String & table_name, -// bool no_delay) -// { -// LOG_DEBUG(log, "DROP TABLE"); -// DatabaseAtomic::dropTable(context, table_name, no_delay); -// // try -// // DatabaseOnDisk::dropTable(context, table_name, no_delay); -// -// // let's do dumb remove from zk at the first iteration -// // current_zookeeper = getZooKeeper(); -// // current_zookeeper->remove(replica_path + "/" + table_name); -// return; -// } -// -// void DatabaseReplicated::drop(const Context & context) -// { -// LOG_DEBUG(log, "DROP"); -// DatabaseAtomic::drop(context); -// // current_zookeeper = getZooKeeper(); -// // current_zookeeper->remove(replica_path); -// -// // DatabaseOnDisk::drop(context); // no throw -// return; -// } -// -// void DatabaseReplicated::loadStoredObjects( -// Context & context, -// bool has_force_restore_data_flag) -// { -// DatabaseOrdinary::loadStoredObjects(context, has_force_restore_data_flag); -// // launch a worker maybe. i don't know -// // DatabaseAtomic::loadStoredObjects(context, has_force_restore_data_flag); -// -// return; -// } - } diff --git a/src/Databases/DatabaseReplicated.h b/src/Databases/DatabaseReplicated.h index df6f86c1491..d61f0a00ef8 100644 --- a/src/Databases/DatabaseReplicated.h +++ b/src/Databases/DatabaseReplicated.h @@ -1,9 +1,12 @@ #pragma once -#include +#include #include #include +#include +#include + namespace DB { /** Replicated database engine. 
@@ -11,49 +14,35 @@ namespace DB * that contain declaration of table represented by SQL ATTACH TABLE query * and operation log in zookeeper */ -class DatabaseReplicated : public DatabaseAtomic +class DatabaseReplicated : public DatabaseOrdinary { public: DatabaseReplicated(const String & name_, const String & metadata_path_, const String & zookeeper_path_, const String & replica_name_, Context & context); -// void drop(const Context & context) override; + ~DatabaseReplicated(); String getEngineName() const override { return "Replicated"; } void propose(const ASTPtr & query) override; -// void createTable( -// const Context & context, -// const String & table_name, -// const StoragePtr & table, -// const ASTPtr & query) override; -// -// void dropTable( -// const Context & context, -// const String & table_name, -// bool no_delay) override; -// -// void renameTable( -// const Context & context, -// const String & table_name, -// IDatabase & to_database, -// const String & to_table_name, -// bool exchange) override; -// -// void alterTable( -// const Context & context, -// const StorageID & table_id, -// const StorageInMemoryMetadata & metadata) override; - -// void attachTable(const String & name, const StoragePtr & table, const String & relative_table_path) override; -// -// StoragePtr detachTable(const String & name) override; - -// void loadStoredObjects( -// Context & context, -// bool has_force_restore_data_flag) override; - private: + + void runMainThread(); + void runCleanupThread(); + + void attachToThreadGroup(); + + void executeLog(size_t n); + + Context & context; // is it overkiill? + std::unique_ptr current_context; // to run executeQuery + + size_t current_log_entry_n = 0; + std::atomic stop_flag{false}; + + ThreadFromGlobalPool main_thread; + ThreadGroupStatusPtr thread_group; + String zookeeper_path; String replica_name; String replica_path; diff --git a/src/Interpreters/ClientInfo.h b/src/Interpreters/ClientInfo.h index 704fba3b3ef..2dff30e40a2 100644 --- a/src/Interpreters/ClientInfo.h +++ b/src/Interpreters/ClientInfo.h @@ -38,6 +38,7 @@ public: NO_QUERY = 0, /// Uninitialized object. INITIAL_QUERY = 1, SECONDARY_QUERY = 2, /// Query that was initiated by another query for distributed or ON CLUSTER query execution. + REPLICATED_LOG_QUERY = 3, /// TODO add comment }; diff --git a/src/Interpreters/Context.h b/src/Interpreters/Context.h index 5a4e959229f..66ea6f6914c 100644 --- a/src/Interpreters/Context.h +++ b/src/Interpreters/Context.h @@ -214,6 +214,9 @@ private: Context(); public: + ///testing + bool from_replicated_log = false; + /// Create initial Context with ContextShared and etc. 
static Context createGlobal(ContextShared * shared); static SharedContextHolder createShared(); diff --git a/src/Interpreters/DDLWorker.cpp b/src/Interpreters/DDLWorker.cpp index 28436f192b0..65f984924a3 100644 --- a/src/Interpreters/DDLWorker.cpp +++ b/src/Interpreters/DDLWorker.cpp @@ -585,7 +585,8 @@ bool DDLWorker::tryExecuteQuery(const String & query, const DDLTask & task, Exec try { current_context = std::make_unique(context); - current_context->getClientInfo().query_kind = ClientInfo::QueryKind::SECONDARY_QUERY; + //current_context->getClientInfo().query_kind = ClientInfo::QueryKind::SECONDARY_QUERY; + current_context->from_replicated_log = true; current_context->setCurrentQueryId(""); // generate random query_id executeQuery(istr, ostr, false, *current_context, {}); } diff --git a/src/Interpreters/InterpreterAlterQuery.cpp b/src/Interpreters/InterpreterAlterQuery.cpp index 61277b8160c..ad79bd68fed 100644 --- a/src/Interpreters/InterpreterAlterQuery.cpp +++ b/src/Interpreters/InterpreterAlterQuery.cpp @@ -15,6 +15,8 @@ #include #include #include +#include +#include namespace DB @@ -37,6 +39,7 @@ BlockIO InterpreterAlterQuery::execute() { const auto & alter = query_ptr->as(); + if (!alter.cluster.empty()) return executeDDLQueryOnCluster(query_ptr, context, getRequiredAccess()); @@ -46,6 +49,12 @@ BlockIO InterpreterAlterQuery::execute() auto alter_lock = table->lockForAlter(context.getCurrentQueryId(), context.getSettingsRef().lock_acquire_timeout); auto metadata_snapshot = table->getInMemoryMetadataPtr(); + // TODO it's dirty. need to add database to parsing stage + DatabasePtr database = DatabaseCatalog::instance().getDatabase(table_id.database_name); + if (database->getEngineName() == "Replicated" && !context.from_replicated_log) { + database->propose(query_ptr); + } + /// Add default database to table identifiers that we can encounter in e.g. default expressions, /// mutation expression, etc. 
AddDefaultDatabaseVisitor visitor(table_id.getDatabaseName()); diff --git a/src/Interpreters/InterpreterCreateQuery.cpp b/src/Interpreters/InterpreterCreateQuery.cpp index 99c021a72fa..5698c370fa1 100644 --- a/src/Interpreters/InterpreterCreateQuery.cpp +++ b/src/Interpreters/InterpreterCreateQuery.cpp @@ -622,7 +622,7 @@ bool InterpreterCreateQuery::doCreateTable(ASTCreateQuery & create, if (need_add_to_database) { database = DatabaseCatalog::instance().getDatabase(create.database); - if (database->getEngineName() == "Atomic" || database->getEngineName() == "Replicated") + if (database->getEngineName() == "Atomic") // || database->getEngineName() == "Replicated") { /// TODO implement ATTACH FROM 'path/to/data': generate UUID and move table data to store/ if (create.attach && create.uuid == UUIDHelpers::Nil) @@ -697,7 +697,7 @@ bool InterpreterCreateQuery::doCreateTable(ASTCreateQuery & create, } - if (database->getEngineName() == "Replicated") { + if (database->getEngineName() == "Replicated" && !context.from_replicated_log) { // propose // try to database->propose(query_ptr); diff --git a/src/Interpreters/InterpreterDropQuery.cpp b/src/Interpreters/InterpreterDropQuery.cpp index e6853a8af4c..bae1b796016 100644 --- a/src/Interpreters/InterpreterDropQuery.cpp +++ b/src/Interpreters/InterpreterDropQuery.cpp @@ -97,6 +97,9 @@ BlockIO InterpreterDropQuery::executeToTable( if (database->getEngineName() != "Atomic") table_lock = table->lockExclusively(context.getCurrentQueryId(), context.getSettingsRef().lock_acquire_timeout); /// Drop table from memory, don't touch data and metadata + if (database->getEngineName() == "Replicated" && !context.from_replicated_log) { + database->propose(query_ptr); + } database->detachTable(table_id.table_name); } else if (query.kind == ASTDropQuery::Kind::Truncate) @@ -120,6 +123,9 @@ BlockIO InterpreterDropQuery::executeToTable( if (database->getEngineName() != "Atomic") table_lock = table->lockExclusively(context.getCurrentQueryId(), context.getSettingsRef().lock_acquire_timeout); + if (database->getEngineName() == "Replicated" && !context.from_replicated_log) { + database->propose(query_ptr); + } database->dropTable(context, table_id.table_name, query.no_delay); } } diff --git a/src/Interpreters/InterpreterRenameQuery.cpp b/src/Interpreters/InterpreterRenameQuery.cpp index de2b6bb0c1c..d93b14a6bc2 100644 --- a/src/Interpreters/InterpreterRenameQuery.cpp +++ b/src/Interpreters/InterpreterRenameQuery.cpp @@ -80,7 +80,11 @@ BlockIO InterpreterRenameQuery::execute() if (!rename.exchange) database_catalog.assertTableDoesntExist(StorageID(elem.to_database_name, elem.to_table_name), context); - database_catalog.getDatabase(elem.from_database_name)->renameTable( + DatabasePtr database = database_catalog.getDatabase(elem.from_database_name); + if (database->getEngineName() == "Replicated" && !context.from_replicated_log) { + database->propose(query_ptr); + } + database->renameTable( context, elem.from_table_name, *database_catalog.getDatabase(elem.to_database_name), From 5eea58039c6f78a93eabd65792e8ed5c47615127 Mon Sep 17 00:00:00 2001 From: Val Date: Mon, 11 May 2020 16:31:14 +0300 Subject: [PATCH 013/381] fix not initialized last entry in zk --- src/Databases/DatabaseReplicated.cpp | 14 ++++++++------ src/Databases/DatabaseReplicated.h | 2 +- src/Interpreters/DDLWorker.cpp | 3 +-- 3 files changed, 10 insertions(+), 9 deletions(-) diff --git a/src/Databases/DatabaseReplicated.cpp b/src/Databases/DatabaseReplicated.cpp index 1bc954bfb76..36c95f68c2c 100644 --- 
a/src/Databases/DatabaseReplicated.cpp +++ b/src/Databases/DatabaseReplicated.cpp @@ -99,8 +99,6 @@ DatabaseReplicated::DatabaseReplicated( if (!current_zookeeper) { throw Exception("Can't create replicated database without ZooKeeper", ErrorCodes::NO_ZOOKEEPER); - - } current_zookeeper->createAncestors(zookeeper_path); @@ -109,7 +107,6 @@ DatabaseReplicated::DatabaseReplicated( // TODO if no last_entry then make it equal to 0 in zk; // TODO launch a worker here - main_thread = ThreadFromGlobalPool(&DatabaseReplicated::runMainThread, this); } @@ -126,15 +123,20 @@ void DatabaseReplicated::runMainThread() { while (!stop_flag) { attachToThreadGroup(); - sleepForSeconds(10); + sleepForSeconds(2); current_zookeeper = getZooKeeper(); - String last_n = current_zookeeper->get(zookeeper_path + "/last_entry", {}, NULL); + String last_n; + if (!current_zookeeper->tryGet(zookeeper_path + "/last_entry", last_n, {}, NULL)) { + continue; + } size_t last_n_parsed = parse(last_n); + LOG_DEBUG(log, "PARSED " << last_n_parsed); + LOG_DEBUG(log, "LOCAL CURRENT " << current_log_entry_n); while (current_log_entry_n < last_n_parsed) { current_log_entry_n++; executeLog(current_log_entry_n); } - break; // debug purpose + // break; // debug purpose } } diff --git a/src/Databases/DatabaseReplicated.h b/src/Databases/DatabaseReplicated.h index d61f0a00ef8..7700d17d9e4 100644 --- a/src/Databases/DatabaseReplicated.h +++ b/src/Databases/DatabaseReplicated.h @@ -37,7 +37,7 @@ private: Context & context; // is it overkiill? std::unique_ptr current_context; // to run executeQuery - size_t current_log_entry_n = 0; + std::atomic current_log_entry_n = 0; std::atomic stop_flag{false}; ThreadFromGlobalPool main_thread; diff --git a/src/Interpreters/DDLWorker.cpp b/src/Interpreters/DDLWorker.cpp index 65f984924a3..28436f192b0 100644 --- a/src/Interpreters/DDLWorker.cpp +++ b/src/Interpreters/DDLWorker.cpp @@ -585,8 +585,7 @@ bool DDLWorker::tryExecuteQuery(const String & query, const DDLTask & task, Exec try { current_context = std::make_unique(context); - //current_context->getClientInfo().query_kind = ClientInfo::QueryKind::SECONDARY_QUERY; - current_context->from_replicated_log = true; + current_context->getClientInfo().query_kind = ClientInfo::QueryKind::SECONDARY_QUERY; current_context->setCurrentQueryId(""); // generate random query_id executeQuery(istr, ostr, false, *current_context, {}); } From d61259cd7b2f9f49c8a1e6da6a431a97d6616f45 Mon Sep 17 00:00:00 2001 From: Val Date: Tue, 12 May 2020 16:35:05 +0300 Subject: [PATCH 014/381] ddl replication works --- src/Databases/DatabaseReplicated.cpp | 23 ++++++++++++++++------- src/Databases/DatabaseReplicated.h | 1 - 2 files changed, 16 insertions(+), 8 deletions(-) diff --git a/src/Databases/DatabaseReplicated.cpp b/src/Databases/DatabaseReplicated.cpp index 36c95f68c2c..2c7f6facf71 100644 --- a/src/Databases/DatabaseReplicated.cpp +++ b/src/Databases/DatabaseReplicated.cpp @@ -32,6 +32,7 @@ #include #include #include +#include #include #include @@ -81,7 +82,6 @@ DatabaseReplicated::DatabaseReplicated( // TODO add constructor to Atomic and call it here with path and logger name specification // TODO ask why const and & are ommited in Atomic : DatabaseOrdinary(name_, metadata_path_, context_) - , context(context_) , zookeeper_path(zookeeper_path_) , replica_name(replica_name_) { @@ -142,17 +142,26 @@ void DatabaseReplicated::runMainThread() { void DatabaseReplicated::executeLog(size_t n) { - LOG_DEBUG(log, "EXECUTING LOG! 
DB: " << database_name << "\n Replica: " << replica_name << "LOG N" << n); - current_context = std::make_unique(context); - current_context->from_replicated_log = true; - current_context->setCurrentQueryId(""); // generate random query_id current_zookeeper = getZooKeeper(); - String query_to_execute = current_zookeeper->get(zookeeper_path + "/log." + std::to_string(n), {}, NULL); ReadBufferFromString istr(query_to_execute); String dummy_string; WriteBufferFromString ostr(dummy_string); - executeQuery(istr, ostr, false, context, {}); + + try + { + current_context = std::make_unique(global_context); + current_context->from_replicated_log = true; + current_context->setCurrentQueryId(""); // generate random query_id + executeQuery(istr, ostr, false, *current_context, {}); + } + catch (...) + { + tryLogCurrentException(log, "Query " + query_to_execute + " wasn't finished successfully"); + + } + + LOG_DEBUG(log, "Executed query: " << query_to_execute); } // TODO we might not need it here at all diff --git a/src/Databases/DatabaseReplicated.h b/src/Databases/DatabaseReplicated.h index 7700d17d9e4..504be5a3ec5 100644 --- a/src/Databases/DatabaseReplicated.h +++ b/src/Databases/DatabaseReplicated.h @@ -34,7 +34,6 @@ private: void executeLog(size_t n); - Context & context; // is it overkiill? std::unique_ptr current_context; // to run executeQuery std::atomic current_log_entry_n = 0; From d7a354b24d20d2b78f91f5f745ded28e873a6b49 Mon Sep 17 00:00:00 2001 From: Val Date: Tue, 12 May 2020 17:25:36 +0300 Subject: [PATCH 015/381] create query fix for replicated dbs --- src/Databases/DatabaseReplicated.cpp | 1 + src/Interpreters/InterpreterCreateQuery.cpp | 5 +++++ 2 files changed, 6 insertions(+) diff --git a/src/Databases/DatabaseReplicated.cpp b/src/Databases/DatabaseReplicated.cpp index 2c7f6facf71..e507894bd3e 100644 --- a/src/Databases/DatabaseReplicated.cpp +++ b/src/Databases/DatabaseReplicated.cpp @@ -152,6 +152,7 @@ void DatabaseReplicated::executeLog(size_t n) { { current_context = std::make_unique(global_context); current_context->from_replicated_log = true; + current_context->setCurrentDatabase(database_name); current_context->setCurrentQueryId(""); // generate random query_id executeQuery(istr, ostr, false, *current_context, {}); } diff --git a/src/Interpreters/InterpreterCreateQuery.cpp b/src/Interpreters/InterpreterCreateQuery.cpp index 5698c370fa1..ed4095d63be 100644 --- a/src/Interpreters/InterpreterCreateQuery.cpp +++ b/src/Interpreters/InterpreterCreateQuery.cpp @@ -601,6 +601,11 @@ BlockIO InterpreterCreateQuery::createTable(ASTCreateQuery & create) /// Set and retrieve list of columns, indices and constraints. Set table engine if needed. Rewrite query in canonical way. 
TableProperties properties = setProperties(create); + // testing + if (context.from_replicated_log) { + create.database = current_database; + } + /// Actually creates table bool created = doCreateTable(create, properties); if (!created) /// Table already exists From c0924b5911ce165166a66c8f0055b34ad7dbd2ed Mon Sep 17 00:00:00 2001 From: Val Date: Tue, 12 May 2020 17:55:24 +0300 Subject: [PATCH 016/381] create and alter test for replicated db --- ...icated_database_engine_zookeeper.reference | 34 ++++++++++++++++ ...9_replicated_database_engine_zookeeper.sql | 39 +++++++++++++++++++ 2 files changed, 73 insertions(+) create mode 100644 tests/queries/0_stateless/01269_replicated_database_engine_zookeeper.reference create mode 100644 tests/queries/0_stateless/01269_replicated_database_engine_zookeeper.sql diff --git a/tests/queries/0_stateless/01269_replicated_database_engine_zookeeper.reference b/tests/queries/0_stateless/01269_replicated_database_engine_zookeeper.reference new file mode 100644 index 00000000000..58f951b1257 --- /dev/null +++ b/tests/queries/0_stateless/01269_replicated_database_engine_zookeeper.reference @@ -0,0 +1,34 @@ +CounterID UInt32 +StartDate Date +UserID UInt32 +VisitID UInt32 +Added0 String +Added1 UInt32 +Added2 UInt32 +AddedNested1.A Array(UInt32) +AddedNested1.C Array(String) +AddedNested2.A Array(UInt32) +AddedNested2.B Array(UInt64) +CounterID UInt32 +StartDate Date +UserID UInt32 +VisitID UInt32 +Added0 String +Added1 UInt32 +Added2 UInt32 +AddedNested1.A Array(UInt32) +AddedNested1.C Array(String) +AddedNested2.A Array(UInt32) +AddedNested2.B Array(UInt64) +CounterID UInt32 +StartDate Date +UserID UInt32 +VisitID UInt32 +Added0 String +Added1 UInt32 +Added2 UInt32 +AddedNested1.A Array(UInt32) +AddedNested1.C Array(String) +AddedNested2.A Array(UInt32) +AddedNested2.B Array(UInt64) + diff --git a/tests/queries/0_stateless/01269_replicated_database_engine_zookeeper.sql b/tests/queries/0_stateless/01269_replicated_database_engine_zookeeper.sql new file mode 100644 index 00000000000..1acc9022014 --- /dev/null +++ b/tests/queries/0_stateless/01269_replicated_database_engine_zookeeper.sql @@ -0,0 +1,39 @@ +DROP DATABASE IF EXISTS rdbtest; +DROP DATABASE IF EXISTS replicatwo; +DROP DATABASE IF EXISTS replicathree; + +CREATE DATABASE rdbtest ENGINE = Replicated('/clickhouse/db/test1/', 'id1'); +CREATE DATABASE replicatwo ENGINE = Replicated('/clickhouse/db/test1/', 'id2'); +CREATE DATABASE replicathree ENGINE = Replicated('/clickhouse/db/test1/', 'id3'); + +USE rdbtest; + +CREATE TABLE alter_test (CounterID UInt32, StartDate Date, UserID UInt32, VisitID UInt32, NestedColumn Nested(A UInt8, S String), ToDrop UInt32) ENGINE = MergeTree(StartDate, intHash32(UserID), (CounterID, StartDate, intHash32(UserID), VisitID), 8192); + +ALTER TABLE alter_test ADD COLUMN Added0 UInt32; +ALTER TABLE alter_test ADD COLUMN Added2 UInt32; +ALTER TABLE alter_test ADD COLUMN Added1 UInt32 AFTER Added0; + +ALTER TABLE alter_test ADD COLUMN AddedNested1 Nested(A UInt32, B UInt64) AFTER Added2; +ALTER TABLE alter_test ADD COLUMN AddedNested1.C Array(String) AFTER AddedNested1.B; +ALTER TABLE alter_test ADD COLUMN AddedNested2 Nested(A UInt32, B UInt64) AFTER AddedNested1; + +ALTER TABLE alter_test DROP COLUMN ToDrop; + +ALTER TABLE alter_test MODIFY COLUMN Added0 String; + +ALTER TABLE alter_test DROP COLUMN NestedColumn.A; +ALTER TABLE alter_test DROP COLUMN NestedColumn.S; + +ALTER TABLE alter_test DROP COLUMN AddedNested1.B; + +ALTER TABLE alter_test ADD COLUMN IF NOT EXISTS Added0 
UInt32; +ALTER TABLE alter_test ADD COLUMN IF NOT EXISTS AddedNested1 Nested(A UInt32, B UInt64); +ALTER TABLE alter_test ADD COLUMN IF NOT EXISTS AddedNested1.C Array(String); +ALTER TABLE alter_test MODIFY COLUMN IF EXISTS ToDrop UInt64; +ALTER TABLE alter_test DROP COLUMN IF EXISTS ToDrop; +ALTER TABLE alter_test COMMENT COLUMN IF EXISTS ToDrop 'new comment'; + +DESC TABLE rdbtest.alter_test; +DESC TABLE replicatwo.alter_test; +DESC TABLE replicathree.alter_test; From f103e24a09f475f4d66038b41667b63be01a94be Mon Sep 17 00:00:00 2001 From: Val Date: Wed, 13 May 2020 17:44:01 +0300 Subject: [PATCH 017/381] make db replicated inherited from atomic --- src/Databases/DatabaseReplicated.cpp | 6 ++---- src/Databases/DatabaseReplicated.h | 4 ++-- src/Databases/DatabasesCommon.cpp | 2 +- src/Interpreters/InterpreterCreateQuery.cpp | 18 ++++++++---------- src/Interpreters/InterpreterDropQuery.cpp | 9 +++++++-- 5 files changed, 20 insertions(+), 19 deletions(-) diff --git a/src/Databases/DatabaseReplicated.cpp b/src/Databases/DatabaseReplicated.cpp index e507894bd3e..2b473c25ce2 100644 --- a/src/Databases/DatabaseReplicated.cpp +++ b/src/Databases/DatabaseReplicated.cpp @@ -81,7 +81,7 @@ DatabaseReplicated::DatabaseReplicated( // : DatabaseOrdinary(name_, metadata_path_, "data/" + escapeForFileName(name_) + "/", "DatabaseReplicated (" + name_ + ")", context_) // TODO add constructor to Atomic and call it here with path and logger name specification // TODO ask why const and & are ommited in Atomic - : DatabaseOrdinary(name_, metadata_path_, context_) + : DatabaseAtomic(name_, metadata_path_, context_) , zookeeper_path(zookeeper_path_) , replica_name(replica_name_) { @@ -122,8 +122,7 @@ void DatabaseReplicated::runMainThread() { while (!stop_flag) { attachToThreadGroup(); - - sleepForSeconds(2); + sleepForSeconds(1);// BURN CPU current_zookeeper = getZooKeeper(); String last_n; if (!current_zookeeper->tryGet(zookeeper_path + "/last_entry", last_n, {}, NULL)) { @@ -136,7 +135,6 @@ void DatabaseReplicated::runMainThread() { current_log_entry_n++; executeLog(current_log_entry_n); } - // break; // debug purpose } } diff --git a/src/Databases/DatabaseReplicated.h b/src/Databases/DatabaseReplicated.h index 504be5a3ec5..0cb0c57c808 100644 --- a/src/Databases/DatabaseReplicated.h +++ b/src/Databases/DatabaseReplicated.h @@ -1,6 +1,6 @@ #pragma once -#include +#include #include #include @@ -14,7 +14,7 @@ namespace DB * that contain declaration of table represented by SQL ATTACH TABLE query * and operation log in zookeeper */ -class DatabaseReplicated : public DatabaseOrdinary +class DatabaseReplicated : public DatabaseAtomic { public: DatabaseReplicated(const String & name_, const String & metadata_path_, const String & zookeeper_path_, const String & replica_name_, Context & context); diff --git a/src/Databases/DatabasesCommon.cpp b/src/Databases/DatabasesCommon.cpp index 47c54fae800..7925d812241 100644 --- a/src/Databases/DatabasesCommon.cpp +++ b/src/Databases/DatabasesCommon.cpp @@ -98,7 +98,7 @@ void DatabaseWithOwnTablesBase::attachTableUnlocked(const String & table_name, c auto table_id = table->getStorageID(); if (table_id.hasUUID()) { - assert(getDatabaseName() == DatabaseCatalog::TEMPORARY_DATABASE || getEngineName() == "Atomic"); + assert(getDatabaseName() == DatabaseCatalog::TEMPORARY_DATABASE || getEngineName() == "Atomic" || getEngineName() == "Replicated"); DatabaseCatalog::instance().addUUIDMapping(table_id.uuid, shared_from_this(), table); } } diff --git 
a/src/Interpreters/InterpreterCreateQuery.cpp b/src/Interpreters/InterpreterCreateQuery.cpp index ed4095d63be..648e41327ba 100644 --- a/src/Interpreters/InterpreterCreateQuery.cpp +++ b/src/Interpreters/InterpreterCreateQuery.cpp @@ -627,7 +627,7 @@ bool InterpreterCreateQuery::doCreateTable(ASTCreateQuery & create, if (need_add_to_database) { database = DatabaseCatalog::instance().getDatabase(create.database); - if (database->getEngineName() == "Atomic") // || database->getEngineName() == "Replicated") + if (database->getEngineName() == "Atomic" || (database->getEngineName() == "Replicated" && !context.from_replicated_log)) { /// TODO implement ATTACH FROM 'path/to/data': generate UUID and move table data to store/ if (create.attach && create.uuid == UUIDHelpers::Nil) @@ -635,6 +635,11 @@ bool InterpreterCreateQuery::doCreateTable(ASTCreateQuery & create, if (!create.attach && create.uuid == UUIDHelpers::Nil) create.uuid = UUIDHelpers::generateV4(); } + else if (database->getEngineName() == "Replicated" && context.from_replicated_log) { + if (create.uuid == UUIDHelpers::Nil) + // change error to incorrect log or something + throw Exception("Table UUID is not specified in the replicated log", ErrorCodes::INCORRECT_QUERY); + } else { if (create.uuid != UUIDHelpers::Nil) @@ -703,16 +708,9 @@ bool InterpreterCreateQuery::doCreateTable(ASTCreateQuery & create, if (database->getEngineName() == "Replicated" && !context.from_replicated_log) { - // propose - // try to database->propose(query_ptr); - database->createTable(context, table_name, res, query_ptr); - // catch - // throw and remove proposal - // otherwise - // proceed (commit to zk) - } else - database->createTable(context, table_name, res, query_ptr); + } + database->createTable(context, table_name, res, query_ptr); /// We must call "startup" and "shutdown" while holding DDLGuard. /// Because otherwise method "shutdown" (from InterpreterDropQuery) can be called before startup diff --git a/src/Interpreters/InterpreterDropQuery.cpp b/src/Interpreters/InterpreterDropQuery.cpp index bae1b796016..e9221fc273c 100644 --- a/src/Interpreters/InterpreterDropQuery.cpp +++ b/src/Interpreters/InterpreterDropQuery.cpp @@ -93,8 +93,8 @@ BlockIO InterpreterDropQuery::executeToTable( { context.checkAccess(table->isView() ? AccessType::DROP_VIEW : AccessType::DROP_TABLE, table_id); table->shutdown(); - TableExclusiveLockHolder table_lock; - if (database->getEngineName() != "Atomic") + TableStructureWriteLockHolder table_lock; + if (database->getEngineName() != "Atomic" && database->getEngineName() != "Replicated") table_lock = table->lockExclusively(context.getCurrentQueryId(), context.getSettingsRef().lock_acquire_timeout); /// Drop table from memory, don't touch data and metadata if (database->getEngineName() == "Replicated" && !context.from_replicated_log) { @@ -119,8 +119,13 @@ BlockIO InterpreterDropQuery::executeToTable( table->shutdown(); +<<<<<<< HEAD TableExclusiveLockHolder table_lock; if (database->getEngineName() != "Atomic") +======= + TableStructureWriteLockHolder table_lock; + if (database->getEngineName() != "Atomic" && database->getEngineName() != "Replicated") +>>>>>>> 921e85e9c9... 
make db replicated inherited from atomic table_lock = table->lockExclusively(context.getCurrentQueryId(), context.getSettingsRef().lock_acquire_timeout); if (database->getEngineName() == "Replicated" && !context.from_replicated_log) { From 5e076b464ea79c4d27e38a55cfc141645ddc9884 Mon Sep 17 00:00:00 2001 From: Val Date: Wed, 13 May 2020 20:00:47 +0300 Subject: [PATCH 018/381] add replicated db snapshot, integration test, repl alter queries, etc add an option to create replicated tables within replicated db without specifying zk path and replica id add replicated sch pool disable replication of alter queries for replicated tables in replicated dbs snapshot prototype. amend of replicated db workflow add prototype of integration tests for replicated db --- src/Common/CurrentMetrics.cpp | 2 + src/Core/Settings.h | 1 + src/Databases/DatabaseLazy.cpp | 2 +- src/Databases/DatabaseLazy.h | 2 +- src/Databases/DatabaseOnDisk.h | 3 +- src/Databases/DatabaseOrdinary.cpp | 2 +- src/Databases/DatabaseOrdinary.h | 4 +- src/Databases/DatabaseReplicated.cpp | 93 ++++++++++++------- src/Databases/DatabaseReplicated.h | 16 ++-- src/Databases/DatabaseWithDictionaries.cpp | 2 +- src/Databases/DatabaseWithDictionaries.h | 2 +- src/Interpreters/Context.cpp | 18 ++++ src/Interpreters/Context.h | 1 + src/Interpreters/InterpreterAlterQuery.cpp | 2 +- .../MergeTree/registerStorageMergeTree.cpp | 35 ++++++- .../test_replicated_database/test.py | 38 ++++++++ 16 files changed, 166 insertions(+), 57 deletions(-) create mode 100644 tests/integration/test_replicated_database/test.py diff --git a/src/Common/CurrentMetrics.cpp b/src/Common/CurrentMetrics.cpp index 4bab9ef2844..36c65953a6f 100644 --- a/src/Common/CurrentMetrics.cpp +++ b/src/Common/CurrentMetrics.cpp @@ -14,6 +14,7 @@ M(BackgroundSchedulePoolTask, "Number of active tasks in BackgroundSchedulePool. This pool is used for periodic ReplicatedMergeTree tasks, like cleaning old data parts, altering data parts, replica re-initialization, etc.") \ M(BackgroundBufferFlushSchedulePoolTask, "Number of active tasks in BackgroundBufferFlushSchedulePool. This pool is used for periodic Buffer flushes") \ M(BackgroundDistributedSchedulePoolTask, "Number of active tasks in BackgroundDistributedSchedulePool. This pool is used for distributed sends that is done in background.") \ + M(BackgroundReplicatedSchedulePoolTask, "Number of active tasks in BackgroundReplicatedSchedulePoolTask. TODO.") \ M(CacheDictionaryUpdateQueueBatches, "Number of 'batches' (a set of keys) in update queue in CacheDictionaries.") \ M(CacheDictionaryUpdateQueueKeys, "Exact number of keys in update queue in CacheDictionaries.") \ M(DiskSpaceReservedForMerge, "Disk space reserved for currently running background merges. 
It is slightly more than the total size of currently merging parts.") \ @@ -38,6 +39,7 @@ M(MemoryTrackingInBackgroundSchedulePool, "Total amount of memory (bytes) allocated in background schedule pool (that is dedicated for bookkeeping tasks of Replicated tables).") \ M(MemoryTrackingInBackgroundBufferFlushSchedulePool, "Total amount of memory (bytes) allocated in background buffer flushes pool (that is dedicated for background buffer flushes).") \ M(MemoryTrackingInBackgroundDistributedSchedulePool, "Total amount of memory (bytes) allocated in background distributed schedule pool (that is dedicated for distributed sends).") \ + M(MemoryTrackingInBackgroundReplicatedSchedulePool, "Total amount of memory (bytes) allocated in replicated schedule pool (TODO).") \ M(MemoryTrackingForMerges, "Total amount of memory (bytes) allocated for background merges. Included in MemoryTrackingInBackgroundProcessingPool. Note that this value may include a drift when the memory was allocated in a context of background processing pool and freed in other context or vice-versa. This happens naturally due to caches for tables indexes and doesn't indicate memory leaks.") \ M(EphemeralNode, "Number of ephemeral nodes hold in ZooKeeper.") \ M(ZooKeeperSession, "Number of sessions (connections) to ZooKeeper. Should be no more than one, because using more than one connection to ZooKeeper may lead to bugs due to lack of linearizability (stale reads) that ZooKeeper consistency model allows.") \ diff --git a/src/Core/Settings.h b/src/Core/Settings.h index f434132eccd..ea950afa70a 100644 --- a/src/Core/Settings.h +++ b/src/Core/Settings.h @@ -87,6 +87,7 @@ struct Settings : public SettingsCollection M(SettingUInt64, background_move_pool_size, 8, "Number of threads performing background moves for tables. Only has meaning at server startup.", 0) \ M(SettingUInt64, background_schedule_pool_size, 16, "Number of threads performing background tasks for replicated tables, kafka streaming, dns cache updates. Only has meaning at server startup.", 0) \ M(SettingUInt64, background_distributed_schedule_pool_size, 16, "Number of threads performing background tasks for distributed sends. Only has meaning at server startup.", 0) \ + M(SettingUInt64, background_replicated_schedule_pool_size, 16, "Number of threads performing background tasks in replicated databases. 
Only has meaning at server startup.", 0) \ \ M(SettingMilliseconds, distributed_directory_monitor_sleep_time_ms, 100, "Sleep time for StorageDistributed DirectoryMonitors, in case of any errors delay grows exponentially.", 0) \ M(SettingMilliseconds, distributed_directory_monitor_max_sleep_time_ms, 30000, "Maximum sleep time for StorageDistributed DirectoryMonitors, it limits exponential growth too.", 0) \ diff --git a/src/Databases/DatabaseLazy.cpp b/src/Databases/DatabaseLazy.cpp index 11e5272110e..d1a6c191bfc 100644 --- a/src/Databases/DatabaseLazy.cpp +++ b/src/Databases/DatabaseLazy.cpp @@ -27,7 +27,7 @@ namespace ErrorCodes } -DatabaseLazy::DatabaseLazy(const String & name_, const String & metadata_path_, time_t expiration_time_, const Context & context_) +DatabaseLazy::DatabaseLazy(const String & name_, const String & metadata_path_, time_t expiration_time_, Context & context_) : DatabaseOnDisk(name_, metadata_path_, "data/" + escapeForFileName(name_) + "/", "DatabaseLazy (" + name_ + ")", context_) , expiration_time(expiration_time_) { diff --git a/src/Databases/DatabaseLazy.h b/src/Databases/DatabaseLazy.h index 2e24b687be5..adda103a21e 100644 --- a/src/Databases/DatabaseLazy.h +++ b/src/Databases/DatabaseLazy.h @@ -18,7 +18,7 @@ class Context; class DatabaseLazy final : public DatabaseOnDisk { public: - DatabaseLazy(const String & name_, const String & metadata_path_, time_t expiration_time_, const Context & context_); + DatabaseLazy(const String & name_, const String & metadata_path_, time_t expiration_time_, Context & context_); String getEngineName() const override { return "Lazy"; } diff --git a/src/Databases/DatabaseOnDisk.h b/src/Databases/DatabaseOnDisk.h index d4fb9b2aa17..dc347c99542 100644 --- a/src/Databases/DatabaseOnDisk.h +++ b/src/Databases/DatabaseOnDisk.h @@ -31,7 +31,7 @@ String getObjectDefinitionFromCreateQuery(const ASTPtr & query); class DatabaseOnDisk : public DatabaseWithOwnTablesBase { public: - DatabaseOnDisk(const String & name, const String & metadata_path_, const String & data_path_, const String & logger, const Context & context); + DatabaseOnDisk(const String & name, const String & metadata_path_, const String & data_path_, const String & logger, Context & context); void createTable( const Context & context, @@ -86,6 +86,7 @@ protected: const String metadata_path; const String data_path; + Context & global_context; }; } diff --git a/src/Databases/DatabaseOrdinary.cpp b/src/Databases/DatabaseOrdinary.cpp index 9194558dffb..2f4f584b091 100644 --- a/src/Databases/DatabaseOrdinary.cpp +++ b/src/Databases/DatabaseOrdinary.cpp @@ -94,7 +94,7 @@ namespace } -DatabaseOrdinary::DatabaseOrdinary(const String & name_, const String & metadata_path_, const Context & context_) +DatabaseOrdinary::DatabaseOrdinary(const String & name_, const String & metadata_path_, Context & context_) : DatabaseOrdinary(name_, metadata_path_, "data/" + escapeForFileName(name_) + "/", "DatabaseOrdinary (" + name_ + ")", context_) { } diff --git a/src/Databases/DatabaseOrdinary.h b/src/Databases/DatabaseOrdinary.h index a9e53edfe28..4767ccdc123 100644 --- a/src/Databases/DatabaseOrdinary.h +++ b/src/Databases/DatabaseOrdinary.h @@ -14,8 +14,8 @@ namespace DB class DatabaseOrdinary : public DatabaseWithDictionaries { public: - DatabaseOrdinary(const String & name_, const String & metadata_path_, const Context & context); - DatabaseOrdinary(const String & name_, const String & metadata_path_, const String & data_path_, const String & logger, const Context & context_); + 
DatabaseOrdinary(const String & name_, const String & metadata_path_, Context & context); + DatabaseOrdinary(const String & name_, const String & metadata_path_, const String & data_path_, const String & logger, Context & context_); String getEngineName() const override { return "Ordinary"; } diff --git a/src/Databases/DatabaseReplicated.cpp b/src/Databases/DatabaseReplicated.cpp index 2b473c25ce2..9dd8530fc46 100644 --- a/src/Databases/DatabaseReplicated.cpp +++ b/src/Databases/DatabaseReplicated.cpp @@ -101,43 +101,58 @@ DatabaseReplicated::DatabaseReplicated( throw Exception("Can't create replicated database without ZooKeeper", ErrorCodes::NO_ZOOKEEPER); } - current_zookeeper->createAncestors(zookeeper_path); - current_zookeeper->createOrUpdate(zookeeper_path, String(), zkutil::CreateMode::Persistent); + if (!current_zookeeper->exists(zookeeper_path, {}, NULL)) { + current_zookeeper->createAncestors(zookeeper_path); + current_zookeeper->createOrUpdate(zookeeper_path, String(), zkutil::CreateMode::Persistent); + current_zookeeper->createOrUpdate(zookeeper_path + "/last_entry", "0", zkutil::CreateMode::Persistent); + current_zookeeper->createAncestors(replica_path); + } else { + } + current_zookeeper->createOrUpdate(replica_path, String(), zkutil::CreateMode::Persistent); - // TODO if no last_entry then make it equal to 0 in zk; - - // TODO launch a worker here - main_thread = ThreadFromGlobalPool(&DatabaseReplicated::runMainThread, this); + backgroundLogExecutor = global_context.getReplicatedSchedulePool().createTask(database_name + "(DatabaseReplicated::the_threeeed)", [this]{ runMainThread();} ); + backgroundLogExecutor->schedule(); } DatabaseReplicated::~DatabaseReplicated() { stop_flag = true; - main_thread.join(); } void DatabaseReplicated::runMainThread() { - setThreadName("ReplctdWorker"); // ok whatever. 15 bytes // + database_name); LOG_DEBUG(log, "Started " << database_name << " database worker thread\n Replica: " << replica_name); - - while (!stop_flag) { - attachToThreadGroup(); - sleepForSeconds(1);// BURN CPU + if (!stop_flag) { // TODO is there a need for the flag? 
current_zookeeper = getZooKeeper(); - String last_n; - if (!current_zookeeper->tryGet(zookeeper_path + "/last_entry", last_n, {}, NULL)) { - continue; - } + String last_n = current_zookeeper->get(zookeeper_path + "/last_entry", {}, NULL); size_t last_n_parsed = parse(last_n); LOG_DEBUG(log, "PARSED " << last_n_parsed); LOG_DEBUG(log, "LOCAL CURRENT " << current_log_entry_n); + + bool newEntries = current_log_entry_n < last_n_parsed; while (current_log_entry_n < last_n_parsed) { current_log_entry_n++; executeLog(current_log_entry_n); } + if (newEntries) { + saveState(); + } + backgroundLogExecutor->scheduleAfter(500); } } +void DatabaseReplicated::saveState() { + current_zookeeper->createOrUpdate(replica_path + "/last_entry", std::to_string(current_log_entry_n), zkutil::CreateMode::Persistent); + // TODO rename vars + String statement = std::to_string(current_log_entry_n); + String metadatafile = getMetadataPath() + ".last_entry"; + WriteBufferFromFile out(metadatafile, statement.size(), O_WRONLY | O_CREAT); + writeString(statement, out); + out.next(); + if (global_context.getSettingsRef().fsync_metadata) + out.sync(); + out.close(); +} + void DatabaseReplicated::executeLog(size_t n) { current_zookeeper = getZooKeeper(); @@ -163,21 +178,7 @@ void DatabaseReplicated::executeLog(size_t n) { LOG_DEBUG(log, "Executed query: " << query_to_execute); } -// TODO we might not need it here at all -void DatabaseReplicated::attachToThreadGroup() { - if (thread_group) - { - /// Put all threads to one thread pool - CurrentThread::attachToIfDetached(thread_group); - } - else - { - CurrentThread::initializeQuery(); - thread_group = CurrentThread::getGroup(); - } -} - -// taken from ddlworker +// TODO Move to ZooKeeper/Lock and remove it from here and ddlworker static std::unique_ptr createSimpleZooKeeperLock( const std::shared_ptr & zookeeper, const String & lock_prefix, const String & lock_name, const String & lock_message) { @@ -188,15 +189,24 @@ static std::unique_ptr createSimpleZooKeeperLock( void DatabaseReplicated::propose(const ASTPtr & query) { - // TODO if source is zk then omit propose. Throw? - // TODO remove that log message i think LOG_DEBUG(log, "PROPOSING\n" << queryToString(query)); current_zookeeper = getZooKeeper(); - auto lock = createSimpleZooKeeperLock(current_zookeeper, zookeeper_path, "lock", replica_name); + auto lock = createSimpleZooKeeperLock(current_zookeeper, zookeeper_path, "propose_lock", replica_name); - // TODO check that last_entry is the same as current_log_entry_n for the replica + + // schedule and deactive combo + // ensures that replica is up to date + // and since propose lock is acquired, + // no other propose can happen from + // different replicas during this call + backgroundLogExecutor->schedule(); + backgroundLogExecutor->deactivate(); + + if (current_log_entry_n > 5) { // make a settings variable + createSnapshot(); + } current_log_entry_n++; // starting from 1 String log_entry = zookeeper_path + "/log." + std::to_string(current_log_entry_n); @@ -205,7 +215,18 @@ void DatabaseReplicated::propose(const ASTPtr & query) { current_zookeeper->createOrUpdate(zookeeper_path + "/last_entry", std::to_string(current_log_entry_n), zkutil::CreateMode::Persistent); lock->unlock(); - // write to metastore the last entry? 
+ saveState(); +} + +void DatabaseReplicated::createSnapshot() { + current_zookeeper->createAncestors(zookeeper_path + "/snapshot"); + current_zookeeper->createOrUpdate(zookeeper_path + "/snapshot", std::to_string(current_log_entry_n), zkutil::CreateMode::Persistent); + for (auto iterator = getTablesIterator({}); iterator->isValid(); iterator->next()) { + String table_name = iterator->name(); + auto query = getCreateQueryFromMetadata(getObjectMetadataPath(table_name), true); + String statement = queryToString(query); + current_zookeeper->createOrUpdate(zookeeper_path + "/snapshot/" + table_name, statement, zkutil::CreateMode::Persistent); + } } } diff --git a/src/Databases/DatabaseReplicated.h b/src/Databases/DatabaseReplicated.h index 0cb0c57c808..0b2d097caac 100644 --- a/src/Databases/DatabaseReplicated.h +++ b/src/Databases/DatabaseReplicated.h @@ -3,6 +3,7 @@ #include #include #include +#include #include #include @@ -25,25 +26,26 @@ public: void propose(const ASTPtr & query) override; + String zookeeper_path; + String replica_name; + private: void runMainThread(); - void runCleanupThread(); - void attachToThreadGroup(); - void executeLog(size_t n); + void saveState(); + + void createSnapshot(); + std::unique_ptr current_context; // to run executeQuery std::atomic current_log_entry_n = 0; std::atomic stop_flag{false}; - ThreadFromGlobalPool main_thread; - ThreadGroupStatusPtr thread_group; + BackgroundSchedulePool::TaskHolder backgroundLogExecutor; - String zookeeper_path; - String replica_name; String replica_path; zkutil::ZooKeeperPtr current_zookeeper; /// Use only the methods below. diff --git a/src/Databases/DatabaseWithDictionaries.cpp b/src/Databases/DatabaseWithDictionaries.cpp index e0f2aa9286b..37f5b51f4ed 100644 --- a/src/Databases/DatabaseWithDictionaries.cpp +++ b/src/Databases/DatabaseWithDictionaries.cpp @@ -317,7 +317,7 @@ void DatabaseWithDictionaries::shutdown() DatabaseWithDictionaries::DatabaseWithDictionaries( - const String & name, const String & metadata_path_, const String & data_path_, const String & logger, const Context & context) + const String & name, const String & metadata_path_, const String & data_path_, const String & logger, Context & context) : DatabaseOnDisk(name, metadata_path_, data_path_, logger, context) , external_loader(context.getExternalDictionariesLoader()) { diff --git a/src/Databases/DatabaseWithDictionaries.h b/src/Databases/DatabaseWithDictionaries.h index eb9e105e31d..0e87ae686cf 100644 --- a/src/Databases/DatabaseWithDictionaries.h +++ b/src/Databases/DatabaseWithDictionaries.h @@ -37,7 +37,7 @@ public: ~DatabaseWithDictionaries() override; protected: - DatabaseWithDictionaries(const String & name, const String & metadata_path_, const String & data_path_, const String & logger, const Context & context); + DatabaseWithDictionaries(const String & name, const String & metadata_path_, const String & data_path_, const String & logger, Context & context); ASTPtr getCreateDictionaryQueryImpl(const String & dictionary_name, bool throw_on_error) const override; diff --git a/src/Interpreters/Context.cpp b/src/Interpreters/Context.cpp index b691e9aaf60..ccd489f6c45 100644 --- a/src/Interpreters/Context.cpp +++ b/src/Interpreters/Context.cpp @@ -82,6 +82,9 @@ namespace CurrentMetrics extern const Metric BackgroundDistributedSchedulePoolTask; extern const Metric MemoryTrackingInBackgroundDistributedSchedulePool; + + extern const Metric BackgroundReplicatedSchedulePoolTask; + extern const Metric MemoryTrackingInBackgroundReplicatedSchedulePool; } @@ 
-338,6 +341,8 @@ struct ContextShared std::optional background_move_pool; /// The thread pool for the background moves performed by the tables. std::optional schedule_pool; /// A thread pool that can run different jobs in background (used in replicated tables) std::optional distributed_schedule_pool; /// A thread pool that can run different jobs in background (used for distributed sends) + // TODO Rename replicated table pool or even both; adjust comments + std::optional replicated_schedule_pool; /// A thread pool that can run different jobs in background (used in replicated database engine) MultiVersion macros; /// Substitutions extracted from config. std::unique_ptr ddl_worker; /// Process ddl commands from zk. /// Rules for selecting the compression settings, depending on the size of the part. @@ -437,6 +442,7 @@ struct ContextShared background_move_pool.reset(); schedule_pool.reset(); distributed_schedule_pool.reset(); + replicated_schedule_pool.reset(); ddl_worker.reset(); /// Stop trace collector if any @@ -1415,6 +1421,18 @@ BackgroundSchedulePool & Context::getDistributedSchedulePool() return *shared->distributed_schedule_pool; } +BackgroundSchedulePool & Context::getReplicatedSchedulePool() +{ + auto lock = getLock(); + if (!shared->replicated_schedule_pool) + shared->replicated_schedule_pool.emplace( + settings.background_replicated_schedule_pool_size, + CurrentMetrics::BackgroundReplicatedSchedulePoolTask, + CurrentMetrics::MemoryTrackingInBackgroundReplicatedSchedulePool, + "BgRplSchPool"); + return *shared->replicated_schedule_pool; +} + void Context::setDDLWorker(std::unique_ptr ddl_worker) { auto lock = getLock(); diff --git a/src/Interpreters/Context.h b/src/Interpreters/Context.h index 66ea6f6914c..e9c78a175d4 100644 --- a/src/Interpreters/Context.h +++ b/src/Interpreters/Context.h @@ -502,6 +502,7 @@ public: BackgroundProcessingPool & getBackgroundMovePool(); BackgroundSchedulePool & getSchedulePool(); BackgroundSchedulePool & getDistributedSchedulePool(); + BackgroundSchedulePool & getReplicatedSchedulePool(); void setDDLWorker(std::unique_ptr ddl_worker); DDLWorker & getDDLWorker() const; diff --git a/src/Interpreters/InterpreterAlterQuery.cpp b/src/Interpreters/InterpreterAlterQuery.cpp index ad79bd68fed..cef1ebd7469 100644 --- a/src/Interpreters/InterpreterAlterQuery.cpp +++ b/src/Interpreters/InterpreterAlterQuery.cpp @@ -51,7 +51,7 @@ BlockIO InterpreterAlterQuery::execute() // TODO it's dirty. 
need to add database to parsing stage DatabasePtr database = DatabaseCatalog::instance().getDatabase(table_id.database_name); - if (database->getEngineName() == "Replicated" && !context.from_replicated_log) { + if (database->getEngineName() == "Replicated" && !context.from_replicated_log && !table->supportsReplication()) { database->propose(query_ptr); } diff --git a/src/Storages/MergeTree/registerStorageMergeTree.cpp b/src/Storages/MergeTree/registerStorageMergeTree.cpp index 1ecac8f413d..eb62c80cc49 100644 --- a/src/Storages/MergeTree/registerStorageMergeTree.cpp +++ b/src/Storages/MergeTree/registerStorageMergeTree.cpp @@ -1,3 +1,6 @@ +#include +#include + #include #include #include @@ -277,10 +280,18 @@ static StoragePtr create(const StorageFactory::Arguments & args) String name_part = args.engine_name.substr(0, args.engine_name.size() - strlen("MergeTree")); - bool replicated = startsWith(name_part, "Replicated"); - if (replicated) + bool replicatedStorage = startsWith(name_part, "Replicated"); + if (replicatedStorage) name_part = name_part.substr(strlen("Replicated")); + String database_name = args.query.database; + auto database = DatabaseCatalog::instance().getDatabase(database_name); + bool replicatedDatabase = false; + + if (database->getEngineName() == "Replicated") { + replicatedDatabase = true; + } + MergeTreeData::MergingParams merging_params; merging_params.mode = MergeTreeData::MergingParams::Ordinary; @@ -322,7 +333,7 @@ static StoragePtr create(const StorageFactory::Arguments & args) needed_params += "]"; }; - if (replicated) + if (replicatedStorage && !replicatedDatabase) { add_mandatory_param("path in ZooKeeper"); add_mandatory_param("replica name"); @@ -392,7 +403,7 @@ static StoragePtr create(const StorageFactory::Arguments & args) String zookeeper_path; String replica_name; - if (replicated) + if (replicatedStorage && !replicatedDatabase) { const auto * ast = engine_args[arg_num]->as(); if (ast && ast->value.getType() == Field::Types::String) @@ -418,6 +429,12 @@ static StoragePtr create(const StorageFactory::Arguments & args) ++arg_num; } + if (replicatedStorage && replicatedDatabase) { + auto * database_replicated = typeid_cast(database.get()); + zookeeper_path = database_replicated->zookeeper_path + "/tables/" + toString(args.query.uuid); + replica_name = database_replicated->replica_name; + } + /// This merging param maybe used as part of sorting key std::optional merging_param_key_arg; @@ -617,7 +634,15 @@ static StoragePtr create(const StorageFactory::Arguments & args) throw Exception("You must set the setting `allow_experimental_data_skipping_indices` to 1 " \ "before using data skipping indices.", ErrorCodes::BAD_ARGUMENTS); - if (replicated) + StorageInMemoryMetadata metadata(args.columns, indices_description, args.constraints); + metadata.partition_by_ast = partition_by_ast; + metadata.order_by_ast = order_by_ast; + metadata.primary_key_ast = primary_key_ast; + metadata.ttl_for_table_ast = ttl_table_ast; + metadata.sample_by_ast = sample_by_ast; + metadata.settings_ast = settings_ast; + + if (replicatedStorage) return StorageReplicatedMergeTree::create( zookeeper_path, replica_name, args.attach, args.table_id, args.relative_data_path, metadata, args.context, date_column_name, merging_params, std::move(storage_settings), diff --git a/tests/integration/test_replicated_database/test.py b/tests/integration/test_replicated_database/test.py new file mode 100644 index 00000000000..23268bcdfd8 --- /dev/null +++ b/tests/integration/test_replicated_database/test.py 
@@ -0,0 +1,38 @@ +import time +import logging + +import pytest + +from helpers.cluster import ClickHouseCluster + +logging.getLogger().setLevel(logging.INFO) +logging.getLogger().addHandler(logging.StreamHandler()) + +cluster = ClickHouseCluster(__file__) + +node1 = cluster.add_instance('node1', macros={'replica': 'test1'}, with_zookeeper=True) +node2 = cluster.add_instance('node2', macros={'replica': 'test2'}, with_zookeeper=True) + +all_nodes = [node1, node2] + +@pytest.fixture(scope="module") +def started_cluster(): + try: + cluster.start() + + for node in all_nodes: + node.query("DROP DATABASE IF EXISTS testdb") + node.query("CREATE DATABASE testdb ENGINE = Replicated('/clickhouse/databases/test1', '{replica}');") + yield cluster + + finally: + cluster.shutdown() + + +def test_db(started_cluster): + DURATION_SECONDS = 5 + node1.query("CREATE TABLE testdb.replicated_table (d Date, k UInt64, i32 Int32) ENGINE=ReplicatedMergeTree(d, k, 8192);") + + time.sleep(DURATION_SECONDS) + logging.info(node2.query("desc table testdb.replicated_table")) + assert node1.query("desc table testdb.replicated_table") == node2.query("desc table testdb.replicated_table") From 34f74ff7851fbb68fb740219f339ced64242636c Mon Sep 17 00:00:00 2001 From: Val Date: Sun, 24 May 2020 20:12:24 +0300 Subject: [PATCH 019/381] add test cases for replicated db --- .../test_replicated_database/test.py | 44 ++++++++++++++++--- 1 file changed, 38 insertions(+), 6 deletions(-) diff --git a/tests/integration/test_replicated_database/test.py b/tests/integration/test_replicated_database/test.py index 23268bcdfd8..38977aa0bdb 100644 --- a/tests/integration/test_replicated_database/test.py +++ b/tests/integration/test_replicated_database/test.py @@ -12,15 +12,14 @@ cluster = ClickHouseCluster(__file__) node1 = cluster.add_instance('node1', macros={'replica': 'test1'}, with_zookeeper=True) node2 = cluster.add_instance('node2', macros={'replica': 'test2'}, with_zookeeper=True) - -all_nodes = [node1, node2] +node3 = cluster.add_instance('node3', macros={'replica': 'test3'}, with_zookeeper=True) @pytest.fixture(scope="module") def started_cluster(): try: cluster.start() - for node in all_nodes: + for node in [node1, node2]: node.query("DROP DATABASE IF EXISTS testdb") node.query("CREATE DATABASE testdb ENGINE = Replicated('/clickhouse/databases/test1', '{replica}');") yield cluster @@ -29,10 +28,43 @@ def started_cluster(): cluster.shutdown() -def test_db(started_cluster): - DURATION_SECONDS = 5 - node1.query("CREATE TABLE testdb.replicated_table (d Date, k UInt64, i32 Int32) ENGINE=ReplicatedMergeTree(d, k, 8192);") +def test_create_replicated_table(started_cluster): + DURATION_SECONDS = 1 + node1.query("CREATE TABLE testdb.replicated_table (d Date, k UInt64, i32 Int32) ENGINE=ReplicatedMergeTree(d, k, 8192);") time.sleep(DURATION_SECONDS) logging.info(node2.query("desc table testdb.replicated_table")) assert node1.query("desc table testdb.replicated_table") == node2.query("desc table testdb.replicated_table") + +def test_alter_table(started_cluster): + DURATION_SECONDS = 1 + node1.query("CREATE TABLE testdb.alter_test (CounterID UInt32, StartDate Date, UserID UInt32, VisitID UInt32, NestedColumn Nested(A UInt8, S String), ToDrop UInt32) ENGINE = MergeTree(StartDate, intHash32(UserID), (CounterID, StartDate, intHash32(UserID), VisitID), 8192);\ + ALTER TABLE testdb.alter_test ADD COLUMN Added0 UInt32;\ + ALTER TABLE testdb.alter_test ADD COLUMN Added2 UInt32;\ + ALTER TABLE testdb.alter_test ADD COLUMN Added1 UInt32 AFTER Added0;\ + 
ALTER TABLE testdb.alter_test ADD COLUMN AddedNested1 Nested(A UInt32, B UInt64) AFTER Added2;\ + ALTER TABLE testdb.alter_test ADD COLUMN AddedNested1.C Array(String) AFTER AddedNested1.B;\ + ALTER TABLE testdb.alter_test ADD COLUMN AddedNested2 Nested(A UInt32, B UInt64) AFTER AddedNested1;") + + time.sleep(DURATION_SECONDS) + assert node1.query("desc table testdb.alter_test") == node2.query("desc table testdb.alter_test") + +def test_create_replica_from_snapshot(started_cluster): + DURATION_SECONDS = 3 + """ + right now snapshot's created every 6 proposes. + later on it must be configurable + for now let's check snapshot + by creating a new node just after 10 log entries + """ + node1.query("ALTER TABLE testdb.alter_test ADD COLUMN Added3 UInt32 ;") #9 + node1.query("ALTER TABLE testdb.alter_test ADD COLUMN Added4 UInt32 ;") #10 + node1.query("ALTER TABLE testdb.alter_test ADD COLUMN Added5 UInt32 ;") #1 + # by this moment snapshot must be created + + node3.query("CREATE DATABASE testdb ENGINE = Replicated('/clickhouse/databases/test1', '{replica}');") + + time.sleep(DURATION_SECONDS) + + assert node3.query("desc table testdb.alter_test") == node1.query("desc table testdb.alter_test") + From 1f03839830c1ec92b912bab6cdcfba6908780ccf Mon Sep 17 00:00:00 2001 From: Val Date: Sun, 24 May 2020 20:12:59 +0300 Subject: [PATCH 020/381] add zookeeper tryRemoveChildren method --- src/Common/ZooKeeper/ZooKeeper.cpp | 17 +++++++++++++++++ src/Common/ZooKeeper/ZooKeeper.h | 5 +++++ 2 files changed, 22 insertions(+) diff --git a/src/Common/ZooKeeper/ZooKeeper.cpp b/src/Common/ZooKeeper/ZooKeeper.cpp index 476e88d7e72..541625149dd 100644 --- a/src/Common/ZooKeeper/ZooKeeper.cpp +++ b/src/Common/ZooKeeper/ZooKeeper.cpp @@ -579,6 +579,23 @@ void ZooKeeper::removeChildren(const std::string & path) } +void ZooKeeper::tryRemoveChildren(const std::string & path) +{ + Strings children; + if (tryGetChildren(path, children) != Coordination::ZOK) + return; + while (!children.empty()) + { + Coordination::Requests ops; + for (size_t i = 0; i < MULTI_BATCH_SIZE && !children.empty(); ++i) + { + ops.emplace_back(makeRemoveRequest(path + "/" + children.back(), -1)); + children.pop_back(); + } + multi(ops); + } +} + void ZooKeeper::removeChildrenRecursive(const std::string & path) { Strings children = getChildren(path); diff --git a/src/Common/ZooKeeper/ZooKeeper.h b/src/Common/ZooKeeper/ZooKeeper.h index 416e40c2da4..cb28f442392 100644 --- a/src/Common/ZooKeeper/ZooKeeper.h +++ b/src/Common/ZooKeeper/ZooKeeper.h @@ -187,7 +187,12 @@ public: /// Remove all children nodes (non recursive). void removeChildren(const std::string & path); + /// Remove all children nodes (non recursive). + /// If there're no children, this method doesn't throw an exception + void tryRemoveChildren(const std::string & path); + using WaitCondition = std::function; + /// Wait for the node to disappear or return immediately if it doesn't exist. /// If condition is speficied, it is used to return early (when condition returns false) /// The function returns true if waited and false if waiting was interrupted by condition. 
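The helper added above exists mainly for the snapshot handling that the next commit introduces: the per-database /snapshot node is cleared and re-populated on every snapshot update, and the caller must not fail when there is nothing to clear yet. A minimal sketch of that call pattern follows, assuming an already initialized zkutil::ZooKeeper session; the wrapper name and the path are illustrative, only tryRemoveChildren() itself comes from this patch.

    // Illustrative sketch, not part of the patch series.
    // Clears <zookeeper_path>/snapshot before it is re-filled with one child per table.
    // Unlike removeChildren(), this does not throw when the node is missing or already empty,
    // because tryRemoveChildren() returns silently if the children cannot be listed.
    void clearSnapshot(const zkutil::ZooKeeperPtr & zookeeper, const std::string & zookeeper_path)
    {
        zookeeper->tryRemoveChildren(zookeeper_path + "/snapshot");
    }
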
From 4921dc6dab978d05bf16a5cf6bfd8572a5c0f12b Mon Sep 17 00:00:00 2001 From: Val Date: Sun, 24 May 2020 20:13:53 +0300 Subject: [PATCH 021/381] db replicated refactoring --- src/Databases/DatabaseReplicated.cpp | 105 ++++++++++++++++----------- src/Databases/DatabaseReplicated.h | 14 ++-- 2 files changed, 69 insertions(+), 50 deletions(-) diff --git a/src/Databases/DatabaseReplicated.cpp b/src/Databases/DatabaseReplicated.cpp index 9dd8530fc46..ae5a8249202 100644 --- a/src/Databases/DatabaseReplicated.cpp +++ b/src/Databases/DatabaseReplicated.cpp @@ -80,7 +80,6 @@ DatabaseReplicated::DatabaseReplicated( Context & context_) // : DatabaseOrdinary(name_, metadata_path_, "data/" + escapeForFileName(name_) + "/", "DatabaseReplicated (" + name_ + ")", context_) // TODO add constructor to Atomic and call it here with path and logger name specification - // TODO ask why const and & are ommited in Atomic : DatabaseAtomic(name_, metadata_path_, context_) , zookeeper_path(zookeeper_path_) , replica_name(replica_name_) @@ -102,42 +101,50 @@ DatabaseReplicated::DatabaseReplicated( } if (!current_zookeeper->exists(zookeeper_path, {}, NULL)) { - current_zookeeper->createAncestors(zookeeper_path); - current_zookeeper->createOrUpdate(zookeeper_path, String(), zkutil::CreateMode::Persistent); - current_zookeeper->createOrUpdate(zookeeper_path + "/last_entry", "0", zkutil::CreateMode::Persistent); + createDatabaseZKNodes(); + } + + // replica + if (!current_zookeeper->exists(replica_path, {}, NULL)) { current_zookeeper->createAncestors(replica_path); - } else { + current_zookeeper->createOrUpdate(replica_path, String(), zkutil::CreateMode::Persistent); } - current_zookeeper->createOrUpdate(replica_path, String(), zkutil::CreateMode::Persistent); - backgroundLogExecutor = global_context.getReplicatedSchedulePool().createTask(database_name + "(DatabaseReplicated::the_threeeed)", [this]{ runMainThread();} ); - backgroundLogExecutor->schedule(); + //loadMetadataFromSnapshot(); + + background_log_executor = global_context.getReplicatedSchedulePool().createTask(database_name + "(DatabaseReplicated::the_threeeed)", [this]{ runBackgroundLogExecutor();} ); + background_log_executor->schedule(); } -DatabaseReplicated::~DatabaseReplicated() -{ - stop_flag = true; +void DatabaseReplicated::createDatabaseZKNodes() { + current_zookeeper = getZooKeeper(); + + if (current_zookeeper->exists(zookeeper_path)) + return; + + current_zookeeper->createAncestors(zookeeper_path); + + current_zookeeper->createIfNotExists(zookeeper_path, String()); + current_zookeeper->createIfNotExists(zookeeper_path + "/last_entry", "0"); + current_zookeeper->createIfNotExists(zookeeper_path + "/log", String()); + current_zookeeper->createIfNotExists(zookeeper_path + "/snapshot", String()); } -void DatabaseReplicated::runMainThread() { - LOG_DEBUG(log, "Started " << database_name << " database worker thread\n Replica: " << replica_name); - if (!stop_flag) { // TODO is there a need for the flag? 
- current_zookeeper = getZooKeeper(); - String last_n = current_zookeeper->get(zookeeper_path + "/last_entry", {}, NULL); - size_t last_n_parsed = parse(last_n); - LOG_DEBUG(log, "PARSED " << last_n_parsed); - LOG_DEBUG(log, "LOCAL CURRENT " << current_log_entry_n); +void DatabaseReplicated::runBackgroundLogExecutor() { + current_zookeeper = getZooKeeper(); + String last_n = current_zookeeper->get(zookeeper_path + "/last_entry", {}, NULL); + size_t last_n_parsed = parse(last_n); - bool newEntries = current_log_entry_n < last_n_parsed; - while (current_log_entry_n < last_n_parsed) { - current_log_entry_n++; - executeLog(current_log_entry_n); - } - if (newEntries) { - saveState(); - } - backgroundLogExecutor->scheduleAfter(500); + bool newEntries = current_log_entry_n < last_n_parsed; + while (current_log_entry_n < last_n_parsed) { + current_log_entry_n++; + String log_path = zookeeper_path + "/log/log." + std::to_string(current_log_entry_n); + executeFromZK(log_path); } + if (newEntries) { + saveState(); + } + background_log_executor->scheduleAfter(500); } void DatabaseReplicated::saveState() { @@ -153,10 +160,9 @@ void DatabaseReplicated::saveState() { out.close(); } -void DatabaseReplicated::executeLog(size_t n) { - +void DatabaseReplicated::executeFromZK(String & path) { current_zookeeper = getZooKeeper(); - String query_to_execute = current_zookeeper->get(zookeeper_path + "/log." + std::to_string(n), {}, NULL); + String query_to_execute = current_zookeeper->get(path, {}, NULL); ReadBufferFromString istr(query_to_execute); String dummy_string; WriteBufferFromString ostr(dummy_string); @@ -171,7 +177,7 @@ void DatabaseReplicated::executeLog(size_t n) { } catch (...) { - tryLogCurrentException(log, "Query " + query_to_execute + " wasn't finished successfully"); + tryLogCurrentException(log, "Query from zookeeper " + query_to_execute + " wasn't finished successfully"); } @@ -195,21 +201,23 @@ void DatabaseReplicated::propose(const ASTPtr & query) { current_zookeeper = getZooKeeper(); auto lock = createSimpleZooKeeperLock(current_zookeeper, zookeeper_path, "propose_lock", replica_name); - // schedule and deactive combo // ensures that replica is up to date // and since propose lock is acquired, // no other propose can happen from // different replicas during this call - backgroundLogExecutor->schedule(); - backgroundLogExecutor->deactivate(); + background_log_executor->schedule(); + background_log_executor->deactivate(); - if (current_log_entry_n > 5) { // make a settings variable - createSnapshot(); - } +// if (current_log_entry_n > 5) { // make a settings variable +// // TODO check that all the replicas are up to date! +// updateSnapshot(); +// current_log_entry_n = 0; +// current_zookeeper->removeChildren(zookeeper_path + "/log"); +// } current_log_entry_n++; // starting from 1 - String log_entry = zookeeper_path + "/log." + std::to_string(current_log_entry_n); + String log_entry = zookeeper_path + "/log/log." 
+ std::to_string(current_log_entry_n); current_zookeeper->createOrUpdate(log_entry, queryToString(query), zkutil::CreateMode::Persistent); current_zookeeper->createOrUpdate(zookeeper_path + "/last_entry", std::to_string(current_log_entry_n), zkutil::CreateMode::Persistent); @@ -218,9 +226,9 @@ void DatabaseReplicated::propose(const ASTPtr & query) { saveState(); } -void DatabaseReplicated::createSnapshot() { - current_zookeeper->createAncestors(zookeeper_path + "/snapshot"); - current_zookeeper->createOrUpdate(zookeeper_path + "/snapshot", std::to_string(current_log_entry_n), zkutil::CreateMode::Persistent); +void DatabaseReplicated::updateSnapshot() { + current_zookeeper = getZooKeeper(); + current_zookeeper->tryRemoveChildren(zookeeper_path + "/snapshot"); for (auto iterator = getTablesIterator({}); iterator->isValid(); iterator->next()) { String table_name = iterator->name(); auto query = getCreateQueryFromMetadata(getObjectMetadataPath(table_name), true); @@ -229,4 +237,17 @@ void DatabaseReplicated::createSnapshot() { } } +void DatabaseReplicated::loadMetadataFromSnapshot() { + current_zookeeper = getZooKeeper(); + + Strings metadatas; + if (current_zookeeper->tryGetChildren(zookeeper_path + "/snapshot", metadatas) != Coordination::ZOK) + return; + + for (auto t = metadatas.begin(); t != metadatas.end(); ++t) { + String path = zookeeper_path + "/snapshot/" + *t; + executeFromZK(path); + } +} + } diff --git a/src/Databases/DatabaseReplicated.h b/src/Databases/DatabaseReplicated.h index 0b2d097caac..bd2f11390d2 100644 --- a/src/Databases/DatabaseReplicated.h +++ b/src/Databases/DatabaseReplicated.h @@ -20,8 +20,6 @@ class DatabaseReplicated : public DatabaseAtomic public: DatabaseReplicated(const String & name_, const String & metadata_path_, const String & zookeeper_path_, const String & replica_name_, Context & context); - ~DatabaseReplicated(); - String getEngineName() const override { return "Replicated"; } void propose(const ASTPtr & query) override; @@ -30,21 +28,21 @@ public: String replica_name; private: + void createDatabaseZKNodes(); - void runMainThread(); + void runBackgroundLogExecutor(); - void executeLog(size_t n); + void executeFromZK(String & path); void saveState(); - - void createSnapshot(); + void updateSnapshot(); + void loadMetadataFromSnapshot(); std::unique_ptr current_context; // to run executeQuery std::atomic current_log_entry_n = 0; - std::atomic stop_flag{false}; - BackgroundSchedulePool::TaskHolder backgroundLogExecutor; + BackgroundSchedulePool::TaskHolder background_log_executor; String replica_path; From cbcd1bea0eef7ee647f1cdcca51612cecc4697d1 Mon Sep 17 00:00:00 2001 From: Val Date: Tue, 26 May 2020 16:35:05 +0300 Subject: [PATCH 022/381] provide better comments and information --- src/Common/CurrentMetrics.cpp | 4 ++-- src/Common/ZooKeeper/ZooKeeper.h | 3 ++- src/Core/Settings.h | 2 +- src/Databases/IDatabase.h | 22 ++++++++++----------- src/Interpreters/Context.cpp | 1 - src/Interpreters/InterpreterCreateQuery.cpp | 8 +++++--- src/Interpreters/InterpreterDropQuery.cpp | 8 +++----- 7 files changed, 24 insertions(+), 24 deletions(-) diff --git a/src/Common/CurrentMetrics.cpp b/src/Common/CurrentMetrics.cpp index 36c65953a6f..a6a08897505 100644 --- a/src/Common/CurrentMetrics.cpp +++ b/src/Common/CurrentMetrics.cpp @@ -14,7 +14,7 @@ M(BackgroundSchedulePoolTask, "Number of active tasks in BackgroundSchedulePool. 
This pool is used for periodic ReplicatedMergeTree tasks, like cleaning old data parts, altering data parts, replica re-initialization, etc.") \ M(BackgroundBufferFlushSchedulePoolTask, "Number of active tasks in BackgroundBufferFlushSchedulePool. This pool is used for periodic Buffer flushes") \ M(BackgroundDistributedSchedulePoolTask, "Number of active tasks in BackgroundDistributedSchedulePool. This pool is used for distributed sends that is done in background.") \ - M(BackgroundReplicatedSchedulePoolTask, "Number of active tasks in BackgroundReplicatedSchedulePoolTask. TODO.") \ + M(BackgroundReplicatedSchedulePoolTask, "Number of active tasks in BackgroundReplicatedSchedulePoolTask. The pool is used by replicated database for executing DDL log coming from other replicas. One task corresponds to one replicated database") \ M(CacheDictionaryUpdateQueueBatches, "Number of 'batches' (a set of keys) in update queue in CacheDictionaries.") \ M(CacheDictionaryUpdateQueueKeys, "Exact number of keys in update queue in CacheDictionaries.") \ M(DiskSpaceReservedForMerge, "Disk space reserved for currently running background merges. It is slightly more than the total size of currently merging parts.") \ @@ -39,7 +39,7 @@ M(MemoryTrackingInBackgroundSchedulePool, "Total amount of memory (bytes) allocated in background schedule pool (that is dedicated for bookkeeping tasks of Replicated tables).") \ M(MemoryTrackingInBackgroundBufferFlushSchedulePool, "Total amount of memory (bytes) allocated in background buffer flushes pool (that is dedicated for background buffer flushes).") \ M(MemoryTrackingInBackgroundDistributedSchedulePool, "Total amount of memory (bytes) allocated in background distributed schedule pool (that is dedicated for distributed sends).") \ - M(MemoryTrackingInBackgroundReplicatedSchedulePool, "Total amount of memory (bytes) allocated in replicated schedule pool (TODO).") \ + M(MemoryTrackingInBackgroundReplicatedSchedulePool, "Total amount of memory (bytes) allocated in background replicated schedule pool (that is dedicated for ddl log execution by replicated database replicas).") \ M(MemoryTrackingForMerges, "Total amount of memory (bytes) allocated for background merges. Included in MemoryTrackingInBackgroundProcessingPool. Note that this value may include a drift when the memory was allocated in a context of background processing pool and freed in other context or vice-versa. This happens naturally due to caches for tables indexes and doesn't indicate memory leaks.") \ M(EphemeralNode, "Number of ephemeral nodes hold in ZooKeeper.") \ M(ZooKeeperSession, "Number of sessions (connections) to ZooKeeper. Should be no more than one, because using more than one connection to ZooKeeper may lead to bugs due to lack of linearizability (stale reads) that ZooKeeper consistency model allows.") \ diff --git a/src/Common/ZooKeeper/ZooKeeper.h b/src/Common/ZooKeeper/ZooKeeper.h index cb28f442392..47eaefa51fc 100644 --- a/src/Common/ZooKeeper/ZooKeeper.h +++ b/src/Common/ZooKeeper/ZooKeeper.h @@ -188,7 +188,8 @@ public: void removeChildren(const std::string & path); /// Remove all children nodes (non recursive). - /// If there're no children, this method doesn't throw an exception + /// If there're no children for the given path, + /// this method does not throw an exception. 
void tryRemoveChildren(const std::string & path); using WaitCondition = std::function; diff --git a/src/Core/Settings.h b/src/Core/Settings.h index ea950afa70a..1351b752136 100644 --- a/src/Core/Settings.h +++ b/src/Core/Settings.h @@ -87,7 +87,7 @@ struct Settings : public SettingsCollection M(SettingUInt64, background_move_pool_size, 8, "Number of threads performing background moves for tables. Only has meaning at server startup.", 0) \ M(SettingUInt64, background_schedule_pool_size, 16, "Number of threads performing background tasks for replicated tables, kafka streaming, dns cache updates. Only has meaning at server startup.", 0) \ M(SettingUInt64, background_distributed_schedule_pool_size, 16, "Number of threads performing background tasks for distributed sends. Only has meaning at server startup.", 0) \ - M(SettingUInt64, background_replicated_schedule_pool_size, 16, "Number of threads performing background tasks in replicated databases. Only has meaning at server startup.", 0) \ + M(SettingUInt64, background_replicated_schedule_pool_size, 4, "Number of threads performing background tasks in replicated databases. One task corresponds to one replicated database replica. Only has meaning at server startup.", 0) \ \ M(SettingMilliseconds, distributed_directory_monitor_sleep_time_ms, 100, "Sleep time for StorageDistributed DirectoryMonitors, in case of any errors delay grows exponentially.", 0) \ M(SettingMilliseconds, distributed_directory_monitor_max_sleep_time_ms, 30000, "Maximum sleep time for StorageDistributed DirectoryMonitors, it limits exponential growth too.", 0) \ diff --git a/src/Databases/IDatabase.h b/src/Databases/IDatabase.h index 18265b153cf..5b3003f36b4 100644 --- a/src/Databases/IDatabase.h +++ b/src/Databases/IDatabase.h @@ -162,7 +162,7 @@ public: virtual bool empty() const = 0; virtual void propose(const ASTPtr & /*query*/) { - throw Exception("There is no propose query method for Database" + getEngineName(), ErrorCodes::NOT_IMPLEMENTED); + throw Exception(getEngineName() + ": propose() is not supported", ErrorCodes::NOT_IMPLEMENTED); } /// Add the table to the database. Record its presence in the metadata. @@ -172,7 +172,7 @@ public: const StoragePtr & /*table*/, const ASTPtr & /*query*/) { - throw Exception("There is no CREATE TABLE query for Database" + getEngineName(), ErrorCodes::NOT_IMPLEMENTED); + throw Exception("There is no CREATE TABLE query for Database " + getEngineName(), ErrorCodes::NOT_IMPLEMENTED); } /// Add the dictionary to the database. Record its presence in the metadata. @@ -181,7 +181,7 @@ public: const String & /*dictionary_name*/, const ASTPtr & /*query*/) { - throw Exception("There is no CREATE DICTIONARY query for Database" + getEngineName(), ErrorCodes::NOT_IMPLEMENTED); + throw Exception("There is no CREATE DICTIONARY query for Database " + getEngineName(), ErrorCodes::NOT_IMPLEMENTED); } /// Delete the table from the database, drop table and delete the metadata. @@ -190,7 +190,7 @@ public: const String & /*name*/, [[maybe_unused]] bool no_delay = false) { - throw Exception("There is no DROP TABLE query for Database" + getEngineName(), ErrorCodes::NOT_IMPLEMENTED); + throw Exception("There is no DROP TABLE query for Database " + getEngineName(), ErrorCodes::NOT_IMPLEMENTED); } /// Delete the dictionary from the database. Delete the metadata. 
@@ -198,32 +198,32 @@ public: const Context & /*context*/, const String & /*dictionary_name*/) { - throw Exception("There is no DROP DICTIONARY query for Database" + getEngineName(), ErrorCodes::NOT_IMPLEMENTED); + throw Exception("There is no DROP DICTIONARY query for Database " + getEngineName(), ErrorCodes::NOT_IMPLEMENTED); } /// Add a table to the database, but do not add it to the metadata. The database may not support this method. virtual void attachTable(const String & /*name*/, const StoragePtr & /*table*/, [[maybe_unused]] const String & relative_table_path = {}) { - throw Exception("There is no ATTACH TABLE query for Database" + getEngineName(), ErrorCodes::NOT_IMPLEMENTED); + throw Exception("There is no ATTACH TABLE query for Database " + getEngineName(), ErrorCodes::NOT_IMPLEMENTED); } /// Add dictionary to the database, but do not add it to the metadata. The database may not support this method. /// If dictionaries_lazy_load is false it also starts loading the dictionary asynchronously. virtual void attachDictionary(const String & /* dictionary_name */, const DictionaryAttachInfo & /* attach_info */) { - throw Exception("There is no ATTACH DICTIONARY query for Database" + getEngineName(), ErrorCodes::NOT_IMPLEMENTED); + throw Exception("There is no ATTACH DICTIONARY query for Database " + getEngineName(), ErrorCodes::NOT_IMPLEMENTED); } /// Forget about the table without deleting it, and return it. The database may not support this method. virtual StoragePtr detachTable(const String & /*name*/) { - throw Exception("There is no DETACH TABLE query for Database" + getEngineName(), ErrorCodes::NOT_IMPLEMENTED); + throw Exception("There is no DETACH TABLE query for Database " + getEngineName(), ErrorCodes::NOT_IMPLEMENTED); } /// Forget about the dictionary without deleting it. The database may not support this method. virtual void detachDictionary(const String & /*name*/) { - throw Exception("There is no DETACH DICTIONARY query for Database" + getEngineName(), ErrorCodes::NOT_IMPLEMENTED); + throw Exception("There is no DETACH DICTIONARY query for Database " + getEngineName(), ErrorCodes::NOT_IMPLEMENTED); } /// Rename the table and possibly move the table to another database. @@ -314,14 +314,14 @@ protected: virtual ASTPtr getCreateTableQueryImpl(const String & /*name*/, const Context & /*context*/, bool throw_on_error) const { if (throw_on_error) - throw Exception("There is no SHOW CREATE TABLE query for Database" + getEngineName(), ErrorCodes::CANNOT_GET_CREATE_TABLE_QUERY); + throw Exception("There is no SHOW CREATE TABLE query for Database " + getEngineName(), ErrorCodes::CANNOT_GET_CREATE_TABLE_QUERY); return nullptr; } virtual ASTPtr getCreateDictionaryQueryImpl(const String & /*name*/, bool throw_on_error) const { if (throw_on_error) - throw Exception("There is no SHOW CREATE DICTIONARY query for Database" + getEngineName(), ErrorCodes::CANNOT_GET_CREATE_DICTIONARY_QUERY); + throw Exception("There is no SHOW CREATE DICTIONARY query for Database " + getEngineName(), ErrorCodes::CANNOT_GET_CREATE_DICTIONARY_QUERY); return nullptr; } diff --git a/src/Interpreters/Context.cpp b/src/Interpreters/Context.cpp index ccd489f6c45..14ee5284bab 100644 --- a/src/Interpreters/Context.cpp +++ b/src/Interpreters/Context.cpp @@ -341,7 +341,6 @@ struct ContextShared std::optional background_move_pool; /// The thread pool for the background moves performed by the tables. 
std::optional schedule_pool; /// A thread pool that can run different jobs in background (used in replicated tables) std::optional distributed_schedule_pool; /// A thread pool that can run different jobs in background (used for distributed sends) - // TODO Rename replicated table pool or even both; adjust comments std::optional replicated_schedule_pool; /// A thread pool that can run different jobs in background (used in replicated database engine) MultiVersion macros; /// Substitutions extracted from config. std::unique_ptr ddl_worker; /// Process ddl commands from zk. diff --git a/src/Interpreters/InterpreterCreateQuery.cpp b/src/Interpreters/InterpreterCreateQuery.cpp index 648e41327ba..6ff474e096f 100644 --- a/src/Interpreters/InterpreterCreateQuery.cpp +++ b/src/Interpreters/InterpreterCreateQuery.cpp @@ -601,7 +601,10 @@ BlockIO InterpreterCreateQuery::createTable(ASTCreateQuery & create) /// Set and retrieve list of columns, indices and constraints. Set table engine if needed. Rewrite query in canonical way. TableProperties properties = setProperties(create); - // testing + /// DDL log for replicated databases can not + /// contain the right database name for every replica + /// therefore for such queries the AST database + /// field is modified right before an actual execution if (context.from_replicated_log) { create.database = current_database; } @@ -637,8 +640,7 @@ bool InterpreterCreateQuery::doCreateTable(ASTCreateQuery & create, } else if (database->getEngineName() == "Replicated" && context.from_replicated_log) { if (create.uuid == UUIDHelpers::Nil) - // change error to incorrect log or something - throw Exception("Table UUID is not specified in the replicated log", ErrorCodes::INCORRECT_QUERY); + throw Exception("Table UUID is not specified in DDL log", ErrorCodes::INCORRECT_QUERY); } else { diff --git a/src/Interpreters/InterpreterDropQuery.cpp b/src/Interpreters/InterpreterDropQuery.cpp index e9221fc273c..fe94a394ba2 100644 --- a/src/Interpreters/InterpreterDropQuery.cpp +++ b/src/Interpreters/InterpreterDropQuery.cpp @@ -110,6 +110,9 @@ BlockIO InterpreterDropQuery::executeToTable( auto table_lock = table->lockExclusively(context.getCurrentQueryId(), context.getSettingsRef().lock_acquire_timeout); auto metadata_snapshot = table->getInMemoryMetadataPtr(); /// Drop table data, don't touch metadata + if (database->getEngineName() == "Replicated" && !context.from_replicated_log) { + database->propose(query_ptr); + } table->truncate(query_ptr, metadata_snapshot, context, table_lock); } else if (query.kind == ASTDropQuery::Kind::Drop) @@ -119,13 +122,8 @@ BlockIO InterpreterDropQuery::executeToTable( table->shutdown(); -<<<<<<< HEAD TableExclusiveLockHolder table_lock; - if (database->getEngineName() != "Atomic") -======= - TableStructureWriteLockHolder table_lock; if (database->getEngineName() != "Atomic" && database->getEngineName() != "Replicated") ->>>>>>> 921e85e9c9... 
make db replicated inherited from atomic table_lock = table->lockExclusively(context.getCurrentQueryId(), context.getSettingsRef().lock_acquire_timeout); if (database->getEngineName() == "Replicated" && !context.from_replicated_log) { From 31910e9bf1a526a2bf3e8fdf167ff3447e37747f Mon Sep 17 00:00:00 2001 From: Val Date: Tue, 26 May 2020 18:08:09 +0300 Subject: [PATCH 023/381] Use ClientInf::QueryKind to distinguish replicated db log queries --- src/Databases/DatabaseReplicated.cpp | 2 +- src/Interpreters/ClientInfo.h | 2 +- src/Interpreters/Context.h | 3 --- src/Interpreters/InterpreterAlterQuery.cpp | 3 +-- src/Interpreters/InterpreterCreateQuery.cpp | 8 ++++---- src/Interpreters/InterpreterDropQuery.cpp | 7 ++++--- src/Interpreters/InterpreterRenameQuery.cpp | 2 +- 7 files changed, 12 insertions(+), 15 deletions(-) diff --git a/src/Databases/DatabaseReplicated.cpp b/src/Databases/DatabaseReplicated.cpp index ae5a8249202..c6840ac0d81 100644 --- a/src/Databases/DatabaseReplicated.cpp +++ b/src/Databases/DatabaseReplicated.cpp @@ -170,7 +170,7 @@ void DatabaseReplicated::executeFromZK(String & path) { try { current_context = std::make_unique(global_context); - current_context->from_replicated_log = true; + current_context->getClientInfo().query_kind = ClientInfo::QueryKind::REPLICATED_LOG_QUERY; current_context->setCurrentDatabase(database_name); current_context->setCurrentQueryId(""); // generate random query_id executeQuery(istr, ostr, false, *current_context, {}); diff --git a/src/Interpreters/ClientInfo.h b/src/Interpreters/ClientInfo.h index 2dff30e40a2..42b3ab42bc1 100644 --- a/src/Interpreters/ClientInfo.h +++ b/src/Interpreters/ClientInfo.h @@ -38,7 +38,7 @@ public: NO_QUERY = 0, /// Uninitialized object. INITIAL_QUERY = 1, SECONDARY_QUERY = 2, /// Query that was initiated by another query for distributed or ON CLUSTER query execution. - REPLICATED_LOG_QUERY = 3, /// TODO add comment + REPLICATED_LOG_QUERY = 3, /// Query from replicated DDL log. }; diff --git a/src/Interpreters/Context.h b/src/Interpreters/Context.h index e9c78a175d4..5d1fda03221 100644 --- a/src/Interpreters/Context.h +++ b/src/Interpreters/Context.h @@ -214,9 +214,6 @@ private: Context(); public: - ///testing - bool from_replicated_log = false; - /// Create initial Context with ContextShared and etc. static Context createGlobal(ContextShared * shared); static SharedContextHolder createShared(); diff --git a/src/Interpreters/InterpreterAlterQuery.cpp b/src/Interpreters/InterpreterAlterQuery.cpp index cef1ebd7469..134531d0cf0 100644 --- a/src/Interpreters/InterpreterAlterQuery.cpp +++ b/src/Interpreters/InterpreterAlterQuery.cpp @@ -49,9 +49,8 @@ BlockIO InterpreterAlterQuery::execute() auto alter_lock = table->lockForAlter(context.getCurrentQueryId(), context.getSettingsRef().lock_acquire_timeout); auto metadata_snapshot = table->getInMemoryMetadataPtr(); - // TODO it's dirty. 
need to add database to parsing stage DatabasePtr database = DatabaseCatalog::instance().getDatabase(table_id.database_name); - if (database->getEngineName() == "Replicated" && !context.from_replicated_log && !table->supportsReplication()) { + if (database->getEngineName() == "Replicated" && context.getClientInfo().query_kind != ClientInfo::QueryKind::REPLICATED_LOG_QUERY && !table->supportsReplication()) { database->propose(query_ptr); } diff --git a/src/Interpreters/InterpreterCreateQuery.cpp b/src/Interpreters/InterpreterCreateQuery.cpp index 6ff474e096f..0b06fbfd874 100644 --- a/src/Interpreters/InterpreterCreateQuery.cpp +++ b/src/Interpreters/InterpreterCreateQuery.cpp @@ -605,7 +605,7 @@ BlockIO InterpreterCreateQuery::createTable(ASTCreateQuery & create) /// contain the right database name for every replica /// therefore for such queries the AST database /// field is modified right before an actual execution - if (context.from_replicated_log) { + if (context.getClientInfo().query_kind == ClientInfo::QueryKind::REPLICATED_LOG_QUERY) { create.database = current_database; } @@ -630,7 +630,7 @@ bool InterpreterCreateQuery::doCreateTable(ASTCreateQuery & create, if (need_add_to_database) { database = DatabaseCatalog::instance().getDatabase(create.database); - if (database->getEngineName() == "Atomic" || (database->getEngineName() == "Replicated" && !context.from_replicated_log)) + if (database->getEngineName() == "Atomic" || (database->getEngineName() == "Replicated" && context.getClientInfo().query_kind != ClientInfo::QueryKind::REPLICATED_LOG_QUERY)) { /// TODO implement ATTACH FROM 'path/to/data': generate UUID and move table data to store/ if (create.attach && create.uuid == UUIDHelpers::Nil) @@ -638,7 +638,7 @@ bool InterpreterCreateQuery::doCreateTable(ASTCreateQuery & create, if (!create.attach && create.uuid == UUIDHelpers::Nil) create.uuid = UUIDHelpers::generateV4(); } - else if (database->getEngineName() == "Replicated" && context.from_replicated_log) { + else if (database->getEngineName() == "Replicated" && context.getClientInfo().query_kind == ClientInfo::QueryKind::REPLICATED_LOG_QUERY) { if (create.uuid == UUIDHelpers::Nil) throw Exception("Table UUID is not specified in DDL log", ErrorCodes::INCORRECT_QUERY); } @@ -709,7 +709,7 @@ bool InterpreterCreateQuery::doCreateTable(ASTCreateQuery & create, } - if (database->getEngineName() == "Replicated" && !context.from_replicated_log) { + if (database->getEngineName() == "Replicated" && context.getClientInfo().query_kind != ClientInfo::QueryKind::REPLICATED_LOG_QUERY) { database->propose(query_ptr); } database->createTable(context, table_name, res, query_ptr); diff --git a/src/Interpreters/InterpreterDropQuery.cpp b/src/Interpreters/InterpreterDropQuery.cpp index fe94a394ba2..afbf5d31fbf 100644 --- a/src/Interpreters/InterpreterDropQuery.cpp +++ b/src/Interpreters/InterpreterDropQuery.cpp @@ -97,7 +97,7 @@ BlockIO InterpreterDropQuery::executeToTable( if (database->getEngineName() != "Atomic" && database->getEngineName() != "Replicated") table_lock = table->lockExclusively(context.getCurrentQueryId(), context.getSettingsRef().lock_acquire_timeout); /// Drop table from memory, don't touch data and metadata - if (database->getEngineName() == "Replicated" && !context.from_replicated_log) { + if (database->getEngineName() == "Replicated" && context.getClientInfo().query_kind != ClientInfo::QueryKind::REPLICATED_LOG_QUERY) { database->propose(query_ptr); } database->detachTable(table_id.table_name); @@ -110,7 +110,8 @@ BlockIO 
InterpreterDropQuery::executeToTable( auto table_lock = table->lockExclusively(context.getCurrentQueryId(), context.getSettingsRef().lock_acquire_timeout); auto metadata_snapshot = table->getInMemoryMetadataPtr(); /// Drop table data, don't touch metadata - if (database->getEngineName() == "Replicated" && !context.from_replicated_log) { + auto table_lock = table->lockExclusively(context.getCurrentQueryId(), context.getSettingsRef().lock_acquire_timeout); + if (database->getEngineName() == "Replicated" && context.getClientInfo().query_kind != ClientInfo::QueryKind::REPLICATED_LOG_QUERY) { database->propose(query_ptr); } table->truncate(query_ptr, metadata_snapshot, context, table_lock); @@ -126,7 +127,7 @@ BlockIO InterpreterDropQuery::executeToTable( if (database->getEngineName() != "Atomic" && database->getEngineName() != "Replicated") table_lock = table->lockExclusively(context.getCurrentQueryId(), context.getSettingsRef().lock_acquire_timeout); - if (database->getEngineName() == "Replicated" && !context.from_replicated_log) { + if (database->getEngineName() == "Replicated" && context.getClientInfo().query_kind != ClientInfo::QueryKind::REPLICATED_LOG_QUERY) { database->propose(query_ptr); } database->dropTable(context, table_id.table_name, query.no_delay); diff --git a/src/Interpreters/InterpreterRenameQuery.cpp b/src/Interpreters/InterpreterRenameQuery.cpp index d93b14a6bc2..45003ab0d14 100644 --- a/src/Interpreters/InterpreterRenameQuery.cpp +++ b/src/Interpreters/InterpreterRenameQuery.cpp @@ -81,7 +81,7 @@ BlockIO InterpreterRenameQuery::execute() database_catalog.assertTableDoesntExist(StorageID(elem.to_database_name, elem.to_table_name), context); DatabasePtr database = database_catalog.getDatabase(elem.from_database_name); - if (database->getEngineName() == "Replicated" && !context.from_replicated_log) { + if (database->getEngineName() == "Replicated" && context.getClientInfo().query_kind != ClientInfo::QueryKind::REPLICATED_LOG_QUERY) { database->propose(query_ptr); } database->renameTable( From fbbccaf98ae02b5ed463b3c05fc79595743e817a Mon Sep 17 00:00:00 2001 From: Val Date: Tue, 26 May 2020 18:10:15 +0300 Subject: [PATCH 024/381] remove stateless tests for replicated db --- ...7_replicated_database_engine_zookeeper.sql | 10 ----- ...icated_database_engine_zookeeper.reference | 34 ---------------- ...9_replicated_database_engine_zookeeper.sql | 39 ------------------- 3 files changed, 83 deletions(-) delete mode 100644 tests/queries/0_stateless/01267_replicated_database_engine_zookeeper.sql delete mode 100644 tests/queries/0_stateless/01269_replicated_database_engine_zookeeper.reference delete mode 100644 tests/queries/0_stateless/01269_replicated_database_engine_zookeeper.sql diff --git a/tests/queries/0_stateless/01267_replicated_database_engine_zookeeper.sql b/tests/queries/0_stateless/01267_replicated_database_engine_zookeeper.sql deleted file mode 100644 index c70de9a50d2..00000000000 --- a/tests/queries/0_stateless/01267_replicated_database_engine_zookeeper.sql +++ /dev/null @@ -1,10 +0,0 @@ -DROP DATABASE IF EXISTS test_db1; -DROP DATABASE IF EXISTS test_db2; - -CREATE DATABASE test_db1 ENGINE = Replicated('/clickhouse/databases/test1', 'id1'); -CREATE TABLE test_db1.replicated_table (d Date, k UInt64, i32 Int32) ENGINE=ReplicatedMergeTree('/clickhouse/tables/test1', 'id1', d, k, 8192); -CREATE TABLE test_db1.basic_table (EventDate Date, CounterID Int) engine=MergeTree(EventDate, (CounterID, EventDate), 8192); - -CREATE DATABASE test_db2 ENGINE = 
Replicated('/clickhouse/databases/test1', 'id2'); -CREATE TABLE test_db2.replicated_table (d Date, k UInt64, i32 Int32) ENGINE=ReplicatedMergeTree('/clickhouse/tables/test1', 'id2', d, k, 8192); -CREATE TABLE test_db2.basic_table (EventDate Date, CounterID Int) engine=MergeTree(EventDate, (CounterID, EventDate), 8192); diff --git a/tests/queries/0_stateless/01269_replicated_database_engine_zookeeper.reference b/tests/queries/0_stateless/01269_replicated_database_engine_zookeeper.reference deleted file mode 100644 index 58f951b1257..00000000000 --- a/tests/queries/0_stateless/01269_replicated_database_engine_zookeeper.reference +++ /dev/null @@ -1,34 +0,0 @@ -CounterID UInt32 -StartDate Date -UserID UInt32 -VisitID UInt32 -Added0 String -Added1 UInt32 -Added2 UInt32 -AddedNested1.A Array(UInt32) -AddedNested1.C Array(String) -AddedNested2.A Array(UInt32) -AddedNested2.B Array(UInt64) -CounterID UInt32 -StartDate Date -UserID UInt32 -VisitID UInt32 -Added0 String -Added1 UInt32 -Added2 UInt32 -AddedNested1.A Array(UInt32) -AddedNested1.C Array(String) -AddedNested2.A Array(UInt32) -AddedNested2.B Array(UInt64) -CounterID UInt32 -StartDate Date -UserID UInt32 -VisitID UInt32 -Added0 String -Added1 UInt32 -Added2 UInt32 -AddedNested1.A Array(UInt32) -AddedNested1.C Array(String) -AddedNested2.A Array(UInt32) -AddedNested2.B Array(UInt64) - diff --git a/tests/queries/0_stateless/01269_replicated_database_engine_zookeeper.sql b/tests/queries/0_stateless/01269_replicated_database_engine_zookeeper.sql deleted file mode 100644 index 1acc9022014..00000000000 --- a/tests/queries/0_stateless/01269_replicated_database_engine_zookeeper.sql +++ /dev/null @@ -1,39 +0,0 @@ -DROP DATABASE IF EXISTS rdbtest; -DROP DATABASE IF EXISTS replicatwo; -DROP DATABASE IF EXISTS replicathree; - -CREATE DATABASE rdbtest ENGINE = Replicated('/clickhouse/db/test1/', 'id1'); -CREATE DATABASE replicatwo ENGINE = Replicated('/clickhouse/db/test1/', 'id2'); -CREATE DATABASE replicathree ENGINE = Replicated('/clickhouse/db/test1/', 'id3'); - -USE rdbtest; - -CREATE TABLE alter_test (CounterID UInt32, StartDate Date, UserID UInt32, VisitID UInt32, NestedColumn Nested(A UInt8, S String), ToDrop UInt32) ENGINE = MergeTree(StartDate, intHash32(UserID), (CounterID, StartDate, intHash32(UserID), VisitID), 8192); - -ALTER TABLE alter_test ADD COLUMN Added0 UInt32; -ALTER TABLE alter_test ADD COLUMN Added2 UInt32; -ALTER TABLE alter_test ADD COLUMN Added1 UInt32 AFTER Added0; - -ALTER TABLE alter_test ADD COLUMN AddedNested1 Nested(A UInt32, B UInt64) AFTER Added2; -ALTER TABLE alter_test ADD COLUMN AddedNested1.C Array(String) AFTER AddedNested1.B; -ALTER TABLE alter_test ADD COLUMN AddedNested2 Nested(A UInt32, B UInt64) AFTER AddedNested1; - -ALTER TABLE alter_test DROP COLUMN ToDrop; - -ALTER TABLE alter_test MODIFY COLUMN Added0 String; - -ALTER TABLE alter_test DROP COLUMN NestedColumn.A; -ALTER TABLE alter_test DROP COLUMN NestedColumn.S; - -ALTER TABLE alter_test DROP COLUMN AddedNested1.B; - -ALTER TABLE alter_test ADD COLUMN IF NOT EXISTS Added0 UInt32; -ALTER TABLE alter_test ADD COLUMN IF NOT EXISTS AddedNested1 Nested(A UInt32, B UInt64); -ALTER TABLE alter_test ADD COLUMN IF NOT EXISTS AddedNested1.C Array(String); -ALTER TABLE alter_test MODIFY COLUMN IF EXISTS ToDrop UInt64; -ALTER TABLE alter_test DROP COLUMN IF EXISTS ToDrop; -ALTER TABLE alter_test COMMENT COLUMN IF EXISTS ToDrop 'new comment'; - -DESC TABLE rdbtest.alter_test; -DESC TABLE replicatwo.alter_test; -DESC TABLE replicathree.alter_test; From 
0e9f516738adad2a22cf95d92304c6ffe3c6e55a Mon Sep 17 00:00:00 2001 From: Val Date: Wed, 27 May 2020 18:04:10 +0300 Subject: [PATCH 025/381] add comment for replicated db class --- src/Databases/DatabaseReplicated.h | 25 +++++++++++++++++++++---- 1 file changed, 21 insertions(+), 4 deletions(-) diff --git a/src/Databases/DatabaseReplicated.h b/src/Databases/DatabaseReplicated.h index bd2f11390d2..e81b78386f7 100644 --- a/src/Databases/DatabaseReplicated.h +++ b/src/Databases/DatabaseReplicated.h @@ -10,10 +10,27 @@ namespace DB { -/** Replicated database engine. - * It stores tables list using list of .sql files, - * that contain declaration of table represented by SQL ATTACH TABLE query - * and operation log in zookeeper +/** DatabaseReplicated engine + * supports replication of metadata + * via DDL log being written to ZooKeeper + * and executed on all of the replicas + * for a given database. + * + * One Clickhouse server can have multiple + * replicated databases running and updating + * at the same time. + * + * The engine has two parameters ZooKeeper path and + * replica name. + * The same ZooKeeper path corresponds to the same + * database. Replica names must be different for all replicas + * of the same database. + * + * Using this engine, creation of Replicated tables + * requires no ZooKeeper path and replica name parameters. + * Table's replica name is the same as database replica name. + * Table's ZooKeeper path is a concatenation of database's + * ZooKeeper path, /tables/, and UUID of the table. */ class DatabaseReplicated : public DatabaseAtomic { From a0af67b636d4a2b47d0c0898833e8c1c86731561 Mon Sep 17 00:00:00 2001 From: Val Date: Wed, 27 May 2020 21:33:37 +0300 Subject: [PATCH 026/381] Add one more test for db replicated and fix related bug --- src/Databases/DatabaseReplicated.cpp | 8 +++ .../test_replicated_database/test.py | 52 ++++++++++++------- 2 files changed, 40 insertions(+), 20 deletions(-) diff --git a/src/Databases/DatabaseReplicated.cpp b/src/Databases/DatabaseReplicated.cpp index c6840ac0d81..202e46c3f82 100644 --- a/src/Databases/DatabaseReplicated.cpp +++ b/src/Databases/DatabaseReplicated.cpp @@ -201,6 +201,13 @@ void DatabaseReplicated::propose(const ASTPtr & query) { current_zookeeper = getZooKeeper(); auto lock = createSimpleZooKeeperLock(current_zookeeper, zookeeper_path, "propose_lock", replica_name); + while (!lock->tryLock()) { + // TODO it seems that zk lock doesn't work at all + // need to find a different solution for proposal + pcg64 rng(randomSeed()); + std::this_thread::sleep_for(std::chrono::milliseconds(std::uniform_int_distribution(0, 1000)(rng))); + } + // schedule and deactive combo // ensures that replica is up to date // and since propose lock is acquired, @@ -224,6 +231,7 @@ void DatabaseReplicated::propose(const ASTPtr & query) { lock->unlock(); saveState(); + background_log_executor->activateAndSchedule(); } void DatabaseReplicated::updateSnapshot() { diff --git a/tests/integration/test_replicated_database/test.py b/tests/integration/test_replicated_database/test.py index 38977aa0bdb..703690a7218 100644 --- a/tests/integration/test_replicated_database/test.py +++ b/tests/integration/test_replicated_database/test.py @@ -33,38 +33,50 @@ def test_create_replicated_table(started_cluster): node1.query("CREATE TABLE testdb.replicated_table (d Date, k UInt64, i32 Int32) ENGINE=ReplicatedMergeTree(d, k, 8192);") time.sleep(DURATION_SECONDS) - logging.info(node2.query("desc table testdb.replicated_table")) assert node1.query("desc table 
testdb.replicated_table") == node2.query("desc table testdb.replicated_table") -def test_alter_table(started_cluster): +def test_simple_alter_table(started_cluster): DURATION_SECONDS = 1 - node1.query("CREATE TABLE testdb.alter_test (CounterID UInt32, StartDate Date, UserID UInt32, VisitID UInt32, NestedColumn Nested(A UInt8, S String), ToDrop UInt32) ENGINE = MergeTree(StartDate, intHash32(UserID), (CounterID, StartDate, intHash32(UserID), VisitID), 8192);\ - ALTER TABLE testdb.alter_test ADD COLUMN Added0 UInt32;\ - ALTER TABLE testdb.alter_test ADD COLUMN Added2 UInt32;\ - ALTER TABLE testdb.alter_test ADD COLUMN Added1 UInt32 AFTER Added0;\ - ALTER TABLE testdb.alter_test ADD COLUMN AddedNested1 Nested(A UInt32, B UInt64) AFTER Added2;\ - ALTER TABLE testdb.alter_test ADD COLUMN AddedNested1.C Array(String) AFTER AddedNested1.B;\ - ALTER TABLE testdb.alter_test ADD COLUMN AddedNested2 Nested(A UInt32, B UInt64) AFTER AddedNested1;") + node1.query("CREATE TABLE testdb.alter_test (CounterID UInt32, StartDate Date, UserID UInt32, VisitID UInt32, NestedColumn Nested(A UInt8, S String), ToDrop UInt32) ENGINE = MergeTree(StartDate, intHash32(UserID), (CounterID, StartDate, intHash32(UserID), VisitID), 8192);") + node1.query("ALTER TABLE testdb.alter_test ADD COLUMN Added0 UInt32;") + node1.query("ALTER TABLE testdb.alter_test ADD COLUMN Added2 UInt32;") + node1.query("ALTER TABLE testdb.alter_test ADD COLUMN Added1 UInt32 AFTER Added0;") + node1.query("ALTER TABLE testdb.alter_test ADD COLUMN AddedNested1 Nested(A UInt32, B UInt64) AFTER Added2;") + node1.query("ALTER TABLE testdb.alter_test ADD COLUMN AddedNested1.C Array(String) AFTER AddedNested1.B;") + node1.query("ALTER TABLE testdb.alter_test ADD COLUMN AddedNested2 Nested(A UInt32, B UInt64) AFTER AddedNested1;") time.sleep(DURATION_SECONDS) assert node1.query("desc table testdb.alter_test") == node2.query("desc table testdb.alter_test") -def test_create_replica_from_snapshot(started_cluster): +def test_create_replica_after_delay(started_cluster): DURATION_SECONDS = 3 - """ - right now snapshot's created every 6 proposes. 
- later on it must be configurable - for now let's check snapshot - by creating a new node just after 10 log entries - """ - node1.query("ALTER TABLE testdb.alter_test ADD COLUMN Added3 UInt32 ;") #9 - node1.query("ALTER TABLE testdb.alter_test ADD COLUMN Added4 UInt32 ;") #10 - node1.query("ALTER TABLE testdb.alter_test ADD COLUMN Added5 UInt32 ;") #1 - # by this moment snapshot must be created node3.query("CREATE DATABASE testdb ENGINE = Replicated('/clickhouse/databases/test1', '{replica}');") + node1.query("ALTER TABLE testdb.alter_test ADD COLUMN Added3 UInt32 ;") + node1.query("ALTER TABLE testdb.alter_test ADD COLUMN Added4 UInt32 ;") + node1.query("ALTER TABLE testdb.alter_test ADD COLUMN Added5 UInt32 ;") + time.sleep(DURATION_SECONDS) assert node3.query("desc table testdb.alter_test") == node1.query("desc table testdb.alter_test") +def test_alters_from_different_replicas(started_cluster): + DURATION_SECONDS = 1 + + node1.query("CREATE TABLE testdb.concurrent_test (CounterID UInt32, StartDate Date, UserID UInt32, VisitID UInt32, NestedColumn Nested(A UInt8, S String), ToDrop UInt32) ENGINE = MergeTree(StartDate, intHash32(UserID), (CounterID, StartDate, intHash32(UserID), VisitID), 8192);") + time.sleep(DURATION_SECONDS) + + node3.query("ALTER TABLE testdb.concurrent_test ADD COLUMN Added0 UInt32;") + time.sleep(DURATION_SECONDS) + node1.query("ALTER TABLE testdb.concurrent_test ADD COLUMN Added2 UInt32;") + time.sleep(DURATION_SECONDS) + node3.query("ALTER TABLE testdb.concurrent_test ADD COLUMN Added1 UInt32 AFTER Added0;") + time.sleep(DURATION_SECONDS) + node1.query("ALTER TABLE testdb.concurrent_test ADD COLUMN AddedNested1 Nested(A UInt32, B UInt64) AFTER Added2;") + time.sleep(DURATION_SECONDS) + node3.query("ALTER TABLE testdb.concurrent_test ADD COLUMN AddedNested1.C Array(String) AFTER AddedNested1.B;") + time.sleep(DURATION_SECONDS) + node1.query("ALTER TABLE testdb.concurrent_test ADD COLUMN AddedNested2 Nested(A UInt32, B UInt64) AFTER AddedNested1;") + time.sleep(DURATION_SECONDS) + assert node3.query("desc table testdb.concurrent_test") == node1.query("desc table testdb.concurrent_test") From 469f9738dff25544a35c23da2f6e207355b5f16c Mon Sep 17 00:00:00 2001 From: Val Date: Wed, 27 May 2020 21:40:00 +0300 Subject: [PATCH 027/381] refactor save state in db replicated --- src/Databases/DatabaseReplicated.cpp | 14 ++++++++------ 1 file changed, 8 insertions(+), 6 deletions(-) diff --git a/src/Databases/DatabaseReplicated.cpp b/src/Databases/DatabaseReplicated.cpp index 202e46c3f82..3dbacbaf33d 100644 --- a/src/Databases/DatabaseReplicated.cpp +++ b/src/Databases/DatabaseReplicated.cpp @@ -148,12 +148,14 @@ void DatabaseReplicated::runBackgroundLogExecutor() { } void DatabaseReplicated::saveState() { - current_zookeeper->createOrUpdate(replica_path + "/last_entry", std::to_string(current_log_entry_n), zkutil::CreateMode::Persistent); - // TODO rename vars - String statement = std::to_string(current_log_entry_n); - String metadatafile = getMetadataPath() + ".last_entry"; - WriteBufferFromFile out(metadatafile, statement.size(), O_WRONLY | O_CREAT); - writeString(statement, out); + String state = std::to_string(current_log_entry_n); + + current_zookeeper = getZooKeeper(); + current_zookeeper->createOrUpdate(replica_path + "/last_entry", state, zkutil::CreateMode::Persistent); + + String metadata_file = getMetadataPath() + ".last_entry"; + WriteBufferFromFile out(metadata_file, state.size(), O_WRONLY | O_CREAT); + writeString(state, out); out.next(); if 
(global_context.getSettingsRef().fsync_metadata) out.sync(); From f928c897cf68b4bf73bf7b6108e469ef87bb385d Mon Sep 17 00:00:00 2001 From: Val Date: Sun, 7 Jun 2020 14:20:05 +0300 Subject: [PATCH 028/381] change replication algorithm, remove zk lock In this version of the databaseReplicated sequential persistent zk nodes are used to order DDL queries. Db replicated ddl queries are executed in the backgrould pool no matter whether it's proposed by the same replica or not. --- src/Databases/DatabaseReplicated.cpp | 84 +++++++++------------ src/Databases/DatabaseReplicated.h | 2 + src/Interpreters/InterpreterAlterQuery.cpp | 1 + src/Interpreters/InterpreterCreateQuery.cpp | 10 +-- src/Interpreters/InterpreterDropQuery.cpp | 9 ++- src/Interpreters/InterpreterRenameQuery.cpp | 14 ++-- 6 files changed, 55 insertions(+), 65 deletions(-) diff --git a/src/Databases/DatabaseReplicated.cpp b/src/Databases/DatabaseReplicated.cpp index 3dbacbaf33d..2650bd46a58 100644 --- a/src/Databases/DatabaseReplicated.cpp +++ b/src/Databases/DatabaseReplicated.cpp @@ -132,19 +132,34 @@ void DatabaseReplicated::createDatabaseZKNodes() { void DatabaseReplicated::runBackgroundLogExecutor() { current_zookeeper = getZooKeeper(); - String last_n = current_zookeeper->get(zookeeper_path + "/last_entry", {}, NULL); - size_t last_n_parsed = parse(last_n); + Strings log_entry_names = current_zookeeper->getChildren(zookeeper_path + "/log"); - bool newEntries = current_log_entry_n < last_n_parsed; - while (current_log_entry_n < last_n_parsed) { - current_log_entry_n++; - String log_path = zookeeper_path + "/log/log." + std::to_string(current_log_entry_n); - executeFromZK(log_path); - } - if (newEntries) { - saveState(); + std::sort(log_entry_names.begin(), log_entry_names.end()); + auto newest_entry_it = std::upper_bound(log_entry_names.begin(), log_entry_names.end(), last_executed_log_entry); + + log_entry_names.erase(log_entry_names.begin(), newest_entry_it); + + for (const String & log_entry_name : log_entry_names) { + String log_entry_path = zookeeper_path + "/log/" + log_entry_name; + executeFromZK(log_entry_path); + last_executed_log_entry = log_entry_name; } + background_log_executor->scheduleAfter(500); + + // String last_n = current_zookeeper->get(zookeeper_path + "/last_entry", {}, NULL); + // size_t last_n_parsed = parse(last_n); + + // bool newEntries = current_log_entry_n < last_n_parsed; + // while (current_log_entry_n < last_n_parsed) { + // current_log_entry_n++; + // String log_path = zookeeper_path + "/log/log." 
+ std::to_string(current_log_entry_n); + // executeFromZK(log_path); + // } + // if (newEntries) { + // saveState(); + // } + // background_log_executor->scheduleAfter(500); } void DatabaseReplicated::saveState() { @@ -187,53 +202,22 @@ void DatabaseReplicated::executeFromZK(String & path) { } // TODO Move to ZooKeeper/Lock and remove it from here and ddlworker -static std::unique_ptr createSimpleZooKeeperLock( - const std::shared_ptr & zookeeper, const String & lock_prefix, const String & lock_name, const String & lock_message) -{ - auto zookeeper_holder = std::make_shared(); - zookeeper_holder->initFromInstance(zookeeper); - return std::make_unique(std::move(zookeeper_holder), lock_prefix, lock_name, lock_message); -} +// static std::unique_ptr createSimpleZooKeeperLock( +// const std::shared_ptr & zookeeper, const String & lock_prefix, const String & lock_name, const String & lock_message) +// { +// auto zookeeper_holder = std::make_shared(); +// zookeeper_holder->initFromInstance(zookeeper); +// return std::make_unique(std::move(zookeeper_holder), lock_prefix, lock_name, lock_message); +// } void DatabaseReplicated::propose(const ASTPtr & query) { - // TODO remove that log message i think - LOG_DEBUG(log, "PROPOSING\n" << queryToString(query)); - current_zookeeper = getZooKeeper(); - auto lock = createSimpleZooKeeperLock(current_zookeeper, zookeeper_path, "propose_lock", replica_name); - while (!lock->tryLock()) { - // TODO it seems that zk lock doesn't work at all - // need to find a different solution for proposal - pcg64 rng(randomSeed()); - std::this_thread::sleep_for(std::chrono::milliseconds(std::uniform_int_distribution(0, 1000)(rng))); - } + LOG_DEBUG(log, "PROPOSINGGG query: " << queryToString(query)); + current_zookeeper->create(zookeeper_path + "/log/log-", queryToString(query), zkutil::CreateMode::PersistentSequential); - // schedule and deactive combo - // ensures that replica is up to date - // and since propose lock is acquired, - // no other propose can happen from - // different replicas during this call background_log_executor->schedule(); - background_log_executor->deactivate(); - -// if (current_log_entry_n > 5) { // make a settings variable -// // TODO check that all the replicas are up to date! -// updateSnapshot(); -// current_log_entry_n = 0; -// current_zookeeper->removeChildren(zookeeper_path + "/log"); -// } - - current_log_entry_n++; // starting from 1 - String log_entry = zookeeper_path + "/log/log." 
+ std::to_string(current_log_entry_n); - current_zookeeper->createOrUpdate(log_entry, queryToString(query), zkutil::CreateMode::Persistent); - - current_zookeeper->createOrUpdate(zookeeper_path + "/last_entry", std::to_string(current_log_entry_n), zkutil::CreateMode::Persistent); - - lock->unlock(); - saveState(); - background_log_executor->activateAndSchedule(); } void DatabaseReplicated::updateSnapshot() { diff --git a/src/Databases/DatabaseReplicated.h b/src/Databases/DatabaseReplicated.h index e81b78386f7..19a0ea09e11 100644 --- a/src/Databases/DatabaseReplicated.h +++ b/src/Databases/DatabaseReplicated.h @@ -59,6 +59,8 @@ private: std::atomic current_log_entry_n = 0; + String last_executed_log_entry = ""; + BackgroundSchedulePool::TaskHolder background_log_executor; String replica_path; diff --git a/src/Interpreters/InterpreterAlterQuery.cpp b/src/Interpreters/InterpreterAlterQuery.cpp index 134531d0cf0..6b4bcdde067 100644 --- a/src/Interpreters/InterpreterAlterQuery.cpp +++ b/src/Interpreters/InterpreterAlterQuery.cpp @@ -52,6 +52,7 @@ BlockIO InterpreterAlterQuery::execute() DatabasePtr database = DatabaseCatalog::instance().getDatabase(table_id.database_name); if (database->getEngineName() == "Replicated" && context.getClientInfo().query_kind != ClientInfo::QueryKind::REPLICATED_LOG_QUERY && !table->supportsReplication()) { database->propose(query_ptr); + return {}; } /// Add default database to table identifiers that we can encounter in e.g. default expressions, diff --git a/src/Interpreters/InterpreterCreateQuery.cpp b/src/Interpreters/InterpreterCreateQuery.cpp index 0b06fbfd874..6806679cb4d 100644 --- a/src/Interpreters/InterpreterCreateQuery.cpp +++ b/src/Interpreters/InterpreterCreateQuery.cpp @@ -688,6 +688,11 @@ bool InterpreterCreateQuery::doCreateTable(ASTCreateQuery & create, return true; } + if (database->getEngineName() == "Replicated" && context.getClientInfo().query_kind != ClientInfo::QueryKind::REPLICATED_LOG_QUERY) { + database->propose(query_ptr); + return true; + } + StoragePtr res; /// NOTE: CREATE query may be rewritten by Storage creator or table function if (create.as_table_function) @@ -707,11 +712,6 @@ bool InterpreterCreateQuery::doCreateTable(ASTCreateQuery & create, properties.constraints, false); } - - - if (database->getEngineName() == "Replicated" && context.getClientInfo().query_kind != ClientInfo::QueryKind::REPLICATED_LOG_QUERY) { - database->propose(query_ptr); - } database->createTable(context, table_name, res, query_ptr); /// We must call "startup" and "shutdown" while holding DDLGuard. 
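The interpreter changes in this commit all follow one pattern: when the target database engine is Replicated and the query did not arrive from the DDL log (query_kind is not ClientInfo::QueryKind::REPLICATED_LOG_QUERY), the query is only proposed, and local execution happens later when the background log executor replays it, the same way it does on every other replica. The ordering that makes this safe comes from ZooKeeper sequential nodes. A small sketch of that contract is below; the wrapper name is illustrative, while the create() call and CreateMode are the ones used by propose() above.

    // Illustrative sketch, not part of the patch. PersistentSequential makes ZooKeeper
    // append a monotonically increasing, zero-padded counter to the node name, so the
    // lexicographically sorted children of <db>/log form the global proposal order that
    // runBackgroundLogExecutor() replays on every replica.
    String proposeToLog(const zkutil::ZooKeeperPtr & zookeeper, const String & zookeeper_path, const String & query_text)
    {
        /// Returns the path actually created, e.g. "<zookeeper_path>/log/log-0000000042".
        return zookeeper->create(zookeeper_path + "/log/log-", query_text, zkutil::CreateMode::PersistentSequential);
    }
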
diff --git a/src/Interpreters/InterpreterDropQuery.cpp b/src/Interpreters/InterpreterDropQuery.cpp index afbf5d31fbf..05418f275a2 100644 --- a/src/Interpreters/InterpreterDropQuery.cpp +++ b/src/Interpreters/InterpreterDropQuery.cpp @@ -99,8 +99,9 @@ BlockIO InterpreterDropQuery::executeToTable( /// Drop table from memory, don't touch data and metadata if (database->getEngineName() == "Replicated" && context.getClientInfo().query_kind != ClientInfo::QueryKind::REPLICATED_LOG_QUERY) { database->propose(query_ptr); + } else { + database->detachTable(table_id.table_name); } - database->detachTable(table_id.table_name); } else if (query.kind == ASTDropQuery::Kind::Truncate) { @@ -113,8 +114,9 @@ BlockIO InterpreterDropQuery::executeToTable( auto table_lock = table->lockExclusively(context.getCurrentQueryId(), context.getSettingsRef().lock_acquire_timeout); if (database->getEngineName() == "Replicated" && context.getClientInfo().query_kind != ClientInfo::QueryKind::REPLICATED_LOG_QUERY) { database->propose(query_ptr); + } else { + table->truncate(query_ptr, metadata_snapshot, context, table_lock); } - table->truncate(query_ptr, metadata_snapshot, context, table_lock); } else if (query.kind == ASTDropQuery::Kind::Drop) { @@ -129,8 +131,9 @@ BlockIO InterpreterDropQuery::executeToTable( if (database->getEngineName() == "Replicated" && context.getClientInfo().query_kind != ClientInfo::QueryKind::REPLICATED_LOG_QUERY) { database->propose(query_ptr); + } else { + database->dropTable(context, table_id.table_name, query.no_delay); } - database->dropTable(context, table_id.table_name, query.no_delay); } } diff --git a/src/Interpreters/InterpreterRenameQuery.cpp b/src/Interpreters/InterpreterRenameQuery.cpp index 45003ab0d14..97206f6b364 100644 --- a/src/Interpreters/InterpreterRenameQuery.cpp +++ b/src/Interpreters/InterpreterRenameQuery.cpp @@ -83,15 +83,15 @@ BlockIO InterpreterRenameQuery::execute() DatabasePtr database = database_catalog.getDatabase(elem.from_database_name); if (database->getEngineName() == "Replicated" && context.getClientInfo().query_kind != ClientInfo::QueryKind::REPLICATED_LOG_QUERY) { database->propose(query_ptr); + } else { + database->renameTable( + context, + elem.from_table_name, + *database_catalog.getDatabase(elem.to_database_name), + elem.to_table_name, + rename.exchange); } - database->renameTable( - context, - elem.from_table_name, - *database_catalog.getDatabase(elem.to_database_name), - elem.to_table_name, - rename.exchange); } - return {}; } From f6de720f59e8bc8619fbf8684e6d80e8459ba432 Mon Sep 17 00:00:00 2001 From: Val Date: Sun, 7 Jun 2020 14:26:42 +0300 Subject: [PATCH 029/381] speed up db replicated test --- tests/integration/test_replicated_database/test.py | 14 ++++++++------ 1 file changed, 8 insertions(+), 6 deletions(-) diff --git a/tests/integration/test_replicated_database/test.py b/tests/integration/test_replicated_database/test.py index 703690a7218..95ca5c1e138 100644 --- a/tests/integration/test_replicated_database/test.py +++ b/tests/integration/test_replicated_database/test.py @@ -49,7 +49,7 @@ def test_simple_alter_table(started_cluster): assert node1.query("desc table testdb.alter_test") == node2.query("desc table testdb.alter_test") def test_create_replica_after_delay(started_cluster): - DURATION_SECONDS = 3 + DURATION_SECONDS = 2 node3.query("CREATE DATABASE testdb ENGINE = Replicated('/clickhouse/databases/test1', '{replica}');") @@ -65,18 +65,20 @@ def test_alters_from_different_replicas(started_cluster): DURATION_SECONDS = 1 
node1.query("CREATE TABLE testdb.concurrent_test (CounterID UInt32, StartDate Date, UserID UInt32, VisitID UInt32, NestedColumn Nested(A UInt8, S String), ToDrop UInt32) ENGINE = MergeTree(StartDate, intHash32(UserID), (CounterID, StartDate, intHash32(UserID), VisitID), 8192);") + time.sleep(DURATION_SECONDS) node3.query("ALTER TABLE testdb.concurrent_test ADD COLUMN Added0 UInt32;") - time.sleep(DURATION_SECONDS) node1.query("ALTER TABLE testdb.concurrent_test ADD COLUMN Added2 UInt32;") - time.sleep(DURATION_SECONDS) node3.query("ALTER TABLE testdb.concurrent_test ADD COLUMN Added1 UInt32 AFTER Added0;") - time.sleep(DURATION_SECONDS) node1.query("ALTER TABLE testdb.concurrent_test ADD COLUMN AddedNested1 Nested(A UInt32, B UInt64) AFTER Added2;") - time.sleep(DURATION_SECONDS) node3.query("ALTER TABLE testdb.concurrent_test ADD COLUMN AddedNested1.C Array(String) AFTER AddedNested1.B;") - time.sleep(DURATION_SECONDS) node1.query("ALTER TABLE testdb.concurrent_test ADD COLUMN AddedNested2 Nested(A UInt32, B UInt64) AFTER AddedNested1;") + time.sleep(DURATION_SECONDS) + + logging.info("NODE3") + logging.info(node3.query("desc table testdb.concurrent_test")) + logging.info("NODE1") + logging.info(node1.query("desc table testdb.concurrent_test")) assert node3.query("desc table testdb.concurrent_test") == node1.query("desc table testdb.concurrent_test") From e8e4e4d21c559fc3548d791dea65aa7871e8d19f Mon Sep 17 00:00:00 2001 From: Val Date: Sat, 20 Jun 2020 18:38:20 +0300 Subject: [PATCH 030/381] add tests for db replicated --- .../configs/disable_snapshots.xml | 3 ++ .../configs/snapshot_each_query.xml | 3 ++ .../test_replicated_database/test.py | 40 ++++++++++++------- 3 files changed, 31 insertions(+), 15 deletions(-) create mode 100644 tests/integration/test_replicated_database/configs/disable_snapshots.xml create mode 100644 tests/integration/test_replicated_database/configs/snapshot_each_query.xml diff --git a/tests/integration/test_replicated_database/configs/disable_snapshots.xml b/tests/integration/test_replicated_database/configs/disable_snapshots.xml new file mode 100644 index 00000000000..9a656bdcea1 --- /dev/null +++ b/tests/integration/test_replicated_database/configs/disable_snapshots.xml @@ -0,0 +1,3 @@ + + 0 + diff --git a/tests/integration/test_replicated_database/configs/snapshot_each_query.xml b/tests/integration/test_replicated_database/configs/snapshot_each_query.xml new file mode 100644 index 00000000000..6eae1d9d992 --- /dev/null +++ b/tests/integration/test_replicated_database/configs/snapshot_each_query.xml @@ -0,0 +1,3 @@ + + 1 + diff --git a/tests/integration/test_replicated_database/test.py b/tests/integration/test_replicated_database/test.py index 95ca5c1e138..b557354b6ba 100644 --- a/tests/integration/test_replicated_database/test.py +++ b/tests/integration/test_replicated_database/test.py @@ -10,18 +10,16 @@ logging.getLogger().addHandler(logging.StreamHandler()) cluster = ClickHouseCluster(__file__) -node1 = cluster.add_instance('node1', macros={'replica': 'test1'}, with_zookeeper=True) -node2 = cluster.add_instance('node2', macros={'replica': 'test2'}, with_zookeeper=True) -node3 = cluster.add_instance('node3', macros={'replica': 'test3'}, with_zookeeper=True) +node1 = cluster.add_instance('node1', main_configs=['configs/disable_snapshots.xml'], with_zookeeper=True, stay_alive=True) +node2 = cluster.add_instance('node2', main_configs=['configs/disable_snapshots.xml'], with_zookeeper=True) +node3 = cluster.add_instance('node3', 
main_configs=['configs/disable_snapshots.xml'], with_zookeeper=True) @pytest.fixture(scope="module") def started_cluster(): try: cluster.start() - - for node in [node1, node2]: - node.query("DROP DATABASE IF EXISTS testdb") - node.query("CREATE DATABASE testdb ENGINE = Replicated('/clickhouse/databases/test1', '{replica}');") + node1.query("CREATE DATABASE testdb ENGINE = Replicated('/clickhouse/databases/test1', 'replica1');") + node2.query("CREATE DATABASE testdb ENGINE = Replicated('/clickhouse/databases/test1', 'replica2');") yield cluster finally: @@ -49,15 +47,13 @@ def test_simple_alter_table(started_cluster): assert node1.query("desc table testdb.alter_test") == node2.query("desc table testdb.alter_test") def test_create_replica_after_delay(started_cluster): - DURATION_SECONDS = 2 - - node3.query("CREATE DATABASE testdb ENGINE = Replicated('/clickhouse/databases/test1', '{replica}');") + node3.query("CREATE DATABASE testdb ENGINE = Replicated('/clickhouse/databases/test1', 'replica3');") node1.query("ALTER TABLE testdb.alter_test ADD COLUMN Added3 UInt32 ;") node1.query("ALTER TABLE testdb.alter_test ADD COLUMN Added4 UInt32 ;") node1.query("ALTER TABLE testdb.alter_test ADD COLUMN Added5 UInt32 ;") - time.sleep(DURATION_SECONDS) + time.sleep(6) assert node3.query("desc table testdb.alter_test") == node1.query("desc table testdb.alter_test") @@ -77,8 +73,22 @@ def test_alters_from_different_replicas(started_cluster): time.sleep(DURATION_SECONDS) - logging.info("NODE3") - logging.info(node3.query("desc table testdb.concurrent_test")) - logging.info("NODE1") - logging.info(node1.query("desc table testdb.concurrent_test")) assert node3.query("desc table testdb.concurrent_test") == node1.query("desc table testdb.concurrent_test") + +def test_drop_and_create_table(started_cluster): + node1.query("DROP TABLE testdb.concurrent_test") + node1.query("CREATE TABLE testdb.concurrent_test (CounterID UInt32, StartDate Date, UserID UInt32, VisitID UInt32, NestedColumn Nested(A UInt8, S String), ToDrop UInt32) ENGINE = MergeTree(StartDate, intHash32(UserID), (CounterID, StartDate, intHash32(UserID), VisitID), 8192);") + time.sleep(5) + assert node3.query("desc table testdb.concurrent_test") == node1.query("desc table testdb.concurrent_test") + +def test_replica_restart(started_cluster): + node1.restart_clickhouse() + time.sleep(5) + assert node3.query("desc table testdb.concurrent_test") == node1.query("desc table testdb.concurrent_test") + +#def test_drop_and_create_replica(started_cluster): +# node1.query("DROP DATABASE testdb") +# node1.query("CREATE DATABASE testdb ENGINE = Replicated('/clickhouse/databases/test1', 'replica1');") +# time.sleep(6) +# assert node3.query("desc table testdb.concurrent_test") == node1.query("desc table testdb.concurrent_test") + From f57fd52e3b564072d7c2ae61ecaf06138c4201ed Mon Sep 17 00:00:00 2001 From: Val Date: Sat, 20 Jun 2020 18:39:05 +0300 Subject: [PATCH 031/381] fix recursive propose for drop database db replicated query --- src/Interpreters/InterpreterDropQuery.cpp | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/src/Interpreters/InterpreterDropQuery.cpp b/src/Interpreters/InterpreterDropQuery.cpp index 05418f275a2..368024da043 100644 --- a/src/Interpreters/InterpreterDropQuery.cpp +++ b/src/Interpreters/InterpreterDropQuery.cpp @@ -129,7 +129,8 @@ BlockIO InterpreterDropQuery::executeToTable( if (database->getEngineName() != "Atomic" && database->getEngineName() != "Replicated") table_lock = 
table->lockExclusively(context.getCurrentQueryId(), context.getSettingsRef().lock_acquire_timeout); - if (database->getEngineName() == "Replicated" && context.getClientInfo().query_kind != ClientInfo::QueryKind::REPLICATED_LOG_QUERY) { + // Prevents recursive drop from drop database query. The original query must specify a table. + if (!query_ptr->as().table.empty() && database->getEngineName() == "Replicated" && context.getClientInfo().query_kind != ClientInfo::QueryKind::REPLICATED_LOG_QUERY) { database->propose(query_ptr); } else { database->dropTable(context, table_id.table_name, query.no_delay); From 4fc4b1d195bce04dfd08252eb6c0e3f58d0182f9 Mon Sep 17 00:00:00 2001 From: Val Date: Sat, 20 Jun 2020 18:39:58 +0300 Subject: [PATCH 032/381] db replicated minor enhancements --- src/Databases/DatabaseAtomic.cpp | 7 ++ src/Databases/DatabaseAtomic.h | 1 + src/Databases/DatabaseReplicated.cpp | 176 +++++++++++++++++++-------- src/Databases/DatabaseReplicated.h | 16 +-- src/Databases/DatabasesCommon.cpp | 4 +- 5 files changed, 142 insertions(+), 62 deletions(-) diff --git a/src/Databases/DatabaseAtomic.cpp b/src/Databases/DatabaseAtomic.cpp index ff30b95d139..85f6c70a07c 100644 --- a/src/Databases/DatabaseAtomic.cpp +++ b/src/Databases/DatabaseAtomic.cpp @@ -40,6 +40,13 @@ DatabaseAtomic::DatabaseAtomic(String name_, String metadata_path_, Context & co Poco::File(path_to_table_symlinks).createDirectories(); } +DatabaseAtomic::DatabaseAtomic(String name_, String metadata_path_, const String & data_path_, const String & logger, Context & context_) + : DatabaseOrdinary(name_, std::move(metadata_path_), data_path_, logger, context_) + , path_to_table_symlinks(context_.getPath() + "data/" + escapeForFileName(name_) + "/") +{ + Poco::File(path_to_table_symlinks).createDirectories(); +} + String DatabaseAtomic::getTableDataPath(const String & table_name) const { std::lock_guard lock(mutex); diff --git a/src/Databases/DatabaseAtomic.h b/src/Databases/DatabaseAtomic.h index 71428fdb420..88a77da53a4 100644 --- a/src/Databases/DatabaseAtomic.h +++ b/src/Databases/DatabaseAtomic.h @@ -22,6 +22,7 @@ class DatabaseAtomic : public DatabaseOrdinary public: DatabaseAtomic(String name_, String metadata_path_, Context & context_); + DatabaseAtomic(String name_, String metadata_path_, const String & data_path_, const String & logger, Context & context_); String getEngineName() const override { return "Atomic"; } diff --git a/src/Databases/DatabaseReplicated.cpp b/src/Databases/DatabaseReplicated.cpp index 2650bd46a58..4d16a5d05c0 100644 --- a/src/Databases/DatabaseReplicated.cpp +++ b/src/Databases/DatabaseReplicated.cpp @@ -49,6 +49,7 @@ namespace DB namespace ErrorCodes { extern const int NO_ZOOKEEPER; + extern const int FILE_DOESNT_EXIST; } void DatabaseReplicated::setZooKeeper(zkutil::ZooKeeperPtr zookeeper) @@ -78,9 +79,7 @@ DatabaseReplicated::DatabaseReplicated( const String & zookeeper_path_, const String & replica_name_, Context & context_) -// : DatabaseOrdinary(name_, metadata_path_, "data/" + escapeForFileName(name_) + "/", "DatabaseReplicated (" + name_ + ")", context_) - // TODO add constructor to Atomic and call it here with path and logger name specification - : DatabaseAtomic(name_, metadata_path_, context_) + : DatabaseAtomic(name_, metadata_path_, "store/", "DatabaseReplicated (" + name_ + ")", context_) , zookeeper_path(zookeeper_path_) , replica_name(replica_name_) { @@ -90,8 +89,6 @@ DatabaseReplicated::DatabaseReplicated( if (!zookeeper_path.empty() && zookeeper_path.front() != '/') 
zookeeper_path = "/" + zookeeper_path; - replica_path = zookeeper_path + "/replicas/" + replica_name; - if (context_.hasZooKeeper()) { current_zookeeper = context_.getZooKeeper(); } @@ -100,37 +97,101 @@ DatabaseReplicated::DatabaseReplicated( throw Exception("Can't create replicated database without ZooKeeper", ErrorCodes::NO_ZOOKEEPER); } + // New database if (!current_zookeeper->exists(zookeeper_path, {}, NULL)) { createDatabaseZKNodes(); - } + // Old replica recovery + } else if (current_zookeeper->exists(zookeeper_path + "/replicas/" + replica_name, {}, NULL)) { + String local_last_entry; + try + { + ReadBufferFromFile in(getMetadataPath() + ".last_entry", 16); + readStringUntilEOF(local_last_entry, in); + } + catch (const Exception & e) + { + if (e.code() == ErrorCodes::FILE_DOESNT_EXIST) { + // that is risky cause + // if replica name is the same + // than the last one wins + saveState(); + } else { + throw; + } + } - // replica - if (!current_zookeeper->exists(replica_path, {}, NULL)) { - current_zookeeper->createAncestors(replica_path); - current_zookeeper->createOrUpdate(replica_path, String(), zkutil::CreateMode::Persistent); + String remote_last_entry = current_zookeeper->get(zookeeper_path + "/replicas/" + replica_name, {}, NULL); + if (local_last_entry == remote_last_entry) { + last_executed_log_entry = local_last_entry; + } else { + LOG_DEBUG(log, "LOCAL: " << local_last_entry); + LOG_DEBUG(log, "ZK: " << remote_last_entry); + throw Exception("Can't create replicated database MISCONFIGURATION or something", ErrorCodes::NO_ZOOKEEPER); + } } - //loadMetadataFromSnapshot(); + snapshot_period = context_.getConfigRef().getInt("database_replicated_snapshot_period", 10); + LOG_DEBUG(log, "Snapshot period is set to " << snapshot_period); - background_log_executor = global_context.getReplicatedSchedulePool().createTask(database_name + "(DatabaseReplicated::the_threeeed)", [this]{ runBackgroundLogExecutor();} ); - background_log_executor->schedule(); + background_log_executor = global_context.getReplicatedSchedulePool().createTask(database_name + "(DatabaseReplicated::background_executor)", [this]{ runBackgroundLogExecutor();} ); + + background_log_executor->scheduleAfter(500); } void DatabaseReplicated::createDatabaseZKNodes() { current_zookeeper = getZooKeeper(); - if (current_zookeeper->exists(zookeeper_path)) - return; - current_zookeeper->createAncestors(zookeeper_path); current_zookeeper->createIfNotExists(zookeeper_path, String()); - current_zookeeper->createIfNotExists(zookeeper_path + "/last_entry", "0"); current_zookeeper->createIfNotExists(zookeeper_path + "/log", String()); - current_zookeeper->createIfNotExists(zookeeper_path + "/snapshot", String()); + current_zookeeper->createIfNotExists(zookeeper_path + "/snapshots", String()); + current_zookeeper->createIfNotExists(zookeeper_path + "/replicas", String()); +} + +void DatabaseReplicated::RemoveOutdatedSnapshotsAndLog() { + // This method removes all snapshots and logged queries + // that no longer will be in use by current replicas or + // new coming ones. + // Each registered replica has its state in ZooKeeper. + // Therefore removed snapshots and logged queries are less + // than a least advanced replica. + // It does not interfere with a new coming replica + // metadata loading from snapshot + // because the replica will use the last snapshot available + // and this snapshot will set the last executed log query + // to a greater one than the least advanced current replica. 
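    // An illustrative walk-through of the rule described above (the entry names
    // are hypothetical, chosen only to show the ordering, not taken from a run):
    //   replicas' last executed entries : log-0000000005, log-0000000007
    //   existing snapshots              : log-0000000002, log-0000000004, log-0000000006
    // The least advanced replica stands at log-0000000005, so snapshots
    // log-0000000002 and log-0000000004 can no longer be needed by anyone and are
    // removed, while log-0000000006 is kept for replicas that join later.
    // Log entries already covered by that oldest kept snapshot are then removed
    // as well; the code below implements this with min_element, lower_bound and
    // upper_bound over the sorted znode names.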
+ current_zookeeper = getZooKeeper(); + Strings replica_states = current_zookeeper->getChildren(zookeeper_path + "/replicas"); + auto least_advanced = std::min_element(replica_states.begin(), replica_states.end()); + Strings snapshots = current_zookeeper->getChildren(zookeeper_path + "/snapshots"); + + if (snapshots.size() < 2) { + return; + } + + std::sort(snapshots.begin(), snapshots.end()); + auto still_useful = std::lower_bound(snapshots.begin(), snapshots.end(), *least_advanced); + snapshots.erase(still_useful, snapshots.end()); + for (const String & snapshot : snapshots) { + current_zookeeper->tryRemoveRecursive(zookeeper_path + "/snapshots/" + snapshot); + } + + Strings log_entry_names = current_zookeeper->getChildren(zookeeper_path + "/log"); + std::sort(log_entry_names.begin(), log_entry_names.end()); + auto still_useful_log = std::upper_bound(log_entry_names.begin(), log_entry_names.end(), *still_useful); + log_entry_names.erase(still_useful_log, log_entry_names.end()); + for (const String & log_entry_name : log_entry_names) { + String log_entry_path = zookeeper_path + "/log/" + log_entry_name; + current_zookeeper->tryRemove(log_entry_path); + } } void DatabaseReplicated::runBackgroundLogExecutor() { + if (last_executed_log_entry == "") { + loadMetadataFromSnapshot(); + } + current_zookeeper = getZooKeeper(); Strings log_entry_names = current_zookeeper->getChildren(zookeeper_path + "/log"); @@ -143,34 +204,27 @@ void DatabaseReplicated::runBackgroundLogExecutor() { String log_entry_path = zookeeper_path + "/log/" + log_entry_name; executeFromZK(log_entry_path); last_executed_log_entry = log_entry_name; + saveState(); + + int log_n = parse(log_entry_name.substr(4)); + int last_log_n = parse(log_entry_names.back().substr(4)); + + // The third condition gurantees at most one snapshot per batch + if (log_n > 0 && snapshot_period > 0 && (last_log_n - log_n) / snapshot_period == 0 && log_n % snapshot_period == 0) { + createSnapshot(); + } } background_log_executor->scheduleAfter(500); - - // String last_n = current_zookeeper->get(zookeeper_path + "/last_entry", {}, NULL); - // size_t last_n_parsed = parse(last_n); - - // bool newEntries = current_log_entry_n < last_n_parsed; - // while (current_log_entry_n < last_n_parsed) { - // current_log_entry_n++; - // String log_path = zookeeper_path + "/log/log." 
+ std::to_string(current_log_entry_n); - // executeFromZK(log_path); - // } - // if (newEntries) { - // saveState(); - // } - // background_log_executor->scheduleAfter(500); } void DatabaseReplicated::saveState() { - String state = std::to_string(current_log_entry_n); - current_zookeeper = getZooKeeper(); - current_zookeeper->createOrUpdate(replica_path + "/last_entry", state, zkutil::CreateMode::Persistent); + current_zookeeper->createOrUpdate(zookeeper_path + "/replicas/" + replica_name, last_executed_log_entry, zkutil::CreateMode::Persistent); String metadata_file = getMetadataPath() + ".last_entry"; - WriteBufferFromFile out(metadata_file, state.size(), O_WRONLY | O_CREAT); - writeString(state, out); + WriteBufferFromFile out(metadata_file, last_executed_log_entry.size(), O_WRONLY | O_CREAT); + writeString(last_executed_log_entry, out); out.next(); if (global_context.getSettingsRef().fsync_metadata) out.sync(); @@ -201,47 +255,63 @@ void DatabaseReplicated::executeFromZK(String & path) { LOG_DEBUG(log, "Executed query: " << query_to_execute); } -// TODO Move to ZooKeeper/Lock and remove it from here and ddlworker -// static std::unique_ptr createSimpleZooKeeperLock( -// const std::shared_ptr & zookeeper, const String & lock_prefix, const String & lock_name, const String & lock_message) -// { -// auto zookeeper_holder = std::make_shared(); -// zookeeper_holder->initFromInstance(zookeeper); -// return std::make_unique(std::move(zookeeper_holder), lock_prefix, lock_name, lock_message); -// } - - void DatabaseReplicated::propose(const ASTPtr & query) { current_zookeeper = getZooKeeper(); - LOG_DEBUG(log, "PROPOSINGGG query: " << queryToString(query)); + LOG_DEBUG(log, "Writing the query to log: " << queryToString(query)); current_zookeeper->create(zookeeper_path + "/log/log-", queryToString(query), zkutil::CreateMode::PersistentSequential); background_log_executor->schedule(); } -void DatabaseReplicated::updateSnapshot() { +void DatabaseReplicated::createSnapshot() { current_zookeeper = getZooKeeper(); - current_zookeeper->tryRemoveChildren(zookeeper_path + "/snapshot"); + String snapshot_path = zookeeper_path + "/snapshots/" + last_executed_log_entry; + + if (Coordination::ZNODEEXISTS == current_zookeeper->tryCreate(snapshot_path, String(), zkutil::CreateMode::Persistent)) { + return; + } + for (auto iterator = getTablesIterator({}); iterator->isValid(); iterator->next()) { String table_name = iterator->name(); auto query = getCreateQueryFromMetadata(getObjectMetadataPath(table_name), true); String statement = queryToString(query); - current_zookeeper->createOrUpdate(zookeeper_path + "/snapshot/" + table_name, statement, zkutil::CreateMode::Persistent); + current_zookeeper->createOrUpdate(snapshot_path + "/" + table_name, statement, zkutil::CreateMode::Persistent); } + + RemoveOutdatedSnapshotsAndLog(); } void DatabaseReplicated::loadMetadataFromSnapshot() { current_zookeeper = getZooKeeper(); + Strings snapshots; + if (current_zookeeper->tryGetChildren(zookeeper_path + "/snapshots", snapshots) != Coordination::ZOK) + return; + + if (snapshots.size() < 1) { + return; + } + + auto latest_snapshot = std::max_element(snapshots.begin(), snapshots.end()); Strings metadatas; - if (current_zookeeper->tryGetChildren(zookeeper_path + "/snapshot", metadatas) != Coordination::ZOK) + if (current_zookeeper->tryGetChildren(zookeeper_path + "/snapshots/" + *latest_snapshot, metadatas) != Coordination::ZOK) return; for (auto t = metadatas.begin(); t != metadatas.end(); ++t) { - String path = 
zookeeper_path + "/snapshot/" + *t; + String path = zookeeper_path + "/snapshots/" + *latest_snapshot + "/" + *t; executeFromZK(path); } + + last_executed_log_entry = *latest_snapshot; + saveState(); +} + +void DatabaseReplicated::drop(const Context & context_) +{ + current_zookeeper = getZooKeeper(); + current_zookeeper->tryRemove(zookeeper_path + "/replicas/" + replica_name); + DatabaseAtomic::drop(context_); } } diff --git a/src/Databases/DatabaseReplicated.h b/src/Databases/DatabaseReplicated.h index 19a0ea09e11..471365361b7 100644 --- a/src/Databases/DatabaseReplicated.h +++ b/src/Databases/DatabaseReplicated.h @@ -23,13 +23,13 @@ namespace DB * The engine has two parameters ZooKeeper path and * replica name. * The same ZooKeeper path corresponds to the same - * database. Replica names must be different for all replicas + * database. Replica names MUST be different for all replicas * of the same database. * * Using this engine, creation of Replicated tables * requires no ZooKeeper path and replica name parameters. * Table's replica name is the same as database replica name. - * Table's ZooKeeper path is a concatenation of database's + * Table's ZooKeeper path is a concatenation of database * ZooKeeper path, /tables/, and UUID of the table. */ class DatabaseReplicated : public DatabaseAtomic @@ -37,6 +37,8 @@ class DatabaseReplicated : public DatabaseAtomic public: DatabaseReplicated(const String & name_, const String & metadata_path_, const String & zookeeper_path_, const String & replica_name_, Context & context); + void drop(const Context & /*context*/) override; + String getEngineName() const override { return "Replicated"; } void propose(const ASTPtr & query) override; @@ -48,23 +50,23 @@ private: void createDatabaseZKNodes(); void runBackgroundLogExecutor(); - + void executeFromZK(String & path); void saveState(); - void updateSnapshot(); + void loadMetadataFromSnapshot(); + void createSnapshot(); + void RemoveOutdatedSnapshotsAndLog(); std::unique_ptr current_context; // to run executeQuery - std::atomic current_log_entry_n = 0; + int snapshot_period; String last_executed_log_entry = ""; BackgroundSchedulePool::TaskHolder background_log_executor; - String replica_path; - zkutil::ZooKeeperPtr current_zookeeper; /// Use only the methods below. mutable std::mutex current_zookeeper_mutex; /// To recreate the session in the background thread. 
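
Read together with the .cpp changes above, this header settles the ZooKeeper layout the engine uses from this patch onward. A sketch reconstructed from the code, where the concrete path, entry and table names are only examples borrowed from the integration tests:

    /clickhouse/databases/test1            <- zookeeper_path
        log/
            log-0000000001                 <- PersistentSequential; value is the proposed query text
            log-0000000002
        snapshots/
            log-0000000002/                <- named after the last executed log entry
                alter_test                 <- one child per table; value is its CREATE query
        replicas/
            replica1                       <- value is the last log entry this replica executed

Each replica also keeps the same "last executed entry" in a .last_entry file next to its local metadata, which is what the recovery branch of the constructor compares against the replicas/ node.
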
diff --git a/src/Databases/DatabasesCommon.cpp b/src/Databases/DatabasesCommon.cpp index 7925d812241..4575e6da953 100644 --- a/src/Databases/DatabasesCommon.cpp +++ b/src/Databases/DatabasesCommon.cpp @@ -78,7 +78,7 @@ StoragePtr DatabaseWithOwnTablesBase::detachTableUnlocked(const String & table_n auto table_id = res->getStorageID(); if (table_id.hasUUID()) { - assert(getDatabaseName() == DatabaseCatalog::TEMPORARY_DATABASE || getEngineName() == "Atomic"); + assert(getDatabaseName() == DatabaseCatalog::TEMPORARY_DATABASE || getEngineName() == "Atomic" || getEngineName() == "Replicated"); DatabaseCatalog::instance().removeUUIDMapping(table_id.uuid); } @@ -120,7 +120,7 @@ void DatabaseWithOwnTablesBase::shutdown() kv.second->shutdown(); if (table_id.hasUUID()) { - assert(getDatabaseName() == DatabaseCatalog::TEMPORARY_DATABASE || getEngineName() == "Atomic"); + assert(getDatabaseName() == DatabaseCatalog::TEMPORARY_DATABASE || getEngineName() == "Atomic" || getEngineName() == "Replicated"); DatabaseCatalog::instance().removeUUIDMapping(table_id.uuid); } } From 82f5281cfe52ce4643ced3b4ad3f2c229b894014 Mon Sep 17 00:00:00 2001 From: Val Date: Sun, 21 Jun 2020 18:03:04 +0300 Subject: [PATCH 033/381] remove redundant includes --- src/Databases/DatabaseReplicated.cpp | 28 ---------------------------- src/Databases/DatabaseReplicated.h | 4 ---- 2 files changed, 32 deletions(-) diff --git a/src/Databases/DatabaseReplicated.cpp b/src/Databases/DatabaseReplicated.cpp index 4d16a5d05c0..5a42edd9f0d 100644 --- a/src/Databases/DatabaseReplicated.cpp +++ b/src/Databases/DatabaseReplicated.cpp @@ -1,46 +1,18 @@ -#include - -#include -#include -#include #include -#include #include #include #include #include #include #include -#include #include -#include -#include -#include -#include -#include -#include -#include - #include - -#include -#include -#include -#include -#include -#include -#include -#include -#include #include - #include #include #include #include -#include -#include namespace DB { diff --git a/src/Databases/DatabaseReplicated.h b/src/Databases/DatabaseReplicated.h index 471365361b7..ab7b596eb4e 100644 --- a/src/Databases/DatabaseReplicated.h +++ b/src/Databases/DatabaseReplicated.h @@ -1,13 +1,9 @@ #pragma once #include -#include #include #include -#include -#include - namespace DB { /** DatabaseReplicated engine From 67588edcf5c5fea7e29958329b38b6d3db2b9d0f Mon Sep 17 00:00:00 2001 From: Val Date: Mon, 22 Jun 2020 17:19:26 +0300 Subject: [PATCH 034/381] clean up db replicated files and add more tests --- src/Databases/DatabaseReplicated.cpp | 39 +++++---- src/Databases/DatabaseReplicated.h | 2 +- .../test_replicated_database/test.py | 81 ++++++++++--------- 3 files changed, 65 insertions(+), 57 deletions(-) diff --git a/src/Databases/DatabaseReplicated.cpp b/src/Databases/DatabaseReplicated.cpp index 5a42edd9f0d..6a137a2af0c 100644 --- a/src/Databases/DatabaseReplicated.cpp +++ b/src/Databases/DatabaseReplicated.cpp @@ -21,7 +21,7 @@ namespace DB namespace ErrorCodes { extern const int NO_ZOOKEEPER; - extern const int FILE_DOESNT_EXIST; + extern const int LOGICAL_ERROR; } void DatabaseReplicated::setZooKeeper(zkutil::ZooKeeperPtr zookeeper) @@ -74,6 +74,8 @@ DatabaseReplicated::DatabaseReplicated( createDatabaseZKNodes(); // Old replica recovery } else if (current_zookeeper->exists(zookeeper_path + "/replicas/" + replica_name, {}, NULL)) { + String remote_last_entry = current_zookeeper->get(zookeeper_path + "/replicas/" + replica_name, {}, NULL); + String local_last_entry; try { @@ -82,28 
+84,21 @@ DatabaseReplicated::DatabaseReplicated( } catch (const Exception & e) { - if (e.code() == ErrorCodes::FILE_DOESNT_EXIST) { - // that is risky cause - // if replica name is the same - // than the last one wins - saveState(); - } else { - throw; - } + // Metadata is corrupted. + // Replica erases the previous zk last executed log entry + // and behaves like a new clean replica. + writeLastExecutedToDiskAndZK(); } - String remote_last_entry = current_zookeeper->get(zookeeper_path + "/replicas/" + replica_name, {}, NULL); - if (local_last_entry == remote_last_entry) { + if (!local_last_entry.empty() && local_last_entry == remote_last_entry) { last_executed_log_entry = local_last_entry; } else { - LOG_DEBUG(log, "LOCAL: " << local_last_entry); - LOG_DEBUG(log, "ZK: " << remote_last_entry); - throw Exception("Can't create replicated database MISCONFIGURATION or something", ErrorCodes::NO_ZOOKEEPER); + throw Exception("Replica name might be in use by a different node. Please check replica_name parameter. Remove .last_entry file from metadata to create a new replica.", ErrorCodes::LOGICAL_ERROR); } } snapshot_period = context_.getConfigRef().getInt("database_replicated_snapshot_period", 10); - LOG_DEBUG(log, "Snapshot period is set to " << snapshot_period); + LOG_DEBUG(log, "Snapshot period is set to " << snapshot_period << " log entries per one snapshot"); background_log_executor = global_context.getReplicatedSchedulePool().createTask(database_name + "(DatabaseReplicated::background_executor)", [this]{ runBackgroundLogExecutor();} ); @@ -176,12 +171,12 @@ void DatabaseReplicated::runBackgroundLogExecutor() { String log_entry_path = zookeeper_path + "/log/" + log_entry_name; executeFromZK(log_entry_path); last_executed_log_entry = log_entry_name; - saveState(); + writeLastExecutedToDiskAndZK(); int log_n = parse(log_entry_name.substr(4)); int last_log_n = parse(log_entry_names.back().substr(4)); - // The third condition gurantees at most one snapshot per batch + // The third condition gurantees at most one snapshot creation per batch if (log_n > 0 && snapshot_period > 0 && (last_log_n - log_n) / snapshot_period == 0 && log_n % snapshot_period == 0) { createSnapshot(); } @@ -190,7 +185,7 @@ void DatabaseReplicated::runBackgroundLogExecutor() { background_log_executor->scheduleAfter(500); } -void DatabaseReplicated::saveState() { +void DatabaseReplicated::writeLastExecutedToDiskAndZK() { current_zookeeper = getZooKeeper(); current_zookeeper->createOrUpdate(zookeeper_path + "/replicas/" + replica_name, last_executed_log_entry, zkutil::CreateMode::Persistent); @@ -230,7 +225,7 @@ void DatabaseReplicated::executeFromZK(String & path) { void DatabaseReplicated::propose(const ASTPtr & query) { current_zookeeper = getZooKeeper(); - LOG_DEBUG(log, "Writing the query to log: " << queryToString(query)); + LOG_DEBUG(log, "Proposing query: " << queryToString(query)); current_zookeeper->create(zookeeper_path + "/log/log-", queryToString(query), zkutil::CreateMode::PersistentSequential); background_log_executor->schedule(); @@ -255,6 +250,8 @@ void DatabaseReplicated::createSnapshot() { } void DatabaseReplicated::loadMetadataFromSnapshot() { + // Executes the latest snapshot. + // Used by new replicas only. 
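    // Rough bootstrap sequence for a brand-new replica, as implied by the
    // surrounding code (a sketch, not a guaranteed contract):
    //   1. last_executed_log_entry is empty, so the first run of
    //      runBackgroundLogExecutor() calls loadMetadataFromSnapshot().
    //   2. The newest snapshot under <zookeeper_path>/snapshots is applied by
    //      running every stored CREATE query through executeFromZK().
    //   3. last_executed_log_entry is set to that snapshot's name and persisted,
    //      so the executor continues from the log entries that follow it instead
    //      of replaying the whole history.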
current_zookeeper = getZooKeeper(); Strings snapshots; @@ -270,13 +267,15 @@ void DatabaseReplicated::loadMetadataFromSnapshot() { if (current_zookeeper->tryGetChildren(zookeeper_path + "/snapshots/" + *latest_snapshot, metadatas) != Coordination::ZOK) return; + LOG_DEBUG(log, "Executing " << *latest_snapshot << " snapshot"); for (auto t = metadatas.begin(); t != metadatas.end(); ++t) { String path = zookeeper_path + "/snapshots/" + *latest_snapshot + "/" + *t; + executeFromZK(path); } last_executed_log_entry = *latest_snapshot; - saveState(); + writeLastExecutedToDiskAndZK(); } void DatabaseReplicated::drop(const Context & context_) diff --git a/src/Databases/DatabaseReplicated.h b/src/Databases/DatabaseReplicated.h index ab7b596eb4e..1cdcc3e990c 100644 --- a/src/Databases/DatabaseReplicated.h +++ b/src/Databases/DatabaseReplicated.h @@ -49,7 +49,7 @@ private: void executeFromZK(String & path); - void saveState(); + void writeLastExecutedToDiskAndZK(); void loadMetadataFromSnapshot(); void createSnapshot(); diff --git a/tests/integration/test_replicated_database/test.py b/tests/integration/test_replicated_database/test.py index b557354b6ba..0b7f8aadec2 100644 --- a/tests/integration/test_replicated_database/test.py +++ b/tests/integration/test_replicated_database/test.py @@ -10,16 +10,18 @@ logging.getLogger().addHandler(logging.StreamHandler()) cluster = ClickHouseCluster(__file__) -node1 = cluster.add_instance('node1', main_configs=['configs/disable_snapshots.xml'], with_zookeeper=True, stay_alive=True) -node2 = cluster.add_instance('node2', main_configs=['configs/disable_snapshots.xml'], with_zookeeper=True) -node3 = cluster.add_instance('node3', main_configs=['configs/disable_snapshots.xml'], with_zookeeper=True) +main_node = cluster.add_instance('main_node', main_configs=['configs/disable_snapshots.xml'], with_zookeeper=True, stay_alive=True) +dummy_node = cluster.add_instance('dummy_node', main_configs=['configs/disable_snapshots.xml'], with_zookeeper=True) +competing_node = cluster.add_instance('competing_node', main_configs=['configs/disable_snapshots.xml'], with_zookeeper=True) +snapshotting_node = cluster.add_instance('snapshotting_node', main_configs=['configs/snapshot_each_query.xml'], with_zookeeper=True) +snapshot_recovering_node = cluster.add_instance('snapshot_recovering_node', main_configs=['configs/disable_snapshots.xml'], with_zookeeper=True) @pytest.fixture(scope="module") def started_cluster(): try: cluster.start() - node1.query("CREATE DATABASE testdb ENGINE = Replicated('/clickhouse/databases/test1', 'replica1');") - node2.query("CREATE DATABASE testdb ENGINE = Replicated('/clickhouse/databases/test1', 'replica2');") + main_node.query("CREATE DATABASE testdb ENGINE = Replicated('/clickhouse/databases/test1', 'replica1');") + dummy_node.query("CREATE DATABASE testdb ENGINE = Replicated('/clickhouse/databases/test1', 'replica2');") yield cluster finally: @@ -28,67 +30,74 @@ def started_cluster(): def test_create_replicated_table(started_cluster): DURATION_SECONDS = 1 - node1.query("CREATE TABLE testdb.replicated_table (d Date, k UInt64, i32 Int32) ENGINE=ReplicatedMergeTree(d, k, 8192);") + main_node.query("CREATE TABLE testdb.replicated_table (d Date, k UInt64, i32 Int32) ENGINE=ReplicatedMergeTree(d, k, 8192);") time.sleep(DURATION_SECONDS) - assert node1.query("desc table testdb.replicated_table") == node2.query("desc table testdb.replicated_table") + assert main_node.query("desc table testdb.replicated_table") == dummy_node.query("desc table 
testdb.replicated_table") def test_simple_alter_table(started_cluster): DURATION_SECONDS = 1 - node1.query("CREATE TABLE testdb.alter_test (CounterID UInt32, StartDate Date, UserID UInt32, VisitID UInt32, NestedColumn Nested(A UInt8, S String), ToDrop UInt32) ENGINE = MergeTree(StartDate, intHash32(UserID), (CounterID, StartDate, intHash32(UserID), VisitID), 8192);") - node1.query("ALTER TABLE testdb.alter_test ADD COLUMN Added0 UInt32;") - node1.query("ALTER TABLE testdb.alter_test ADD COLUMN Added2 UInt32;") - node1.query("ALTER TABLE testdb.alter_test ADD COLUMN Added1 UInt32 AFTER Added0;") - node1.query("ALTER TABLE testdb.alter_test ADD COLUMN AddedNested1 Nested(A UInt32, B UInt64) AFTER Added2;") - node1.query("ALTER TABLE testdb.alter_test ADD COLUMN AddedNested1.C Array(String) AFTER AddedNested1.B;") - node1.query("ALTER TABLE testdb.alter_test ADD COLUMN AddedNested2 Nested(A UInt32, B UInt64) AFTER AddedNested1;") + main_node.query("CREATE TABLE testdb.alter_test (CounterID UInt32, StartDate Date, UserID UInt32, VisitID UInt32, NestedColumn Nested(A UInt8, S String), ToDrop UInt32) ENGINE = MergeTree(StartDate, intHash32(UserID), (CounterID, StartDate, intHash32(UserID), VisitID), 8192);") + main_node.query("ALTER TABLE testdb.alter_test ADD COLUMN Added0 UInt32;") + main_node.query("ALTER TABLE testdb.alter_test ADD COLUMN Added2 UInt32;") + main_node.query("ALTER TABLE testdb.alter_test ADD COLUMN Added1 UInt32 AFTER Added0;") + main_node.query("ALTER TABLE testdb.alter_test ADD COLUMN AddedNested1 Nested(A UInt32, B UInt64) AFTER Added2;") + main_node.query("ALTER TABLE testdb.alter_test ADD COLUMN AddedNested1.C Array(String) AFTER AddedNested1.B;") + main_node.query("ALTER TABLE testdb.alter_test ADD COLUMN AddedNested2 Nested(A UInt32, B UInt64) AFTER AddedNested1;") time.sleep(DURATION_SECONDS) - assert node1.query("desc table testdb.alter_test") == node2.query("desc table testdb.alter_test") + assert main_node.query("desc table testdb.alter_test") == dummy_node.query("desc table testdb.alter_test") def test_create_replica_after_delay(started_cluster): - node3.query("CREATE DATABASE testdb ENGINE = Replicated('/clickhouse/databases/test1', 'replica3');") + competing_node.query("CREATE DATABASE testdb ENGINE = Replicated('/clickhouse/databases/test1', 'replica3');") - node1.query("ALTER TABLE testdb.alter_test ADD COLUMN Added3 UInt32 ;") - node1.query("ALTER TABLE testdb.alter_test ADD COLUMN Added4 UInt32 ;") - node1.query("ALTER TABLE testdb.alter_test ADD COLUMN Added5 UInt32 ;") + main_node.query("ALTER TABLE testdb.alter_test ADD COLUMN Added3 UInt32 ;") + main_node.query("ALTER TABLE testdb.alter_test ADD COLUMN Added4 UInt32 ;") + main_node.query("ALTER TABLE testdb.alter_test ADD COLUMN Added5 UInt32 ;") time.sleep(6) - assert node3.query("desc table testdb.alter_test") == node1.query("desc table testdb.alter_test") + assert competing_node.query("desc table testdb.alter_test") == main_node.query("desc table testdb.alter_test") def test_alters_from_different_replicas(started_cluster): DURATION_SECONDS = 1 - node1.query("CREATE TABLE testdb.concurrent_test (CounterID UInt32, StartDate Date, UserID UInt32, VisitID UInt32, NestedColumn Nested(A UInt8, S String), ToDrop UInt32) ENGINE = MergeTree(StartDate, intHash32(UserID), (CounterID, StartDate, intHash32(UserID), VisitID), 8192);") + main_node.query("CREATE TABLE testdb.concurrent_test (CounterID UInt32, StartDate Date, UserID UInt32, VisitID UInt32, NestedColumn Nested(A UInt8, S String), ToDrop UInt32) ENGINE = 
MergeTree(StartDate, intHash32(UserID), (CounterID, StartDate, intHash32(UserID), VisitID), 8192);") time.sleep(DURATION_SECONDS) - node3.query("ALTER TABLE testdb.concurrent_test ADD COLUMN Added0 UInt32;") - node1.query("ALTER TABLE testdb.concurrent_test ADD COLUMN Added2 UInt32;") - node3.query("ALTER TABLE testdb.concurrent_test ADD COLUMN Added1 UInt32 AFTER Added0;") - node1.query("ALTER TABLE testdb.concurrent_test ADD COLUMN AddedNested1 Nested(A UInt32, B UInt64) AFTER Added2;") - node3.query("ALTER TABLE testdb.concurrent_test ADD COLUMN AddedNested1.C Array(String) AFTER AddedNested1.B;") - node1.query("ALTER TABLE testdb.concurrent_test ADD COLUMN AddedNested2 Nested(A UInt32, B UInt64) AFTER AddedNested1;") + competing_node.query("ALTER TABLE testdb.concurrent_test ADD COLUMN Added0 UInt32;") + main_node.query("ALTER TABLE testdb.concurrent_test ADD COLUMN Added2 UInt32;") + competing_node.query("ALTER TABLE testdb.concurrent_test ADD COLUMN Added1 UInt32 AFTER Added0;") + main_node.query("ALTER TABLE testdb.concurrent_test ADD COLUMN AddedNested1 Nested(A UInt32, B UInt64) AFTER Added2;") + competing_node.query("ALTER TABLE testdb.concurrent_test ADD COLUMN AddedNested1.C Array(String) AFTER AddedNested1.B;") + main_node.query("ALTER TABLE testdb.concurrent_test ADD COLUMN AddedNested2 Nested(A UInt32, B UInt64) AFTER AddedNested1;") time.sleep(DURATION_SECONDS) - assert node3.query("desc table testdb.concurrent_test") == node1.query("desc table testdb.concurrent_test") + assert competing_node.query("desc table testdb.concurrent_test") == main_node.query("desc table testdb.concurrent_test") def test_drop_and_create_table(started_cluster): - node1.query("DROP TABLE testdb.concurrent_test") - node1.query("CREATE TABLE testdb.concurrent_test (CounterID UInt32, StartDate Date, UserID UInt32, VisitID UInt32, NestedColumn Nested(A UInt8, S String), ToDrop UInt32) ENGINE = MergeTree(StartDate, intHash32(UserID), (CounterID, StartDate, intHash32(UserID), VisitID), 8192);") + main_node.query("DROP TABLE testdb.concurrent_test") + main_node.query("CREATE TABLE testdb.concurrent_test (CounterID UInt32, StartDate Date, UserID UInt32, VisitID UInt32, NestedColumn Nested(A UInt8, S String), ToDrop UInt32) ENGINE = MergeTree(StartDate, intHash32(UserID), (CounterID, StartDate, intHash32(UserID), VisitID), 8192);") time.sleep(5) - assert node3.query("desc table testdb.concurrent_test") == node1.query("desc table testdb.concurrent_test") + assert competing_node.query("desc table testdb.concurrent_test") == main_node.query("desc table testdb.concurrent_test") def test_replica_restart(started_cluster): - node1.restart_clickhouse() + main_node.restart_clickhouse() time.sleep(5) - assert node3.query("desc table testdb.concurrent_test") == node1.query("desc table testdb.concurrent_test") + assert competing_node.query("desc table testdb.concurrent_test") == main_node.query("desc table testdb.concurrent_test") + +def test_snapshot_and_snapshot_recover(started_cluster): + snapshotting_node.query("CREATE DATABASE testdb ENGINE = Replicated('/clickhouse/databases/test1', 'replica4');") + time.sleep(5) + snapshot_recovering_node.query("CREATE DATABASE testdb ENGINE = Replicated('/clickhouse/databases/test1', 'replica5');") + time.sleep(5) + assert snapshotting_node.query("desc table testdb.alter_test") == snapshot_recovering_node.query("desc table testdb.alter_test") #def test_drop_and_create_replica(started_cluster): -# node1.query("DROP DATABASE testdb") -# node1.query("CREATE DATABASE testdb ENGINE 
= Replicated('/clickhouse/databases/test1', 'replica1');") +# main_node.query("DROP DATABASE testdb") +# main_node.query("CREATE DATABASE testdb ENGINE = Replicated('/clickhouse/databases/test1', 'replica1');") # time.sleep(6) -# assert node3.query("desc table testdb.concurrent_test") == node1.query("desc table testdb.concurrent_test") +# assert competing_node.query("desc table testdb.concurrent_test") == main_node.query("desc table testdb.concurrent_test") From 16e50e33d76f4c4e4ccd167f2354c41782fcf76a Mon Sep 17 00:00:00 2001 From: Val Date: Mon, 22 Jun 2020 17:22:26 +0300 Subject: [PATCH 035/381] fix typo --- src/Databases/DatabaseReplicated.cpp | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/src/Databases/DatabaseReplicated.cpp b/src/Databases/DatabaseReplicated.cpp index 6a137a2af0c..bf974901e41 100644 --- a/src/Databases/DatabaseReplicated.cpp +++ b/src/Databases/DatabaseReplicated.cpp @@ -121,11 +121,11 @@ void DatabaseReplicated::RemoveOutdatedSnapshotsAndLog() { // that no longer will be in use by current replicas or // new coming ones. // Each registered replica has its state in ZooKeeper. - // Therefore removed snapshots and logged queries are less - // than a least advanced replica. + // Therefore, snapshots and logged queries that are less + // than a least advanced replica are removed. // It does not interfere with a new coming replica // metadata loading from snapshot - // because the replica will use the last snapshot available + // because the replica will use the latest snapshot available // and this snapshot will set the last executed log query // to a greater one than the least advanced current replica. current_zookeeper = getZooKeeper(); From d293e002a7251f58eee5601749169435d25136ba Mon Sep 17 00:00:00 2001 From: Val Date: Wed, 24 Jun 2020 15:45:42 +0300 Subject: [PATCH 036/381] address pr comments --- src/Databases/DatabaseReplicated.cpp | 24 +++++++++++++++------ src/Interpreters/InterpreterCreateQuery.cpp | 2 +- 2 files changed, 19 insertions(+), 7 deletions(-) diff --git a/src/Databases/DatabaseReplicated.cpp b/src/Databases/DatabaseReplicated.cpp index bf974901e41..adfd28f8914 100644 --- a/src/Databases/DatabaseReplicated.cpp +++ b/src/Databases/DatabaseReplicated.cpp @@ -22,6 +22,7 @@ namespace ErrorCodes { extern const int NO_ZOOKEEPER; extern const int LOGICAL_ERROR; + extern const int BAD_ARGUMENTS; } void DatabaseReplicated::setZooKeeper(zkutil::ZooKeeperPtr zookeeper) @@ -55,10 +56,14 @@ DatabaseReplicated::DatabaseReplicated( , zookeeper_path(zookeeper_path_) , replica_name(replica_name_) { - if (!zookeeper_path.empty() && zookeeper_path.back() == '/') + if (zookeeper_path.empty() || replica_name.empty()) { + throw Exception("ZooKeeper path and replica name must be non-empty", ErrorCodes::BAD_ARGUMENTS); + } + + if (zookeeper_path.back() == '/') zookeeper_path.resize(zookeeper_path.size() - 1); // If zookeeper chroot prefix is used, path should start with '/', because chroot concatenates without it. 
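    // For example (a made-up input, shown only to illustrate the two fix-ups in
    // this constructor): "clickhouse/databases/test1/" becomes
    // "/clickhouse/databases/test1" -- the trailing '/' is stripped above and a
    // leading '/' is added below, so the path also composes correctly with a
    // ZooKeeper chroot prefix.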
- if (!zookeeper_path.empty() && zookeeper_path.front() != '/') + if (zookeeper_path.front() != '/') zookeeper_path = "/" + zookeeper_path; if (context_.hasZooKeeper()) { @@ -70,10 +75,10 @@ DatabaseReplicated::DatabaseReplicated( } // New database - if (!current_zookeeper->exists(zookeeper_path, {}, NULL)) { + if (!current_zookeeper->exists(zookeeper_path)) { createDatabaseZKNodes(); // Old replica recovery - } else if (current_zookeeper->exists(zookeeper_path + "/replicas/" + replica_name, {}, NULL)) { + } else if (current_zookeeper->exists(zookeeper_path + "/replicas/" + replica_name)) { String remote_last_entry = current_zookeeper->get(zookeeper_path + "/replicas/" + replica_name, {}, NULL); String local_last_entry; @@ -243,8 +248,9 @@ void DatabaseReplicated::createSnapshot() { String table_name = iterator->name(); auto query = getCreateQueryFromMetadata(getObjectMetadataPath(table_name), true); String statement = queryToString(query); - current_zookeeper->createOrUpdate(snapshot_path + "/" + table_name, statement, zkutil::CreateMode::Persistent); + current_zookeeper->createIfNotExists(snapshot_path + "/" + table_name, statement); } + current_zookeeper->createIfNotExists(snapshot_path + "/.completed", String()); RemoveOutdatedSnapshotsAndLog(); } @@ -258,11 +264,17 @@ void DatabaseReplicated::loadMetadataFromSnapshot() { if (current_zookeeper->tryGetChildren(zookeeper_path + "/snapshots", snapshots) != Coordination::ZOK) return; + auto latest_snapshot = std::max_element(snapshots.begin(), snapshots.end()); + while (snapshots.size() > 0 && !current_zookeeper->exists(zookeeper_path + "/snapshots/" + *latest_snapshot + "/.completed")) { + snapshots.erase(latest_snapshot); + latest_snapshot = std::max_element(snapshots.begin(), snapshots.end()); + } + if (snapshots.size() < 1) { return; } - auto latest_snapshot = std::max_element(snapshots.begin(), snapshots.end()); + Strings metadatas; if (current_zookeeper->tryGetChildren(zookeeper_path + "/snapshots/" + *latest_snapshot, metadatas) != Coordination::ZOK) return; diff --git a/src/Interpreters/InterpreterCreateQuery.cpp b/src/Interpreters/InterpreterCreateQuery.cpp index 6806679cb4d..9d3abf2c8a6 100644 --- a/src/Interpreters/InterpreterCreateQuery.cpp +++ b/src/Interpreters/InterpreterCreateQuery.cpp @@ -640,7 +640,7 @@ bool InterpreterCreateQuery::doCreateTable(ASTCreateQuery & create, } else if (database->getEngineName() == "Replicated" && context.getClientInfo().query_kind == ClientInfo::QueryKind::REPLICATED_LOG_QUERY) { if (create.uuid == UUIDHelpers::Nil) - throw Exception("Table UUID is not specified in DDL log", ErrorCodes::INCORRECT_QUERY); + throw Exception("Table UUID is not specified in DDL log", ErrorCodes::LOGICAL_ERROR); } else { From 9635ea64bed93a587a147a21fbeda27cc08cf43d Mon Sep 17 00:00:00 2001 From: Val Date: Wed, 24 Jun 2020 15:50:23 +0300 Subject: [PATCH 037/381] Add desc of propose idatabase method --- src/Databases/IDatabase.h | 1 + 1 file changed, 1 insertion(+) diff --git a/src/Databases/IDatabase.h b/src/Databases/IDatabase.h index 5b3003f36b4..b80e73be108 100644 --- a/src/Databases/IDatabase.h +++ b/src/Databases/IDatabase.h @@ -161,6 +161,7 @@ public: /// Is the database empty. virtual bool empty() const = 0; + /// Submit query to log. Currently used by DatabaseReplicated engine only. 
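    /// A sketch of the intended call pattern, condensed from the interpreter and
    /// DatabaseReplicated changes elsewhere in this series (it is not part of
    /// this one-line patch):
    ///   1. An interpreter that sees a DDL query on a Replicated database, where
    ///      the query did not arrive from the log, calls
    ///      database->propose(query_ptr) instead of executing it.
    ///   2. DatabaseReplicated::propose() appends the query text to ZooKeeper:
    ///        current_zookeeper->create(zookeeper_path + "/log/log-",
    ///            queryToString(query), zkutil::CreateMode::PersistentSequential);
    ///   3. The background executor on every replica later picks the entry up and
    ///      runs it with query_kind set to REPLICATED_LOG_QUERY, so it is not
    ///      proposed again.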
virtual void propose(const ASTPtr & /*query*/) { throw Exception(getEngineName() + ": propose() is not supported", ErrorCodes::NOT_IMPLEMENTED); } From dde293fc3d10470bbe65b5ef4f58a5c2cd2d851e Mon Sep 17 00:00:00 2001 From: Val Date: Wed, 24 Jun 2020 16:37:29 +0300 Subject: [PATCH 038/381] check schema after alters in test --- .../test_replicated_database/test.py | 22 +++++++++++++++++++ 1 file changed, 22 insertions(+) diff --git a/tests/integration/test_replicated_database/test.py b/tests/integration/test_replicated_database/test.py index 0b7f8aadec2..346114cb8c4 100644 --- a/tests/integration/test_replicated_database/test.py +++ b/tests/integration/test_replicated_database/test.py @@ -46,6 +46,28 @@ def test_simple_alter_table(started_cluster): main_node.query("ALTER TABLE testdb.alter_test ADD COLUMN AddedNested2 Nested(A UInt32, B UInt64) AFTER AddedNested1;") time.sleep(DURATION_SECONDS) + + schema = main_node.query("show create table testdb.alter_test") + fields = [ + "`CounterID`", + "`StartDate`", + "`UserID`", + "`VisitID`", + "`NestedColumn.A`", + "`NestedColumn.S`", + "`ToDrop`", + "`Added0`", + "`Added1`", + "`Added2`", + "`AddedNested1.A`", + "`AddedNested1.B`", + "`AddedNested1.C`", + "`AddedNested2.A`", + "`AddedNested2.B`"] + + for field in fields: + assert field in schema + assert main_node.query("desc table testdb.alter_test") == dummy_node.query("desc table testdb.alter_test") def test_create_replica_after_delay(started_cluster): From e23c7a313eaafa174b3e0404469c152c1ff08c00 Mon Sep 17 00:00:00 2001 From: Val Date: Fri, 26 Jun 2020 17:05:27 +0300 Subject: [PATCH 039/381] address pr comments --- src/Databases/DatabaseOnDisk.h | 2 +- src/Databases/DatabaseReplicated.cpp | 15 ++++++++------- 2 files changed, 9 insertions(+), 8 deletions(-) diff --git a/src/Databases/DatabaseOnDisk.h b/src/Databases/DatabaseOnDisk.h index dc347c99542..00689900edf 100644 --- a/src/Databases/DatabaseOnDisk.h +++ b/src/Databases/DatabaseOnDisk.h @@ -86,7 +86,7 @@ protected: const String metadata_path; const String data_path; - Context & global_context; + const Context & global_context; }; } diff --git a/src/Databases/DatabaseReplicated.cpp b/src/Databases/DatabaseReplicated.cpp index adfd28f8914..0ddc976d8d0 100644 --- a/src/Databases/DatabaseReplicated.cpp +++ b/src/Databases/DatabaseReplicated.cpp @@ -105,7 +105,7 @@ DatabaseReplicated::DatabaseReplicated( snapshot_period = context_.getConfigRef().getInt("database_replicated_snapshot_period", 10); LOG_DEBUG(log, "Snapshot period is set to " << snapshot_period << " log entries per one snapshot"); - background_log_executor = global_context.getReplicatedSchedulePool().createTask(database_name + "(DatabaseReplicated::background_executor)", [this]{ runBackgroundLogExecutor();} ); + background_log_executor = context_.getReplicatedSchedulePool().createTask(database_name + "(DatabaseReplicated::background_executor)", [this]{ runBackgroundLogExecutor();} ); background_log_executor->scheduleAfter(500); } @@ -206,9 +206,9 @@ void DatabaseReplicated::writeLastExecutedToDiskAndZK() { void DatabaseReplicated::executeFromZK(String & path) { current_zookeeper = getZooKeeper(); String query_to_execute = current_zookeeper->get(path, {}, NULL); - ReadBufferFromString istr(query_to_execute); - String dummy_string; - WriteBufferFromString ostr(dummy_string); + //ReadBufferFromString istr(query_to_execute); + //String dummy_string; + //WriteBufferFromString ostr(dummy_string); try { @@ -216,7 +216,8 @@ void DatabaseReplicated::executeFromZK(String & path) { 
current_context->getClientInfo().query_kind = ClientInfo::QueryKind::REPLICATED_LOG_QUERY; current_context->setCurrentDatabase(database_name); current_context->setCurrentQueryId(""); // generate random query_id - executeQuery(istr, ostr, false, *current_context, {}); + //executeQuery(istr, ostr, false, *current_context, {}); + executeQuery(query_to_execute, *current_context); } catch (...) { @@ -248,9 +249,9 @@ void DatabaseReplicated::createSnapshot() { String table_name = iterator->name(); auto query = getCreateQueryFromMetadata(getObjectMetadataPath(table_name), true); String statement = queryToString(query); - current_zookeeper->createIfNotExists(snapshot_path + "/" + table_name, statement); + current_zookeeper->create(snapshot_path + "/" + table_name, statement, zkutil::CreateMode::Persistent); } - current_zookeeper->createIfNotExists(snapshot_path + "/.completed", String()); + current_zookeeper->create(snapshot_path + "/.completed", String(), zkutil::CreateMode::Persistent); RemoveOutdatedSnapshotsAndLog(); } From 8273248c4e3cc8431ee30b71729a9da369f54a7a Mon Sep 17 00:00:00 2001 From: Val Date: Sat, 27 Jun 2020 16:39:41 +0300 Subject: [PATCH 040/381] add log_name_to_exec to dbreplicated --- src/Databases/DatabaseFactory.cpp | 5 +- src/Databases/DatabaseOnDisk.cpp | 2 +- src/Databases/DatabaseOnDisk.h | 1 - src/Databases/DatabaseOrdinary.cpp | 2 +- src/Databases/DatabaseReplicated.cpp | 47 ++++++++++++------- src/Databases/DatabaseReplicated.h | 9 +++- src/Interpreters/InterpreterDropQuery.cpp | 3 +- .../MergeTree/registerStorageMergeTree.cpp | 8 ---- tests/integration/runner | 4 +- 9 files changed, 46 insertions(+), 35 deletions(-) diff --git a/src/Databases/DatabaseFactory.cpp b/src/Databases/DatabaseFactory.cpp index 0d7a711b530..752eeba4e81 100644 --- a/src/Databases/DatabaseFactory.cpp +++ b/src/Databases/DatabaseFactory.cpp @@ -148,8 +148,9 @@ DatabasePtr DatabaseFactory::getImpl( const auto & arguments = engine->arguments->children; - const auto zoo_path = arguments[0]->as()->value.safeGet(); - const auto replica_name = arguments[1]->as()->value.safeGet(); + const auto & zoo_path = safeGetLiteralValue(arguments[0], "Replicated"); + const auto & replica_name = safeGetLiteralValue(arguments[1], "Replicated"); + return std::make_shared(database_name, metadata_path, zoo_path, replica_name, context); } diff --git a/src/Databases/DatabaseOnDisk.cpp b/src/Databases/DatabaseOnDisk.cpp index 0a16b6eacff..6c72773fb69 100644 --- a/src/Databases/DatabaseOnDisk.cpp +++ b/src/Databases/DatabaseOnDisk.cpp @@ -127,7 +127,7 @@ DatabaseOnDisk::DatabaseOnDisk( const String & metadata_path_, const String & data_path_, const String & logger, - const Context & context) + Context & context) : DatabaseWithOwnTablesBase(name, logger, context) , metadata_path(metadata_path_) , data_path(data_path_) diff --git a/src/Databases/DatabaseOnDisk.h b/src/Databases/DatabaseOnDisk.h index 00689900edf..4e7b2ab1709 100644 --- a/src/Databases/DatabaseOnDisk.h +++ b/src/Databases/DatabaseOnDisk.h @@ -86,7 +86,6 @@ protected: const String metadata_path; const String data_path; - const Context & global_context; }; } diff --git a/src/Databases/DatabaseOrdinary.cpp b/src/Databases/DatabaseOrdinary.cpp index 2f4f584b091..69fbbce8b7d 100644 --- a/src/Databases/DatabaseOrdinary.cpp +++ b/src/Databases/DatabaseOrdinary.cpp @@ -100,7 +100,7 @@ DatabaseOrdinary::DatabaseOrdinary(const String & name_, const String & metadata } DatabaseOrdinary::DatabaseOrdinary( - const String & name_, const String & metadata_path_, const 
String & data_path_, const String & logger, const Context & context_) + const String & name_, const String & metadata_path_, const String & data_path_, const String & logger, Context & context_) : DatabaseWithDictionaries(name_, metadata_path_, data_path_, logger, context_) { } diff --git a/src/Databases/DatabaseReplicated.cpp b/src/Databases/DatabaseReplicated.cpp index 0ddc976d8d0..47298996236 100644 --- a/src/Databases/DatabaseReplicated.cpp +++ b/src/Databases/DatabaseReplicated.cpp @@ -13,6 +13,8 @@ #include #include +#include + namespace DB { @@ -103,13 +105,15 @@ DatabaseReplicated::DatabaseReplicated( } snapshot_period = context_.getConfigRef().getInt("database_replicated_snapshot_period", 10); - LOG_DEBUG(log, "Snapshot period is set to " << snapshot_period << " log entries per one snapshot"); + LOG_DEBUG(log, "Snapshot period is set to {} log entries per one snapshot", snapshot_period); background_log_executor = context_.getReplicatedSchedulePool().createTask(database_name + "(DatabaseReplicated::background_executor)", [this]{ runBackgroundLogExecutor();} ); background_log_executor->scheduleAfter(500); } +DatabaseReplicated::~DatabaseReplicated() = default; + void DatabaseReplicated::createDatabaseZKNodes() { current_zookeeper = getZooKeeper(); @@ -174,7 +178,13 @@ void DatabaseReplicated::runBackgroundLogExecutor() { for (const String & log_entry_name : log_entry_names) { String log_entry_path = zookeeper_path + "/log/" + log_entry_name; - executeFromZK(log_entry_path); + bool yield = false; + { + std::lock_guard lock(log_name_mutex); + if (log_name_to_exec_with_result == log_entry_name) + yield = true; + } + executeFromZK(log_entry_path, yield); last_executed_log_entry = log_entry_name; writeLastExecutedToDiskAndZK(); @@ -203,12 +213,9 @@ void DatabaseReplicated::writeLastExecutedToDiskAndZK() { out.close(); } -void DatabaseReplicated::executeFromZK(String & path) { +void DatabaseReplicated::executeFromZK(String & path, bool yield) { current_zookeeper = getZooKeeper(); String query_to_execute = current_zookeeper->get(path, {}, NULL); - //ReadBufferFromString istr(query_to_execute); - //String dummy_string; - //WriteBufferFromString ostr(dummy_string); try { @@ -216,23 +223,29 @@ void DatabaseReplicated::executeFromZK(String & path) { current_context->getClientInfo().query_kind = ClientInfo::QueryKind::REPLICATED_LOG_QUERY; current_context->setCurrentDatabase(database_name); current_context->setCurrentQueryId(""); // generate random query_id - //executeQuery(istr, ostr, false, *current_context, {}); executeQuery(query_to_execute, *current_context); } catch (...) 
{ - tryLogCurrentException(log, "Query from zookeeper " + query_to_execute + " wasn't finished successfully"); + if (yield) + tryLogCurrentException(log, "Query from zookeeper " + query_to_execute + " wasn't finished successfully"); } - LOG_DEBUG(log, "Executed query: " << query_to_execute); + std::lock_guard lock(log_name_mutex); + log_name_to_exec_with_result.clear(); + LOG_DEBUG(log, "Executed query: {}", query_to_execute); } void DatabaseReplicated::propose(const ASTPtr & query) { current_zookeeper = getZooKeeper(); - LOG_DEBUG(log, "Proposing query: " << queryToString(query)); - current_zookeeper->create(zookeeper_path + "/log/log-", queryToString(query), zkutil::CreateMode::PersistentSequential); + LOG_DEBUG(log, "Proposing query: {}", queryToString(query)); + + { + std::lock_guard lock(log_name_mutex); + log_name_to_exec_with_result = current_zookeeper->create(zookeeper_path + "/log/log-", queryToString(query), zkutil::CreateMode::PersistentSequential); + } background_log_executor->schedule(); } @@ -241,11 +254,11 @@ void DatabaseReplicated::createSnapshot() { current_zookeeper = getZooKeeper(); String snapshot_path = zookeeper_path + "/snapshots/" + last_executed_log_entry; - if (Coordination::ZNODEEXISTS == current_zookeeper->tryCreate(snapshot_path, String(), zkutil::CreateMode::Persistent)) { + if (Coordination::Error::ZNODEEXISTS == current_zookeeper->tryCreate(snapshot_path, String(), zkutil::CreateMode::Persistent)) { return; } - for (auto iterator = getTablesIterator({}); iterator->isValid(); iterator->next()) { + for (auto iterator = getTablesIterator(global_context, {}); iterator->isValid(); iterator->next()) { String table_name = iterator->name(); auto query = getCreateQueryFromMetadata(getObjectMetadataPath(table_name), true); String statement = queryToString(query); @@ -262,7 +275,7 @@ void DatabaseReplicated::loadMetadataFromSnapshot() { current_zookeeper = getZooKeeper(); Strings snapshots; - if (current_zookeeper->tryGetChildren(zookeeper_path + "/snapshots", snapshots) != Coordination::ZOK) + if (current_zookeeper->tryGetChildren(zookeeper_path + "/snapshots", snapshots) != Coordination::Error::ZOK) return; auto latest_snapshot = std::max_element(snapshots.begin(), snapshots.end()); @@ -277,14 +290,14 @@ void DatabaseReplicated::loadMetadataFromSnapshot() { Strings metadatas; - if (current_zookeeper->tryGetChildren(zookeeper_path + "/snapshots/" + *latest_snapshot, metadatas) != Coordination::ZOK) + if (current_zookeeper->tryGetChildren(zookeeper_path + "/snapshots/" + *latest_snapshot, metadatas) != Coordination::Error::ZOK) return; - LOG_DEBUG(log, "Executing " << *latest_snapshot << " snapshot"); + LOG_DEBUG(log, "Executing {} snapshot", *latest_snapshot); for (auto t = metadatas.begin(); t != metadatas.end(); ++t) { String path = zookeeper_path + "/snapshots/" + *latest_snapshot + "/" + *t; - executeFromZK(path); + executeFromZK(path, false); } last_executed_log_entry = *latest_snapshot; diff --git a/src/Databases/DatabaseReplicated.h b/src/Databases/DatabaseReplicated.h index 1cdcc3e990c..2aa6c0d9a68 100644 --- a/src/Databases/DatabaseReplicated.h +++ b/src/Databases/DatabaseReplicated.h @@ -4,6 +4,7 @@ #include #include + namespace DB { /** DatabaseReplicated engine @@ -33,6 +34,8 @@ class DatabaseReplicated : public DatabaseAtomic public: DatabaseReplicated(const String & name_, const String & metadata_path_, const String & zookeeper_path_, const String & replica_name_, Context & context); + ~DatabaseReplicated(); + void drop(const Context & /*context*/) 
override; String getEngineName() const override { return "Replicated"; } @@ -47,7 +50,7 @@ private: void runBackgroundLogExecutor(); - void executeFromZK(String & path); + void executeFromZK(String & path, bool yield); void writeLastExecutedToDiskAndZK(); @@ -57,6 +60,10 @@ private: std::unique_ptr current_context; // to run executeQuery + //BlockIO execution_result; + std::mutex log_name_mutex; + String log_name_to_exec_with_result; + int snapshot_period; String last_executed_log_entry = ""; diff --git a/src/Interpreters/InterpreterDropQuery.cpp b/src/Interpreters/InterpreterDropQuery.cpp index 368024da043..8eef9059f69 100644 --- a/src/Interpreters/InterpreterDropQuery.cpp +++ b/src/Interpreters/InterpreterDropQuery.cpp @@ -93,7 +93,7 @@ BlockIO InterpreterDropQuery::executeToTable( { context.checkAccess(table->isView() ? AccessType::DROP_VIEW : AccessType::DROP_TABLE, table_id); table->shutdown(); - TableStructureWriteLockHolder table_lock; + TableExclusiveLockHolder table_lock; if (database->getEngineName() != "Atomic" && database->getEngineName() != "Replicated") table_lock = table->lockExclusively(context.getCurrentQueryId(), context.getSettingsRef().lock_acquire_timeout); /// Drop table from memory, don't touch data and metadata @@ -111,7 +111,6 @@ BlockIO InterpreterDropQuery::executeToTable( auto table_lock = table->lockExclusively(context.getCurrentQueryId(), context.getSettingsRef().lock_acquire_timeout); auto metadata_snapshot = table->getInMemoryMetadataPtr(); /// Drop table data, don't touch metadata - auto table_lock = table->lockExclusively(context.getCurrentQueryId(), context.getSettingsRef().lock_acquire_timeout); if (database->getEngineName() == "Replicated" && context.getClientInfo().query_kind != ClientInfo::QueryKind::REPLICATED_LOG_QUERY) { database->propose(query_ptr); } else { diff --git a/src/Storages/MergeTree/registerStorageMergeTree.cpp b/src/Storages/MergeTree/registerStorageMergeTree.cpp index eb62c80cc49..9836cd2ee23 100644 --- a/src/Storages/MergeTree/registerStorageMergeTree.cpp +++ b/src/Storages/MergeTree/registerStorageMergeTree.cpp @@ -634,14 +634,6 @@ static StoragePtr create(const StorageFactory::Arguments & args) throw Exception("You must set the setting `allow_experimental_data_skipping_indices` to 1 " \ "before using data skipping indices.", ErrorCodes::BAD_ARGUMENTS); - StorageInMemoryMetadata metadata(args.columns, indices_description, args.constraints); - metadata.partition_by_ast = partition_by_ast; - metadata.order_by_ast = order_by_ast; - metadata.primary_key_ast = primary_key_ast; - metadata.ttl_for_table_ast = ttl_table_ast; - metadata.sample_by_ast = sample_by_ast; - metadata.settings_ast = settings_ast; - if (replicatedStorage) return StorageReplicatedMergeTree::create( zookeeper_path, replica_name, args.attach, args.table_id, args.relative_data_path, diff --git a/tests/integration/runner b/tests/integration/runner index 399c87dcf06..058badcee66 100755 --- a/tests/integration/runner +++ b/tests/integration/runner @@ -1,4 +1,4 @@ -#!/usr/bin/env python +#!/usr/bin/env python2 #-*- coding: utf-8 -*- import subprocess import os @@ -105,7 +105,7 @@ if __name__ == "__main__": bridge_bin=args.bridge_binary, cfg=args.configs_dir, pth=args.clickhouse_root, - opts=' '.join(args.pytest_args), + opts='-vv ' + ' '.join(args.pytest_args), img=DIND_INTEGRATION_TESTS_IMAGE_NAME, name=CONTAINER_NAME, command=args.command From 147fa9fed92c6b35061091971590e3243522bb84 Mon Sep 17 00:00:00 2001 From: Val Date: Sat, 4 Jul 2020 16:39:17 +0300 Subject: [PATCH 
041/381] fix type error in zookeeper --- src/Common/ZooKeeper/ZooKeeper.cpp | 2 +- tests/integration/runner | 4 ++-- 2 files changed, 3 insertions(+), 3 deletions(-) diff --git a/src/Common/ZooKeeper/ZooKeeper.cpp b/src/Common/ZooKeeper/ZooKeeper.cpp index 541625149dd..e09533874e3 100644 --- a/src/Common/ZooKeeper/ZooKeeper.cpp +++ b/src/Common/ZooKeeper/ZooKeeper.cpp @@ -582,7 +582,7 @@ void ZooKeeper::removeChildren(const std::string & path) void ZooKeeper::tryRemoveChildren(const std::string & path) { Strings children; - if (tryGetChildren(path, children) != Coordination::ZOK) + if (tryGetChildren(path, children) != Coordination::Error::ZOK) return; while (!children.empty()) { diff --git a/tests/integration/runner b/tests/integration/runner index 058badcee66..399c87dcf06 100755 --- a/tests/integration/runner +++ b/tests/integration/runner @@ -1,4 +1,4 @@ -#!/usr/bin/env python2 +#!/usr/bin/env python #-*- coding: utf-8 -*- import subprocess import os @@ -105,7 +105,7 @@ if __name__ == "__main__": bridge_bin=args.bridge_binary, cfg=args.configs_dir, pth=args.clickhouse_root, - opts='-vv ' + ' '.join(args.pytest_args), + opts=' '.join(args.pytest_args), img=DIND_INTEGRATION_TESTS_IMAGE_NAME, name=CONTAINER_NAME, command=args.command From e591fe501412cce7bf2c9105ba7b572cc3b89ddb Mon Sep 17 00:00:00 2001 From: Val Date: Sat, 4 Jul 2020 19:32:23 +0300 Subject: [PATCH 042/381] database replicated feedback mechanism prototype --- src/Databases/DatabaseReplicated.cpp | 77 ++++++++++++++++----- src/Databases/DatabaseReplicated.h | 10 +-- src/Interpreters/InterpreterAlterQuery.cpp | 4 +- src/Interpreters/InterpreterCreateQuery.cpp | 11 ++- src/Interpreters/InterpreterDropQuery.cpp | 6 ++ src/Interpreters/InterpreterRenameQuery.cpp | 8 +++ 6 files changed, 92 insertions(+), 24 deletions(-) diff --git a/src/Databases/DatabaseReplicated.cpp b/src/Databases/DatabaseReplicated.cpp index 47298996236..fb64a005320 100644 --- a/src/Databases/DatabaseReplicated.cpp +++ b/src/Databases/DatabaseReplicated.cpp @@ -7,11 +7,13 @@ #include #include #include +#include #include #include #include #include #include +#include #include @@ -105,6 +107,7 @@ DatabaseReplicated::DatabaseReplicated( } snapshot_period = context_.getConfigRef().getInt("database_replicated_snapshot_period", 10); + feedback_timeout = context_.getConfigRef().getInt("database_replicated_feedback_timeout", 0); LOG_DEBUG(log, "Snapshot period is set to {} log entries per one snapshot", snapshot_period); background_log_executor = context_.getReplicatedSchedulePool().createTask(database_name + "(DatabaseReplicated::background_executor)", [this]{ runBackgroundLogExecutor();} ); @@ -177,14 +180,7 @@ void DatabaseReplicated::runBackgroundLogExecutor() { log_entry_names.erase(log_entry_names.begin(), newest_entry_it); for (const String & log_entry_name : log_entry_names) { - String log_entry_path = zookeeper_path + "/log/" + log_entry_name; - bool yield = false; - { - std::lock_guard lock(log_name_mutex); - if (log_name_to_exec_with_result == log_entry_name) - yield = true; - } - executeFromZK(log_entry_path, yield); + executeLogName(log_entry_name); last_executed_log_entry = log_entry_name; writeLastExecutedToDiskAndZK(); @@ -213,7 +209,8 @@ void DatabaseReplicated::writeLastExecutedToDiskAndZK() { out.close(); } -void DatabaseReplicated::executeFromZK(String & path, bool yield) { +void DatabaseReplicated::executeLogName(const String & log_entry_name) { + String path = zookeeper_path + "/log/" + log_entry_name; current_zookeeper = getZooKeeper(); 
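The log entries read here are the children of <zookeeper_path>/log that propose() creates with CreateMode::PersistentSequential, so ZooKeeper appends a monotonically increasing, zero-padded counter to the "log-" prefix (for example log-0000000012). Because the counter is zero-padded, sorting the child names lexicographically reproduces proposal order, which is what runBackgroundLogExecutor relies on, and the numeric position used for the snapshot-period check can be recovered by skipping the four-character prefix. A minimal standalone model of that ordering assumption follows; it is plain C++, assumes ZooKeeper's usual ten-digit sequence suffix, and is not ClickHouse code.

    #include <algorithm>
    #include <cassert>
    #include <string>
    #include <vector>

    int main()
    {
        // Children of <zookeeper_path>/log as ZooKeeper could return them (unordered).
        std::vector<std::string> log_entries = {"log-0000000012", "log-0000000003", "log-0000000007"};

        // Zero-padded sequential suffixes make lexicographic order equal numeric order.
        std::sort(log_entries.begin(), log_entries.end());
        assert(log_entries.front() == "log-0000000003");

        // Same idea as parse(log_entry_name.substr(4)) in the patch:
        // skip the 4-character "log-" prefix and read the counter.
        int last_log_n = std::stoi(log_entries.back().substr(4));
        assert(last_log_n == 12);
        return 0;
    }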
String query_to_execute = current_zookeeper->get(path, {}, NULL); @@ -225,15 +222,12 @@ void DatabaseReplicated::executeFromZK(String & path, bool yield) { current_context->setCurrentQueryId(""); // generate random query_id executeQuery(query_to_execute, *current_context); } - catch (...) + catch (const Exception & e) { - if (yield) - tryLogCurrentException(log, "Query from zookeeper " + query_to_execute + " wasn't finished successfully"); - + tryLogCurrentException(log, "Query from zookeeper " + query_to_execute + " wasn't finished successfully"); + current_zookeeper->create(zookeeper_path + "/replicas/" + replica_name + "/errors/" + log_entry_name, e.what(), zkutil::CreateMode::Persistent); } - std::lock_guard lock(log_name_mutex); - log_name_to_exec_with_result.clear(); LOG_DEBUG(log, "Executed query: {}", query_to_execute); } @@ -250,6 +244,48 @@ void DatabaseReplicated::propose(const ASTPtr & query) { background_log_executor->schedule(); } +BlockIO DatabaseReplicated::getFeedback() { + BlockIO res; + if (feedback_timeout == 0) + return res; + + Stopwatch watch; + + NamesAndTypes block_structure = { + {"replica_name", std::make_shared()}, + {"execution_feedback", std::make_shared()}, + }; + auto replica_name_column = block_structure[0].type->createColumn(); + auto feedback_column = block_structure[1].type->createColumn(); + + current_zookeeper = getZooKeeper(); + Strings replica_states = current_zookeeper->getChildren(zookeeper_path + "/replicas"); + auto replica_iter = replica_states.begin(); + + while (!replica_states.empty() && watch.elapsedSeconds() < feedback_timeout) { + String last_executed = current_zookeeper->get(zookeeper_path + "/replicas/" + *replica_iter); + if (last_executed > log_name_to_exec_with_result) { + replica_name_column->insert(*replica_iter); + String err_path = zookeeper_path + "/replicas/" + *replica_iter + "/errors/" + log_name_to_exec_with_result; + if (!current_zookeeper->exists(err_path)) { + feedback_column->insert("OK"); + } else { + String feedback = current_zookeeper->get(err_path, {}, NULL); + feedback_column->insert(feedback); + } + replica_states.erase(replica_iter); + replica_iter = replica_states.begin(); + } + } + + Block block = Block({ + {std::move(replica_name_column), block_structure[0].type, block_structure[0].name}, + {std::move(feedback_column), block_structure[1].type, block_structure[1].name}}); + + res.in = std::make_shared(block); + return res; +} + void DatabaseReplicated::createSnapshot() { current_zookeeper = getZooKeeper(); String snapshot_path = zookeeper_path + "/snapshots/" + last_executed_log_entry; @@ -288,16 +324,23 @@ void DatabaseReplicated::loadMetadataFromSnapshot() { return; } - Strings metadatas; if (current_zookeeper->tryGetChildren(zookeeper_path + "/snapshots/" + *latest_snapshot, metadatas) != Coordination::Error::ZOK) return; LOG_DEBUG(log, "Executing {} snapshot", *latest_snapshot); + for (auto t = metadatas.begin(); t != metadatas.end(); ++t) { String path = zookeeper_path + "/snapshots/" + *latest_snapshot + "/" + *t; - executeFromZK(path, false); + String query_to_execute = current_zookeeper->get(path, {}, NULL); + + current_context = std::make_unique(global_context); + current_context->getClientInfo().query_kind = ClientInfo::QueryKind::REPLICATED_LOG_QUERY; + current_context->setCurrentDatabase(database_name); + current_context->setCurrentQueryId(""); // generate random query_id + + executeQuery(query_to_execute, *current_context); } last_executed_log_entry = *latest_snapshot; diff --git 
a/src/Databases/DatabaseReplicated.h b/src/Databases/DatabaseReplicated.h index 2aa6c0d9a68..0f448b8061c 100644 --- a/src/Databases/DatabaseReplicated.h +++ b/src/Databases/DatabaseReplicated.h @@ -3,6 +3,8 @@ #include #include #include +#include +#include namespace DB @@ -42,6 +44,8 @@ public: void propose(const ASTPtr & query) override; + BlockIO getFeedback(); + String zookeeper_path; String replica_name; @@ -49,9 +53,7 @@ private: void createDatabaseZKNodes(); void runBackgroundLogExecutor(); - - void executeFromZK(String & path, bool yield); - + void executeLogName(const String &); void writeLastExecutedToDiskAndZK(); void loadMetadataFromSnapshot(); @@ -60,11 +62,11 @@ private: std::unique_ptr current_context; // to run executeQuery - //BlockIO execution_result; std::mutex log_name_mutex; String log_name_to_exec_with_result; int snapshot_period; + int feedback_timeout; String last_executed_log_entry = ""; diff --git a/src/Interpreters/InterpreterAlterQuery.cpp b/src/Interpreters/InterpreterAlterQuery.cpp index 6b4bcdde067..96f3628b637 100644 --- a/src/Interpreters/InterpreterAlterQuery.cpp +++ b/src/Interpreters/InterpreterAlterQuery.cpp @@ -16,6 +16,7 @@ #include #include #include +#include #include @@ -52,7 +53,8 @@ BlockIO InterpreterAlterQuery::execute() DatabasePtr database = DatabaseCatalog::instance().getDatabase(table_id.database_name); if (database->getEngineName() == "Replicated" && context.getClientInfo().query_kind != ClientInfo::QueryKind::REPLICATED_LOG_QUERY && !table->supportsReplication()) { database->propose(query_ptr); - return {}; + auto * database_replicated = typeid_cast(database.get()); + return database_replicated->getFeedback(); } /// Add default database to table identifiers that we can encounter in e.g. default expressions, diff --git a/src/Interpreters/InterpreterCreateQuery.cpp b/src/Interpreters/InterpreterCreateQuery.cpp index 9d3abf2c8a6..0c312cfc863 100644 --- a/src/Interpreters/InterpreterCreateQuery.cpp +++ b/src/Interpreters/InterpreterCreateQuery.cpp @@ -46,6 +46,7 @@ #include #include +#include #include #include @@ -571,12 +572,12 @@ BlockIO InterpreterCreateQuery::createTable(ASTCreateQuery & create) ErrorCodes::BAD_DATABASE_FOR_TEMPORARY_TABLE); String current_database = context.getCurrentDatabase(); + auto database_name = create.database.empty() ? current_database : create.database; + auto database = DatabaseCatalog::instance().getDatabase(database_name); // If this is a stub ATTACH query, read the query definition from the database if (create.attach && !create.storage && !create.columns_list) { - auto database_name = create.database.empty() ? 
current_database : create.database; - auto database = DatabaseCatalog::instance().getDatabase(database_name); bool if_not_exists = create.if_not_exists; // Table SQL definition is available even if the table is detached @@ -611,6 +612,12 @@ BlockIO InterpreterCreateQuery::createTable(ASTCreateQuery & create) /// Actually creates table bool created = doCreateTable(create, properties); + + if (database->getEngineName() == "Replicated" && context.getClientInfo().query_kind != ClientInfo::QueryKind::REPLICATED_LOG_QUERY) { + auto * database_replicated = typeid_cast(database.get()); + return database_replicated->getFeedback(); + } + if (!created) /// Table already exists return {}; diff --git a/src/Interpreters/InterpreterDropQuery.cpp b/src/Interpreters/InterpreterDropQuery.cpp index 8eef9059f69..d5ac832e46c 100644 --- a/src/Interpreters/InterpreterDropQuery.cpp +++ b/src/Interpreters/InterpreterDropQuery.cpp @@ -12,6 +12,7 @@ #include #include #include +#include namespace DB @@ -137,6 +138,11 @@ BlockIO InterpreterDropQuery::executeToTable( } } + if (database->getEngineName() == "Replicated" && context.getClientInfo().query_kind != ClientInfo::QueryKind::REPLICATED_LOG_QUERY) { + auto * database_replicated = typeid_cast(database.get()); + return database_replicated->getFeedback(); + } + return {}; } diff --git a/src/Interpreters/InterpreterRenameQuery.cpp b/src/Interpreters/InterpreterRenameQuery.cpp index 97206f6b364..b950edac5bc 100644 --- a/src/Interpreters/InterpreterRenameQuery.cpp +++ b/src/Interpreters/InterpreterRenameQuery.cpp @@ -6,6 +6,7 @@ #include #include #include +#include namespace DB @@ -91,7 +92,14 @@ BlockIO InterpreterRenameQuery::execute() elem.to_table_name, rename.exchange); } + + // TODO it can't work + if (database->getEngineName() == "Replicated" && context.getClientInfo().query_kind != ClientInfo::QueryKind::REPLICATED_LOG_QUERY) { + auto * database_replicated = typeid_cast(database.get()); + return database_replicated->getFeedback(); + } } + return {}; } From 478eb0b8a5df5f602651268cc396178b6adcf17e Mon Sep 17 00:00:00 2001 From: Alexander Tokmakov Date: Thu, 22 Oct 2020 18:08:00 +0300 Subject: [PATCH 043/381] fix --- src/Databases/DatabaseReplicated.cpp | 206 ++++++++++++-------- src/Databases/IDatabase.h | 3 +- src/Databases/ya.make | 1 + src/Interpreters/InterpreterAlterQuery.cpp | 3 +- src/Interpreters/InterpreterCreateQuery.cpp | 10 +- src/Interpreters/InterpreterDropQuery.cpp | 19 +- src/Interpreters/InterpreterRenameQuery.cpp | 10 +- 7 files changed, 149 insertions(+), 103 deletions(-) diff --git a/src/Databases/DatabaseReplicated.cpp b/src/Databases/DatabaseReplicated.cpp index 42662d836d4..328f5476064 100644 --- a/src/Databases/DatabaseReplicated.cpp +++ b/src/Databases/DatabaseReplicated.cpp @@ -1,3 +1,4 @@ +#include #include #include #include @@ -7,20 +8,15 @@ #include #include #include -#include #include +#include #include #include #include -#include - -#include namespace DB { - - namespace ErrorCodes { extern const int NO_ZOOKEEPER; @@ -60,29 +56,34 @@ DatabaseReplicated::DatabaseReplicated( , zookeeper_path(zookeeper_path_) , replica_name(replica_name_) { - if (zookeeper_path.empty() || replica_name.empty()) { + if (zookeeper_path.empty() || replica_name.empty()) + { throw Exception("ZooKeeper path and replica name must be non-empty", ErrorCodes::BAD_ARGUMENTS); } if (zookeeper_path.back() == '/') zookeeper_path.resize(zookeeper_path.size() - 1); - // If zookeeper chroot prefix is used, path should start with '/', because chroot concatenates without 
it. + /// If zookeeper chroot prefix is used, path should start with '/', because chroot concatenates without it. if (zookeeper_path.front() != '/') zookeeper_path = "/" + zookeeper_path; - if (context_.hasZooKeeper()) { + if (context_.hasZooKeeper()) + { current_zookeeper = context_.getZooKeeper(); } if (!current_zookeeper) { - throw Exception("Can't create replicated database without ZooKeeper", ErrorCodes::NO_ZOOKEEPER); + throw Exception("Can't create replicated database without ZooKeeper", ErrorCodes::NO_ZOOKEEPER); } - // New database - if (!current_zookeeper->exists(zookeeper_path)) { + /// New database + if (!current_zookeeper->exists(zookeeper_path)) + { createDatabaseZKNodes(); - // Old replica recovery - } else if (current_zookeeper->exists(zookeeper_path + "/replicas/" + replica_name)) { + /// Old replica recovery + } + else if (current_zookeeper->exists(zookeeper_path + "/replicas/" + replica_name)) + { String remote_last_entry = current_zookeeper->get(zookeeper_path + "/replicas/" + replica_name, {}, nullptr); String local_last_entry; @@ -93,16 +94,22 @@ DatabaseReplicated::DatabaseReplicated( } catch (const Exception &) { - // Metadata is corrupted. - // Replica erases the previous zk last executed log entry - // and behaves like a new clean replica. - writeLastExecutedToDiskAndZK(); + /// Metadata is corrupted. + /// Replica erases the previous zk last executed log entry + /// and behaves like a new clean replica. + writeLastExecutedToDiskAndZK(); } - if (!local_last_entry.empty() && local_last_entry == remote_last_entry) { + if (!local_last_entry.empty() && local_last_entry == remote_last_entry) + { last_executed_log_entry = local_last_entry; - } else { - throw Exception("Replica name might be in use by a different node. Please check replica_name parameter. Remove .last_entry file from metadata to create a new replica.", ErrorCodes::LOGICAL_ERROR); + } + else + { + throw Exception( + "Replica name might be in use by a different node. Please check replica_name parameter. Remove .last_entry file from " + "metadata to create a new replica.", + ErrorCodes::LOGICAL_ERROR); } } @@ -110,12 +117,15 @@ DatabaseReplicated::DatabaseReplicated( feedback_timeout = context_.getConfigRef().getInt("database_replicated_feedback_timeout", 0); LOG_DEBUG(log, "Snapshot period is set to {} log entries per one snapshot", snapshot_period); - background_log_executor = context_.getReplicatedSchedulePool().createTask(database_name + "(DatabaseReplicated::background_executor)", [this]{ runBackgroundLogExecutor();} ); + background_log_executor = context_.getReplicatedSchedulePool().createTask( + database_name + "(DatabaseReplicated::background_executor)", [this] { runBackgroundLogExecutor(); } + ); background_log_executor->scheduleAfter(500); } -void DatabaseReplicated::createDatabaseZKNodes() { +void DatabaseReplicated::createDatabaseZKNodes() +{ current_zookeeper = getZooKeeper(); current_zookeeper->createAncestors(zookeeper_path); @@ -126,31 +136,34 @@ void DatabaseReplicated::createDatabaseZKNodes() { current_zookeeper->createIfNotExists(zookeeper_path + "/replicas", String()); } -void DatabaseReplicated::RemoveOutdatedSnapshotsAndLog() { - // This method removes all snapshots and logged queries - // that no longer will be in use by current replicas or - // new coming ones. - // Each registered replica has its state in ZooKeeper. - // Therefore, snapshots and logged queries that are less - // than a least advanced replica are removed. 
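The cleanup rule that the surrounding comments describe boils down to one comparison: take the least advanced replica pointer under <zookeeper_path>/replicas as the cut-off, then drop every snapshot and log entry whose name sorts below it, since no registered replica can still need them. The sketch below is a simplified, standalone model of that rule only (standard C++, illustrative names, no ZooKeeper calls); the real routine additionally keeps at least one snapshot and performs the removals through the ZooKeeper client.

    #include <algorithm>
    #include <iostream>
    #include <string>
    #include <vector>

    int main()
    {
        // Last executed log entry of every registered replica (children of <zk>/replicas).
        std::vector<std::string> replica_states = {"log-0000000009", "log-0000000005", "log-0000000011"};
        // Snapshots are named after the log entry they were taken at, so snapshot and
        // log entry names share the zero-padded form and compare lexicographically.
        std::vector<std::string> snapshots = {"log-0000000002", "log-0000000004", "log-0000000008"};
        std::vector<std::string> log_entries = {"log-0000000003", "log-0000000004", "log-0000000009"};

        const std::string least_advanced = *std::min_element(replica_states.begin(), replica_states.end());

        std::sort(snapshots.begin(), snapshots.end());
        std::sort(log_entries.begin(), log_entries.end());

        // Everything strictly below the least advanced replica is no longer needed.
        auto snapshot_cut = std::lower_bound(snapshots.begin(), snapshots.end(), least_advanced);
        auto log_cut = std::lower_bound(log_entries.begin(), log_entries.end(), least_advanced);

        for (auto it = snapshots.begin(); it != snapshot_cut; ++it)
            std::cout << "would remove snapshot " << *it << '\n';
        for (auto it = log_entries.begin(); it != log_cut; ++it)
            std::cout << "would remove log entry " << *it << '\n';
        return 0;
    }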
- // It does not interfere with a new coming replica - // metadata loading from snapshot - // because the replica will use the latest snapshot available - // and this snapshot will set the last executed log query - // to a greater one than the least advanced current replica. +void DatabaseReplicated::RemoveOutdatedSnapshotsAndLog() +{ + /// This method removes all snapshots and logged queries + /// that no longer will be in use by current replicas or + /// new coming ones. + /// Each registered replica has its state in ZooKeeper. + /// Therefore, snapshots and logged queries that are less + /// than a least advanced replica are removed. + /// It does not interfere with a new coming replica + /// metadata loading from snapshot + /// because the replica will use the latest snapshot available + /// and this snapshot will set the last executed log query + /// to a greater one than the least advanced current replica. current_zookeeper = getZooKeeper(); Strings replica_states = current_zookeeper->getChildren(zookeeper_path + "/replicas"); auto least_advanced = std::min_element(replica_states.begin(), replica_states.end()); Strings snapshots = current_zookeeper->getChildren(zookeeper_path + "/snapshots"); - - if (snapshots.size() < 2) { + + if (snapshots.size() < 2) + { return; } std::sort(snapshots.begin(), snapshots.end()); auto still_useful = std::lower_bound(snapshots.begin(), snapshots.end(), *least_advanced); snapshots.erase(still_useful, snapshots.end()); - for (const String & snapshot : snapshots) { + for (const String & snapshot : snapshots) + { current_zookeeper->tryRemoveRecursive(zookeeper_path + "/snapshots/" + snapshot); } @@ -158,14 +171,17 @@ void DatabaseReplicated::RemoveOutdatedSnapshotsAndLog() { std::sort(log_entry_names.begin(), log_entry_names.end()); auto still_useful_log = std::upper_bound(log_entry_names.begin(), log_entry_names.end(), *still_useful); log_entry_names.erase(still_useful_log, log_entry_names.end()); - for (const String & log_entry_name : log_entry_names) { + for (const String & log_entry_name : log_entry_names) + { String log_entry_path = zookeeper_path + "/log/" + log_entry_name; current_zookeeper->tryRemove(log_entry_path); } } -void DatabaseReplicated::runBackgroundLogExecutor() { - if (last_executed_log_entry == "") { +void DatabaseReplicated::runBackgroundLogExecutor() +{ + if (last_executed_log_entry == "") + { loadMetadataFromSnapshot(); } @@ -177,7 +193,8 @@ void DatabaseReplicated::runBackgroundLogExecutor() { log_entry_names.erase(log_entry_names.begin(), newest_entry_it); - for (const String & log_entry_name : log_entry_names) { + for (const String & log_entry_name : log_entry_names) + { executeLogName(log_entry_name); last_executed_log_entry = log_entry_name; writeLastExecutedToDiskAndZK(); @@ -185,8 +202,9 @@ void DatabaseReplicated::runBackgroundLogExecutor() { int log_n = parse(log_entry_name.substr(4)); int last_log_n = parse(log_entry_names.back().substr(4)); - // The third condition gurantees at most one snapshot creation per batch - if (log_n > 0 && snapshot_period > 0 && (last_log_n - log_n) / snapshot_period == 0 && log_n % snapshot_period == 0) { + /// The third condition gurantees at most one snapshot creation per batch + if (log_n > 0 && snapshot_period > 0 && (last_log_n - log_n) / snapshot_period == 0 && log_n % snapshot_period == 0) + { createSnapshot(); } } @@ -194,9 +212,11 @@ void DatabaseReplicated::runBackgroundLogExecutor() { background_log_executor->scheduleAfter(500); } -void 
DatabaseReplicated::writeLastExecutedToDiskAndZK() { +void DatabaseReplicated::writeLastExecutedToDiskAndZK() +{ current_zookeeper = getZooKeeper(); - current_zookeeper->createOrUpdate(zookeeper_path + "/replicas/" + replica_name, last_executed_log_entry, zkutil::CreateMode::Persistent); + current_zookeeper->createOrUpdate( + zookeeper_path + "/replicas/" + replica_name, last_executed_log_entry, zkutil::CreateMode::Persistent); String metadata_file = getMetadataPath() + ".last_entry"; WriteBufferFromFile out(metadata_file, last_executed_log_entry.size(), O_WRONLY | O_CREAT); @@ -207,42 +227,47 @@ void DatabaseReplicated::writeLastExecutedToDiskAndZK() { out.close(); } -void DatabaseReplicated::executeLogName(const String & log_entry_name) { - String path = zookeeper_path + "/log/" + log_entry_name; - current_zookeeper = getZooKeeper(); - String query_to_execute = current_zookeeper->get(path, {}, nullptr); +void DatabaseReplicated::executeLogName(const String & log_entry_name) +{ + String path = zookeeper_path + "/log/" + log_entry_name; + current_zookeeper = getZooKeeper(); + String query_to_execute = current_zookeeper->get(path, {}, nullptr); - try - { - current_context = std::make_unique(global_context); - current_context->getClientInfo().query_kind = ClientInfo::QueryKind::REPLICATED_LOG_QUERY; - current_context->setCurrentDatabase(database_name); - current_context->setCurrentQueryId(""); // generate random query_id - executeQuery(query_to_execute, *current_context); - } - catch (const Exception & e) - { - tryLogCurrentException(log, "Query from zookeeper " + query_to_execute + " wasn't finished successfully"); - current_zookeeper->create(zookeeper_path + "/replicas/" + replica_name + "/errors/" + log_entry_name, e.what(), zkutil::CreateMode::Persistent); - } + try + { + current_context = std::make_unique(global_context); + current_context->getClientInfo().query_kind = ClientInfo::QueryKind::REPLICATED_LOG_QUERY; + current_context->setCurrentDatabase(database_name); + current_context->setCurrentQueryId(""); // generate random query_id + executeQuery(query_to_execute, *current_context); + } + catch (const Exception & e) + { + tryLogCurrentException(log, "Query from zookeeper " + query_to_execute + " wasn't finished successfully"); + current_zookeeper->create( + zookeeper_path + "/replicas/" + replica_name + "/errors/" + log_entry_name, e.what(), zkutil::CreateMode::Persistent); + } - LOG_DEBUG(log, "Executed query: {}", query_to_execute); + LOG_DEBUG(log, "Executed query: {}", query_to_execute); } -void DatabaseReplicated::propose(const ASTPtr & query) { +void DatabaseReplicated::propose(const ASTPtr & query) +{ current_zookeeper = getZooKeeper(); LOG_DEBUG(log, "Proposing query: {}", queryToString(query)); { std::lock_guard lock(log_name_mutex); - log_name_to_exec_with_result = current_zookeeper->create(zookeeper_path + "/log/log-", queryToString(query), zkutil::CreateMode::PersistentSequential); + log_name_to_exec_with_result + = current_zookeeper->create(zookeeper_path + "/log/log-", queryToString(query), zkutil::CreateMode::PersistentSequential); } background_log_executor->schedule(); } -BlockIO DatabaseReplicated::getFeedback() { +BlockIO DatabaseReplicated::getFeedback() +{ BlockIO res; if (feedback_timeout == 0) return res; @@ -260,39 +285,48 @@ BlockIO DatabaseReplicated::getFeedback() { Strings replica_states = current_zookeeper->getChildren(zookeeper_path + "/replicas"); auto replica_iter = replica_states.begin(); - while (!replica_states.empty() && watch.elapsedSeconds() < 
feedback_timeout) { + while (!replica_states.empty() && watch.elapsedSeconds() < feedback_timeout) + { String last_executed = current_zookeeper->get(zookeeper_path + "/replicas/" + *replica_iter); - if (last_executed > log_name_to_exec_with_result) { + if (last_executed > log_name_to_exec_with_result) + { replica_name_column->insert(*replica_iter); String err_path = zookeeper_path + "/replicas/" + *replica_iter + "/errors/" + log_name_to_exec_with_result; - if (!current_zookeeper->exists(err_path)) { + if (!current_zookeeper->exists(err_path)) + { feedback_column->insert("OK"); - } else { + } + else + { String feedback = current_zookeeper->get(err_path, {}, nullptr); feedback_column->insert(feedback); } - replica_states.erase(replica_iter); - replica_iter = replica_states.begin(); + replica_states.erase(replica_iter); + replica_iter = replica_states.begin(); } } Block block = Block({ {std::move(replica_name_column), block_structure[0].type, block_structure[0].name}, - {std::move(feedback_column), block_structure[1].type, block_structure[1].name}}); + {std::move(feedback_column), block_structure[1].type, block_structure[1].name} + }); res.in = std::make_shared(block); return res; } -void DatabaseReplicated::createSnapshot() { +void DatabaseReplicated::createSnapshot() +{ current_zookeeper = getZooKeeper(); String snapshot_path = zookeeper_path + "/snapshots/" + last_executed_log_entry; - if (Coordination::Error::ZNODEEXISTS == current_zookeeper->tryCreate(snapshot_path, String(), zkutil::CreateMode::Persistent)) { + if (Coordination::Error::ZNODEEXISTS == current_zookeeper->tryCreate(snapshot_path, String(), zkutil::CreateMode::Persistent)) + { return; } - - for (auto iterator = getTablesIterator(global_context, {}); iterator->isValid(); iterator->next()) { + + for (auto iterator = getTablesIterator(global_context, {}); iterator->isValid(); iterator->next()) + { String table_name = iterator->name(); auto query = getCreateQueryFromMetadata(getObjectMetadataPath(table_name), true); String statement = queryToString(query); @@ -303,9 +337,10 @@ void DatabaseReplicated::createSnapshot() { RemoveOutdatedSnapshotsAndLog(); } -void DatabaseReplicated::loadMetadataFromSnapshot() { - // Executes the latest snapshot. - // Used by new replicas only. +void DatabaseReplicated::loadMetadataFromSnapshot() +{ + /// Executes the latest snapshot. + /// Used by new replicas only. 
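The snapshot a new replica bootstraps from is chosen by walking backwards from the newest snapshot name until one is found whose ".completed" marker exists, so a snapshot that is still being written is never replayed. Below is a minimal standalone model of that selection loop (plain C++; the set of completed names stands in for the ".completed" child znodes, and the names are illustrative), mirroring the max_element/erase pattern used in loadMetadataFromSnapshot.

    #include <algorithm>
    #include <iostream>
    #include <set>
    #include <string>
    #include <vector>

    // Pick the newest snapshot that is marked completed; return "" if none is usable.
    static std::string pickSnapshot(std::vector<std::string> snapshots, const std::set<std::string> & completed)
    {
        while (!snapshots.empty())
        {
            auto latest = std::max_element(snapshots.begin(), snapshots.end());
            if (completed.count(*latest))
                return *latest;
            snapshots.erase(latest);  // half-written snapshot, try the next newest one
        }
        return {};
    }

    int main()
    {
        std::vector<std::string> snapshots = {"log-0000000004", "log-0000000008", "log-0000000012"};
        // The newest snapshot has no completion marker yet.
        std::set<std::string> completed = {"log-0000000004", "log-0000000008"};

        std::cout << pickSnapshot(snapshots, completed) << '\n';  // prints log-0000000008
        return 0;
    }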
current_zookeeper = getZooKeeper(); Strings snapshots; @@ -313,12 +348,14 @@ void DatabaseReplicated::loadMetadataFromSnapshot() { return; auto latest_snapshot = std::max_element(snapshots.begin(), snapshots.end()); - while (snapshots.size() > 0 && !current_zookeeper->exists(zookeeper_path + "/snapshots/" + *latest_snapshot + "/.completed")) { + while (snapshots.size() > 0 && !current_zookeeper->exists(zookeeper_path + "/snapshots/" + *latest_snapshot + "/.completed")) + { snapshots.erase(latest_snapshot); latest_snapshot = std::max_element(snapshots.begin(), snapshots.end()); } - if (snapshots.size() < 1) { + if (snapshots.size() < 1) + { return; } @@ -328,7 +365,8 @@ void DatabaseReplicated::loadMetadataFromSnapshot() { LOG_DEBUG(log, "Executing {} snapshot", *latest_snapshot); - for (auto t = metadatas.begin(); t != metadatas.end(); ++t) { + for (auto t = metadatas.begin(); t != metadatas.end(); ++t) + { String path = zookeeper_path + "/snapshots/" + *latest_snapshot + "/" + *t; String query_to_execute = current_zookeeper->get(path, {}, nullptr); diff --git a/src/Databases/IDatabase.h b/src/Databases/IDatabase.h index 2fd0c62b72e..9bec6394be7 100644 --- a/src/Databases/IDatabase.h +++ b/src/Databases/IDatabase.h @@ -181,7 +181,8 @@ public: virtual bool empty() const = 0; /// Submit query to log. Currently used by DatabaseReplicated engine only. - virtual void propose(const ASTPtr & /*query*/) { + virtual void propose(const ASTPtr & /*query*/) + { throw Exception(getEngineName() + ": propose() is not supported", ErrorCodes::NOT_IMPLEMENTED); } diff --git a/src/Databases/ya.make b/src/Databases/ya.make index b4173057e03..4ce56859d66 100644 --- a/src/Databases/ya.make +++ b/src/Databases/ya.make @@ -15,6 +15,7 @@ SRCS( DatabaseMemory.cpp DatabaseOnDisk.cpp DatabaseOrdinary.cpp + DatabaseReplicated.cpp DatabasesCommon.cpp DatabaseWithDictionaries.cpp MySQL/ConnectionMySQLSettings.cpp diff --git a/src/Interpreters/InterpreterAlterQuery.cpp b/src/Interpreters/InterpreterAlterQuery.cpp index 0b53e84564f..e229cb120e5 100644 --- a/src/Interpreters/InterpreterAlterQuery.cpp +++ b/src/Interpreters/InterpreterAlterQuery.cpp @@ -51,7 +51,8 @@ BlockIO InterpreterAlterQuery::execute() auto metadata_snapshot = table->getInMemoryMetadataPtr(); DatabasePtr database = DatabaseCatalog::instance().getDatabase(table_id.database_name); - if (database->getEngineName() == "Replicated" && context.getClientInfo().query_kind != ClientInfo::QueryKind::REPLICATED_LOG_QUERY && !table->supportsReplication()) { + if (database->getEngineName() == "Replicated" && context.getClientInfo().query_kind != ClientInfo::QueryKind::REPLICATED_LOG_QUERY && !table->supportsReplication()) + { database->propose(query_ptr); auto * database_replicated = typeid_cast(database.get()); return database_replicated->getFeedback(); diff --git a/src/Interpreters/InterpreterCreateQuery.cpp b/src/Interpreters/InterpreterCreateQuery.cpp index 7c809e65639..5210230859c 100644 --- a/src/Interpreters/InterpreterCreateQuery.cpp +++ b/src/Interpreters/InterpreterCreateQuery.cpp @@ -75,6 +75,7 @@ namespace ErrorCodes extern const int DICTIONARY_ALREADY_EXISTS; extern const int ILLEGAL_SYNTAX_FOR_DATA_TYPE; extern const int ILLEGAL_COLUMN; + extern const int LOGICAL_ERROR; } namespace fs = std::filesystem; @@ -713,14 +714,16 @@ BlockIO InterpreterCreateQuery::createTable(ASTCreateQuery & create) /// contain the right database name for every replica /// therefore for such queries the AST database /// field is modified right before an actual execution - if 
(context.getClientInfo().query_kind == ClientInfo::QueryKind::REPLICATED_LOG_QUERY) { + if (context.getClientInfo().query_kind == ClientInfo::QueryKind::REPLICATED_LOG_QUERY) + { create.database = current_database; } /// Actually creates table bool created = doCreateTable(create, properties); - if (database->getEngineName() == "Replicated" && context.getClientInfo().query_kind != ClientInfo::QueryKind::REPLICATED_LOG_QUERY) { + if (database->getEngineName() == "Replicated" && context.getClientInfo().query_kind != ClientInfo::QueryKind::REPLICATED_LOG_QUERY) + { auto * database_replicated = typeid_cast(database.get()); return database_replicated->getFeedback(); } @@ -786,7 +789,8 @@ bool InterpreterCreateQuery::doCreateTable(ASTCreateQuery & create, return true; } - if (database->getEngineName() == "Replicated" && context.getClientInfo().query_kind != ClientInfo::QueryKind::REPLICATED_LOG_QUERY) { + if (database->getEngineName() == "Replicated" && context.getClientInfo().query_kind != ClientInfo::QueryKind::REPLICATED_LOG_QUERY) + { database->propose(query_ptr); return true; } diff --git a/src/Interpreters/InterpreterDropQuery.cpp b/src/Interpreters/InterpreterDropQuery.cpp index 455b40c30e3..393f4ef3dc9 100644 --- a/src/Interpreters/InterpreterDropQuery.cpp +++ b/src/Interpreters/InterpreterDropQuery.cpp @@ -101,11 +101,10 @@ BlockIO InterpreterDropQuery::executeToTable( if (database->getEngineName() != "Atomic" && database->getEngineName() != "Replicated") table_lock = table->lockExclusively(context.getCurrentQueryId(), context.getSettingsRef().lock_acquire_timeout); /// Drop table from memory, don't touch data and metadata - if (database->getEngineName() == "Replicated" && context.getClientInfo().query_kind != ClientInfo::QueryKind::REPLICATED_LOG_QUERY) { + if (database->getEngineName() == "Replicated" && context.getClientInfo().query_kind != ClientInfo::QueryKind::REPLICATED_LOG_QUERY) database->propose(query_ptr); - } else { + else database->detachTable(table_id.table_name); - } } else if (query.kind == ASTDropQuery::Kind::Truncate) { @@ -115,11 +114,10 @@ BlockIO InterpreterDropQuery::executeToTable( auto table_lock = table->lockExclusively(context.getCurrentQueryId(), context.getSettingsRef().lock_acquire_timeout); auto metadata_snapshot = table->getInMemoryMetadataPtr(); /// Drop table data, don't touch metadata - if (database->getEngineName() == "Replicated" && context.getClientInfo().query_kind != ClientInfo::QueryKind::REPLICATED_LOG_QUERY) { + if (database->getEngineName() == "Replicated" && context.getClientInfo().query_kind != ClientInfo::QueryKind::REPLICATED_LOG_QUERY) database->propose(query_ptr); - } else { + else table->truncate(query_ptr, metadata_snapshot, context, table_lock); - } } else if (query.kind == ASTDropQuery::Kind::Drop) { @@ -132,12 +130,11 @@ BlockIO InterpreterDropQuery::executeToTable( if (database->getEngineName() != "Atomic" && database->getEngineName() != "Replicated") table_lock = table->lockExclusively(context.getCurrentQueryId(), context.getSettingsRef().lock_acquire_timeout); - // Prevents recursive drop from drop database query. The original query must specify a table. - if (!query_ptr->as().table.empty() && database->getEngineName() == "Replicated" && context.getClientInfo().query_kind != ClientInfo::QueryKind::REPLICATED_LOG_QUERY) { + /// Prevents recursive drop from drop database query. The original query must specify a table. 
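The same dispatch pattern appears in every interpreter touched by this patch: if the target database engine is Replicated and the query did not itself arrive through the replication log (query_kind != REPLICATED_LOG_QUERY), the DDL is proposed to the log instead of being executed locally, and the client gets back whatever getFeedback() collects, that is, per-replica OK or error text gathered within the database_replicated_feedback_timeout window. A minimal standalone model of that control flow is sketched below; QueryKind, Database and the returned string are hypothetical stand-ins for the real interpreter types, not ClickHouse API.

    #include <iostream>
    #include <string>

    // Hypothetical, simplified stand-ins for the real types.
    enum class QueryKind { InitialQuery, ReplicatedLogQuery };

    struct Database
    {
        std::string engine;
        void propose(const std::string & query) { std::cout << "appended to log: " << query << '\n'; }
        std::string getFeedback() { return "replica1\tOK\nreplica2\tOK"; }
    };

    // Either replay the query locally (it came from the log) or hand it to the log
    // and report per-replica execution status back to the client.
    static std::string executeDDL(Database & db, const std::string & query, QueryKind kind)
    {
        if (db.engine == "Replicated" && kind != QueryKind::ReplicatedLogQuery)
        {
            db.propose(query);
            return db.getFeedback();
        }
        std::cout << "executed locally: " << query << '\n';
        return {};
    }

    int main()
    {
        Database db{"Replicated"};
        std::cout << executeDDL(db, "ALTER TABLE t ADD COLUMN x UInt32", QueryKind::InitialQuery) << '\n';
        executeDDL(db, "ALTER TABLE t ADD COLUMN x UInt32", QueryKind::ReplicatedLogQuery);
        return 0;
    }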
+ if (!query_ptr->as().table.empty() && database->getEngineName() == "Replicated" && context.getClientInfo().query_kind != ClientInfo::QueryKind::REPLICATED_LOG_QUERY) database->propose(query_ptr); - } else { + else database->dropTable(context, table_id.table_name, query.no_delay); - } } } @@ -154,7 +151,7 @@ BlockIO InterpreterDropQuery::executeToTable( } } - if (database->getEngineName() == "Replicated" && context.getClientInfo().query_kind != ClientInfo::QueryKind::REPLICATED_LOG_QUERY) + if (database && database->getEngineName() == "Replicated" && context.getClientInfo().query_kind != ClientInfo::QueryKind::REPLICATED_LOG_QUERY) { auto * database_replicated = typeid_cast(database.get()); return database_replicated->getFeedback(); diff --git a/src/Interpreters/InterpreterRenameQuery.cpp b/src/Interpreters/InterpreterRenameQuery.cpp index 3d8855b6458..65ed33bd9db 100644 --- a/src/Interpreters/InterpreterRenameQuery.cpp +++ b/src/Interpreters/InterpreterRenameQuery.cpp @@ -75,9 +75,12 @@ BlockIO InterpreterRenameQuery::executeToTables(const ASTRenameQuery & rename, c database_catalog.assertTableDoesntExist(StorageID(elem.to_database_name, elem.to_table_name), context); DatabasePtr database = database_catalog.getDatabase(elem.from_database_name); - if (database->getEngineName() == "Replicated" && context.getClientInfo().query_kind != ClientInfo::QueryKind::REPLICATED_LOG_QUERY) { + if (database->getEngineName() == "Replicated" && context.getClientInfo().query_kind != ClientInfo::QueryKind::REPLICATED_LOG_QUERY) + { database->propose(query_ptr); - } else { + } + else + { database->renameTable( context, elem.from_table_name, @@ -88,7 +91,8 @@ BlockIO InterpreterRenameQuery::executeToTables(const ASTRenameQuery & rename, c } // TODO it can't work - if (database->getEngineName() == "Replicated" && context.getClientInfo().query_kind != ClientInfo::QueryKind::REPLICATED_LOG_QUERY) { + if (database->getEngineName() == "Replicated" && context.getClientInfo().query_kind != ClientInfo::QueryKind::REPLICATED_LOG_QUERY) + { auto * database_replicated = typeid_cast(database.get()); return database_replicated->getFeedback(); } From cd14f095abe7f355353054172533d1f097d6105e Mon Sep 17 00:00:00 2001 From: Alexander Tokmakov Date: Mon, 26 Oct 2020 18:12:16 +0300 Subject: [PATCH 044/381] fix tests --- src/Databases/DatabaseReplicated.cpp | 9 +- src/Databases/DatabaseReplicated.h | 2 +- src/Interpreters/InterpreterCreateQuery.cpp | 2 +- tests/integration/helpers/test_tools.py | 10 +- .../test_replicated_database/__init__.py | 0 .../test_replicated_database/test.py | 143 ++++++++++-------- 6 files changed, 95 insertions(+), 71 deletions(-) create mode 100644 tests/integration/test_replicated_database/__init__.py diff --git a/src/Databases/DatabaseReplicated.cpp b/src/Databases/DatabaseReplicated.cpp index 328f5476064..7fb7be61d35 100644 --- a/src/Databases/DatabaseReplicated.cpp +++ b/src/Databases/DatabaseReplicated.cpp @@ -136,7 +136,7 @@ void DatabaseReplicated::createDatabaseZKNodes() current_zookeeper->createIfNotExists(zookeeper_path + "/replicas", String()); } -void DatabaseReplicated::RemoveOutdatedSnapshotsAndLog() +void DatabaseReplicated::removeOutdatedSnapshotsAndLog() { /// This method removes all snapshots and logged queries /// that no longer will be in use by current replicas or @@ -180,7 +180,7 @@ void DatabaseReplicated::RemoveOutdatedSnapshotsAndLog() void DatabaseReplicated::runBackgroundLogExecutor() { - if (last_executed_log_entry == "") + if (last_executed_log_entry.empty()) { 
loadMetadataFromSnapshot(); } @@ -274,7 +274,8 @@ BlockIO DatabaseReplicated::getFeedback() Stopwatch watch; - NamesAndTypes block_structure = { + NamesAndTypes block_structure = + { {"replica_name", std::make_shared()}, {"execution_feedback", std::make_shared()}, }; @@ -334,7 +335,7 @@ void DatabaseReplicated::createSnapshot() } current_zookeeper->create(snapshot_path + "/.completed", String(), zkutil::CreateMode::Persistent); - RemoveOutdatedSnapshotsAndLog(); + removeOutdatedSnapshotsAndLog(); } void DatabaseReplicated::loadMetadataFromSnapshot() diff --git a/src/Databases/DatabaseReplicated.h b/src/Databases/DatabaseReplicated.h index 4b647915079..62997e953ac 100644 --- a/src/Databases/DatabaseReplicated.h +++ b/src/Databases/DatabaseReplicated.h @@ -57,7 +57,7 @@ private: void loadMetadataFromSnapshot(); void createSnapshot(); - void RemoveOutdatedSnapshotsAndLog(); + void removeOutdatedSnapshotsAndLog(); std::unique_ptr current_context; // to run executeQuery diff --git a/src/Interpreters/InterpreterCreateQuery.cpp b/src/Interpreters/InterpreterCreateQuery.cpp index 5210230859c..0f7d441c0d6 100644 --- a/src/Interpreters/InterpreterCreateQuery.cpp +++ b/src/Interpreters/InterpreterCreateQuery.cpp @@ -141,7 +141,7 @@ BlockIO InterpreterCreateQuery::createDatabase(ASTCreateQuery & create) throw Exception("Unknown database engine: " + ostr.str(), ErrorCodes::UNKNOWN_DATABASE_ENGINE); } - if (create.storage->engine->name == "Atomic") + if (create.storage->engine->name == "Atomic" || create.storage->engine->name == "Replicated") { if (create.attach && create.uuid == UUIDHelpers::Nil) throw Exception("UUID must be specified for ATTACH", ErrorCodes::INCORRECT_QUERY); diff --git a/tests/integration/helpers/test_tools.py b/tests/integration/helpers/test_tools.py index 75ae8f67f7a..639b47a7179 100644 --- a/tests/integration/helpers/test_tools.py +++ b/tests/integration/helpers/test_tools.py @@ -44,20 +44,20 @@ class TSV: def assert_eq_with_retry(instance, query, expectation, retry_count=20, sleep_time=0.5, stdin=None, timeout=None, - settings=None, user=None, ignore_error=False): + settings=None, user=None, ignore_error=False, get_result=lambda x: x): expectation_tsv = TSV(expectation) for i in range(retry_count): try: - if TSV(instance.query(query, user=user, stdin=stdin, timeout=timeout, settings=settings, - ignore_error=ignore_error)) == expectation_tsv: + if TSV(get_result(instance.query(query, user=user, stdin=stdin, timeout=timeout, settings=settings, + ignore_error=ignore_error))) == expectation_tsv: break time.sleep(sleep_time) except Exception as ex: print(("assert_eq_with_retry retry {} exception {}".format(i + 1, ex))) time.sleep(sleep_time) else: - val = TSV(instance.query(query, user=user, stdin=stdin, timeout=timeout, settings=settings, - ignore_error=ignore_error)) + val = TSV(get_result(instance.query(query, user=user, stdin=stdin, timeout=timeout, settings=settings, + ignore_error=ignore_error))) if expectation_tsv != val: raise AssertionError("'{}' != '{}'\n{}".format(expectation_tsv, val, '\n'.join( expectation_tsv.diff(val, n1="expectation", n2="query")))) diff --git a/tests/integration/test_replicated_database/__init__.py b/tests/integration/test_replicated_database/__init__.py new file mode 100644 index 00000000000..e69de29bb2d diff --git a/tests/integration/test_replicated_database/test.py b/tests/integration/test_replicated_database/test.py index 346114cb8c4..372ac7a7c3e 100644 --- a/tests/integration/test_replicated_database/test.py +++ 
b/tests/integration/test_replicated_database/test.py @@ -1,20 +1,24 @@ import time -import logging - +import re import pytest from helpers.cluster import ClickHouseCluster - -logging.getLogger().setLevel(logging.INFO) -logging.getLogger().addHandler(logging.StreamHandler()) +from helpers.test_tools import assert_eq_with_retry cluster = ClickHouseCluster(__file__) -main_node = cluster.add_instance('main_node', main_configs=['configs/disable_snapshots.xml'], with_zookeeper=True, stay_alive=True) -dummy_node = cluster.add_instance('dummy_node', main_configs=['configs/disable_snapshots.xml'], with_zookeeper=True) -competing_node = cluster.add_instance('competing_node', main_configs=['configs/disable_snapshots.xml'], with_zookeeper=True) -snapshotting_node = cluster.add_instance('snapshotting_node', main_configs=['configs/snapshot_each_query.xml'], with_zookeeper=True) -snapshot_recovering_node = cluster.add_instance('snapshot_recovering_node', main_configs=['configs/disable_snapshots.xml'], with_zookeeper=True) +main_node = cluster.add_instance('main_node', main_configs=['configs/disable_snapshots.xml'], with_zookeeper=True, stay_alive=True, macros={"shard": 1, "replica": 1}) +dummy_node = cluster.add_instance('dummy_node', main_configs=['configs/disable_snapshots.xml'], with_zookeeper=True, macros={"shard": 1, "replica": 2}) +competing_node = cluster.add_instance('competing_node', main_configs=['configs/disable_snapshots.xml'], with_zookeeper=True, macros={"shard": 1, "replica": 3}) +snapshotting_node = cluster.add_instance('snapshotting_node', main_configs=['configs/snapshot_each_query.xml'], with_zookeeper=True, macros={"shard": 2, "replica": 1}) +snapshot_recovering_node = cluster.add_instance('snapshot_recovering_node', main_configs=['configs/disable_snapshots.xml'], with_zookeeper=True, macros={"shard": 2, "replica": 2}) + +uuid_regex = re.compile("[0-9a-f]{8}\-[0-9a-f]{4}\-[0-9a-f]{4}\-[0-9a-f]{4}\-[0-9a-f]{12}") +def assert_create_query(nodes, table_name, expected): + replace_uuid = lambda x: re.sub(uuid_regex, "uuid", x) + query = "show create table testdb.{}".format(table_name) + for node in nodes: + assert_eq_with_retry(node, query, expected, get_result=replace_uuid) @pytest.fixture(scope="module") def started_cluster(): @@ -27,17 +31,25 @@ def started_cluster(): finally: cluster.shutdown() +#TODO better tests def test_create_replicated_table(started_cluster): - DURATION_SECONDS = 1 - main_node.query("CREATE TABLE testdb.replicated_table (d Date, k UInt64, i32 Int32) ENGINE=ReplicatedMergeTree(d, k, 8192);") + #FIXME should fail (replicated with old syntax) + #main_node.query("CREATE TABLE testdb.replicated_table (d Date, k UInt64, i32 Int32) ENGINE=ReplicatedMergeTree(d, k, 8192);") + main_node.query("CREATE TABLE testdb.replicated_table (d Date, k UInt64, i32 Int32) ENGINE=ReplicatedMergeTree ORDER BY k PARTITION BY toYYYYMM(d);") - time.sleep(DURATION_SECONDS) - assert main_node.query("desc table testdb.replicated_table") == dummy_node.query("desc table testdb.replicated_table") + expected = "CREATE TABLE testdb.replicated_table\\n(\\n `d` Date,\\n `k` UInt64,\\n `i32` Int32\\n)\\n" \ + "ENGINE = ReplicatedMergeTree(\\'/clickhouse/tables/uuid/{shard}\\', \\'{replica}\\')\\n" \ + "PARTITION BY toYYYYMM(d)\\nORDER BY k\\nSETTINGS index_granularity = 8192" + assert_create_query([main_node, dummy_node], "replicated_table", expected) + # assert without replacing uuid + assert main_node.query("show create testdb.replicated_table") == dummy_node.query("show create 
testdb.replicated_table") def test_simple_alter_table(started_cluster): - DURATION_SECONDS = 1 - main_node.query("CREATE TABLE testdb.alter_test (CounterID UInt32, StartDate Date, UserID UInt32, VisitID UInt32, NestedColumn Nested(A UInt8, S String), ToDrop UInt32) ENGINE = MergeTree(StartDate, intHash32(UserID), (CounterID, StartDate, intHash32(UserID), VisitID), 8192);") + #TODO add test with ReplicatedMergeTree + main_node.query("CREATE TABLE testdb.alter_test " + "(CounterID UInt32, StartDate Date, UserID UInt32, VisitID UInt32, NestedColumn Nested(A UInt8, S String), ToDrop UInt32) " + "ENGINE = MergeTree(StartDate, intHash32(UserID), (CounterID, StartDate, intHash32(UserID), VisitID), 8192);") main_node.query("ALTER TABLE testdb.alter_test ADD COLUMN Added0 UInt32;") main_node.query("ALTER TABLE testdb.alter_test ADD COLUMN Added2 UInt32;") main_node.query("ALTER TABLE testdb.alter_test ADD COLUMN Added1 UInt32 AFTER Added0;") @@ -45,48 +57,37 @@ def test_simple_alter_table(started_cluster): main_node.query("ALTER TABLE testdb.alter_test ADD COLUMN AddedNested1.C Array(String) AFTER AddedNested1.B;") main_node.query("ALTER TABLE testdb.alter_test ADD COLUMN AddedNested2 Nested(A UInt32, B UInt64) AFTER AddedNested1;") - time.sleep(DURATION_SECONDS) + expected = "CREATE TABLE testdb.alter_test\\n(\\n `CounterID` UInt32,\\n `StartDate` Date,\\n `UserID` UInt32,\\n" \ + " `VisitID` UInt32,\\n `NestedColumn.A` Array(UInt8),\\n `NestedColumn.S` Array(String),\\n" \ + " `ToDrop` UInt32,\\n `Added0` UInt32,\\n `Added1` UInt32,\\n `Added2` UInt32,\\n" \ + " `AddedNested1.A` Array(UInt32),\\n `AddedNested1.B` Array(UInt64),\\n `AddedNested1.C` Array(String),\\n" \ + " `AddedNested2.A` Array(UInt32),\\n `AddedNested2.B` Array(UInt64)\\n)\\n" \ + "ENGINE = MergeTree(StartDate, intHash32(UserID), (CounterID, StartDate, intHash32(UserID), VisitID), 8192)" - schema = main_node.query("show create table testdb.alter_test") - fields = [ - "`CounterID`", - "`StartDate`", - "`UserID`", - "`VisitID`", - "`NestedColumn.A`", - "`NestedColumn.S`", - "`ToDrop`", - "`Added0`", - "`Added1`", - "`Added2`", - "`AddedNested1.A`", - "`AddedNested1.B`", - "`AddedNested1.C`", - "`AddedNested2.A`", - "`AddedNested2.B`"] - - for field in fields: - assert field in schema - - assert main_node.query("desc table testdb.alter_test") == dummy_node.query("desc table testdb.alter_test") + assert_create_query([main_node, dummy_node], "alter_test", expected) def test_create_replica_after_delay(started_cluster): competing_node.query("CREATE DATABASE testdb ENGINE = Replicated('/clickhouse/databases/test1', 'replica3');") - main_node.query("ALTER TABLE testdb.alter_test ADD COLUMN Added3 UInt32 ;") - main_node.query("ALTER TABLE testdb.alter_test ADD COLUMN Added4 UInt32 ;") - main_node.query("ALTER TABLE testdb.alter_test ADD COLUMN Added5 UInt32 ;") + main_node.query("ALTER TABLE testdb.alter_test ADD COLUMN Added3 UInt32;") + main_node.query("ALTER TABLE testdb.alter_test DROP COLUMN AddedNested1;") + main_node.query("ALTER TABLE testdb.alter_test RENAME COLUMN Added1 TO AddedNested1;") - time.sleep(6) + expected = "CREATE TABLE testdb.alter_test\\n(\\n `CounterID` UInt32,\\n `StartDate` Date,\\n `UserID` UInt32,\\n" \ + " `VisitID` UInt32,\\n `NestedColumn.A` Array(UInt8),\\n `NestedColumn.S` Array(String),\\n" \ + " `ToDrop` UInt32,\\n `Added0` UInt32,\\n `AddedNested1` UInt32,\\n `Added2` UInt32,\\n" \ + " `AddedNested2.A` Array(UInt32),\\n `AddedNested2.B` Array(UInt64),\\n `Added3` UInt32\\n)\\n" \ + "ENGINE = 
MergeTree(StartDate, intHash32(UserID), (CounterID, StartDate, intHash32(UserID), VisitID), 8192)" - assert competing_node.query("desc table testdb.alter_test") == main_node.query("desc table testdb.alter_test") + assert_create_query([main_node, dummy_node, competing_node], "alter_test", expected) def test_alters_from_different_replicas(started_cluster): - DURATION_SECONDS = 1 + main_node.query("CREATE TABLE testdb.concurrent_test " + "(CounterID UInt32, StartDate Date, UserID UInt32, VisitID UInt32, NestedColumn Nested(A UInt8, S String), ToDrop UInt32) " + "ENGINE = MergeTree(StartDate, intHash32(UserID), (CounterID, StartDate, intHash32(UserID), VisitID), 8192);") - main_node.query("CREATE TABLE testdb.concurrent_test (CounterID UInt32, StartDate Date, UserID UInt32, VisitID UInt32, NestedColumn Nested(A UInt8, S String), ToDrop UInt32) ENGINE = MergeTree(StartDate, intHash32(UserID), (CounterID, StartDate, intHash32(UserID), VisitID), 8192);") - - time.sleep(DURATION_SECONDS) + time.sleep(1) #FIXME + dummy_node.kill_clickhouse(stop_start_wait_sec=0) competing_node.query("ALTER TABLE testdb.concurrent_test ADD COLUMN Added0 UInt32;") main_node.query("ALTER TABLE testdb.concurrent_test ADD COLUMN Added2 UInt32;") @@ -95,31 +96,53 @@ def test_alters_from_different_replicas(started_cluster): competing_node.query("ALTER TABLE testdb.concurrent_test ADD COLUMN AddedNested1.C Array(String) AFTER AddedNested1.B;") main_node.query("ALTER TABLE testdb.concurrent_test ADD COLUMN AddedNested2 Nested(A UInt32, B UInt64) AFTER AddedNested1;") - time.sleep(DURATION_SECONDS) + expected = "CREATE TABLE testdb.concurrent_test\\n(\\n `CounterID` UInt32,\\n `StartDate` Date,\\n `UserID` UInt32,\\n" \ + " `VisitID` UInt32,\\n `NestedColumn.A` Array(UInt8),\\n `NestedColumn.S` Array(String),\\n `ToDrop` UInt32,\\n" \ + " `Added0` UInt32,\\n `Added1` UInt32,\\n `Added2` UInt32,\\n `AddedNested1.A` Array(UInt32),\\n" \ + " `AddedNested1.B` Array(UInt64),\\n `AddedNested1.C` Array(String),\\n `AddedNested2.A` Array(UInt32),\\n" \ + " `AddedNested2.B` Array(UInt64)\\n)\\n" \ + "ENGINE = MergeTree(StartDate, intHash32(UserID), (CounterID, StartDate, intHash32(UserID), VisitID), 8192)" - assert competing_node.query("desc table testdb.concurrent_test") == main_node.query("desc table testdb.concurrent_test") + assert_create_query([main_node, competing_node], "concurrent_test", expected) def test_drop_and_create_table(started_cluster): main_node.query("DROP TABLE testdb.concurrent_test") - main_node.query("CREATE TABLE testdb.concurrent_test (CounterID UInt32, StartDate Date, UserID UInt32, VisitID UInt32, NestedColumn Nested(A UInt8, S String), ToDrop UInt32) ENGINE = MergeTree(StartDate, intHash32(UserID), (CounterID, StartDate, intHash32(UserID), VisitID), 8192);") - time.sleep(5) - assert competing_node.query("desc table testdb.concurrent_test") == main_node.query("desc table testdb.concurrent_test") + main_node.query("CREATE TABLE testdb.concurrent_test " + "(CounterID UInt32, StartDate Date, UserID UInt32, VisitID UInt32, NestedColumn Nested(A UInt8, S String), ToDrop UInt32) " + "ENGINE = MergeTree(StartDate, intHash32(UserID), (CounterID, StartDate, intHash32(UserID), VisitID), 8192);") + + expected = "CREATE TABLE testdb.concurrent_test\\n(\\n `CounterID` UInt32,\\n `StartDate` Date,\\n `UserID` UInt32,\\n" \ + " `VisitID` UInt32,\\n `NestedColumn.A` Array(UInt8),\\n `NestedColumn.S` Array(String),\\n `ToDrop` UInt32\\n)\\n" \ + "ENGINE = MergeTree(StartDate, intHash32(UserID), (CounterID, StartDate, 
intHash32(UserID), VisitID), 8192)" + + assert_create_query([main_node, competing_node], "concurrent_test", expected) def test_replica_restart(started_cluster): main_node.restart_clickhouse() - time.sleep(5) - assert competing_node.query("desc table testdb.concurrent_test") == main_node.query("desc table testdb.concurrent_test") + + expected = "CREATE TABLE testdb.concurrent_test\\n(\\n `CounterID` UInt32,\\n `StartDate` Date,\\n `UserID` UInt32,\\n" \ + " `VisitID` UInt32,\\n `NestedColumn.A` Array(UInt8),\\n `NestedColumn.S` Array(String),\\n `ToDrop` UInt32\\n)\\n" \ + "ENGINE = MergeTree(StartDate, intHash32(UserID), (CounterID, StartDate, intHash32(UserID), VisitID), 8192)" + + assert_create_query([main_node, competing_node], "concurrent_test", expected) def test_snapshot_and_snapshot_recover(started_cluster): + #FIXME bad test snapshotting_node.query("CREATE DATABASE testdb ENGINE = Replicated('/clickhouse/databases/test1', 'replica4');") time.sleep(5) snapshot_recovering_node.query("CREATE DATABASE testdb ENGINE = Replicated('/clickhouse/databases/test1', 'replica5');") time.sleep(5) assert snapshotting_node.query("desc table testdb.alter_test") == snapshot_recovering_node.query("desc table testdb.alter_test") -#def test_drop_and_create_replica(started_cluster): -# main_node.query("DROP DATABASE testdb") -# main_node.query("CREATE DATABASE testdb ENGINE = Replicated('/clickhouse/databases/test1', 'replica1');") -# time.sleep(6) -# assert competing_node.query("desc table testdb.concurrent_test") == main_node.query("desc table testdb.concurrent_test") +def test_drop_and_create_replica(started_cluster): + main_node.query("DROP DATABASE testdb") + main_node.query("CREATE DATABASE testdb ENGINE = Replicated('/clickhouse/databases/test1', 'replica1');") + + expected = "CREATE TABLE testdb.concurrent_test\\n(\\n `CounterID` UInt32,\\n `StartDate` Date,\\n `UserID` UInt32,\\n" \ + " `VisitID` UInt32,\\n `NestedColumn.A` Array(UInt8),\\n `NestedColumn.S` Array(String),\\n `ToDrop` UInt32\\n)\\n" \ + "ENGINE = MergeTree(StartDate, intHash32(UserID), (CounterID, StartDate, intHash32(UserID), VisitID), 8192)" + + assert_create_query([main_node, competing_node], "concurrent_test", expected) + +#TODO tests with Distributed From d8ae9fcdb4aea22a83d6fc917ec9d070d2780470 Mon Sep 17 00:00:00 2001 From: Alexander Tokmakov Date: Tue, 27 Oct 2020 12:19:45 +0300 Subject: [PATCH 045/381] fixes, add shard name --- src/Common/ZooKeeper/ZooKeeper.cpp | 17 -------------- src/Common/ZooKeeper/ZooKeeper.h | 5 ----- src/Databases/DatabaseFactory.cpp | 12 +++++----- src/Databases/DatabaseReplicated.cpp | 33 +++++++++++++++++++++------- src/Databases/DatabaseReplicated.h | 14 +++++++----- src/Databases/IDatabase.h | 20 ++++++++--------- src/Interpreters/DDLWorker.cpp | 1 + 7 files changed, 52 insertions(+), 50 deletions(-) diff --git a/src/Common/ZooKeeper/ZooKeeper.cpp b/src/Common/ZooKeeper/ZooKeeper.cpp index f4174faf057..bee875d1c74 100644 --- a/src/Common/ZooKeeper/ZooKeeper.cpp +++ b/src/Common/ZooKeeper/ZooKeeper.cpp @@ -588,23 +588,6 @@ void ZooKeeper::removeChildren(const std::string & path) } -void ZooKeeper::tryRemoveChildren(const std::string & path) -{ - Strings children; - if (tryGetChildren(path, children) != Coordination::Error::ZOK) - return; - while (!children.empty()) - { - Coordination::Requests ops; - for (size_t i = 0; i < MULTI_BATCH_SIZE && !children.empty(); ++i) - { - ops.emplace_back(makeRemoveRequest(path + "/" + children.back(), -1)); - children.pop_back(); - } - multi(ops); - } -} - 
void ZooKeeper::removeChildrenRecursive(const std::string & path) { Strings children = getChildren(path); diff --git a/src/Common/ZooKeeper/ZooKeeper.h b/src/Common/ZooKeeper/ZooKeeper.h index bbe3787197a..1ad744102c6 100644 --- a/src/Common/ZooKeeper/ZooKeeper.h +++ b/src/Common/ZooKeeper/ZooKeeper.h @@ -189,11 +189,6 @@ public: /// Remove all children nodes (non recursive). void removeChildren(const std::string & path); - /// Remove all children nodes (non recursive). - /// If there're no children for the given path, - /// this method does not throw an exception. - void tryRemoveChildren(const std::string & path); - using WaitCondition = std::function; /// Wait for the node to disappear or return immediately if it doesn't exist. diff --git a/src/Databases/DatabaseFactory.cpp b/src/Databases/DatabaseFactory.cpp index 5afa0b216ac..7758fe0bddc 100644 --- a/src/Databases/DatabaseFactory.cpp +++ b/src/Databases/DatabaseFactory.cpp @@ -169,15 +169,17 @@ DatabasePtr DatabaseFactory::getImpl(const ASTCreateQuery & create, const String { const ASTFunction * engine = engine_define->engine; - if (!engine->arguments || engine->arguments->children.size() != 2) - throw Exception("Replicated database requires zoo_path and replica_name arguments", ErrorCodes::BAD_ARGUMENTS); + if (!engine->arguments || engine->arguments->children.size() != 3) + throw Exception("Replicated database requires 3 arguments: zookeeper path, shard name and replica name", ErrorCodes::BAD_ARGUMENTS); const auto & arguments = engine->arguments->children; - const auto & zoo_path = safeGetLiteralValue(arguments[0], "Replicated"); - const auto & replica_name = safeGetLiteralValue(arguments[1], "Replicated"); + //TODO allow macros in arguments + const auto & zookeeper_path = safeGetLiteralValue(arguments[0], "Replicated"); + const auto & shard_name = safeGetLiteralValue(arguments[1], "Replicated"); + const auto & replica_name = safeGetLiteralValue(arguments[2], "Replicated"); - return std::make_shared(database_name, metadata_path, uuid, zoo_path, replica_name, context); + return std::make_shared(database_name, metadata_path, uuid, zookeeper_path, shard_name, replica_name, context); } throw Exception("Unknown database engine: " + engine_name, ErrorCodes::UNKNOWN_DATABASE_ENGINE); diff --git a/src/Databases/DatabaseReplicated.cpp b/src/Databases/DatabaseReplicated.cpp index 7fb7be61d35..145b3abba00 100644 --- a/src/Databases/DatabaseReplicated.cpp +++ b/src/Databases/DatabaseReplicated.cpp @@ -24,6 +24,7 @@ namespace ErrorCodes extern const int BAD_ARGUMENTS; } +//FIXME never used void DatabaseReplicated::setZooKeeper(zkutil::ZooKeeperPtr zookeeper) { std::lock_guard lock(current_zookeeper_mutex); @@ -50,16 +51,16 @@ DatabaseReplicated::DatabaseReplicated( const String & metadata_path_, UUID uuid, const String & zookeeper_path_, + const String & shard_name_, const String & replica_name_, Context & context_) : DatabaseAtomic(name_, metadata_path_, uuid, "DatabaseReplicated (" + name_ + ")", context_) , zookeeper_path(zookeeper_path_) + , shard_name(shard_name_) , replica_name(replica_name_) { - if (zookeeper_path.empty() || replica_name.empty()) - { - throw Exception("ZooKeeper path and replica name must be non-empty", ErrorCodes::BAD_ARGUMENTS); - } + if (zookeeper_path.empty() || shard_name.empty() || replica_name.empty()) + throw Exception("ZooKeeper path and shard and replica names must be non-empty", ErrorCodes::BAD_ARGUMENTS); if (zookeeper_path.back() == '/') zookeeper_path.resize(zookeeper_path.size() - 1); @@ -79,10 +80,12 @@ 
DatabaseReplicated::DatabaseReplicated( /// New database if (!current_zookeeper->exists(zookeeper_path)) { - createDatabaseZKNodes(); - /// Old replica recovery + createDatabaseZooKeeperNodes(); } - else if (current_zookeeper->exists(zookeeper_path + "/replicas/" + replica_name)) + + /// Attach existing replica + //TODO better protection from wrong replica names + if (current_zookeeper->exists(zookeeper_path + "/replicas/" + replica_name)) { String remote_last_entry = current_zookeeper->get(zookeeper_path + "/replicas/" + replica_name, {}, nullptr); @@ -106,17 +109,23 @@ DatabaseReplicated::DatabaseReplicated( } else { + //FIXME throw Exception( "Replica name might be in use by a different node. Please check replica_name parameter. Remove .last_entry file from " "metadata to create a new replica.", ErrorCodes::LOGICAL_ERROR); } } + else + { + createReplicaZooKeeperNodes(); + } snapshot_period = context_.getConfigRef().getInt("database_replicated_snapshot_period", 10); feedback_timeout = context_.getConfigRef().getInt("database_replicated_feedback_timeout", 0); LOG_DEBUG(log, "Snapshot period is set to {} log entries per one snapshot", snapshot_period); + //TODO do we need separate pool? background_log_executor = context_.getReplicatedSchedulePool().createTask( database_name + "(DatabaseReplicated::background_executor)", [this] { runBackgroundLogExecutor(); } ); @@ -124,7 +133,7 @@ DatabaseReplicated::DatabaseReplicated( background_log_executor->scheduleAfter(500); } -void DatabaseReplicated::createDatabaseZKNodes() +void DatabaseReplicated::createDatabaseZooKeeperNodes() { current_zookeeper = getZooKeeper(); @@ -136,6 +145,11 @@ void DatabaseReplicated::createDatabaseZKNodes() current_zookeeper->createIfNotExists(zookeeper_path + "/replicas", String()); } +void DatabaseReplicated::createReplicaZooKeeperNodes() +{ + current_zookeeper->create(zookeeper_path + "/replicas/" + replica_name, "", zkutil::CreateMode::Persistent); +} + void DatabaseReplicated::removeOutdatedSnapshotsAndLog() { /// This method removes all snapshots and logged queries @@ -151,6 +165,9 @@ void DatabaseReplicated::removeOutdatedSnapshotsAndLog() /// to a greater one than the least advanced current replica. current_zookeeper = getZooKeeper(); Strings replica_states = current_zookeeper->getChildren(zookeeper_path + "/replicas"); + //TODO do not use log pointers to determine which entries to remove if there are staled pointers. + // We can just remove all entries older than previous snapshot version. + // Possible invariant: store all entries since last snapshot, replica becomes lost when it cannot get log entry. 
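    // A rough sketch of that simpler rule (everything older than the previous snapshot is
    // removable). This is not part of the patch; it assumes snapshot and log entry znode
    // names are zero-padded sequential names, so lexicographic order matches creation order:
    //
    //     Strings old_snapshots = current_zookeeper->getChildren(zookeeper_path + "/snapshots");
    //     std::sort(old_snapshots.begin(), old_snapshots.end());
    //     if (old_snapshots.size() >= 2)
    //     {
    //         const String & prev_snapshot = old_snapshots[old_snapshots.size() - 2];
    //         for (const String & entry : current_zookeeper->getChildren(zookeeper_path + "/log"))
    //             if (entry < prev_snapshot)
    //                 current_zookeeper->tryRemove(zookeeper_path + "/log/" + entry);
    //     }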
auto least_advanced = std::min_element(replica_states.begin(), replica_states.end()); Strings snapshots = current_zookeeper->getChildren(zookeeper_path + "/snapshots"); diff --git a/src/Databases/DatabaseReplicated.h b/src/Databases/DatabaseReplicated.h index 62997e953ac..375118e7356 100644 --- a/src/Databases/DatabaseReplicated.h +++ b/src/Databases/DatabaseReplicated.h @@ -35,7 +35,9 @@ namespace DB class DatabaseReplicated : public DatabaseAtomic { public: - DatabaseReplicated(const String & name_, const String & metadata_path_, UUID uuid, const String & zookeeper_path_, const String & replica_name_, Context & context); + DatabaseReplicated(const String & name_, const String & metadata_path_, UUID uuid, + const String & zookeeper_path_, const String & shard_name_, const String & replica_name_, + Context & context); void drop(const Context & /*context*/) override; @@ -45,11 +47,9 @@ public: BlockIO getFeedback(); - String zookeeper_path; - String replica_name; - private: - void createDatabaseZKNodes(); + void createDatabaseZooKeeperNodes(); + void createReplicaZooKeeperNodes(); void runBackgroundLogExecutor(); void executeLogName(const String &); @@ -59,6 +59,10 @@ private: void createSnapshot(); void removeOutdatedSnapshotsAndLog(); + String zookeeper_path; + String shard_name; + String replica_name; + std::unique_ptr current_context; // to run executeQuery std::mutex log_name_mutex; diff --git a/src/Databases/IDatabase.h b/src/Databases/IDatabase.h index eeb69a97092..393e8f2d10c 100644 --- a/src/Databases/IDatabase.h +++ b/src/Databases/IDatabase.h @@ -197,7 +197,7 @@ public: const StoragePtr & /*table*/, const ASTPtr & /*query*/) { - throw Exception("There is no CREATE TABLE query for Database " + getEngineName(), ErrorCodes::NOT_IMPLEMENTED); + throw Exception("There is no CREATE TABLE query for Database" + getEngineName(), ErrorCodes::NOT_IMPLEMENTED); } /// Add the dictionary to the database. Record its presence in the metadata. @@ -206,7 +206,7 @@ public: const String & /*dictionary_name*/, const ASTPtr & /*query*/) { - throw Exception("There is no CREATE DICTIONARY query for Database " + getEngineName(), ErrorCodes::NOT_IMPLEMENTED); + throw Exception("There is no CREATE DICTIONARY query for Database" + getEngineName(), ErrorCodes::NOT_IMPLEMENTED); } /// Delete the table from the database, drop table and delete the metadata. @@ -215,7 +215,7 @@ public: const String & /*name*/, [[maybe_unused]] bool no_delay = false) { - throw Exception("There is no DROP TABLE query for Database " + getEngineName(), ErrorCodes::NOT_IMPLEMENTED); + throw Exception("There is no DROP TABLE query for Database" + getEngineName(), ErrorCodes::NOT_IMPLEMENTED); } /// Delete the dictionary from the database. Delete the metadata. @@ -223,32 +223,32 @@ public: const Context & /*context*/, const String & /*dictionary_name*/) { - throw Exception("There is no DROP DICTIONARY query for Database " + getEngineName(), ErrorCodes::NOT_IMPLEMENTED); + throw Exception("There is no DROP DICTIONARY query for Database" + getEngineName(), ErrorCodes::NOT_IMPLEMENTED); } /// Add a table to the database, but do not add it to the metadata. The database may not support this method. 
virtual void attachTable(const String & /*name*/, const StoragePtr & /*table*/, [[maybe_unused]] const String & relative_table_path = {}) { - throw Exception("There is no ATTACH TABLE query for Database " + getEngineName(), ErrorCodes::NOT_IMPLEMENTED); + throw Exception("There is no ATTACH TABLE query for Database" + getEngineName(), ErrorCodes::NOT_IMPLEMENTED); } /// Add dictionary to the database, but do not add it to the metadata. The database may not support this method. /// If dictionaries_lazy_load is false it also starts loading the dictionary asynchronously. virtual void attachDictionary(const String & /* dictionary_name */, const DictionaryAttachInfo & /* attach_info */) { - throw Exception("There is no ATTACH DICTIONARY query for Database " + getEngineName(), ErrorCodes::NOT_IMPLEMENTED); + throw Exception("There is no ATTACH DICTIONARY query for Database" + getEngineName(), ErrorCodes::NOT_IMPLEMENTED); } /// Forget about the table without deleting it, and return it. The database may not support this method. virtual StoragePtr detachTable(const String & /*name*/) { - throw Exception("There is no DETACH TABLE query for Database " + getEngineName(), ErrorCodes::NOT_IMPLEMENTED); + throw Exception("There is no DETACH TABLE query for Database" + getEngineName(), ErrorCodes::NOT_IMPLEMENTED); } /// Forget about the dictionary without deleting it. The database may not support this method. virtual void detachDictionary(const String & /*name*/) { - throw Exception("There is no DETACH DICTIONARY query for Database " + getEngineName(), ErrorCodes::NOT_IMPLEMENTED); + throw Exception("There is no DETACH DICTIONARY query for Database" + getEngineName(), ErrorCodes::NOT_IMPLEMENTED); } /// Rename the table and possibly move the table to another database. @@ -352,14 +352,14 @@ protected: virtual ASTPtr getCreateTableQueryImpl(const String & /*name*/, const Context & /*context*/, bool throw_on_error) const { if (throw_on_error) - throw Exception("There is no SHOW CREATE TABLE query for Database " + getEngineName(), ErrorCodes::CANNOT_GET_CREATE_TABLE_QUERY); + throw Exception("There is no SHOW CREATE TABLE query for Database" + getEngineName(), ErrorCodes::CANNOT_GET_CREATE_TABLE_QUERY); return nullptr; } virtual ASTPtr getCreateDictionaryQueryImpl(const String & /*name*/, bool throw_on_error) const { if (throw_on_error) - throw Exception("There is no SHOW CREATE DICTIONARY query for Database " + getEngineName(), ErrorCodes::CANNOT_GET_CREATE_DICTIONARY_QUERY); + throw Exception("There is no SHOW CREATE DICTIONARY query for Database" + getEngineName(), ErrorCodes::CANNOT_GET_CREATE_DICTIONARY_QUERY); return nullptr; } diff --git a/src/Interpreters/DDLWorker.cpp b/src/Interpreters/DDLWorker.cpp index 32d0e25bde5..4e2dcc98767 100644 --- a/src/Interpreters/DDLWorker.cpp +++ b/src/Interpreters/DDLWorker.cpp @@ -759,6 +759,7 @@ void DDLWorker::processTask(DDLTask & task) else if (code == Coordination::Error::ZNONODE) { /// There is no parent + //TODO why not to create parent before active_node? 
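            // One way to act on the TODO above (a sketch, not part of this patch; it assumes
            // createStatusDirs is idempotent, i.e. it only issues create-if-not-exists requests):
            // build the status dirs before the first attempt to create the ephemeral node, so the
            // ZNONODE retry branch is no longer needed:
            //
            //     createStatusDirs(task.entry_path, zookeeper);
            //     auto create_code = zookeeper->tryCreate(active_node_path, "", zkutil::CreateMode::Ephemeral, dummy);
            //     if (create_code != Coordination::Error::ZOK)
            //         throw Coordination::Exception(create_code, active_node_path);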
createStatusDirs(task.entry_path, zookeeper); if (Coordination::Error::ZOK != zookeeper->tryCreate(active_node_path, "", zkutil::CreateMode::Ephemeral, dummy)) throw Coordination::Exception(code, active_node_path); From cbcdee0cf9f735e9c8545f32fe73579d01bbb9a5 Mon Sep 17 00:00:00 2001 From: Alexander Tokmakov Date: Tue, 3 Nov 2020 16:47:26 +0300 Subject: [PATCH 046/381] split DDLWorker.cpp --- src/Interpreters/DDLTask.cpp | 81 +++ src/Interpreters/DDLTask.h | 88 ++++ src/Interpreters/DDLWorker.cpp | 479 +----------------- src/Interpreters/DDLWorker.h | 22 +- src/Interpreters/InterpreterAlterQuery.cpp | 2 +- src/Interpreters/InterpreterCreateQuery.cpp | 3 +- .../InterpreterCreateQuotaQuery.cpp | 2 +- .../InterpreterCreateRoleQuery.cpp | 2 +- .../InterpreterCreateRowPolicyQuery.cpp | 2 +- .../InterpreterCreateSettingsProfileQuery.cpp | 2 +- .../InterpreterCreateUserQuery.cpp | 2 +- .../InterpreterDropAccessEntityQuery.cpp | 2 +- src/Interpreters/InterpreterDropQuery.cpp | 2 +- src/Interpreters/InterpreterGrantQuery.cpp | 2 +- .../InterpreterKillQueryQuery.cpp | 2 +- src/Interpreters/InterpreterOptimizeQuery.cpp | 2 +- src/Interpreters/InterpreterRenameQuery.cpp | 2 +- src/Interpreters/InterpreterSystemQuery.cpp | 2 +- src/Interpreters/executeDDLQueryOnCluster.cpp | 317 ++++++++++++ src/Interpreters/executeDDLQueryOnCluster.h | 63 +++ src/Interpreters/ya.make | 2 + 21 files changed, 576 insertions(+), 505 deletions(-) create mode 100644 src/Interpreters/DDLTask.cpp create mode 100644 src/Interpreters/DDLTask.h create mode 100644 src/Interpreters/executeDDLQueryOnCluster.cpp create mode 100644 src/Interpreters/executeDDLQueryOnCluster.h diff --git a/src/Interpreters/DDLTask.cpp b/src/Interpreters/DDLTask.cpp new file mode 100644 index 00000000000..dfb8f5ff746 --- /dev/null +++ b/src/Interpreters/DDLTask.cpp @@ -0,0 +1,81 @@ +#include +#include +#include +#include +#include +#include +#include +#include + +namespace DB +{ + +namespace ErrorCodes +{ + extern const int UNKNOWN_FORMAT_VERSION; +} + +HostID HostID::fromString(const String & host_port_str) +{ + HostID res; + std::tie(res.host_name, res.port) = Cluster::Address::fromString(host_port_str); + return res; +} + +bool HostID::isLocalAddress(UInt16 clickhouse_port) const +{ + try + { + return DB::isLocalAddress(DNSResolver::instance().resolveAddress(host_name, port), clickhouse_port); + } + catch (const Poco::Net::NetException &) + { + /// Avoid "Host not found" exceptions + return false; + } +} + + +String DDLLogEntry::toString() const +{ + WriteBufferFromOwnString wb; + + Strings host_id_strings(hosts.size()); + std::transform(hosts.begin(), hosts.end(), host_id_strings.begin(), HostID::applyToString); + + auto version = CURRENT_VERSION; + wb << "version: " << version << "\n"; + wb << "query: " << escape << query << "\n"; + wb << "hosts: " << host_id_strings << "\n"; + wb << "initiator: " << initiator << "\n"; + + return wb.str(); +} + +void DDLLogEntry::parse(const String & data) +{ + ReadBufferFromString rb(data); + + int version; + rb >> "version: " >> version >> "\n"; + + if (version != CURRENT_VERSION) + throw Exception(ErrorCodes::UNKNOWN_FORMAT_VERSION, "Unknown DDLLogEntry format version: {}", version); + + Strings host_id_strings; + rb >> "query: " >> escape >> query >> "\n"; + rb >> "hosts: " >> host_id_strings >> "\n"; + + if (!rb.eof()) + rb >> "initiator: " >> initiator >> "\n"; + else + initiator.clear(); + + assertEOF(rb); + + hosts.resize(host_id_strings.size()); + std::transform(host_id_strings.begin(), 
host_id_strings.end(), hosts.begin(), HostID::fromString); +} + + +} diff --git a/src/Interpreters/DDLTask.h b/src/Interpreters/DDLTask.h new file mode 100644 index 00000000000..51f09efd0bd --- /dev/null +++ b/src/Interpreters/DDLTask.h @@ -0,0 +1,88 @@ +#pragma once +#include +#include + + +namespace DB +{ + +class ASTQueryWithOnCluster; + +struct HostID +{ + String host_name; + UInt16 port; + + HostID() = default; + + explicit HostID(const Cluster::Address & address) + : host_name(address.host_name), port(address.port) {} + + static HostID fromString(const String & host_port_str); + + String toString() const + { + return Cluster::Address::toString(host_name, port); + } + + String readableString() const + { + return host_name + ":" + DB::toString(port); + } + + bool isLocalAddress(UInt16 clickhouse_port) const; + + static String applyToString(const HostID & host_id) + { + return host_id.toString(); + } +}; + + +struct DDLLogEntry +{ + String query; + std::vector hosts; + String initiator; // optional + + static constexpr int CURRENT_VERSION = 1; + + String toString() const; + + void parse(const String & data); +}; + + +struct DDLTask +{ + /// Stages of task lifetime correspond ordering of these data fields: + + /// Stage 1: parse entry + String entry_name; + String entry_path; + DDLLogEntry entry; + + /// Stage 2: resolve host_id and check that + HostID host_id; + String host_id_str; + + /// Stage 3.1: parse query + ASTPtr query; + ASTQueryWithOnCluster * query_on_cluster = nullptr; + + /// Stage 3.2: check cluster and find the host in cluster + String cluster_name; + ClusterPtr cluster; + Cluster::Address address_in_cluster; + size_t host_shard_num; + size_t host_replica_num; + + /// Stage 3.3: execute query + ExecutionStatus execution_status; + bool was_executed = false; + + /// Stage 4: commit results to ZooKeeper +}; + + +} diff --git a/src/Interpreters/DDLWorker.cpp b/src/Interpreters/DDLWorker.cpp index 4e2dcc98767..2c454db4787 100644 --- a/src/Interpreters/DDLWorker.cpp +++ b/src/Interpreters/DDLWorker.cpp @@ -1,4 +1,5 @@ #include +#include #include #include #include @@ -9,37 +10,21 @@ #include #include #include -#include #include #include -#include -#include #include #include -#include #include -#include -#include -#include -#include #include -#include #include #include #include #include -#include -#include -#include -#include -#include -#include -#include #include #include -#include #include #include +#include #include #include @@ -51,7 +36,6 @@ namespace ErrorCodes { extern const int NOT_IMPLEMENTED; extern const int LOGICAL_ERROR; - extern const int UNKNOWN_FORMAT_VERSION; extern const int INCONSISTENT_CLUSTER_DEFINITION; extern const int TIMEOUT_EXCEEDED; extern const int UNKNOWN_TYPE_OF_QUERY; @@ -60,141 +44,6 @@ namespace ErrorCodes } -namespace -{ - -struct HostID -{ - String host_name; - UInt16 port; - - HostID() = default; - - explicit HostID(const Cluster::Address & address) - : host_name(address.host_name), port(address.port) {} - - static HostID fromString(const String & host_port_str) - { - HostID res; - std::tie(res.host_name, res.port) = Cluster::Address::fromString(host_port_str); - return res; - } - - String toString() const - { - return Cluster::Address::toString(host_name, port); - } - - String readableString() const - { - return host_name + ":" + DB::toString(port); - } - - bool isLocalAddress(UInt16 clickhouse_port) const - { - try - { - return DB::isLocalAddress(DNSResolver::instance().resolveAddress(host_name, port), clickhouse_port); - } - catch (const 
Poco::Net::NetException &) - { - /// Avoid "Host not found" exceptions - return false; - } - } - - static String applyToString(const HostID & host_id) - { - return host_id.toString(); - } -}; - -} - - -struct DDLLogEntry -{ - String query; - std::vector hosts; - String initiator; // optional - - static constexpr int CURRENT_VERSION = 1; - - String toString() - { - WriteBufferFromOwnString wb; - - Strings host_id_strings(hosts.size()); - std::transform(hosts.begin(), hosts.end(), host_id_strings.begin(), HostID::applyToString); - - auto version = CURRENT_VERSION; - wb << "version: " << version << "\n"; - wb << "query: " << escape << query << "\n"; - wb << "hosts: " << host_id_strings << "\n"; - wb << "initiator: " << initiator << "\n"; - - return wb.str(); - } - - void parse(const String & data) - { - ReadBufferFromString rb(data); - - int version; - rb >> "version: " >> version >> "\n"; - - if (version != CURRENT_VERSION) - throw Exception(ErrorCodes::UNKNOWN_FORMAT_VERSION, "Unknown DDLLogEntry format version: {}", version); - - Strings host_id_strings; - rb >> "query: " >> escape >> query >> "\n"; - rb >> "hosts: " >> host_id_strings >> "\n"; - - if (!rb.eof()) - rb >> "initiator: " >> initiator >> "\n"; - else - initiator.clear(); - - assertEOF(rb); - - hosts.resize(host_id_strings.size()); - std::transform(host_id_strings.begin(), host_id_strings.end(), hosts.begin(), HostID::fromString); - } -}; - - -struct DDLTask -{ - /// Stages of task lifetime correspond ordering of these data fields: - - /// Stage 1: parse entry - String entry_name; - String entry_path; - DDLLogEntry entry; - - /// Stage 2: resolve host_id and check that - HostID host_id; - String host_id_str; - - /// Stage 3.1: parse query - ASTPtr query; - ASTQueryWithOnCluster * query_on_cluster = nullptr; - - /// Stage 3.2: check cluster and find the host in cluster - String cluster_name; - ClusterPtr cluster; - Cluster::Address address_in_cluster; - size_t host_shard_num; - size_t host_replica_num; - - /// Stage 3.3: execute query - ExecutionStatus execution_status; - bool was_executed = false; - - /// Stage 4: commit results to ZooKeeper -}; - - namespace { @@ -293,21 +142,6 @@ std::unique_ptr createSimpleZooKeeperLock( } -static bool isSupportedAlterType(int type) -{ - static const std::unordered_set unsupported_alter_types{ - ASTAlterCommand::ATTACH_PARTITION, - ASTAlterCommand::REPLACE_PARTITION, - ASTAlterCommand::FETCH_PARTITION, - ASTAlterCommand::FREEZE_PARTITION, - ASTAlterCommand::FREEZE_ALL, - ASTAlterCommand::NO_TYPE, - }; - - return unsupported_alter_types.count(type) == 0; -} - - DDLWorker::DDLWorker(int pool_size_, const std::string & zk_root_dir, Context & context_, const Poco::Util::AbstractConfiguration * config, const String & prefix) : context(context_) , log(&Poco::Logger::get("DDLWorker")) @@ -1187,313 +1021,4 @@ void DDLWorker::runCleanupThread() } -class DDLQueryStatusInputStream : public IBlockInputStream -{ -public: - - DDLQueryStatusInputStream(const String & zk_node_path, const DDLLogEntry & entry, const Context & context_) - : node_path(zk_node_path), context(context_), watch(CLOCK_MONOTONIC_COARSE), log(&Poco::Logger::get("DDLQueryStatusInputStream")) - { - sample = Block{ - {std::make_shared(), "host"}, - {std::make_shared(), "port"}, - {std::make_shared(), "status"}, - {std::make_shared(), "error"}, - {std::make_shared(), "num_hosts_remaining"}, - {std::make_shared(), "num_hosts_active"}, - }; - - for (const HostID & host: entry.hosts) - waiting_hosts.emplace(host.toString()); - - 
addTotalRowsApprox(entry.hosts.size()); - - timeout_seconds = context.getSettingsRef().distributed_ddl_task_timeout; - } - - String getName() const override - { - return "DDLQueryStatusInputStream"; - } - - Block getHeader() const override { return sample; } - - Block readImpl() override - { - Block res; - if (num_hosts_finished >= waiting_hosts.size()) - { - if (first_exception) - throw Exception(*first_exception); - - return res; - } - - auto zookeeper = context.getZooKeeper(); - size_t try_number = 0; - - while (res.rows() == 0) - { - if (isCancelled()) - { - if (first_exception) - throw Exception(*first_exception); - - return res; - } - - if (timeout_seconds >= 0 && watch.elapsedSeconds() > timeout_seconds) - { - size_t num_unfinished_hosts = waiting_hosts.size() - num_hosts_finished; - size_t num_active_hosts = current_active_hosts.size(); - - - throw Exception(ErrorCodes::TIMEOUT_EXCEEDED, - "Watching task {} is executing longer than distributed_ddl_task_timeout (={}) seconds. " - "There are {} unfinished hosts ({} of them are currently active), they are going to execute the query in background", - node_path, timeout_seconds, num_unfinished_hosts, num_active_hosts); - } - - if (num_hosts_finished != 0 || try_number != 0) - { - sleepForMilliseconds(std::min(1000, 50 * (try_number + 1))); - } - - /// TODO: add shared lock - if (!zookeeper->exists(node_path)) - { - throw Exception(ErrorCodes::UNFINISHED, - "Cannot provide query execution status. The query's node {} has been deleted by the cleaner since it was finished (or its lifetime is expired)", - node_path); - } - - Strings new_hosts = getNewAndUpdate(getChildrenAllowNoNode(zookeeper, node_path + "/finished")); - ++try_number; - if (new_hosts.empty()) - continue; - - current_active_hosts = getChildrenAllowNoNode(zookeeper, node_path + "/active"); - - MutableColumns columns = sample.cloneEmptyColumns(); - for (const String & host_id : new_hosts) - { - ExecutionStatus status(-1, "Cannot obtain error message"); - { - String status_data; - if (zookeeper->tryGet(node_path + "/finished/" + host_id, status_data)) - status.tryDeserializeText(status_data); - } - - auto [host, port] = Cluster::Address::fromString(host_id); - - if (status.code != 0 && first_exception == nullptr) - first_exception = std::make_unique(status.code, "There was an error on [{}:{}]: {}", host, port, status.message); - - ++num_hosts_finished; - - columns[0]->insert(host); - columns[1]->insert(port); - columns[2]->insert(status.code); - columns[3]->insert(status.message); - columns[4]->insert(waiting_hosts.size() - num_hosts_finished); - columns[5]->insert(current_active_hosts.size()); - } - res = sample.cloneWithColumns(std::move(columns)); - } - - return res; - } - - Block getSampleBlock() const - { - return sample.cloneEmpty(); - } - - ~DDLQueryStatusInputStream() override = default; - -private: - - static Strings getChildrenAllowNoNode(const std::shared_ptr & zookeeper, const String & node_path) - { - Strings res; - Coordination::Error code = zookeeper->tryGetChildren(node_path, res); - if (code != Coordination::Error::ZOK && code != Coordination::Error::ZNONODE) - throw Coordination::Exception(code, node_path); - return res; - } - - Strings getNewAndUpdate(const Strings & current_list_of_finished_hosts) - { - Strings diff; - for (const String & host : current_list_of_finished_hosts) - { - if (!waiting_hosts.count(host)) - { - if (!ignoring_hosts.count(host)) - { - ignoring_hosts.emplace(host); - LOG_INFO(log, "Unexpected host {} appeared in task {}", host, 
node_path); - } - continue; - } - - if (!finished_hosts.count(host)) - { - diff.emplace_back(host); - finished_hosts.emplace(host); - } - } - - return diff; - } - - String node_path; - const Context & context; - Stopwatch watch; - Poco::Logger * log; - - Block sample; - - NameSet waiting_hosts; /// hosts from task host list - NameSet finished_hosts; /// finished hosts from host list - NameSet ignoring_hosts; /// appeared hosts that are not in hosts list - Strings current_active_hosts; /// Hosts that were in active state at the last check - size_t num_hosts_finished = 0; - - /// Save the first detected error and throw it at the end of execution - std::unique_ptr first_exception; - - Int64 timeout_seconds = 120; -}; - - -BlockIO executeDDLQueryOnCluster(const ASTPtr & query_ptr_, const Context & context, AccessRightsElements && query_requires_access, bool query_requires_grant_option) -{ - /// Remove FORMAT and INTO OUTFILE if exists - ASTPtr query_ptr = query_ptr_->clone(); - ASTQueryWithOutput::resetOutputASTIfExist(*query_ptr); - - // XXX: serious design flaw since `ASTQueryWithOnCluster` is not inherited from `IAST`! - auto * query = dynamic_cast(query_ptr.get()); - if (!query) - { - throw Exception("Distributed execution is not supported for such DDL queries", ErrorCodes::NOT_IMPLEMENTED); - } - - if (!context.getSettingsRef().allow_distributed_ddl) - throw Exception("Distributed DDL queries are prohibited for the user", ErrorCodes::QUERY_IS_PROHIBITED); - - if (const auto * query_alter = query_ptr->as()) - { - for (const auto & command : query_alter->command_list->commands) - { - if (!isSupportedAlterType(command->type)) - throw Exception("Unsupported type of ALTER query", ErrorCodes::NOT_IMPLEMENTED); - } - } - - query->cluster = context.getMacros()->expand(query->cluster); - ClusterPtr cluster = context.getCluster(query->cluster); - DDLWorker & ddl_worker = context.getDDLWorker(); - - /// Enumerate hosts which will be used to send query. - Cluster::AddressesWithFailover shards = cluster->getShardsAddresses(); - std::vector hosts; - for (const auto & shard : shards) - { - for (const auto & addr : shard) - hosts.emplace_back(addr); - } - - if (hosts.empty()) - throw Exception("No hosts defined to execute distributed DDL query", ErrorCodes::LOGICAL_ERROR); - - /// The current database in a distributed query need to be replaced with either - /// the local current database or a shard's default database. 
- bool need_replace_current_database - = (std::find_if( - query_requires_access.begin(), - query_requires_access.end(), - [](const AccessRightsElement & elem) { return elem.isEmptyDatabase(); }) - != query_requires_access.end()); - - bool use_local_default_database = false; - const String & current_database = context.getCurrentDatabase(); - - if (need_replace_current_database) - { - Strings shard_default_databases; - for (const auto & shard : shards) - { - for (const auto & addr : shard) - { - if (!addr.default_database.empty()) - shard_default_databases.push_back(addr.default_database); - else - use_local_default_database = true; - } - } - std::sort(shard_default_databases.begin(), shard_default_databases.end()); - shard_default_databases.erase(std::unique(shard_default_databases.begin(), shard_default_databases.end()), shard_default_databases.end()); - assert(use_local_default_database || !shard_default_databases.empty()); - - if (use_local_default_database && !shard_default_databases.empty()) - throw Exception("Mixed local default DB and shard default DB in DDL query", ErrorCodes::NOT_IMPLEMENTED); - - if (use_local_default_database) - { - query_requires_access.replaceEmptyDatabase(current_database); - } - else - { - for (size_t i = 0; i != query_requires_access.size();) - { - auto & element = query_requires_access[i]; - if (element.isEmptyDatabase()) - { - query_requires_access.insert(query_requires_access.begin() + i + 1, shard_default_databases.size() - 1, element); - for (size_t j = 0; j != shard_default_databases.size(); ++j) - query_requires_access[i + j].replaceEmptyDatabase(shard_default_databases[j]); - i += shard_default_databases.size(); - } - else - ++i; - } - } - } - - AddDefaultDatabaseVisitor visitor(current_database, !use_local_default_database); - visitor.visitDDL(query_ptr); - - /// Check access rights, assume that all servers have the same users config - if (query_requires_grant_option) - context.getAccess()->checkGrantOption(query_requires_access); - else - context.checkAccess(query_requires_access); - - DDLLogEntry entry; - entry.hosts = std::move(hosts); - entry.query = queryToString(query_ptr); - entry.initiator = ddl_worker.getCommonHostID(); - String node_path = ddl_worker.enqueueQuery(entry); - - BlockIO io; - if (context.getSettingsRef().distributed_ddl_task_timeout == 0) - return io; - - auto stream = std::make_shared(node_path, entry, context); - io.in = std::move(stream); - return io; -} - -BlockIO executeDDLQueryOnCluster(const ASTPtr & query_ptr, const Context & context, const AccessRightsElements & query_requires_access, bool query_requires_grant_option) -{ - return executeDDLQueryOnCluster(query_ptr, context, AccessRightsElements{query_requires_access}, query_requires_grant_option); -} - -BlockIO executeDDLQueryOnCluster(const ASTPtr & query_ptr_, const Context & context) -{ - return executeDDLQueryOnCluster(query_ptr_, context, {}); -} - } diff --git a/src/Interpreters/DDLWorker.h b/src/Interpreters/DDLWorker.h index 39cdcab709e..caa2242caf8 100644 --- a/src/Interpreters/DDLWorker.h +++ b/src/Interpreters/DDLWorker.h @@ -1,11 +1,9 @@ #pragma once -#include -#include #include #include -#include -#include +#include +#include #include #include @@ -18,23 +16,22 @@ namespace zkutil class ZooKeeper; } +namespace Poco +{ + class Logger; + namespace Util { class AbstractConfiguration; } +} + namespace DB { class Context; class ASTAlterQuery; -class AccessRightsElements; struct DDLLogEntry; struct DDLTask; using DDLTaskPtr = std::unique_ptr; -/// Pushes 
distributed DDL query to the queue -BlockIO executeDDLQueryOnCluster(const ASTPtr & query_ptr, const Context & context); -BlockIO executeDDLQueryOnCluster(const ASTPtr & query_ptr, const Context & context, const AccessRightsElements & query_requires_access, bool query_requires_grant_option = false); -BlockIO executeDDLQueryOnCluster(const ASTPtr & query_ptr, const Context & context, AccessRightsElements && query_requires_access, bool query_requires_grant_option = false); - - class DDLWorker { public: @@ -137,9 +134,6 @@ private: size_t max_tasks_in_queue = 1000; ThreadGroupStatusPtr thread_group; - - friend class DDLQueryStatusInputStream; - friend struct DDLTask; }; diff --git a/src/Interpreters/InterpreterAlterQuery.cpp b/src/Interpreters/InterpreterAlterQuery.cpp index e229cb120e5..013e30a3ed5 100644 --- a/src/Interpreters/InterpreterAlterQuery.cpp +++ b/src/Interpreters/InterpreterAlterQuery.cpp @@ -1,5 +1,5 @@ #include -#include +#include #include #include #include diff --git a/src/Interpreters/InterpreterCreateQuery.cpp b/src/Interpreters/InterpreterCreateQuery.cpp index 0f7d441c0d6..04c5efce3e2 100644 --- a/src/Interpreters/InterpreterCreateQuery.cpp +++ b/src/Interpreters/InterpreterCreateQuery.cpp @@ -28,7 +28,8 @@ #include #include -#include +#include +#include #include #include #include diff --git a/src/Interpreters/InterpreterCreateQuotaQuery.cpp b/src/Interpreters/InterpreterCreateQuotaQuery.cpp index f45c2c9709d..ff30a2fff47 100644 --- a/src/Interpreters/InterpreterCreateQuotaQuery.cpp +++ b/src/Interpreters/InterpreterCreateQuotaQuery.cpp @@ -2,7 +2,7 @@ #include #include #include -#include +#include #include #include #include diff --git a/src/Interpreters/InterpreterCreateRoleQuery.cpp b/src/Interpreters/InterpreterCreateRoleQuery.cpp index 2fa04eebae1..72ad3234b95 100644 --- a/src/Interpreters/InterpreterCreateRoleQuery.cpp +++ b/src/Interpreters/InterpreterCreateRoleQuery.cpp @@ -1,7 +1,7 @@ #include #include #include -#include +#include #include #include diff --git a/src/Interpreters/InterpreterCreateRowPolicyQuery.cpp b/src/Interpreters/InterpreterCreateRowPolicyQuery.cpp index 9dacc9d1bf4..8f1c5b061e0 100644 --- a/src/Interpreters/InterpreterCreateRowPolicyQuery.cpp +++ b/src/Interpreters/InterpreterCreateRowPolicyQuery.cpp @@ -4,7 +4,7 @@ #include #include #include -#include +#include #include #include #include diff --git a/src/Interpreters/InterpreterCreateSettingsProfileQuery.cpp b/src/Interpreters/InterpreterCreateSettingsProfileQuery.cpp index 2d5f4d499b7..b65225db16c 100644 --- a/src/Interpreters/InterpreterCreateSettingsProfileQuery.cpp +++ b/src/Interpreters/InterpreterCreateSettingsProfileQuery.cpp @@ -2,7 +2,7 @@ #include #include #include -#include +#include #include #include #include diff --git a/src/Interpreters/InterpreterCreateUserQuery.cpp b/src/Interpreters/InterpreterCreateUserQuery.cpp index 111f698beb9..c9b087de5b4 100644 --- a/src/Interpreters/InterpreterCreateUserQuery.cpp +++ b/src/Interpreters/InterpreterCreateUserQuery.cpp @@ -1,7 +1,7 @@ #include #include #include -#include +#include #include #include #include diff --git a/src/Interpreters/InterpreterDropAccessEntityQuery.cpp b/src/Interpreters/InterpreterDropAccessEntityQuery.cpp index d79d239ee12..e86f8361100 100644 --- a/src/Interpreters/InterpreterDropAccessEntityQuery.cpp +++ b/src/Interpreters/InterpreterDropAccessEntityQuery.cpp @@ -2,7 +2,7 @@ #include #include #include -#include +#include #include #include #include diff --git a/src/Interpreters/InterpreterDropQuery.cpp 
b/src/Interpreters/InterpreterDropQuery.cpp index 48eb20485be..0f03525f237 100644 --- a/src/Interpreters/InterpreterDropQuery.cpp +++ b/src/Interpreters/InterpreterDropQuery.cpp @@ -2,7 +2,7 @@ #include #include -#include +#include #include #include #include diff --git a/src/Interpreters/InterpreterGrantQuery.cpp b/src/Interpreters/InterpreterGrantQuery.cpp index 6f45687a4e1..dafe4d2e18c 100644 --- a/src/Interpreters/InterpreterGrantQuery.cpp +++ b/src/Interpreters/InterpreterGrantQuery.cpp @@ -2,7 +2,7 @@ #include #include #include -#include +#include #include #include #include diff --git a/src/Interpreters/InterpreterKillQueryQuery.cpp b/src/Interpreters/InterpreterKillQueryQuery.cpp index 0f7da8f1f58..c50659c6c45 100644 --- a/src/Interpreters/InterpreterKillQueryQuery.cpp +++ b/src/Interpreters/InterpreterKillQueryQuery.cpp @@ -2,7 +2,7 @@ #include #include #include -#include +#include #include #include #include diff --git a/src/Interpreters/InterpreterOptimizeQuery.cpp b/src/Interpreters/InterpreterOptimizeQuery.cpp index 680dd9b803b..431d5074cde 100644 --- a/src/Interpreters/InterpreterOptimizeQuery.cpp +++ b/src/Interpreters/InterpreterOptimizeQuery.cpp @@ -1,7 +1,7 @@ #include #include #include -#include +#include #include #include #include diff --git a/src/Interpreters/InterpreterRenameQuery.cpp b/src/Interpreters/InterpreterRenameQuery.cpp index 65ed33bd9db..3a375e2ba60 100644 --- a/src/Interpreters/InterpreterRenameQuery.cpp +++ b/src/Interpreters/InterpreterRenameQuery.cpp @@ -3,7 +3,7 @@ #include #include #include -#include +#include #include #include #include diff --git a/src/Interpreters/InterpreterSystemQuery.cpp b/src/Interpreters/InterpreterSystemQuery.cpp index f0a8ce9064d..1b8c3ae79f2 100644 --- a/src/Interpreters/InterpreterSystemQuery.cpp +++ b/src/Interpreters/InterpreterSystemQuery.cpp @@ -14,7 +14,7 @@ #include #include #include -#include +#include #include #include #include diff --git a/src/Interpreters/executeDDLQueryOnCluster.cpp b/src/Interpreters/executeDDLQueryOnCluster.cpp new file mode 100644 index 00000000000..6da1704ce55 --- /dev/null +++ b/src/Interpreters/executeDDLQueryOnCluster.cpp @@ -0,0 +1,317 @@ +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + + +namespace DB +{ + +namespace ErrorCodes +{ + extern const int NOT_IMPLEMENTED; + extern const int TIMEOUT_EXCEEDED; + extern const int UNFINISHED; + extern const int QUERY_IS_PROHIBITED; +} + +static bool isSupportedAlterType(int type) +{ + static const std::unordered_set unsupported_alter_types{ + ASTAlterCommand::ATTACH_PARTITION, + ASTAlterCommand::REPLACE_PARTITION, + ASTAlterCommand::FETCH_PARTITION, + ASTAlterCommand::FREEZE_PARTITION, + ASTAlterCommand::FREEZE_ALL, + ASTAlterCommand::NO_TYPE, + }; + + return unsupported_alter_types.count(type) == 0; +} + + +BlockIO executeDDLQueryOnCluster(const ASTPtr & query_ptr_, const Context & context) +{ + return executeDDLQueryOnCluster(query_ptr_, context, {}); +} + +BlockIO executeDDLQueryOnCluster(const ASTPtr & query_ptr, const Context & context, const AccessRightsElements & query_requires_access, bool query_requires_grant_option) +{ + return executeDDLQueryOnCluster(query_ptr, context, AccessRightsElements{query_requires_access}, query_requires_grant_option); +} + +BlockIO executeDDLQueryOnCluster(const ASTPtr & query_ptr_, const Context & context, AccessRightsElements && query_requires_access, bool query_requires_grant_option) +{ + /// Remove FORMAT and INTO 
OUTFILE if exists + ASTPtr query_ptr = query_ptr_->clone(); + ASTQueryWithOutput::resetOutputASTIfExist(*query_ptr); + + // XXX: serious design flaw since `ASTQueryWithOnCluster` is not inherited from `IAST`! + auto * query = dynamic_cast(query_ptr.get()); + if (!query) + { + throw Exception("Distributed execution is not supported for such DDL queries", ErrorCodes::NOT_IMPLEMENTED); + } + + if (!context.getSettingsRef().allow_distributed_ddl) + throw Exception("Distributed DDL queries are prohibited for the user", ErrorCodes::QUERY_IS_PROHIBITED); + + if (const auto * query_alter = query_ptr->as()) + { + for (const auto & command : query_alter->command_list->commands) + { + if (!isSupportedAlterType(command->type)) + throw Exception("Unsupported type of ALTER query", ErrorCodes::NOT_IMPLEMENTED); + } + } + + query->cluster = context.getMacros()->expand(query->cluster); + ClusterPtr cluster = context.getCluster(query->cluster); + DDLWorker & ddl_worker = context.getDDLWorker(); + + /// Enumerate hosts which will be used to send query. + Cluster::AddressesWithFailover shards = cluster->getShardsAddresses(); + std::vector hosts; + for (const auto & shard : shards) + { + for (const auto & addr : shard) + hosts.emplace_back(addr); + } + + if (hosts.empty()) + throw Exception("No hosts defined to execute distributed DDL query", ErrorCodes::LOGICAL_ERROR); + + /// The current database in a distributed query need to be replaced with either + /// the local current database or a shard's default database. + bool need_replace_current_database + = (std::find_if( + query_requires_access.begin(), + query_requires_access.end(), + [](const AccessRightsElement & elem) { return elem.isEmptyDatabase(); }) + != query_requires_access.end()); + + bool use_local_default_database = false; + const String & current_database = context.getCurrentDatabase(); + + if (need_replace_current_database) + { + Strings shard_default_databases; + for (const auto & shard : shards) + { + for (const auto & addr : shard) + { + if (!addr.default_database.empty()) + shard_default_databases.push_back(addr.default_database); + else + use_local_default_database = true; + } + } + std::sort(shard_default_databases.begin(), shard_default_databases.end()); + shard_default_databases.erase(std::unique(shard_default_databases.begin(), shard_default_databases.end()), shard_default_databases.end()); + assert(use_local_default_database || !shard_default_databases.empty()); + + if (use_local_default_database && !shard_default_databases.empty()) + throw Exception("Mixed local default DB and shard default DB in DDL query", ErrorCodes::NOT_IMPLEMENTED); + + if (use_local_default_database) + { + query_requires_access.replaceEmptyDatabase(current_database); + } + else + { + for (size_t i = 0; i != query_requires_access.size();) + { + auto & element = query_requires_access[i]; + if (element.isEmptyDatabase()) + { + query_requires_access.insert(query_requires_access.begin() + i + 1, shard_default_databases.size() - 1, element); + for (size_t j = 0; j != shard_default_databases.size(); ++j) + query_requires_access[i + j].replaceEmptyDatabase(shard_default_databases[j]); + i += shard_default_databases.size(); + } + else + ++i; + } + } + } + + AddDefaultDatabaseVisitor visitor(current_database, !use_local_default_database); + visitor.visitDDL(query_ptr); + + /// Check access rights, assume that all servers have the same users config + if (query_requires_grant_option) + context.getAccess()->checkGrantOption(query_requires_access); + else + 
context.checkAccess(query_requires_access); + + DDLLogEntry entry; + entry.hosts = std::move(hosts); + entry.query = queryToString(query_ptr); + entry.initiator = ddl_worker.getCommonHostID(); + String node_path = ddl_worker.enqueueQuery(entry); + + BlockIO io; + if (context.getSettingsRef().distributed_ddl_task_timeout == 0) + return io; + + auto stream = std::make_shared(node_path, entry, context); + io.in = std::move(stream); + return io; +} + + +DDLQueryStatusInputStream::DDLQueryStatusInputStream(const String & zk_node_path, const DDLLogEntry & entry, const Context & context_) + : node_path(zk_node_path) + , context(context_) + , watch(CLOCK_MONOTONIC_COARSE) + , log(&Poco::Logger::get("DDLQueryStatusInputStream")) +{ + sample = Block{ + {std::make_shared(), "host"}, + {std::make_shared(), "port"}, + {std::make_shared(), "status"}, + {std::make_shared(), "error"}, + {std::make_shared(), "num_hosts_remaining"}, + {std::make_shared(), "num_hosts_active"}, + }; + + for (const HostID & host: entry.hosts) + waiting_hosts.emplace(host.toString()); + + addTotalRowsApprox(entry.hosts.size()); + + timeout_seconds = context.getSettingsRef().distributed_ddl_task_timeout; +} + +Block DDLQueryStatusInputStream::readImpl() +{ + Block res; + if (num_hosts_finished >= waiting_hosts.size()) + { + if (first_exception) + throw Exception(*first_exception); + + return res; + } + + auto zookeeper = context.getZooKeeper(); + size_t try_number = 0; + + while (res.rows() == 0) + { + if (isCancelled()) + { + if (first_exception) + throw Exception(*first_exception); + + return res; + } + + if (timeout_seconds >= 0 && watch.elapsedSeconds() > timeout_seconds) + { + size_t num_unfinished_hosts = waiting_hosts.size() - num_hosts_finished; + size_t num_active_hosts = current_active_hosts.size(); + + + throw Exception(ErrorCodes::TIMEOUT_EXCEEDED, + "Watching task {} is executing longer than distributed_ddl_task_timeout (={}) seconds. " + "There are {} unfinished hosts ({} of them are currently active), they are going to execute the query in background", + node_path, timeout_seconds, num_unfinished_hosts, num_active_hosts); + } + + if (num_hosts_finished != 0 || try_number != 0) + { + sleepForMilliseconds(std::min(1000, 50 * (try_number + 1))); + } + + /// TODO: add shared lock + if (!zookeeper->exists(node_path)) + { + throw Exception(ErrorCodes::UNFINISHED, + "Cannot provide query execution status. 
The query's node {} has been deleted by the cleaner since it was finished (or its lifetime is expired)", + node_path); + } + + Strings new_hosts = getNewAndUpdate(getChildrenAllowNoNode(zookeeper, node_path + "/finished")); + ++try_number; + if (new_hosts.empty()) + continue; + + current_active_hosts = getChildrenAllowNoNode(zookeeper, node_path + "/active"); + + MutableColumns columns = sample.cloneEmptyColumns(); + for (const String & host_id : new_hosts) + { + ExecutionStatus status(-1, "Cannot obtain error message"); + { + String status_data; + if (zookeeper->tryGet(node_path + "/finished/" + host_id, status_data)) + status.tryDeserializeText(status_data); + } + + auto [host, port] = Cluster::Address::fromString(host_id); + + if (status.code != 0 && first_exception == nullptr) + first_exception = std::make_unique(status.code, "There was an error on [{}:{}]: {}", host, port, status.message); + + ++num_hosts_finished; + + columns[0]->insert(host); + columns[1]->insert(port); + columns[2]->insert(status.code); + columns[3]->insert(status.message); + columns[4]->insert(waiting_hosts.size() - num_hosts_finished); + columns[5]->insert(current_active_hosts.size()); + } + res = sample.cloneWithColumns(std::move(columns)); + } + + return res; +} + +Strings DDLQueryStatusInputStream::getChildrenAllowNoNode(const std::shared_ptr & zookeeper, const String & node_path) +{ + Strings res; + Coordination::Error code = zookeeper->tryGetChildren(node_path, res); + if (code != Coordination::Error::ZOK && code != Coordination::Error::ZNONODE) + throw Coordination::Exception(code, node_path); + return res; +} + +Strings DDLQueryStatusInputStream::getNewAndUpdate(const Strings & current_list_of_finished_hosts) +{ + Strings diff; + for (const String & host : current_list_of_finished_hosts) + { + if (!waiting_hosts.count(host)) + { + if (!ignoring_hosts.count(host)) + { + ignoring_hosts.emplace(host); + LOG_INFO(log, "Unexpected host {} appeared in task {}", host, node_path); + } + continue; + } + + if (!finished_hosts.count(host)) + { + diff.emplace_back(host); + finished_hosts.emplace(host); + } + } + + return diff; +} + + +} diff --git a/src/Interpreters/executeDDLQueryOnCluster.h b/src/Interpreters/executeDDLQueryOnCluster.h new file mode 100644 index 00000000000..83880cc94c1 --- /dev/null +++ b/src/Interpreters/executeDDLQueryOnCluster.h @@ -0,0 +1,63 @@ +#pragma once +#include +#include + +namespace zkutil +{ + class ZooKeeper; +} + +namespace DB +{ + +class Context; +class AccessRightsElements; +struct DDLLogEntry; + + +/// Pushes distributed DDL query to the queue. +/// Returns DDLQueryStatusInputStream, which reads results of query execution on each host in the cluster. 
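/// A typical call site in an interpreter might look like the sketch below (not part of this
/// patch; `query` is assumed to be an AST that carries the ON CLUSTER clause, and
/// getRequiredAccess() a helper that lists the privileges the query needs):
///
///     if (!query.cluster.empty())
///         return executeDDLQueryOnCluster(query_ptr, context, getRequiredAccess());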
+BlockIO executeDDLQueryOnCluster(const ASTPtr & query_ptr, const Context & context); +BlockIO executeDDLQueryOnCluster(const ASTPtr & query_ptr, const Context & context, const AccessRightsElements & query_requires_access, bool query_requires_grant_option = false); +BlockIO executeDDLQueryOnCluster(const ASTPtr & query_ptr, const Context & context, AccessRightsElements && query_requires_access, bool query_requires_grant_option = false); + + +class DDLQueryStatusInputStream : public IBlockInputStream +{ +public: + DDLQueryStatusInputStream(const String & zk_node_path, const DDLLogEntry & entry, const Context & context_); + + String getName() const override { return "DDLQueryStatusInputStream"; } + + Block getHeader() const override { return sample; } + + Block getSampleBlock() const { return sample.cloneEmpty(); } + + Block readImpl() override; + +private: + + static Strings getChildrenAllowNoNode(const std::shared_ptr & zookeeper, const String & node_path); + + Strings getNewAndUpdate(const Strings & current_list_of_finished_hosts); + + String node_path; + const Context & context; + Stopwatch watch; + Poco::Logger * log; + + Block sample; + + NameSet waiting_hosts; /// hosts from task host list + NameSet finished_hosts; /// finished hosts from host list + NameSet ignoring_hosts; /// appeared hosts that are not in hosts list + Strings current_active_hosts; /// Hosts that were in active state at the last check + size_t num_hosts_finished = 0; + + /// Save the first detected error and throw it at the end of execution + std::unique_ptr first_exception; + + Int64 timeout_seconds = 120; +}; + +} diff --git a/src/Interpreters/ya.make b/src/Interpreters/ya.make index 4c0b64934c7..11a09c40d6a 100644 --- a/src/Interpreters/ya.make +++ b/src/Interpreters/ya.make @@ -45,11 +45,13 @@ SRCS( CrossToInnerJoinVisitor.cpp DatabaseAndTableWithAlias.cpp DatabaseCatalog.cpp + DDLTask.cpp DDLWorker.cpp DictionaryReader.cpp DNSCacheUpdater.cpp EmbeddedDictionaries.cpp evaluateConstantExpression.cpp + executeDDLQueryOnCluster.cpp executeQuery.cpp ExecuteScalarSubqueriesVisitor.cpp ExpressionActions.cpp From 2a6c0b91802de8279a0928e853a3840d94a1413a Mon Sep 17 00:00:00 2001 From: Alexander Tokmakov Date: Thu, 5 Nov 2020 12:52:23 +0300 Subject: [PATCH 047/381] try reuse DDLWorker in DatabaseReplicated --- src/Databases/DatabaseReplicated.cpp | 206 +++++++++++------- src/Databases/DatabaseReplicated.h | 16 +- src/Databases/IDatabase.h | 6 - src/Interpreters/DDLWorker.cpp | 36 ++- src/Interpreters/DDLWorker.h | 10 +- src/Interpreters/InterpreterAlterQuery.cpp | 8 +- src/Interpreters/InterpreterCreateQuery.cpp | 29 ++- src/Interpreters/InterpreterDropQuery.cpp | 16 +- src/Interpreters/InterpreterRenameQuery.cpp | 11 +- src/Interpreters/executeDDLQueryOnCluster.cpp | 18 +- src/Interpreters/executeDDLQueryOnCluster.h | 5 +- .../test_replicated_database/test.py | 12 +- 12 files changed, 224 insertions(+), 149 deletions(-) diff --git a/src/Databases/DatabaseReplicated.cpp b/src/Databases/DatabaseReplicated.cpp index 145b3abba00..1213b5bc075 100644 --- a/src/Databases/DatabaseReplicated.cpp +++ b/src/Databases/DatabaseReplicated.cpp @@ -13,7 +13,10 @@ #include #include #include - +#include +#include +#include +#include namespace DB { @@ -45,6 +48,7 @@ zkutil::ZooKeeperPtr DatabaseReplicated::getZooKeeper() const return res; } +DatabaseReplicated::~DatabaseReplicated() = default; DatabaseReplicated::DatabaseReplicated( const String & name_, @@ -125,12 +129,15 @@ DatabaseReplicated::DatabaseReplicated( feedback_timeout = 
context_.getConfigRef().getInt("database_replicated_feedback_timeout", 0); LOG_DEBUG(log, "Snapshot period is set to {} log entries per one snapshot", snapshot_period); - //TODO do we need separate pool? - background_log_executor = context_.getReplicatedSchedulePool().createTask( - database_name + "(DatabaseReplicated::background_executor)", [this] { runBackgroundLogExecutor(); } - ); + //FIXME use database UUID + ddl_worker = std::make_unique(1, zookeeper_path + "/log", context_, nullptr, String{}, true, database_name, replica_name, shard_name); - background_log_executor->scheduleAfter(500); + //TODO do we need separate pool? + //background_log_executor = context_.getReplicatedSchedulePool().createTask( + // database_name + "(DatabaseReplicated::background_executor)", [this] { runBackgroundLogExecutor(); } + //); + + //background_log_executor->scheduleAfter(500); } void DatabaseReplicated::createDatabaseZooKeeperNodes() @@ -226,7 +233,7 @@ void DatabaseReplicated::runBackgroundLogExecutor() } } - background_log_executor->scheduleAfter(500); + //background_log_executor->scheduleAfter(500); } void DatabaseReplicated::writeLastExecutedToDiskAndZK() @@ -244,95 +251,128 @@ void DatabaseReplicated::writeLastExecutedToDiskAndZK() out.close(); } -void DatabaseReplicated::executeLogName(const String & log_entry_name) +void DatabaseReplicated::executeLogName(const String & /*log_entry_name*/) { - String path = zookeeper_path + "/log/" + log_entry_name; - current_zookeeper = getZooKeeper(); - String query_to_execute = current_zookeeper->get(path, {}, nullptr); - - try - { - current_context = std::make_unique(global_context); - current_context->getClientInfo().query_kind = ClientInfo::QueryKind::REPLICATED_LOG_QUERY; - current_context->setCurrentDatabase(database_name); - current_context->setCurrentQueryId(""); // generate random query_id - executeQuery(query_to_execute, *current_context); - } - catch (const Exception & e) - { - tryLogCurrentException(log, "Query from zookeeper " + query_to_execute + " wasn't finished successfully"); - current_zookeeper->create( - zookeeper_path + "/replicas/" + replica_name + "/errors/" + log_entry_name, e.what(), zkutil::CreateMode::Persistent); - } - - LOG_DEBUG(log, "Executed query: {}", query_to_execute); +// String path = zookeeper_path + "/log/" + log_entry_name; +// current_zookeeper = getZooKeeper(); +// String query_to_execute = current_zookeeper->get(path, {}, nullptr); +// +// try +// { +// current_context = std::make_unique(global_context); +// current_context->getClientInfo().query_kind = ClientInfo::QueryKind::REPLICATED_LOG_QUERY; +// current_context->setCurrentDatabase(database_name); +// current_context->setCurrentQueryId(""); // generate random query_id +// executeQuery(query_to_execute, *current_context); +// } +// catch (const Exception & e) +// { +// tryLogCurrentException(log, "Query from zookeeper " + query_to_execute + " wasn't finished successfully"); +// current_zookeeper->create( +// zookeeper_path + "/replicas/" + replica_name + "/errors/" + log_entry_name, e.what(), zkutil::CreateMode::Persistent); +// } +// +// LOG_DEBUG(log, "Executed query: {}", query_to_execute); } -void DatabaseReplicated::propose(const ASTPtr & query) +BlockIO DatabaseReplicated::propose(const ASTPtr & query) { - current_zookeeper = getZooKeeper(); + //current_zookeeper = getZooKeeper(); - LOG_DEBUG(log, "Proposing query: {}", queryToString(query)); + if (const auto * query_alter = query->as()) { - std::lock_guard lock(log_name_mutex); - 
log_name_to_exec_with_result - = current_zookeeper->create(zookeeper_path + "/log/log-", queryToString(query), zkutil::CreateMode::PersistentSequential); - } - - background_log_executor->schedule(); -} - -BlockIO DatabaseReplicated::getFeedback() -{ - BlockIO res; - if (feedback_timeout == 0) - return res; - - Stopwatch watch; - - NamesAndTypes block_structure = - { - {"replica_name", std::make_shared()}, - {"execution_feedback", std::make_shared()}, - }; - auto replica_name_column = block_structure[0].type->createColumn(); - auto feedback_column = block_structure[1].type->createColumn(); - - current_zookeeper = getZooKeeper(); - Strings replica_states = current_zookeeper->getChildren(zookeeper_path + "/replicas"); - auto replica_iter = replica_states.begin(); - - while (!replica_states.empty() && watch.elapsedSeconds() < feedback_timeout) - { - String last_executed = current_zookeeper->get(zookeeper_path + "/replicas/" + *replica_iter); - if (last_executed > log_name_to_exec_with_result) + for (const auto & command : query_alter->command_list->commands) { - replica_name_column->insert(*replica_iter); - String err_path = zookeeper_path + "/replicas/" + *replica_iter + "/errors/" + log_name_to_exec_with_result; - if (!current_zookeeper->exists(err_path)) - { - feedback_column->insert("OK"); - } - else - { - String feedback = current_zookeeper->get(err_path, {}, nullptr); - feedback_column->insert(feedback); - } - replica_states.erase(replica_iter); - replica_iter = replica_states.begin(); + //FIXME allow all types of queries (maybe we should execute ATTACH an similar queries on leader) + if (!isSupportedAlterType(command->type)) + throw Exception("Unsupported type of ALTER query", ErrorCodes::NOT_IMPLEMENTED); } } - Block block = Block({ - {std::move(replica_name_column), block_structure[0].type, block_structure[0].name}, - {std::move(feedback_column), block_structure[1].type, block_structure[1].name} - }); + LOG_DEBUG(log, "Proposing query: {}", queryToString(query)); - res.in = std::make_shared(block); - return res; + DDLLogEntry entry; + entry.hosts = {}; + entry.query = queryToString(query); + entry.initiator = ddl_worker->getCommonHostID(); + String node_path = ddl_worker->enqueueQuery(entry); + + BlockIO io; + //FIXME use query context + if (global_context.getSettingsRef().distributed_ddl_task_timeout == 0) + return io; + + //FIXME need list of all replicas + Strings hosts_to_wait; + //TODO maybe it's better to use (shard_name + sep + replica_name) as host ID to allow use {replica} macro (may may have the same values across shards) + hosts_to_wait.emplace_back(replica_name); + auto stream = std::make_shared(node_path, entry, global_context); + io.in = std::move(stream); + return io; + + //executeDDLQueryOnCluster(query, global_context); + + + //{ + // std::lock_guard lock(log_name_mutex); + // log_name_to_exec_with_result + // = current_zookeeper->create(zookeeper_path + "/log/log-", queryToString(query), zkutil::CreateMode::PersistentSequential); + //} + + //background_log_executor->schedule(); } +//BlockIO DatabaseReplicated::getFeedback() +//{ +// BlockIO res; +// if (feedback_timeout == 0) +// return res; +// +// Stopwatch watch; +// +// NamesAndTypes block_structure = +// { +// {"replica_name", std::make_shared()}, +// {"execution_feedback", std::make_shared()}, +// }; +// auto replica_name_column = block_structure[0].type->createColumn(); +// auto feedback_column = block_structure[1].type->createColumn(); +// +// current_zookeeper = getZooKeeper(); +// Strings replica_states = 
current_zookeeper->getChildren(zookeeper_path + "/replicas"); +// auto replica_iter = replica_states.begin(); +// +// while (!replica_states.empty() && watch.elapsedSeconds() < feedback_timeout) +// { +// String last_executed = current_zookeeper->get(zookeeper_path + "/replicas/" + *replica_iter); +// if (last_executed > log_name_to_exec_with_result) +// { +// replica_name_column->insert(*replica_iter); +// String err_path = zookeeper_path + "/replicas/" + *replica_iter + "/errors/" + log_name_to_exec_with_result; +// if (!current_zookeeper->exists(err_path)) +// { +// feedback_column->insert("OK"); +// } +// else +// { +// String feedback = current_zookeeper->get(err_path, {}, nullptr); +// feedback_column->insert(feedback); +// } +// replica_states.erase(replica_iter); +// replica_iter = replica_states.begin(); +// } +// } +// +// Block block = Block({ +// {std::move(replica_name_column), block_structure[0].type, block_structure[0].name}, +// {std::move(feedback_column), block_structure[1].type, block_structure[1].name} +// }); +// +// res.in = std::make_shared(block); +// return res; +//} + void DatabaseReplicated::createSnapshot() { current_zookeeper = getZooKeeper(); @@ -389,7 +429,7 @@ void DatabaseReplicated::loadMetadataFromSnapshot() String query_to_execute = current_zookeeper->get(path, {}, nullptr); - current_context = std::make_unique(global_context); + auto current_context = std::make_unique(global_context); current_context->getClientInfo().query_kind = ClientInfo::QueryKind::REPLICATED_LOG_QUERY; current_context->setCurrentDatabase(database_name); current_context->setCurrentQueryId(""); // generate random query_id diff --git a/src/Databases/DatabaseReplicated.h b/src/Databases/DatabaseReplicated.h index 375118e7356..537eaad893f 100644 --- a/src/Databases/DatabaseReplicated.h +++ b/src/Databases/DatabaseReplicated.h @@ -6,10 +6,14 @@ #include #include #include +#include namespace DB { + +class DDLWorker; + /** DatabaseReplicated engine * supports replication of metadata * via DDL log being written to ZooKeeper @@ -39,13 +43,15 @@ public: const String & zookeeper_path_, const String & shard_name_, const String & replica_name_, Context & context); + ~DatabaseReplicated() override; + void drop(const Context & /*context*/) override; String getEngineName() const override { return "Replicated"; } - void propose(const ASTPtr & query) override; + BlockIO propose(const ASTPtr & query); - BlockIO getFeedback(); + //BlockIO getFeedback(); private: void createDatabaseZooKeeperNodes(); @@ -63,7 +69,7 @@ private: String shard_name; String replica_name; - std::unique_ptr current_context; // to run executeQuery + //std::unique_ptr current_context; // to run executeQuery std::mutex log_name_mutex; String log_name_to_exec_with_result; @@ -73,7 +79,7 @@ private: String last_executed_log_entry = ""; - BackgroundSchedulePool::TaskHolder background_log_executor; + //BackgroundSchedulePool::TaskHolder background_log_executor; zkutil::ZooKeeperPtr current_zookeeper; /// Use only the methods below. mutable std::mutex current_zookeeper_mutex; /// To recreate the session in the background thread. @@ -82,6 +88,8 @@ private: zkutil::ZooKeeperPtr getZooKeeper() const; void setZooKeeper(zkutil::ZooKeeperPtr zookeeper); + std::unique_ptr ddl_worker; + }; } diff --git a/src/Databases/IDatabase.h b/src/Databases/IDatabase.h index 393e8f2d10c..9b744259406 100644 --- a/src/Databases/IDatabase.h +++ b/src/Databases/IDatabase.h @@ -184,12 +184,6 @@ public: /// Is the database empty. 
virtual bool empty() const = 0; - /// Submit query to log. Currently used by DatabaseReplicated engine only. - virtual void propose(const ASTPtr & /*query*/) - { - throw Exception(getEngineName() + ": propose() is not supported", ErrorCodes::NOT_IMPLEMENTED); - } - /// Add the table to the database. Record its presence in the metadata. virtual void createTable( const Context & /*context*/, diff --git a/src/Interpreters/DDLWorker.cpp b/src/Interpreters/DDLWorker.cpp index 2c454db4787..b607bd084ea 100644 --- a/src/Interpreters/DDLWorker.cpp +++ b/src/Interpreters/DDLWorker.cpp @@ -142,12 +142,17 @@ std::unique_ptr createSimpleZooKeeperLock( } -DDLWorker::DDLWorker(int pool_size_, const std::string & zk_root_dir, Context & context_, const Poco::Util::AbstractConfiguration * config, const String & prefix) +DDLWorker::DDLWorker(int pool_size_, const std::string & zk_root_dir, Context & context_, const Poco::Util::AbstractConfiguration * config, const String & prefix, + bool is_replicated_db_, const std::optional & db_name_, const std::optional & db_replica_name_, const std::optional & db_shard_name_) : context(context_) , log(&Poco::Logger::get("DDLWorker")) , pool_size(pool_size_) , worker_pool(pool_size_) { + is_replicated_db = is_replicated_db_; + db_name = db_name_; + db_replica_name = db_replica_name_; + db_shard_name = db_shard_name_; last_tasks.reserve(pool_size); queue_dir = zk_root_dir; @@ -267,6 +272,15 @@ DDLTaskPtr DDLWorker::initAndCheckTask(const String & entry_name, String & out_r return {}; } + if (is_replicated_db) + { + // + task->host_id.host_name = host_fqdn; + task->host_id.port = context.getTCPPort(); + task->host_id_str = *db_replica_name; + return task; + } + bool host_in_hostlist = false; for (const HostID & host : task->entry.hosts) { @@ -390,6 +404,9 @@ void DDLWorker::parseQueryAndResolveHost(DDLTask & task) if (!task.query || !(task.query_on_cluster = dynamic_cast(task.query.get()))) throw Exception("Received unknown DDL query", ErrorCodes::UNKNOWN_TYPE_OF_QUERY); + if (is_replicated_db) + return; + task.cluster_name = task.query_on_cluster->cluster; task.cluster = context.tryGetCluster(task.cluster_name); if (!task.cluster) @@ -507,7 +524,14 @@ bool DDLWorker::tryExecuteQuery(const String & query, const DDLTask & task, Exec try { auto current_context = std::make_unique(context); - current_context->getClientInfo().query_kind = ClientInfo::QueryKind::SECONDARY_QUERY; + if (is_replicated_db) + { + current_context->getClientInfo().query_kind + = ClientInfo::QueryKind::REPLICATED_LOG_QUERY; //FIXME why do we need separate query kind? 
+ current_context->setCurrentDatabase(*db_name); + } + else + current_context->getClientInfo().query_kind = ClientInfo::QueryKind::SECONDARY_QUERY; current_context->setCurrentQueryId(""); // generate random query_id executeQuery(istr, ostr, false, *current_context, {}); } @@ -696,7 +720,11 @@ bool DDLWorker::tryExecuteQueryOnLeaderReplica( return res; }; - String shard_node_name = get_shard_name(task.cluster->getShardsAddresses().at(task.host_shard_num)); + String shard_node_name; + if (is_replicated_db) + shard_node_name = *db_shard_name; + else + shard_node_name = get_shard_name(task.cluster->getShardsAddresses().at(task.host_shard_num)); String shard_path = node_path + "/shards/" + shard_node_name; String is_executed_path = shard_path + "/executed"; String tries_to_execute_path = shard_path + "/tries_to_execute"; @@ -892,7 +920,7 @@ void DDLWorker::createStatusDirs(const std::string & node_path, const ZooKeeperP String DDLWorker::enqueueQuery(DDLLogEntry & entry) { - if (entry.hosts.empty()) + if (entry.hosts.empty() && !is_replicated_db) throw Exception("Empty host list in a distributed DDL task", ErrorCodes::LOGICAL_ERROR); auto zookeeper = getAndSetZooKeeper(); diff --git a/src/Interpreters/DDLWorker.h b/src/Interpreters/DDLWorker.h index caa2242caf8..1c28100f933 100644 --- a/src/Interpreters/DDLWorker.h +++ b/src/Interpreters/DDLWorker.h @@ -4,6 +4,7 @@ #include #include #include +#include #include #include @@ -35,7 +36,8 @@ using DDLTaskPtr = std::unique_ptr; class DDLWorker { public: - DDLWorker(int pool_size_, const std::string & zk_root_dir, Context & context_, const Poco::Util::AbstractConfiguration * config, const String & prefix); + DDLWorker(int pool_size_, const std::string & zk_root_dir, Context & context_, const Poco::Util::AbstractConfiguration * config, const String & prefix, + bool is_replicated_db_ = false, const std::optional & db_name_ = std::nullopt, const std::optional & db_replica_name_ = std::nullopt, const std::optional & db_shard_name_ = std::nullopt); ~DDLWorker(); /// Pushes query into DDL queue, returns path to created node @@ -101,8 +103,12 @@ private: void attachToThreadGroup(); private: + bool is_replicated_db; + std::optional db_name; + std::optional db_replica_name; + std::optional db_shard_name; std::atomic is_circular_replicated = false; - Context & context; + Context context; Poco::Logger * log; std::string host_fqdn; /// current host domain name diff --git a/src/Interpreters/InterpreterAlterQuery.cpp b/src/Interpreters/InterpreterAlterQuery.cpp index 013e30a3ed5..38d00c089ab 100644 --- a/src/Interpreters/InterpreterAlterQuery.cpp +++ b/src/Interpreters/InterpreterAlterQuery.cpp @@ -51,12 +51,8 @@ BlockIO InterpreterAlterQuery::execute() auto metadata_snapshot = table->getInMemoryMetadataPtr(); DatabasePtr database = DatabaseCatalog::instance().getDatabase(table_id.database_name); - if (database->getEngineName() == "Replicated" && context.getClientInfo().query_kind != ClientInfo::QueryKind::REPLICATED_LOG_QUERY && !table->supportsReplication()) - { - database->propose(query_ptr); - auto * database_replicated = typeid_cast(database.get()); - return database_replicated->getFeedback(); - } + if (typeid_cast(database.get()) && context.getClientInfo().query_kind != ClientInfo::QueryKind::REPLICATED_LOG_QUERY && !table->supportsReplication()) + return typeid_cast(database.get())->propose(query_ptr); /// Add default database to table identifiers that we can encounter in e.g. default expressions, /// mutation expression, etc. 
diff --git a/src/Interpreters/InterpreterCreateQuery.cpp b/src/Interpreters/InterpreterCreateQuery.cpp index 04c5efce3e2..b36fe32b26d 100644 --- a/src/Interpreters/InterpreterCreateQuery.cpp +++ b/src/Interpreters/InterpreterCreateQuery.cpp @@ -77,6 +77,7 @@ namespace ErrorCodes extern const int ILLEGAL_SYNTAX_FOR_DATA_TYPE; extern const int ILLEGAL_COLUMN; extern const int LOGICAL_ERROR; + extern const int UNKNOWN_DATABASE; } namespace fs = std::filesystem; @@ -720,15 +721,22 @@ BlockIO InterpreterCreateQuery::createTable(ASTCreateQuery & create) create.database = current_database; } + //TODO make code better if possible + bool need_add_to_database = !create.temporary; + if(need_add_to_database && database->getEngineName() == "Replicated") + { + auto guard = DatabaseCatalog::instance().getDDLGuard(create.database, create.table); + database = DatabaseCatalog::instance().getDatabase(create.database); + if (typeid_cast(database.get()) && context.getClientInfo().query_kind != ClientInfo::QueryKind::REPLICATED_LOG_QUERY) + { + assertOrSetUUID(create, database); + return typeid_cast(database.get())->propose(query_ptr); + } + } + /// Actually creates table bool created = doCreateTable(create, properties); - if (database->getEngineName() == "Replicated" && context.getClientInfo().query_kind != ClientInfo::QueryKind::REPLICATED_LOG_QUERY) - { - auto * database_replicated = typeid_cast(database.get()); - return database_replicated->getFeedback(); - } - if (!created) /// Table already exists return {}; @@ -753,6 +761,9 @@ bool InterpreterCreateQuery::doCreateTable(ASTCreateQuery & create, guard = DatabaseCatalog::instance().getDDLGuard(create.database, table_name); database = DatabaseCatalog::instance().getDatabase(create.database); + //TODO do we need it? + if (database->getEngineName() == "Replicated" && context.getClientInfo().query_kind != ClientInfo::QueryKind::REPLICATED_LOG_QUERY) + throw Exception(ErrorCodes::UNKNOWN_DATABASE, "Database was renamed"); assertOrSetUUID(create, database); /// Table can be created before or it can be created concurrently in another thread, while we were waiting in DDLGuard. 
@@ -790,12 +801,6 @@ bool InterpreterCreateQuery::doCreateTable(ASTCreateQuery & create, return true; } - if (database->getEngineName() == "Replicated" && context.getClientInfo().query_kind != ClientInfo::QueryKind::REPLICATED_LOG_QUERY) - { - database->propose(query_ptr); - return true; - } - StoragePtr res; /// NOTE: CREATE query may be rewritten by Storage creator or table function if (create.as_table_function) diff --git a/src/Interpreters/InterpreterDropQuery.cpp b/src/Interpreters/InterpreterDropQuery.cpp index 0f03525f237..c93f8098713 100644 --- a/src/Interpreters/InterpreterDropQuery.cpp +++ b/src/Interpreters/InterpreterDropQuery.cpp @@ -101,8 +101,8 @@ BlockIO InterpreterDropQuery::executeToTable(const ASTDropQuery & query) if (database->getEngineName() != "Atomic" && database->getEngineName() != "Replicated") table_lock = table->lockExclusively(context.getCurrentQueryId(), context.getSettingsRef().lock_acquire_timeout); /// Drop table from memory, don't touch data and metadata - if (database->getEngineName() == "Replicated" && context.getClientInfo().query_kind != ClientInfo::QueryKind::REPLICATED_LOG_QUERY) - database->propose(query_ptr); + if (typeid_cast(database.get()) && context.getClientInfo().query_kind != ClientInfo::QueryKind::REPLICATED_LOG_QUERY) + return typeid_cast(database.get())->propose(query_ptr); else database->detachTable(table_id.table_name); } @@ -115,7 +115,7 @@ BlockIO InterpreterDropQuery::executeToTable(const ASTDropQuery & query) auto metadata_snapshot = table->getInMemoryMetadataPtr(); /// Drop table data, don't touch metadata if (database->getEngineName() == "Replicated" && context.getClientInfo().query_kind != ClientInfo::QueryKind::REPLICATED_LOG_QUERY) - database->propose(query_ptr); + return typeid_cast(database.get())->propose(query_ptr); else table->truncate(query_ptr, metadata_snapshot, context, table_lock); } @@ -131,8 +131,8 @@ BlockIO InterpreterDropQuery::executeToTable(const ASTDropQuery & query) table_lock = table->lockExclusively(context.getCurrentQueryId(), context.getSettingsRef().lock_acquire_timeout); /// Prevents recursive drop from drop database query. The original query must specify a table. 
- if (!query_ptr->as().table.empty() && database->getEngineName() == "Replicated" && context.getClientInfo().query_kind != ClientInfo::QueryKind::REPLICATED_LOG_QUERY) - database->propose(query_ptr); + if (typeid_cast(database.get()) && !query_ptr->as().table.empty() && context.getClientInfo().query_kind != ClientInfo::QueryKind::REPLICATED_LOG_QUERY) + return typeid_cast(database.get())->propose(query_ptr); else database->dropTable(context, table_id.table_name, query.no_delay); } @@ -151,12 +151,6 @@ BlockIO InterpreterDropQuery::executeToTable(const ASTDropQuery & query) } } - if (database && database->getEngineName() == "Replicated" && context.getClientInfo().query_kind != ClientInfo::QueryKind::REPLICATED_LOG_QUERY) - { - auto * database_replicated = typeid_cast(database.get()); - return database_replicated->getFeedback(); - } - return {}; } diff --git a/src/Interpreters/InterpreterRenameQuery.cpp b/src/Interpreters/InterpreterRenameQuery.cpp index 3a375e2ba60..4eee34a683e 100644 --- a/src/Interpreters/InterpreterRenameQuery.cpp +++ b/src/Interpreters/InterpreterRenameQuery.cpp @@ -75,9 +75,9 @@ BlockIO InterpreterRenameQuery::executeToTables(const ASTRenameQuery & rename, c database_catalog.assertTableDoesntExist(StorageID(elem.to_database_name, elem.to_table_name), context); DatabasePtr database = database_catalog.getDatabase(elem.from_database_name); - if (database->getEngineName() == "Replicated" && context.getClientInfo().query_kind != ClientInfo::QueryKind::REPLICATED_LOG_QUERY) + if (typeid_cast(database.get()) && context.getClientInfo().query_kind != ClientInfo::QueryKind::REPLICATED_LOG_QUERY) { - database->propose(query_ptr); + return typeid_cast(database.get())->propose(query_ptr); } else { @@ -89,13 +89,6 @@ BlockIO InterpreterRenameQuery::executeToTables(const ASTRenameQuery & rename, c rename.exchange, rename.dictionary); } - - // TODO it can't work - if (database->getEngineName() == "Replicated" && context.getClientInfo().query_kind != ClientInfo::QueryKind::REPLICATED_LOG_QUERY) - { - auto * database_replicated = typeid_cast(database.get()); - return database_replicated->getFeedback(); - } } return {}; diff --git a/src/Interpreters/executeDDLQueryOnCluster.cpp b/src/Interpreters/executeDDLQueryOnCluster.cpp index 6da1704ce55..03065245766 100644 --- a/src/Interpreters/executeDDLQueryOnCluster.cpp +++ b/src/Interpreters/executeDDLQueryOnCluster.cpp @@ -25,7 +25,7 @@ namespace ErrorCodes extern const int QUERY_IS_PROHIBITED; } -static bool isSupportedAlterType(int type) +bool isSupportedAlterType(int type) { static const std::unordered_set unsupported_alter_types{ ASTAlterCommand::ATTACH_PARTITION, @@ -170,7 +170,8 @@ BlockIO executeDDLQueryOnCluster(const ASTPtr & query_ptr_, const Context & cont } -DDLQueryStatusInputStream::DDLQueryStatusInputStream(const String & zk_node_path, const DDLLogEntry & entry, const Context & context_) +DDLQueryStatusInputStream::DDLQueryStatusInputStream(const String & zk_node_path, const DDLLogEntry & entry, const Context & context_, + const std::optional & hosts_to_wait) : node_path(zk_node_path) , context(context_) , watch(CLOCK_MONOTONIC_COARSE) @@ -185,10 +186,17 @@ DDLQueryStatusInputStream::DDLQueryStatusInputStream(const String & zk_node_path {std::make_shared(), "num_hosts_active"}, }; - for (const HostID & host: entry.hosts) - waiting_hosts.emplace(host.toString()); + if (hosts_to_wait) + { + waiting_hosts = NameSet(hosts_to_wait->begin(), hosts_to_wait->end()); + } + else + { + for (const HostID & host : entry.hosts) + 
waiting_hosts.emplace(host.toString()); + } - addTotalRowsApprox(entry.hosts.size()); + addTotalRowsApprox(waiting_hosts.size()); timeout_seconds = context.getSettingsRef().distributed_ddl_task_timeout; } diff --git a/src/Interpreters/executeDDLQueryOnCluster.h b/src/Interpreters/executeDDLQueryOnCluster.h index 83880cc94c1..0f7a411ed92 100644 --- a/src/Interpreters/executeDDLQueryOnCluster.h +++ b/src/Interpreters/executeDDLQueryOnCluster.h @@ -15,6 +15,9 @@ class AccessRightsElements; struct DDLLogEntry; +/// Returns true if provided ALTER type can be executed ON CLUSTER +bool isSupportedAlterType(int type); + /// Pushes distributed DDL query to the queue. /// Returns DDLQueryStatusInputStream, which reads results of query execution on each host in the cluster. BlockIO executeDDLQueryOnCluster(const ASTPtr & query_ptr, const Context & context); @@ -25,7 +28,7 @@ BlockIO executeDDLQueryOnCluster(const ASTPtr & query_ptr, const Context & conte class DDLQueryStatusInputStream : public IBlockInputStream { public: - DDLQueryStatusInputStream(const String & zk_node_path, const DDLLogEntry & entry, const Context & context_); + DDLQueryStatusInputStream(const String & zk_node_path, const DDLLogEntry & entry, const Context & context_, const std::optional & hosts_to_wait = {}); String getName() const override { return "DDLQueryStatusInputStream"; } diff --git a/tests/integration/test_replicated_database/test.py b/tests/integration/test_replicated_database/test.py index 372ac7a7c3e..06d8aa9467a 100644 --- a/tests/integration/test_replicated_database/test.py +++ b/tests/integration/test_replicated_database/test.py @@ -24,8 +24,8 @@ def assert_create_query(nodes, table_name, expected): def started_cluster(): try: cluster.start() - main_node.query("CREATE DATABASE testdb ENGINE = Replicated('/clickhouse/databases/test1', 'replica1');") - dummy_node.query("CREATE DATABASE testdb ENGINE = Replicated('/clickhouse/databases/test1', 'replica2');") + main_node.query("CREATE DATABASE testdb ENGINE = Replicated('/clickhouse/databases/test1', 'shard1', 'replica1');") + dummy_node.query("CREATE DATABASE testdb ENGINE = Replicated('/clickhouse/databases/test1', 'shard1', 'replica2');") yield cluster finally: @@ -67,7 +67,7 @@ def test_simple_alter_table(started_cluster): assert_create_query([main_node, dummy_node], "alter_test", expected) def test_create_replica_after_delay(started_cluster): - competing_node.query("CREATE DATABASE testdb ENGINE = Replicated('/clickhouse/databases/test1', 'replica3');") + competing_node.query("CREATE DATABASE testdb ENGINE = Replicated('/clickhouse/databases/test1', 'shard1', 'replica3');") main_node.query("ALTER TABLE testdb.alter_test ADD COLUMN Added3 UInt32;") main_node.query("ALTER TABLE testdb.alter_test DROP COLUMN AddedNested1;") @@ -128,15 +128,15 @@ def test_replica_restart(started_cluster): def test_snapshot_and_snapshot_recover(started_cluster): #FIXME bad test - snapshotting_node.query("CREATE DATABASE testdb ENGINE = Replicated('/clickhouse/databases/test1', 'replica4');") + snapshotting_node.query("CREATE DATABASE testdb ENGINE = Replicated('/clickhouse/databases/test1', 'shard1', 'replica4');") time.sleep(5) - snapshot_recovering_node.query("CREATE DATABASE testdb ENGINE = Replicated('/clickhouse/databases/test1', 'replica5');") + snapshot_recovering_node.query("CREATE DATABASE testdb ENGINE = Replicated('/clickhouse/databases/test1', 'shard1', 'replica5');") time.sleep(5) assert snapshotting_node.query("desc table testdb.alter_test") == 
snapshot_recovering_node.query("desc table testdb.alter_test") def test_drop_and_create_replica(started_cluster): main_node.query("DROP DATABASE testdb") - main_node.query("CREATE DATABASE testdb ENGINE = Replicated('/clickhouse/databases/test1', 'replica1');") + main_node.query("CREATE DATABASE testdb ENGINE = Replicated('/clickhouse/databases/test1', 'shard1', 'replica1');") expected = "CREATE TABLE testdb.concurrent_test\\n(\\n `CounterID` UInt32,\\n `StartDate` Date,\\n `UserID` UInt32,\\n" \ " `VisitID` UInt32,\\n `NestedColumn.A` Array(UInt8),\\n `NestedColumn.S` Array(String),\\n `ToDrop` UInt32\\n)\\n" \ From b0262b3d06130854ae96a10b1d2854ad9c7b92bb Mon Sep 17 00:00:00 2001 From: Alexander Tokmakov Date: Fri, 13 Nov 2020 21:35:45 +0300 Subject: [PATCH 048/381] better replica creation --- src/Databases/DatabaseReplicated.cpp | 280 +++++++++++---------------- src/Databases/DatabaseReplicated.h | 20 +- src/Interpreters/DDLWorker.cpp | 41 ++-- src/Interpreters/DDLWorker.h | 29 ++- 4 files changed, 159 insertions(+), 211 deletions(-) diff --git a/src/Databases/DatabaseReplicated.cpp b/src/Databases/DatabaseReplicated.cpp index 1213b5bc075..c4bffd8fd5d 100644 --- a/src/Databases/DatabaseReplicated.cpp +++ b/src/Databases/DatabaseReplicated.cpp @@ -16,6 +16,8 @@ #include #include #include +#include +#include #include namespace DB @@ -25,29 +27,22 @@ namespace ErrorCodes extern const int NO_ZOOKEEPER; extern const int LOGICAL_ERROR; extern const int BAD_ARGUMENTS; + extern const int REPLICA_IS_ALREADY_EXIST; } -//FIXME never used -void DatabaseReplicated::setZooKeeper(zkutil::ZooKeeperPtr zookeeper) -{ - std::lock_guard lock(current_zookeeper_mutex); - current_zookeeper = zookeeper; -} - -zkutil::ZooKeeperPtr DatabaseReplicated::tryGetZooKeeper() const -{ - std::lock_guard lock(current_zookeeper_mutex); - return current_zookeeper; -} +constexpr const char * first_entry_name = "query-0000000000"; zkutil::ZooKeeperPtr DatabaseReplicated::getZooKeeper() const { - auto res = tryGetZooKeeper(); - if (!res) - throw Exception("Cannot get ZooKeeper", ErrorCodes::NO_ZOOKEEPER); - return res; + return global_context.getZooKeeper(); } +static inline String getHostID(const Context & global_context) +{ + return Cluster::Address::toString(getFQDNOrHostName(), global_context.getTCPPort()); +} + + DatabaseReplicated::~DatabaseReplicated() = default; DatabaseReplicated::DatabaseReplicated( @@ -64,99 +59,119 @@ DatabaseReplicated::DatabaseReplicated( , replica_name(replica_name_) { if (zookeeper_path.empty() || shard_name.empty() || replica_name.empty()) - throw Exception("ZooKeeper path and shard and replica names must be non-empty", ErrorCodes::BAD_ARGUMENTS); + throw Exception("ZooKeeper path, shard and replica names must be non-empty", ErrorCodes::BAD_ARGUMENTS); + if (shard_name.find('/') != std::string::npos || replica_name.find('/') != std::string::npos) + throw Exception("Shard and replica names should not contain '/'", ErrorCodes::BAD_ARGUMENTS); if (zookeeper_path.back() == '/') zookeeper_path.resize(zookeeper_path.size() - 1); + /// If zookeeper chroot prefix is used, path should start with '/', because chroot concatenates without it. 
if (zookeeper_path.front() != '/') zookeeper_path = "/" + zookeeper_path; - if (context_.hasZooKeeper()) - { - current_zookeeper = context_.getZooKeeper(); - } - if (!current_zookeeper) + if (!context_.hasZooKeeper()) { throw Exception("Can't create replicated database without ZooKeeper", ErrorCodes::NO_ZOOKEEPER); } + //FIXME it will fail on startup if zk is not available + + auto current_zookeeper = global_context.getZooKeeper(); - /// New database if (!current_zookeeper->exists(zookeeper_path)) { - createDatabaseZooKeeperNodes(); + /// Create new database, multiple nodes can execute it concurrently + createDatabaseNodesInZooKeeper(current_zookeeper); } - /// Attach existing replica - //TODO better protection from wrong replica names - if (current_zookeeper->exists(zookeeper_path + "/replicas/" + replica_name)) + replica_path = zookeeper_path + "/replicas/" + shard_name + "|" + replica_name; + + String replica_host_id; + if (current_zookeeper->tryGet(replica_path, replica_host_id)) { - String remote_last_entry = current_zookeeper->get(zookeeper_path + "/replicas/" + replica_name, {}, nullptr); + String host_id = getHostID(global_context); + if (replica_host_id != host_id) + throw Exception(ErrorCodes::REPLICA_IS_ALREADY_EXIST, + "Replica {} of shard {} of replicated database at {} already exists. Replica host ID: '{}', current host ID: '{}'", + replica_name, shard_name, zookeeper_path, replica_host_id, host_id); - String local_last_entry; - try - { - ReadBufferFromFile in(getMetadataPath() + ".last_entry", 16); - readStringUntilEOF(local_last_entry, in); - } - catch (const Exception &) - { - /// Metadata is corrupted. - /// Replica erases the previous zk last executed log entry - /// and behaves like a new clean replica. - writeLastExecutedToDiskAndZK(); - } - - if (!local_last_entry.empty() && local_last_entry == remote_last_entry) - { - last_executed_log_entry = local_last_entry; - } - else - { - //FIXME - throw Exception( - "Replica name might be in use by a different node. Please check replica_name parameter. Remove .last_entry file from " - "metadata to create a new replica.", - ErrorCodes::LOGICAL_ERROR); - } + log_entry_to_execute = current_zookeeper->get(replica_path + "/log_ptr"); } else { - createReplicaZooKeeperNodes(); + /// Throws if replica with the same name was created concurrently + createReplicaNodesInZooKeeper(current_zookeeper); } + assert(log_entry_to_execute.starts_with("query-")); + + snapshot_period = context_.getConfigRef().getInt("database_replicated_snapshot_period", 10); - feedback_timeout = context_.getConfigRef().getInt("database_replicated_feedback_timeout", 0); LOG_DEBUG(log, "Snapshot period is set to {} log entries per one snapshot", snapshot_period); - - //FIXME use database UUID - ddl_worker = std::make_unique(1, zookeeper_path + "/log", context_, nullptr, String{}, true, database_name, replica_name, shard_name); - - //TODO do we need separate pool? 
- //background_log_executor = context_.getReplicatedSchedulePool().createTask( - // database_name + "(DatabaseReplicated::background_executor)", [this] { runBackgroundLogExecutor(); } - //); - - //background_log_executor->scheduleAfter(500); } -void DatabaseReplicated::createDatabaseZooKeeperNodes() +bool DatabaseReplicated::createDatabaseNodesInZooKeeper(const zkutil::ZooKeeperPtr & current_zookeeper) { - current_zookeeper = getZooKeeper(); - current_zookeeper->createAncestors(zookeeper_path); - current_zookeeper->createIfNotExists(zookeeper_path, String()); - current_zookeeper->createIfNotExists(zookeeper_path + "/log", String()); - current_zookeeper->createIfNotExists(zookeeper_path + "/snapshots", String()); - current_zookeeper->createIfNotExists(zookeeper_path + "/replicas", String()); + Coordination::Requests ops; + ops.emplace_back(zkutil::makeCreateRequest(zookeeper_path, "", zkutil::CreateMode::Persistent)); + ops.emplace_back(zkutil::makeCreateRequest(zookeeper_path + "/log", "", zkutil::CreateMode::Persistent)); + ops.emplace_back(zkutil::makeCreateRequest(zookeeper_path + "/snapshots", "", zkutil::CreateMode::Persistent)); + /// Create empty snapshot (with no tables) + ops.emplace_back(zkutil::makeCreateRequest(zookeeper_path + "/snapshots/" + first_entry_name, "", zkutil::CreateMode::Persistent)); + ops.emplace_back(zkutil::makeCreateRequest(zookeeper_path + "/replicas", "", zkutil::CreateMode::Persistent)); + + Coordination::Responses responses; + auto res = current_zookeeper->tryMulti(ops, responses); + if (res == Coordination::Error::ZOK) + return true; + if (res == Coordination::Error::ZNODEEXISTS) + return false; + + zkutil::KeeperMultiException::check(res, ops, responses); + assert(false); } -void DatabaseReplicated::createReplicaZooKeeperNodes() +void DatabaseReplicated::createReplicaNodesInZooKeeper(const zkutil::ZooKeeperPtr & current_zookeeper) { - current_zookeeper->create(zookeeper_path + "/replicas/" + replica_name, "", zkutil::CreateMode::Persistent); + current_zookeeper->createAncestors(replica_path); + + Strings snapshots = current_zookeeper->getChildren(zookeeper_path + "/snapshots"); + std::sort(snapshots.begin(), snapshots.end()); + if (snapshots.empty()) + throw Exception(ErrorCodes::LOGICAL_ERROR, "No snapshots found"); + + /// When creating new replica, use latest snapshot version as initial value of log_pointer + log_entry_to_execute = snapshots.back(); + + /// Write host name to replica_path, it will protect from multiple replicas with the same name + auto host_id = getHostID(global_context); + + Coordination::Requests ops; + ops.emplace_back(zkutil::makeCreateRequest(replica_path, host_id, zkutil::CreateMode::Persistent)); + ops.emplace_back(zkutil::makeCreateRequest(replica_path + "/log_ptr", log_entry_to_execute , zkutil::CreateMode::Persistent)); + current_zookeeper->multi(ops); } +void DatabaseReplicated::loadStoredObjects(Context & context, bool has_force_restore_data_flag, bool force_attach) +{ + DatabaseAtomic::loadStoredObjects(context, has_force_restore_data_flag, force_attach); + + DatabaseReplicatedExtensions ext; + ext.database_uuid = getUUID(); + ext.database_name = getDatabaseName(); + ext.shard_name = shard_name; + ext.replica_name = replica_name; + ext.first_not_executed = log_entry_to_execute; + + /// Pool size must be 1 (to avoid reordering of log entries) + constexpr size_t pool_size = 1; + ddl_worker = std::make_unique(pool_size, zookeeper_path + "/log", global_context, nullptr, "", + std::make_optional(std::move(ext))); +} + + void 
DatabaseReplicated::removeOutdatedSnapshotsAndLog() { /// This method removes all snapshots and logged queries @@ -170,7 +185,7 @@ void DatabaseReplicated::removeOutdatedSnapshotsAndLog() /// because the replica will use the latest snapshot available /// and this snapshot will set the last executed log query /// to a greater one than the least advanced current replica. - current_zookeeper = getZooKeeper(); + auto current_zookeeper = getZooKeeper(); Strings replica_states = current_zookeeper->getChildren(zookeeper_path + "/replicas"); //TODO do not use log pointers to determine which entries to remove if there are staled pointers. // We can just remove all entries older than previous snapshot version. @@ -209,7 +224,7 @@ void DatabaseReplicated::runBackgroundLogExecutor() loadMetadataFromSnapshot(); } - current_zookeeper = getZooKeeper(); + auto current_zookeeper = getZooKeeper(); Strings log_entry_names = current_zookeeper->getChildren(zookeeper_path + "/log"); std::sort(log_entry_names.begin(), log_entry_names.end()); @@ -219,7 +234,7 @@ void DatabaseReplicated::runBackgroundLogExecutor() for (const String & log_entry_name : log_entry_names) { - executeLogName(log_entry_name); + //executeLogName(log_entry_name); last_executed_log_entry = log_entry_name; writeLastExecutedToDiskAndZK(); @@ -238,7 +253,7 @@ void DatabaseReplicated::runBackgroundLogExecutor() void DatabaseReplicated::writeLastExecutedToDiskAndZK() { - current_zookeeper = getZooKeeper(); + auto current_zookeeper = getZooKeeper(); current_zookeeper->createOrUpdate( zookeeper_path + "/replicas/" + replica_name, last_executed_log_entry, zkutil::CreateMode::Persistent); @@ -251,35 +266,9 @@ void DatabaseReplicated::writeLastExecutedToDiskAndZK() out.close(); } -void DatabaseReplicated::executeLogName(const String & /*log_entry_name*/) -{ -// String path = zookeeper_path + "/log/" + log_entry_name; -// current_zookeeper = getZooKeeper(); -// String query_to_execute = current_zookeeper->get(path, {}, nullptr); -// -// try -// { -// current_context = std::make_unique(global_context); -// current_context->getClientInfo().query_kind = ClientInfo::QueryKind::REPLICATED_LOG_QUERY; -// current_context->setCurrentDatabase(database_name); -// current_context->setCurrentQueryId(""); // generate random query_id -// executeQuery(query_to_execute, *current_context); -// } -// catch (const Exception & e) -// { -// tryLogCurrentException(log, "Query from zookeeper " + query_to_execute + " wasn't finished successfully"); -// current_zookeeper->create( -// zookeeper_path + "/replicas/" + replica_name + "/errors/" + log_entry_name, e.what(), zkutil::CreateMode::Persistent); -// } -// -// LOG_DEBUG(log, "Executed query: {}", query_to_execute); -} BlockIO DatabaseReplicated::propose(const ASTPtr & query) { - //current_zookeeper = getZooKeeper(); - - if (const auto * query_alter = query->as()) { for (const auto & command : query_alter->command_list->commands) @@ -303,79 +292,18 @@ BlockIO DatabaseReplicated::propose(const ASTPtr & query) if (global_context.getSettingsRef().distributed_ddl_task_timeout == 0) return io; - //FIXME need list of all replicas + //FIXME need list of all replicas, we can obtain it from zk Strings hosts_to_wait; - //TODO maybe it's better to use (shard_name + sep + replica_name) as host ID to allow use {replica} macro (may may have the same values across shards) - hosts_to_wait.emplace_back(replica_name); + hosts_to_wait.emplace_back(shard_name + '/' +replica_name); auto stream = std::make_shared(node_path, entry, 
global_context); io.in = std::move(stream); return io; - - //executeDDLQueryOnCluster(query, global_context); - - - //{ - // std::lock_guard lock(log_name_mutex); - // log_name_to_exec_with_result - // = current_zookeeper->create(zookeeper_path + "/log/log-", queryToString(query), zkutil::CreateMode::PersistentSequential); - //} - - //background_log_executor->schedule(); } -//BlockIO DatabaseReplicated::getFeedback() -//{ -// BlockIO res; -// if (feedback_timeout == 0) -// return res; -// -// Stopwatch watch; -// -// NamesAndTypes block_structure = -// { -// {"replica_name", std::make_shared()}, -// {"execution_feedback", std::make_shared()}, -// }; -// auto replica_name_column = block_structure[0].type->createColumn(); -// auto feedback_column = block_structure[1].type->createColumn(); -// -// current_zookeeper = getZooKeeper(); -// Strings replica_states = current_zookeeper->getChildren(zookeeper_path + "/replicas"); -// auto replica_iter = replica_states.begin(); -// -// while (!replica_states.empty() && watch.elapsedSeconds() < feedback_timeout) -// { -// String last_executed = current_zookeeper->get(zookeeper_path + "/replicas/" + *replica_iter); -// if (last_executed > log_name_to_exec_with_result) -// { -// replica_name_column->insert(*replica_iter); -// String err_path = zookeeper_path + "/replicas/" + *replica_iter + "/errors/" + log_name_to_exec_with_result; -// if (!current_zookeeper->exists(err_path)) -// { -// feedback_column->insert("OK"); -// } -// else -// { -// String feedback = current_zookeeper->get(err_path, {}, nullptr); -// feedback_column->insert(feedback); -// } -// replica_states.erase(replica_iter); -// replica_iter = replica_states.begin(); -// } -// } -// -// Block block = Block({ -// {std::move(replica_name_column), block_structure[0].type, block_structure[0].name}, -// {std::move(feedback_column), block_structure[1].type, block_structure[1].name} -// }); -// -// res.in = std::make_shared(block); -// return res; -//} void DatabaseReplicated::createSnapshot() { - current_zookeeper = getZooKeeper(); + auto current_zookeeper = getZooKeeper(); String snapshot_path = zookeeper_path + "/snapshots/" + last_executed_log_entry; if (Coordination::Error::ZNODEEXISTS == current_zookeeper->tryCreate(snapshot_path, String(), zkutil::CreateMode::Persistent)) @@ -399,7 +327,7 @@ void DatabaseReplicated::loadMetadataFromSnapshot() { /// Executes the latest snapshot. /// Used by new replicas only. 
- current_zookeeper = getZooKeeper(); + auto current_zookeeper = getZooKeeper(); Strings snapshots; if (current_zookeeper->tryGetChildren(zookeeper_path + "/snapshots", snapshots) != Coordination::Error::ZOK) @@ -443,9 +371,19 @@ void DatabaseReplicated::loadMetadataFromSnapshot() void DatabaseReplicated::drop(const Context & context_) { - current_zookeeper = getZooKeeper(); + auto current_zookeeper = getZooKeeper(); current_zookeeper->tryRemove(zookeeper_path + "/replicas/" + replica_name); DatabaseAtomic::drop(context_); } +void DatabaseReplicated::shutdown() +{ + if (ddl_worker) + { + ddl_worker->shutdown(); + ddl_worker = nullptr; + } + DatabaseAtomic::shutdown(); +} + } diff --git a/src/Databases/DatabaseReplicated.h b/src/Databases/DatabaseReplicated.h index 537eaad893f..219779d602d 100644 --- a/src/Databases/DatabaseReplicated.h +++ b/src/Databases/DatabaseReplicated.h @@ -51,14 +51,15 @@ public: BlockIO propose(const ASTPtr & query); - //BlockIO getFeedback(); + void shutdown() override; + + void loadStoredObjects(Context & context, bool has_force_restore_data_flag, bool force_attach = false) override; private: - void createDatabaseZooKeeperNodes(); - void createReplicaZooKeeperNodes(); + bool createDatabaseNodesInZooKeeper(const zkutil::ZooKeeperPtr & current_zookeeper); + void createReplicaNodesInZooKeeper(const zkutil::ZooKeeperPtr & current_zookeeper); void runBackgroundLogExecutor(); - void executeLogName(const String &); void writeLastExecutedToDiskAndZK(); void loadMetadataFromSnapshot(); @@ -68,25 +69,18 @@ private: String zookeeper_path; String shard_name; String replica_name; + String replica_path; - //std::unique_ptr current_context; // to run executeQuery + String log_entry_to_execute; std::mutex log_name_mutex; String log_name_to_exec_with_result; int snapshot_period; - int feedback_timeout; String last_executed_log_entry = ""; - //BackgroundSchedulePool::TaskHolder background_log_executor; - - zkutil::ZooKeeperPtr current_zookeeper; /// Use only the methods below. - mutable std::mutex current_zookeeper_mutex; /// To recreate the session in the background thread. - - zkutil::ZooKeeperPtr tryGetZooKeeper() const; zkutil::ZooKeeperPtr getZooKeeper() const; - void setZooKeeper(zkutil::ZooKeeperPtr zookeeper); std::unique_ptr ddl_worker; diff --git a/src/Interpreters/DDLWorker.cpp b/src/Interpreters/DDLWorker.cpp index 83e7029ec31..7d947a264a6 100644 --- a/src/Interpreters/DDLWorker.cpp +++ b/src/Interpreters/DDLWorker.cpp @@ -142,17 +142,15 @@ std::unique_ptr createSimpleZooKeeperLock( } -DDLWorker::DDLWorker(int pool_size_, const std::string & zk_root_dir, Context & context_, const Poco::Util::AbstractConfiguration * config, const String & prefix, - bool is_replicated_db_, const std::optional & db_name_, const std::optional & db_replica_name_, const std::optional & db_shard_name_) +DDLWorker::DDLWorker(int pool_size_, const std::string & zk_root_dir, const Context & context_, const Poco::Util::AbstractConfiguration * config, const String & prefix, + std::optional database_replicated_ext_) : context(context_) - , log(&Poco::Logger::get("DDLWorker")) + , log(&Poco::Logger::get(database_replicated_ext_ ? 
fmt::format("DDLWorker ({})", database_replicated_ext_->database_name) : "DDLWorker")) + , database_replicated_ext(std::move(database_replicated_ext_)) , pool_size(pool_size_) , worker_pool(pool_size_) { - is_replicated_db = is_replicated_db_; - db_name = db_name_; - db_replica_name = db_replica_name_; - db_shard_name = db_shard_name_; + assert(!database_replicated_ext || pool_size == 1); last_tasks.reserve(pool_size); queue_dir = zk_root_dir; @@ -181,25 +179,29 @@ DDLWorker::DDLWorker(int pool_size_, const std::string & zk_root_dir, Context & cleanup_thread = ThreadFromGlobalPool(&DDLWorker::runCleanupThread, this); } - -DDLWorker::~DDLWorker() +void DDLWorker::shutdown() { stop_flag = true; queue_updated_event->set(); cleanup_event->set(); +} + +DDLWorker::~DDLWorker() +{ + shutdown(); worker_pool.wait(); main_thread.join(); cleanup_thread.join(); } -DDLWorker::ZooKeeperPtr DDLWorker::tryGetZooKeeper() const +ZooKeeperPtr DDLWorker::tryGetZooKeeper() const { std::lock_guard lock(zookeeper_mutex); return current_zookeeper; } -DDLWorker::ZooKeeperPtr DDLWorker::getAndSetZooKeeper() +ZooKeeperPtr DDLWorker::getAndSetZooKeeper() { std::lock_guard lock(zookeeper_mutex); @@ -272,12 +274,11 @@ DDLTaskPtr DDLWorker::initAndCheckTask(const String & entry_name, String & out_r return {}; } - if (is_replicated_db) + if (database_replicated_ext) { - // task->host_id.host_name = host_fqdn; task->host_id.port = context.getTCPPort(); - task->host_id_str = *db_replica_name; + task->host_id_str = database_replicated_ext->shard_name + '|' + database_replicated_ext->replica_name; return task; } @@ -404,7 +405,7 @@ void DDLWorker::parseQueryAndResolveHost(DDLTask & task) if (!task.query || !(task.query_on_cluster = dynamic_cast(task.query.get()))) throw Exception("Received unknown DDL query", ErrorCodes::UNKNOWN_TYPE_OF_QUERY); - if (is_replicated_db) + if (database_replicated_ext) return; task.cluster_name = task.query_on_cluster->cluster; @@ -524,11 +525,11 @@ bool DDLWorker::tryExecuteQuery(const String & query, const DDLTask & task, Exec try { auto current_context = std::make_unique(context); - if (is_replicated_db) + if (database_replicated_ext) { current_context->getClientInfo().query_kind = ClientInfo::QueryKind::REPLICATED_LOG_QUERY; //FIXME why do we need separate query kind? 
- current_context->setCurrentDatabase(*db_name); + current_context->setCurrentDatabase(database_replicated_ext->database_name); } else current_context->getClientInfo().query_kind = ClientInfo::QueryKind::SECONDARY_QUERY; @@ -721,8 +722,8 @@ bool DDLWorker::tryExecuteQueryOnLeaderReplica( }; String shard_node_name; - if (is_replicated_db) - shard_node_name = *db_shard_name; + if (database_replicated_ext) + shard_node_name = database_replicated_ext->shard_name; else shard_node_name = get_shard_name(task.cluster->getShardsAddresses().at(task.host_shard_num)); String shard_path = node_path + "/shards/" + shard_node_name; @@ -920,7 +921,7 @@ void DDLWorker::createStatusDirs(const std::string & node_path, const ZooKeeperP String DDLWorker::enqueueQuery(DDLLogEntry & entry) { - if (entry.hosts.empty() && !is_replicated_db) + if (entry.hosts.empty() && !database_replicated_ext) throw Exception("Empty host list in a distributed DDL task", ErrorCodes::LOGICAL_ERROR); auto zookeeper = getAndSetZooKeeper(); diff --git a/src/Interpreters/DDLWorker.h b/src/Interpreters/DDLWorker.h index 1c28100f933..f38d41df503 100644 --- a/src/Interpreters/DDLWorker.h +++ b/src/Interpreters/DDLWorker.h @@ -31,13 +31,30 @@ class ASTAlterQuery; struct DDLLogEntry; struct DDLTask; using DDLTaskPtr = std::unique_ptr; +using ZooKeeperPtr = std::shared_ptr; + + +struct DatabaseReplicatedExtensions +{ + UUID database_uuid; + String database_name; + String shard_name; + String replica_name; + String first_not_executed; + using NewEntryCallback = std::function; + using EntryExecutedCallback = std::function; + using EntryErrorCallback = std::function; + NewEntryCallback before_execution_callback; + EntryExecutedCallback executed_callback; + EntryErrorCallback error_callback; +}; class DDLWorker { public: - DDLWorker(int pool_size_, const std::string & zk_root_dir, Context & context_, const Poco::Util::AbstractConfiguration * config, const String & prefix, - bool is_replicated_db_ = false, const std::optional & db_name_ = std::nullopt, const std::optional & db_replica_name_ = std::nullopt, const std::optional & db_shard_name_ = std::nullopt); + DDLWorker(int pool_size_, const std::string & zk_root_dir, const Context & context_, const Poco::Util::AbstractConfiguration * config, const String & prefix, + std::optional database_replicated_ext_ = std::nullopt); ~DDLWorker(); /// Pushes query into DDL queue, returns path to created node @@ -50,8 +67,9 @@ public: return host_fqdn_id; } + void shutdown(); + private: - using ZooKeeperPtr = std::shared_ptr; /// Returns cached ZooKeeper session (possibly expired). 
ZooKeeperPtr tryGetZooKeeper() const; @@ -103,13 +121,10 @@ private: void attachToThreadGroup(); private: - bool is_replicated_db; - std::optional db_name; - std::optional db_replica_name; - std::optional db_shard_name; std::atomic is_circular_replicated = false; Context context; Poco::Logger * log; + std::optional database_replicated_ext; std::string host_fqdn; /// current host domain name std::string host_fqdn_id; /// host_name:port From 2283906a1118d0836fc6cb813557e8a3d8f21383 Mon Sep 17 00:00:00 2001 From: Alexander Tokmakov Date: Thu, 19 Nov 2020 13:34:45 +0300 Subject: [PATCH 049/381] try support replica recovery --- src/Common/ErrorCodes.cpp | 1 + src/Databases/DatabaseReplicated.cpp | 259 +++++++++++++++++---------- src/Databases/DatabaseReplicated.h | 22 ++- src/Interpreters/DDLWorker.cpp | 65 ++++++- src/Interpreters/DDLWorker.h | 18 +- 5 files changed, 253 insertions(+), 112 deletions(-) diff --git a/src/Common/ErrorCodes.cpp b/src/Common/ErrorCodes.cpp index 405b8c60af8..1981dea5cb9 100644 --- a/src/Common/ErrorCodes.cpp +++ b/src/Common/ErrorCodes.cpp @@ -522,6 +522,7 @@ M(553, ROCKSDB_ERROR) \ M(553, LZMA_STREAM_ENCODER_FAILED) \ M(554, LZMA_STREAM_DECODER_FAILED) \ + M(554, DATABASE_REPLICATION_FAILED) \ M(999, KEEPER_EXCEPTION) \ M(1000, POCO_EXCEPTION) \ M(1001, STD_EXCEPTION) \ diff --git a/src/Databases/DatabaseReplicated.cpp b/src/Databases/DatabaseReplicated.cpp index c4bffd8fd5d..7b6d98f992a 100644 --- a/src/Databases/DatabaseReplicated.cpp +++ b/src/Databases/DatabaseReplicated.cpp @@ -28,9 +28,10 @@ namespace ErrorCodes extern const int LOGICAL_ERROR; extern const int BAD_ARGUMENTS; extern const int REPLICA_IS_ALREADY_EXIST; + extern const int DATABASE_REPLICATION_FAILED; } -constexpr const char * first_entry_name = "query-0000000000"; +static constexpr size_t METADATA_FILE_BUFFER_SIZE = 32768; zkutil::ZooKeeperPtr DatabaseReplicated::getZooKeeper() const { @@ -42,6 +43,15 @@ static inline String getHostID(const Context & global_context) return Cluster::Address::toString(getFQDNOrHostName(), global_context.getTCPPort()); } +Strings DatabaseReplicated::getSnapshots(const ZooKeeperPtr & zookeeper) const +{ + Strings snapshots = zookeeper->getChildren(zookeeper_path + "/snapshots"); + std::sort(snapshots.begin(), snapshots.end()); + if (snapshots.empty()) + throw Exception(ErrorCodes::LOGICAL_ERROR, "No snapshots found"); + return snapshots; +} + DatabaseReplicated::~DatabaseReplicated() = default; @@ -84,7 +94,7 @@ DatabaseReplicated::DatabaseReplicated( createDatabaseNodesInZooKeeper(current_zookeeper); } - replica_path = zookeeper_path + "/replicas/" + shard_name + "|" + replica_name; + replica_path = zookeeper_path + "/replicas/" + shard_name + "/" + replica_name; String replica_host_id; if (current_zookeeper->tryGet(replica_path, replica_host_id)) @@ -95,7 +105,7 @@ DatabaseReplicated::DatabaseReplicated( "Replica {} of shard {} of replicated database at {} already exists. 
Replica host ID: '{}', current host ID: '{}'", replica_name, shard_name, zookeeper_path, replica_host_id, host_id); - log_entry_to_execute = current_zookeeper->get(replica_path + "/log_ptr"); + log_entry_to_execute = parse(current_zookeeper->get(replica_path + "/log_ptr")); } else { @@ -103,10 +113,7 @@ DatabaseReplicated::DatabaseReplicated( createReplicaNodesInZooKeeper(current_zookeeper); } - assert(log_entry_to_execute.starts_with("query-")); - - - snapshot_period = context_.getConfigRef().getInt("database_replicated_snapshot_period", 10); + snapshot_period = 1; //context_.getConfigRef().getInt("database_replicated_snapshot_period", 10); LOG_DEBUG(log, "Snapshot period is set to {} log entries per one snapshot", snapshot_period); } @@ -117,10 +124,12 @@ bool DatabaseReplicated::createDatabaseNodesInZooKeeper(const zkutil::ZooKeeperP Coordination::Requests ops; ops.emplace_back(zkutil::makeCreateRequest(zookeeper_path, "", zkutil::CreateMode::Persistent)); ops.emplace_back(zkutil::makeCreateRequest(zookeeper_path + "/log", "", zkutil::CreateMode::Persistent)); + ops.emplace_back(zkutil::makeCreateRequest(zookeeper_path + "/replicas", "", zkutil::CreateMode::Persistent)); ops.emplace_back(zkutil::makeCreateRequest(zookeeper_path + "/snapshots", "", zkutil::CreateMode::Persistent)); /// Create empty snapshot (with no tables) - ops.emplace_back(zkutil::makeCreateRequest(zookeeper_path + "/snapshots/" + first_entry_name, "", zkutil::CreateMode::Persistent)); - ops.emplace_back(zkutil::makeCreateRequest(zookeeper_path + "/replicas", "", zkutil::CreateMode::Persistent)); + ops.emplace_back(zkutil::makeCreateRequest(zookeeper_path + "/snapshots/0", "", zkutil::CreateMode::Persistent)); + ops.emplace_back(zkutil::makeCreateRequest(zookeeper_path + "/metadata", "", zkutil::CreateMode::Persistent)); + ops.emplace_back(zkutil::makeCreateRequest(zookeeper_path + "/metadata/0", "", zkutil::CreateMode::Persistent)); Coordination::Responses responses; auto res = current_zookeeper->tryMulti(ops, responses); @@ -137,20 +146,24 @@ void DatabaseReplicated::createReplicaNodesInZooKeeper(const zkutil::ZooKeeperPt { current_zookeeper->createAncestors(replica_path); - Strings snapshots = current_zookeeper->getChildren(zookeeper_path + "/snapshots"); - std::sort(snapshots.begin(), snapshots.end()); - if (snapshots.empty()) - throw Exception(ErrorCodes::LOGICAL_ERROR, "No snapshots found"); - /// When creating new replica, use latest snapshot version as initial value of log_pointer - log_entry_to_execute = snapshots.back(); + log_entry_to_execute = parse(getSnapshots(current_zookeeper).back()); /// Write host name to replica_path, it will protect from multiple replicas with the same name auto host_id = getHostID(global_context); + /// On replica creation add empty entry to log. Can be used to trigger some actions on other replicas (e.g. update cluster info). 
+ DDLLogEntry entry; + entry.hosts = {}; + entry.query = {}; + entry.initiator = {}; + + recoverLostReplica(current_zookeeper, log_entry_to_execute, true); + Coordination::Requests ops; ops.emplace_back(zkutil::makeCreateRequest(replica_path, host_id, zkutil::CreateMode::Persistent)); - ops.emplace_back(zkutil::makeCreateRequest(replica_path + "/log_ptr", log_entry_to_execute , zkutil::CreateMode::Persistent)); + ops.emplace_back(zkutil::makeCreateRequest(replica_path + "/log_ptr", toString(log_entry_to_execute), zkutil::CreateMode::Persistent)); + ops.emplace_back(zkutil::makeCreateRequest(zookeeper_path + "/log/query-", entry.toString(), zkutil::CreateMode::PersistentSequential)); current_zookeeper->multi(ops); } @@ -160,10 +173,13 @@ void DatabaseReplicated::loadStoredObjects(Context & context, bool has_force_res DatabaseReplicatedExtensions ext; ext.database_uuid = getUUID(); + ext.zookeeper_path = zookeeper_path; ext.database_name = getDatabaseName(); ext.shard_name = shard_name; ext.replica_name = replica_name; ext.first_not_executed = log_entry_to_execute; + ext.lost_callback = [this] (const String & entry_name, const ZooKeeperPtr & zookeeper) { onUnexpectedLogEntry(entry_name, zookeeper); }; + ext.executed_callback = [this] (const String & entry_name, const ZooKeeperPtr & zookeeper) { onExecutedLogEntry(entry_name, zookeeper); }; /// Pool size must be 1 (to avoid reordering of log entries) constexpr size_t pool_size = 1; @@ -171,6 +187,41 @@ void DatabaseReplicated::loadStoredObjects(Context & context, bool has_force_res std::make_optional(std::move(ext))); } +void DatabaseReplicated::onUnexpectedLogEntry(const String & entry_name, const ZooKeeperPtr & zookeeper) +{ + /// We cannot execute next entry of replication log. Possible reasons: + /// 1. Replica is staled, some entries were removed by log cleanup process. + /// In this case we should recover replica from the last snapshot. + /// 2. Replication log is broken due to manual operations with ZooKeeper or logical error. + /// In this case we just stop replication without any attempts to recover it automatically, + /// because such attempts may lead to unexpected data removal. + + constexpr const char * name = "query-"; + if (!startsWith(entry_name, name)) + throw Exception(ErrorCodes::DATABASE_REPLICATION_FAILED, "Unexpected entry in replication log: {}", entry_name); + + UInt32 entry_number; + if (!tryParse(entry_number, entry_name.substr(strlen(name)))) + throw Exception(ErrorCodes::DATABASE_REPLICATION_FAILED, "Cannot parse number of replication log entry {}", entry_name); + + if (entry_number < log_entry_to_execute) + throw Exception(ErrorCodes::LOGICAL_ERROR, "Entry {} already executed, current pointer is {}", entry_number, log_entry_to_execute); + + /// Entry name is valid. Let's get min snapshot version to check if replica is staled. + Strings snapshots = getSnapshots(zookeeper); + UInt32 min_snapshot = parse(snapshots.front()); + + if (log_entry_to_execute < min_snapshot) + { + recoverLostReplica(zookeeper, parse(snapshots.back())); + return; + } + + throw Exception(ErrorCodes::LOGICAL_ERROR, "Cannot recover replica, probably it's a bug. 
" + "Got log entry '{}' when expected entry number {}, " + "available snapshots: ", + entry_name, log_entry_to_execute, boost::algorithm::join(snapshots, ", ")); +} void DatabaseReplicated::removeOutdatedSnapshotsAndLog() { @@ -217,40 +268,51 @@ void DatabaseReplicated::removeOutdatedSnapshotsAndLog() } } -void DatabaseReplicated::runBackgroundLogExecutor() +void DatabaseReplicated::onExecutedLogEntry(const String & entry_name, const ZooKeeperPtr & zookeeper) { - if (last_executed_log_entry.empty()) + assert(entry_name == DatabaseReplicatedExtensions::getLogEntryName(log_entry_to_execute)); + ++log_entry_to_execute; + + if (snapshot_period > 0 && log_entry_to_execute % snapshot_period == 0) { - loadMetadataFromSnapshot(); + createSnapshot(zookeeper); } - - auto current_zookeeper = getZooKeeper(); - Strings log_entry_names = current_zookeeper->getChildren(zookeeper_path + "/log"); - - std::sort(log_entry_names.begin(), log_entry_names.end()); - auto newest_entry_it = std::upper_bound(log_entry_names.begin(), log_entry_names.end(), last_executed_log_entry); - - log_entry_names.erase(log_entry_names.begin(), newest_entry_it); - - for (const String & log_entry_name : log_entry_names) - { - //executeLogName(log_entry_name); - last_executed_log_entry = log_entry_name; - writeLastExecutedToDiskAndZK(); - - int log_n = parse(log_entry_name.substr(4)); - int last_log_n = parse(log_entry_names.back().substr(4)); - - /// The third condition gurantees at most one snapshot creation per batch - if (log_n > 0 && snapshot_period > 0 && (last_log_n - log_n) / snapshot_period == 0 && log_n % snapshot_period == 0) - { - createSnapshot(); - } - } - - //background_log_executor->scheduleAfter(500); } +//void DatabaseReplicated::runBackgroundLogExecutor() +//{ +// if (last_executed_log_entry.empty()) +// { +// loadMetadataFromSnapshot(); +// } +// +// auto current_zookeeper = getZooKeeper(); +// Strings log_entry_names = current_zookeeper->getChildren(zookeeper_path + "/log"); +// +// std::sort(log_entry_names.begin(), log_entry_names.end()); +// auto newest_entry_it = std::upper_bound(log_entry_names.begin(), log_entry_names.end(), last_executed_log_entry); +// +// log_entry_names.erase(log_entry_names.begin(), newest_entry_it); +// +// for (const String & log_entry_name : log_entry_names) +// { +// //executeLogName(log_entry_name); +// last_executed_log_entry = log_entry_name; +// writeLastExecutedToDiskAndZK(); +// +// int log_n = parse(log_entry_name.substr(4)); +// int last_log_n = parse(log_entry_names.back().substr(4)); +// +// /// The third condition gurantees at most one snapshot creation per batch +// if (log_n > 0 && snapshot_period > 0 && (last_log_n - log_n) / snapshot_period == 0 && log_n % snapshot_period == 0) +// { +// createSnapshot(); +// } +// } +// +// //background_log_executor->scheduleAfter(500); +//} + void DatabaseReplicated::writeLastExecutedToDiskAndZK() { auto current_zookeeper = getZooKeeper(); @@ -294,79 +356,88 @@ BlockIO DatabaseReplicated::propose(const ASTPtr & query) //FIXME need list of all replicas, we can obtain it from zk Strings hosts_to_wait; - hosts_to_wait.emplace_back(shard_name + '/' +replica_name); + hosts_to_wait.emplace_back(shard_name + '|' +replica_name); auto stream = std::make_shared(node_path, entry, global_context); io.in = std::move(stream); return io; } -void DatabaseReplicated::createSnapshot() +void DatabaseReplicated::createSnapshot(const ZooKeeperPtr & zookeeper) { - auto current_zookeeper = getZooKeeper(); - String snapshot_path = zookeeper_path + 
"/snapshots/" + last_executed_log_entry; + String snapshot_path = zookeeper_path + "/snapshot/" + toString(log_entry_to_execute); - if (Coordination::Error::ZNODEEXISTS == current_zookeeper->tryCreate(snapshot_path, String(), zkutil::CreateMode::Persistent)) - { + if (zookeeper->exists(snapshot_path)) return; - } - for (auto iterator = getTablesIterator(global_context, {}); iterator->isValid(); iterator->next()) + std::vector> create_queries; { - String table_name = iterator->name(); - auto query = getCreateQueryFromMetadata(getObjectMetadataPath(table_name), true); - String statement = queryToString(query); - current_zookeeper->create(snapshot_path + "/" + table_name, statement, zkutil::CreateMode::Persistent); + std::lock_guard lock{mutex}; + create_queries.reserve(tables.size()); + for (const auto & table : tables) + { + const String & name = table.first; + ReadBufferFromFile in(getObjectMetadataPath(name), METADATA_FILE_BUFFER_SIZE); + String attach_query; + readStringUntilEOF(attach_query, in); + create_queries.emplace_back(escapeForFileName(name), std::move(attach_query)); + } } - current_zookeeper->create(snapshot_path + "/.completed", String(), zkutil::CreateMode::Persistent); - removeOutdatedSnapshotsAndLog(); + if (zookeeper->exists(snapshot_path)) + return; + + String queries_path = zookeeper_path + "/metadata/" + toString(log_entry_to_execute); + zookeeper->tryCreate(queries_path, "", zkutil::CreateMode::Persistent); + queries_path += '/'; + + //FIXME use tryMulti with MULTI_BATCH_SIZE + + for (const auto & table : create_queries) + zookeeper->tryCreate(queries_path + table.first, table.second, zkutil::CreateMode::Persistent); + + if (create_queries.size() != zookeeper->getChildren(zookeeper_path + "/metadata/" + toString(log_entry_to_execute)).size()) + throw Exception(ErrorCodes::LOGICAL_ERROR, "Created invalid snapshot"); + + zookeeper->tryCreate(snapshot_path, String(), zkutil::CreateMode::Persistent); } -void DatabaseReplicated::loadMetadataFromSnapshot() +void DatabaseReplicated::recoverLostReplica(const ZooKeeperPtr & current_zookeeper, UInt32 from_snapshot, bool create) { - /// Executes the latest snapshot. - /// Used by new replicas only. 
- auto current_zookeeper = getZooKeeper(); + LOG_WARNING(log, "Will recover replica from snapshot", from_snapshot); - Strings snapshots; - if (current_zookeeper->tryGetChildren(zookeeper_path + "/snapshots", snapshots) != Coordination::Error::ZOK) - return; + //FIXME drop old tables - auto latest_snapshot = std::max_element(snapshots.begin(), snapshots.end()); - while (snapshots.size() > 0 && !current_zookeeper->exists(zookeeper_path + "/snapshots/" + *latest_snapshot + "/.completed")) + String snapshot_metadata_path = zookeeper_path + "/metadata/" + toString(from_snapshot); + Strings tables_in_snapshot = current_zookeeper->getChildren(snapshot_metadata_path); + current_zookeeper->get(zookeeper_path + "/snapshots/" + toString(from_snapshot)); /// Assert node exists + snapshot_metadata_path += '/'; + + for (const auto & table_name : tables_in_snapshot) { - snapshots.erase(latest_snapshot); - latest_snapshot = std::max_element(snapshots.begin(), snapshots.end()); + String query_to_execute = current_zookeeper->get(snapshot_metadata_path + table_name); + + + if (!startsWith(query_to_execute, "ATTACH ")) + throw Exception(ErrorCodes::LOGICAL_ERROR, "Unexpected query: {}", query_to_execute); + query_to_execute = "CREATE " + query_to_execute.substr(strlen("ATTACH ")); + + Context current_context = global_context; + current_context.getClientInfo().query_kind = ClientInfo::QueryKind::REPLICATED_LOG_QUERY; + current_context.setCurrentDatabase(database_name); + current_context.setCurrentQueryId(""); // generate random query_id + + executeQuery(query_to_execute, current_context); } - if (snapshots.size() < 1) - { - return; - } - - Strings metadatas; - if (current_zookeeper->tryGetChildren(zookeeper_path + "/snapshots/" + *latest_snapshot, metadatas) != Coordination::Error::ZOK) + if (create) return; - LOG_DEBUG(log, "Executing {} snapshot", *latest_snapshot); + current_zookeeper->set(replica_path + "/log-ptr", toString(from_snapshot)); + last_executed_log_entry = from_snapshot; + ddl_worker->setLogPointer(from_snapshot); //FIXME - for (auto t = metadatas.begin(); t != metadatas.end(); ++t) - { - String path = zookeeper_path + "/snapshots/" + *latest_snapshot + "/" + *t; - - String query_to_execute = current_zookeeper->get(path, {}, nullptr); - - auto current_context = std::make_unique(global_context); - current_context->getClientInfo().query_kind = ClientInfo::QueryKind::REPLICATED_LOG_QUERY; - current_context->setCurrentDatabase(database_name); - current_context->setCurrentQueryId(""); // generate random query_id - - executeQuery(query_to_execute, *current_context); - } - - last_executed_log_entry = *latest_snapshot; - writeLastExecutedToDiskAndZK(); + //writeLastExecutedToDiskAndZK(); } void DatabaseReplicated::drop(const Context & context_) diff --git a/src/Databases/DatabaseReplicated.h b/src/Databases/DatabaseReplicated.h index 219779d602d..3f5bd4608f1 100644 --- a/src/Databases/DatabaseReplicated.h +++ b/src/Databases/DatabaseReplicated.h @@ -13,6 +13,7 @@ namespace DB { class DDLWorker; +using ZooKeeperPtr = std::shared_ptr; /** DatabaseReplicated engine * supports replication of metadata @@ -56,22 +57,29 @@ public: void loadStoredObjects(Context & context, bool has_force_restore_data_flag, bool force_attach = false) override; private: - bool createDatabaseNodesInZooKeeper(const zkutil::ZooKeeperPtr & current_zookeeper); - void createReplicaNodesInZooKeeper(const zkutil::ZooKeeperPtr & current_zookeeper); + bool createDatabaseNodesInZooKeeper(const ZooKeeperPtr & current_zookeeper); + void 
createReplicaNodesInZooKeeper(const ZooKeeperPtr & current_zookeeper); - void runBackgroundLogExecutor(); + //void runBackgroundLogExecutor(); void writeLastExecutedToDiskAndZK(); - void loadMetadataFromSnapshot(); - void createSnapshot(); + //void loadMetadataFromSnapshot(); + void createSnapshot(const ZooKeeperPtr & zookeeper); void removeOutdatedSnapshotsAndLog(); + Strings getSnapshots(const ZooKeeperPtr & zookeeper) const; + + void onUnexpectedLogEntry(const String & entry_name, const ZooKeeperPtr & zookeeper); + void recoverLostReplica(const ZooKeeperPtr & current_zookeeper, UInt32 from_snapshot, bool create = false); + + void onExecutedLogEntry(const String & entry_name, const ZooKeeperPtr & zookeeper); + String zookeeper_path; String shard_name; String replica_name; String replica_path; - String log_entry_to_execute; + UInt32 log_entry_to_execute; std::mutex log_name_mutex; String log_name_to_exec_with_result; @@ -84,6 +92,8 @@ private: std::unique_ptr ddl_worker; + + }; } diff --git a/src/Interpreters/DDLWorker.cpp b/src/Interpreters/DDLWorker.cpp index 7d947a264a6..51f0e1b45a9 100644 --- a/src/Interpreters/DDLWorker.cpp +++ b/src/Interpreters/DDLWorker.cpp @@ -142,6 +142,22 @@ std::unique_ptr createSimpleZooKeeperLock( } +String DatabaseReplicatedExtensions::getLogEntryName(UInt32 log_entry_number) +{ + constexpr size_t seq_node_digits = 10; + String number = toString(log_entry_number); + String name = "query-" + String(seq_node_digits - number.size(), '0') + number; + return name; +} + +UInt32 DatabaseReplicatedExtensions::getLogEntryNumber(const String & log_entry_name) +{ + constexpr const char * name = "query-"; + assert(startsWith(log_entry_name, name)); + return parse(log_entry_name.substr(strlen(name))); +} + + DDLWorker::DDLWorker(int pool_size_, const std::string & zk_root_dir, const Context & context_, const Poco::Util::AbstractConfiguration * config, const String & prefix, std::optional database_replicated_ext_) : context(context_) @@ -236,8 +252,21 @@ DDLTaskPtr DDLWorker::initAndCheckTask(const String & entry_name, String & out_r String node_data; String entry_path = queue_dir + "/" + entry_name; + if (database_replicated_ext) + { + auto expected_log_entry = DatabaseReplicatedExtensions::getLogEntryName(database_replicated_ext->first_not_executed); + if (entry_name != expected_log_entry) + { + database_replicated_ext->lost_callback(entry_name, zookeeper); + out_reason = "DatabaseReplicated: expected " + expected_log_entry + " got " + entry_name; + return {}; + } + } + if (!zookeeper->tryGet(entry_path, node_data)) { + if (database_replicated_ext) + database_replicated_ext->lost_callback(entry_name, zookeeper); /// It is Ok that node could be deleted just now. It means that there are no current host in node's host list. out_reason = "The task was deleted"; return {}; @@ -339,7 +368,7 @@ void DDLWorker::scheduleTasks() ? 
queue_nodes.begin() : std::upper_bound(queue_nodes.begin(), queue_nodes.end(), last_tasks.back()); - for (auto it = begin_node; it != queue_nodes.end(); ++it) + for (auto it = begin_node; it != queue_nodes.end() && !stop_flag; ++it) { String entry_name = *it; @@ -362,11 +391,17 @@ void DDLWorker::scheduleTasks() if (!already_processed) { - worker_pool.scheduleOrThrowOnError([this, task_ptr = task.release()]() + if (database_replicated_ext) { - setThreadName("DDLWorkerExec"); - enqueueTask(DDLTaskPtr(task_ptr)); - }); + enqueueTask(DDLTaskPtr(task.release())); + } + else + { + worker_pool.scheduleOrThrowOnError([this, task_ptr = task.release()]() { + setThreadName("DDLWorkerExec"); + enqueueTask(DDLTaskPtr(task_ptr)); + }); + } } else { @@ -374,9 +409,6 @@ void DDLWorker::scheduleTasks() } saveTask(entry_name); - - if (stop_flag) - break; } } @@ -599,6 +631,7 @@ void DDLWorker::enqueueTask(DDLTaskPtr task_ptr) } } } + void DDLWorker::processTask(DDLTask & task) { auto zookeeper = tryGetZooKeeper(); @@ -626,7 +659,9 @@ void DDLWorker::processTask(DDLTask & task) else throw Coordination::Exception(code, active_node_path); - if (!task.was_executed) + //FIXME + bool is_dummy_query = database_replicated_ext && task.entry.query.empty(); + if (!task.was_executed && !is_dummy_query) { try { @@ -675,7 +710,19 @@ void DDLWorker::processTask(DDLTask & task) Coordination::Requests ops; ops.emplace_back(zkutil::makeRemoveRequest(active_node_path, -1)); ops.emplace_back(zkutil::makeCreateRequest(finished_node_path, task.execution_status.serializeText(), zkutil::CreateMode::Persistent)); + if (database_replicated_ext) + { + assert(DatabaseReplicatedExtensions::getLogEntryName(database_replicated_ext->first_not_executed) == task.entry_name); + ops.emplace_back(zkutil::makeSetRequest(database_replicated_ext->getReplicaPath() + "/log_ptr", toString(database_replicated_ext->first_not_executed), -1)); + } + zookeeper->multi(ops); + + if (database_replicated_ext) + { + database_replicated_ext->executed_callback(task.entry_name, zookeeper); + ++(database_replicated_ext->first_not_executed); + } } diff --git a/src/Interpreters/DDLWorker.h b/src/Interpreters/DDLWorker.h index f38d41df503..08bf641264e 100644 --- a/src/Interpreters/DDLWorker.h +++ b/src/Interpreters/DDLWorker.h @@ -37,16 +37,25 @@ using ZooKeeperPtr = std::shared_ptr; struct DatabaseReplicatedExtensions { UUID database_uuid; + String zookeeper_path; String database_name; String shard_name; String replica_name; - String first_not_executed; - using NewEntryCallback = std::function; + UInt32 first_not_executed; + using EntryLostCallback = std::function; using EntryExecutedCallback = std::function; using EntryErrorCallback = std::function; - NewEntryCallback before_execution_callback; + EntryLostCallback lost_callback; EntryExecutedCallback executed_callback; EntryErrorCallback error_callback; + + String getReplicaPath() const + { + return zookeeper_path + "/replicas/" + shard_name + "/" + replica_name; + } + + static String getLogEntryName(UInt32 log_entry_number); + static UInt32 getLogEntryNumber(const String & log_entry_name); }; @@ -69,6 +78,9 @@ public: void shutdown(); + //FIXME get rid of this method + void setLogPointer(UInt32 log_pointer) { database_replicated_ext->first_not_executed = log_pointer; } + private: /// Returns cached ZooKeeper session (possibly expired). 
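For reference, a standalone sketch (not part of the patches) of the log entry naming scheme implemented by DatabaseReplicatedExtensions::getLogEntryName/getLogEntryNumber above: entry names are "query-" followed by the entry number zero-padded to 10 digits, the same width ZooKeeper uses for sequential node suffixes, so lexicographic order of the names matches numeric order. The helper names below are local stand-ins, not the real API.

#include <cassert>
#include <cstddef>
#include <cstdint>
#include <string>

// Local stand-in mirroring DatabaseReplicatedExtensions::getLogEntryName.
static std::string logEntryName(uint32_t log_entry_number)
{
    constexpr std::size_t seq_node_digits = 10;
    std::string number = std::to_string(log_entry_number);
    return "query-" + std::string(seq_node_digits - number.size(), '0') + number;
}

// Local stand-in mirroring DatabaseReplicatedExtensions::getLogEntryNumber.
static uint32_t logEntryNumber(const std::string & log_entry_name)
{
    return static_cast<uint32_t>(std::stoul(log_entry_name.substr(std::string("query-").size())));
}

int main()
{
    assert(logEntryName(42) == "query-0000000042");
    assert(logEntryNumber("query-0000000042") == 42);
    assert(logEntryName(7) < logEntryName(10));   // padded names sort in numeric order
}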
From 7ab4445e993333f15cea8d69e0de9a909c7d6495 Mon Sep 17 00:00:00 2001 From: Alexander Tokmakov Date: Fri, 20 Nov 2020 19:06:27 +0300 Subject: [PATCH 050/381] try another approach --- src/Databases/DatabaseAtomic.cpp | 18 ++- src/Databases/DatabaseAtomic.h | 4 +- src/Databases/DatabaseOnDisk.cpp | 5 +- src/Databases/DatabaseOnDisk.h | 2 +- src/Databases/DatabaseReplicated.cpp | 124 +++----------------- src/Databases/DatabaseReplicated.h | 2 - src/Interpreters/Context.cpp | 13 ++ src/Interpreters/Context.h | 11 ++ src/Interpreters/DDLTask.h | 22 ++++ src/Interpreters/DDLWorker.cpp | 96 ++++++++++++--- src/Interpreters/DDLWorker.h | 5 + src/Interpreters/SystemLog.h | 9 +- src/Storages/StorageReplicatedMergeTree.cpp | 7 ++ 13 files changed, 186 insertions(+), 132 deletions(-) diff --git a/src/Databases/DatabaseAtomic.cpp b/src/Databases/DatabaseAtomic.cpp index 15a55da89b2..78400368924 100644 --- a/src/Databases/DatabaseAtomic.cpp +++ b/src/Databases/DatabaseAtomic.cpp @@ -11,6 +11,9 @@ #include #include +//FIXME it shouldn't be here +#include +#include namespace DB { @@ -263,7 +266,8 @@ void DatabaseAtomic::renameTable(const Context & context, const String & table_n } void DatabaseAtomic::commitCreateTable(const ASTCreateQuery & query, const StoragePtr & table, - const String & table_metadata_tmp_path, const String & table_metadata_path) + const String & table_metadata_tmp_path, const String & table_metadata_path, + const Context & query_context) { DetachedTables not_in_use; auto table_data_path = getTableDataPath(query); @@ -280,6 +284,18 @@ void DatabaseAtomic::commitCreateTable(const ASTCreateQuery & query, const Stora /// We will get en exception if some table with the same UUID exists (even if it's detached table or table from another database) DatabaseCatalog::instance().addUUIDMapping(query.uuid); locked_uuid = true; + + if (auto txn = query_context.getMetadataTransaction()) + { + String metadata_zk_path = txn->zookeeper_path + "/metadata/" + escapeForFileName(query.table); + String statement = getObjectDefinitionFromCreateQuery(query.clone()); + /// zk::multi(...) will throw if `metadata_zk_path` exists + txn->ops.emplace_back(zkutil::makeCreateRequest(metadata_zk_path, statement, zkutil::CreateMode::Persistent)); + txn->current_zookeeper->multi(txn->ops); /// Commit point (a sort of) for Replicated database + /// NOTE: replica will be lost if server crashes before the following renameNoReplace(...) 
+ /// TODO better detection and recovery + } + /// It throws if `table_metadata_path` already exists (it's possible if table was detached) renameNoReplace(table_metadata_tmp_path, table_metadata_path); /// Commit point (a sort of) attachTableUnlocked(query.table, table, lock); /// Should never throw diff --git a/src/Databases/DatabaseAtomic.h b/src/Databases/DatabaseAtomic.h index 97e6e1173d1..61ce2721701 100644 --- a/src/Databases/DatabaseAtomic.h +++ b/src/Databases/DatabaseAtomic.h @@ -60,10 +60,10 @@ public: void waitDetachedTableNotInUse(const UUID & uuid); -private: +protected: void commitAlterTable(const StorageID & table_id, const String & table_metadata_tmp_path, const String & table_metadata_path) override; void commitCreateTable(const ASTCreateQuery & query, const StoragePtr & table, - const String & table_metadata_tmp_path, const String & table_metadata_path) override; + const String & table_metadata_tmp_path, const String & table_metadata_path, const Context & query_context) override; void assertDetachedTableNotInUse(const UUID & uuid); typedef std::unordered_map DetachedTables; diff --git a/src/Databases/DatabaseOnDisk.cpp b/src/Databases/DatabaseOnDisk.cpp index 8fa136f4969..8f24f53fc3f 100644 --- a/src/Databases/DatabaseOnDisk.cpp +++ b/src/Databases/DatabaseOnDisk.cpp @@ -193,11 +193,12 @@ void DatabaseOnDisk::createTable( out.close(); } - commitCreateTable(create, table, table_metadata_tmp_path, table_metadata_path); + commitCreateTable(create, table, table_metadata_tmp_path, table_metadata_path, context); } void DatabaseOnDisk::commitCreateTable(const ASTCreateQuery & query, const StoragePtr & table, - const String & table_metadata_tmp_path, const String & table_metadata_path) + const String & table_metadata_tmp_path, const String & table_metadata_path, + const Context & /*query_context*/) { try { diff --git a/src/Databases/DatabaseOnDisk.h b/src/Databases/DatabaseOnDisk.h index 23c1584ff9c..a5510ef4810 100644 --- a/src/Databases/DatabaseOnDisk.h +++ b/src/Databases/DatabaseOnDisk.h @@ -83,7 +83,7 @@ protected: ASTPtr getCreateQueryFromMetadata(const String & metadata_path, bool throw_on_error) const; virtual void commitCreateTable(const ASTCreateQuery & query, const StoragePtr & table, - const String & table_metadata_tmp_path, const String & table_metadata_path); + const String & table_metadata_tmp_path, const String & table_metadata_path, const Context & query_context); const String metadata_path; const String data_path; diff --git a/src/Databases/DatabaseReplicated.cpp b/src/Databases/DatabaseReplicated.cpp index 7b6d98f992a..608d03c339b 100644 --- a/src/Databases/DatabaseReplicated.cpp +++ b/src/Databases/DatabaseReplicated.cpp @@ -29,10 +29,9 @@ namespace ErrorCodes extern const int BAD_ARGUMENTS; extern const int REPLICA_IS_ALREADY_EXIST; extern const int DATABASE_REPLICATION_FAILED; + extern const int UNKNOWN_DATABASE; } -static constexpr size_t METADATA_FILE_BUFFER_SIZE = 32768; - zkutil::ZooKeeperPtr DatabaseReplicated::getZooKeeper() const { return global_context.getZooKeeper(); @@ -43,15 +42,6 @@ static inline String getHostID(const Context & global_context) return Cluster::Address::toString(getFQDNOrHostName(), global_context.getTCPPort()); } -Strings DatabaseReplicated::getSnapshots(const ZooKeeperPtr & zookeeper) const -{ - Strings snapshots = zookeeper->getChildren(zookeeper_path + "/snapshots"); - std::sort(snapshots.begin(), snapshots.end()); - if (snapshots.empty()) - throw Exception(ErrorCodes::LOGICAL_ERROR, "No snapshots found"); - return snapshots; 
-} - DatabaseReplicated::~DatabaseReplicated() = default; @@ -125,11 +115,9 @@ bool DatabaseReplicated::createDatabaseNodesInZooKeeper(const zkutil::ZooKeeperP ops.emplace_back(zkutil::makeCreateRequest(zookeeper_path, "", zkutil::CreateMode::Persistent)); ops.emplace_back(zkutil::makeCreateRequest(zookeeper_path + "/log", "", zkutil::CreateMode::Persistent)); ops.emplace_back(zkutil::makeCreateRequest(zookeeper_path + "/replicas", "", zkutil::CreateMode::Persistent)); - ops.emplace_back(zkutil::makeCreateRequest(zookeeper_path + "/snapshots", "", zkutil::CreateMode::Persistent)); - /// Create empty snapshot (with no tables) - ops.emplace_back(zkutil::makeCreateRequest(zookeeper_path + "/snapshots/0", "", zkutil::CreateMode::Persistent)); + ops.emplace_back(zkutil::makeCreateRequest(zookeeper_path + "/counter", "", zkutil::CreateMode::Persistent)); ops.emplace_back(zkutil::makeCreateRequest(zookeeper_path + "/metadata", "", zkutil::CreateMode::Persistent)); - ops.emplace_back(zkutil::makeCreateRequest(zookeeper_path + "/metadata/0", "", zkutil::CreateMode::Persistent)); + ops.emplace_back(zkutil::makeCreateRequest(zookeeper_path + "/min_log_ptr", "0", zkutil::CreateMode::Persistent)); Coordination::Responses responses; auto res = current_zookeeper->tryMulti(ops, responses); @@ -147,7 +135,7 @@ void DatabaseReplicated::createReplicaNodesInZooKeeper(const zkutil::ZooKeeperPt current_zookeeper->createAncestors(replica_path); /// When creating new replica, use latest snapshot version as initial value of log_pointer - log_entry_to_execute = parse(getSnapshots(current_zookeeper).back()); + log_entry_to_execute = 0; //FIXME /// Write host name to replica_path, it will protect from multiple replicas with the same name auto host_id = getHostID(global_context); @@ -160,10 +148,16 @@ void DatabaseReplicated::createReplicaNodesInZooKeeper(const zkutil::ZooKeeperPt recoverLostReplica(current_zookeeper, log_entry_to_execute, true); + String query_path_prefix = zookeeper_path + "/log/query-"; + String counter_prefix = zookeeper_path + "/counter/cnt-"; + String counter_path = current_zookeeper->create(counter_prefix, "", zkutil::CreateMode::EphemeralSequential); + String query_path = query_path_prefix + counter_path.substr(counter_prefix.size()); + Coordination::Requests ops; ops.emplace_back(zkutil::makeCreateRequest(replica_path, host_id, zkutil::CreateMode::Persistent)); ops.emplace_back(zkutil::makeCreateRequest(replica_path + "/log_ptr", toString(log_entry_to_execute), zkutil::CreateMode::Persistent)); - ops.emplace_back(zkutil::makeCreateRequest(zookeeper_path + "/log/query-", entry.toString(), zkutil::CreateMode::PersistentSequential)); + ops.emplace_back(zkutil::makeCreateRequest(query_path, entry.toString(), zkutil::CreateMode::PersistentSequential)); + ops.emplace_back(zkutil::makeRemoveRequest(counter_path, -1)); current_zookeeper->multi(ops); } @@ -207,20 +201,17 @@ void DatabaseReplicated::onUnexpectedLogEntry(const String & entry_name, const Z if (entry_number < log_entry_to_execute) throw Exception(ErrorCodes::LOGICAL_ERROR, "Entry {} already executed, current pointer is {}", entry_number, log_entry_to_execute); - /// Entry name is valid. Let's get min snapshot version to check if replica is staled. - Strings snapshots = getSnapshots(zookeeper); - UInt32 min_snapshot = parse(snapshots.front()); + /// Entry name is valid. Let's get min log pointer to check if replica is staled. 
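// Standalone sketch (not part of the patch) of the checks in onUnexpectedLogEntry(): the replica
// can only recover automatically when its pointer fell behind min_log_ptr (older entries may have
// been cleaned up from the log); any other gap in the log is treated as a logical error.
#include <cassert>
#include <cstdint>
#include <string>

static std::string decide(uint32_t log_entry_to_execute, uint32_t entry_number, uint32_t min_log_ptr)
{
    if (entry_number < log_entry_to_execute)
        return "already executed";            // stale notification, nothing to do
    if (log_entry_to_execute < min_log_ptr)
        return "recover from metadata";       // log was rotated past this replica
    return "cannot recover";                  // unexpected hole in the log -> stop replication
}

int main()
{
    assert(decide(7, 5, 3) == "already executed");
    assert(decide(2, 7, 5) == "recover from metadata");
    assert(decide(5, 7, 3) == "cannot recover");
}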
+ UInt32 min_snapshot = parse(zookeeper->get(zookeeper_path + "/min_log_ptr")); if (log_entry_to_execute < min_snapshot) { - recoverLostReplica(zookeeper, parse(snapshots.back())); + recoverLostReplica(zookeeper, 0); //FIXME log_pointer return; } throw Exception(ErrorCodes::LOGICAL_ERROR, "Cannot recover replica, probably it's a bug. " - "Got log entry '{}' when expected entry number {}, " - "available snapshots: ", - entry_name, log_entry_to_execute, boost::algorithm::join(snapshots, ", ")); + "Got log entry '{}' when expected entry number {}"); } void DatabaseReplicated::removeOutdatedSnapshotsAndLog() @@ -268,51 +259,11 @@ void DatabaseReplicated::removeOutdatedSnapshotsAndLog() } } -void DatabaseReplicated::onExecutedLogEntry(const String & entry_name, const ZooKeeperPtr & zookeeper) +void DatabaseReplicated::onExecutedLogEntry(const String & /*entry_name*/, const ZooKeeperPtr & /*zookeeper*/) { - assert(entry_name == DatabaseReplicatedExtensions::getLogEntryName(log_entry_to_execute)); - ++log_entry_to_execute; - if (snapshot_period > 0 && log_entry_to_execute % snapshot_period == 0) - { - createSnapshot(zookeeper); - } } -//void DatabaseReplicated::runBackgroundLogExecutor() -//{ -// if (last_executed_log_entry.empty()) -// { -// loadMetadataFromSnapshot(); -// } -// -// auto current_zookeeper = getZooKeeper(); -// Strings log_entry_names = current_zookeeper->getChildren(zookeeper_path + "/log"); -// -// std::sort(log_entry_names.begin(), log_entry_names.end()); -// auto newest_entry_it = std::upper_bound(log_entry_names.begin(), log_entry_names.end(), last_executed_log_entry); -// -// log_entry_names.erase(log_entry_names.begin(), newest_entry_it); -// -// for (const String & log_entry_name : log_entry_names) -// { -// //executeLogName(log_entry_name); -// last_executed_log_entry = log_entry_name; -// writeLastExecutedToDiskAndZK(); -// -// int log_n = parse(log_entry_name.substr(4)); -// int last_log_n = parse(log_entry_names.back().substr(4)); -// -// /// The third condition gurantees at most one snapshot creation per batch -// if (log_n > 0 && snapshot_period > 0 && (last_log_n - log_n) / snapshot_period == 0 && log_n % snapshot_period == 0) -// { -// createSnapshot(); -// } -// } -// -// //background_log_executor->scheduleAfter(500); -//} - void DatabaseReplicated::writeLastExecutedToDiskAndZK() { auto current_zookeeper = getZooKeeper(); @@ -363,58 +314,19 @@ BlockIO DatabaseReplicated::propose(const ASTPtr & query) } -void DatabaseReplicated::createSnapshot(const ZooKeeperPtr & zookeeper) -{ - String snapshot_path = zookeeper_path + "/snapshot/" + toString(log_entry_to_execute); - - if (zookeeper->exists(snapshot_path)) - return; - - std::vector> create_queries; - { - std::lock_guard lock{mutex}; - create_queries.reserve(tables.size()); - for (const auto & table : tables) - { - const String & name = table.first; - ReadBufferFromFile in(getObjectMetadataPath(name), METADATA_FILE_BUFFER_SIZE); - String attach_query; - readStringUntilEOF(attach_query, in); - create_queries.emplace_back(escapeForFileName(name), std::move(attach_query)); - } - } - - if (zookeeper->exists(snapshot_path)) - return; - - String queries_path = zookeeper_path + "/metadata/" + toString(log_entry_to_execute); - zookeeper->tryCreate(queries_path, "", zkutil::CreateMode::Persistent); - queries_path += '/'; - - //FIXME use tryMulti with MULTI_BATCH_SIZE - - for (const auto & table : create_queries) - zookeeper->tryCreate(queries_path + table.first, table.second, zkutil::CreateMode::Persistent); - - if 
(create_queries.size() != zookeeper->getChildren(zookeeper_path + "/metadata/" + toString(log_entry_to_execute)).size()) - throw Exception(ErrorCodes::LOGICAL_ERROR, "Created invalid snapshot"); - - zookeeper->tryCreate(snapshot_path, String(), zkutil::CreateMode::Persistent); -} - void DatabaseReplicated::recoverLostReplica(const ZooKeeperPtr & current_zookeeper, UInt32 from_snapshot, bool create) { LOG_WARNING(log, "Will recover replica from snapshot", from_snapshot); //FIXME drop old tables - String snapshot_metadata_path = zookeeper_path + "/metadata/" + toString(from_snapshot); + String snapshot_metadata_path = zookeeper_path + "/metadata"; Strings tables_in_snapshot = current_zookeeper->getChildren(snapshot_metadata_path); - current_zookeeper->get(zookeeper_path + "/snapshots/" + toString(from_snapshot)); /// Assert node exists snapshot_metadata_path += '/'; for (const auto & table_name : tables_in_snapshot) { + //FIXME It's not atomic. We need multiget here (available since ZooKeeper 3.6.0). String query_to_execute = current_zookeeper->get(snapshot_metadata_path + table_name); diff --git a/src/Databases/DatabaseReplicated.h b/src/Databases/DatabaseReplicated.h index 3f5bd4608f1..663df59ac63 100644 --- a/src/Databases/DatabaseReplicated.h +++ b/src/Databases/DatabaseReplicated.h @@ -64,10 +64,8 @@ private: void writeLastExecutedToDiskAndZK(); //void loadMetadataFromSnapshot(); - void createSnapshot(const ZooKeeperPtr & zookeeper); void removeOutdatedSnapshotsAndLog(); - Strings getSnapshots(const ZooKeeperPtr & zookeeper) const; void onUnexpectedLogEntry(const String & entry_name, const ZooKeeperPtr & zookeeper); void recoverLostReplica(const ZooKeeperPtr & current_zookeeper, UInt32 from_snapshot, bool create = false); diff --git a/src/Interpreters/Context.cpp b/src/Interpreters/Context.cpp index 1b9391b8725..a7309e9ae47 100644 --- a/src/Interpreters/Context.cpp +++ b/src/Interpreters/Context.cpp @@ -2415,4 +2415,17 @@ StorageID Context::resolveStorageIDImpl(StorageID storage_id, StorageNamespace w return StorageID::createEmpty(); } +void Context::initMetadataTransaction(MetadataTransactionPtr txn) +{ + assert(!metadata_transaction); + assert(query_context == this); + metadata_transaction = std::move(txn); +} + +MetadataTransactionPtr Context::getMetadataTransaction() const +{ + assert(query_context == this); + return metadata_transaction; +} + } diff --git a/src/Interpreters/Context.h b/src/Interpreters/Context.h index c55d8e6d604..ed11fab7599 100644 --- a/src/Interpreters/Context.h +++ b/src/Interpreters/Context.h @@ -114,6 +114,8 @@ using VolumePtr = std::shared_ptr; struct NamedSession; struct BackgroundTaskSchedulingSettings; +struct MetadataTransaction; +using MetadataTransactionPtr = std::shared_ptr; #if USE_EMBEDDED_COMPILER class CompiledExpressionCache; @@ -212,6 +214,12 @@ private: /// to be customized in HTTP and TCP servers by overloading the customizeContext(DB::Context&) /// methods. + MetadataTransactionPtr metadata_transaction; /// Distributed DDL context. I'm not sure if it's a suitable place for this, + /// but it's the easiest way to pass this through the whole stack from executeQuery(...) + /// to DatabaseOnDisk::commitCreateTable(...) or IStorage::alter(...) without changing + /// thousands of signatures. + /// And I hope it will be replaced with more common Transaction sometime. 
+ /// Use copy constructor or createGlobal() instead Context(); @@ -634,6 +642,9 @@ public: IHostContextPtr & getHostContext(); const IHostContextPtr & getHostContext() const; + void initMetadataTransaction(MetadataTransactionPtr txn); + MetadataTransactionPtr getMetadataTransaction() const; + struct MySQLWireContext { uint8_t sequence_id = 0; diff --git a/src/Interpreters/DDLTask.h b/src/Interpreters/DDLTask.h index 51f09efd0bd..ba58fe3f42e 100644 --- a/src/Interpreters/DDLTask.h +++ b/src/Interpreters/DDLTask.h @@ -1,12 +1,14 @@ #pragma once #include #include +#include namespace DB { class ASTQueryWithOnCluster; +using ZooKeeperPtr = std::shared_ptr; struct HostID { @@ -62,6 +64,8 @@ struct DDLTask String entry_path; DDLLogEntry entry; + bool we_are_initiator = false; + /// Stage 2: resolve host_id and check that HostID host_id; String host_id_str; @@ -82,7 +86,25 @@ struct DDLTask bool was_executed = false; /// Stage 4: commit results to ZooKeeper + + String active_path; + String finished_path; + String shard_path; }; +struct MetadataTransaction +{ + ZooKeeperPtr current_zookeeper; + String zookeeper_path; + Coordination::Requests ops; + + + + void addOps(Coordination::Requests & other_ops) + { + std::move(ops.begin(), ops.end(), std::back_inserter(other_ops)); + } +}; + } diff --git a/src/Interpreters/DDLWorker.cpp b/src/Interpreters/DDLWorker.cpp index 51f0e1b45a9..5e4d79c32ab 100644 --- a/src/Interpreters/DDLWorker.cpp +++ b/src/Interpreters/DDLWorker.cpp @@ -252,13 +252,35 @@ DDLTaskPtr DDLWorker::initAndCheckTask(const String & entry_name, String & out_r String node_data; String entry_path = queue_dir + "/" + entry_name; + auto task = std::make_unique(); + task->entry_name = entry_name; + task->entry_path = entry_path; + if (database_replicated_ext) { - auto expected_log_entry = DatabaseReplicatedExtensions::getLogEntryName(database_replicated_ext->first_not_executed); - if (entry_name != expected_log_entry) + //auto expected_log_entry = DatabaseReplicatedExtensions::getLogEntryName(database_replicated_ext->first_not_executed); + //if (entry_name != expected_log_entry) + //{ + // database_replicated_ext->lost_callback(entry_name, zookeeper); + // out_reason = "DatabaseReplicated: expected " + expected_log_entry + " got " + entry_name; + // return {}; + //} + + String initiator_name; + zkutil::EventPtr wait_committed_or_failed; + + if (zookeeper->tryGet(entry_path + "/try", initiator_name, nullptr, wait_committed_or_failed)) { - database_replicated_ext->lost_callback(entry_name, zookeeper); - out_reason = "DatabaseReplicated: expected " + expected_log_entry + " got " + entry_name; + task->we_are_initiator = initiator_name == database_replicated_ext->getFullReplicaName(); + /// Query is not committed yet. We cannot just skip it and execute next one, because reordering may break replication. 
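// Standalone sketch (simplified types, not the real zkutil/Coordination API) of the
// MetadataTransaction pattern introduced in this patch: the query context accumulates ZooKeeper
// requests from several layers and either commits them itself with one multi() or hands them to
// a storage-level multi() via addOps(), so the whole DDL step becomes visible atomically.
#include <algorithm>
#include <iostream>
#include <iterator>
#include <string>
#include <vector>

struct Request { std::string description; };          // stand-in for Coordination::RequestPtr

struct MetadataTransaction
{
    std::vector<Request> ops;

    // Same move-append idea as MetadataTransaction::addOps in the DDLTask.h hunk above.
    void addOps(std::vector<Request> & other_ops)
    {
        std::move(ops.begin(), ops.end(), std::back_inserter(other_ops));
        ops.clear();
    }
};

int main()
{
    MetadataTransaction txn;
    txn.ops.push_back({"remove  <log entry>/try"});
    txn.ops.push_back({"create  <log entry>/committed"});
    txn.ops.push_back({"set     <db>/metadata/<table>"});

    // e.g. StorageReplicatedMergeTree::alter() builds its own requests, then appends the
    // transaction's requests before issuing a single multi().
    std::vector<Request> alter_ops = {{"set     <table>/metadata"}, {"create  <table>/log/log-"}};
    txn.addOps(alter_ops);

    for (const auto & op : alter_ops)
        std::cout << op.description << '\n';           // five requests, committed together in real code
}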
+ //FIXME add some timeouts + if (!task->we_are_initiator) + wait_committed_or_failed->wait(); + } + + if (!task->we_are_initiator && !zookeeper->exists(entry_path + "/committed")) + { + out_reason = "Entry " + entry_name + " hasn't been committed"; return {}; } } @@ -272,10 +294,6 @@ DDLTaskPtr DDLWorker::initAndCheckTask(const String & entry_name, String & out_r return {}; } - auto task = std::make_unique(); - task->entry_name = entry_name; - task->entry_path = entry_path; - try { task->entry.parse(node_data); @@ -557,15 +575,34 @@ bool DDLWorker::tryExecuteQuery(const String & query, const DDLTask & task, Exec try { auto current_context = std::make_unique(context); + current_context->makeQueryContext(); + current_context->setCurrentQueryId(""); // generate random query_id + if (database_replicated_ext) { current_context->getClientInfo().query_kind = ClientInfo::QueryKind::REPLICATED_LOG_QUERY; //FIXME why do we need separate query kind? current_context->setCurrentDatabase(database_replicated_ext->database_name); + + if (task.we_are_initiator) + { + auto txn = std::make_shared(); + current_context->initMetadataTransaction(txn); + txn->current_zookeeper = current_zookeeper; + txn->zookeeper_path = database_replicated_ext->zookeeper_path; + txn->ops.emplace_back(zkutil::makeRemoveRequest(task.entry_path + "/try", -1)); + txn->ops.emplace_back(zkutil::makeCreateRequest(task.entry_path + "/committed", + database_replicated_ext->getFullReplicaName(), zkutil::CreateMode::Persistent)); + txn->ops.emplace_back(zkutil::makeRemoveRequest(task.active_path, -1)); + if (!task.shard_path.empty()) + txn->ops.emplace_back(zkutil::makeCreateRequest(task.shard_path, task.host_id_str, zkutil::CreateMode::Persistent)); + txn->ops.emplace_back(zkutil::makeCreateRequest(task.finished_path, task.execution_status.serializeText(), zkutil::CreateMode::Persistent)); + //txn->ops.emplace_back(zkutil::makeSetRequest(database_replicated_ext->getReplicaPath() + "/log_ptr", toString(database_replicated_ext->first_not_executed), -1)); + } } else current_context->getClientInfo().query_kind = ClientInfo::QueryKind::SECONDARY_QUERY; - current_context->setCurrentQueryId(""); // generate random query_id + executeQuery(istr, ostr, false, *current_context, {}); } catch (...) 
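// Summary (annotation, not part of the patch) of the commit protocol this patch builds around a
// replication log entry <db>/log/query-NNNNNNNNNN:
//
//   1. The initiator allocates the number via an ephemeral sequential node under <db>/counter,
//      then creates query-N with the serialized DDLLogEntry plus an ephemeral child
//      query-N/try = "<shard>|<replica>" in one multi() (see enqueueQuery below).
//   2. Other replicas that see /try wait for it to be resolved rather than skipping the entry,
//      because executing later entries first would reorder the replication log.
//   3. On success, the initiator's MetadataTransaction atomically replaces /try with
//      /committed = "<shard>|<replica>" together with its other ZooKeeper changes (above).
//   4. If the initiator fails or its session expires, /try vanishes and the entry is skipped
//      with the "hasn't been committed" reason.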
@@ -639,8 +676,9 @@ void DDLWorker::processTask(DDLTask & task) LOG_DEBUG(log, "Processing task {} ({})", task.entry_name, task.entry.query); String dummy; - String active_node_path = task.entry_path + "/active/" + task.host_id_str; - String finished_node_path = task.entry_path + "/finished/" + task.host_id_str; + //FIXME duplicate + String active_node_path = task.active_path = task.entry_path + "/active/" + task.host_id_str; + String finished_node_path = task.finished_path = task.entry_path + "/finished/" + task.host_id_str; auto code = zookeeper->tryCreate(active_node_path, "", zkutil::CreateMode::Ephemeral, dummy); @@ -712,11 +750,15 @@ void DDLWorker::processTask(DDLTask & task) ops.emplace_back(zkutil::makeCreateRequest(finished_node_path, task.execution_status.serializeText(), zkutil::CreateMode::Persistent)); if (database_replicated_ext) { - assert(DatabaseReplicatedExtensions::getLogEntryName(database_replicated_ext->first_not_executed) == task.entry_name); - ops.emplace_back(zkutil::makeSetRequest(database_replicated_ext->getReplicaPath() + "/log_ptr", toString(database_replicated_ext->first_not_executed), -1)); + //assert(DatabaseReplicatedExtensions::getLogEntryName(database_replicated_ext->first_not_executed) == task.entry_name); + //ops.emplace_back(zkutil::makeSetRequest(database_replicated_ext->getReplicaPath() + "/log_ptr", toString(database_replicated_ext->first_not_executed), -1)); } - zookeeper->multi(ops); + //FIXME replace with multi(...) or use MetadataTransaction + Coordination::Responses responses; + auto res = zookeeper->tryMulti(ops, responses); + if (res != Coordination::Error::ZNODEEXISTS && res != Coordination::Error::ZNONODE) + zkutil::KeeperMultiException::check(res, ops, responses); if (database_replicated_ext) { @@ -774,6 +816,7 @@ bool DDLWorker::tryExecuteQueryOnLeaderReplica( else shard_node_name = get_shard_name(task.cluster->getShardsAddresses().at(task.host_shard_num)); String shard_path = node_path + "/shards/" + shard_node_name; + task.shard_path = shard_path; //FIXME duplicate String is_executed_path = shard_path + "/executed"; String tries_to_execute_path = shard_path + "/tries_to_execute"; zookeeper->createAncestors(shard_path + "/"); @@ -826,7 +869,8 @@ bool DDLWorker::tryExecuteQueryOnLeaderReplica( /// and on the next iteration new leader will take lock if (tryExecuteQuery(rewritten_query, task, task.execution_status)) { - zookeeper->create(is_executed_path, task.host_id_str, zkutil::CreateMode::Persistent); + //FIXME replace with create(...) 
or remove and use MetadataTransaction + zookeeper->createIfNotExists(is_executed_path, task.host_id_str); executed_by_leader = true; break; } @@ -976,7 +1020,27 @@ String DDLWorker::enqueueQuery(DDLLogEntry & entry) String query_path_prefix = queue_dir + "/query-"; zookeeper->createAncestors(query_path_prefix); - String node_path = zookeeper->create(query_path_prefix, entry.toString(), zkutil::CreateMode::PersistentSequential); + String node_path; + if (database_replicated_ext) + { + /// We cannot create sequential node and it's ephemeral child in a single transaction, so allocate sequential number another way + String counter_prefix = database_replicated_ext->zookeeper_path + "/counter/cnt-"; + String counter_path = zookeeper->create(counter_prefix, "", zkutil::CreateMode::EphemeralSequential); + node_path = query_path_prefix + counter_path.substr(counter_prefix.size()); + + Coordination::Requests ops; + /// Query is not committed yet, but we have to write it into log to avoid reordering + ops.emplace_back(zkutil::makeCreateRequest(node_path, entry.toString(), zkutil::CreateMode::Persistent)); + /// '/try' will be replaced with '/committed' or will be removed due to expired session or other error + ops.emplace_back(zkutil::makeCreateRequest(node_path + "/try", database_replicated_ext->getFullReplicaName(), zkutil::CreateMode::Ephemeral)); + /// We don't need it anymore + ops.emplace_back(zkutil::makeRemoveRequest(counter_path, -1)); + zookeeper->multi(ops); + } + else + { + node_path = zookeeper->create(query_path_prefix, entry.toString(), zkutil::CreateMode::PersistentSequential); + } /// Optional step try diff --git a/src/Interpreters/DDLWorker.h b/src/Interpreters/DDLWorker.h index 08bf641264e..86677bfbb19 100644 --- a/src/Interpreters/DDLWorker.h +++ b/src/Interpreters/DDLWorker.h @@ -54,6 +54,11 @@ struct DatabaseReplicatedExtensions return zookeeper_path + "/replicas/" + shard_name + "/" + replica_name; } + String getFullReplicaName() const + { + return shard_name + '|' + replica_name; + } + static String getLogEntryName(UInt32 log_entry_number); static UInt32 getLogEntryNumber(const String & log_entry_name); }; diff --git a/src/Interpreters/SystemLog.h b/src/Interpreters/SystemLog.h index 6c56565a152..20980a186cb 100644 --- a/src/Interpreters/SystemLog.h +++ b/src/Interpreters/SystemLog.h @@ -505,7 +505,9 @@ void SystemLog::prepareTable() LOG_DEBUG(log, "Existing table {} for system log has obsolete or different structure. Renaming it to {}", description, backQuoteIfNeed(to.table)); - InterpreterRenameQuery(rename, context).execute(); + Context query_context = context; + query_context.makeQueryContext(); + InterpreterRenameQuery(rename, query_context).execute(); /// The required table will be created. 
table = nullptr; @@ -521,7 +523,10 @@ void SystemLog::prepareTable() auto create = getCreateTableQuery(); - InterpreterCreateQuery interpreter(create, context); + + Context query_context = context; + query_context.makeQueryContext(); + InterpreterCreateQuery interpreter(create, query_context); interpreter.setInternal(true); interpreter.execute(); diff --git a/src/Storages/StorageReplicatedMergeTree.cpp b/src/Storages/StorageReplicatedMergeTree.cpp index b93500000b5..5c176de1395 100644 --- a/src/Storages/StorageReplicatedMergeTree.cpp +++ b/src/Storages/StorageReplicatedMergeTree.cpp @@ -46,6 +46,7 @@ #include #include #include +#include #include #include @@ -4104,6 +4105,12 @@ void StorageReplicatedMergeTree::alter( zkutil::makeCreateRequest(mutations_path + "/", mutation_entry.toString(), zkutil::CreateMode::PersistentSequential)); } + if (auto txn = query_context.getMetadataTransaction()) + { + txn->addOps(ops); + //TODO maybe also change here table metadata in replicated database? + } + Coordination::Responses results; Coordination::Error rc = zookeeper->tryMulti(ops, results); From dad21ee684c5869d1c83b572cdec5c6f3bcb9130 Mon Sep 17 00:00:00 2001 From: Alexander Tokmakov Date: Tue, 24 Nov 2020 13:24:39 +0300 Subject: [PATCH 051/381] maintain metadata in zk --- src/Common/ZooKeeper/ZooKeeper.cpp | 8 +++ src/Databases/DatabaseAtomic.cpp | 56 ++++++++++++++++- src/Databases/DatabaseAtomic.h | 2 +- src/Databases/DatabaseOrdinary.cpp | 4 +- src/Databases/DatabaseOrdinary.h | 2 +- src/Databases/DatabaseReplicated.cpp | 4 +- src/Interpreters/DDLWorker.cpp | 24 +++----- src/Interpreters/InterpreterAlterQuery.cpp | 4 +- src/Storages/StorageReplicatedMergeTree.cpp | 30 ++++++++-- .../test_replicated_database/test.py | 60 +++++++++++-------- 10 files changed, 140 insertions(+), 54 deletions(-) diff --git a/src/Common/ZooKeeper/ZooKeeper.cpp b/src/Common/ZooKeeper/ZooKeeper.cpp index bee875d1c74..09703e523bb 100644 --- a/src/Common/ZooKeeper/ZooKeeper.cpp +++ b/src/Common/ZooKeeper/ZooKeeper.cpp @@ -537,6 +537,14 @@ Coordination::Error ZooKeeper::trySet(const std::string & path, const std::strin Coordination::Error ZooKeeper::multiImpl(const Coordination::Requests & requests, Coordination::Responses & responses) { + String desc; + for (const auto & r : requests) + { + auto & r_ref = *r; + desc += String(typeid(r_ref).name()) + "\t" + r->getPath() + "\n"; + } + LOG_TRACE(&Poco::Logger::get("ZKTX"), "zk multi {}", desc); + if (requests.empty()) return Coordination::Error::ZOK; diff --git a/src/Databases/DatabaseAtomic.cpp b/src/Databases/DatabaseAtomic.cpp index 78400368924..ca39cefc5c8 100644 --- a/src/Databases/DatabaseAtomic.cpp +++ b/src/Databases/DatabaseAtomic.cpp @@ -4,6 +4,7 @@ #include #include #include +#include #include #include #include @@ -108,7 +109,7 @@ StoragePtr DatabaseAtomic::detachTable(const String & name) return table; } -void DatabaseAtomic::dropTable(const Context &, const String & table_name, bool no_delay) +void DatabaseAtomic::dropTable(const Context & context, const String & table_name, bool no_delay) { String table_metadata_path = getObjectMetadataPath(table_name); String table_metadata_path_drop; @@ -117,6 +118,16 @@ void DatabaseAtomic::dropTable(const Context &, const String & table_name, bool std::unique_lock lock(mutex); table = getTableUnlocked(table_name, lock); table_metadata_path_drop = DatabaseCatalog::instance().getPathForDroppedMetadata(table->getStorageID()); + + if (auto txn = context.getMetadataTransaction()) + { + String metadata_zk_path = 
txn->zookeeper_path + "/metadata/" + escapeForFileName(table_name); + txn->ops.emplace_back(zkutil::makeRemoveRequest(metadata_zk_path, -1)); + txn->current_zookeeper->multi(txn->ops); /// Commit point (a sort of) for Replicated database + /// NOTE: replica will be lost if server crashes before the following rename + /// TODO better detection and recovery + } + Poco::File(table_metadata_path).renameTo(table_metadata_path_drop); /// Mark table as dropped DatabaseWithDictionaries::detachTableUnlocked(table_name, lock); /// Should never throw table_name_to_path.erase(table_name); @@ -146,6 +157,8 @@ void DatabaseAtomic::renameTable(const Context & context, const String & table_n if (exchange && dictionary) throw Exception(ErrorCodes::NOT_IMPLEMENTED, "Cannot exchange dictionaries"); + if (exchange && !supportsRenameat2()) + throw Exception(ErrorCodes::NOT_IMPLEMENTED, "RENAME EXCHANGE is not supported"); auto & other_db = dynamic_cast(to_database); bool inside_database = this == &other_db; @@ -231,6 +244,33 @@ void DatabaseAtomic::renameTable(const Context & context, const String & table_n } /// Table renaming actually begins here + if (auto txn = context.getMetadataTransaction()) + { + String statement; + String statement_to; + { + ReadBufferFromFile in(old_metadata_path, 4096); + readStringUntilEOF(statement, in); + if (exchange) + { + ReadBufferFromFile in_to(new_metadata_path, 4096); + readStringUntilEOF(statement_to, in_to); + } + } + String metadata_zk_path = txn->zookeeper_path + "/metadata/" + escapeForFileName(table_name); + String metadata_zk_path_to = txn->zookeeper_path + "/metadata/" + escapeForFileName(to_table_name); + txn->ops.emplace_back(zkutil::makeRemoveRequest(metadata_zk_path, -1)); + if (exchange) + { + txn->ops.emplace_back(zkutil::makeRemoveRequest(metadata_zk_path_to, -1)); + txn->ops.emplace_back(zkutil::makeCreateRequest(metadata_zk_path, statement_to, zkutil::CreateMode::Persistent)); + } + txn->ops.emplace_back(zkutil::makeCreateRequest(metadata_zk_path_to, statement, zkutil::CreateMode::Persistent)); + txn->current_zookeeper->multi(txn->ops); /// Commit point (a sort of) for Replicated database + /// NOTE: replica will be lost if server crashes before the following rename + /// TODO better detection and recovery + } + if (exchange) renameExchange(old_metadata_path, new_metadata_path); else @@ -312,7 +352,7 @@ void DatabaseAtomic::commitCreateTable(const ASTCreateQuery & query, const Stora tryCreateSymlink(query.table, table_data_path); } -void DatabaseAtomic::commitAlterTable(const StorageID & table_id, const String & table_metadata_tmp_path, const String & table_metadata_path) +void DatabaseAtomic::commitAlterTable(const StorageID & table_id, const String & table_metadata_tmp_path, const String & table_metadata_path, const String & statement, const Context & query_context) { bool check_file_exists = true; SCOPE_EXIT({ std::error_code code; if (check_file_exists) std::filesystem::remove(table_metadata_tmp_path, code); }); @@ -323,6 +363,18 @@ void DatabaseAtomic::commitAlterTable(const StorageID & table_id, const String & if (table_id.uuid != actual_table_id.uuid) throw Exception("Cannot alter table because it was renamed", ErrorCodes::CANNOT_ASSIGN_ALTER); + if (&query_context != &query_context.getGlobalContext()) // FIXME + { + if (auto txn = query_context.getMetadataTransaction()) + { + String metadata_zk_path = txn->zookeeper_path + "/metadata/" + escapeForFileName(table_id.table_name); + txn->ops.emplace_back(zkutil::makeSetRequest(metadata_zk_path, 
statement, -1)); + txn->current_zookeeper->multi(txn->ops); /// Commit point (a sort of) for Replicated database + /// NOTE: replica will be lost if server crashes before the following rename + /// TODO better detection and recovery + } + } + check_file_exists = renameExchangeIfSupported(table_metadata_tmp_path, table_metadata_path); if (!check_file_exists) std::filesystem::rename(table_metadata_tmp_path, table_metadata_path); diff --git a/src/Databases/DatabaseAtomic.h b/src/Databases/DatabaseAtomic.h index 61ce2721701..9cc6a429656 100644 --- a/src/Databases/DatabaseAtomic.h +++ b/src/Databases/DatabaseAtomic.h @@ -61,7 +61,7 @@ public: void waitDetachedTableNotInUse(const UUID & uuid); protected: - void commitAlterTable(const StorageID & table_id, const String & table_metadata_tmp_path, const String & table_metadata_path) override; + void commitAlterTable(const StorageID & table_id, const String & table_metadata_tmp_path, const String & table_metadata_path, const String & statement, const Context & query_context) override; void commitCreateTable(const ASTCreateQuery & query, const StoragePtr & table, const String & table_metadata_tmp_path, const String & table_metadata_path, const Context & query_context) override; diff --git a/src/Databases/DatabaseOrdinary.cpp b/src/Databases/DatabaseOrdinary.cpp index b363058c0c6..3df0d8fe907 100644 --- a/src/Databases/DatabaseOrdinary.cpp +++ b/src/Databases/DatabaseOrdinary.cpp @@ -312,10 +312,10 @@ void DatabaseOrdinary::alterTable(const Context & context, const StorageID & tab out.close(); } - commitAlterTable(table_id, table_metadata_tmp_path, table_metadata_path); + commitAlterTable(table_id, table_metadata_tmp_path, table_metadata_path, statement, context); } -void DatabaseOrdinary::commitAlterTable(const StorageID &, const String & table_metadata_tmp_path, const String & table_metadata_path) +void DatabaseOrdinary::commitAlterTable(const StorageID &, const String & table_metadata_tmp_path, const String & table_metadata_path, const String & /*statement*/, const Context & /*query_context*/) { try { diff --git a/src/Databases/DatabaseOrdinary.h b/src/Databases/DatabaseOrdinary.h index b5ea286ef15..6a21e19d5e2 100644 --- a/src/Databases/DatabaseOrdinary.h +++ b/src/Databases/DatabaseOrdinary.h @@ -30,7 +30,7 @@ public: const StorageInMemoryMetadata & metadata) override; protected: - virtual void commitAlterTable(const StorageID & table_id, const String & table_metadata_tmp_path, const String & table_metadata_path); + virtual void commitAlterTable(const StorageID & table_id, const String & table_metadata_tmp_path, const String & table_metadata_path, const String & statement, const Context & query_context); void startupTables(ThreadPool & thread_pool); }; diff --git a/src/Databases/DatabaseReplicated.cpp b/src/Databases/DatabaseReplicated.cpp index 608d03c339b..25fb95ba0de 100644 --- a/src/Databases/DatabaseReplicated.cpp +++ b/src/Databases/DatabaseReplicated.cpp @@ -146,8 +146,6 @@ void DatabaseReplicated::createReplicaNodesInZooKeeper(const zkutil::ZooKeeperPt entry.query = {}; entry.initiator = {}; - recoverLostReplica(current_zookeeper, log_entry_to_execute, true); - String query_path_prefix = zookeeper_path + "/log/query-"; String counter_prefix = zookeeper_path + "/counter/cnt-"; String counter_path = current_zookeeper->create(counter_prefix, "", zkutil::CreateMode::EphemeralSequential); @@ -165,6 +163,8 @@ void DatabaseReplicated::loadStoredObjects(Context & context, bool has_force_res { DatabaseAtomic::loadStoredObjects(context, 
has_force_restore_data_flag, force_attach); + recoverLostReplica(global_context.getZooKeeper(), 0, true); //FIXME + DatabaseReplicatedExtensions ext; ext.database_uuid = getUUID(); ext.zookeeper_path = zookeeper_path; diff --git a/src/Interpreters/DDLWorker.cpp b/src/Interpreters/DDLWorker.cpp index 5e4d79c32ab..099b968d895 100644 --- a/src/Interpreters/DDLWorker.cpp +++ b/src/Interpreters/DDLWorker.cpp @@ -258,16 +258,8 @@ DDLTaskPtr DDLWorker::initAndCheckTask(const String & entry_name, String & out_r if (database_replicated_ext) { - //auto expected_log_entry = DatabaseReplicatedExtensions::getLogEntryName(database_replicated_ext->first_not_executed); - //if (entry_name != expected_log_entry) - //{ - // database_replicated_ext->lost_callback(entry_name, zookeeper); - // out_reason = "DatabaseReplicated: expected " + expected_log_entry + " got " + entry_name; - // return {}; - //} - String initiator_name; - zkutil::EventPtr wait_committed_or_failed; + zkutil::EventPtr wait_committed_or_failed = std::make_shared(); if (zookeeper->tryGet(entry_path + "/try", initiator_name, nullptr, wait_committed_or_failed)) { @@ -275,7 +267,10 @@ DDLTaskPtr DDLWorker::initAndCheckTask(const String & entry_name, String & out_r /// Query is not committed yet. We cannot just skip it and execute next one, because reordering may break replication. //FIXME add some timeouts if (!task->we_are_initiator) + { + LOG_TRACE(log, "Waiting for initiator {} to commit or rollback entry {}", initiator_name, entry_path); wait_committed_or_failed->wait(); + } } if (!task->we_are_initiator && !zookeeper->exists(entry_path + "/committed")) @@ -378,7 +373,10 @@ void DDLWorker::scheduleTasks() Strings queue_nodes = zookeeper->getChildren(queue_dir, nullptr, queue_updated_event); filterAndSortQueueNodes(queue_nodes); if (queue_nodes.empty()) + { + LOG_TRACE(log, "No tasks to schedule"); return; + } bool server_startup = last_tasks.empty(); @@ -389,6 +387,7 @@ void DDLWorker::scheduleTasks() for (auto it = begin_node; it != queue_nodes.end() && !stop_flag; ++it) { String entry_name = *it; + LOG_TRACE(log, "Checking task {}", entry_name); String reason; auto task = initAndCheckTask(entry_name, reason, zookeeper); @@ -748,11 +747,6 @@ void DDLWorker::processTask(DDLTask & task) Coordination::Requests ops; ops.emplace_back(zkutil::makeRemoveRequest(active_node_path, -1)); ops.emplace_back(zkutil::makeCreateRequest(finished_node_path, task.execution_status.serializeText(), zkutil::CreateMode::Persistent)); - if (database_replicated_ext) - { - //assert(DatabaseReplicatedExtensions::getLogEntryName(database_replicated_ext->first_not_executed) == task.entry_name); - //ops.emplace_back(zkutil::makeSetRequest(database_replicated_ext->getReplicaPath() + "/log_ptr", toString(database_replicated_ext->first_not_executed), -1)); - } //FIXME replace with multi(...) 
or use MetadataTransaction Coordination::Responses responses; @@ -816,8 +810,8 @@ bool DDLWorker::tryExecuteQueryOnLeaderReplica( else shard_node_name = get_shard_name(task.cluster->getShardsAddresses().at(task.host_shard_num)); String shard_path = node_path + "/shards/" + shard_node_name; - task.shard_path = shard_path; //FIXME duplicate String is_executed_path = shard_path + "/executed"; + task.shard_path = is_executed_path; //FIXME duplicate String tries_to_execute_path = shard_path + "/tries_to_execute"; zookeeper->createAncestors(shard_path + "/"); diff --git a/src/Interpreters/InterpreterAlterQuery.cpp b/src/Interpreters/InterpreterAlterQuery.cpp index c094bb8377c..5f6058b48c0 100644 --- a/src/Interpreters/InterpreterAlterQuery.cpp +++ b/src/Interpreters/InterpreterAlterQuery.cpp @@ -51,9 +51,11 @@ BlockIO InterpreterAlterQuery::execute() auto metadata_snapshot = table->getInMemoryMetadataPtr(); DatabasePtr database = DatabaseCatalog::instance().getDatabase(table_id.database_name); - if (typeid_cast(database.get()) && context.getClientInfo().query_kind != ClientInfo::QueryKind::REPLICATED_LOG_QUERY && !table->supportsReplication()) + if (typeid_cast(database.get()) && context.getClientInfo().query_kind != ClientInfo::QueryKind::REPLICATED_LOG_QUERY) return typeid_cast(database.get())->propose(query_ptr); + //FIXME commit MetadataTransaction for all ALTER kinds. Now its' implemented only for metadata alter. + /// Add default database to table identifiers that we can encounter in e.g. default expressions, /// mutation expression, etc. AddDefaultDatabaseVisitor visitor(table_id.getDatabaseName()); diff --git a/src/Storages/StorageReplicatedMergeTree.cpp b/src/Storages/StorageReplicatedMergeTree.cpp index 5c176de1395..9db2821502d 100644 --- a/src/Storages/StorageReplicatedMergeTree.cpp +++ b/src/Storages/StorageReplicatedMergeTree.cpp @@ -30,6 +30,7 @@ #include #include +#include #include #include @@ -4047,6 +4048,8 @@ void StorageReplicatedMergeTree::alter( future_metadata_in_zk.constraints = new_constraints_str; Coordination::Requests ops; + size_t alter_path_idx = std::numeric_limits::max(); + size_t mutation_path_idx = std::numeric_limits::max(); String new_metadata_str = future_metadata_in_zk.toString(); ops.emplace_back(zkutil::makeSetRequest(zookeeper_path + "/metadata", new_metadata_str, metadata_version)); @@ -4078,6 +4081,7 @@ void StorageReplicatedMergeTree::alter( *current_metadata, query_context.getSettingsRef().materialize_ttl_after_modify, query_context); alter_entry->have_mutation = !maybe_mutation_commands.empty(); + alter_path_idx = ops.size(); ops.emplace_back(zkutil::makeCreateRequest( zookeeper_path + "/log/log-", alter_entry->toString(), zkutil::CreateMode::PersistentSequential)); @@ -4101,6 +4105,7 @@ void StorageReplicatedMergeTree::alter( mutation_entry.create_time = time(nullptr); ops.emplace_back(zkutil::makeSetRequest(mutations_path, String(), mutations_stat.version)); + mutation_path_idx = ops.size(); ops.emplace_back( zkutil::makeCreateRequest(mutations_path + "/", mutation_entry.toString(), zkutil::CreateMode::PersistentSequential)); } @@ -4108,7 +4113,24 @@ void StorageReplicatedMergeTree::alter( if (auto txn = query_context.getMetadataTransaction()) { txn->addOps(ops); - //TODO maybe also change here table metadata in replicated database? + /// NOTE: IDatabase::alterTable(...) is called when executing ALTER_METADATA queue entry without query context, + /// so we have to update metadata of DatabaseReplicated here. 
+ /// It also may cause "Table columns structure in ZooKeeper is different" error on server startup + /// even for Ordinary and Atomic databases. + String metadata_zk_path = txn->zookeeper_path + "/metadata/" + escapeForFileName(table_id.table_name); + auto ast = DatabaseCatalog::instance().getDatabase(table_id.database_name)->getCreateTableQuery(table_id.table_name, query_context); + auto & ast_create_query = ast->as(); + + //FIXME copy-paste + ASTPtr new_columns = InterpreterCreateQuery::formatColumns(future_metadata.columns); + ASTPtr new_indices = InterpreterCreateQuery::formatIndices(future_metadata.secondary_indices); + ASTPtr new_constraints = InterpreterCreateQuery::formatConstraints(future_metadata.constraints); + + ast_create_query.columns_list->replace(ast_create_query.columns_list->columns, new_columns); + ast_create_query.columns_list->setOrReplace(ast_create_query.columns_list->indices, new_indices); + ast_create_query.columns_list->setOrReplace(ast_create_query.columns_list->constraints, new_constraints); + + ops.emplace_back(zkutil::makeSetRequest(metadata_zk_path, getObjectDefinitionFromCreateQuery(ast), -1)); } Coordination::Responses results; @@ -4124,17 +4146,17 @@ void StorageReplicatedMergeTree::alter( if (alter_entry->have_mutation) { /// ALTER_METADATA record in replication /log - String alter_path = dynamic_cast(*results[2]).path_created; + String alter_path = dynamic_cast(*results[alter_path_idx]).path_created; alter_entry->znode_name = alter_path.substr(alter_path.find_last_of('/') + 1); /// ReplicatedMergeTreeMutationEntry record in /mutations - String mutation_path = dynamic_cast(*results.back()).path_created; + String mutation_path = dynamic_cast(*results[mutation_path_idx]).path_created; mutation_znode = mutation_path.substr(mutation_path.find_last_of('/') + 1); } else { /// ALTER_METADATA record in replication /log - String alter_path = dynamic_cast(*results.back()).path_created; + String alter_path = dynamic_cast(*results[alter_path_idx]).path_created; alter_entry->znode_name = alter_path.substr(alter_path.find_last_of('/') + 1); } break; diff --git a/tests/integration/test_replicated_database/test.py b/tests/integration/test_replicated_database/test.py index 06d8aa9467a..11bfbad393b 100644 --- a/tests/integration/test_replicated_database/test.py +++ b/tests/integration/test_replicated_database/test.py @@ -16,7 +16,7 @@ snapshot_recovering_node = cluster.add_instance('snapshot_recovering_node', main uuid_regex = re.compile("[0-9a-f]{8}\-[0-9a-f]{4}\-[0-9a-f]{4}\-[0-9a-f]{4}\-[0-9a-f]{12}") def assert_create_query(nodes, table_name, expected): replace_uuid = lambda x: re.sub(uuid_regex, "uuid", x) - query = "show create table testdb.{}".format(table_name) + query = "show create table {}".format(table_name) for node in nodes: assert_eq_with_retry(node, query, expected, get_result=replace_uuid) @@ -41,45 +41,53 @@ def test_create_replicated_table(started_cluster): expected = "CREATE TABLE testdb.replicated_table\\n(\\n `d` Date,\\n `k` UInt64,\\n `i32` Int32\\n)\\n" \ "ENGINE = ReplicatedMergeTree(\\'/clickhouse/tables/uuid/{shard}\\', \\'{replica}\\')\\n" \ "PARTITION BY toYYYYMM(d)\\nORDER BY k\\nSETTINGS index_granularity = 8192" - assert_create_query([main_node, dummy_node], "replicated_table", expected) + assert_create_query([main_node, dummy_node], "testdb.replicated_table", expected) # assert without replacing uuid assert main_node.query("show create testdb.replicated_table") == dummy_node.query("show create testdb.replicated_table") -def 
test_simple_alter_table(started_cluster): - #TODO add test with ReplicatedMergeTree - main_node.query("CREATE TABLE testdb.alter_test " +@pytest.mark.parametrize("engine", ['MergeTree', 'ReplicatedMergeTree']) +def test_simple_alter_table(started_cluster, engine): + name = "testdb.alter_test_{}".format(engine) + main_node.query("CREATE TABLE {} " "(CounterID UInt32, StartDate Date, UserID UInt32, VisitID UInt32, NestedColumn Nested(A UInt8, S String), ToDrop UInt32) " - "ENGINE = MergeTree(StartDate, intHash32(UserID), (CounterID, StartDate, intHash32(UserID), VisitID), 8192);") - main_node.query("ALTER TABLE testdb.alter_test ADD COLUMN Added0 UInt32;") - main_node.query("ALTER TABLE testdb.alter_test ADD COLUMN Added2 UInt32;") - main_node.query("ALTER TABLE testdb.alter_test ADD COLUMN Added1 UInt32 AFTER Added0;") - main_node.query("ALTER TABLE testdb.alter_test ADD COLUMN AddedNested1 Nested(A UInt32, B UInt64) AFTER Added2;") - main_node.query("ALTER TABLE testdb.alter_test ADD COLUMN AddedNested1.C Array(String) AFTER AddedNested1.B;") - main_node.query("ALTER TABLE testdb.alter_test ADD COLUMN AddedNested2 Nested(A UInt32, B UInt64) AFTER AddedNested1;") + "ENGINE = {} PARTITION BY StartDate ORDER BY (CounterID, StartDate, intHash32(UserID), VisitID);".format(name, engine)) + main_node.query("ALTER TABLE {} ADD COLUMN Added0 UInt32;".format(name)) + main_node.query("ALTER TABLE {} ADD COLUMN Added2 UInt32;".format(name)) + main_node.query("ALTER TABLE {} ADD COLUMN Added1 UInt32 AFTER Added0;".format(name)) + main_node.query("ALTER TABLE {} ADD COLUMN AddedNested1 Nested(A UInt32, B UInt64) AFTER Added2;".format(name)) + main_node.query("ALTER TABLE {} ADD COLUMN AddedNested1.C Array(String) AFTER AddedNested1.B;".format(name)) + main_node.query("ALTER TABLE {} ADD COLUMN AddedNested2 Nested(A UInt32, B UInt64) AFTER AddedNested1;".format(name)) - expected = "CREATE TABLE testdb.alter_test\\n(\\n `CounterID` UInt32,\\n `StartDate` Date,\\n `UserID` UInt32,\\n" \ + full_engine = engine if not "Replicated" in engine else engine + "(\\'/clickhouse/tables/uuid/{shard}\\', \\'{replica}\\')" + expected = "CREATE TABLE {}\\n(\\n `CounterID` UInt32,\\n `StartDate` Date,\\n `UserID` UInt32,\\n" \ " `VisitID` UInt32,\\n `NestedColumn.A` Array(UInt8),\\n `NestedColumn.S` Array(String),\\n" \ " `ToDrop` UInt32,\\n `Added0` UInt32,\\n `Added1` UInt32,\\n `Added2` UInt32,\\n" \ " `AddedNested1.A` Array(UInt32),\\n `AddedNested1.B` Array(UInt64),\\n `AddedNested1.C` Array(String),\\n" \ " `AddedNested2.A` Array(UInt32),\\n `AddedNested2.B` Array(UInt64)\\n)\\n" \ - "ENGINE = MergeTree(StartDate, intHash32(UserID), (CounterID, StartDate, intHash32(UserID), VisitID), 8192)" + "ENGINE = {}\\nPARTITION BY StartDate\\nORDER BY (CounterID, StartDate, intHash32(UserID), VisitID)\\n" \ + "SETTINGS index_granularity = 8192".format(name, full_engine) - assert_create_query([main_node, dummy_node], "alter_test", expected) + assert_create_query([main_node, dummy_node], name, expected) -def test_create_replica_after_delay(started_cluster): + +@pytest.mark.parametrize("engine", ['MergeTree', 'ReplicatedMergeTree']) +def test_create_replica_after_delay(started_cluster, engine): competing_node.query("CREATE DATABASE testdb ENGINE = Replicated('/clickhouse/databases/test1', 'shard1', 'replica3');") - main_node.query("ALTER TABLE testdb.alter_test ADD COLUMN Added3 UInt32;") - main_node.query("ALTER TABLE testdb.alter_test DROP COLUMN AddedNested1;") - main_node.query("ALTER TABLE testdb.alter_test RENAME COLUMN 
Added1 TO AddedNested1;") + name = "testdb.alter_test_{}".format(engine) + main_node.query("ALTER TABLE {} ADD COLUMN Added3 UInt32;".format(name)) + main_node.query("ALTER TABLE {} DROP COLUMN AddedNested1;".format(name)) + main_node.query("ALTER TABLE {} RENAME COLUMN Added1 TO AddedNested1;".format(name)) - expected = "CREATE TABLE testdb.alter_test\\n(\\n `CounterID` UInt32,\\n `StartDate` Date,\\n `UserID` UInt32,\\n" \ + full_engine = engine if not "Replicated" in engine else engine + "(\\'/clickhouse/tables/uuid/{shard}\\', \\'{replica}\\')" + expected = "CREATE TABLE {}\\n(\\n `CounterID` UInt32,\\n `StartDate` Date,\\n `UserID` UInt32,\\n" \ " `VisitID` UInt32,\\n `NestedColumn.A` Array(UInt8),\\n `NestedColumn.S` Array(String),\\n" \ " `ToDrop` UInt32,\\n `Added0` UInt32,\\n `AddedNested1` UInt32,\\n `Added2` UInt32,\\n" \ " `AddedNested2.A` Array(UInt32),\\n `AddedNested2.B` Array(UInt64),\\n `Added3` UInt32\\n)\\n" \ - "ENGINE = MergeTree(StartDate, intHash32(UserID), (CounterID, StartDate, intHash32(UserID), VisitID), 8192)" + "ENGINE = {}\\nPARTITION BY StartDate\\nORDER BY (CounterID, StartDate, intHash32(UserID), VisitID)\\n" \ + "SETTINGS index_granularity = 8192".format(name, full_engine) - assert_create_query([main_node, dummy_node, competing_node], "alter_test", expected) + assert_create_query([main_node, dummy_node, competing_node], name, expected) def test_alters_from_different_replicas(started_cluster): main_node.query("CREATE TABLE testdb.concurrent_test " @@ -103,7 +111,7 @@ def test_alters_from_different_replicas(started_cluster): " `AddedNested2.B` Array(UInt64)\\n)\\n" \ "ENGINE = MergeTree(StartDate, intHash32(UserID), (CounterID, StartDate, intHash32(UserID), VisitID), 8192)" - assert_create_query([main_node, competing_node], "concurrent_test", expected) + assert_create_query([main_node, competing_node], "testdb.concurrent_test", expected) def test_drop_and_create_table(started_cluster): main_node.query("DROP TABLE testdb.concurrent_test") @@ -115,7 +123,7 @@ def test_drop_and_create_table(started_cluster): " `VisitID` UInt32,\\n `NestedColumn.A` Array(UInt8),\\n `NestedColumn.S` Array(String),\\n `ToDrop` UInt32\\n)\\n" \ "ENGINE = MergeTree(StartDate, intHash32(UserID), (CounterID, StartDate, intHash32(UserID), VisitID), 8192)" - assert_create_query([main_node, competing_node], "concurrent_test", expected) + assert_create_query([main_node, competing_node], "testdb.concurrent_test", expected) def test_replica_restart(started_cluster): main_node.restart_clickhouse() @@ -124,7 +132,7 @@ def test_replica_restart(started_cluster): " `VisitID` UInt32,\\n `NestedColumn.A` Array(UInt8),\\n `NestedColumn.S` Array(String),\\n `ToDrop` UInt32\\n)\\n" \ "ENGINE = MergeTree(StartDate, intHash32(UserID), (CounterID, StartDate, intHash32(UserID), VisitID), 8192)" - assert_create_query([main_node, competing_node], "concurrent_test", expected) + assert_create_query([main_node, competing_node], "testdb.concurrent_test", expected) def test_snapshot_and_snapshot_recover(started_cluster): #FIXME bad test @@ -142,7 +150,7 @@ def test_drop_and_create_replica(started_cluster): " `VisitID` UInt32,\\n `NestedColumn.A` Array(UInt8),\\n `NestedColumn.S` Array(String),\\n `ToDrop` UInt32\\n)\\n" \ "ENGINE = MergeTree(StartDate, intHash32(UserID), (CounterID, StartDate, intHash32(UserID), VisitID), 8192)" - assert_create_query([main_node, competing_node], "concurrent_test", expected) + assert_create_query([main_node, competing_node], "testdb.concurrent_test", expected) #TODO tests with 
Distributed From f1a52a609bd6ced447fbb2cb4102675c798e32c0 Mon Sep 17 00:00:00 2001 From: Alexander Tokmakov Date: Fri, 27 Nov 2020 17:04:03 +0300 Subject: [PATCH 052/381] separate DatabaseReplicatedDDLWorker --- src/Databases/DatabaseAtomic.cpp | 4 +- src/Databases/DatabaseAtomic.h | 4 +- src/Databases/DatabaseLazy.cpp | 2 +- src/Databases/DatabaseLazy.h | 2 +- src/Databases/DatabaseOnDisk.cpp | 2 +- src/Databases/DatabaseOnDisk.h | 2 +- src/Databases/DatabaseOrdinary.cpp | 4 +- src/Databases/DatabaseOrdinary.h | 4 +- src/Databases/DatabaseReplicated.cpp | 91 +++-- src/Databases/DatabaseReplicated.h | 13 +- src/Databases/DatabaseReplicatedWorker.cpp | 114 ++++++ src/Databases/DatabaseReplicatedWorker.h | 26 ++ src/Databases/DatabaseWithDictionaries.cpp | 2 +- src/Databases/DatabaseWithDictionaries.h | 2 +- src/Interpreters/Context.cpp | 3 +- src/Interpreters/DDLTask.cpp | 280 +++++++++++++ src/Interpreters/DDLTask.h | 85 +++- src/Interpreters/DDLWorker.cpp | 371 ++---------------- src/Interpreters/DDLWorker.h | 64 +-- .../configs/config.xml | 3 + .../configs/disable_snapshots.xml | 3 - .../configs/snapshot_each_query.xml | 3 - .../test_replicated_database/test.py | 21 +- 23 files changed, 639 insertions(+), 466 deletions(-) create mode 100644 src/Databases/DatabaseReplicatedWorker.cpp create mode 100644 src/Databases/DatabaseReplicatedWorker.h create mode 100644 tests/integration/test_replicated_database/configs/config.xml delete mode 100644 tests/integration/test_replicated_database/configs/disable_snapshots.xml delete mode 100644 tests/integration/test_replicated_database/configs/snapshot_each_query.xml diff --git a/src/Databases/DatabaseAtomic.cpp b/src/Databases/DatabaseAtomic.cpp index ca39cefc5c8..a444d9cc200 100644 --- a/src/Databases/DatabaseAtomic.cpp +++ b/src/Databases/DatabaseAtomic.cpp @@ -38,12 +38,12 @@ public: UUID uuid() const override { return table()->getStorageID().uuid; } }; -DatabaseAtomic::DatabaseAtomic(String name_, String metadata_path_, UUID uuid, Context & context_) +DatabaseAtomic::DatabaseAtomic(String name_, String metadata_path_, UUID uuid, const Context & context_) : DatabaseAtomic(name_, metadata_path_, uuid, "DatabaseAtomic (" + name_ + ")", context_) { } -DatabaseAtomic::DatabaseAtomic(String name_, String metadata_path_, UUID uuid, const String & logger, Context & context_) +DatabaseAtomic::DatabaseAtomic(String name_, String metadata_path_, UUID uuid, const String & logger, const Context & context_) : DatabaseOrdinary(name_, std::move(metadata_path_), "store/", logger, context_) , path_to_table_symlinks(global_context.getPath() + "data/" + escapeForFileName(name_) + "/") , path_to_metadata_symlink(global_context.getPath() + "metadata/" + escapeForFileName(name_)) diff --git a/src/Databases/DatabaseAtomic.h b/src/Databases/DatabaseAtomic.h index 9cc6a429656..e9cb418c787 100644 --- a/src/Databases/DatabaseAtomic.h +++ b/src/Databases/DatabaseAtomic.h @@ -20,8 +20,8 @@ namespace DB class DatabaseAtomic : public DatabaseOrdinary { public: - DatabaseAtomic(String name_, String metadata_path_, UUID uuid, Context & context_); - DatabaseAtomic(String name_, String metadata_path_, UUID uuid, const String & logger, Context & context_); + DatabaseAtomic(String name_, String metadata_path_, UUID uuid, const Context & context_); + DatabaseAtomic(String name_, String metadata_path_, UUID uuid, const String & logger, const Context & context_); String getEngineName() const override { return "Atomic"; } UUID getUUID() const override { return db_uuid; } diff --git 
a/src/Databases/DatabaseLazy.cpp b/src/Databases/DatabaseLazy.cpp index a4ace4bde9b..0119f17f843 100644 --- a/src/Databases/DatabaseLazy.cpp +++ b/src/Databases/DatabaseLazy.cpp @@ -27,7 +27,7 @@ namespace ErrorCodes } -DatabaseLazy::DatabaseLazy(const String & name_, const String & metadata_path_, time_t expiration_time_, Context & context_) +DatabaseLazy::DatabaseLazy(const String & name_, const String & metadata_path_, time_t expiration_time_, const Context & context_) : DatabaseOnDisk(name_, metadata_path_, "data/" + escapeForFileName(name_) + "/", "DatabaseLazy (" + name_ + ")", context_) , expiration_time(expiration_time_) { diff --git a/src/Databases/DatabaseLazy.h b/src/Databases/DatabaseLazy.h index 0893b085fae..2d091297c91 100644 --- a/src/Databases/DatabaseLazy.h +++ b/src/Databases/DatabaseLazy.h @@ -18,7 +18,7 @@ class Context; class DatabaseLazy final : public DatabaseOnDisk { public: - DatabaseLazy(const String & name_, const String & metadata_path_, time_t expiration_time_, Context & context_); + DatabaseLazy(const String & name_, const String & metadata_path_, time_t expiration_time_, const Context & context_); String getEngineName() const override { return "Lazy"; } diff --git a/src/Databases/DatabaseOnDisk.cpp b/src/Databases/DatabaseOnDisk.cpp index 8f24f53fc3f..18941ba7c04 100644 --- a/src/Databases/DatabaseOnDisk.cpp +++ b/src/Databases/DatabaseOnDisk.cpp @@ -131,7 +131,7 @@ DatabaseOnDisk::DatabaseOnDisk( const String & metadata_path_, const String & data_path_, const String & logger, - Context & context) + const Context & context) : DatabaseWithOwnTablesBase(name, logger, context) , metadata_path(metadata_path_) , data_path(data_path_) diff --git a/src/Databases/DatabaseOnDisk.h b/src/Databases/DatabaseOnDisk.h index a5510ef4810..f5b9ea0c0d5 100644 --- a/src/Databases/DatabaseOnDisk.h +++ b/src/Databases/DatabaseOnDisk.h @@ -31,7 +31,7 @@ String getObjectDefinitionFromCreateQuery(const ASTPtr & query); class DatabaseOnDisk : public DatabaseWithOwnTablesBase { public: - DatabaseOnDisk(const String & name, const String & metadata_path_, const String & data_path_, const String & logger, Context & context); + DatabaseOnDisk(const String & name, const String & metadata_path_, const String & data_path_, const String & logger, const Context & context); void createTable( const Context & context, diff --git a/src/Databases/DatabaseOrdinary.cpp b/src/Databases/DatabaseOrdinary.cpp index aaceb640213..470c9e7db29 100644 --- a/src/Databases/DatabaseOrdinary.cpp +++ b/src/Databases/DatabaseOrdinary.cpp @@ -99,13 +99,13 @@ namespace } -DatabaseOrdinary::DatabaseOrdinary(const String & name_, const String & metadata_path_, Context & context_) +DatabaseOrdinary::DatabaseOrdinary(const String & name_, const String & metadata_path_, const Context & context_) : DatabaseOrdinary(name_, metadata_path_, "data/" + escapeForFileName(name_) + "/", "DatabaseOrdinary (" + name_ + ")", context_) { } DatabaseOrdinary::DatabaseOrdinary( - const String & name_, const String & metadata_path_, const String & data_path_, const String & logger, Context & context_) + const String & name_, const String & metadata_path_, const String & data_path_, const String & logger, const Context & context_) : DatabaseWithDictionaries(name_, metadata_path_, data_path_, logger, context_) { } diff --git a/src/Databases/DatabaseOrdinary.h b/src/Databases/DatabaseOrdinary.h index 6a21e19d5e2..c1ad32345f6 100644 --- a/src/Databases/DatabaseOrdinary.h +++ b/src/Databases/DatabaseOrdinary.h @@ -14,8 +14,8 @@ namespace DB 
class DatabaseOrdinary : public DatabaseWithDictionaries { public: - DatabaseOrdinary(const String & name_, const String & metadata_path_, Context & context); - DatabaseOrdinary(const String & name_, const String & metadata_path_, const String & data_path_, const String & logger, Context & context_); + DatabaseOrdinary(const String & name_, const String & metadata_path_, const Context & context); + DatabaseOrdinary(const String & name_, const String & metadata_path_, const String & data_path_, const String & logger, const Context & context_); String getEngineName() const override { return "Ordinary"; } diff --git a/src/Databases/DatabaseReplicated.cpp b/src/Databases/DatabaseReplicated.cpp index 25fb95ba0de..eef1b98afe2 100644 --- a/src/Databases/DatabaseReplicated.cpp +++ b/src/Databases/DatabaseReplicated.cpp @@ -13,12 +13,16 @@ #include #include #include -#include +#include #include #include #include #include #include +#include +#include +#include +#include namespace DB { @@ -52,7 +56,7 @@ DatabaseReplicated::DatabaseReplicated( const String & zookeeper_path_, const String & shard_name_, const String & replica_name_, - Context & context_) + const Context & context_) : DatabaseAtomic(name_, metadata_path_, uuid, "DatabaseReplicated (" + name_ + ")", context_) , zookeeper_path(zookeeper_path_) , shard_name(shard_name_) @@ -116,8 +120,11 @@ bool DatabaseReplicated::createDatabaseNodesInZooKeeper(const zkutil::ZooKeeperP ops.emplace_back(zkutil::makeCreateRequest(zookeeper_path + "/log", "", zkutil::CreateMode::Persistent)); ops.emplace_back(zkutil::makeCreateRequest(zookeeper_path + "/replicas", "", zkutil::CreateMode::Persistent)); ops.emplace_back(zkutil::makeCreateRequest(zookeeper_path + "/counter", "", zkutil::CreateMode::Persistent)); + ops.emplace_back(zkutil::makeCreateRequest(zookeeper_path + "/counter/cnt-", "", zkutil::CreateMode::Persistent)); + ops.emplace_back(zkutil::makeRemoveRequest(zookeeper_path + "/counter/cnt-", -1)); ops.emplace_back(zkutil::makeCreateRequest(zookeeper_path + "/metadata", "", zkutil::CreateMode::Persistent)); - ops.emplace_back(zkutil::makeCreateRequest(zookeeper_path + "/min_log_ptr", "0", zkutil::CreateMode::Persistent)); + ops.emplace_back(zkutil::makeCreateRequest(zookeeper_path + "/min_log_ptr", "1", zkutil::CreateMode::Persistent)); + ops.emplace_back(zkutil::makeCreateRequest(zookeeper_path + "/max_log_ptr", "1", zkutil::CreateMode::Persistent)); Coordination::Responses responses; auto res = current_zookeeper->tryMulti(ops, responses); @@ -128,6 +135,7 @@ bool DatabaseReplicated::createDatabaseNodesInZooKeeper(const zkutil::ZooKeeperP zkutil::KeeperMultiException::check(res, ops, responses); assert(false); + __builtin_unreachable(); } void DatabaseReplicated::createReplicaNodesInZooKeeper(const zkutil::ZooKeeperPtr & current_zookeeper) @@ -135,7 +143,7 @@ void DatabaseReplicated::createReplicaNodesInZooKeeper(const zkutil::ZooKeeperPt current_zookeeper->createAncestors(replica_path); /// When creating new replica, use latest snapshot version as initial value of log_pointer - log_entry_to_execute = 0; //FIXME + //log_entry_to_execute = 0; //FIXME /// Write host name to replica_path, it will protect from multiple replicas with the same name auto host_id = getHostID(global_context); @@ -153,8 +161,8 @@ void DatabaseReplicated::createReplicaNodesInZooKeeper(const zkutil::ZooKeeperPt Coordination::Requests ops; ops.emplace_back(zkutil::makeCreateRequest(replica_path, host_id, zkutil::CreateMode::Persistent)); - 
ops.emplace_back(zkutil::makeCreateRequest(replica_path + "/log_ptr", toString(log_entry_to_execute), zkutil::CreateMode::Persistent)); - ops.emplace_back(zkutil::makeCreateRequest(query_path, entry.toString(), zkutil::CreateMode::PersistentSequential)); + ops.emplace_back(zkutil::makeCreateRequest(replica_path + "/log_ptr", "0", zkutil::CreateMode::Persistent)); + ops.emplace_back(zkutil::makeCreateRequest(query_path, entry.toString(), zkutil::CreateMode::Persistent)); ops.emplace_back(zkutil::makeRemoveRequest(counter_path, -1)); current_zookeeper->multi(ops); } @@ -163,22 +171,9 @@ void DatabaseReplicated::loadStoredObjects(Context & context, bool has_force_res { DatabaseAtomic::loadStoredObjects(context, has_force_restore_data_flag, force_attach); - recoverLostReplica(global_context.getZooKeeper(), 0, true); //FIXME + //recoverLostReplica(global_context.getZooKeeper(), 0, true); //FIXME - DatabaseReplicatedExtensions ext; - ext.database_uuid = getUUID(); - ext.zookeeper_path = zookeeper_path; - ext.database_name = getDatabaseName(); - ext.shard_name = shard_name; - ext.replica_name = replica_name; - ext.first_not_executed = log_entry_to_execute; - ext.lost_callback = [this] (const String & entry_name, const ZooKeeperPtr & zookeeper) { onUnexpectedLogEntry(entry_name, zookeeper); }; - ext.executed_callback = [this] (const String & entry_name, const ZooKeeperPtr & zookeeper) { onExecutedLogEntry(entry_name, zookeeper); }; - - /// Pool size must be 1 (to avoid reordering of log entries) - constexpr size_t pool_size = 1; - ddl_worker = std::make_unique(pool_size, zookeeper_path + "/log", global_context, nullptr, "", - std::make_optional(std::move(ext))); + ddl_worker = std::make_unique(this, global_context); } void DatabaseReplicated::onUnexpectedLogEntry(const String & entry_name, const ZooKeeperPtr & zookeeper) @@ -314,48 +309,68 @@ BlockIO DatabaseReplicated::propose(const ASTPtr & query) } -void DatabaseReplicated::recoverLostReplica(const ZooKeeperPtr & current_zookeeper, UInt32 from_snapshot, bool create) +void DatabaseReplicated::recoverLostReplica(const ZooKeeperPtr & current_zookeeper, UInt32 from_snapshot, bool /*create*/) { - LOG_WARNING(log, "Will recover replica from snapshot", from_snapshot); + LOG_WARNING(log, "Will recover replica"); //FIXME drop old tables String snapshot_metadata_path = zookeeper_path + "/metadata"; Strings tables_in_snapshot = current_zookeeper->getChildren(snapshot_metadata_path); snapshot_metadata_path += '/'; + from_snapshot = parse(current_zookeeper->get(zookeeper_path + "/max_log_ptr")); for (const auto & table_name : tables_in_snapshot) { //FIXME It's not atomic. We need multiget here (available since ZooKeeper 3.6.0). 
- String query_to_execute = current_zookeeper->get(snapshot_metadata_path + table_name); + String query_text = current_zookeeper->get(snapshot_metadata_path + table_name); + auto query_ast = parseQueryFromMetadataInZooKeeper(table_name, query_text); + Context query_context = global_context; + query_context.makeQueryContext(); + query_context.getClientInfo().query_kind = ClientInfo::QueryKind::REPLICATED_LOG_QUERY; + query_context.setCurrentDatabase(database_name); + query_context.setCurrentQueryId(""); // generate random query_id - if (!startsWith(query_to_execute, "ATTACH ")) - throw Exception(ErrorCodes::LOGICAL_ERROR, "Unexpected query: {}", query_to_execute); - query_to_execute = "CREATE " + query_to_execute.substr(strlen("ATTACH ")); + //FIXME + DatabaseCatalog::instance().waitTableFinallyDropped(query_ast->as()->uuid); - Context current_context = global_context; - current_context.getClientInfo().query_kind = ClientInfo::QueryKind::REPLICATED_LOG_QUERY; - current_context.setCurrentDatabase(database_name); - current_context.setCurrentQueryId(""); // generate random query_id - - executeQuery(query_to_execute, current_context); + LOG_INFO(log, "Executing {}", serializeAST(*query_ast)); + InterpreterCreateQuery(query_ast, query_context).execute(); } - if (create) - return; + //if (create) + // return; - current_zookeeper->set(replica_path + "/log-ptr", toString(from_snapshot)); + current_zookeeper->set(replica_path + "/log_ptr", toString(from_snapshot)); last_executed_log_entry = from_snapshot; - ddl_worker->setLogPointer(from_snapshot); //FIXME + //ddl_worker->setLogPointer(from_snapshot); //FIXME //writeLastExecutedToDiskAndZK(); } +ASTPtr DatabaseReplicated::parseQueryFromMetadataInZooKeeper(const String & node_name, const String & query) +{ + ParserCreateQuery parser; + String description = "in ZooKeeper " + zookeeper_path + "/metadata/" + node_name; + auto ast = parseQuery(parser, query, description, 0, global_context.getSettingsRef().max_parser_depth); + + auto & create = ast->as(); + if (create.uuid == UUIDHelpers::Nil || create.table != TABLE_WITH_UUID_NAME_PLACEHOLDER || ! 
create.database.empty()) + throw Exception(ErrorCodes::LOGICAL_ERROR, "Got unexpected query from {}: {}", node_name, query); + + create.database = getDatabaseName(); + create.table = unescapeForFileName(node_name); + create.attach = false; + + return ast; +} + void DatabaseReplicated::drop(const Context & context_) { auto current_zookeeper = getZooKeeper(); - current_zookeeper->tryRemove(zookeeper_path + "/replicas/" + replica_name); + current_zookeeper->set(replica_path, "DROPPED"); + current_zookeeper->tryRemoveRecursive(replica_path); DatabaseAtomic::drop(context_); } diff --git a/src/Databases/DatabaseReplicated.h b/src/Databases/DatabaseReplicated.h index 663df59ac63..d6cd93773cf 100644 --- a/src/Databases/DatabaseReplicated.h +++ b/src/Databases/DatabaseReplicated.h @@ -12,7 +12,7 @@ namespace DB { -class DDLWorker; +class DatabaseReplicatedDDLWorker; using ZooKeeperPtr = std::shared_ptr; /** DatabaseReplicated engine @@ -42,7 +42,7 @@ class DatabaseReplicated : public DatabaseAtomic public: DatabaseReplicated(const String & name_, const String & metadata_path_, UUID uuid, const String & zookeeper_path_, const String & shard_name_, const String & replica_name_, - Context & context); + const Context & context); ~DatabaseReplicated() override; @@ -56,6 +56,11 @@ public: void loadStoredObjects(Context & context, bool has_force_restore_data_flag, bool force_attach = false) override; + String getFullReplicaName() const { return shard_name + '|' + replica_name; } + + //FIXME + friend struct DatabaseReplicatedTask; + friend class DatabaseReplicatedDDLWorker; private: bool createDatabaseNodesInZooKeeper(const ZooKeeperPtr & current_zookeeper); void createReplicaNodesInZooKeeper(const ZooKeeperPtr & current_zookeeper); @@ -72,6 +77,8 @@ private: void onExecutedLogEntry(const String & entry_name, const ZooKeeperPtr & zookeeper); + ASTPtr parseQueryFromMetadataInZooKeeper(const String & node_name, const String & query); + String zookeeper_path; String shard_name; String replica_name; @@ -88,7 +95,7 @@ private: zkutil::ZooKeeperPtr getZooKeeper() const; - std::unique_ptr ddl_worker; + std::unique_ptr ddl_worker; diff --git a/src/Databases/DatabaseReplicatedWorker.cpp b/src/Databases/DatabaseReplicatedWorker.cpp new file mode 100644 index 00000000000..869b888d3ad --- /dev/null +++ b/src/Databases/DatabaseReplicatedWorker.cpp @@ -0,0 +1,114 @@ +#include +#include +#include + +namespace DB +{ + +namespace ErrorCodes +{ + extern const int LOGICAL_ERROR; +} + +DatabaseReplicatedDDLWorker::DatabaseReplicatedDDLWorker(DatabaseReplicated * db, const Context & context_) + : DDLWorker(/* pool_size */ 1, db->zookeeper_path + "/log", context_, nullptr, {}, fmt::format("DDLWorker({})", db->getDatabaseName())) + , database(db) +{ + /// Pool size must be 1 (to avoid reordering of log entries) +} + +void DatabaseReplicatedDDLWorker::initialize() +{ + /// Check if we need to recover replica. + /// Invariant: replica is lost if it's log_ptr value is less then min_log_ptr value. 
+ + UInt32 our_log_ptr = parse(current_zookeeper->get(database->replica_path + "/log_ptr")); + UInt32 min_log_ptr = parse(current_zookeeper->get(database->zookeeper_path + "/min_log_ptr")); + if (our_log_ptr < min_log_ptr) + database->recoverLostReplica(current_zookeeper, 0); +} + +String DatabaseReplicatedDDLWorker::enqueueQuery(DDLLogEntry & entry) +{ + auto zookeeper = getAndSetZooKeeper(); + const String query_path_prefix = queue_dir + "/query-"; + + /// We cannot create sequential node and it's ephemeral child in a single transaction, so allocate sequential number another way + String counter_prefix = database->zookeeper_path + "/counter/cnt-"; + String counter_path = zookeeper->create(counter_prefix, "", zkutil::CreateMode::EphemeralSequential); + String node_path = query_path_prefix + counter_path.substr(counter_prefix.size()); + + Coordination::Requests ops; + /// Query is not committed yet, but we have to write it into log to avoid reordering + ops.emplace_back(zkutil::makeCreateRequest(node_path, entry.toString(), zkutil::CreateMode::Persistent)); + /// '/try' will be replaced with '/committed' or will be removed due to expired session or other error + ops.emplace_back(zkutil::makeCreateRequest(node_path + "/try", database->getFullReplicaName(), zkutil::CreateMode::Ephemeral)); + /// We don't need it anymore + ops.emplace_back(zkutil::makeRemoveRequest(counter_path, -1)); + /// Create status dirs + ops.emplace_back(zkutil::makeCreateRequest(node_path + "/active", "", zkutil::CreateMode::Persistent)); + ops.emplace_back(zkutil::makeCreateRequest(node_path + "/finished", "", zkutil::CreateMode::Persistent)); + zookeeper->multi(ops); + + return node_path; +} + +DDLTaskPtr DatabaseReplicatedDDLWorker::initAndCheckTask(const String & entry_name, String & out_reason, const ZooKeeperPtr & zookeeper) +{ + UInt32 our_log_ptr = parse(current_zookeeper->get(database->replica_path + "/log_ptr")); + UInt32 entry_num = DatabaseReplicatedTask::getLogEntryNumber(entry_name); + + if (entry_num <= our_log_ptr) + { + out_reason = fmt::format("Task {} already executed according to log pointer {}", entry_name, our_log_ptr); + return {}; + } + + String entry_path = queue_dir + "/" + entry_name; + auto task = std::make_unique(entry_name, entry_path, database); + + String initiator_name; + zkutil::EventPtr wait_committed_or_failed = std::make_shared(); + + if (zookeeper->tryGet(entry_path + "/try", initiator_name, nullptr, wait_committed_or_failed)) + { + task->we_are_initiator = initiator_name == task->host_id_str; + /// Query is not committed yet. We cannot just skip it and execute next one, because reordering may break replication. 
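(Editorial aside, not part of the patch.) The FIXME just below asks for timeouts on this wait. One sketch that keeps the ordering guarantee (the entry is still never skipped) while avoiding a silent, unbounded block is to poll the event and log periodically, assuming zkutil::EventPtr wraps Poco::Event and therefore provides tryWait(milliseconds); wait_timeout_ms is a made-up value:

    if (!task->we_are_initiator)
    {
        constexpr size_t wait_timeout_ms = 5000;    /// hypothetical value
        while (!wait_committed_or_failed->tryWait(wait_timeout_ms))
            LOG_TRACE(log, "Still waiting for initiator {} to commit or roll back entry {}", initiator_name, entry_path);
    }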
+ //FIXME add some timeouts + if (!task->we_are_initiator) + { + LOG_TRACE(log, "Waiting for initiator {} to commit or rollback entry {}", initiator_name, entry_path); + wait_committed_or_failed->wait(); + } + } + + if (!task->we_are_initiator && !zookeeper->exists(entry_path + "/committed")) + { + out_reason = "Entry " + entry_name + " hasn't been committed"; + return {}; + } + + String node_data; + if (!zookeeper->tryGet(entry_path, node_data)) + { + LOG_ERROR(log, "Cannot get log entry {}", entry_path); + database->onUnexpectedLogEntry(entry_name, zookeeper); + throw Exception(ErrorCodes::LOGICAL_ERROR, "should be unreachable"); + } + + auto error = task->tryParseEntry(node_data); + if (error) + { + LOG_ERROR(log, "Cannot parse query from '{}': {}", node_data, *error); + database->onUnexpectedLogEntry(entry_name, zookeeper); + throw Exception(ErrorCodes::LOGICAL_ERROR, "should be unreachable"); + } + + task->parseQueryFromEntry(context); + + return task; +} + + + +} diff --git a/src/Databases/DatabaseReplicatedWorker.h b/src/Databases/DatabaseReplicatedWorker.h new file mode 100644 index 00000000000..d190bd1795d --- /dev/null +++ b/src/Databases/DatabaseReplicatedWorker.h @@ -0,0 +1,26 @@ +#pragma once +#include + + +namespace DB +{ + +class DatabaseReplicated; + +class DatabaseReplicatedDDLWorker : public DDLWorker +{ +public: + DatabaseReplicatedDDLWorker(DatabaseReplicated * db, const Context & context_); + + String enqueueQuery(DDLLogEntry & entry) override; + +private: + void initialize() override; + + DDLTaskPtr initAndCheckTask(const String & entry_name, String & out_reason, const ZooKeeperPtr & zookeeper) override; + + DatabaseReplicated * database; + +}; + +} diff --git a/src/Databases/DatabaseWithDictionaries.cpp b/src/Databases/DatabaseWithDictionaries.cpp index da7f7f9b83e..ee16f4ae15e 100644 --- a/src/Databases/DatabaseWithDictionaries.cpp +++ b/src/Databases/DatabaseWithDictionaries.cpp @@ -349,7 +349,7 @@ void DatabaseWithDictionaries::shutdown() DatabaseWithDictionaries::DatabaseWithDictionaries( - const String & name, const String & metadata_path_, const String & data_path_, const String & logger, Context & context) + const String & name, const String & metadata_path_, const String & data_path_, const String & logger, const Context & context) : DatabaseOnDisk(name, metadata_path_, data_path_, logger, context) , external_loader(context.getExternalDictionariesLoader()) { diff --git a/src/Databases/DatabaseWithDictionaries.h b/src/Databases/DatabaseWithDictionaries.h index 36cee18e4db..d69289d7456 100644 --- a/src/Databases/DatabaseWithDictionaries.h +++ b/src/Databases/DatabaseWithDictionaries.h @@ -38,7 +38,7 @@ public: ~DatabaseWithDictionaries() override; protected: - DatabaseWithDictionaries(const String & name, const String & metadata_path_, const String & data_path_, const String & logger, Context & context); + DatabaseWithDictionaries(const String & name, const String & metadata_path_, const String & data_path_, const String & logger, const Context & context); ASTPtr getCreateDictionaryQueryImpl(const String & dictionary_name, bool throw_on_error) const override; diff --git a/src/Interpreters/Context.cpp b/src/Interpreters/Context.cpp index 04bd6b37280..b9283935ec9 100644 --- a/src/Interpreters/Context.cpp +++ b/src/Interpreters/Context.cpp @@ -2437,7 +2437,8 @@ void Context::initMetadataTransaction(MetadataTransactionPtr txn) MetadataTransactionPtr Context::getMetadataTransaction() const { - assert(query_context == this); + //FIXME + //assert(query_context == 
this); return metadata_transaction; } diff --git a/src/Interpreters/DDLTask.cpp b/src/Interpreters/DDLTask.cpp index dfb8f5ff746..0bc98dfd0dd 100644 --- a/src/Interpreters/DDLTask.cpp +++ b/src/Interpreters/DDLTask.cpp @@ -6,6 +6,12 @@ #include #include #include +#include +#include +#include +#include +#include +#include namespace DB { @@ -13,6 +19,8 @@ namespace DB namespace ErrorCodes { extern const int UNKNOWN_FORMAT_VERSION; + extern const int UNKNOWN_TYPE_OF_QUERY; + extern const int INCONSISTENT_CLUSTER_DEFINITION; } HostID HostID::fromString(const String & host_port_str) @@ -78,4 +86,276 @@ void DDLLogEntry::parse(const String & data) } +std::optional DDLTaskBase::tryParseEntry(const String & data) +{ + std::optional error; + try + { + entry.parse(data); + } + catch (...) + { + error = ExecutionStatus::fromCurrentException().serializeText(); + } + return error; +} + +void DDLTaskBase::parseQueryFromEntry(const Context & context) +{ + const char * begin = entry.query.data(); + const char * end = begin + entry.query.size(); + + ParserQuery parser_query(end); + String description; + query = parseQuery(parser_query, begin, end, description, 0, context.getSettingsRef().max_parser_depth); +} + +std::unique_ptr DDLTaskBase::makeQueryContext(Context & from_context) const +{ + auto query_context = std::make_unique(from_context); + query_context->makeQueryContext(); + query_context->setCurrentQueryId(""); // generate random query_id + query_context->getClientInfo().query_kind = ClientInfo::QueryKind::SECONDARY_QUERY; + return query_context; +} + + +bool DDLTask::findCurrentHostID(const Context & global_context, Poco::Logger * log) +{ + bool host_in_hostlist = false; + + for (const HostID & host : entry.hosts) + { + auto maybe_secure_port = global_context.getTCPPortSecure(); + + /// The port is considered local if it matches TCP or TCP secure port that the server is listening. + bool is_local_port = (maybe_secure_port && host.isLocalAddress(*maybe_secure_port)) + || host.isLocalAddress(global_context.getTCPPort()); + + if (!is_local_port) + continue; + + if (host_in_hostlist) + { + /// This check could be slow a little bit + LOG_WARNING(log, "There are two the same ClickHouse instances in task {}: {} and {}. Will use the first one only.", + entry_name, host_id.readableString(), host.readableString()); + } + else + { + host_in_hostlist = true; + host_id = host; + host_id_str = host.toString(); + } + } + + return host_in_hostlist; +} + +void DDLTask::setClusterInfo(const Context & context, Poco::Logger * log) +{ + auto query_on_cluster = dynamic_cast(query.get()); + if (!query_on_cluster) + throw Exception("Received unknown DDL query", ErrorCodes::UNKNOWN_TYPE_OF_QUERY); + + cluster_name = query_on_cluster->cluster; + cluster = context.tryGetCluster(cluster_name); + + if (!cluster) + throw Exception(ErrorCodes::INCONSISTENT_CLUSTER_DEFINITION, + "DDL task {} contains current host {} in cluster {}, but there are no such cluster here.", + entry_name, host_id.readableString(), cluster_name); + + /// Try to find host from task host list in cluster + /// At the first, try find exact match (host name and ports should be literally equal) + /// If the attempt fails, try find it resolving host name of each instance + + if (!tryFindHostInCluster()) + { + LOG_WARNING(log, "Not found the exact match of host {} from task {} in cluster {} definition. 
Will try to find it using host name resolving.", + host_id.readableString(), entry_name, cluster_name); + + if (!tryFindHostInClusterViaResolving(context)) + throw Exception(ErrorCodes::INCONSISTENT_CLUSTER_DEFINITION, "Not found host {} in definition of cluster {}", + host_id.readableString(), cluster_name); + + LOG_INFO(log, "Resolved host {} from task {} as host {} in definition of cluster {}", + host_id.readableString(), entry_name, address_in_cluster.readableString(), cluster_name); + } + + query = query_on_cluster->getRewrittenASTWithoutOnCluster(address_in_cluster.default_database); + query_on_cluster = nullptr; +} + +bool DDLTask::tryFindHostInCluster() +{ + const auto & shards = cluster->getShardsAddresses(); + bool found_exact_match = false; + String default_database; + + for (size_t shard_num = 0; shard_num < shards.size(); ++shard_num) + { + for (size_t replica_num = 0; replica_num < shards[shard_num].size(); ++replica_num) + { + const Cluster::Address & address = shards[shard_num][replica_num]; + + if (address.host_name == host_id.host_name && address.port == host_id.port) + { + if (found_exact_match) + { + if (default_database == address.default_database) + { + throw Exception(ErrorCodes::INCONSISTENT_CLUSTER_DEFINITION, + "There are two exactly the same ClickHouse instances {} in cluster {}", + address.readableString(), cluster_name); + } + else + { + /* Circular replication is used. + * It is when every physical node contains + * replicas of different shards of the same table. + * To distinguish one replica from another on the same node, + * every shard is placed into separate database. + * */ + is_circular_replicated = true; + auto * query_with_table = dynamic_cast(query.get()); + if (!query_with_table || query_with_table->database.empty()) + { + throw Exception(ErrorCodes::INCONSISTENT_CLUSTER_DEFINITION, + "For a distributed DDL on circular replicated cluster its table name must be qualified by database name."); + } + if (default_database == query_with_table->database) + return true; + } + } + found_exact_match = true; + host_shard_num = shard_num; + host_replica_num = replica_num; + address_in_cluster = address; + default_database = address.default_database; + } + } + } + + return found_exact_match; +} + +bool DDLTask::tryFindHostInClusterViaResolving(const Context & context) +{ + const auto & shards = cluster->getShardsAddresses(); + bool found_via_resolving = false; + + for (size_t shard_num = 0; shard_num < shards.size(); ++shard_num) + { + for (size_t replica_num = 0; replica_num < shards[shard_num].size(); ++replica_num) + { + const Cluster::Address & address = shards[shard_num][replica_num]; + + if (auto resolved = address.getResolvedAddress(); + resolved && (isLocalAddress(*resolved, context.getTCPPort()) + || (context.getTCPPortSecure() && isLocalAddress(*resolved, *context.getTCPPortSecure())))) + { + if (found_via_resolving) + { + throw Exception(ErrorCodes::INCONSISTENT_CLUSTER_DEFINITION, + "There are two the same ClickHouse instances in cluster {} : {} and {}", + cluster_name, address_in_cluster.readableString(), address.readableString()); + } + else + { + found_via_resolving = true; + host_shard_num = shard_num; + host_replica_num = replica_num; + address_in_cluster = address; + } + } + } + } + + return found_via_resolving; +} + +String DDLTask::getShardID() const +{ + /// Generate unique name for shard node, it will be used to execute the query by only single host + /// Shard node name has format 'replica_name1,replica_name2,...,replica_nameN' + /// Where 
replica_name is 'replica_config_host_name:replica_port' + + auto shard_addresses = cluster->getShardsAddresses().at(host_shard_num); + + Strings replica_names; + for (const Cluster::Address & address : shard_addresses) + replica_names.emplace_back(address.readableString()); + std::sort(replica_names.begin(), replica_names.end()); + + String res; + for (auto it = replica_names.begin(); it != replica_names.end(); ++it) + res += *it + (std::next(it) != replica_names.end() ? "," : ""); + + return res; +} + +DatabaseReplicatedTask::DatabaseReplicatedTask(const String & name, const String & path, DatabaseReplicated * database_) + : DDLTaskBase(name, path) + , database(database_) +{ + host_id_str = database->getFullReplicaName(); +} + +String DatabaseReplicatedTask::getShardID() const +{ + return database->shard_name; +} + +std::unique_ptr DatabaseReplicatedTask::makeQueryContext(Context & from_context) const +{ + auto query_context = DDLTaskBase::makeQueryContext(from_context); + query_context->getClientInfo().query_kind = ClientInfo::QueryKind::REPLICATED_LOG_QUERY; //FIXME why do we need separate query kind? + query_context->setCurrentDatabase(database->getDatabaseName()); + + if (we_are_initiator) + { + auto txn = std::make_shared(); + query_context->initMetadataTransaction(txn); + txn->current_zookeeper = from_context.getZooKeeper(); + txn->zookeeper_path = database->zookeeper_path; + txn->ops.emplace_back(zkutil::makeRemoveRequest(entry_path + "/try", -1)); + txn->ops.emplace_back(zkutil::makeCreateRequest(entry_path + "/committed", host_id_str, zkutil::CreateMode::Persistent)); + txn->ops.emplace_back(zkutil::makeRemoveRequest(getActiveNodePath(), -1)); + if (execute_on_leader) + txn->ops.emplace_back(zkutil::makeCreateRequest(getShardNodePath() + "/executed", host_id_str, zkutil::CreateMode::Persistent)); + txn->ops.emplace_back(zkutil::makeCreateRequest(getFinishedNodePath(), execution_status.serializeText(), zkutil::CreateMode::Persistent)); + txn->ops.emplace_back(zkutil::makeSetRequest(database->replica_path + "/log_ptr", toString(getLogEntryNumber(entry_name)), -1)); + txn->ops.emplace_back(zkutil::makeSetRequest(database->zookeeper_path + "/max_log_ptr", toString(getLogEntryNumber(entry_name)), -1)); + } + + return query_context; +} + +String DatabaseReplicatedTask::getLogEntryName(UInt32 log_entry_number) +{ + constexpr size_t seq_node_digits = 10; + String number = toString(log_entry_number); + String name = "query-" + String(seq_node_digits - number.size(), '0') + number; + return name; +} + +UInt32 DatabaseReplicatedTask::getLogEntryNumber(const String & log_entry_name) +{ + constexpr const char * name = "query-"; + assert(startsWith(log_entry_name, name)); + return parse(log_entry_name.substr(strlen(name))); +} + +void DatabaseReplicatedTask::parseQueryFromEntry(const Context & context) +{ + if (entry.query.empty()) + { + was_executed = true; + return; + } + + DDLTaskBase::parseQueryFromEntry(context); +} + } diff --git a/src/Interpreters/DDLTask.h b/src/Interpreters/DDLTask.h index ba58fe3f42e..19d92a1bc78 100644 --- a/src/Interpreters/DDLTask.h +++ b/src/Interpreters/DDLTask.h @@ -3,12 +3,17 @@ #include #include +namespace Poco +{ +class Logger; +} namespace DB { class ASTQueryWithOnCluster; using ZooKeeperPtr = std::shared_ptr; +class DatabaseReplicated; struct HostID { @@ -54,42 +59,88 @@ struct DDLLogEntry void parse(const String & data); }; +struct DDLTaskBase +{ + const String entry_name; + const String entry_path; -struct DDLTask + DDLTaskBase(const String & name, const 
String & path) : entry_name(name), entry_path(path) {} + virtual ~DDLTaskBase() = default; + + std::optional tryParseEntry(const String & data); + virtual void parseQueryFromEntry(const Context & context); + + DDLLogEntry entry; + + String host_id_str; + ASTPtr query; + + bool is_circular_replicated = false; + bool execute_on_leader = false; + + ExecutionStatus execution_status; + bool was_executed = false; + + virtual String getShardID() const = 0; + + virtual std::unique_ptr makeQueryContext(Context & from_context) const; + + inline String getActiveNodePath() const { return entry_path + "/active/" + host_id_str; } + inline String getFinishedNodePath() const { return entry_path + "/finished/" + host_id_str; } + inline String getShardNodePath() const { return entry_path + "/shards/" + getShardID(); } + +}; + +struct DDLTask : public DDLTaskBase { /// Stages of task lifetime correspond ordering of these data fields: - /// Stage 1: parse entry - String entry_name; - String entry_path; - DDLLogEntry entry; + DDLTask(const String & name, const String & path) : DDLTaskBase(name, path) {} + + bool findCurrentHostID(const Context & global_context, Poco::Logger * log); + + void setClusterInfo(const Context & context, Poco::Logger * log); - bool we_are_initiator = false; /// Stage 2: resolve host_id and check that - HostID host_id; - String host_id_str; + /// Stage 3.1: parse query - ASTPtr query; - ASTQueryWithOnCluster * query_on_cluster = nullptr; /// Stage 3.2: check cluster and find the host in cluster + + /// Stage 3.3: execute query + + /// Stage 4: commit results to ZooKeeper + + String getShardID() const override; + +private: + bool tryFindHostInCluster(); + bool tryFindHostInClusterViaResolving(const Context & context); + + HostID host_id; String cluster_name; ClusterPtr cluster; Cluster::Address address_in_cluster; size_t host_shard_num; size_t host_replica_num; +}; - /// Stage 3.3: execute query - ExecutionStatus execution_status; - bool was_executed = false; +struct DatabaseReplicatedTask : public DDLTaskBase +{ + DatabaseReplicatedTask(const String & name, const String & path, DatabaseReplicated * database_); - /// Stage 4: commit results to ZooKeeper + void parseQueryFromEntry(const Context & context) override; - String active_path; - String finished_path; - String shard_path; + String getShardID() const override; + std::unique_ptr makeQueryContext(Context & from_context) const override; + + static String getLogEntryName(UInt32 log_entry_number); + static UInt32 getLogEntryNumber(const String & log_entry_name); + + DatabaseReplicated * database; + bool we_are_initiator = false; }; diff --git a/src/Interpreters/DDLWorker.cpp b/src/Interpreters/DDLWorker.cpp index fc9039be576..0399687a4d8 100644 --- a/src/Interpreters/DDLWorker.cpp +++ b/src/Interpreters/DDLWorker.cpp @@ -142,33 +142,13 @@ std::unique_ptr createSimpleZooKeeperLock( } -String DatabaseReplicatedExtensions::getLogEntryName(UInt32 log_entry_number) -{ - constexpr size_t seq_node_digits = 10; - String number = toString(log_entry_number); - String name = "query-" + String(seq_node_digits - number.size(), '0') + number; - return name; -} - -UInt32 DatabaseReplicatedExtensions::getLogEntryNumber(const String & log_entry_name) -{ - constexpr const char * name = "query-"; - assert(startsWith(log_entry_name, name)); - return parse(log_entry_name.substr(strlen(name))); -} - - DDLWorker::DDLWorker(int pool_size_, const std::string & zk_root_dir, const Context & context_, const Poco::Util::AbstractConfiguration * config, const 
String & prefix, - std::optional database_replicated_ext_) + const String & logger_name) : context(context_) - , log(&Poco::Logger::get(database_replicated_ext_ ? fmt::format("DDLWorker ({})", database_replicated_ext_->database_name) : "DDLWorker")) - , database_replicated_ext(std::move(database_replicated_ext_)) - , pool_size(pool_size_) + , log(&Poco::Logger::get(logger_name)) + , pool_size(pool_size_) //FIXME make it optional , worker_pool(pool_size_) { - assert(!database_replicated_ext || pool_size == 1); - last_tasks.reserve(pool_size); - queue_dir = zk_root_dir; if (queue_dir.back() == '/') queue_dir.resize(queue_dir.size() - 1); @@ -252,60 +232,26 @@ DDLTaskPtr DDLWorker::initAndCheckTask(const String & entry_name, String & out_r String node_data; String entry_path = queue_dir + "/" + entry_name; - auto task = std::make_unique(); - task->entry_name = entry_name; - task->entry_path = entry_path; - - if (database_replicated_ext) - { - String initiator_name; - zkutil::EventPtr wait_committed_or_failed = std::make_shared(); - - if (zookeeper->tryGet(entry_path + "/try", initiator_name, nullptr, wait_committed_or_failed)) - { - task->we_are_initiator = initiator_name == database_replicated_ext->getFullReplicaName(); - /// Query is not committed yet. We cannot just skip it and execute next one, because reordering may break replication. - //FIXME add some timeouts - if (!task->we_are_initiator) - { - LOG_TRACE(log, "Waiting for initiator {} to commit or rollback entry {}", initiator_name, entry_path); - wait_committed_or_failed->wait(); - } - } - - if (!task->we_are_initiator && !zookeeper->exists(entry_path + "/committed")) - { - out_reason = "Entry " + entry_name + " hasn't been committed"; - return {}; - } - } + auto task = std::make_unique(entry_name, entry_path); if (!zookeeper->tryGet(entry_path, node_data)) { - if (database_replicated_ext) - database_replicated_ext->lost_callback(entry_name, zookeeper); /// It is Ok that node could be deleted just now. It means that there are no current host in node's host list. out_reason = "The task was deleted"; return {}; } - try - { - task->entry.parse(node_data); - } - catch (...) + auto error = task->tryParseEntry(node_data); + if (error) { /// What should we do if we even cannot parse host name and therefore cannot properly submit execution status? /// We can try to create fail node using FQDN if it equal to host name in cluster config attempt will be successful. /// Otherwise, that node will be ignored by DDLQueryStatusInputStream. - - tryLogCurrentException(log, "Cannot parse DDL task " + entry_name + ", will try to send error status"); - - String status = ExecutionStatus::fromCurrentException().serializeText(); + LOG_ERROR(log, "Cannot parse DDL task {}, will try to send error status: {}", entry_name, *error); try { createStatusDirs(entry_path, zookeeper); - zookeeper->tryCreate(entry_path + "/finished/" + host_fqdn_id, status, zkutil::CreateMode::Persistent); + zookeeper->tryCreate(entry_path + "/finished/" + host_fqdn_id, *error, zkutil::CreateMode::Persistent); } catch (...) 
{ @@ -316,45 +262,15 @@ DDLTaskPtr DDLWorker::initAndCheckTask(const String & entry_name, String & out_r return {}; } - if (database_replicated_ext) - { - task->host_id.host_name = host_fqdn; - task->host_id.port = context.getTCPPort(); - task->host_id_str = database_replicated_ext->shard_name + '|' + database_replicated_ext->replica_name; - return task; - } - - bool host_in_hostlist = false; - for (const HostID & host : task->entry.hosts) - { - auto maybe_secure_port = context.getTCPPortSecure(); - - /// The port is considered local if it matches TCP or TCP secure port that the server is listening. - bool is_local_port = (maybe_secure_port && host.isLocalAddress(*maybe_secure_port)) - || host.isLocalAddress(context.getTCPPort()); - - if (!is_local_port) - continue; - - if (host_in_hostlist) - { - /// This check could be slow a little bit - LOG_WARNING(log, "There are two the same ClickHouse instances in task {}: {} and {}. Will use the first one only.", entry_name, task->host_id.readableString(), host.readableString()); - } - else - { - host_in_hostlist = true; - task->host_id = host; - task->host_id_str = host.toString(); - } - } - - if (!host_in_hostlist) + if (!task->findCurrentHostID(context, log)) { out_reason = "There is no a local address in host list"; return {}; } + task->parseQueryFromEntry(context); + task->setClusterInfo(context, log); + return task; } @@ -378,11 +294,11 @@ void DDLWorker::scheduleTasks() return; } - bool server_startup = last_tasks.empty(); + bool server_startup = !last_entry_name.has_value(); auto begin_node = server_startup ? queue_nodes.begin() - : std::upper_bound(queue_nodes.begin(), queue_nodes.end(), last_tasks.back()); + : std::upper_bound(queue_nodes.begin(), queue_nodes.end(), *last_entry_name); for (auto it = begin_node; it != queue_nodes.end() && !stop_flag; ++it) { @@ -394,7 +310,7 @@ void DDLWorker::scheduleTasks() if (!task) { LOG_DEBUG(log, "Will not execute task {}: {}", entry_name, reason); - saveTask(entry_name); + last_entry_name = entry_name; continue; } @@ -408,7 +324,7 @@ void DDLWorker::scheduleTasks() if (!already_processed) { - if (database_replicated_ext) + if (pool_size == 1) { enqueueTask(DDLTaskPtr(task.release())); } @@ -425,143 +341,18 @@ void DDLWorker::scheduleTasks() LOG_DEBUG(log, "Task {} ({}) has been already processed", entry_name, task->entry.query); } - saveTask(entry_name); + last_entry_name = entry_name; } } -void DDLWorker::saveTask(const String & entry_name) -{ - if (last_tasks.size() == pool_size) - { - last_tasks.erase(last_tasks.begin()); - } - last_tasks.emplace_back(entry_name); -} - /// Parses query and resolves cluster and host in cluster -void DDLWorker::parseQueryAndResolveHost(DDLTask & task) +void DDLWorker::parseQueryAndResolveHost(DDLTaskBase & /*task*/) { - { - const char * begin = task.entry.query.data(); - const char * end = begin + task.entry.query.size(); - ParserQuery parser_query(end); - String description; - task.query = parseQuery(parser_query, begin, end, description, 0, context.getSettingsRef().max_parser_depth); - } - - // XXX: serious design flaw since `ASTQueryWithOnCluster` is not inherited from `IAST`! 
- if (!task.query || !(task.query_on_cluster = dynamic_cast(task.query.get()))) - throw Exception("Received unknown DDL query", ErrorCodes::UNKNOWN_TYPE_OF_QUERY); - - if (database_replicated_ext) - return; - - task.cluster_name = task.query_on_cluster->cluster; - task.cluster = context.tryGetCluster(task.cluster_name); - if (!task.cluster) - throw Exception(ErrorCodes::INCONSISTENT_CLUSTER_DEFINITION, - "DDL task {} contains current host {} in cluster {}, but there are no such cluster here.", - task.entry_name, task.host_id.readableString(), task.cluster_name); - - /// Try to find host from task host list in cluster - /// At the first, try find exact match (host name and ports should be literally equal) - /// If the attempt fails, try find it resolving host name of each instance - const auto & shards = task.cluster->getShardsAddresses(); - - bool found_exact_match = false; - String default_database; - for (size_t shard_num = 0; shard_num < shards.size(); ++shard_num) - { - for (size_t replica_num = 0; replica_num < shards[shard_num].size(); ++replica_num) - { - const Cluster::Address & address = shards[shard_num][replica_num]; - - if (address.host_name == task.host_id.host_name && address.port == task.host_id.port) - { - if (found_exact_match) - { - if (default_database == address.default_database) - { - throw Exception(ErrorCodes::INCONSISTENT_CLUSTER_DEFINITION, - "There are two exactly the same ClickHouse instances {} in cluster {}", - address.readableString(), task.cluster_name); - } - else - { - /* Circular replication is used. - * It is when every physical node contains - * replicas of different shards of the same table. - * To distinguish one replica from another on the same node, - * every shard is placed into separate database. - * */ - is_circular_replicated = true; - auto * query_with_table = dynamic_cast(task.query.get()); - if (!query_with_table || query_with_table->database.empty()) - { - throw Exception(ErrorCodes::INCONSISTENT_CLUSTER_DEFINITION, - "For a distributed DDL on circular replicated cluster its table name must be qualified by database name."); - } - if (default_database == query_with_table->database) - return; - } - } - found_exact_match = true; - task.host_shard_num = shard_num; - task.host_replica_num = replica_num; - task.address_in_cluster = address; - default_database = address.default_database; - } - } - } - - if (found_exact_match) - return; - - LOG_WARNING(log, "Not found the exact match of host {} from task {} in cluster {} definition. 
Will try to find it using host name resolving.", task.host_id.readableString(), task.entry_name, task.cluster_name); - - bool found_via_resolving = false; - for (size_t shard_num = 0; shard_num < shards.size(); ++shard_num) - { - for (size_t replica_num = 0; replica_num < shards[shard_num].size(); ++replica_num) - { - const Cluster::Address & address = shards[shard_num][replica_num]; - - if (auto resolved = address.getResolvedAddress(); - resolved && (isLocalAddress(*resolved, context.getTCPPort()) - || (context.getTCPPortSecure() && isLocalAddress(*resolved, *context.getTCPPortSecure())))) - { - if (found_via_resolving) - { - throw Exception(ErrorCodes::INCONSISTENT_CLUSTER_DEFINITION, - "There are two the same ClickHouse instances in cluster {} : {} and {}", - task.cluster_name, task.address_in_cluster.readableString(), address.readableString()); - } - else - { - found_via_resolving = true; - task.host_shard_num = shard_num; - task.host_replica_num = replica_num; - task.address_in_cluster = address; - } - } - } - } - - if (!found_via_resolving) - { - throw Exception(ErrorCodes::INCONSISTENT_CLUSTER_DEFINITION, - "Not found host {} in definition of cluster {}", - task.host_id.readableString(), task.cluster_name); - } - else - { - LOG_INFO(log, "Resolved host {} from task {} as host {} in definition of cluster {}", task.host_id.readableString(), task.entry_name, task.address_in_cluster.readableString(), task.cluster_name); - } } -bool DDLWorker::tryExecuteQuery(const String & query, const DDLTask & task, ExecutionStatus & status) +bool DDLWorker::tryExecuteQuery(const String & query, const DDLTaskBase & task, ExecutionStatus & status) { /// Add special comment at the start of query to easily identify DDL-produced queries in query_log String query_prefix = "/* ddl_entry=" + task.entry_name + " */ "; @@ -573,36 +364,8 @@ bool DDLWorker::tryExecuteQuery(const String & query, const DDLTask & task, Exec try { - auto current_context = std::make_unique(context); - current_context->makeQueryContext(); - current_context->setCurrentQueryId(""); // generate random query_id - - if (database_replicated_ext) - { - current_context->getClientInfo().query_kind - = ClientInfo::QueryKind::REPLICATED_LOG_QUERY; //FIXME why do we need separate query kind? 
- current_context->setCurrentDatabase(database_replicated_ext->database_name); - - if (task.we_are_initiator) - { - auto txn = std::make_shared(); - current_context->initMetadataTransaction(txn); - txn->current_zookeeper = current_zookeeper; - txn->zookeeper_path = database_replicated_ext->zookeeper_path; - txn->ops.emplace_back(zkutil::makeRemoveRequest(task.entry_path + "/try", -1)); - txn->ops.emplace_back(zkutil::makeCreateRequest(task.entry_path + "/committed", - database_replicated_ext->getFullReplicaName(), zkutil::CreateMode::Persistent)); - txn->ops.emplace_back(zkutil::makeRemoveRequest(task.active_path, -1)); - if (!task.shard_path.empty()) - txn->ops.emplace_back(zkutil::makeCreateRequest(task.shard_path, task.host_id_str, zkutil::CreateMode::Persistent)); - txn->ops.emplace_back(zkutil::makeCreateRequest(task.finished_path, task.execution_status.serializeText(), zkutil::CreateMode::Persistent)); - //txn->ops.emplace_back(zkutil::makeSetRequest(database_replicated_ext->getReplicaPath() + "/log_ptr", toString(database_replicated_ext->first_not_executed), -1)); - } - } - else - current_context->getClientInfo().query_kind = ClientInfo::QueryKind::SECONDARY_QUERY; - - executeQuery(istr, ostr, false, *current_context, {}); + auto query_context = task.makeQueryContext(context); + executeQuery(istr, ostr, false, *query_context, {}); } catch (...) { @@ -644,6 +407,7 @@ void DDLWorker::enqueueTask(DDLTaskPtr task_ptr) processTask(task); return; } + /// TODO recover zk in runMainThread(...) and retry task (why do we need another place where session is recovered?) catch (const Coordination::Exception & e) { if (Coordination::isHardwareError(e.code)) @@ -668,17 +432,16 @@ void DDLWorker::enqueueTask(DDLTaskPtr task_ptr) } } -void DDLWorker::processTask(DDLTask & task) +void DDLWorker::processTask(DDLTaskBase & task) { auto zookeeper = tryGetZooKeeper(); LOG_DEBUG(log, "Processing task {} ({})", task.entry_name, task.entry.query); - String dummy; - //FIXME duplicate - String active_node_path = task.active_path = task.entry_path + "/active/" + task.host_id_str; - String finished_node_path = task.finished_path = task.entry_path + "/finished/" + task.host_id_str; + String active_node_path = task.getActiveNodePath(); + String finished_node_path = task.getFinishedNodePath(); + String dummy; auto code = zookeeper->tryCreate(active_node_path, "", zkutil::CreateMode::Ephemeral, dummy); if (code == Coordination::Error::ZOK || code == Coordination::Error::ZNODEEXISTS) @@ -696,22 +459,16 @@ void DDLWorker::processTask(DDLTask & task) else throw Coordination::Exception(code, active_node_path); - //FIXME - bool is_dummy_query = database_replicated_ext && task.entry.query.empty(); - if (!task.was_executed && !is_dummy_query) + if (!task.was_executed) { try { - is_circular_replicated = false; - parseQueryAndResolveHost(task); - - ASTPtr rewritten_ast = task.query_on_cluster->getRewrittenASTWithoutOnCluster(task.address_in_cluster.default_database); - String rewritten_query = queryToString(rewritten_ast); + String rewritten_query = queryToString(task.query); LOG_DEBUG(log, "Executing query: {}", rewritten_query); - if (auto * query_with_table = dynamic_cast(rewritten_ast.get()); query_with_table) + StoragePtr storage; + if (auto * query_with_table = dynamic_cast(task.query.get()); query_with_table) { - StoragePtr storage; if (!query_with_table->table.empty()) { /// It's not CREATE DATABASE @@ -719,11 +476,11 @@ void DDLWorker::processTask(DDLTask & task) storage = 
DatabaseCatalog::instance().tryGetTable(table_id, context); } - if (storage && taskShouldBeExecutedOnLeader(rewritten_ast, storage) && !is_circular_replicated) - tryExecuteQueryOnLeaderReplica(task, storage, rewritten_query, task.entry_path, zookeeper); - else - tryExecuteQuery(rewritten_query, task, task.execution_status); + task.execute_on_leader = storage && taskShouldBeExecutedOnLeader(task.query, storage) && !task.is_circular_replicated; } + + if (task.execute_on_leader) + tryExecuteQueryOnLeaderReplica(task, storage, rewritten_query, task.entry_path, zookeeper); else tryExecuteQuery(rewritten_query, task, task.execution_status); } @@ -753,12 +510,6 @@ void DDLWorker::processTask(DDLTask & task) auto res = zookeeper->tryMulti(ops, responses); if (res != Coordination::Error::ZNODEEXISTS && res != Coordination::Error::ZNONODE) zkutil::KeeperMultiException::check(res, ops, responses); - - if (database_replicated_ext) - { - database_replicated_ext->executed_callback(task.entry_name, zookeeper); - ++(database_replicated_ext->first_not_executed); - } } @@ -775,10 +526,10 @@ bool DDLWorker::taskShouldBeExecutedOnLeader(const ASTPtr ast_ddl, const Storage } bool DDLWorker::tryExecuteQueryOnLeaderReplica( - DDLTask & task, + DDLTaskBase & task, StoragePtr storage, const String & rewritten_query, - const String & node_path, + const String & /*node_path*/, const ZooKeeperPtr & zookeeper) { StorageReplicatedMergeTree * replicated_storage = dynamic_cast(storage.get()); @@ -787,31 +538,8 @@ bool DDLWorker::tryExecuteQueryOnLeaderReplica( if (!replicated_storage) throw Exception(ErrorCodes::NOT_IMPLEMENTED, "Storage type '{}' is not supported by distributed DDL", storage->getName()); - /// Generate unique name for shard node, it will be used to execute the query by only single host - /// Shard node name has format 'replica_name1,replica_name2,...,replica_nameN' - /// Where replica_name is 'replica_config_host_name:replica_port' - auto get_shard_name = [] (const Cluster::Addresses & shard_addresses) - { - Strings replica_names; - for (const Cluster::Address & address : shard_addresses) - replica_names.emplace_back(address.readableString()); - std::sort(replica_names.begin(), replica_names.end()); - - String res; - for (auto it = replica_names.begin(); it != replica_names.end(); ++it) - res += *it + (std::next(it) != replica_names.end() ? 
"," : ""); - - return res; - }; - - String shard_node_name; - if (database_replicated_ext) - shard_node_name = database_replicated_ext->shard_name; - else - shard_node_name = get_shard_name(task.cluster->getShardsAddresses().at(task.host_shard_num)); - String shard_path = node_path + "/shards/" + shard_node_name; + String shard_path = task.getShardNodePath(); String is_executed_path = shard_path + "/executed"; - task.shard_path = is_executed_path; //FIXME duplicate String tries_to_execute_path = shard_path + "/tries_to_execute"; zookeeper->createAncestors(shard_path + "/"); @@ -1035,7 +763,7 @@ void DDLWorker::createStatusDirs(const std::string & node_path, const ZooKeeperP String DDLWorker::enqueueQuery(DDLLogEntry & entry) { - if (entry.hosts.empty() && !database_replicated_ext) + if (entry.hosts.empty()) throw Exception("Empty host list in a distributed DDL task", ErrorCodes::LOGICAL_ERROR); auto zookeeper = getAndSetZooKeeper(); @@ -1043,27 +771,7 @@ String DDLWorker::enqueueQuery(DDLLogEntry & entry) String query_path_prefix = queue_dir + "/query-"; zookeeper->createAncestors(query_path_prefix); - String node_path; - if (database_replicated_ext) - { - /// We cannot create sequential node and it's ephemeral child in a single transaction, so allocate sequential number another way - String counter_prefix = database_replicated_ext->zookeeper_path + "/counter/cnt-"; - String counter_path = zookeeper->create(counter_prefix, "", zkutil::CreateMode::EphemeralSequential); - node_path = query_path_prefix + counter_path.substr(counter_prefix.size()); - - Coordination::Requests ops; - /// Query is not committed yet, but we have to write it into log to avoid reordering - ops.emplace_back(zkutil::makeCreateRequest(node_path, entry.toString(), zkutil::CreateMode::Persistent)); - /// '/try' will be replaced with '/committed' or will be removed due to expired session or other error - ops.emplace_back(zkutil::makeCreateRequest(node_path + "/try", database_replicated_ext->getFullReplicaName(), zkutil::CreateMode::Ephemeral)); - /// We don't need it anymore - ops.emplace_back(zkutil::makeRemoveRequest(counter_path, -1)); - zookeeper->multi(ops); - } - else - { - node_path = zookeeper->create(query_path_prefix, entry.toString(), zkutil::CreateMode::PersistentSequential); - } + String node_path = zookeeper->create(query_path_prefix, entry.toString(), zkutil::CreateMode::PersistentSequential); /// Optional step try @@ -1091,6 +799,7 @@ void DDLWorker::runMainThread() { auto zookeeper = getAndSetZooKeeper(); zookeeper->createAncestors(queue_dir + "/"); + initialize(); initialized = true; } catch (const Coordination::Exception & e) diff --git a/src/Interpreters/DDLWorker.h b/src/Interpreters/DDLWorker.h index 86677bfbb19..39087d05fbb 100644 --- a/src/Interpreters/DDLWorker.h +++ b/src/Interpreters/DDLWorker.h @@ -29,50 +29,20 @@ namespace DB class Context; class ASTAlterQuery; struct DDLLogEntry; -struct DDLTask; -using DDLTaskPtr = std::unique_ptr; +struct DDLTaskBase; +using DDLTaskPtr = std::unique_ptr; using ZooKeeperPtr = std::shared_ptr; -struct DatabaseReplicatedExtensions -{ - UUID database_uuid; - String zookeeper_path; - String database_name; - String shard_name; - String replica_name; - UInt32 first_not_executed; - using EntryLostCallback = std::function; - using EntryExecutedCallback = std::function; - using EntryErrorCallback = std::function; - EntryLostCallback lost_callback; - EntryExecutedCallback executed_callback; - EntryErrorCallback error_callback; - - String getReplicaPath() const - { - 
return zookeeper_path + "/replicas/" + shard_name + "/" + replica_name; - } - - String getFullReplicaName() const - { - return shard_name + '|' + replica_name; - } - - static String getLogEntryName(UInt32 log_entry_number); - static UInt32 getLogEntryNumber(const String & log_entry_name); -}; - - class DDLWorker { public: DDLWorker(int pool_size_, const std::string & zk_root_dir, const Context & context_, const Poco::Util::AbstractConfiguration * config, const String & prefix, - std::optional database_replicated_ext_ = std::nullopt); - ~DDLWorker(); + const String & logger_name = "DDLWorker"); + virtual ~DDLWorker(); /// Pushes query into DDL queue, returns path to created node - String enqueueQuery(DDLLogEntry & entry); + virtual String enqueueQuery(DDLLogEntry & entry); /// Host ID (name:port) for logging purposes /// Note that in each task hosts are identified individually by name:port from initiator server cluster config @@ -83,10 +53,7 @@ public: void shutdown(); - //FIXME get rid of this method - void setLogPointer(UInt32 log_pointer) { database_replicated_ext->first_not_executed = log_pointer; } - -private: +protected: /// Returns cached ZooKeeper session (possibly expired). ZooKeeperPtr tryGetZooKeeper() const; @@ -97,14 +64,13 @@ private: void checkCurrentTasks(); void scheduleTasks(); - void saveTask(const String & entry_name); /// Reads entry and check that the host belongs to host list of the task /// Returns non-empty DDLTaskPtr if entry parsed and the check is passed - DDLTaskPtr initAndCheckTask(const String & entry_name, String & out_reason, const ZooKeeperPtr & zookeeper); + virtual DDLTaskPtr initAndCheckTask(const String & entry_name, String & out_reason, const ZooKeeperPtr & zookeeper); void enqueueTask(DDLTaskPtr task); - void processTask(DDLTask & task); + void processTask(DDLTaskBase & task); /// Check that query should be executed on leader replica only static bool taskShouldBeExecutedOnLeader(const ASTPtr ast_ddl, StoragePtr storage); @@ -115,15 +81,15 @@ private: /// query via RemoteBlockOutputStream to leader, so to avoid such "2-phase" query execution we /// execute query directly on leader. 
bool tryExecuteQueryOnLeaderReplica( - DDLTask & task, + DDLTaskBase & task, StoragePtr storage, const String & rewritten_query, const String & node_path, const ZooKeeperPtr & zookeeper); - void parseQueryAndResolveHost(DDLTask & task); + void parseQueryAndResolveHost(DDLTaskBase & task); - bool tryExecuteQuery(const String & query, const DDLTask & task, ExecutionStatus & status); + bool tryExecuteQuery(const String & query, const DDLTaskBase & task, ExecutionStatus & status); /// Checks and cleanups queue's nodes void cleanupQueue(Int64 current_time_seconds, const ZooKeeperPtr & zookeeper); @@ -131,17 +97,16 @@ private: /// Init task node static void createStatusDirs(const std::string & node_path, const ZooKeeperPtr & zookeeper); + virtual void initialize() {} void runMainThread(); void runCleanupThread(); void attachToThreadGroup(); -private: - std::atomic is_circular_replicated = false; +protected: Context context; Poco::Logger * log; - std::optional database_replicated_ext; std::string host_fqdn; /// current host domain name std::string host_fqdn_id; /// host_name:port @@ -151,7 +116,8 @@ private: ZooKeeperPtr current_zookeeper; /// Save state of executed task to avoid duplicate execution on ZK error - std::vector last_tasks; + //std::vector last_tasks; + std::optional last_entry_name; std::shared_ptr queue_updated_event = std::make_shared(); std::shared_ptr cleanup_event = std::make_shared(); diff --git a/tests/integration/test_replicated_database/configs/config.xml b/tests/integration/test_replicated_database/configs/config.xml new file mode 100644 index 00000000000..d751454437c --- /dev/null +++ b/tests/integration/test_replicated_database/configs/config.xml @@ -0,0 +1,3 @@ + + 10 + diff --git a/tests/integration/test_replicated_database/configs/disable_snapshots.xml b/tests/integration/test_replicated_database/configs/disable_snapshots.xml deleted file mode 100644 index 9a656bdcea1..00000000000 --- a/tests/integration/test_replicated_database/configs/disable_snapshots.xml +++ /dev/null @@ -1,3 +0,0 @@ - - 0 - diff --git a/tests/integration/test_replicated_database/configs/snapshot_each_query.xml b/tests/integration/test_replicated_database/configs/snapshot_each_query.xml deleted file mode 100644 index 6eae1d9d992..00000000000 --- a/tests/integration/test_replicated_database/configs/snapshot_each_query.xml +++ /dev/null @@ -1,3 +0,0 @@ - - 1 - diff --git a/tests/integration/test_replicated_database/test.py b/tests/integration/test_replicated_database/test.py index 11bfbad393b..8c5a25b3fe7 100644 --- a/tests/integration/test_replicated_database/test.py +++ b/tests/integration/test_replicated_database/test.py @@ -7,11 +7,11 @@ from helpers.test_tools import assert_eq_with_retry cluster = ClickHouseCluster(__file__) -main_node = cluster.add_instance('main_node', main_configs=['configs/disable_snapshots.xml'], with_zookeeper=True, stay_alive=True, macros={"shard": 1, "replica": 1}) -dummy_node = cluster.add_instance('dummy_node', main_configs=['configs/disable_snapshots.xml'], with_zookeeper=True, macros={"shard": 1, "replica": 2}) -competing_node = cluster.add_instance('competing_node', main_configs=['configs/disable_snapshots.xml'], with_zookeeper=True, macros={"shard": 1, "replica": 3}) -snapshotting_node = cluster.add_instance('snapshotting_node', main_configs=['configs/snapshot_each_query.xml'], with_zookeeper=True, macros={"shard": 2, "replica": 1}) -snapshot_recovering_node = cluster.add_instance('snapshot_recovering_node', main_configs=['configs/disable_snapshots.xml'], 
with_zookeeper=True, macros={"shard": 2, "replica": 2}) +main_node = cluster.add_instance('main_node', main_configs=['configs/config.xml'], with_zookeeper=True, stay_alive=True, macros={"shard": 1, "replica": 1}) +dummy_node = cluster.add_instance('dummy_node', main_configs=['configs/config.xml'], with_zookeeper=True, macros={"shard": 1, "replica": 2}) +competing_node = cluster.add_instance('competing_node', main_configs=['configs/config.xml'], with_zookeeper=True, macros={"shard": 1, "replica": 3}) +snapshotting_node = cluster.add_instance('snapshotting_node', main_configs=['configs/config.xml'], with_zookeeper=True, macros={"shard": 2, "replica": 1}) +snapshot_recovering_node = cluster.add_instance('snapshot_recovering_node', main_configs=['configs/config.xml'], with_zookeeper=True, macros={"shard": 2, "replica": 2}) uuid_regex = re.compile("[0-9a-f]{8}\-[0-9a-f]{4}\-[0-9a-f]{4}\-[0-9a-f]{4}\-[0-9a-f]{12}") def assert_create_query(nodes, table_name, expected): @@ -70,9 +70,10 @@ def test_simple_alter_table(started_cluster, engine): assert_create_query([main_node, dummy_node], name, expected) +@pytest.mark.dependency(depends=['test_simple_alter_table']) @pytest.mark.parametrize("engine", ['MergeTree', 'ReplicatedMergeTree']) def test_create_replica_after_delay(started_cluster, engine): - competing_node.query("CREATE DATABASE testdb ENGINE = Replicated('/clickhouse/databases/test1', 'shard1', 'replica3');") + competing_node.query("CREATE DATABASE IF NOT EXISTS testdb ENGINE = Replicated('/clickhouse/databases/test1', 'shard1', 'replica3');") name = "testdb.alter_test_{}".format(engine) main_node.query("ALTER TABLE {} ADD COLUMN Added3 UInt32;".format(name)) @@ -113,6 +114,7 @@ def test_alters_from_different_replicas(started_cluster): assert_create_query([main_node, competing_node], "testdb.concurrent_test", expected) +@pytest.mark.dependency(depends=['test_alters_from_different_replicas']) def test_drop_and_create_table(started_cluster): main_node.query("DROP TABLE testdb.concurrent_test") main_node.query("CREATE TABLE testdb.concurrent_test " @@ -125,6 +127,7 @@ def test_drop_and_create_table(started_cluster): assert_create_query([main_node, competing_node], "testdb.concurrent_test", expected) +@pytest.mark.dependency(depends=['test_drop_and_create_table']) def test_replica_restart(started_cluster): main_node.restart_clickhouse() @@ -134,14 +137,18 @@ def test_replica_restart(started_cluster): assert_create_query([main_node, competing_node], "testdb.concurrent_test", expected) + +@pytest.mark.dependency(depends=['test_create_replica_after_delay']) def test_snapshot_and_snapshot_recover(started_cluster): #FIXME bad test snapshotting_node.query("CREATE DATABASE testdb ENGINE = Replicated('/clickhouse/databases/test1', 'shard1', 'replica4');") time.sleep(5) snapshot_recovering_node.query("CREATE DATABASE testdb ENGINE = Replicated('/clickhouse/databases/test1', 'shard1', 'replica5');") time.sleep(5) - assert snapshotting_node.query("desc table testdb.alter_test") == snapshot_recovering_node.query("desc table testdb.alter_test") + assert snapshotting_node.query("desc table testdb.alter_test_MergeTree") == snapshot_recovering_node.query("desc table testdb.alter_test_MergeTree") + assert snapshotting_node.query("desc table testdb.alter_test_ReplicatedMergeTree") == snapshot_recovering_node.query("desc table testdb.alter_test_ReplicatedMergeTree") +@pytest.mark.dependency(depends=['test_replica_restart']) def test_drop_and_create_replica(started_cluster): main_node.query("DROP DATABASE testdb") 
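    # Create the database again as replica 'replica1' of 'shard1' on the same ZooKeeper path ('/clickhouse/databases/test1').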
main_node.query("CREATE DATABASE testdb ENGINE = Replicated('/clickhouse/databases/test1', 'shard1', 'replica1');") From ab197a49c82db8c9e4aae3984a8da91a0e120728 Mon Sep 17 00:00:00 2001 From: Alexander Tokmakov Date: Sun, 29 Nov 2020 14:45:32 +0300 Subject: [PATCH 053/381] better code, fixes --- src/Databases/DatabaseAtomic.cpp | 72 +++----- src/Databases/DatabaseReplicated.cpp | 160 +++++++++--------- src/Databases/DatabaseReplicated.h | 31 ++-- src/Databases/DatabaseReplicatedWorker.cpp | 20 +-- src/Databases/ya.make | 1 + src/Interpreters/DDLTask.cpp | 43 ++--- src/Interpreters/DDLTask.h | 32 +--- src/Interpreters/DDLWorker.cpp | 59 ++++--- src/Interpreters/DDLWorker.h | 5 +- src/Interpreters/InterpreterCreateQuery.cpp | 2 +- src/Interpreters/executeDDLQueryOnCluster.cpp | 12 +- src/Interpreters/executeDDLQueryOnCluster.h | 1 + .../test_replicated_database/test.py | 9 +- 13 files changed, 194 insertions(+), 253 deletions(-) diff --git a/src/Databases/DatabaseAtomic.cpp b/src/Databases/DatabaseAtomic.cpp index a444d9cc200..b60adf44e51 100644 --- a/src/Databases/DatabaseAtomic.cpp +++ b/src/Databases/DatabaseAtomic.cpp @@ -120,13 +120,10 @@ void DatabaseAtomic::dropTable(const Context & context, const String & table_nam table_metadata_path_drop = DatabaseCatalog::instance().getPathForDroppedMetadata(table->getStorageID()); if (auto txn = context.getMetadataTransaction()) - { - String metadata_zk_path = txn->zookeeper_path + "/metadata/" + escapeForFileName(table_name); - txn->ops.emplace_back(zkutil::makeRemoveRequest(metadata_zk_path, -1)); - txn->current_zookeeper->multi(txn->ops); /// Commit point (a sort of) for Replicated database - /// NOTE: replica will be lost if server crashes before the following rename - /// TODO better detection and recovery - } + txn->commit(); /// Commit point (a sort of) for Replicated database + + /// NOTE: replica will be lost if server crashes before the following rename + /// TODO better detection and recovery Poco::File(table_metadata_path).renameTo(table_metadata_path_drop); /// Mark table as dropped DatabaseWithDictionaries::detachTableUnlocked(table_name, lock); /// Should never throw @@ -245,31 +242,10 @@ void DatabaseAtomic::renameTable(const Context & context, const String & table_n /// Table renaming actually begins here if (auto txn = context.getMetadataTransaction()) - { - String statement; - String statement_to; - { - ReadBufferFromFile in(old_metadata_path, 4096); - readStringUntilEOF(statement, in); - if (exchange) - { - ReadBufferFromFile in_to(new_metadata_path, 4096); - readStringUntilEOF(statement_to, in_to); - } - } - String metadata_zk_path = txn->zookeeper_path + "/metadata/" + escapeForFileName(table_name); - String metadata_zk_path_to = txn->zookeeper_path + "/metadata/" + escapeForFileName(to_table_name); - txn->ops.emplace_back(zkutil::makeRemoveRequest(metadata_zk_path, -1)); - if (exchange) - { - txn->ops.emplace_back(zkutil::makeRemoveRequest(metadata_zk_path_to, -1)); - txn->ops.emplace_back(zkutil::makeCreateRequest(metadata_zk_path, statement_to, zkutil::CreateMode::Persistent)); - } - txn->ops.emplace_back(zkutil::makeCreateRequest(metadata_zk_path_to, statement, zkutil::CreateMode::Persistent)); - txn->current_zookeeper->multi(txn->ops); /// Commit point (a sort of) for Replicated database - /// NOTE: replica will be lost if server crashes before the following rename - /// TODO better detection and recovery - } + txn->commit(); /// Commit point (a sort of) for Replicated database + + /// NOTE: replica will be lost if 
server crashes before the following rename + /// TODO better detection and recovery if (exchange) renameExchange(old_metadata_path, new_metadata_path); @@ -326,15 +302,10 @@ void DatabaseAtomic::commitCreateTable(const ASTCreateQuery & query, const Stora locked_uuid = true; if (auto txn = query_context.getMetadataTransaction()) - { - String metadata_zk_path = txn->zookeeper_path + "/metadata/" + escapeForFileName(query.table); - String statement = getObjectDefinitionFromCreateQuery(query.clone()); - /// zk::multi(...) will throw if `metadata_zk_path` exists - txn->ops.emplace_back(zkutil::makeCreateRequest(metadata_zk_path, statement, zkutil::CreateMode::Persistent)); - txn->current_zookeeper->multi(txn->ops); /// Commit point (a sort of) for Replicated database - /// NOTE: replica will be lost if server crashes before the following renameNoReplace(...) - /// TODO better detection and recovery - } + txn->commit(); /// Commit point (a sort of) for Replicated database + + /// NOTE: replica will be lost if server crashes before the following renameNoReplace(...) + /// TODO better detection and recovery /// It throws if `table_metadata_path` already exists (it's possible if table was detached) renameNoReplace(table_metadata_tmp_path, table_metadata_path); /// Commit point (a sort of) @@ -352,7 +323,8 @@ void DatabaseAtomic::commitCreateTable(const ASTCreateQuery & query, const Stora tryCreateSymlink(query.table, table_data_path); } -void DatabaseAtomic::commitAlterTable(const StorageID & table_id, const String & table_metadata_tmp_path, const String & table_metadata_path, const String & statement, const Context & query_context) +void DatabaseAtomic::commitAlterTable(const StorageID & table_id, const String & table_metadata_tmp_path, const String & table_metadata_path, + const String & /*statement*/, const Context & query_context) { bool check_file_exists = true; SCOPE_EXIT({ std::error_code code; if (check_file_exists) std::filesystem::remove(table_metadata_tmp_path, code); }); @@ -363,17 +335,11 @@ void DatabaseAtomic::commitAlterTable(const StorageID & table_id, const String & if (table_id.uuid != actual_table_id.uuid) throw Exception("Cannot alter table because it was renamed", ErrorCodes::CANNOT_ASSIGN_ALTER); - if (&query_context != &query_context.getGlobalContext()) // FIXME - { - if (auto txn = query_context.getMetadataTransaction()) - { - String metadata_zk_path = txn->zookeeper_path + "/metadata/" + escapeForFileName(table_id.table_name); - txn->ops.emplace_back(zkutil::makeSetRequest(metadata_zk_path, statement, -1)); - txn->current_zookeeper->multi(txn->ops); /// Commit point (a sort of) for Replicated database - /// NOTE: replica will be lost if server crashes before the following rename - /// TODO better detection and recovery - } - } + if (auto txn = query_context.getMetadataTransaction()) + txn->commit(); /// Commit point (a sort of) for Replicated database + + /// NOTE: replica will be lost if server crashes before the following rename + /// TODO better detection and recovery check_file_exists = renameExchangeIfSupported(table_metadata_tmp_path, table_metadata_path); if (!check_file_exists) diff --git a/src/Databases/DatabaseReplicated.cpp b/src/Databases/DatabaseReplicated.cpp index eef1b98afe2..418eaf567a4 100644 --- a/src/Databases/DatabaseReplicated.cpp +++ b/src/Databases/DatabaseReplicated.cpp @@ -34,6 +34,7 @@ namespace ErrorCodes extern const int REPLICA_IS_ALREADY_EXIST; extern const int DATABASE_REPLICATION_FAILED; extern const int UNKNOWN_DATABASE; + extern const int 
NOT_IMPLEMENTED; } zkutil::ZooKeeperPtr DatabaseReplicated::getZooKeeper() const @@ -106,9 +107,6 @@ DatabaseReplicated::DatabaseReplicated( /// Throws if replica with the same name was created concurrently createReplicaNodesInZooKeeper(current_zookeeper); } - - snapshot_period = 1; //context_.getConfigRef().getInt("database_replicated_snapshot_period", 10); - LOG_DEBUG(log, "Snapshot period is set to {} log entries per one snapshot", snapshot_period); } bool DatabaseReplicated::createDatabaseNodesInZooKeeper(const zkutil::ZooKeeperPtr & current_zookeeper) @@ -171,8 +169,6 @@ void DatabaseReplicated::loadStoredObjects(Context & context, bool has_force_res { DatabaseAtomic::loadStoredObjects(context, has_force_restore_data_flag, force_attach); - //recoverLostReplica(global_context.getZooKeeper(), 0, true); //FIXME - ddl_worker = std::make_unique(this, global_context); } @@ -209,71 +205,6 @@ void DatabaseReplicated::onUnexpectedLogEntry(const String & entry_name, const Z "Got log entry '{}' when expected entry number {}"); } -void DatabaseReplicated::removeOutdatedSnapshotsAndLog() -{ - /// This method removes all snapshots and logged queries - /// that no longer will be in use by current replicas or - /// new coming ones. - /// Each registered replica has its state in ZooKeeper. - /// Therefore, snapshots and logged queries that are less - /// than a least advanced replica are removed. - /// It does not interfere with a new coming replica - /// metadata loading from snapshot - /// because the replica will use the latest snapshot available - /// and this snapshot will set the last executed log query - /// to a greater one than the least advanced current replica. - auto current_zookeeper = getZooKeeper(); - Strings replica_states = current_zookeeper->getChildren(zookeeper_path + "/replicas"); - //TODO do not use log pointers to determine which entries to remove if there are staled pointers. - // We can just remove all entries older than previous snapshot version. - // Possible invariant: store all entries since last snapshot, replica becomes lost when it cannot get log entry. 
- auto least_advanced = std::min_element(replica_states.begin(), replica_states.end()); - Strings snapshots = current_zookeeper->getChildren(zookeeper_path + "/snapshots"); - - if (snapshots.size() < 2) - { - return; - } - - std::sort(snapshots.begin(), snapshots.end()); - auto still_useful = std::lower_bound(snapshots.begin(), snapshots.end(), *least_advanced); - snapshots.erase(still_useful, snapshots.end()); - for (const String & snapshot : snapshots) - { - current_zookeeper->tryRemoveRecursive(zookeeper_path + "/snapshots/" + snapshot); - } - - Strings log_entry_names = current_zookeeper->getChildren(zookeeper_path + "/log"); - std::sort(log_entry_names.begin(), log_entry_names.end()); - auto still_useful_log = std::upper_bound(log_entry_names.begin(), log_entry_names.end(), *still_useful); - log_entry_names.erase(still_useful_log, log_entry_names.end()); - for (const String & log_entry_name : log_entry_names) - { - String log_entry_path = zookeeper_path + "/log/" + log_entry_name; - current_zookeeper->tryRemove(log_entry_path); - } -} - -void DatabaseReplicated::onExecutedLogEntry(const String & /*entry_name*/, const ZooKeeperPtr & /*zookeeper*/) -{ - -} - -void DatabaseReplicated::writeLastExecutedToDiskAndZK() -{ - auto current_zookeeper = getZooKeeper(); - current_zookeeper->createOrUpdate( - zookeeper_path + "/replicas/" + replica_name, last_executed_log_entry, zkutil::CreateMode::Persistent); - - String metadata_file = getMetadataPath() + ".last_entry"; - WriteBufferFromFile out(metadata_file, last_executed_log_entry.size(), O_WRONLY | O_CREAT); - writeString(last_executed_log_entry, out); - out.next(); - if (global_context.getSettingsRef().fsync_metadata) - out.sync(); - out.close(); -} - BlockIO DatabaseReplicated::propose(const ASTPtr & query) { @@ -302,14 +233,14 @@ BlockIO DatabaseReplicated::propose(const ASTPtr & query) //FIXME need list of all replicas, we can obtain it from zk Strings hosts_to_wait; - hosts_to_wait.emplace_back(shard_name + '|' +replica_name); - auto stream = std::make_shared(node_path, entry, global_context); + hosts_to_wait.emplace_back(getFullReplicaName()); + auto stream = std::make_shared(node_path, entry, global_context, hosts_to_wait); io.in = std::move(stream); return io; } -void DatabaseReplicated::recoverLostReplica(const ZooKeeperPtr & current_zookeeper, UInt32 from_snapshot, bool /*create*/) +void DatabaseReplicated::recoverLostReplica(const ZooKeeperPtr & current_zookeeper, UInt32 from_snapshot) { LOG_WARNING(log, "Will recover replica"); @@ -339,14 +270,7 @@ void DatabaseReplicated::recoverLostReplica(const ZooKeeperPtr & current_zookeep InterpreterCreateQuery(query_ast, query_context).execute(); } - //if (create) - // return; - current_zookeeper->set(replica_path + "/log_ptr", toString(from_snapshot)); - last_executed_log_entry = from_snapshot; - //ddl_worker->setLogPointer(from_snapshot); //FIXME - - //writeLastExecutedToDiskAndZK(); } ASTPtr DatabaseReplicated::parseQueryFromMetadataInZooKeeper(const String & node_name, const String & query) @@ -384,4 +308,80 @@ void DatabaseReplicated::shutdown() DatabaseAtomic::shutdown(); } + +void DatabaseReplicated::dropTable(const Context & context, const String & table_name, bool no_delay) +{ + auto txn = context.getMetadataTransaction(); + //assert(!ddl_worker->isCurrentlyActive() || txn /*|| called from DROP DATABASE */); + if (txn && txn->is_initial_query) + { + String metadata_zk_path = zookeeper_path + "/metadata/" + escapeForFileName(table_name); + 
txn->ops.emplace_back(zkutil::makeRemoveRequest(metadata_zk_path, -1)); + } + DatabaseAtomic::dropTable(context, table_name, no_delay); +} + +void DatabaseReplicated::renameTable(const Context & context, const String & table_name, IDatabase & to_database, + const String & to_table_name, bool exchange, bool dictionary) +{ + auto txn = context.getMetadataTransaction(); + assert(txn); + + if (txn->is_initial_query) + { + String statement; + String statement_to; + { + //FIXME It's not atomic (however we have only one thread) + ReadBufferFromFile in(getObjectMetadataPath(table_name), 4096); + readStringUntilEOF(statement, in); + if (exchange) + { + ReadBufferFromFile in_to(to_database.getObjectMetadataPath(to_table_name), 4096); + readStringUntilEOF(statement_to, in_to); + } + } + String metadata_zk_path = txn->zookeeper_path + "/metadata/" + escapeForFileName(table_name); + String metadata_zk_path_to = txn->zookeeper_path + "/metadata/" + escapeForFileName(to_table_name); + txn->ops.emplace_back(zkutil::makeRemoveRequest(metadata_zk_path, -1)); + if (exchange) + { + txn->ops.emplace_back(zkutil::makeRemoveRequest(metadata_zk_path_to, -1)); + txn->ops.emplace_back(zkutil::makeCreateRequest(metadata_zk_path, statement_to, zkutil::CreateMode::Persistent)); + } + txn->ops.emplace_back(zkutil::makeCreateRequest(metadata_zk_path_to, statement, zkutil::CreateMode::Persistent)); + } + + DatabaseAtomic::renameTable(context, table_name, to_database, to_table_name, exchange, dictionary); +} + +void DatabaseReplicated::commitCreateTable(const ASTCreateQuery & query, const StoragePtr & table, + const String & table_metadata_tmp_path, const String & table_metadata_path, + const Context & query_context) +{ + auto txn = query_context.getMetadataTransaction(); + assert(!ddl_worker->isCurrentlyActive() || txn); + if (txn && txn->is_initial_query) + { + String metadata_zk_path = txn->zookeeper_path + "/metadata/" + escapeForFileName(query.table); + String statement = getObjectDefinitionFromCreateQuery(query.clone()); + /// zk::multi(...) 
will throw if `metadata_zk_path` exists + txn->ops.emplace_back(zkutil::makeCreateRequest(metadata_zk_path, statement, zkutil::CreateMode::Persistent)); + } + DatabaseAtomic::commitCreateTable(query, table, table_metadata_tmp_path, table_metadata_path, query_context); +} + +void DatabaseReplicated::commitAlterTable(const StorageID & table_id, + const String & table_metadata_tmp_path, const String & table_metadata_path, + const String & statement, const Context & query_context) +{ + auto txn = query_context.getMetadataTransaction(); + if (txn && txn->is_initial_query) + { + String metadata_zk_path = txn->zookeeper_path + "/metadata/" + escapeForFileName(table_id.table_name); + txn->ops.emplace_back(zkutil::makeSetRequest(metadata_zk_path, statement, -1)); + } + DatabaseAtomic::commitAlterTable(table_id, table_metadata_tmp_path, table_metadata_path, statement, query_context); +} + } diff --git a/src/Databases/DatabaseReplicated.h b/src/Databases/DatabaseReplicated.h index d6cd93773cf..8085c234af4 100644 --- a/src/Databases/DatabaseReplicated.h +++ b/src/Databases/DatabaseReplicated.h @@ -46,6 +46,16 @@ public: ~DatabaseReplicated() override; + void dropTable(const Context &, const String & table_name, bool no_delay) override; + void renameTable(const Context & context, const String & table_name, IDatabase & to_database, + const String & to_table_name, bool exchange, bool dictionary) override; + void commitCreateTable(const ASTCreateQuery & query, const StoragePtr & table, + const String & table_metadata_tmp_path, const String & table_metadata_path, + const Context & query_context) override; + void commitAlterTable(const StorageID & table_id, + const String & table_metadata_tmp_path, const String & table_metadata_path, + const String & statement, const Context & query_context) override; + void drop(const Context & /*context*/) override; String getEngineName() const override { return "Replicated"; } @@ -65,17 +75,8 @@ private: bool createDatabaseNodesInZooKeeper(const ZooKeeperPtr & current_zookeeper); void createReplicaNodesInZooKeeper(const ZooKeeperPtr & current_zookeeper); - //void runBackgroundLogExecutor(); - void writeLastExecutedToDiskAndZK(); - - //void loadMetadataFromSnapshot(); - void removeOutdatedSnapshotsAndLog(); - - void onUnexpectedLogEntry(const String & entry_name, const ZooKeeperPtr & zookeeper); - void recoverLostReplica(const ZooKeeperPtr & current_zookeeper, UInt32 from_snapshot, bool create = false); - - void onExecutedLogEntry(const String & entry_name, const ZooKeeperPtr & zookeeper); + void recoverLostReplica(const ZooKeeperPtr & current_zookeeper, UInt32 from_snapshot); ASTPtr parseQueryFromMetadataInZooKeeper(const String & node_name, const String & query); @@ -86,19 +87,9 @@ private: UInt32 log_entry_to_execute; - std::mutex log_name_mutex; - String log_name_to_exec_with_result; - - int snapshot_period; - - String last_executed_log_entry = ""; - zkutil::ZooKeeperPtr getZooKeeper() const; std::unique_ptr ddl_worker; - - - }; } diff --git a/src/Databases/DatabaseReplicatedWorker.cpp b/src/Databases/DatabaseReplicatedWorker.cpp index 869b888d3ad..29599d4d66d 100644 --- a/src/Databases/DatabaseReplicatedWorker.cpp +++ b/src/Databases/DatabaseReplicatedWorker.cpp @@ -96,19 +96,19 @@ DDLTaskPtr DatabaseReplicatedDDLWorker::initAndCheckTask(const String & entry_na throw Exception(ErrorCodes::LOGICAL_ERROR, "should be unreachable"); } - auto error = task->tryParseEntry(node_data); - if (error) - { - LOG_ERROR(log, "Cannot parse query from '{}': {}", node_data, *error); - 
database->onUnexpectedLogEntry(entry_name, zookeeper); - throw Exception(ErrorCodes::LOGICAL_ERROR, "should be unreachable"); - } + task->entry.parse(node_data); - task->parseQueryFromEntry(context); + if (task->entry.query.empty()) + { + //TODO better way to determine special entries + task->was_executed = true; + } + else + { + task->parseQueryFromEntry(context); + } return task; } - - } diff --git a/src/Databases/ya.make b/src/Databases/ya.make index 09d3dc38cb2..38f79532080 100644 --- a/src/Databases/ya.make +++ b/src/Databases/ya.make @@ -17,6 +17,7 @@ SRCS( DatabaseOnDisk.cpp DatabaseOrdinary.cpp DatabaseReplicated.cpp + DatabaseReplicatedWorker.cpp DatabaseWithDictionaries.cpp DatabasesCommon.cpp MySQL/ConnectionMySQLSettings.cpp diff --git a/src/Interpreters/DDLTask.cpp b/src/Interpreters/DDLTask.cpp index 0bc98dfd0dd..9ef7352ceb4 100644 --- a/src/Interpreters/DDLTask.cpp +++ b/src/Interpreters/DDLTask.cpp @@ -86,20 +86,6 @@ void DDLLogEntry::parse(const String & data) } -std::optional DDLTaskBase::tryParseEntry(const String & data) -{ - std::optional error; - try - { - entry.parse(data); - } - catch (...) - { - error = ExecutionStatus::fromCurrentException().serializeText(); - } - return error; -} - void DDLTaskBase::parseQueryFromEntry(const Context & context) { const char * begin = entry.query.data(); @@ -313,22 +299,25 @@ std::unique_ptr DatabaseReplicatedTask::makeQueryContext(Context & from query_context->getClientInfo().query_kind = ClientInfo::QueryKind::REPLICATED_LOG_QUERY; //FIXME why do we need separate query kind? query_context->setCurrentDatabase(database->getDatabaseName()); + auto txn = std::make_shared(); + query_context->initMetadataTransaction(txn); + txn->current_zookeeper = from_context.getZooKeeper(); + txn->zookeeper_path = database->zookeeper_path; + txn->is_initial_query = we_are_initiator; + if (we_are_initiator) { - auto txn = std::make_shared(); - query_context->initMetadataTransaction(txn); - txn->current_zookeeper = from_context.getZooKeeper(); - txn->zookeeper_path = database->zookeeper_path; txn->ops.emplace_back(zkutil::makeRemoveRequest(entry_path + "/try", -1)); txn->ops.emplace_back(zkutil::makeCreateRequest(entry_path + "/committed", host_id_str, zkutil::CreateMode::Persistent)); txn->ops.emplace_back(zkutil::makeRemoveRequest(getActiveNodePath(), -1)); - if (execute_on_leader) - txn->ops.emplace_back(zkutil::makeCreateRequest(getShardNodePath() + "/executed", host_id_str, zkutil::CreateMode::Persistent)); - txn->ops.emplace_back(zkutil::makeCreateRequest(getFinishedNodePath(), execution_status.serializeText(), zkutil::CreateMode::Persistent)); - txn->ops.emplace_back(zkutil::makeSetRequest(database->replica_path + "/log_ptr", toString(getLogEntryNumber(entry_name)), -1)); txn->ops.emplace_back(zkutil::makeSetRequest(database->zookeeper_path + "/max_log_ptr", toString(getLogEntryNumber(entry_name)), -1)); } + if (execute_on_leader) + txn->ops.emplace_back(zkutil::makeCreateRequest(getShardNodePath() + "/executed", host_id_str, zkutil::CreateMode::Persistent)); + txn->ops.emplace_back(zkutil::makeCreateRequest(getFinishedNodePath(), execution_status.serializeText(), zkutil::CreateMode::Persistent)); + txn->ops.emplace_back(zkutil::makeSetRequest(database->replica_path + "/log_ptr", toString(getLogEntryNumber(entry_name)), -1)); + return query_context; } @@ -347,15 +336,9 @@ UInt32 DatabaseReplicatedTask::getLogEntryNumber(const String & log_entry_name) return parse(log_entry_name.substr(strlen(name))); } -void 
DatabaseReplicatedTask::parseQueryFromEntry(const Context & context) +void MetadataTransaction::commit() { - if (entry.query.empty()) - { - was_executed = true; - return; - } - - DDLTaskBase::parseQueryFromEntry(context); + current_zookeeper->multi(ops); } } diff --git a/src/Interpreters/DDLTask.h b/src/Interpreters/DDLTask.h index 19d92a1bc78..2db1a696384 100644 --- a/src/Interpreters/DDLTask.h +++ b/src/Interpreters/DDLTask.h @@ -64,12 +64,6 @@ struct DDLTaskBase const String entry_name; const String entry_path; - DDLTaskBase(const String & name, const String & path) : entry_name(name), entry_path(path) {} - virtual ~DDLTaskBase() = default; - - std::optional tryParseEntry(const String & data); - virtual void parseQueryFromEntry(const Context & context); - DDLLogEntry entry; String host_id_str; @@ -81,6 +75,11 @@ struct DDLTaskBase ExecutionStatus execution_status; bool was_executed = false; + DDLTaskBase(const String & name, const String & path) : entry_name(name), entry_path(path) {} + virtual ~DDLTaskBase() = default; + + void parseQueryFromEntry(const Context & context); + virtual String getShardID() const = 0; virtual std::unique_ptr makeQueryContext(Context & from_context) const; @@ -93,26 +92,12 @@ struct DDLTaskBase struct DDLTask : public DDLTaskBase { - /// Stages of task lifetime correspond ordering of these data fields: - DDLTask(const String & name, const String & path) : DDLTaskBase(name, path) {} bool findCurrentHostID(const Context & global_context, Poco::Logger * log); void setClusterInfo(const Context & context, Poco::Logger * log); - - /// Stage 2: resolve host_id and check that - - - /// Stage 3.1: parse query - - /// Stage 3.2: check cluster and find the host in cluster - - /// Stage 3.3: execute query - - /// Stage 4: commit results to ZooKeeper - String getShardID() const override; private: @@ -131,8 +116,6 @@ struct DatabaseReplicatedTask : public DDLTaskBase { DatabaseReplicatedTask(const String & name, const String & path, DatabaseReplicated * database_); - void parseQueryFromEntry(const Context & context) override; - String getShardID() const override; std::unique_ptr makeQueryContext(Context & from_context) const override; @@ -148,14 +131,15 @@ struct MetadataTransaction { ZooKeeperPtr current_zookeeper; String zookeeper_path; + bool is_initial_query; Coordination::Requests ops; - - void addOps(Coordination::Requests & other_ops) { std::move(ops.begin(), ops.end(), std::back_inserter(other_ops)); } + + void commit(); }; } diff --git a/src/Interpreters/DDLWorker.cpp b/src/Interpreters/DDLWorker.cpp index 0399687a4d8..12f4c42b467 100644 --- a/src/Interpreters/DDLWorker.cpp +++ b/src/Interpreters/DDLWorker.cpp @@ -36,11 +36,8 @@ namespace ErrorCodes { extern const int NOT_IMPLEMENTED; extern const int LOGICAL_ERROR; - extern const int INCONSISTENT_CLUSTER_DEFINITION; extern const int TIMEOUT_EXCEEDED; - extern const int UNKNOWN_TYPE_OF_QUERY; extern const int UNFINISHED; - extern const int QUERY_IS_PROHIBITED; } @@ -226,7 +223,6 @@ void DDLWorker::recoverZooKeeper() } } - DDLTaskPtr DDLWorker::initAndCheckTask(const String & entry_name, String & out_reason, const ZooKeeperPtr & zookeeper) { String node_data; @@ -241,36 +237,50 @@ DDLTaskPtr DDLWorker::initAndCheckTask(const String & entry_name, String & out_r return {}; } - auto error = task->tryParseEntry(node_data); - if (error) + auto write_error_status = [&](const String & host_id, const String & error_message, const String & reason) + { + LOG_ERROR(log, "Cannot parse DDL task {}: {}. 
Will try to send error status: {}", entry_name, reason, error_message); + createStatusDirs(entry_path, zookeeper); + zookeeper->tryCreate(entry_path + "/finished/" + host_id, error_message, zkutil::CreateMode::Persistent); + }; + + try + { + /// Stage 1: parse entry + task->entry.parse(node_data); + } + catch (...) { /// What should we do if we even cannot parse host name and therefore cannot properly submit execution status? /// We can try to create fail node using FQDN if it equal to host name in cluster config attempt will be successful. /// Otherwise, that node will be ignored by DDLQueryStatusInputStream. - LOG_ERROR(log, "Cannot parse DDL task {}, will try to send error status: {}", entry_name, *error); - try - { - createStatusDirs(entry_path, zookeeper); - zookeeper->tryCreate(entry_path + "/finished/" + host_fqdn_id, *error, zkutil::CreateMode::Persistent); - } - catch (...) - { - tryLogCurrentException(log, "Can't report the task has invalid format"); - } - out_reason = "Incorrect task format"; + write_error_status(host_fqdn_id, ExecutionStatus::fromCurrentException().serializeText(), out_reason); return {}; } + /// Stage 2: resolve host_id and check if we should execute query or not if (!task->findCurrentHostID(context, log)) { out_reason = "There is no a local address in host list"; return {}; } - task->parseQueryFromEntry(context); - task->setClusterInfo(context, log); + try + { + /// Stage 3.1: parse query + task->parseQueryFromEntry(context); + /// Stage 3.2: check cluster and find the host in cluster + task->setClusterInfo(context, log); + } + catch (...) + { + out_reason = "Cannot parse query or obtain cluster info"; + write_error_status(task->host_id_str, ExecutionStatus::fromCurrentException().serializeText(), out_reason); + return {}; + } + /// Now task is ready for execution return task; } @@ -330,7 +340,8 @@ void DDLWorker::scheduleTasks() } else { - worker_pool.scheduleOrThrowOnError([this, task_ptr = task.release()]() { + worker_pool.scheduleOrThrowOnError([this, task_ptr = task.release()]() + { setThreadName("DDLWorkerExec"); enqueueTask(DDLTaskPtr(task_ptr)); }); @@ -345,13 +356,6 @@ void DDLWorker::scheduleTasks() } } -/// Parses query and resolves cluster and host in cluster -void DDLWorker::parseQueryAndResolveHost(DDLTaskBase & /*task*/) -{ - -} - - bool DDLWorker::tryExecuteQuery(const String & query, const DDLTaskBase & task, ExecutionStatus & status) { /// Add special comment at the start of query to easily identify DDL-produced queries in query_log @@ -792,7 +796,6 @@ void DDLWorker::runMainThread() setThreadName("DDLWorker"); LOG_DEBUG(log, "Started DDLWorker thread"); - bool initialized = false; do { try diff --git a/src/Interpreters/DDLWorker.h b/src/Interpreters/DDLWorker.h index 39087d05fbb..02076ae1df1 100644 --- a/src/Interpreters/DDLWorker.h +++ b/src/Interpreters/DDLWorker.h @@ -53,6 +53,8 @@ public: void shutdown(); + bool isCurrentlyActive() const { return initialized && !stop_flag; } + protected: /// Returns cached ZooKeeper session (possibly expired). 
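Taken together, the DDLWorker.cpp hunks above turn task initialization into a short, staged pipeline. A condensed, self-contained sketch of that shape (the types and callbacks here are hypothetical stand-ins, not the real ClickHouse signatures; ZooKeeper access and exception handling are reduced to a status callback):

    #include <functional>
    #include <memory>
    #include <string>

    struct ToyTask { std::string entry_name; };   // stand-in for DDLTask / DDLTaskBase

    using Step = std::function<bool(ToyTask &)>;                      // one validation stage
    using Report = std::function<void(const std::string & reason)>;   // publishes .../finished/<host> status

    std::unique_ptr<ToyTask> init_and_check_task(
        const std::string & entry_name, std::string & out_reason,
        const Step & parse_entry,             // Stage 1: parse the raw log entry
        const Step & find_current_host,       // Stage 2: is this server in the entry's host list?
        const Step & parse_query_and_cluster, // Stage 3: parse query text, resolve cluster info
        const Report & report_error)
    {
        auto task = std::make_unique<ToyTask>();
        task->entry_name = entry_name;

        if (!parse_entry(*task))
        {
            out_reason = "Incorrect task format";
            report_error(out_reason);   // still try to publish a failure status for the initiator
            return {};
        }
        if (!find_current_host(*task))
        {
            out_reason = "There is no local address in the host list";
            return {};                  // not an error: the entry is simply meant for other hosts
        }
        if (!parse_query_and_cluster(*task))
        {
            out_reason = "Cannot parse query or obtain cluster info";
            report_error(out_reason);
            return {};
        }
        return task;                    // ready for scheduleTasks()/processTask()
    }

The real worker then either runs the task inline when the pool has a single thread or hands it to the worker pool, as shown in the scheduleTasks() hunk.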
@@ -87,8 +89,6 @@ protected: const String & node_path, const ZooKeeperPtr & zookeeper); - void parseQueryAndResolveHost(DDLTaskBase & task); - bool tryExecuteQuery(const String & query, const DDLTaskBase & task, ExecutionStatus & status); /// Checks and cleanups queue's nodes @@ -121,6 +121,7 @@ protected: std::shared_ptr queue_updated_event = std::make_shared(); std::shared_ptr cleanup_event = std::make_shared(); + std::atomic initialized = false; std::atomic stop_flag = false; ThreadFromGlobalPool main_thread; diff --git a/src/Interpreters/InterpreterCreateQuery.cpp b/src/Interpreters/InterpreterCreateQuery.cpp index 8d695b29793..f79eb800b66 100644 --- a/src/Interpreters/InterpreterCreateQuery.cpp +++ b/src/Interpreters/InterpreterCreateQuery.cpp @@ -731,7 +731,7 @@ BlockIO InterpreterCreateQuery::createTable(ASTCreateQuery & create) //TODO make code better if possible bool need_add_to_database = !create.temporary; - if(need_add_to_database && database->getEngineName() == "Replicated") + if (need_add_to_database && database->getEngineName() == "Replicated") { auto guard = DatabaseCatalog::instance().getDDLGuard(create.database, create.table); database = DatabaseCatalog::instance().getDatabase(create.database); diff --git a/src/Interpreters/executeDDLQueryOnCluster.cpp b/src/Interpreters/executeDDLQueryOnCluster.cpp index 03065245766..24405a5be27 100644 --- a/src/Interpreters/executeDDLQueryOnCluster.cpp +++ b/src/Interpreters/executeDDLQueryOnCluster.cpp @@ -23,6 +23,7 @@ namespace ErrorCodes extern const int TIMEOUT_EXCEEDED; extern const int UNFINISHED; extern const int QUERY_IS_PROHIBITED; + extern const int LOGICAL_ERROR; } bool isSupportedAlterType(int type) @@ -189,6 +190,7 @@ DDLQueryStatusInputStream::DDLQueryStatusInputStream(const String & zk_node_path if (hosts_to_wait) { waiting_hosts = NameSet(hosts_to_wait->begin(), hosts_to_wait->end()); + by_hostname = false; } else { @@ -267,7 +269,15 @@ Block DDLQueryStatusInputStream::readImpl() status.tryDeserializeText(status_data); } - auto [host, port] = Cluster::Address::fromString(host_id); + //FIXME + String host = host_id; + UInt16 port = 0; + if (by_hostname) + { + auto host_and_port = Cluster::Address::fromString(host_id); + host = host_and_port.first; + port = host_and_port.second; + } if (status.code != 0 && first_exception == nullptr) first_exception = std::make_unique(status.code, "There was an error on [{}:{}]: {}", host, port, status.message); diff --git a/src/Interpreters/executeDDLQueryOnCluster.h b/src/Interpreters/executeDDLQueryOnCluster.h index 0f7a411ed92..f65abf33c4f 100644 --- a/src/Interpreters/executeDDLQueryOnCluster.h +++ b/src/Interpreters/executeDDLQueryOnCluster.h @@ -61,6 +61,7 @@ private: std::unique_ptr first_exception; Int64 timeout_seconds = 120; + bool by_hostname = true; }; } diff --git a/tests/integration/test_replicated_database/test.py b/tests/integration/test_replicated_database/test.py index 8c5a25b3fe7..f99f4517e5a 100644 --- a/tests/integration/test_replicated_database/test.py +++ b/tests/integration/test_replicated_database/test.py @@ -90,6 +90,7 @@ def test_create_replica_after_delay(started_cluster, engine): assert_create_query([main_node, dummy_node, competing_node], name, expected) +@pytest.mark.dependency(depends=['test_create_replica_after_delay']) def test_alters_from_different_replicas(started_cluster): main_node.query("CREATE TABLE testdb.concurrent_test " "(CounterID UInt32, StartDate Date, UserID UInt32, VisitID UInt32, NestedColumn Nested(A UInt8, S String), ToDrop UInt32) " @@ 
-138,13 +139,13 @@ def test_replica_restart(started_cluster): assert_create_query([main_node, competing_node], "testdb.concurrent_test", expected) -@pytest.mark.dependency(depends=['test_create_replica_after_delay']) +@pytest.mark.dependency(depends=['test_replica_restart']) def test_snapshot_and_snapshot_recover(started_cluster): - #FIXME bad test snapshotting_node.query("CREATE DATABASE testdb ENGINE = Replicated('/clickhouse/databases/test1', 'shard1', 'replica4');") - time.sleep(5) snapshot_recovering_node.query("CREATE DATABASE testdb ENGINE = Replicated('/clickhouse/databases/test1', 'shard1', 'replica5');") - time.sleep(5) + + assert_eq_with_retry(snapshotting_node, "select count() from system.tables where name like 'alter_test_%'", "2\n") + assert_eq_with_retry(snapshot_recovering_node, "select count() from system.tables where name like 'alter_test_%'", "2\n") assert snapshotting_node.query("desc table testdb.alter_test_MergeTree") == snapshot_recovering_node.query("desc table testdb.alter_test_MergeTree") assert snapshotting_node.query("desc table testdb.alter_test_ReplicatedMergeTree") == snapshot_recovering_node.query("desc table testdb.alter_test_ReplicatedMergeTree") From c955542dce00478321a424e05f0ef777dfcc00e2 Mon Sep 17 00:00:00 2001 From: Alexander Tokmakov Date: Mon, 30 Nov 2020 23:22:25 +0300 Subject: [PATCH 054/381] run functional tests with Replicated engine --- src/Interpreters/InterpreterCreateQuery.cpp | 10 +++++++++- src/Interpreters/executeDDLQueryOnCluster.cpp | 7 ++++++- 2 files changed, 15 insertions(+), 2 deletions(-) diff --git a/src/Interpreters/InterpreterCreateQuery.cpp b/src/Interpreters/InterpreterCreateQuery.cpp index f79eb800b66..0b7fb3e5431 100644 --- a/src/Interpreters/InterpreterCreateQuery.cpp +++ b/src/Interpreters/InterpreterCreateQuery.cpp @@ -132,7 +132,15 @@ BlockIO InterpreterCreateQuery::createDatabase(ASTCreateQuery & create) bool old_style_database = context.getSettingsRef().default_database_engine.value == DefaultDatabaseEngine::Ordinary; auto engine = std::make_shared(); auto storage = std::make_shared(); - engine->name = old_style_database ? 
"Ordinary" : "Atomic"; + //FIXME revert it before merge + engine->name = "Atomic"; + if (old_style_database) + { + engine = makeASTFunction("Replicated", + std::make_shared(fmt::format("/clickhouse/db/{}/", create.database)), + std::make_shared("s1"), + std::make_shared("r1")); + } storage->set(storage->engine, engine); create.set(create.storage, storage); } diff --git a/src/Interpreters/executeDDLQueryOnCluster.cpp b/src/Interpreters/executeDDLQueryOnCluster.cpp index 24405a5be27..0b44206a2b2 100644 --- a/src/Interpreters/executeDDLQueryOnCluster.cpp +++ b/src/Interpreters/executeDDLQueryOnCluster.cpp @@ -294,7 +294,12 @@ Block DDLQueryStatusInputStream::readImpl() res = sample.cloneWithColumns(std::move(columns)); } - return res; + //FIXME revert it before merge + bool is_functional_tests = !by_hostname && context.getSettingsRef().default_database_engine.value == DefaultDatabaseEngine::Ordinary; + if (is_functional_tests) + return {}; + else + return res; } Strings DDLQueryStatusInputStream::getChildrenAllowNoNode(const std::shared_ptr & zookeeper, const String & node_path) From 1a4bd67736df1fdaec41df52bb4ca9d6ea5c4f81 Mon Sep 17 00:00:00 2001 From: Alexander Tokmakov Date: Tue, 1 Dec 2020 20:20:42 +0300 Subject: [PATCH 055/381] fixes --- src/Common/ZooKeeper/TestKeeper.cpp | 8 ++++---- src/Databases/DatabaseReplicated.cpp | 1 + src/Interpreters/Context.cpp | 1 + src/Interpreters/DDLWorker.cpp | 16 +++++++++++++--- src/Interpreters/DDLWorker.h | 1 + src/Interpreters/InterpreterCreateQuery.cpp | 5 ++++- src/Interpreters/executeDDLQueryOnCluster.cpp | 4 ++++ 7 files changed, 28 insertions(+), 8 deletions(-) diff --git a/src/Common/ZooKeeper/TestKeeper.cpp b/src/Common/ZooKeeper/TestKeeper.cpp index 5f34a60c34e..2d89228c7ae 100644 --- a/src/Common/ZooKeeper/TestKeeper.cpp +++ b/src/Common/ZooKeeper/TestKeeper.cpp @@ -213,10 +213,11 @@ std::pair TestKeeperCreateRequest::process(TestKeeper::Contai created_node.is_sequental = is_sequential; std::string path_created = path; + ++it->second.seq_num; + if (is_sequential) { auto seq_num = it->second.seq_num; - ++it->second.seq_num; std::stringstream seq_num_str; // STYLE_CHECK_ALLOW_STD_STRING_STREAM seq_num_str.exceptions(std::ios::failbit); @@ -228,15 +229,14 @@ std::pair TestKeeperCreateRequest::process(TestKeeper::Contai response.path_created = path_created; container.emplace(path_created, std::move(created_node)); - undo = [&container, path_created, is_sequential = is_sequential, parent_path = it->first] + undo = [&container, path_created, parent_path = it->first] { container.erase(path_created); auto & undo_parent = container.at(parent_path); --undo_parent.stat.cversion; --undo_parent.stat.numChildren; - if (is_sequential) - --undo_parent.seq_num; + --undo_parent.seq_num; }; ++it->second.stat.cversion; diff --git a/src/Databases/DatabaseReplicated.cpp b/src/Databases/DatabaseReplicated.cpp index 418eaf567a4..a7e6c11ca4c 100644 --- a/src/Databases/DatabaseReplicated.cpp +++ b/src/Databases/DatabaseReplicated.cpp @@ -170,6 +170,7 @@ void DatabaseReplicated::loadStoredObjects(Context & context, bool has_force_res DatabaseAtomic::loadStoredObjects(context, has_force_restore_data_flag, force_attach); ddl_worker = std::make_unique(this, global_context); + ddl_worker->startup(); } void DatabaseReplicated::onUnexpectedLogEntry(const String & entry_name, const ZooKeeperPtr & zookeeper) diff --git a/src/Interpreters/Context.cpp b/src/Interpreters/Context.cpp index 27deb07d296..ef19c134854 100644 --- a/src/Interpreters/Context.cpp +++ 
b/src/Interpreters/Context.cpp @@ -1487,6 +1487,7 @@ void Context::setDDLWorker(std::unique_ptr ddl_worker) auto lock = getLock(); if (shared->ddl_worker) throw Exception("DDL background thread has already been initialized", ErrorCodes::LOGICAL_ERROR); + ddl_worker->startup(); shared->ddl_worker = std::move(ddl_worker); } diff --git a/src/Interpreters/DDLWorker.cpp b/src/Interpreters/DDLWorker.cpp index 12f4c42b467..188d38b8647 100644 --- a/src/Interpreters/DDLWorker.cpp +++ b/src/Interpreters/DDLWorker.cpp @@ -167,7 +167,10 @@ DDLWorker::DDLWorker(int pool_size_, const std::string & zk_root_dir, const Cont host_fqdn = getFQDNOrHostName(); host_fqdn_id = Cluster::Address::toString(host_fqdn, context.getTCPPort()); +} +void DDLWorker::startup() +{ main_thread = ThreadFromGlobalPool(&DDLWorker::runMainThread, this); cleanup_thread = ThreadFromGlobalPool(&DDLWorker::runCleanupThread, this); } @@ -183,8 +186,10 @@ DDLWorker::~DDLWorker() { shutdown(); worker_pool.wait(); - main_thread.join(); - cleanup_thread.join(); + if (main_thread.joinable()) + main_thread.join(); + if (cleanup_thread.joinable()) + cleanup_thread.join(); } @@ -421,7 +426,12 @@ void DDLWorker::enqueueTask(DDLTaskPtr task_ptr) else if (e.code == Coordination::Error::ZNONODE) { LOG_ERROR(log, "ZooKeeper error: {}", getCurrentExceptionMessage(true)); - // TODO: retry? + if (!current_zookeeper->exists(task_ptr->entry_path)) + { + //FIXME race condition with cleanup thread + LOG_ERROR(log, "Task {} is lost. It probably was removed by other server.", task_ptr->entry_path); + return; + } } else { diff --git a/src/Interpreters/DDLWorker.h b/src/Interpreters/DDLWorker.h index 02076ae1df1..f41ca0fce8f 100644 --- a/src/Interpreters/DDLWorker.h +++ b/src/Interpreters/DDLWorker.h @@ -51,6 +51,7 @@ public: return host_fqdn_id; } + void startup(); void shutdown(); bool isCurrentlyActive() const { return initialized && !stop_flag; } diff --git a/src/Interpreters/InterpreterCreateQuery.cpp b/src/Interpreters/InterpreterCreateQuery.cpp index 0b7fb3e5431..f201e38be2e 100644 --- a/src/Interpreters/InterpreterCreateQuery.cpp +++ b/src/Interpreters/InterpreterCreateQuery.cpp @@ -136,7 +136,10 @@ BlockIO InterpreterCreateQuery::createDatabase(ASTCreateQuery & create) engine->name = "Atomic"; if (old_style_database) { - engine = makeASTFunction("Replicated", + if (database_name == "test") + engine->name = "Ordinary"; // for stateful tests + else + engine = makeASTFunction("Replicated", std::make_shared(fmt::format("/clickhouse/db/{}/", create.database)), std::make_shared("s1"), std::make_shared("r1")); diff --git a/src/Interpreters/executeDDLQueryOnCluster.cpp b/src/Interpreters/executeDDLQueryOnCluster.cpp index 0b44206a2b2..2ca07349cbc 100644 --- a/src/Interpreters/executeDDLQueryOnCluster.cpp +++ b/src/Interpreters/executeDDLQueryOnCluster.cpp @@ -201,6 +201,10 @@ DDLQueryStatusInputStream::DDLQueryStatusInputStream(const String & zk_node_path addTotalRowsApprox(waiting_hosts.size()); timeout_seconds = context.getSettingsRef().distributed_ddl_task_timeout; + + //FIXME revert it before merge + if (context.getSettingsRef().default_database_engine.value == DefaultDatabaseEngine::Ordinary) + timeout_seconds = 10; } Block DDLQueryStatusInputStream::readImpl() From 39532f7d9e47204a499ffa9200b91eaae9763aae Mon Sep 17 00:00:00 2001 From: Alexander Tokmakov Date: Thu, 3 Dec 2020 21:14:27 +0300 Subject: [PATCH 056/381] slightly better DDLWorker initialization and restarting --- src/Common/ZooKeeper/TestKeeper.cpp | 4 +- src/Databases/DatabaseAtomic.cpp | 
3 - src/Databases/DatabaseReplicatedWorker.cpp | 32 +++- src/Databases/DatabaseReplicatedWorker.h | 3 +- src/Interpreters/DDLTask.h | 2 + src/Interpreters/DDLWorker.cpp | 187 ++++++++------------- src/Interpreters/DDLWorker.h | 15 +- 7 files changed, 114 insertions(+), 132 deletions(-) diff --git a/src/Common/ZooKeeper/TestKeeper.cpp b/src/Common/ZooKeeper/TestKeeper.cpp index 2d89228c7ae..86387417a3c 100644 --- a/src/Common/ZooKeeper/TestKeeper.cpp +++ b/src/Common/ZooKeeper/TestKeeper.cpp @@ -213,8 +213,6 @@ std::pair TestKeeperCreateRequest::process(TestKeeper::Contai created_node.is_sequental = is_sequential; std::string path_created = path; - ++it->second.seq_num; - if (is_sequential) { auto seq_num = it->second.seq_num; @@ -226,6 +224,8 @@ std::pair TestKeeperCreateRequest::process(TestKeeper::Contai path_created += seq_num_str.str(); } + ++it->second.seq_num; + response.path_created = path_created; container.emplace(path_created, std::move(created_node)); diff --git a/src/Databases/DatabaseAtomic.cpp b/src/Databases/DatabaseAtomic.cpp index b60adf44e51..438fa2d97bd 100644 --- a/src/Databases/DatabaseAtomic.cpp +++ b/src/Databases/DatabaseAtomic.cpp @@ -11,10 +11,7 @@ #include #include #include - -//FIXME it shouldn't be here #include -#include namespace DB { diff --git a/src/Databases/DatabaseReplicatedWorker.cpp b/src/Databases/DatabaseReplicatedWorker.cpp index 29599d4d66d..0c2368cdcf6 100644 --- a/src/Databases/DatabaseReplicatedWorker.cpp +++ b/src/Databases/DatabaseReplicatedWorker.cpp @@ -17,7 +17,26 @@ DatabaseReplicatedDDLWorker::DatabaseReplicatedDDLWorker(DatabaseReplicated * db /// Pool size must be 1 (to avoid reordering of log entries) } -void DatabaseReplicatedDDLWorker::initialize() +void DatabaseReplicatedDDLWorker::initializeMainThread() +{ + do + { + try + { + auto zookeeper = getAndSetZooKeeper(); + initializeReplication(); + initialized = true; + } + catch (...) + { + tryLogCurrentException(log, fmt::format("Error on initialization of {}", database->getDatabaseName())); + sleepForSeconds(5); + } + } + while (!initialized && !stop_flag); +} + +void DatabaseReplicatedDDLWorker::initializeReplication() { /// Check if we need to recover replica. /// Invariant: replica is lost if it's log_ptr value is less then min_log_ptr value. 
@@ -101,11 +120,16 @@ DDLTaskPtr DatabaseReplicatedDDLWorker::initAndCheckTask(const String & entry_na if (task->entry.query.empty()) { //TODO better way to determine special entries - task->was_executed = true; + out_reason = "It's dummy task"; + return {}; } - else + + task->parseQueryFromEntry(context); + + if (zookeeper->exists(task->getFinishedNodePath())) { - task->parseQueryFromEntry(context); + out_reason = "Task has been already processed"; + return {}; } return task; diff --git a/src/Databases/DatabaseReplicatedWorker.h b/src/Databases/DatabaseReplicatedWorker.h index d190bd1795d..7994104331e 100644 --- a/src/Databases/DatabaseReplicatedWorker.h +++ b/src/Databases/DatabaseReplicatedWorker.h @@ -15,7 +15,8 @@ public: String enqueueQuery(DDLLogEntry & entry) override; private: - void initialize() override; + void initializeMainThread() override; + void initializeReplication(); DDLTaskPtr initAndCheckTask(const String & entry_name, String & out_reason, const ZooKeeperPtr & zookeeper) override; diff --git a/src/Interpreters/DDLTask.h b/src/Interpreters/DDLTask.h index 2db1a696384..94127b39b84 100644 --- a/src/Interpreters/DDLTask.h +++ b/src/Interpreters/DDLTask.h @@ -76,6 +76,8 @@ struct DDLTaskBase bool was_executed = false; DDLTaskBase(const String & name, const String & path) : entry_name(name), entry_path(path) {} + DDLTaskBase(const DDLTaskBase &) = delete; + DDLTaskBase(DDLTaskBase &&) = default; virtual ~DDLTaskBase() = default; void parseQueryFromEntry(const Context & context); diff --git a/src/Interpreters/DDLWorker.cpp b/src/Interpreters/DDLWorker.cpp index 188d38b8647..e4ea5f8db17 100644 --- a/src/Interpreters/DDLWorker.cpp +++ b/src/Interpreters/DDLWorker.cpp @@ -143,9 +143,14 @@ DDLWorker::DDLWorker(int pool_size_, const std::string & zk_root_dir, const Cont const String & logger_name) : context(context_) , log(&Poco::Logger::get(logger_name)) - , pool_size(pool_size_) //FIXME make it optional - , worker_pool(pool_size_) + , pool_size(pool_size_) { + if (1 < pool_size) + { + LOG_WARNING(log, "DDLWorker is configured to use multiple threads. " + "It's not recommended because queries can be reordered. Also it may cause some unknown issues to appear."); + worker_pool.emplace(pool_size); + } queue_dir = zk_root_dir; if (queue_dir.back() == '/') queue_dir.resize(queue_dir.size() - 1); @@ -185,7 +190,8 @@ void DDLWorker::shutdown() DDLWorker::~DDLWorker() { shutdown(); - worker_pool.wait(); + if (worker_pool) + worker_pool->wait(); if (main_thread.joinable()) main_thread.join(); if (cleanup_thread.joinable()) @@ -209,24 +215,6 @@ ZooKeeperPtr DDLWorker::getAndSetZooKeeper() return current_zookeeper; } -void DDLWorker::recoverZooKeeper() -{ - LOG_DEBUG(log, "Recovering ZooKeeper session after: {}", getCurrentExceptionMessage(false)); - - while (!stop_flag) - { - try - { - getAndSetZooKeeper(); - break; - } - catch (...) 
- { - tryLogCurrentException(__PRETTY_FUNCTION__); - sleepForSeconds(5); - } - } -} DDLTaskPtr DDLWorker::initAndCheckTask(const String & entry_name, String & out_reason, const ZooKeeperPtr & zookeeper) { @@ -285,6 +273,12 @@ DDLTaskPtr DDLWorker::initAndCheckTask(const String & entry_name, String & out_r return {}; } + if (zookeeper->exists(task->getFinishedNodePath())) + { + out_reason = "Task has been already processed"; + return {}; + } + /// Now task is ready for execution return task; } @@ -309,11 +303,11 @@ void DDLWorker::scheduleTasks() return; } - bool server_startup = !last_entry_name.has_value(); + bool server_startup = current_tasks.empty(); auto begin_node = server_startup ? queue_nodes.begin() - : std::upper_bound(queue_nodes.begin(), queue_nodes.end(), *last_entry_name); + : std::upper_bound(queue_nodes.begin(), queue_nodes.end(), current_tasks.back()->entry_name); for (auto it = begin_node; it != queue_nodes.end() && !stop_flag; ++it) { @@ -325,42 +319,39 @@ void DDLWorker::scheduleTasks() if (!task) { LOG_DEBUG(log, "Will not execute task {}: {}", entry_name, reason); - last_entry_name = entry_name; + task->was_executed = true; + saveTask(std::move(task)); //FIXME questionable continue; } - bool already_processed = zookeeper->exists(task->entry_path + "/finished/" + task->host_id_str); - if (!server_startup && !task->was_executed && already_processed) - { - throw Exception(ErrorCodes::LOGICAL_ERROR, - "Server expects that DDL task {} should be processed, but it was already processed according to ZK", - entry_name); - } + auto & saved_task = saveTask(std::move(task)); - if (!already_processed) + if (worker_pool) { - if (pool_size == 1) + worker_pool->scheduleOrThrowOnError([this, &saved_task]() { - enqueueTask(DDLTaskPtr(task.release())); - } - else - { - worker_pool.scheduleOrThrowOnError([this, task_ptr = task.release()]() - { - setThreadName("DDLWorkerExec"); - enqueueTask(DDLTaskPtr(task_ptr)); - }); - } + setThreadName("DDLWorkerExec"); + processTask(saved_task); + }); } else { - LOG_DEBUG(log, "Task {} ({}) has been already processed", entry_name, task->entry.query); + processTask(saved_task); } - - last_entry_name = entry_name; } } +DDLTaskBase & DDLWorker::saveTask(DDLTaskPtr && task) +{ + if (current_tasks.size() == pool_size) + { + assert(current_tasks.front()->was_executed); + current_tasks.pop_front(); + } + current_tasks.emplace_back(std::move(task)); + return *current_tasks.back(); +} + bool DDLWorker::tryExecuteQuery(const String & query, const DDLTaskBase & task, ExecutionStatus & status) { /// Add special comment at the start of query to easily identify DDL-produced queries in query_log @@ -404,48 +395,6 @@ void DDLWorker::attachToThreadGroup() } } - -void DDLWorker::enqueueTask(DDLTaskPtr task_ptr) -{ - auto & task = *task_ptr; - - while (!stop_flag) - { - try - { - processTask(task); - return; - } - /// TODO recover zk in runMainThread(...) and retry task (why do we need another place where session is recovered?) - catch (const Coordination::Exception & e) - { - if (Coordination::isHardwareError(e.code)) - { - recoverZooKeeper(); - } - else if (e.code == Coordination::Error::ZNONODE) - { - LOG_ERROR(log, "ZooKeeper error: {}", getCurrentExceptionMessage(true)); - if (!current_zookeeper->exists(task_ptr->entry_path)) - { - //FIXME race condition with cleanup thread - LOG_ERROR(log, "Task {} is lost. 
It probably was removed by other server.", task_ptr->entry_path); - return; - } - } - else - { - LOG_ERROR(log, "Unexpected ZooKeeper error: {}.", getCurrentExceptionMessage(true)); - return; - } - } - catch (...) - { - LOG_WARNING(log, "An error occurred while processing task {} ({}) : {}", task.entry_name, task.entry.query, getCurrentExceptionMessage(true)); - } - } -} - void DDLWorker::processTask(DDLTaskBase & task) { auto zookeeper = tryGetZooKeeper(); @@ -458,22 +407,16 @@ void DDLWorker::processTask(DDLTaskBase & task) String dummy; auto code = zookeeper->tryCreate(active_node_path, "", zkutil::CreateMode::Ephemeral, dummy); - if (code == Coordination::Error::ZOK || code == Coordination::Error::ZNODEEXISTS) - { - // Ok - } - else if (code == Coordination::Error::ZNONODE) + if (code == Coordination::Error::ZNONODE) { /// There is no parent - //TODO why not to create parent before active_node? createStatusDirs(task.entry_path, zookeeper); - if (Coordination::Error::ZOK != zookeeper->tryCreate(active_node_path, "", zkutil::CreateMode::Ephemeral, dummy)) - throw Coordination::Exception(code, active_node_path); + zookeeper->create(active_node_path, "", zkutil::CreateMode::Ephemeral); } else throw Coordination::Exception(code, active_node_path); - if (!task.was_executed) + if (!task.was_executed) // FIXME always true { try { @@ -513,6 +456,9 @@ void DDLWorker::processTask(DDLTaskBase & task) } /// FIXME: if server fails right here, the task will be executed twice. We need WAL here. + /// Another possible issue: if ZooKeeper session is lost here, we will recover connection and execute the task second time. + + /// Delete active flag and create finish flag Coordination::Requests ops; @@ -787,7 +733,9 @@ String DDLWorker::enqueueQuery(DDLLogEntry & entry) String node_path = zookeeper->create(query_path_prefix, entry.toString(), zkutil::CreateMode::PersistentSequential); - /// Optional step + /// We cannot create status dirs in a single transaction with previous request, + /// because we don't know node_path until previous request is executed. + /// Se we try to create status dirs here or later when we will execute entry. try { createStatusDirs(node_path, zookeeper); @@ -801,70 +749,80 @@ String DDLWorker::enqueueQuery(DDLLogEntry & entry) } -void DDLWorker::runMainThread() +void DDLWorker::initializeMainThread() { - setThreadName("DDLWorker"); - LOG_DEBUG(log, "Started DDLWorker thread"); - do { try { auto zookeeper = getAndSetZooKeeper(); zookeeper->createAncestors(queue_dir + "/"); - initialize(); initialized = true; } catch (const Coordination::Exception & e) { if (!Coordination::isHardwareError(e.code)) - throw; /// A logical error. + throw Exception(ErrorCodes::LOGICAL_ERROR, "Unexpected ZooKeeper error: {}", e.message()); tryLogCurrentException(__PRETTY_FUNCTION__); /// Avoid busy loop when ZooKeeper is not available. - sleepForSeconds(1); + sleepForSeconds(5); } catch (...) { - tryLogCurrentException(log, "Terminating. 
Cannot initialize DDL queue."); - return; + tryLogCurrentException(log, "Cannot initialize main thread of DDLWorker, will try again"); + sleepForSeconds(5); } } while (!initialized && !stop_flag); +} + +void DDLWorker::runMainThread() +{ + setThreadName("DDLWorker"); + attachToThreadGroup(); + LOG_DEBUG(log, "Starting DDLWorker thread"); while (!stop_flag) { try { - attachToThreadGroup(); + /// Reinitialize DDLWorker state (including ZooKeeper connection) if required + if (!initialized) + { + initializeMainThread(); + LOG_DEBUG(log, "Initialized DDLWorker thread"); + } cleanup_event->set(); scheduleTasks(); - LOG_DEBUG(log, "Waiting a watch"); + LOG_DEBUG(log, "Waiting for queue updates"); queue_updated_event->wait(); } catch (const Coordination::Exception & e) { if (Coordination::isHardwareError(e.code)) { - recoverZooKeeper(); + initialized = false; } else if (e.code == Coordination::Error::ZNONODE) { + // TODO add comment: when it happens and why it's expected? + // maybe because cleanup thread may remove nodes inside queue entry which are currently processed LOG_ERROR(log, "ZooKeeper error: {}", getCurrentExceptionMessage(true)); } else { - LOG_ERROR(log, "Unexpected ZooKeeper error: {}. Terminating.", getCurrentExceptionMessage(true)); - return; + LOG_ERROR(log, "Unexpected ZooKeeper error: {}.", getCurrentExceptionMessage(true)); + assert(false); } } catch (...) { - tryLogCurrentException(log, "Unexpected error, will terminate:"); - return; + tryLogCurrentException(log, "Unexpected error, will try to restart main thread:"); + initialized = false; } } } @@ -891,6 +849,7 @@ void DDLWorker::runCleanupThread() continue; } + /// ZooKeeper connection is recovered by main thread. We will wait for it on cleanup_event. auto zookeeper = tryGetZooKeeper(); if (zookeeper->expired()) continue; diff --git a/src/Interpreters/DDLWorker.h b/src/Interpreters/DDLWorker.h index f41ca0fce8f..78921fa60e3 100644 --- a/src/Interpreters/DDLWorker.h +++ b/src/Interpreters/DDLWorker.h @@ -62,17 +62,16 @@ protected: ZooKeeperPtr tryGetZooKeeper() const; /// If necessary, creates a new session and caches it. ZooKeeperPtr getAndSetZooKeeper(); - /// ZooKeeper recover loop (while not stopped). - void recoverZooKeeper(); - void checkCurrentTasks(); + /// Iterates through queue tasks in ZooKeeper, runs execution of new tasks void scheduleTasks(); + DDLTaskBase & saveTask(DDLTaskPtr && task); + /// Reads entry and check that the host belongs to host list of the task /// Returns non-empty DDLTaskPtr if entry parsed and the check is passed virtual DDLTaskPtr initAndCheckTask(const String & entry_name, String & out_reason, const ZooKeeperPtr & zookeeper); - void enqueueTask(DDLTaskPtr task); void processTask(DDLTaskBase & task); /// Check that query should be executed on leader replica only @@ -98,7 +97,7 @@ protected: /// Init task node static void createStatusDirs(const std::string & node_path, const ZooKeeperPtr & zookeeper); - virtual void initialize() {} + virtual void initializeMainThread(); void runMainThread(); void runCleanupThread(); @@ -117,8 +116,8 @@ protected: ZooKeeperPtr current_zookeeper; /// Save state of executed task to avoid duplicate execution on ZK error - //std::vector last_tasks; - std::optional last_entry_name; + //std::optional last_entry_name; + std::list current_tasks; std::shared_ptr queue_updated_event = std::make_shared(); std::shared_ptr cleanup_event = std::make_shared(); @@ -130,7 +129,7 @@ protected: /// Size of the pool for query execution. 
size_t pool_size = 1; - ThreadPool worker_pool; + std::optional worker_pool; /// Cleaning starts after new node event is received if the last cleaning wasn't made sooner than N seconds ago Int64 cleanup_delay_period = 60; // minute (in seconds) From 9f3c77f62e281fbb6c14e23ec81bde5e7000f416 Mon Sep 17 00:00:00 2001 From: Alexander Tokmakov Date: Fri, 4 Dec 2020 23:12:32 +0300 Subject: [PATCH 057/381] add zk ops into task --- src/Common/ZooKeeper/ZooKeeper.h | 8 ++ src/Interpreters/DDLTask.cpp | 18 ++-- src/Interpreters/DDLTask.h | 18 +++- src/Interpreters/DDLWorker.cpp | 172 ++++++++++++++++++++++--------- src/Interpreters/DDLWorker.h | 2 +- 5 files changed, 160 insertions(+), 58 deletions(-) diff --git a/src/Common/ZooKeeper/ZooKeeper.h b/src/Common/ZooKeeper/ZooKeeper.h index 1ad744102c6..e79553ed4d9 100644 --- a/src/Common/ZooKeeper/ZooKeeper.h +++ b/src/Common/ZooKeeper/ZooKeeper.h @@ -314,8 +314,15 @@ public: return std::make_shared(path, zookeeper, false, false, ""); } + void reset() + { + need_remove = false; + } + ~EphemeralNodeHolder() { + if (!need_remove) + return; try { zookeeper.tryRemove(path); @@ -331,6 +338,7 @@ private: std::string path; ZooKeeper & zookeeper; CurrentMetrics::Increment metric_increment{CurrentMetrics::EphemeralNode}; + bool need_remove = true; }; using EphemeralNodeHolderPtr = EphemeralNodeHolder::Ptr; diff --git a/src/Interpreters/DDLTask.cpp b/src/Interpreters/DDLTask.cpp index 9ef7352ceb4..3d9297880c1 100644 --- a/src/Interpreters/DDLTask.cpp +++ b/src/Interpreters/DDLTask.cpp @@ -96,7 +96,7 @@ void DDLTaskBase::parseQueryFromEntry(const Context & context) query = parseQuery(parser_query, begin, end, description, 0, context.getSettingsRef().max_parser_depth); } -std::unique_ptr DDLTaskBase::makeQueryContext(Context & from_context) const +std::unique_ptr DDLTaskBase::makeQueryContext(Context & from_context) { auto query_context = std::make_unique(from_context); query_context->makeQueryContext(); @@ -293,7 +293,7 @@ String DatabaseReplicatedTask::getShardID() const return database->shard_name; } -std::unique_ptr DatabaseReplicatedTask::makeQueryContext(Context & from_context) const +std::unique_ptr DatabaseReplicatedTask::makeQueryContext(Context & from_context) { auto query_context = DDLTaskBase::makeQueryContext(from_context); query_context->getClientInfo().query_kind = ClientInfo::QueryKind::REPLICATED_LOG_QUERY; //FIXME why do we need separate query kind? 
@@ -309,15 +309,18 @@ std::unique_ptr DatabaseReplicatedTask::makeQueryContext(Context & from { txn->ops.emplace_back(zkutil::makeRemoveRequest(entry_path + "/try", -1)); txn->ops.emplace_back(zkutil::makeCreateRequest(entry_path + "/committed", host_id_str, zkutil::CreateMode::Persistent)); - txn->ops.emplace_back(zkutil::makeRemoveRequest(getActiveNodePath(), -1)); + //txn->ops.emplace_back(zkutil::makeRemoveRequest(getActiveNodePath(), -1)); txn->ops.emplace_back(zkutil::makeSetRequest(database->zookeeper_path + "/max_log_ptr", toString(getLogEntryNumber(entry_name)), -1)); } - if (execute_on_leader) - txn->ops.emplace_back(zkutil::makeCreateRequest(getShardNodePath() + "/executed", host_id_str, zkutil::CreateMode::Persistent)); - txn->ops.emplace_back(zkutil::makeCreateRequest(getFinishedNodePath(), execution_status.serializeText(), zkutil::CreateMode::Persistent)); + //if (execute_on_leader) + // txn->ops.emplace_back(zkutil::makeCreateRequest(getShardNodePath() + "/executed", host_id_str, zkutil::CreateMode::Persistent)); + //txn->ops.emplace_back(zkutil::makeCreateRequest(getFinishedNodePath(), execution_status.serializeText(), zkutil::CreateMode::Persistent)); txn->ops.emplace_back(zkutil::makeSetRequest(database->replica_path + "/log_ptr", toString(getLogEntryNumber(entry_name)), -1)); + std::move(ops.begin(), ops.end(), std::back_inserter(txn->ops)); + ops.clear(); + return query_context; } @@ -338,7 +341,10 @@ UInt32 DatabaseReplicatedTask::getLogEntryNumber(const String & log_entry_name) void MetadataTransaction::commit() { + assert(state == CREATED); + state = FAILED; current_zookeeper->multi(ops); + state = COMMITED; } } diff --git a/src/Interpreters/DDLTask.h b/src/Interpreters/DDLTask.h index 94127b39b84..aa234d1bfdd 100644 --- a/src/Interpreters/DDLTask.h +++ b/src/Interpreters/DDLTask.h @@ -15,6 +15,9 @@ class ASTQueryWithOnCluster; using ZooKeeperPtr = std::shared_ptr; class DatabaseReplicated; +struct MetadataTransaction; +using MetadataTransactionPtr = std::shared_ptr; + struct HostID { String host_name; @@ -72,6 +75,8 @@ struct DDLTaskBase bool is_circular_replicated = false; bool execute_on_leader = false; + //MetadataTransactionPtr txn; + Coordination::Requests ops; ExecutionStatus execution_status; bool was_executed = false; @@ -84,7 +89,7 @@ struct DDLTaskBase virtual String getShardID() const = 0; - virtual std::unique_ptr makeQueryContext(Context & from_context) const; + virtual std::unique_ptr makeQueryContext(Context & from_context); inline String getActiveNodePath() const { return entry_path + "/active/" + host_id_str; } inline String getFinishedNodePath() const { return entry_path + "/finished/" + host_id_str; } @@ -119,7 +124,7 @@ struct DatabaseReplicatedTask : public DDLTaskBase DatabaseReplicatedTask(const String & name, const String & path, DatabaseReplicated * database_); String getShardID() const override; - std::unique_ptr makeQueryContext(Context & from_context) const override; + std::unique_ptr makeQueryContext(Context & from_context) override; static String getLogEntryName(UInt32 log_entry_number); static UInt32 getLogEntryNumber(const String & log_entry_name); @@ -131,6 +136,14 @@ struct DatabaseReplicatedTask : public DDLTaskBase struct MetadataTransaction { + enum State + { + CREATED, + COMMITED, + FAILED + }; + + State state = CREATED; ZooKeeperPtr current_zookeeper; String zookeeper_path; bool is_initial_query; @@ -142,6 +155,7 @@ struct MetadataTransaction } void commit(); + }; } diff --git a/src/Interpreters/DDLWorker.cpp 
b/src/Interpreters/DDLWorker.cpp index e4ea5f8db17..a3262c238fc 100644 --- a/src/Interpreters/DDLWorker.cpp +++ b/src/Interpreters/DDLWorker.cpp @@ -38,6 +38,11 @@ namespace ErrorCodes extern const int LOGICAL_ERROR; extern const int TIMEOUT_EXCEEDED; extern const int UNFINISHED; + extern const int NOT_A_LEADER; + extern const int KEEPER_EXCEPTION; + extern const int CANNOT_ASSIGN_ALTER; + extern const int CANNOT_ALLOCATE_MEMORY; + extern const int MEMORY_LIMIT_EXCEEDED; } @@ -295,6 +300,19 @@ void DDLWorker::scheduleTasks() LOG_DEBUG(log, "Scheduling tasks"); auto zookeeper = tryGetZooKeeper(); + for (auto & task : current_tasks) + { + /// Main thread of DDLWorker was restarted, probably due to lost connection with ZooKeeper. + /// We have some unfinished tasks. To avoid duplication of some queries, try to write execution status. + bool status_written = task->ops.empty(); + bool task_still_exists = zookeeper->exists(task->entry_path); + if (task->was_executed && !status_written && task_still_exists) + { + assert(!zookeeper->exists(task->getFinishedNodePath())); + processTask(*task); + } + } + Strings queue_nodes = zookeeper->getChildren(queue_dir, nullptr, queue_updated_event); filterAndSortQueueNodes(queue_nodes); if (queue_nodes.empty()) @@ -304,10 +322,16 @@ void DDLWorker::scheduleTasks() } bool server_startup = current_tasks.empty(); + auto begin_node = queue_nodes.begin(); - auto begin_node = server_startup - ? queue_nodes.begin() - : std::upper_bound(queue_nodes.begin(), queue_nodes.end(), current_tasks.back()->entry_name); + if (!server_startup) + { + /// We will recheck status of last executed tasks. It's useful if main thread was just restarted. + auto & min_task = *std::min_element(current_tasks.begin(), current_tasks.end()); + begin_node = std::upper_bound(queue_nodes.begin(), queue_nodes.end(), min_task->entry_name); + current_tasks.clear(); + //FIXME better way of maintaning current tasks list and min_task name; + } for (auto it = begin_node; it != queue_nodes.end() && !stop_flag; ++it) { @@ -319,8 +343,8 @@ void DDLWorker::scheduleTasks() if (!task) { LOG_DEBUG(log, "Will not execute task {}: {}", entry_name, reason); - task->was_executed = true; - saveTask(std::move(task)); //FIXME questionable + //task->was_executed = true; + //saveTask(std::move(task)); continue; } @@ -343,16 +367,17 @@ void DDLWorker::scheduleTasks() DDLTaskBase & DDLWorker::saveTask(DDLTaskPtr && task) { - if (current_tasks.size() == pool_size) - { - assert(current_tasks.front()->was_executed); - current_tasks.pop_front(); - } + //assert(current_tasks.size() <= pool_size + 1); + //if (current_tasks.size() == pool_size) + //{ + // assert(current_tasks.front()->ops.empty()); //FIXME + // current_tasks.pop_front(); + //} current_tasks.emplace_back(std::move(task)); return *current_tasks.back(); } -bool DDLWorker::tryExecuteQuery(const String & query, const DDLTaskBase & task, ExecutionStatus & status) +bool DDLWorker::tryExecuteQuery(const String & query, DDLTaskBase & task) { /// Add special comment at the start of query to easily identify DDL-produced queries in query_log String query_prefix = "/* ddl_entry=" + task.entry_name + " */ "; @@ -367,15 +392,34 @@ bool DDLWorker::tryExecuteQuery(const String & query, const DDLTaskBase & task, auto query_context = task.makeQueryContext(context); executeQuery(istr, ostr, false, *query_context, {}); } - catch (...) 
+ catch (const DB::Exception & e) { - status = ExecutionStatus::fromCurrentException(); + task.execution_status = ExecutionStatus::fromCurrentException(); tryLogCurrentException(log, "Query " + query + " wasn't finished successfully"); + /// We use return value of tryExecuteQuery(...) in tryExecuteQueryOnLeaderReplica(...) to determine + /// if replica has stopped being leader and we should retry query. + /// However, for the majority of exceptions there is no sense to retry, because most likely we will just + /// get the same exception again. So we return false only for several special exception codes, + /// and consider query as executed with status "failed" and return true in other cases. + bool no_sense_to_retry = e.code() != ErrorCodes::KEEPER_EXCEPTION && + e.code() != ErrorCodes::NOT_A_LEADER && + e.code() != ErrorCodes::CANNOT_ASSIGN_ALTER && + e.code() != ErrorCodes::CANNOT_ALLOCATE_MEMORY && + e.code() != ErrorCodes::MEMORY_LIMIT_EXCEEDED; + return no_sense_to_retry; + } + catch (...) + { + task.execution_status = ExecutionStatus::fromCurrentException(); + tryLogCurrentException(log, "Query " + query + " wasn't finished successfully"); + + /// We don't know what exactly happened, but maybe it's Poco::NetException or std::bad_alloc, + /// so we consider unknown exception as retryable error. return false; } - status = ExecutionStatus(0); + task.execution_status = ExecutionStatus(0); LOG_DEBUG(log, "Executed query: {}", query); return true; @@ -405,19 +449,18 @@ void DDLWorker::processTask(DDLTaskBase & task) String finished_node_path = task.getFinishedNodePath(); String dummy; - auto code = zookeeper->tryCreate(active_node_path, "", zkutil::CreateMode::Ephemeral, dummy); + zookeeper->createAncestors(active_node_path); + auto active_node = zkutil::EphemeralNodeHolder::create(active_node_path, *zookeeper, ""); - if (code == Coordination::Error::ZNONODE) + if (!task.was_executed) { - /// There is no parent - createStatusDirs(task.entry_path, zookeeper); - zookeeper->create(active_node_path, "", zkutil::CreateMode::Ephemeral); - } - else - throw Coordination::Exception(code, active_node_path); + /// If table and database engine supports it, they will execute task.ops by their own in a single transaction + /// with other zk operations (such as appending something to ReplicatedMergeTree log, or + /// updating metadata in Replicated database), so we make create request for finished_node_path with status "0", + /// which means that query executed successfully. 
+ task.ops.emplace_back(zkutil::makeRemoveRequest(active_node_path, -1)); + task.ops.emplace_back(zkutil::makeCreateRequest(finished_node_path, "0", zkutil::CreateMode::Persistent)); - if (!task.was_executed) // FIXME always true - { try { String rewritten_query = queryToString(task.query); @@ -439,7 +482,7 @@ void DDLWorker::processTask(DDLTaskBase & task) if (task.execute_on_leader) tryExecuteQueryOnLeaderReplica(task, storage, rewritten_query, task.entry_path, zookeeper); else - tryExecuteQuery(rewritten_query, task, task.execution_status); + tryExecuteQuery(rewritten_query, task); } catch (const Coordination::Exception &) { @@ -451,25 +494,35 @@ void DDLWorker::processTask(DDLTaskBase & task) task.execution_status = ExecutionStatus::fromCurrentException("An error occurred before execution"); } + if (task.execution_status.code != 0) + { + bool status_written_by_table_or_db = task.ops.empty(); + if (status_written_by_table_or_db) + { + throw Exception(ErrorCodes::UNFINISHED, "Unexpected error: {}", task.execution_status.serializeText()); + } + else + { + /// task.ops where not executed by table or database engine, se DDLWorker is responsible for + /// writing query execution status into ZooKeeper. + task.ops.emplace_back(zkutil::makeSetRequest(finished_node_path, task.execution_status.serializeText(), -1)); + } + } + /// We need to distinguish ZK errors occurred before and after query executing task.was_executed = true; } /// FIXME: if server fails right here, the task will be executed twice. We need WAL here. - /// Another possible issue: if ZooKeeper session is lost here, we will recover connection and execute the task second time. + /// If ZooKeeper connection is lost here, we will try again to write query status. - - - /// Delete active flag and create finish flag - Coordination::Requests ops; - ops.emplace_back(zkutil::makeRemoveRequest(active_node_path, -1)); - ops.emplace_back(zkutil::makeCreateRequest(finished_node_path, task.execution_status.serializeText(), zkutil::CreateMode::Persistent)); - - //FIXME replace with multi(...) or use MetadataTransaction - Coordination::Responses responses; - auto res = zookeeper->tryMulti(ops, responses); - if (res != Coordination::Error::ZNODEEXISTS && res != Coordination::Error::ZNONODE) - zkutil::KeeperMultiException::check(res, ops, responses); + bool status_written = task.ops.empty(); + if (!status_written) + { + zookeeper->multi(task.ops); + active_node->reset(); + task.ops.clear(); + } } @@ -496,13 +549,17 @@ bool DDLWorker::tryExecuteQueryOnLeaderReplica( /// If we will develop new replicated storage if (!replicated_storage) - throw Exception(ErrorCodes::NOT_IMPLEMENTED, "Storage type '{}' is not supported by distributed DDL", storage->getName()); + throw Exception(ErrorCodes::LOGICAL_ERROR, "Storage type '{}' is not supported by distributed DDL", storage->getName()); String shard_path = task.getShardNodePath(); String is_executed_path = shard_path + "/executed"; String tries_to_execute_path = shard_path + "/tries_to_execute"; zookeeper->createAncestors(shard_path + "/"); + /// Leader replica creates is_executed_path node on successful query execution. + /// We will remove create_shard_flag from zk operations list, if current replica is just waiting for leader to execute the query. 
+ auto create_shard_flag = zkutil::makeCreateRequest(is_executed_path, task.host_id_str, zkutil::CreateMode::Persistent); + /// Node exists, or we will create or we will get an exception zookeeper->tryCreate(tries_to_execute_path, "0", zkutil::CreateMode::Persistent); @@ -526,7 +583,9 @@ bool DDLWorker::tryExecuteQueryOnLeaderReplica( Stopwatch stopwatch; - bool executed_by_leader = false; + bool executed_by_us = false; + bool executed_by_other_leader = false; + /// Defensive programming. One hour is more than enough to execute almost all DDL queries. /// If it will be very long query like ALTER DELETE for a huge table it's still will be executed, /// but DDL worker can continue processing other queries. @@ -544,7 +603,7 @@ bool DDLWorker::tryExecuteQueryOnLeaderReplica( if (zookeeper->tryGet(is_executed_path, executed_by)) { LOG_DEBUG(log, "Task {} has already been executed by replica ({}) of the same shard.", task.entry_name, executed_by); - executed_by_leader = true; + executed_by_other_leader = true; break; } @@ -555,13 +614,14 @@ bool DDLWorker::tryExecuteQueryOnLeaderReplica( zookeeper->set(tries_to_execute_path, toString(counter + 1)); + task.ops.push_back(create_shard_flag); + SCOPE_EXIT({ if (!executed_by_us && !task.ops.empty()) task.ops.pop_back(); }); + /// If the leader will unexpectedly changed this method will return false /// and on the next iteration new leader will take lock - if (tryExecuteQuery(rewritten_query, task, task.execution_status)) + if (tryExecuteQuery(rewritten_query, task)) { - //FIXME replace with create(...) or remove and use MetadataTransaction - zookeeper->createIfNotExists(is_executed_path, task.host_id_str); - executed_by_leader = true; + executed_by_us = true; break; } @@ -572,7 +632,7 @@ bool DDLWorker::tryExecuteQueryOnLeaderReplica( if (event->tryWait(std::uniform_int_distribution(0, 1000)(rng))) { LOG_DEBUG(log, "Task {} has already been executed by replica ({}) of the same shard.", task.entry_name, zookeeper->get(is_executed_path)); - executed_by_leader = true; + executed_by_other_leader = true; break; } else @@ -593,8 +653,10 @@ bool DDLWorker::tryExecuteQueryOnLeaderReplica( } } + assert(!(executed_by_us && executed_by_other_leader)); + /// Not executed by leader so was not executed at all - if (!executed_by_leader) + if (!executed_by_us && !executed_by_other_leader) { /// If we failed with timeout if (stopwatch.elapsedSeconds() >= MAX_EXECUTION_TIMEOUT_SEC) @@ -610,7 +672,11 @@ bool DDLWorker::tryExecuteQueryOnLeaderReplica( return false; } - LOG_DEBUG(log, "Task {} has already been executed by replica ({}) of the same shard.", task.entry_name, zookeeper->get(is_executed_path)); + if (executed_by_us) + LOG_DEBUG(log, "Task {} executed by current replica", task.entry_name); + else // if (executed_by_other_leader) + LOG_DEBUG(log, "Task {} has already been executed by replica ({}) of the same shard.", task.entry_name, zookeeper->get(is_executed_path)); + return true; } @@ -816,9 +882,17 @@ void DDLWorker::runMainThread() else { LOG_ERROR(log, "Unexpected ZooKeeper error: {}.", getCurrentExceptionMessage(true)); - assert(false); + //assert(false); } } + catch (const Exception & e) + { + if (e.code() == ErrorCodes::LOGICAL_ERROR) + throw; /// Something terrible happened. Will terminate DDLWorker. + + tryLogCurrentException(log, "Unexpected error, will try to restart main thread:"); + initialized = false; + } catch (...) 
{ tryLogCurrentException(log, "Unexpected error, will try to restart main thread:"); diff --git a/src/Interpreters/DDLWorker.h b/src/Interpreters/DDLWorker.h index 78921fa60e3..4145e0754e8 100644 --- a/src/Interpreters/DDLWorker.h +++ b/src/Interpreters/DDLWorker.h @@ -89,7 +89,7 @@ protected: const String & node_path, const ZooKeeperPtr & zookeeper); - bool tryExecuteQuery(const String & query, const DDLTaskBase & task, ExecutionStatus & status); + bool tryExecuteQuery(const String & query, DDLTaskBase & task); /// Checks and cleanups queue's nodes void cleanupQueue(Int64 current_time_seconds, const ZooKeeperPtr & zookeeper); From 3146a1a9542b16d3e56730ca6aa289d23fd70689 Mon Sep 17 00:00:00 2001 From: Alexander Tokmakov Date: Mon, 25 Jan 2021 21:59:23 +0300 Subject: [PATCH 058/381] fix --- docker/test/stress/stress | 7 +++++-- src/Interpreters/DDLTask.cpp | 2 +- src/Interpreters/DDLWorker.cpp | 17 +++++++++++++---- src/Interpreters/InterpreterCreateQuery.cpp | 2 +- .../test_materialize_mysql_database/test.py | 2 +- 5 files changed, 21 insertions(+), 9 deletions(-) diff --git a/docker/test/stress/stress b/docker/test/stress/stress index 458f78fcdb4..c530f605da7 100755 --- a/docker/test/stress/stress +++ b/docker/test/stress/stress @@ -22,12 +22,15 @@ def get_options(i): if 0 < i: options += " --order=random" - if i % 2 == 1: + if i % 3 == 1: options += " --db-engine=Ordinary" + if i % 3 == 2: + options += ''' --db-engine="Replicated('/test/db/test_{}', 's1', 'r1')"'''.format(i) + # If database name is not specified, new database is created for each functional test. # Run some threads with one database for all tests. - if i % 3 == 1: + if i % 2 == 1: options += " --database=test_{}".format(i) if i == 13: diff --git a/src/Interpreters/DDLTask.cpp b/src/Interpreters/DDLTask.cpp index 3d9297880c1..fd2de014581 100644 --- a/src/Interpreters/DDLTask.cpp +++ b/src/Interpreters/DDLTask.cpp @@ -140,7 +140,7 @@ bool DDLTask::findCurrentHostID(const Context & global_context, Poco::Logger * l void DDLTask::setClusterInfo(const Context & context, Poco::Logger * log) { - auto query_on_cluster = dynamic_cast(query.get()); + auto * query_on_cluster = dynamic_cast(query.get()); if (!query_on_cluster) throw Exception("Received unknown DDL query", ErrorCodes::UNKNOWN_TYPE_OF_QUERY); diff --git a/src/Interpreters/DDLWorker.cpp b/src/Interpreters/DDLWorker.cpp index 91a5309bb5d..fc72e4d8366 100644 --- a/src/Interpreters/DDLWorker.cpp +++ b/src/Interpreters/DDLWorker.cpp @@ -201,11 +201,7 @@ void DDLWorker::shutdown() stop_flag = true; queue_updated_event->set(); cleanup_event->set(); -} -DDLWorker::~DDLWorker() -{ - shutdown(); worker_pool.reset(); if (main_thread.joinable()) main_thread.join(); @@ -213,6 +209,11 @@ DDLWorker::~DDLWorker() cleanup_thread.join(); } +DDLWorker::~DDLWorker() +{ + shutdown(); +} + ZooKeeperPtr DDLWorker::tryGetZooKeeper() const { @@ -490,9 +491,14 @@ void DDLWorker::processTask(DDLTaskBase & task) } if (task.execute_on_leader) + { tryExecuteQueryOnLeaderReplica(task, storage, rewritten_query, task.entry_path, zookeeper); + } else + { + storage.reset(); tryExecuteQuery(rewritten_query, task); + } } catch (const Coordination::Exception &) { @@ -892,6 +898,7 @@ void DDLWorker::initializeMainThread() { tryLogCurrentException(log, "Cannot initialize DDL queue."); reset_state(false); + sleepForSeconds(5); } } while (!initialized && !stop_flag); @@ -949,11 +956,13 @@ void DDLWorker::runMainThread() LOG_ERROR(log, "Unexpected ZooKeeper error: {}", getCurrentExceptionMessage(true)); 
reset_state(); } + sleepForSeconds(5); } catch (...) { tryLogCurrentException(log, "Unexpected error, will try to restart main thread:"); reset_state(); + sleepForSeconds(5); } } } diff --git a/src/Interpreters/InterpreterCreateQuery.cpp b/src/Interpreters/InterpreterCreateQuery.cpp index 40789fc1a8a..b66af77930c 100644 --- a/src/Interpreters/InterpreterCreateQuery.cpp +++ b/src/Interpreters/InterpreterCreateQuery.cpp @@ -718,7 +718,7 @@ void InterpreterCreateQuery::assertOrSetUUID(ASTCreateQuery & create, const Data const auto * kind = create.is_dictionary ? "Dictionary" : "Table"; const auto * kind_upper = create.is_dictionary ? "DICTIONARY" : "TABLE"; - if (database->getEngineName() == "Replicated" && context.getClientInfo().query_kind == ClientInfo::QueryKind::REPLICATED_LOG_QUERY) + if (database->getEngineName() == "Replicated" && context.getClientInfo().query_kind == ClientInfo::QueryKind::REPLICATED_LOG_QUERY && !internal) { if (create.uuid == UUIDHelpers::Nil) throw Exception("Table UUID is not specified in DDL log", ErrorCodes::LOGICAL_ERROR); diff --git a/tests/integration/test_materialize_mysql_database/test.py b/tests/integration/test_materialize_mysql_database/test.py index dbd6e894987..3cdc527d33d 100644 --- a/tests/integration/test_materialize_mysql_database/test.py +++ b/tests/integration/test_materialize_mysql_database/test.py @@ -14,7 +14,7 @@ DOCKER_COMPOSE_PATH = get_docker_compose_path() cluster = ClickHouseCluster(__file__) -node_db_ordinary = cluster.add_instance('node1', user_configs=["configs/users.xml"], with_mysql=False, stay_alive=True) +node_db_ordinary = cluster.add_instance('node1', user_configs=["configs/users.xml"], with_mysql=False, stay_alive=True, with_zookeeper=True) #FIXME node_db_atomic = cluster.add_instance('node2', user_configs=["configs/users_db_atomic.xml"], with_mysql=False, stay_alive=True) From f20d5e3b419b1efc77e3a3a1b7aa46f86ac4c201 Mon Sep 17 00:00:00 2001 From: Alexander Tokmakov Date: Tue, 26 Jan 2021 20:51:25 +0300 Subject: [PATCH 059/381] fix --- src/Databases/DatabaseAtomic.cpp | 13 +++-- src/Databases/DatabaseReplicated.h | 2 +- src/Interpreters/Context.cpp | 3 +- src/Interpreters/Context.h | 1 + src/Interpreters/DDLTask.h | 3 +- src/Interpreters/DDLWorker.cpp | 53 ++++++++----------- src/Interpreters/InterpreterRenameQuery.cpp | 7 +++ src/Interpreters/executeDDLQueryOnCluster.cpp | 7 +-- src/Parsers/ASTAlterQuery.cpp | 14 ++++- src/Parsers/ASTAlterQuery.h | 4 ++ src/Storages/StorageMaterializedView.cpp | 6 ++- tests/clickhouse-test | 16 ++++-- 12 files changed, 78 insertions(+), 51 deletions(-) diff --git a/src/Databases/DatabaseAtomic.cpp b/src/Databases/DatabaseAtomic.cpp index 1da23b9beef..8b75f439152 100644 --- a/src/Databases/DatabaseAtomic.cpp +++ b/src/Databases/DatabaseAtomic.cpp @@ -115,8 +115,8 @@ void DatabaseAtomic::dropTable(const Context & context, const String & table_nam std::unique_lock lock(mutex); table = getTableUnlocked(table_name, lock); table_metadata_path_drop = DatabaseCatalog::instance().getPathForDroppedMetadata(table->getStorageID()); - - if (auto txn = context.getMetadataTransaction()) + auto txn = context.getMetadataTransaction(); + if (txn && !context.isInternalSubquery()) txn->commit(); /// Commit point (a sort of) for Replicated database /// NOTE: replica will be lost if server crashes before the following rename @@ -241,7 +241,8 @@ void DatabaseAtomic::renameTable(const Context & context, const String & table_n } /// Table renaming actually begins here - if (auto txn = 
context.getMetadataTransaction()) + auto txn = context.getMetadataTransaction(); + if (txn && !context.isInternalSubquery()) txn->commit(); /// Commit point (a sort of) for Replicated database /// NOTE: replica will be lost if server crashes before the following rename @@ -301,7 +302,8 @@ void DatabaseAtomic::commitCreateTable(const ASTCreateQuery & query, const Stora DatabaseCatalog::instance().addUUIDMapping(query.uuid); locked_uuid = true; - if (auto txn = query_context.getMetadataTransaction()) + auto txn = query_context.getMetadataTransaction(); + if (txn && !query_context.isInternalSubquery()) txn->commit(); /// Commit point (a sort of) for Replicated database /// NOTE: replica will be lost if server crashes before the following renameNoReplace(...) @@ -335,7 +337,8 @@ void DatabaseAtomic::commitAlterTable(const StorageID & table_id, const String & if (table_id.uuid != actual_table_id.uuid) throw Exception("Cannot alter table because it was renamed", ErrorCodes::CANNOT_ASSIGN_ALTER); - if (auto txn = query_context.getMetadataTransaction()) + auto txn = query_context.getMetadataTransaction(); + if (txn && !query_context.isInternalSubquery()) txn->commit(); /// Commit point (a sort of) for Replicated database /// NOTE: replica will be lost if server crashes before the following rename diff --git a/src/Databases/DatabaseReplicated.h b/src/Databases/DatabaseReplicated.h index 8085c234af4..586f381c962 100644 --- a/src/Databases/DatabaseReplicated.h +++ b/src/Databases/DatabaseReplicated.h @@ -64,7 +64,7 @@ public: void shutdown() override; - void loadStoredObjects(Context & context, bool has_force_restore_data_flag, bool force_attach = false) override; + void loadStoredObjects(Context & context, bool has_force_restore_data_flag, bool force_attach) override; String getFullReplicaName() const { return shard_name + '|' + replica_name; } diff --git a/src/Interpreters/Context.cpp b/src/Interpreters/Context.cpp index 3d102553f5a..6895439b855 100644 --- a/src/Interpreters/Context.cpp +++ b/src/Interpreters/Context.cpp @@ -2522,8 +2522,7 @@ void Context::initMetadataTransaction(MetadataTransactionPtr txn) MetadataTransactionPtr Context::getMetadataTransaction() const { - //FIXME - //assert(query_context == this); + assert(!metadata_transaction || hasQueryContext()); return metadata_transaction; } diff --git a/src/Interpreters/Context.h b/src/Interpreters/Context.h index dcb581b98c6..37ed01d4dbc 100644 --- a/src/Interpreters/Context.h +++ b/src/Interpreters/Context.h @@ -536,6 +536,7 @@ public: const Context & getQueryContext() const; Context & getQueryContext(); bool hasQueryContext() const { return query_context != nullptr; } + bool isInternalSubquery() const { return hasQueryContext() && query_context != this; } const Context & getSessionContext() const; Context & getSessionContext(); diff --git a/src/Interpreters/DDLTask.h b/src/Interpreters/DDLTask.h index 7501c01aa8f..a12676ab8a3 100644 --- a/src/Interpreters/DDLTask.h +++ b/src/Interpreters/DDLTask.h @@ -85,9 +85,10 @@ struct DDLTaskBase ExecutionStatus execution_status; bool was_executed = false; + std::atomic_bool completely_processed = false; + DDLTaskBase(const String & name, const String & path) : entry_name(name), entry_path(path) {} DDLTaskBase(const DDLTaskBase &) = delete; - DDLTaskBase(DDLTaskBase &&) = default; virtual ~DDLTaskBase() = default; void parseQueryFromEntry(const Context & context); diff --git a/src/Interpreters/DDLWorker.cpp b/src/Interpreters/DDLWorker.cpp index fc72e4d8366..cb38c733582 100644 --- 
a/src/Interpreters/DDLWorker.cpp +++ b/src/Interpreters/DDLWorker.cpp @@ -341,9 +341,10 @@ void DDLWorker::scheduleTasks() auto & min_task = *std::min_element(current_tasks.begin(), current_tasks.end()); begin_node = std::upper_bound(queue_nodes.begin(), queue_nodes.end(), min_task->entry_name); current_tasks.clear(); - //FIXME better way of maintaning current tasks list and min_task name; } + assert(current_tasks.empty()); + for (auto it = begin_node; it != queue_nodes.end() && !stop_flag; ++it) { String entry_name = *it; @@ -378,12 +379,8 @@ void DDLWorker::scheduleTasks() DDLTaskBase & DDLWorker::saveTask(DDLTaskPtr && task) { - //assert(current_tasks.size() <= pool_size + 1); - //if (current_tasks.size() == pool_size) - //{ - // assert(current_tasks.front()->ops.empty()); //FIXME - // current_tasks.pop_front(); - //} + std::remove_if(current_tasks.begin(), current_tasks.end(), [](const DDLTaskPtr & t) { return t->completely_processed.load(); }); + assert(current_tasks.size() <= pool_size); current_tasks.emplace_back(std::move(task)); return *current_tasks.back(); } @@ -555,6 +552,8 @@ void DDLWorker::processTask(DDLTaskBase & task) active_node->reset(); task.ops.clear(); } + + task.completely_processed = true; } @@ -572,6 +571,9 @@ bool DDLWorker::taskShouldBeExecutedOnLeader(const ASTPtr ast_ddl, const Storage // Setting alters should be executed on all replicas if (alter->isSettingsAlter()) return false; + + if (alter->isFreezeAlter()) + return false; } return storage->supportsReplication(); @@ -856,28 +858,20 @@ String DDLWorker::enqueueQuery(DDLLogEntry & entry) void DDLWorker::initializeMainThread() { - auto reset_state = [&](bool reset_pool = true) - { - initialized = false; - /// It will wait for all threads in pool to finish and will not rethrow exceptions (if any). - /// We create new thread pool to forget previous exceptions. - if (reset_pool) - worker_pool = std::make_unique(pool_size); - /// Clear other in-memory state, like server just started. - current_tasks.clear(); - max_id = 0; - }; - + assert(!initialized); + assert(max_id == 0); + assert(current_tasks.empty()); setThreadName("DDLWorker"); LOG_DEBUG(log, "Started DDLWorker thread"); - do + while (!stop_flag) { try { auto zookeeper = getAndSetZooKeeper(); zookeeper->createAncestors(fs::path(queue_dir) / ""); initialized = true; + return; } catch (const Coordination::Exception & e) { @@ -885,33 +879,29 @@ void DDLWorker::initializeMainThread() { /// A logical error. LOG_ERROR(log, "ZooKeeper error: {}. Failed to start DDLWorker.",getCurrentExceptionMessage(true)); - reset_state(false); assert(false); /// Catch such failures in tests with debug build } tryLogCurrentException(__PRETTY_FUNCTION__); - - /// Avoid busy loop when ZooKeeper is not available. - sleepForSeconds(5); } catch (...) { tryLogCurrentException(log, "Cannot initialize DDL queue."); - reset_state(false); - sleepForSeconds(5); } + + /// Avoid busy loop when ZooKeeper is not available. + sleepForSeconds(5); } - while (!initialized && !stop_flag); } void DDLWorker::runMainThread() { - auto reset_state = [&](bool reset_pool = true) + auto reset_state = [&]() { initialized = false; /// It will wait for all threads in pool to finish and will not rethrow exceptions (if any). /// We create new thread pool to forget previous exceptions. - if (reset_pool) + if (1 < pool_size) worker_pool = std::make_unique(pool_size); /// Clear other in-memory state, like server just started. 
current_tasks.clear(); @@ -944,6 +934,7 @@ void DDLWorker::runMainThread() if (Coordination::isHardwareError(e.code)) { initialized = false; + LOG_INFO(log, "Lost ZooKeeper connection, will try to connect again: {}", getCurrentExceptionMessage(true)); } else if (e.code == Coordination::Error::ZNONODE) { @@ -953,10 +944,10 @@ void DDLWorker::runMainThread() } else { - LOG_ERROR(log, "Unexpected ZooKeeper error: {}", getCurrentExceptionMessage(true)); + LOG_ERROR(log, "Unexpected ZooKeeper error, will try to restart main thread: {}", getCurrentExceptionMessage(true)); reset_state(); } - sleepForSeconds(5); + sleepForSeconds(1); } catch (...) { diff --git a/src/Interpreters/InterpreterRenameQuery.cpp b/src/Interpreters/InterpreterRenameQuery.cpp index 72398103d62..a6075643a96 100644 --- a/src/Interpreters/InterpreterRenameQuery.cpp +++ b/src/Interpreters/InterpreterRenameQuery.cpp @@ -13,6 +13,10 @@ namespace DB { +namespace ErrorCodes +{ + extern const int NOT_IMPLEMENTED; +} InterpreterRenameQuery::InterpreterRenameQuery(const ASTPtr & query_ptr_, Context & context_) : query_ptr(query_ptr_), context(context_) @@ -78,6 +82,9 @@ BlockIO InterpreterRenameQuery::executeToTables(const ASTRenameQuery & rename, c DatabasePtr database = database_catalog.getDatabase(elem.from_database_name); if (typeid_cast(database.get()) && context.getClientInfo().query_kind != ClientInfo::QueryKind::REPLICATED_LOG_QUERY) { + if (1 < descriptions.size()) + throw Exception(ErrorCodes::NOT_IMPLEMENTED, "Database {} is Replicated, " + "it does not support renaming of multiple tables in single query.", elem.from_database_name); return typeid_cast(database.get())->propose(query_ptr); } else diff --git a/src/Interpreters/executeDDLQueryOnCluster.cpp b/src/Interpreters/executeDDLQueryOnCluster.cpp index cf801caed04..fb155e82926 100644 --- a/src/Interpreters/executeDDLQueryOnCluster.cpp +++ b/src/Interpreters/executeDDLQueryOnCluster.cpp @@ -31,12 +31,13 @@ namespace ErrorCodes bool isSupportedAlterType(int type) { + assert(type != ASTAlterCommand::NO_TYPE); static const std::unordered_set unsupported_alter_types{ + /// It's dangerous, because it may duplicate data if executed on multiple replicas ASTAlterCommand::ATTACH_PARTITION, - ASTAlterCommand::REPLACE_PARTITION, + /// Usually followed by ATTACH PARTITION ASTAlterCommand::FETCH_PARTITION, - ASTAlterCommand::FREEZE_PARTITION, - ASTAlterCommand::FREEZE_ALL, + /// Logical error ASTAlterCommand::NO_TYPE, }; diff --git a/src/Parsers/ASTAlterQuery.cpp b/src/Parsers/ASTAlterQuery.cpp index 8a44dcc7c3b..f24b26d5b54 100644 --- a/src/Parsers/ASTAlterQuery.cpp +++ b/src/Parsers/ASTAlterQuery.cpp @@ -344,7 +344,7 @@ void ASTAlterCommand::formatImpl( throw Exception("Unexpected type of ALTER", ErrorCodes::UNEXPECTED_AST_STRUCTURE); } -bool ASTAlterQuery::isSettingsAlter() const +bool ASTAlterQuery::isOneCommandTypeOnly(const ASTAlterCommand::Type & type) const { if (command_list) { @@ -353,7 +353,7 @@ bool ASTAlterQuery::isSettingsAlter() const for (const auto & child : command_list->children) { const auto & command = child->as(); - if (command.type != ASTAlterCommand::MODIFY_SETTING) + if (command.type != type) return false; } return true; @@ -361,6 +361,16 @@ bool ASTAlterQuery::isSettingsAlter() const return false; } +bool ASTAlterQuery::isSettingsAlter() const +{ + return isOneCommandTypeOnly(ASTAlterCommand::MODIFY_SETTING); +} + +bool ASTAlterQuery::isFreezeAlter() const +{ + return isOneCommandTypeOnly(ASTAlterCommand::FREEZE_PARTITION) || 
isOneCommandTypeOnly(ASTAlterCommand::FREEZE_ALL); +} + /** Get the text that identifies this element. */ String ASTAlterQuery::getID(char delim) const { diff --git a/src/Parsers/ASTAlterQuery.h b/src/Parsers/ASTAlterQuery.h index f53a987905e..4cc01aa889e 100644 --- a/src/Parsers/ASTAlterQuery.h +++ b/src/Parsers/ASTAlterQuery.h @@ -189,6 +189,8 @@ public: bool isSettingsAlter() const; + bool isFreezeAlter() const; + String getID(char) const override; ASTPtr clone() const override; @@ -200,6 +202,8 @@ public: protected: void formatQueryImpl(const FormatSettings & settings, FormatState & state, FormatStateStacked frame) const override; + + bool isOneCommandTypeOnly(const ASTAlterCommand::Type & type) const; }; } diff --git a/src/Storages/StorageMaterializedView.cpp b/src/Storages/StorageMaterializedView.cpp index af00b37b1d5..29aea3e6150 100644 --- a/src/Storages/StorageMaterializedView.cpp +++ b/src/Storages/StorageMaterializedView.cpp @@ -89,6 +89,7 @@ StorageMaterializedView::StorageMaterializedView( else { /// We will create a query to create an internal table. + auto create_context = Context(local_context); auto manual_create_query = std::make_shared(); manual_create_query->database = getStorageID().database_name; manual_create_query->table = generateInnerTableName(getStorageID()); @@ -99,7 +100,7 @@ StorageMaterializedView::StorageMaterializedView( manual_create_query->set(manual_create_query->columns_list, new_columns_list); manual_create_query->set(manual_create_query->storage, query.storage->ptr()); - InterpreterCreateQuery create_interpreter(manual_create_query, local_context); + InterpreterCreateQuery create_interpreter(manual_create_query, create_context); create_interpreter.setInternal(true); create_interpreter.execute(); @@ -205,7 +206,8 @@ static void executeDropQuery(ASTDropQuery::Kind kind, Context & global_context, drop_query->no_delay = no_delay; drop_query->if_exists = true; ASTPtr ast_drop_query = drop_query; - InterpreterDropQuery drop_interpreter(ast_drop_query, global_context); + auto drop_context = Context(global_context); + InterpreterDropQuery drop_interpreter(ast_drop_query, drop_context); drop_interpreter.execute(); } } diff --git a/tests/clickhouse-test b/tests/clickhouse-test index d5c6019d28f..13e7b4be001 100755 --- a/tests/clickhouse-test +++ b/tests/clickhouse-test @@ -162,7 +162,12 @@ def run_single_test(args, ext, server_logs_level, client_options, case_file, std while (datetime.now() - start_time).total_seconds() < args.timeout and proc.poll() is None: sleep(0.01) - if not args.database: + need_drop_database = not args.database + if need_drop_database and args.no_drop_if_fail: + maybe_passed = (proc.returncode == 0) and (proc.stderr is None) and (proc.stdout is None or 'Exception' not in proc.stdout) + need_drop_database = not maybe_passed + + if need_drop_database: clickhouse_proc_create = Popen(shlex.split(args.client), stdin=PIPE, stdout=PIPE, stderr=PIPE, universal_newlines=True) seconds_left = max(args.timeout - (datetime.now() - start_time).total_seconds(), 10) try: @@ -181,9 +186,10 @@ def run_single_test(args, ext, server_logs_level, client_options, case_file, std total_time = (datetime.now() - start_time).total_seconds() - # Normalize randomized database names in stdout, stderr files. 
- os.system("LC_ALL=C sed -i -e 's/{test_db}/default/g' {file}".format(test_db=database, file=stdout_file)) - os.system("LC_ALL=C sed -i -e 's/{test_db}/default/g' {file}".format(test_db=database, file=stderr_file)) + if not args.show_db_name: + # Normalize randomized database names in stdout, stderr files. + os.system("LC_ALL=C sed -i -e 's/{test_db}/default/g' {file}".format(test_db=database, file=stdout_file)) + os.system("LC_ALL=C sed -i -e 's/{test_db}/default/g' {file}".format(test_db=database, file=stderr_file)) stdout = open(stdout_file, 'rb').read() if os.path.exists(stdout_file) else b'' stdout = str(stdout, errors='replace', encoding='utf-8') @@ -884,6 +890,8 @@ if __name__ == '__main__': parser.add_argument('--hung-check', action='store_true', default=False) parser.add_argument('--force-color', action='store_true', default=False) parser.add_argument('--database', help='Database for tests (random name test_XXXXXX by default)') + parser.add_argument('--no-drop-if-fail', action='store_true', help='Do not drop database for test if test has failed') + parser.add_argument('--show-db-name', action='store_true', help='Do not replace random database name with "default"') parser.add_argument('--parallel', default='1/1', help='One parallel test run number/total') parser.add_argument('-j', '--jobs', default=1, nargs='?', type=int, help='Run all tests in parallel') parser.add_argument('-U', '--unified', default=3, type=int, help='output NUM lines of unified context') From 9c7881f4c9dba5ce9fe241603368228fc87e9420 Mon Sep 17 00:00:00 2001 From: kssenii Date: Thu, 28 Jan 2021 09:22:01 +0000 Subject: [PATCH 060/381] Fix --- .../AggregateFunctionFactory.cpp | 7 +++++- src/DataTypes/DataTypeFactory.cpp | 23 +++++++++++-------- src/Functions/FunctionFactory.cpp | 3 ++- ...56_test_query_log_factories_info.reference | 10 ++++---- .../01656_test_query_log_factories_info.sql | 4 +++- 5 files changed, 29 insertions(+), 18 deletions(-) diff --git a/src/AggregateFunctions/AggregateFunctionFactory.cpp b/src/AggregateFunctions/AggregateFunctionFactory.cpp index 5fc690d59f2..53fc895849b 100644 --- a/src/AggregateFunctions/AggregateFunctionFactory.cpp +++ b/src/AggregateFunctions/AggregateFunctionFactory.cpp @@ -98,6 +98,7 @@ AggregateFunctionPtr AggregateFunctionFactory::getImpl( bool has_null_arguments) const { String name = getAliasToOrName(name_param); + bool is_case_insensitive = false; Value found; /// Find by exact match. @@ -107,7 +108,10 @@ AggregateFunctionPtr AggregateFunctionFactory::getImpl( } if (auto jt = case_insensitive_aggregate_functions.find(Poco::toLower(name)); jt != case_insensitive_aggregate_functions.end()) + { found = jt->second; + is_case_insensitive = true; + } const Context * query_context = nullptr; if (CurrentThread::isInitialized()) @@ -118,7 +122,8 @@ AggregateFunctionPtr AggregateFunctionFactory::getImpl( out_properties = found.properties; if (query_context && query_context->getSettingsRef().log_queries) - query_context->addQueryFactoriesInfo(Context::QueryLogFactories::AggregateFunction, name); + query_context->addQueryFactoriesInfo( + Context::QueryLogFactories::AggregateFunction, is_case_insensitive ? Poco::toLower(name) : name); /// The case when aggregate function should return NULL on NULL arguments. This case is handled in "get" method. 
if (!out_properties.returns_default_when_only_null && has_null_arguments) diff --git a/src/DataTypes/DataTypeFactory.cpp b/src/DataTypes/DataTypeFactory.cpp index 2f100202ee9..1bc2a307915 100644 --- a/src/DataTypes/DataTypeFactory.cpp +++ b/src/DataTypes/DataTypeFactory.cpp @@ -78,16 +78,7 @@ DataTypePtr DataTypeFactory::get(const String & family_name_param, const ASTPtr return get("LowCardinality", low_cardinality_params); } - DataTypePtr res = findCreatorByName(family_name)(parameters); - - if (CurrentThread::isInitialized()) - { - const auto * query_context = CurrentThread::get().getQueryContext(); - if (query_context && query_context->getSettingsRef().log_queries) - query_context->addQueryFactoriesInfo(Context::QueryLogFactories::DataType, family_name); - } - - return res; + return findCreatorByName(family_name)(parameters); } DataTypePtr DataTypeFactory::getCustom(DataTypeCustomDescPtr customization) const @@ -159,10 +150,18 @@ void DataTypeFactory::registerSimpleDataTypeCustom(const String &name, SimpleCre const DataTypeFactory::Value & DataTypeFactory::findCreatorByName(const String & family_name) const { + const Context * query_context = nullptr; + if (CurrentThread::isInitialized()) + query_context = CurrentThread::get().getQueryContext(); + { DataTypesDictionary::const_iterator it = data_types.find(family_name); if (data_types.end() != it) + { + if (query_context && query_context->getSettingsRef().log_queries) + query_context->addQueryFactoriesInfo(Context::QueryLogFactories::DataType, family_name); return it->second; + } } String family_name_lowercase = Poco::toLower(family_name); @@ -170,7 +169,11 @@ const DataTypeFactory::Value & DataTypeFactory::findCreatorByName(const String & { DataTypesDictionary::const_iterator it = case_insensitive_data_types.find(family_name_lowercase); if (case_insensitive_data_types.end() != it) + { + if (query_context && query_context->getSettingsRef().log_queries) + query_context->addQueryFactoriesInfo(Context::QueryLogFactories::DataType, family_name_lowercase); return it->second; + } } auto hints = this->getHints(family_name); diff --git a/src/Functions/FunctionFactory.cpp b/src/Functions/FunctionFactory.cpp index 768f1cfe487..e98cb543df6 100644 --- a/src/Functions/FunctionFactory.cpp +++ b/src/Functions/FunctionFactory.cpp @@ -92,7 +92,8 @@ FunctionOverloadResolverImplPtr FunctionFactory::tryGetImpl( res = it->second(context); else { - it = case_insensitive_functions.find(Poco::toLower(name)); + name = Poco::toLower(name); + it = case_insensitive_functions.find(name); if (case_insensitive_functions.end() != it) res = it->second(context); } diff --git a/tests/queries/0_stateless/01656_test_query_log_factories_info.reference b/tests/queries/0_stateless/01656_test_query_log_factories_info.reference index 3c93cd9ec26..77486e99ea5 100644 --- a/tests/queries/0_stateless/01656_test_query_log_factories_info.reference +++ b/tests/queries/0_stateless/01656_test_query_log_factories_info.reference @@ -1,8 +1,8 @@ -2 worl [123,1,1] 49 \N 50 4950 Nullable(UInt64) 50 -2 worl [123,1,1] 49 \N 50 4950 Nullable(UInt64) 50 +2 worl 1 0.7615946626193841 0 4950 99 [123,1,1] 49 \N 50 4950 Nullable(UInt64) 50 +2 worl 1 0.7615946626193841 0 4950 99 [123,1,1] 49 \N 50 4950 Nullable(UInt64) 50 arraySort(used_aggregate_functions) -['avg','count','groupBitAnd','sum','uniq'] +['avg','count','groupBitAnd','max','sum','uniq'] arraySort(used_aggregate_function_combinators) ['Array','If','OrDefault','OrNull'] @@ -11,7 +11,7 @@ arraySort(used_table_functions) ['numbers'] 
arraySort(used_functions) -['addDays','array','arrayFlatten','cast','modulo','plus','substring','toDate','toDayOfYear','toTypeName','toWeek'] +['addDays','array','arrayFlatten','cast','crc32','modulo','plus','pow','substring','tanh','toDate','toDayOfYear','toTypeName','toWeek'] arraySort(used_data_type_families) ['Array','Int32','Nullable','String'] @@ -20,5 +20,5 @@ used_database_engines ['Atomic'] arraySort(used_data_type_families) used_storages -['DateTime','Int64'] ['Memory'] +['Int64','datetime'] ['Memory'] diff --git a/tests/queries/0_stateless/01656_test_query_log_factories_info.sql b/tests/queries/0_stateless/01656_test_query_log_factories_info.sql index aa9bdd42a71..0856681e9c5 100644 --- a/tests/queries/0_stateless/01656_test_query_log_factories_info.sql +++ b/tests/queries/0_stateless/01656_test_query_log_factories_info.sql @@ -1,5 +1,7 @@ SELECT uniqArray([1, 1, 2]), SUBSTRING('Hello, world', 7, 5), + POW(1, 2), TANh(1), CrC32(''), + SUM(number), MAX(number), flatten([[[BIT_AND(123)]], [[mod(3, 2)], [CAST('1' AS INTEGER)]]]), week(toDate('2000-12-05')), CAST(arrayJoin([NULL, NULL]) AS Nullable(TEXT)), @@ -47,7 +49,7 @@ WHERE current_database = currentDatabase() AND type == 'QueryFinish' AND (query ORDER BY query_start_time DESC LIMIT 1 FORMAT TabSeparatedWithNames; SELECT ''; -CREATE OR REPLACE TABLE test_query_log_factories_info1.memory_table (id BIGINT, date DateTime) ENGINE=Memory(); +CREATE OR REPLACE TABLE test_query_log_factories_info1.memory_table (id BIGINT, date DATETIME) ENGINE=Memory(); SYSTEM FLUSH LOGS; SELECT arraySort(used_data_type_families), used_storages From 65c061de4978f83c048cfd4c0292a81510ae7bfb Mon Sep 17 00:00:00 2001 From: kssenii Date: Thu, 28 Jan 2021 13:28:11 +0000 Subject: [PATCH 061/381] FFix --- .../01656_test_query_log_factories_info.reference | 6 +++--- .../0_stateless/01656_test_query_log_factories_info.sql | 2 +- 2 files changed, 4 insertions(+), 4 deletions(-) diff --git a/tests/queries/0_stateless/01656_test_query_log_factories_info.reference b/tests/queries/0_stateless/01656_test_query_log_factories_info.reference index 77486e99ea5..e12ee221a7b 100644 --- a/tests/queries/0_stateless/01656_test_query_log_factories_info.reference +++ b/tests/queries/0_stateless/01656_test_query_log_factories_info.reference @@ -1,5 +1,5 @@ -2 worl 1 0.7615946626193841 0 4950 99 [123,1,1] 49 \N 50 4950 Nullable(UInt64) 50 -2 worl 1 0.7615946626193841 0 4950 99 [123,1,1] 49 \N 50 4950 Nullable(UInt64) 50 +2 worl 1 1 0 4950 99 [123,1,1] 49 \N 50 4950 Nullable(UInt64) 50 +2 worl 1 1 0 4950 99 [123,1,1] 49 \N 50 4950 Nullable(UInt64) 50 arraySort(used_aggregate_functions) ['avg','count','groupBitAnd','max','sum','uniq'] @@ -11,7 +11,7 @@ arraySort(used_table_functions) ['numbers'] arraySort(used_functions) -['addDays','array','arrayFlatten','cast','crc32','modulo','plus','pow','substring','tanh','toDate','toDayOfYear','toTypeName','toWeek'] +['addDays','array','arrayFlatten','cast','crc32','modulo','plus','pow','round','substring','tanh','toDate','toDayOfYear','toTypeName','toWeek'] arraySort(used_data_type_families) ['Array','Int32','Nullable','String'] diff --git a/tests/queries/0_stateless/01656_test_query_log_factories_info.sql b/tests/queries/0_stateless/01656_test_query_log_factories_info.sql index 0856681e9c5..b584f2c38c8 100644 --- a/tests/queries/0_stateless/01656_test_query_log_factories_info.sql +++ b/tests/queries/0_stateless/01656_test_query_log_factories_info.sql @@ -1,6 +1,6 @@ SELECT uniqArray([1, 1, 2]), SUBSTRING('Hello, world', 7, 5), - POW(1, 
2), TANh(1), CrC32(''), + POW(1, 2), ROUND(TANh(1)), CrC32(''), SUM(number), MAX(number), flatten([[[BIT_AND(123)]], [[mod(3, 2)], [CAST('1' AS INTEGER)]]]), week(toDate('2000-12-05')), From 52e5c0aad748b6ee55a97380abddf0ceb12aa864 Mon Sep 17 00:00:00 2001 From: Alexander Tokmakov Date: Thu, 28 Jan 2021 16:48:17 +0300 Subject: [PATCH 062/381] fix thread status --- src/Common/CurrentThread.h | 7 +++--- src/Common/ThreadStatus.cpp | 3 +++ src/Common/ThreadStatus.h | 2 +- src/Interpreters/DDLWorker.cpp | 24 +++++---------------- src/Interpreters/DDLWorker.h | 3 --- src/Interpreters/InterpreterCreateQuery.cpp | 3 ++- src/Interpreters/ThreadStatusExt.cpp | 2 ++ src/Interpreters/executeQuery.cpp | 9 ++------ src/Server/MySQLHandler.cpp | 6 +++++- src/Server/PostgreSQLHandler.cpp | 7 +++++- src/Storages/StorageReplicatedMergeTree.cpp | 2 +- 11 files changed, 31 insertions(+), 37 deletions(-) diff --git a/src/Common/CurrentThread.h b/src/Common/CurrentThread.h index 876cbd8a66b..7ab57ea7fab 100644 --- a/src/Common/CurrentThread.h +++ b/src/Common/CurrentThread.h @@ -63,9 +63,6 @@ public: /// Call from master thread as soon as possible (e.g. when thread accepted connection) static void initializeQuery(); - /// Sets query_context for current thread group - static void attachQueryContext(Context & query_context); - /// You must call one of these methods when create a query child thread: /// Add current thread to a group associated with the thread group static void attachTo(const ThreadGroupStatusPtr & thread_group); @@ -99,6 +96,10 @@ public: private: static void defaultThreadDeleter(); + + /// Sets query_context for current thread group + /// Can by used only through QueryScope + static void attachQueryContext(Context & query_context); }; } diff --git a/src/Common/ThreadStatus.cpp b/src/Common/ThreadStatus.cpp index 5105fff03b2..f2256fbf192 100644 --- a/src/Common/ThreadStatus.cpp +++ b/src/Common/ThreadStatus.cpp @@ -99,6 +99,9 @@ ThreadStatus::~ThreadStatus() /// We've already allocated a little bit more than the limit and cannot track it in the thread memory tracker or its parent. 
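// ---------------------------------------------------------------------------
// Illustrative sketch, not part of the patch: with attachQueryContext() made
// private above, the supported way to bind a query context to the current
// thread is the RAII QueryScope helper. Assuming query_context is a Context
// prepared by the caller and in/out are the client buffers:
//
//     {
//         CurrentThread::QueryScope query_scope{query_context};  // attach to this thread
//         executeQuery(in, out, /* internal = */ false, query_context, {});
//     }   // query_scope is destroyed here and the thread detaches from the query
//
// The assert added to ~ThreadStatus() just below guards the matching invariant:
// a thread must not be destroyed while still attached to a query whose context
// may already have been destroyed.
// ---------------------------------------------------------------------------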
} + /// It may cause segfault if query_context was destroyed, but was not detached + assert((!query_context && query_id.empty()) || (query_id == query_context->getCurrentQueryId())); + if (deleter) deleter(); current_thread = nullptr; diff --git a/src/Common/ThreadStatus.h b/src/Common/ThreadStatus.h index 1be1f2cd4df..dc5f09c5f3d 100644 --- a/src/Common/ThreadStatus.h +++ b/src/Common/ThreadStatus.h @@ -201,7 +201,7 @@ public: void setFatalErrorCallback(std::function callback); void onFatalError(); - /// Sets query context for current thread and its thread group + /// Sets query context for current master thread and its thread group /// NOTE: query_context have to be alive until detachQuery() is called void attachQueryContext(Context & query_context); diff --git a/src/Interpreters/DDLWorker.cpp b/src/Interpreters/DDLWorker.cpp index cb38c733582..83412ab8fb7 100644 --- a/src/Interpreters/DDLWorker.cpp +++ b/src/Interpreters/DDLWorker.cpp @@ -202,11 +202,12 @@ void DDLWorker::shutdown() queue_updated_event->set(); cleanup_event->set(); - worker_pool.reset(); if (main_thread.joinable()) main_thread.join(); if (cleanup_thread.joinable()) cleanup_thread.join(); + + worker_pool.reset(); } DDLWorker::~DDLWorker() @@ -355,8 +356,6 @@ void DDLWorker::scheduleTasks() if (!task) { LOG_DEBUG(log, "Will not execute task {}: {}", entry_name, reason); - //task->was_executed = true; - //saveTask(std::move(task)); continue; } @@ -379,7 +378,7 @@ void DDLWorker::scheduleTasks() DDLTaskBase & DDLWorker::saveTask(DDLTaskPtr && task) { - std::remove_if(current_tasks.begin(), current_tasks.end(), [](const DDLTaskPtr & t) { return t->completely_processed.load(); }); + current_tasks.remove_if([](const DDLTaskPtr & t) { return t->completely_processed.load(); }); assert(current_tasks.size() <= pool_size); current_tasks.emplace_back(std::move(task)); return *current_tasks.back(); @@ -394,10 +393,12 @@ bool DDLWorker::tryExecuteQuery(const String & query, DDLTaskBase & task) ReadBufferFromString istr(query_to_execute); String dummy_string; WriteBufferFromString ostr(dummy_string); + std::optional query_scope; try { auto query_context = task.makeQueryContext(context); + query_scope.emplace(*query_context); executeQuery(istr, ostr, false, *query_context, {}); } catch (const DB::Exception & e) @@ -433,20 +434,6 @@ bool DDLWorker::tryExecuteQuery(const String & query, DDLTaskBase & task) return true; } -void DDLWorker::attachToThreadGroup() -{ - if (thread_group) - { - /// Put all threads to one thread pool - CurrentThread::attachToIfDetached(thread_group); - } - else - { - CurrentThread::initializeQuery(); - thread_group = CurrentThread::getGroup(); - } -} - void DDLWorker::processTask(DDLTaskBase & task) { auto zookeeper = tryGetZooKeeper(); @@ -909,7 +896,6 @@ void DDLWorker::runMainThread() }; setThreadName("DDLWorker"); - attachToThreadGroup(); LOG_DEBUG(log, "Starting DDLWorker thread"); while (!stop_flag) diff --git a/src/Interpreters/DDLWorker.h b/src/Interpreters/DDLWorker.h index c0194c4f252..1b7ebfb5796 100644 --- a/src/Interpreters/DDLWorker.h +++ b/src/Interpreters/DDLWorker.h @@ -102,8 +102,6 @@ protected: void runMainThread(); void runCleanupThread(); - void attachToThreadGroup(); - protected: Context context; Poco::Logger * log; @@ -138,7 +136,6 @@ protected: /// How many tasks could be in the queue size_t max_tasks_in_queue = 1000; - ThreadGroupStatusPtr thread_group; std::atomic max_id = 0; }; diff --git a/src/Interpreters/InterpreterCreateQuery.cpp b/src/Interpreters/InterpreterCreateQuery.cpp index 
b66af77930c..5292ef57d7a 100644 --- a/src/Interpreters/InterpreterCreateQuery.cpp +++ b/src/Interpreters/InterpreterCreateQuery.cpp @@ -929,7 +929,8 @@ bool InterpreterCreateQuery::doCreateTable(ASTCreateQuery & create, drop_ast->table = create.table; drop_ast->no_ddl_lock = true; - InterpreterDropQuery interpreter(drop_ast, context); + Context drop_context = context; + InterpreterDropQuery interpreter(drop_ast, drop_context); interpreter.execute(); } else diff --git a/src/Interpreters/ThreadStatusExt.cpp b/src/Interpreters/ThreadStatusExt.cpp index 61322cabfb3..8a979721290 100644 --- a/src/Interpreters/ThreadStatusExt.cpp +++ b/src/Interpreters/ThreadStatusExt.cpp @@ -500,6 +500,8 @@ CurrentThread::QueryScope::QueryScope(Context & query_context) { CurrentThread::initializeQuery(); CurrentThread::attachQueryContext(query_context); + if (!query_context.hasQueryContext()) + query_context.makeQueryContext(); } void CurrentThread::QueryScope::logPeakMemoryUsage() diff --git a/src/Interpreters/executeQuery.cpp b/src/Interpreters/executeQuery.cpp index 7003e6f5ee9..770e6e65d24 100644 --- a/src/Interpreters/executeQuery.cpp +++ b/src/Interpreters/executeQuery.cpp @@ -326,13 +326,8 @@ static std::tuple executeQueryImpl( { const auto current_time = std::chrono::system_clock::now(); - /// If we already executing query and it requires to execute internal query, than - /// don't replace thread context with given (it can be temporary). Otherwise, attach context to thread. - if (!internal) - { - context.makeQueryContext(); - CurrentThread::attachQueryContext(context); - } + assert(internal || CurrentThread::get().getQueryContext()); + assert(internal || CurrentThread::get().getQueryContext()->getCurrentQueryId() == CurrentThread::getQueryId()); const Settings & settings = context.getSettingsRef(); diff --git a/src/Server/MySQLHandler.cpp b/src/Server/MySQLHandler.cpp index 63a48fde1a7..f660d97cdc6 100644 --- a/src/Server/MySQLHandler.cpp +++ b/src/Server/MySQLHandler.cpp @@ -24,6 +24,7 @@ #include #include #include +#include #if !defined(ARCADIA_BUILD) # include @@ -86,6 +87,8 @@ MySQLHandler::MySQLHandler(IServer & server_, const Poco::Net::StreamSocket & so void MySQLHandler::run() { + setThreadName("MySQLHandler"); + ThreadStatus thread_status; connection_context.makeSessionContext(); connection_context.getClientInfo().interface = ClientInfo::Interface::MYSQL; connection_context.setDefaultFormat("MySQLWire"); @@ -339,8 +342,9 @@ void MySQLHandler::comQuery(ReadBuffer & payload) affected_rows += progress.written_rows; }); + CurrentThread::QueryScope query_scope{query_context}; - executeQuery(should_replace ? replacement : payload, *out, true, query_context, + executeQuery(should_replace ? 
replacement : payload, *out, false, query_context, [&with_output](const String &, const String &, const String &, const String &) { with_output = true; diff --git a/src/Server/PostgreSQLHandler.cpp b/src/Server/PostgreSQLHandler.cpp index 2bce5abcd11..b3a3bbf2aaa 100644 --- a/src/Server/PostgreSQLHandler.cpp +++ b/src/Server/PostgreSQLHandler.cpp @@ -5,6 +5,7 @@ #include #include "PostgreSQLHandler.h" #include +#include #include #if !defined(ARCADIA_BUILD) @@ -49,6 +50,8 @@ void PostgreSQLHandler::changeIO(Poco::Net::StreamSocket & socket) void PostgreSQLHandler::run() { + setThreadName("PostgresHandler"); + ThreadStatus thread_status; connection_context.makeSessionContext(); connection_context.getClientInfo().interface = ClientInfo::Interface::POSTGRESQL; connection_context.setDefaultFormat("PostgreSQLWire"); @@ -273,8 +276,10 @@ void PostgreSQLHandler::processQuery() for (const auto & spl_query : queries) { + /// FIXME why do we execute all queries in a single connection context? + CurrentThread::QueryScope query_scope{connection_context}; ReadBufferFromString read_buf(spl_query); - executeQuery(read_buf, *out, true, connection_context, {}); + executeQuery(read_buf, *out, false, connection_context, {}); PostgreSQLProtocol::Messaging::CommandComplete::Command command = PostgreSQLProtocol::Messaging::CommandComplete::classifyQuery(spl_query); diff --git a/src/Storages/StorageReplicatedMergeTree.cpp b/src/Storages/StorageReplicatedMergeTree.cpp index 30b08cdea1e..951ce63944b 100644 --- a/src/Storages/StorageReplicatedMergeTree.cpp +++ b/src/Storages/StorageReplicatedMergeTree.cpp @@ -3682,7 +3682,7 @@ void StorageReplicatedMergeTree::shutdown() /// We clear all old parts after stopping all background operations. It's /// important, because background operations can produce temporary parts - /// which will remove themselves in their descrutors. If so, we may have + /// which will remove themselves in their destrutors. If so, we may have /// race condition between our remove call and background process. 
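// ---------------------------------------------------------------------------
// Illustrative sketch, not part of the patch: the MySQLHandler and
// PostgreSQLHandler hunks above switch both protocol handlers to the same
// shape: one ThreadStatus per connection thread, one CurrentThread::QueryScope
// per executed query, and executeQuery() is no longer called as an internal
// query. Condensed (handler and member names are placeholders):
//
//     void SomeHandler::run()
//     {
//         setThreadName("SomeHandler");
//         ThreadStatus thread_status;          // per-connection thread state
//         ...                                  // session setup, auth, main loop
//     }
//
//     void SomeHandler::processQuery(ReadBuffer & payload)
//     {
//         CurrentThread::QueryScope query_scope{query_context};
//         executeQuery(payload, *out, /* internal = */ false, query_context, {});
//     }
//
// ---------------------------------------------------------------------------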
clearOldPartsFromFilesystem(true); } From a57456a3fd21829d22635df01404f7383ece545d Mon Sep 17 00:00:00 2001 From: Alexander Tokmakov Date: Thu, 28 Jan 2021 22:02:39 +0300 Subject: [PATCH 063/381] fix --- src/Interpreters/DDLTask.h | 1 + src/Interpreters/DDLWorker.cpp | 6 ++++++ src/Interpreters/InterpreterCreateQuery.cpp | 6 +++++- 3 files changed, 12 insertions(+), 1 deletion(-) diff --git a/src/Interpreters/DDLTask.h b/src/Interpreters/DDLTask.h index a12676ab8a3..5b50413b975 100644 --- a/src/Interpreters/DDLTask.h +++ b/src/Interpreters/DDLTask.h @@ -158,6 +158,7 @@ struct MetadataTransaction void addOps(Coordination::Requests & other_ops) { std::move(ops.begin(), ops.end(), std::back_inserter(other_ops)); + ops.clear(); } void commit(); diff --git a/src/Interpreters/DDLWorker.cpp b/src/Interpreters/DDLWorker.cpp index 83412ab8fb7..7b9d3ef8f5b 100644 --- a/src/Interpreters/DDLWorker.cpp +++ b/src/Interpreters/DDLWorker.cpp @@ -400,6 +400,12 @@ bool DDLWorker::tryExecuteQuery(const String & query, DDLTaskBase & task) auto query_context = task.makeQueryContext(context); query_scope.emplace(*query_context); executeQuery(istr, ostr, false, *query_context, {}); + + if (auto txn = query_context->getMetadataTransaction()) + { + if (txn->state == MetadataTransaction::CREATED) + txn->commit(); + } } catch (const DB::Exception & e) { diff --git a/src/Interpreters/InterpreterCreateQuery.cpp b/src/Interpreters/InterpreterCreateQuery.cpp index 5292ef57d7a..926737ef888 100644 --- a/src/Interpreters/InterpreterCreateQuery.cpp +++ b/src/Interpreters/InterpreterCreateQuery.cpp @@ -800,11 +800,11 @@ BlockIO InterpreterCreateQuery::createTable(ASTCreateQuery & create) String current_database = context.getCurrentDatabase(); auto database_name = create.database.empty() ? 
current_database : create.database; - auto database = DatabaseCatalog::instance().getDatabase(database_name); // If this is a stub ATTACH query, read the query definition from the database if (create.attach && !create.storage && !create.columns_list) { + auto database = DatabaseCatalog::instance().getDatabase(database_name); bool if_not_exists = create.if_not_exists; // Table SQL definition is available even if the table is detached (even permanently) @@ -869,7 +869,11 @@ BlockIO InterpreterCreateQuery::createTable(ASTCreateQuery & create) } //TODO make code better if possible + DatabasePtr database; bool need_add_to_database = !create.temporary; + if (need_add_to_database) + database = DatabaseCatalog::instance().getDatabase(database_name); + if (need_add_to_database && database->getEngineName() == "Replicated") { auto guard = DatabaseCatalog::instance().getDDLGuard(create.database, create.table); From 2d0f742fdab2504402432580fda1b1f182aee4c7 Mon Sep 17 00:00:00 2001 From: Evgeniia Sudarikova Date: Thu, 28 Jan 2021 23:16:29 +0300 Subject: [PATCH 064/381] edited EN docs --- .../example-datasets/brown-benchmark.md | 6 +- .../functions/array-functions.md | 105 +++++++++++++++++- .../en/sql-reference/table-functions/mysql.md | 2 +- 3 files changed, 104 insertions(+), 9 deletions(-) diff --git a/docs/en/getting-started/example-datasets/brown-benchmark.md b/docs/en/getting-started/example-datasets/brown-benchmark.md index b5ca23eddb9..effae6d5adb 100644 --- a/docs/en/getting-started/example-datasets/brown-benchmark.md +++ b/docs/en/getting-started/example-datasets/brown-benchmark.md @@ -5,7 +5,7 @@ toc_title: Brown University Benchmark # Brown University Benchmark -MgBench - A new analytical benchmark for machine-generated log data, [Andrew Crotty](http://cs.brown.edu/people/acrotty/). +`MgBench` is a new analytical benchmark for machine-generated log data, [Andrew Crotty](http://cs.brown.edu/people/acrotty/). Download the data: ``` @@ -153,7 +153,7 @@ ORDER BY dt, hr; --- Q1.4: Over a 1-month period, how often was each server blocked on disk I/O? +-- Q1.4: Over 1 month, how often was each server blocked on disk I/O? SELECT machine_name, COUNT(*) AS spikes @@ -301,7 +301,7 @@ WHERE event_type = 'temperature' AND log_time >= '2019-11-29 17:00:00.000'; --- Q3.4: Over the past 6 months, how frequently was each door opened? +-- Q3.4: Over the past 6 months, how frequently were each door opened? SELECT device_name, device_floor, diff --git a/docs/en/sql-reference/functions/array-functions.md b/docs/en/sql-reference/functions/array-functions.md index dc7727bdfd8..48c5176f0e1 100644 --- a/docs/en/sql-reference/functions/array-functions.md +++ b/docs/en/sql-reference/functions/array-functions.md @@ -1294,15 +1294,47 @@ Returns the min of the `func` values. If the function is omitted, it just return Note that the `arrayMin` is a [higher-order function](../../sql-reference/functions/index.md#higher-order-functions). You can pass a lambda function to it as the first argument. -Examples: -```sql +**Syntax** + +``` sql +arrayMin(arr) +``` + +**Returned value** + +- A number. + +Type: [Int](../../sql-reference/data-types/int-uint.md) or [Float](../../sql-reference/data-types/float.md). + +**Parameters** + +- `arr` — [Array](../../sql-reference/data-types/array.md). 
+ +**Examples** + +Query: + +``` sql SELECT arrayMin([1, 2, 4]) AS res +``` + +Result: + +``` text ┌─res─┐ │ 1 │ └─────┘ +``` +Query: +``` sql SELECT arrayMin(x -> (-x), [1, 2, 4]) AS res +``` + +Result: + +``` text ┌─res─┐ │ -4 │ └─────┘ @@ -1314,15 +1346,47 @@ Returns the max of the `func` values. If the function is omitted, it just return Note that the `arrayMax` is a [higher-order function](../../sql-reference/functions/index.md#higher-order-functions). You can pass a lambda function to it as the first argument. -Examples: +**Syntax** + +``` sql +arrayMax(arr) +``` + +**Returned value** + +- A number. + +Type: [Int](../../sql-reference/data-types/int-uint.md) or [Float](../../sql-reference/data-types/float.md). + +**Parameters** + +- `arr` — [Array](../../sql-reference/data-types/array.md). + +**Examples** + +Query: + ```sql SELECT arrayMax([1, 2, 4]) AS res +``` + +Result: + +``` text ┌─res─┐ │ 4 │ └─────┘ +``` +Query: +``` sql SELECT arrayMax(x -> (-x), [1, 2, 4]) AS res +``` + +Result: + +``` text ┌─res─┐ │ -1 │ └─────┘ @@ -1334,21 +1398,52 @@ Returns the sum of the `func` values. If the function is omitted, it just return Note that the `arraySum` is a [higher-order function](../../sql-reference/functions/index.md#higher-order-functions). You can pass a lambda function to it as the first argument. -Examples: +**Syntax** + +``` sql +arraySum(arr) +``` + +**Returned value** + +- A number. + +Type: [Int](../../sql-reference/data-types/int-uint.md) or [Float](../../sql-reference/data-types/float.md). + +**Parameters** + +- `arr` — [Array](../../sql-reference/data-types/array.md). + +**Examples** + +Query: + ```sql SELECT arraySum([2,3]) AS res +``` + +Result: + +``` text ┌─res─┐ │ 5 │ └─────┘ +``` +Query: +``` sql SELECT arraySum(x -> x*x, [2, 3]) AS res +``` + +Result: + +``` text ┌─res─┐ │ 13 │ └─────┘ ``` - ## arrayAvg(\[func,\] arr1, …) {#array-avg} Returns the average of the `func` values. If the function is omitted, it just returns the average of the array elements. diff --git a/docs/en/sql-reference/table-functions/mysql.md b/docs/en/sql-reference/table-functions/mysql.md index eec4a1d0c46..3126f635817 100644 --- a/docs/en/sql-reference/table-functions/mysql.md +++ b/docs/en/sql-reference/table-functions/mysql.md @@ -44,7 +44,7 @@ The rest of the conditions and the `LIMIT` sampling constraint are executed in C A table object with the same columns as the original MySQL table. !!! info "Note" - In the `INSERT` query to distinguish table function `mysql(...)` from table name with column names list you must use keywords `FUNCTION` or `TABLE FUNCTION`. See examples below. + In the `INSERT` query to distinguish table function `mysql(...)` from table name with column names list, you must use keywords `FUNCTION` or `TABLE FUNCTION`. See examples below. 
**Examples** From 45cb78a67b1ba39fe874817e523a7964751fb7cc Mon Sep 17 00:00:00 2001 From: feng lv Date: Fri, 29 Jan 2021 08:14:34 +0000 Subject: [PATCH 065/381] continue of #19487 fix --- src/Interpreters/TreeRewriter.cpp | 7 +++---- 1 file changed, 3 insertions(+), 4 deletions(-) diff --git a/src/Interpreters/TreeRewriter.cpp b/src/Interpreters/TreeRewriter.cpp index ce4103e97ec..a1d1605afd5 100644 --- a/src/Interpreters/TreeRewriter.cpp +++ b/src/Interpreters/TreeRewriter.cpp @@ -693,18 +693,17 @@ void TreeRewriterResult::collectUsedColumns(const ASTPtr & query, bool is_select if (storage) { - String hint_name{}; + std::vector hint_name{}; for (const auto & name : columns_context.requiredColumns()) { auto hints = storage->getHints(name); - if (!hints.empty()) - hint_name = hint_name + " '" + toString(hints) + "'"; + hint_name.insert(hint_name.end(), hints.begin(), hints.end()); } if (!hint_name.empty()) { ss << ", maybe you meant: "; - ss << hint_name; + ss << toString(hint_name); } } else From 9da445e740b45481da042d6e0264cdbe70245443 Mon Sep 17 00:00:00 2001 From: Alexander Tokmakov Date: Mon, 1 Feb 2021 22:29:47 +0300 Subject: [PATCH 066/381] execute initial query in the same thread --- src/Databases/DatabaseReplicated.cpp | 12 ++-- src/Databases/DatabaseReplicatedWorker.cpp | 68 ++++++++++++++++++--- src/Databases/DatabaseReplicatedWorker.h | 7 ++- src/Interpreters/DDLTask.cpp | 4 +- src/Interpreters/DDLTask.h | 2 +- src/Interpreters/DDLWorker.cpp | 22 ++++++- src/Interpreters/InterpreterAlterQuery.cpp | 3 + src/Interpreters/InterpreterCreateQuery.cpp | 7 ++- src/Interpreters/InterpreterDropQuery.cpp | 33 ++++++---- src/Interpreters/InterpreterRenameQuery.cpp | 5 +- src/Interpreters/InterpreterRenameQuery.h | 3 + tests/clickhouse-test | 4 +- 12 files changed, 128 insertions(+), 42 deletions(-) diff --git a/src/Databases/DatabaseReplicated.cpp b/src/Databases/DatabaseReplicated.cpp index 6f244ed7ec9..44746cd5716 100644 --- a/src/Databases/DatabaseReplicated.cpp +++ b/src/Databases/DatabaseReplicated.cpp @@ -42,9 +42,9 @@ zkutil::ZooKeeperPtr DatabaseReplicated::getZooKeeper() const return global_context.getZooKeeper(); } -static inline String getHostID(const Context & global_context) +static inline String getHostID(const Context & global_context, const UUID & db_uuid) { - return Cluster::Address::toString(getFQDNOrHostName(), global_context.getTCPPort()); + return Cluster::Address::toString(getFQDNOrHostName(), global_context.getTCPPort()) + ':' + toString(db_uuid); } @@ -94,7 +94,7 @@ DatabaseReplicated::DatabaseReplicated( String replica_host_id; if (current_zookeeper->tryGet(replica_path, replica_host_id)) { - String host_id = getHostID(global_context); + String host_id = getHostID(global_context, db_uuid); if (replica_host_id != host_id) throw Exception(ErrorCodes::REPLICA_IS_ALREADY_EXIST, "Replica {} of shard {} of replicated database at {} already exists. Replica host ID: '{}', current host ID: '{}'", @@ -144,7 +144,7 @@ void DatabaseReplicated::createReplicaNodesInZooKeeper(const zkutil::ZooKeeperPt //log_entry_to_execute = 0; //FIXME /// Write host name to replica_path, it will protect from multiple replicas with the same name - auto host_id = getHostID(global_context); + auto host_id = getHostID(global_context, db_uuid); /// On replica creation add empty entry to log. Can be used to trigger some actions on other replicas (e.g. update cluster info). 
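// ---------------------------------------------------------------------------
// Illustrative sketch, not part of the patch: getHostID() above now appends the
// database UUID to the identifier stored under <zookeeper_path>/replicas/<name>,
// so a dropped and re-created database on the same host:port is no longer
// mistaken for the old replica:
//
//     // e.g. "host.example.com:9000:0e1a2b3c-..." (the UUID suffix is what this commit adds)
//     String host_id = Cluster::Address::toString(getFQDNOrHostName(), global_context.getTCPPort())
//         + ':' + toString(db_uuid);
//
// On attach the stored value is compared with the freshly computed one, and a
// mismatch is reported as REPLICA_IS_ALREADY_EXIST, as in the constructor above.
// ---------------------------------------------------------------------------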
DDLLogEntry entry; @@ -221,11 +221,11 @@ BlockIO DatabaseReplicated::propose(const ASTPtr & query) LOG_DEBUG(log, "Proposing query: {}", queryToString(query)); + /// TODO maybe write current settings to log entry? DDLLogEntry entry; - entry.hosts = {}; entry.query = queryToString(query); entry.initiator = ddl_worker->getCommonHostID(); - String node_path = ddl_worker->enqueueQuery(entry); + String node_path = ddl_worker->tryEnqueueAndExecuteEntry(entry); BlockIO io; //FIXME use query context diff --git a/src/Databases/DatabaseReplicatedWorker.cpp b/src/Databases/DatabaseReplicatedWorker.cpp index 0c2368cdcf6..a1cdff204c7 100644 --- a/src/Databases/DatabaseReplicatedWorker.cpp +++ b/src/Databases/DatabaseReplicatedWorker.cpp @@ -8,13 +8,16 @@ namespace DB namespace ErrorCodes { extern const int LOGICAL_ERROR; + extern const int DATABASE_REPLICATION_FAILED; } DatabaseReplicatedDDLWorker::DatabaseReplicatedDDLWorker(DatabaseReplicated * db, const Context & context_) : DDLWorker(/* pool_size */ 1, db->zookeeper_path + "/log", context_, nullptr, {}, fmt::format("DDLWorker({})", db->getDatabaseName())) , database(db) { - /// Pool size must be 1 (to avoid reordering of log entries) + /// Pool size must be 1 to avoid reordering of log entries. + /// TODO Make a dependency graph of DDL queries. It will allow to execute independent entries in parallel. + /// We also need similar graph to load tables on server startup in order of topsort. } void DatabaseReplicatedDDLWorker::initializeMainThread() @@ -72,8 +75,51 @@ String DatabaseReplicatedDDLWorker::enqueueQuery(DDLLogEntry & entry) return node_path; } +String DatabaseReplicatedDDLWorker::tryEnqueueAndExecuteEntry(DDLLogEntry & entry) +{ + auto zookeeper = getAndSetZooKeeper(); + // TODO do not enqueue query if we have big replication lag + + String entry_path = enqueueQuery(entry); + auto try_node = zkutil::EphemeralNodeHolder::existing(entry_path + "/try", *zookeeper); + String entry_name = entry_path.substr(entry_path.rfind('/') + 1); + auto task = std::make_unique(entry_name, entry_path, database); + task->entry = entry; + task->parseQueryFromEntry(context); + assert(!task->entry.query.empty()); + assert(!zookeeper->exists(task->getFinishedNodePath())); + task->is_initial_query = true; + + LOG_DEBUG(log, "Waiting for worker thread to process all entries before {}", entry_name); + { + std::unique_lock lock{mutex}; + wait_current_task_change.wait(lock, [&]() { assert(current_task <= entry_name); return zookeeper->expired() || current_task == entry_name; }); + } + + if (zookeeper->expired()) + throw Exception(ErrorCodes::DATABASE_REPLICATION_FAILED, "ZooKeeper session expired, try again"); + + processTask(*task); + + if (!task->was_executed) + { + throw Exception(ErrorCodes::LOGICAL_ERROR, "Entry {} was executed, but was not committed: code {}: {}", + task->execution_status.code, task->execution_status.message); + } + + try_node->reset(); + + return entry_path; +} + DDLTaskPtr DatabaseReplicatedDDLWorker::initAndCheckTask(const String & entry_name, String & out_reason, const ZooKeeperPtr & zookeeper) { + { + std::lock_guard lock{mutex}; + current_task = entry_name; + wait_current_task_change.notify_all(); + } + UInt32 our_log_ptr = parse(current_zookeeper->get(database->replica_path + "/log_ptr")); UInt32 entry_num = DatabaseReplicatedTask::getLogEntryNumber(entry_name); @@ -91,27 +137,31 @@ DDLTaskPtr DatabaseReplicatedDDLWorker::initAndCheckTask(const String & entry_na if (zookeeper->tryGet(entry_path + "/try", initiator_name, nullptr, 
wait_committed_or_failed)) { - task->we_are_initiator = initiator_name == task->host_id_str; + task->is_initial_query = initiator_name == task->host_id_str; /// Query is not committed yet. We cannot just skip it and execute next one, because reordering may break replication. //FIXME add some timeouts - if (!task->we_are_initiator) - { - LOG_TRACE(log, "Waiting for initiator {} to commit or rollback entry {}", initiator_name, entry_path); - wait_committed_or_failed->wait(); - } + LOG_TRACE(log, "Waiting for initiator {} to commit or rollback entry {}", initiator_name, entry_path); + wait_committed_or_failed->wait(); } - if (!task->we_are_initiator && !zookeeper->exists(entry_path + "/committed")) + if (!zookeeper->exists(entry_path + "/committed")) { out_reason = "Entry " + entry_name + " hasn't been committed"; return {}; } + if (task->is_initial_query) + { + assert(!zookeeper->exists(entry_path + "/try")); + assert(zookeeper->exists(entry_path + "/committed") == (zookeeper->get(task->getFinishedNodePath()) == "0")); + out_reason = "Entry " + entry_name + " has been executed as initial query"; + return {}; + } + String node_data; if (!zookeeper->tryGet(entry_path, node_data)) { LOG_ERROR(log, "Cannot get log entry {}", entry_path); - database->onUnexpectedLogEntry(entry_name, zookeeper); throw Exception(ErrorCodes::LOGICAL_ERROR, "should be unreachable"); } diff --git a/src/Databases/DatabaseReplicatedWorker.h b/src/Databases/DatabaseReplicatedWorker.h index 7994104331e..7e6d64dab0b 100644 --- a/src/Databases/DatabaseReplicatedWorker.h +++ b/src/Databases/DatabaseReplicatedWorker.h @@ -1,7 +1,6 @@ #pragma once #include - namespace DB { @@ -14,6 +13,8 @@ public: String enqueueQuery(DDLLogEntry & entry) override; + String tryEnqueueAndExecuteEntry(DDLLogEntry & entry); + private: void initializeMainThread() override; void initializeReplication(); @@ -21,7 +22,9 @@ private: DDLTaskPtr initAndCheckTask(const String & entry_name, String & out_reason, const ZooKeeperPtr & zookeeper) override; DatabaseReplicated * database; - + mutable std::mutex mutex; + std::condition_variable wait_current_task_change; + String current_task; }; } diff --git a/src/Interpreters/DDLTask.cpp b/src/Interpreters/DDLTask.cpp index fd2de014581..55e613648ae 100644 --- a/src/Interpreters/DDLTask.cpp +++ b/src/Interpreters/DDLTask.cpp @@ -303,9 +303,9 @@ std::unique_ptr DatabaseReplicatedTask::makeQueryContext(Context & from query_context->initMetadataTransaction(txn); txn->current_zookeeper = from_context.getZooKeeper(); txn->zookeeper_path = database->zookeeper_path; - txn->is_initial_query = we_are_initiator; + txn->is_initial_query = is_initial_query; - if (we_are_initiator) + if (is_initial_query) { txn->ops.emplace_back(zkutil::makeRemoveRequest(entry_path + "/try", -1)); txn->ops.emplace_back(zkutil::makeCreateRequest(entry_path + "/committed", host_id_str, zkutil::CreateMode::Persistent)); diff --git a/src/Interpreters/DDLTask.h b/src/Interpreters/DDLTask.h index 5b50413b975..49f6d74a931 100644 --- a/src/Interpreters/DDLTask.h +++ b/src/Interpreters/DDLTask.h @@ -77,6 +77,7 @@ struct DDLTaskBase String host_id_str; ASTPtr query; + bool is_initial_query = false; bool is_circular_replicated = false; bool execute_on_leader = false; @@ -136,7 +137,6 @@ struct DatabaseReplicatedTask : public DDLTaskBase static UInt32 getLogEntryNumber(const String & log_entry_name); DatabaseReplicated * database; - bool we_are_initiator = false; }; diff --git a/src/Interpreters/DDLWorker.cpp b/src/Interpreters/DDLWorker.cpp index 
7b9d3ef8f5b..fabb9f9563e 100644 --- a/src/Interpreters/DDLWorker.cpp +++ b/src/Interpreters/DDLWorker.cpp @@ -51,6 +51,7 @@ namespace ErrorCodes extern const int CANNOT_ASSIGN_ALTER; extern const int CANNOT_ALLOCATE_MEMORY; extern const int MEMORY_LIMIT_EXCEEDED; + extern const int INCORRECT_QUERY; } @@ -398,8 +399,9 @@ bool DDLWorker::tryExecuteQuery(const String & query, DDLTaskBase & task) try { auto query_context = task.makeQueryContext(context); - query_scope.emplace(*query_context); - executeQuery(istr, ostr, false, *query_context, {}); + if (!task.is_initial_query) + query_scope.emplace(*query_context); + executeQuery(istr, ostr, !task.is_initial_query, *query_context, {}); if (auto txn = query_context->getMetadataTransaction()) { @@ -409,6 +411,9 @@ bool DDLWorker::tryExecuteQuery(const String & query, DDLTaskBase & task) } catch (const DB::Exception & e) { + if (task.is_initial_query) + throw; + task.execution_status = ExecutionStatus::fromCurrentException(); tryLogCurrentException(log, "Query " + query + " wasn't finished successfully"); @@ -426,6 +431,9 @@ bool DDLWorker::tryExecuteQuery(const String & query, DDLTaskBase & task) } catch (...) { + if (task.is_initial_query) + throw; + task.execution_status = ExecutionStatus::fromCurrentException(); tryLogCurrentException(log, "Query " + query + " wasn't finished successfully"); @@ -474,7 +482,10 @@ void DDLWorker::processTask(DDLTaskBase & task) { /// It's not CREATE DATABASE auto table_id = context.tryResolveStorageID(*query_with_table, Context::ResolveOrdinary); - storage = DatabaseCatalog::instance().tryGetTable(table_id, context); + DatabasePtr database; + std::tie(database, storage) = DatabaseCatalog::instance().tryGetDatabaseAndTable(table_id, context); + if (database && database->getEngineName() == "Replicated") + throw Exception(ErrorCodes::INCORRECT_QUERY, "ON CLUSTER queries are not allowed for Replicated databases"); } task.execute_on_leader = storage && taskShouldBeExecutedOnLeader(task.query, storage) && !task.is_circular_replicated; @@ -496,6 +507,8 @@ void DDLWorker::processTask(DDLTaskBase & task) } catch (...) { + if (task.is_initial_query) + throw; tryLogCurrentException(log, "An error occurred before execution of DDL task: "); task.execution_status = ExecutionStatus::fromCurrentException("An error occurred before execution"); } @@ -628,6 +641,9 @@ bool DDLWorker::tryExecuteQueryOnLeaderReplica( StorageReplicatedMergeTree::Status status; replicated_storage->getStatus(status); + if (task.is_initial_query && !status.is_leader) + throw Exception(ErrorCodes::NOT_A_LEADER, "Cannot execute initial query on non-leader replica"); + /// Any replica which is leader tries to take lock if (status.is_leader && lock->tryLock()) { diff --git a/src/Interpreters/InterpreterAlterQuery.cpp b/src/Interpreters/InterpreterAlterQuery.cpp index db380bca2b1..0edd1a401b3 100644 --- a/src/Interpreters/InterpreterAlterQuery.cpp +++ b/src/Interpreters/InterpreterAlterQuery.cpp @@ -53,7 +53,10 @@ BlockIO InterpreterAlterQuery::execute() DatabasePtr database = DatabaseCatalog::instance().getDatabase(table_id.database_name); if (typeid_cast(database.get()) && context.getClientInfo().query_kind != ClientInfo::QueryKind::REPLICATED_LOG_QUERY) + { + alter_lock.reset(); return typeid_cast(database.get())->propose(query_ptr); + } //FIXME commit MetadataTransaction for all ALTER kinds. Now its' implemented only for metadata alter. 
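[Illustrative note, not part of the patch series] The commit above ("execute initial query
in the same thread") changes how a DDL statement against a Replicated database is handled
on the initiating server: instead of only appending an entry to the replication log and
waiting for the background worker, the server enqueues the entry, waits for the worker to
catch up to it, and then runs the same task in the calling thread with is_initial_query set,
so failures are thrown back to the client rather than recorded silently in the task status.
A rough model of the flow, with simplified and partly hypothetical helper names (the real
code is DatabaseReplicatedDDLWorker::tryEnqueueAndExecuteEntry above):

    String tryEnqueueAndExecuteEntry(DDLLogEntry & entry)
    {
        String entry_path = enqueueQuery(entry);    // append entry to /log in ZooKeeper
        waitUntilWorkerReaches(entry_path);         // keep log order: older entries run first
        task.is_initial_query = true;               // exceptions must reach the client
        processTask(task);                          // execute locally, in this thread
        if (!task.was_executed)
            throw Exception(...);                   // surface the failure to the caller
        return entry_path;
    }

For the same reason the interpreters (ALTER, CREATE, DROP, RENAME) now release their
DDLGuard or alter locks before calling DatabaseReplicated::propose(): the proposed query
re-enters the interpreter through the worker path in this very thread and has to be able
to take those locks itself.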
diff --git a/src/Interpreters/InterpreterCreateQuery.cpp b/src/Interpreters/InterpreterCreateQuery.cpp index 926737ef888..d91f3140a96 100644 --- a/src/Interpreters/InterpreterCreateQuery.cpp +++ b/src/Interpreters/InterpreterCreateQuery.cpp @@ -572,6 +572,10 @@ InterpreterCreateQuery::TableProperties InterpreterCreateQuery::setProperties(AS validateTableStructure(create, properties); /// Set the table engine if it was not specified explicitly. setEngine(create); + + create.as_database.clear(); + create.as_table.clear(); + return properties; } @@ -835,7 +839,7 @@ BlockIO InterpreterCreateQuery::createTable(ASTCreateQuery & create) /// Data path must be relative to root_path create.attach_from_path = fs::relative(data_path, root_path) / ""; } - else if (create.attach && !create.attach_short_syntax) + else if (create.attach && !create.attach_short_syntax && context.getClientInfo().query_kind != ClientInfo::QueryKind::REPLICATED_LOG_QUERY) { auto * log = &Poco::Logger::get("InterpreterCreateQuery"); LOG_WARNING(log, "ATTACH TABLE query with full table definition is not recommended: " @@ -881,6 +885,7 @@ BlockIO InterpreterCreateQuery::createTable(ASTCreateQuery & create) if (typeid_cast(database.get()) && context.getClientInfo().query_kind != ClientInfo::QueryKind::REPLICATED_LOG_QUERY) { assertOrSetUUID(create, database); + guard.reset(); return typeid_cast(database.get())->propose(query_ptr); } } diff --git a/src/Interpreters/InterpreterDropQuery.cpp b/src/Interpreters/InterpreterDropQuery.cpp index ff7b6ef8387..eed7337b9ab 100644 --- a/src/Interpreters/InterpreterDropQuery.cpp +++ b/src/Interpreters/InterpreterDropQuery.cpp @@ -33,6 +33,7 @@ namespace ErrorCodes extern const int UNKNOWN_TABLE; extern const int UNKNOWN_DICTIONARY; extern const int NOT_IMPLEMENTED; + extern const int INCORRECT_QUERY; } @@ -119,12 +120,28 @@ BlockIO InterpreterDropQuery::executeToTableImpl(const ASTDropQuery & query, Dat if (database && table) { - if (query_ptr->as().is_view && !table->isView()) + if (query.as().is_view && !table->isView()) throw Exception("Table " + table_id.getNameForLogs() + " is not a View", ErrorCodes::LOGICAL_ERROR); /// Now get UUID, so we can wait for table data to be finally dropped table_id.uuid = database->tryGetTableUUID(table_id.table_name); + /// Prevents recursive drop from drop database query. The original query must specify a table. + bool is_drop_or_detach_database = query.table.empty(); + bool is_replicated_ddl_query = typeid_cast(database.get()) && + context.getClientInfo().query_kind != ClientInfo::QueryKind::REPLICATED_LOG_QUERY && + !is_drop_or_detach_database; + if (is_replicated_ddl_query) + { + if (query.kind == ASTDropQuery::Kind::Detach && !query.permanently) + throw Exception(ErrorCodes::INCORRECT_QUERY, "DETACH TABLE is not allowed for Replicated databases. " + "Use DETACH TABLE PERMANENTLY or SYSTEM RESTART REPLICA"); + + ddl_guard.reset(); + table.reset(); + return typeid_cast(database.get())->propose(query.clone()); + } + if (query.kind == ASTDropQuery::Kind::Detach) { context.checkAccess(table->isView() ? 
AccessType::DROP_VIEW : AccessType::DROP_TABLE, table_id); @@ -135,9 +152,6 @@ BlockIO InterpreterDropQuery::executeToTableImpl(const ASTDropQuery & query, Dat if (database->getUUID() == UUIDHelpers::Nil) table_lock = table->lockExclusively(context.getCurrentQueryId(), context.getSettingsRef().lock_acquire_timeout); - if (typeid_cast(database.get()) && context.getClientInfo().query_kind != ClientInfo::QueryKind::REPLICATED_LOG_QUERY) - return typeid_cast(database.get())->propose(query_ptr); - if (query.permanently) { /// Drop table from memory, don't touch data, metadata file renamed and will be skipped during server restart @@ -157,10 +171,7 @@ BlockIO InterpreterDropQuery::executeToTableImpl(const ASTDropQuery & query, Dat auto table_lock = table->lockExclusively(context.getCurrentQueryId(), context.getSettingsRef().lock_acquire_timeout); auto metadata_snapshot = table->getInMemoryMetadataPtr(); /// Drop table data, don't touch metadata - if (database->getEngineName() == "Replicated" && context.getClientInfo().query_kind != ClientInfo::QueryKind::REPLICATED_LOG_QUERY) - return typeid_cast(database.get())->propose(query_ptr); - else - table->truncate(query_ptr, metadata_snapshot, context, table_lock); + table->truncate(query_ptr, metadata_snapshot, context, table_lock); } else if (query.kind == ASTDropQuery::Kind::Drop) { @@ -173,11 +184,7 @@ BlockIO InterpreterDropQuery::executeToTableImpl(const ASTDropQuery & query, Dat if (database->getUUID() == UUIDHelpers::Nil) table_lock = table->lockExclusively(context.getCurrentQueryId(), context.getSettingsRef().lock_acquire_timeout); - /// Prevents recursive drop from drop database query. The original query must specify a table. - if (typeid_cast(database.get()) && !query_ptr->as().table.empty() && context.getClientInfo().query_kind != ClientInfo::QueryKind::REPLICATED_LOG_QUERY) - return typeid_cast(database.get())->propose(query_ptr); - else - database->dropTable(context, table_id.table_name, query.no_delay); + database->dropTable(context, table_id.table_name, query.no_delay); } db = database; diff --git a/src/Interpreters/InterpreterRenameQuery.cpp b/src/Interpreters/InterpreterRenameQuery.cpp index a6075643a96..52faa89eff1 100644 --- a/src/Interpreters/InterpreterRenameQuery.cpp +++ b/src/Interpreters/InterpreterRenameQuery.cpp @@ -43,9 +43,6 @@ BlockIO InterpreterRenameQuery::execute() RenameDescriptions descriptions; descriptions.reserve(rename.elements.size()); - /// Don't allow to drop tables (that we are renaming); don't allow to create tables in places where tables will be renamed. - TableGuards table_guards; - for (const auto & elem : rename.elements) { descriptions.emplace_back(elem, current_database); @@ -85,6 +82,8 @@ BlockIO InterpreterRenameQuery::executeToTables(const ASTRenameQuery & rename, c if (1 < descriptions.size()) throw Exception(ErrorCodes::NOT_IMPLEMENTED, "Database {} is Replicated, " "it does not support renaming of multiple tables in single query.", elem.from_database_name); + + table_guards.clear(); return typeid_cast(database.get())->propose(query_ptr); } else diff --git a/src/Interpreters/InterpreterRenameQuery.h b/src/Interpreters/InterpreterRenameQuery.h index 055c15181c1..2bc84514b4c 100644 --- a/src/Interpreters/InterpreterRenameQuery.h +++ b/src/Interpreters/InterpreterRenameQuery.h @@ -64,6 +64,9 @@ private: ASTPtr query_ptr; Context & context; + + /// Don't allow to drop tables (that we are renaming); don't allow to create tables in places where tables will be renamed. 
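// ---------------------------------------------------------------------------
// Illustrative sketch, not part of the patch: the DROP/TRUNCATE/RENAME paths
// above now funnel DDL on tables of a Replicated database through
// DatabaseReplicated::propose() and release their own locks first, so that the
// query, when it is re-executed through the replication worker in this same
// thread, can acquire those locks itself. Condensed from the DROP hunk:
//
//     if (is_replicated_ddl_query)
//     {
//         if (query.kind == ASTDropQuery::Kind::Detach && !query.permanently)
//             throw Exception(ErrorCodes::INCORRECT_QUERY, "DETACH TABLE is not allowed ...");
//         ddl_guard.reset();   // release the per-table DDLGuard before re-entering
//         table.reset();       // and the storage pointer, so the table can be dropped
//         return replicated_database->propose(query.clone());   // cast simplified
//     }
//
// Plain DETACH (without PERMANENTLY) is rejected, apparently because a
// non-permanent detach is purely local, per-replica state that has no sensible
// representation as a shared replication log entry.
// ---------------------------------------------------------------------------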
+ TableGuards table_guards; }; } diff --git a/tests/clickhouse-test b/tests/clickhouse-test index 13e7b4be001..3bfbd5d3e7f 100755 --- a/tests/clickhouse-test +++ b/tests/clickhouse-test @@ -186,9 +186,9 @@ def run_single_test(args, ext, server_logs_level, client_options, case_file, std total_time = (datetime.now() - start_time).total_seconds() + # Normalize randomized database names in stdout, stderr files. + os.system("LC_ALL=C sed -i -e 's/{test_db}/default/g' {file}".format(test_db=database, file=stdout_file)) if not args.show_db_name: - # Normalize randomized database names in stdout, stderr files. - os.system("LC_ALL=C sed -i -e 's/{test_db}/default/g' {file}".format(test_db=database, file=stdout_file)) os.system("LC_ALL=C sed -i -e 's/{test_db}/default/g' {file}".format(test_db=database, file=stderr_file)) stdout = open(stdout_file, 'rb').read() if os.path.exists(stdout_file) else b'' From 79f651f2b40379c0d515648b69875054831fe5dc Mon Sep 17 00:00:00 2001 From: romanzhukov Date: Mon, 1 Feb 2021 23:32:45 +0300 Subject: [PATCH 067/381] DOCSUP-5822: Add function documentation. --- .../functions/type-conversion-functions.md | 34 +++++++++++++++---- 1 file changed, 27 insertions(+), 7 deletions(-) diff --git a/docs/en/sql-reference/functions/type-conversion-functions.md b/docs/en/sql-reference/functions/type-conversion-functions.md index 6237cd6a976..fdfc3c479ce 100644 --- a/docs/en/sql-reference/functions/type-conversion-functions.md +++ b/docs/en/sql-reference/functions/type-conversion-functions.md @@ -459,28 +459,48 @@ Code: 70. DB::Exception: Received from localhost:9000. DB::Exception: Value in c ## accurateCastOrNull(x, T) {#type_conversion_function-accurate-cast_or_null} -Converts ‘x’ to the ‘t’ data type. Always returns nullable type and returns NULL +Converts input value to the specified data type. Always returns nullable type and returns NULL if the casted value is not representable in the target type. -Example: +**Syntax** + +```sql +accurateCastOrNull(x, T) + +``` + +**Parameters** + +- `x` — Input value. +- `T` — Defines the data type of returned values. + +**Example** + +Query: ``` sql SELECT - accurateCastOrNull(-1, 'UInt8') as uint8, - accurateCastOrNull(128, 'Int8') as int8, - accurateCastOrNull('Test', 'FixedString(2)') as fixed_string + cast(-1, 'UInt8') as uint8, + cast(128, 'Int8') as int8, + cast('Test', 'FixedString(2)') as fixed_string; ``` +Result: + ``` text ┌─uint8─┬─int8─┬─fixed_string─┐ │ ᴺᵁᴸᴸ │ ᴺᵁᴸᴸ │ ᴺᵁᴸᴸ │ -└───────┴──────┴──────────────┘┘ +└───────┴──────┴──────────────┘ ``` +Query: + ``` sql -SELECT toTypeName(accurateCastOrNull(5, 'UInt8')) +SELECT toTypeName(accurateCastOrNull(5, 'UInt8')); ``` +Result: + ``` text ┌─toTypeName(accurateCastOrNull(5, 'UInt8'))─┐ │ Nullable(UInt8) │ From f6de1291645909affe5b9b3dbb5e929e95f7c7ea Mon Sep 17 00:00:00 2001 From: romanzhukov Date: Tue, 2 Feb 2021 09:57:41 +0300 Subject: [PATCH 068/381] DOCSUP-5822: Add function documentation. --- .../functions/type-conversion-functions.md | 34 +++++++------ .../functions/type-conversion-functions.md | 48 +++++++++++++++++++ 2 files changed, 64 insertions(+), 18 deletions(-) diff --git a/docs/en/sql-reference/functions/type-conversion-functions.md b/docs/en/sql-reference/functions/type-conversion-functions.md index fdfc3c479ce..86217871ca1 100644 --- a/docs/en/sql-reference/functions/type-conversion-functions.md +++ b/docs/en/sql-reference/functions/type-conversion-functions.md @@ -459,25 +459,37 @@ Code: 70. DB::Exception: Received from localhost:9000. 
DB::Exception: Value in c ## accurateCastOrNull(x, T) {#type_conversion_function-accurate-cast_or_null} -Converts input value to the specified data type. Always returns nullable type and returns NULL -if the casted value is not representable in the target type. +Converts input value `x` to the specified data type `T`. Always returns [Nullable](../../sql-reference/data-types/nullable.md) type and returns [NULL](../../sql-reference/syntax.md#null-literal) if the casted value is not representable in the target type. **Syntax** ```sql accurateCastOrNull(x, T) - ``` **Parameters** - `x` — Input value. -- `T` — Defines the data type of returned values. +- `T` — The name of the returned data type. **Example** Query: +Query: + +``` sql +SELECT toTypeName(accurateCastOrNull(5, 'UInt8')); +``` + +Result: + +``` text +┌─toTypeName(accurateCastOrNull(5, 'UInt8'))─┐ +│ Nullable(UInt8) │ +└────────────────────────────────────────────┘ +``` + ``` sql SELECT cast(-1, 'UInt8') as uint8, @@ -493,20 +505,6 @@ Result: └───────┴──────┴──────────────┘ ``` -Query: - -``` sql -SELECT toTypeName(accurateCastOrNull(5, 'UInt8')); -``` - -Result: - -``` text -┌─toTypeName(accurateCastOrNull(5, 'UInt8'))─┐ -│ Nullable(UInt8) │ -└────────────────────────────────────────────┘ -``` - ## toInterval(Year\|Quarter\|Month\|Week\|Day\|Hour\|Minute\|Second) {#function-tointerval} Converts a Number type argument to an [Interval](../../sql-reference/data-types/special-data-types/interval.md) data type. diff --git a/docs/ru/sql-reference/functions/type-conversion-functions.md b/docs/ru/sql-reference/functions/type-conversion-functions.md index 4a314bd22d8..40fdbc6f5a0 100644 --- a/docs/ru/sql-reference/functions/type-conversion-functions.md +++ b/docs/ru/sql-reference/functions/type-conversion-functions.md @@ -427,6 +427,54 @@ SELECT toTypeName(CAST(x, 'Nullable(UInt16)')) FROM t_null - Настройка [cast_keep_nullable](../../operations/settings/settings.md#cast_keep_nullable) +## accurateCastOrNull(x, T) {#type_conversion_function-accurate-cast_or_null} + +Преобразует входное значение `x` в указанный тип данных `T`. Всегда возвращает тип [Nullable](../../sql-reference/data-types/nullable.md), и возвращает [NULL](../../sql-reference/syntax.md#null-literal), если приведенное значение не может быть представлено в целевом типе. + +**Синтаксис** + +```sql +accurateCastOrNull(x, T) +``` + +**Parameters** + +- `x` — входное значение. +- `T` — имя возвращаемого типа данных. + +**Пример** + +Запрос: + +``` sql +SELECT toTypeName(accurateCastOrNull(5, 'UInt8')); +``` + +Результат: + +``` text +┌─toTypeName(accurateCastOrNull(5, 'UInt8'))─┐ +│ Nullable(UInt8) │ +└────────────────────────────────────────────┘ +``` + +Запрос: + +``` sql +SELECT + cast(-1, 'UInt8') as uint8, + cast(128, 'Int8') as int8, + cast('Test', 'FixedString(2)') as fixed_string; +``` + +Результат: + +``` text +┌─uint8─┬─int8─┬─fixed_string─┐ +│ ᴺᵁᴸᴸ │ ᴺᵁᴸᴸ │ ᴺᵁᴸᴸ │ +└───────┴──────┴──────────────┘ +``` + ## toInterval(Year\|Quarter\|Month\|Week\|Day\|Hour\|Minute\|Second) {#function-tointerval} Приводит аргумент из числового типа данных к типу данных [IntervalType](../../sql-reference/data-types/special-data-types/interval.md). From f3860134ab7b40aafaa585fbc90c6806cac1da4d Mon Sep 17 00:00:00 2001 From: romanzhukov Date: Tue, 2 Feb 2021 10:00:54 +0300 Subject: [PATCH 069/381] DOCSUP-5822: Add function documentation. 
--- docs/en/sql-reference/functions/type-conversion-functions.md | 2 -- 1 file changed, 2 deletions(-) diff --git a/docs/en/sql-reference/functions/type-conversion-functions.md b/docs/en/sql-reference/functions/type-conversion-functions.md index 86217871ca1..047b3b1cbea 100644 --- a/docs/en/sql-reference/functions/type-conversion-functions.md +++ b/docs/en/sql-reference/functions/type-conversion-functions.md @@ -476,8 +476,6 @@ accurateCastOrNull(x, T) Query: -Query: - ``` sql SELECT toTypeName(accurateCastOrNull(5, 'UInt8')); ``` From 0073c87d5d2e80a054468255b021acdbe5ceb660 Mon Sep 17 00:00:00 2001 From: Alexander Tokmakov Date: Tue, 2 Feb 2021 13:32:42 +0300 Subject: [PATCH 070/381] fix --- src/Databases/DatabaseAtomic.cpp | 2 +- src/Interpreters/DDLWorker.cpp | 2 +- src/Interpreters/InterpreterDropQuery.cpp | 2 +- src/Storages/StorageMaterializedView.cpp | 12 ++++++------ src/Storages/StorageMaterializedView.h | 2 +- 5 files changed, 10 insertions(+), 10 deletions(-) diff --git a/src/Databases/DatabaseAtomic.cpp b/src/Databases/DatabaseAtomic.cpp index 8b75f439152..e6bc3bfcd44 100644 --- a/src/Databases/DatabaseAtomic.cpp +++ b/src/Databases/DatabaseAtomic.cpp @@ -131,7 +131,7 @@ void DatabaseAtomic::dropTable(const Context & context, const String & table_nam /// Remove the inner table (if any) to avoid deadlock /// (due to attempt to execute DROP from the worker thread) if (auto * mv = dynamic_cast(table.get())) - mv->dropInnerTable(no_delay); + mv->dropInnerTable(no_delay, context); /// Notify DatabaseCatalog that table was dropped. It will remove table data in background. /// Cleanup is performed outside of database to allow easily DROP DATABASE without waiting for cleanup to complete. DatabaseCatalog::instance().enqueueDroppedTableCleanup(table->getStorageID(), table, table_metadata_path_drop, no_delay); diff --git a/src/Interpreters/DDLWorker.cpp b/src/Interpreters/DDLWorker.cpp index fabb9f9563e..dd822e0f237 100644 --- a/src/Interpreters/DDLWorker.cpp +++ b/src/Interpreters/DDLWorker.cpp @@ -484,7 +484,7 @@ void DDLWorker::processTask(DDLTaskBase & task) auto table_id = context.tryResolveStorageID(*query_with_table, Context::ResolveOrdinary); DatabasePtr database; std::tie(database, storage) = DatabaseCatalog::instance().tryGetDatabaseAndTable(table_id, context); - if (database && database->getEngineName() == "Replicated") + if (database && database->getEngineName() == "Replicated" && !typeid_cast(&task)) throw Exception(ErrorCodes::INCORRECT_QUERY, "ON CLUSTER queries are not allowed for Replicated databases"); } diff --git a/src/Interpreters/InterpreterDropQuery.cpp b/src/Interpreters/InterpreterDropQuery.cpp index eed7337b9ab..68680f27ea4 100644 --- a/src/Interpreters/InterpreterDropQuery.cpp +++ b/src/Interpreters/InterpreterDropQuery.cpp @@ -127,7 +127,7 @@ BlockIO InterpreterDropQuery::executeToTableImpl(const ASTDropQuery & query, Dat table_id.uuid = database->tryGetTableUUID(table_id.table_name); /// Prevents recursive drop from drop database query. The original query must specify a table. 
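// ---------------------------------------------------------------------------
// Illustrative sketch, not part of the patch: the DDLWorker::processTask() hunk
// in this commit resolves the database together with the table and rejects a
// classic ON CLUSTER entry when the target table lives in a Replicated
// database; entries coming from that database's own replication log are still
// let through (the added typeid_cast check, presumably against
// DatabaseReplicatedTask). Roughly:
//
//     DatabasePtr database;
//     std::tie(database, storage) = DatabaseCatalog::instance().tryGetDatabaseAndTable(table_id, context);
//     bool from_replicated_db_log = /* task is a DatabaseReplicatedTask */ false;  // placeholder
//     if (database && database->getEngineName() == "Replicated" && !from_replicated_db_log)
//         throw Exception(ErrorCodes::INCORRECT_QUERY,
//                         "ON CLUSTER queries are not allowed for Replicated databases");
//
// ---------------------------------------------------------------------------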
- bool is_drop_or_detach_database = query.table.empty(); + bool is_drop_or_detach_database = query_ptr->as()->table.empty(); bool is_replicated_ddl_query = typeid_cast(database.get()) && context.getClientInfo().query_kind != ClientInfo::QueryKind::REPLICATED_LOG_QUERY && !is_drop_or_detach_database; diff --git a/src/Storages/StorageMaterializedView.cpp b/src/Storages/StorageMaterializedView.cpp index 29aea3e6150..fb75a933910 100644 --- a/src/Storages/StorageMaterializedView.cpp +++ b/src/Storages/StorageMaterializedView.cpp @@ -194,7 +194,7 @@ BlockOutputStreamPtr StorageMaterializedView::write(const ASTPtr & query, const } -static void executeDropQuery(ASTDropQuery::Kind kind, Context & global_context, const StorageID & target_table_id, bool no_delay) +static void executeDropQuery(ASTDropQuery::Kind kind, const Context & global_context, const StorageID & target_table_id, bool no_delay) { if (DatabaseCatalog::instance().tryGetTable(target_table_id, global_context)) { @@ -220,19 +220,19 @@ void StorageMaterializedView::drop() if (!select_query.select_table_id.empty()) DatabaseCatalog::instance().removeDependency(select_query.select_table_id, table_id); - dropInnerTable(true); + dropInnerTable(true, global_context); } -void StorageMaterializedView::dropInnerTable(bool no_delay) +void StorageMaterializedView::dropInnerTable(bool no_delay, const Context & context) { if (has_inner_table && tryGetTargetTable()) - executeDropQuery(ASTDropQuery::Kind::Drop, global_context, target_table_id, no_delay); + executeDropQuery(ASTDropQuery::Kind::Drop, context, target_table_id, no_delay); } -void StorageMaterializedView::truncate(const ASTPtr &, const StorageMetadataPtr &, const Context &, TableExclusiveLockHolder &) +void StorageMaterializedView::truncate(const ASTPtr &, const StorageMetadataPtr &, const Context & context, TableExclusiveLockHolder &) { if (has_inner_table) - executeDropQuery(ASTDropQuery::Kind::Truncate, global_context, target_table_id, true); + executeDropQuery(ASTDropQuery::Kind::Truncate, context, target_table_id, true); } void StorageMaterializedView::checkStatementCanBeForwarded() const diff --git a/src/Storages/StorageMaterializedView.h b/src/Storages/StorageMaterializedView.h index fab9e28afe3..94e4295cd34 100644 --- a/src/Storages/StorageMaterializedView.h +++ b/src/Storages/StorageMaterializedView.h @@ -37,7 +37,7 @@ public: BlockOutputStreamPtr write(const ASTPtr & query, const StorageMetadataPtr & /*metadata_snapshot*/, const Context & context) override; void drop() override; - void dropInnerTable(bool no_delay); + void dropInnerTable(bool no_delay, const Context & context); void truncate(const ASTPtr &, const StorageMetadataPtr &, const Context &, TableExclusiveLockHolder &) override; From ed3de186a4c34fd9c39656b6723f89b3cafc4d40 Mon Sep 17 00:00:00 2001 From: benbiti Date: Tue, 2 Feb 2021 20:26:36 +0800 Subject: [PATCH 071/381] [Docs]fix mistype in avg --- docs/en/sql-reference/aggregate-functions/reference/avg.md | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/docs/en/sql-reference/aggregate-functions/reference/avg.md b/docs/en/sql-reference/aggregate-functions/reference/avg.md index e2e6aace734..0b80a1be704 100644 --- a/docs/en/sql-reference/aggregate-functions/reference/avg.md +++ b/docs/en/sql-reference/aggregate-functions/reference/avg.md @@ -9,7 +9,7 @@ Calculates the arithmetic mean. 
**Syntax** ``` sql -avgWeighted(x) +avg(x) ``` **Parameter** From 6456ccf0da4ae12568c559b40015459da07fb6d6 Mon Sep 17 00:00:00 2001 From: Alexander Tokmakov Date: Tue, 2 Feb 2021 22:39:04 +0300 Subject: [PATCH 072/381] better test --- src/Databases/DatabaseReplicatedWorker.h | 2 +- src/Interpreters/DatabaseCatalog.cpp | 18 +++-- src/Interpreters/DatabaseCatalog.h | 7 +- src/Interpreters/InterpreterAlterQuery.cpp | 10 +-- src/Interpreters/InterpreterCreateQuery.cpp | 2 +- src/Interpreters/InterpreterDropQuery.cpp | 2 +- src/Interpreters/InterpreterRenameQuery.cpp | 12 +++- src/Interpreters/InterpreterRenameQuery.h | 5 +- .../MergeTree/registerStorageMergeTree.cpp | 8 ++- .../configs/config.xml | 31 ++++++++ .../test_replicated_database/test.py | 71 +++++++++++-------- 11 files changed, 112 insertions(+), 56 deletions(-) diff --git a/src/Databases/DatabaseReplicatedWorker.h b/src/Databases/DatabaseReplicatedWorker.h index 7e6d64dab0b..6e29e48469b 100644 --- a/src/Databases/DatabaseReplicatedWorker.h +++ b/src/Databases/DatabaseReplicatedWorker.h @@ -21,7 +21,7 @@ private: DDLTaskPtr initAndCheckTask(const String & entry_name, String & out_reason, const ZooKeeperPtr & zookeeper) override; - DatabaseReplicated * database; + DatabaseReplicated * const database; mutable std::mutex mutex; std::condition_variable wait_current_task_change; String current_task; diff --git a/src/Interpreters/DatabaseCatalog.cpp b/src/Interpreters/DatabaseCatalog.cpp index 18cf69675ba..4ab3fb28785 100644 --- a/src/Interpreters/DatabaseCatalog.cpp +++ b/src/Interpreters/DatabaseCatalog.cpp @@ -956,21 +956,25 @@ DDLGuard::DDLGuard(Map & map_, std::shared_mutex & db_mutex_, std::unique_locksecond.counter; guards_lock.unlock(); table_lock = std::unique_lock(*it->second.mutex); - bool is_database = elem.empty(); - if (!is_database) + is_database_guard = elem.empty(); + if (!is_database_guard) { bool locked_database_for_read = db_mutex.try_lock_shared(); if (!locked_database_for_read) { - removeTableLock(); + releaseTableLock(); throw Exception(ErrorCodes::UNKNOWN_DATABASE, "Database {} is currently dropped or renamed", database_name); } } } -void DDLGuard::removeTableLock() +void DDLGuard::releaseTableLock() noexcept { + if (table_lock_removed) + return; + + table_lock_removed = true; guards_lock.lock(); --it->second.counter; if (!it->second.counter) @@ -978,14 +982,14 @@ void DDLGuard::removeTableLock() table_lock.unlock(); map.erase(it); } + guards_lock.unlock(); } DDLGuard::~DDLGuard() { - bool is_database = it->first.empty(); - if (!is_database) + if (!is_database_guard) db_mutex.unlock_shared(); - removeTableLock(); + releaseTableLock(); } } diff --git a/src/Interpreters/DatabaseCatalog.h b/src/Interpreters/DatabaseCatalog.h index 5146c786f64..c9f031ef678 100644 --- a/src/Interpreters/DatabaseCatalog.h +++ b/src/Interpreters/DatabaseCatalog.h @@ -54,14 +54,17 @@ public: DDLGuard(Map & map_, std::shared_mutex & db_mutex_, std::unique_lock guards_lock_, const String & elem, const String & database_name); ~DDLGuard(); + /// Unlocks table name, keeps holding read lock for database name + void releaseTableLock() noexcept; + private: Map & map; std::shared_mutex & db_mutex; Map::iterator it; std::unique_lock guards_lock; std::unique_lock table_lock; - - void removeTableLock(); + bool table_lock_removed = false; + bool is_database_guard = false; }; diff --git a/src/Interpreters/InterpreterAlterQuery.cpp b/src/Interpreters/InterpreterAlterQuery.cpp index 0edd1a401b3..612f9833af5 100644 --- 
a/src/Interpreters/InterpreterAlterQuery.cpp +++ b/src/Interpreters/InterpreterAlterQuery.cpp @@ -47,17 +47,19 @@ BlockIO InterpreterAlterQuery::execute() context.checkAccess(getRequiredAccess()); auto table_id = context.resolveStorageID(alter, Context::ResolveOrdinary); - StoragePtr table = DatabaseCatalog::instance().getTable(table_id, context); - auto alter_lock = table->lockForAlter(context.getCurrentQueryId(), context.getSettingsRef().lock_acquire_timeout); - auto metadata_snapshot = table->getInMemoryMetadataPtr(); DatabasePtr database = DatabaseCatalog::instance().getDatabase(table_id.database_name); if (typeid_cast(database.get()) && context.getClientInfo().query_kind != ClientInfo::QueryKind::REPLICATED_LOG_QUERY) { - alter_lock.reset(); + auto guard = DatabaseCatalog::instance().getDDLGuard(table_id.database_name, table_id.table_name); + guard->releaseTableLock(); return typeid_cast(database.get())->propose(query_ptr); } + StoragePtr table = DatabaseCatalog::instance().getTable(table_id, context); + auto alter_lock = table->lockForAlter(context.getCurrentQueryId(), context.getSettingsRef().lock_acquire_timeout); + auto metadata_snapshot = table->getInMemoryMetadataPtr(); + //FIXME commit MetadataTransaction for all ALTER kinds. Now its' implemented only for metadata alter. /// Add default database to table identifiers that we can encounter in e.g. default expressions, diff --git a/src/Interpreters/InterpreterCreateQuery.cpp b/src/Interpreters/InterpreterCreateQuery.cpp index d91f3140a96..8d344545c8a 100644 --- a/src/Interpreters/InterpreterCreateQuery.cpp +++ b/src/Interpreters/InterpreterCreateQuery.cpp @@ -885,7 +885,7 @@ BlockIO InterpreterCreateQuery::createTable(ASTCreateQuery & create) if (typeid_cast(database.get()) && context.getClientInfo().query_kind != ClientInfo::QueryKind::REPLICATED_LOG_QUERY) { assertOrSetUUID(create, database); - guard.reset(); + guard->releaseTableLock(); return typeid_cast(database.get())->propose(query_ptr); } } diff --git a/src/Interpreters/InterpreterDropQuery.cpp b/src/Interpreters/InterpreterDropQuery.cpp index 68680f27ea4..db2f463893e 100644 --- a/src/Interpreters/InterpreterDropQuery.cpp +++ b/src/Interpreters/InterpreterDropQuery.cpp @@ -137,7 +137,7 @@ BlockIO InterpreterDropQuery::executeToTableImpl(const ASTDropQuery & query, Dat throw Exception(ErrorCodes::INCORRECT_QUERY, "DETACH TABLE is not allowed for Replicated databases. " "Use DETACH TABLE PERMANENTLY or SYSTEM RESTART REPLICA"); - ddl_guard.reset(); + ddl_guard->releaseTableLock(); table.reset(); return typeid_cast(database.get())->propose(query.clone()); } diff --git a/src/Interpreters/InterpreterRenameQuery.cpp b/src/Interpreters/InterpreterRenameQuery.cpp index 52faa89eff1..d2f79ba071c 100644 --- a/src/Interpreters/InterpreterRenameQuery.cpp +++ b/src/Interpreters/InterpreterRenameQuery.cpp @@ -43,6 +43,9 @@ BlockIO InterpreterRenameQuery::execute() RenameDescriptions descriptions; descriptions.reserve(rename.elements.size()); + /// Don't allow to drop tables (that we are renaming); don't allow to create tables in places where tables will be renamed. 
+ TableGuards table_guards; + for (const auto & elem : rename.elements) { descriptions.emplace_back(elem, current_database); @@ -64,10 +67,10 @@ BlockIO InterpreterRenameQuery::execute() if (rename.database) return executeToDatabase(rename, descriptions); else - return executeToTables(rename, descriptions); + return executeToTables(rename, descriptions, table_guards); } -BlockIO InterpreterRenameQuery::executeToTables(const ASTRenameQuery & rename, const RenameDescriptions & descriptions) +BlockIO InterpreterRenameQuery::executeToTables(const ASTRenameQuery & rename, const RenameDescriptions & descriptions, TableGuards & ddl_guards) { auto & database_catalog = DatabaseCatalog::instance(); @@ -83,7 +86,10 @@ BlockIO InterpreterRenameQuery::executeToTables(const ASTRenameQuery & rename, c throw Exception(ErrorCodes::NOT_IMPLEMENTED, "Database {} is Replicated, " "it does not support renaming of multiple tables in single query.", elem.from_database_name); - table_guards.clear(); + UniqueTableName from(elem.from_database_name, elem.from_table_name); + UniqueTableName to(elem.to_database_name, elem.to_table_name); + ddl_guards[from]->releaseTableLock(); + ddl_guards[to]->releaseTableLock(); return typeid_cast(database.get())->propose(query_ptr); } else diff --git a/src/Interpreters/InterpreterRenameQuery.h b/src/Interpreters/InterpreterRenameQuery.h index 2bc84514b4c..0da25f63e8d 100644 --- a/src/Interpreters/InterpreterRenameQuery.h +++ b/src/Interpreters/InterpreterRenameQuery.h @@ -57,16 +57,13 @@ public: void extendQueryLogElemImpl(QueryLogElement & elem, const ASTPtr & ast, const Context &) const override; private: - BlockIO executeToTables(const ASTRenameQuery & rename, const RenameDescriptions & descriptions); + BlockIO executeToTables(const ASTRenameQuery & rename, const RenameDescriptions & descriptions, TableGuards & ddl_guards); static BlockIO executeToDatabase(const ASTRenameQuery & rename, const RenameDescriptions & descriptions); AccessRightsElements getRequiredAccess() const; ASTPtr query_ptr; Context & context; - - /// Don't allow to drop tables (that we are renaming); don't allow to create tables in places where tables will be renamed. 
- TableGuards table_guards; }; } diff --git a/src/Storages/MergeTree/registerStorageMergeTree.cpp b/src/Storages/MergeTree/registerStorageMergeTree.cpp index 9a881a60a69..1d68f788a42 100644 --- a/src/Storages/MergeTree/registerStorageMergeTree.cpp +++ b/src/Storages/MergeTree/registerStorageMergeTree.cpp @@ -450,17 +450,21 @@ static StoragePtr create(const StorageFactory::Arguments & args) arg_cnt += 2; } else - throw Exception("Expected two string literal arguments: zookeper_path and replica_name", ErrorCodes::BAD_ARGUMENTS); + throw Exception("Expected two string literal arguments: zookeeper_path and replica_name", ErrorCodes::BAD_ARGUMENTS); /// Allow implicit {uuid} macros only for zookeeper_path in ON CLUSTER queries bool is_on_cluster = args.local_context.getClientInfo().query_kind == ClientInfo::QueryKind::SECONDARY_QUERY; - bool is_replicated_database = args.local_context.getClientInfo().query_kind == ClientInfo::QueryKind::REPLICATED_LOG_QUERY; + bool is_replicated_database = args.local_context.getClientInfo().query_kind == ClientInfo::QueryKind::REPLICATED_LOG_QUERY && + DatabaseCatalog::instance().getDatabase(args.table_id.database_name)->getEngineName() == "Replicated"; bool allow_uuid_macro = is_on_cluster || is_replicated_database || args.query.attach; /// Unfold {database} and {table} macro on table creation, so table can be renamed. /// We also unfold {uuid} macro, so path will not be broken after moving table from Atomic to Ordinary database. if (!args.attach) { + if (is_replicated_database && !is_extended_storage_def) + throw Exception(ErrorCodes::BAD_ARGUMENTS, "Old syntax is not allowed for ReplicatedMergeTree tables in Replicated databases"); + Macros::MacroExpansionInfo info; /// NOTE: it's not recursive info.expand_special_macros_only = true; diff --git a/tests/integration/test_replicated_database/configs/config.xml b/tests/integration/test_replicated_database/configs/config.xml index d751454437c..ebceee3aa5c 100644 --- a/tests/integration/test_replicated_database/configs/config.xml +++ b/tests/integration/test_replicated_database/configs/config.xml @@ -1,3 +1,34 @@ 10 + + + + + true + + main_node + 9000 + + + dummy_node + 9000 + + + competing_node + 9000 + + + + true + + snapshotting_node + 9000 + + + snapshot_recovering_node + 9000 + + + + diff --git a/tests/integration/test_replicated_database/test.py b/tests/integration/test_replicated_database/test.py index f99f4517e5a..2471228b55e 100644 --- a/tests/integration/test_replicated_database/test.py +++ b/tests/integration/test_replicated_database/test.py @@ -13,6 +13,8 @@ competing_node = cluster.add_instance('competing_node', main_configs=['configs/c snapshotting_node = cluster.add_instance('snapshotting_node', main_configs=['configs/config.xml'], with_zookeeper=True, macros={"shard": 2, "replica": 1}) snapshot_recovering_node = cluster.add_instance('snapshot_recovering_node', main_configs=['configs/config.xml'], with_zookeeper=True, macros={"shard": 2, "replica": 2}) +all_nodes = [main_node, dummy_node, competing_node, snapshotting_node, snapshot_recovering_node] + uuid_regex = re.compile("[0-9a-f]{8}\-[0-9a-f]{4}\-[0-9a-f]{4}\-[0-9a-f]{4}\-[0-9a-f]{12}") def assert_create_query(nodes, table_name, expected): replace_uuid = lambda x: re.sub(uuid_regex, "uuid", x) @@ -31,11 +33,10 @@ def started_cluster(): finally: cluster.shutdown() -#TODO better tests - def test_create_replicated_table(started_cluster): - #FIXME should fail (replicated with old syntax) - #main_node.query("CREATE TABLE testdb.replicated_table (d 
Date, k UInt64, i32 Int32) ENGINE=ReplicatedMergeTree(d, k, 8192);") + assert "Old syntax is not allowed" in \ + main_node.query_and_get_error("CREATE TABLE testdb.replicated_table (d Date, k UInt64, i32 Int32) ENGINE=ReplicatedMergeTree('/test/tmp', 'r', d, k, 8192);") + main_node.query("CREATE TABLE testdb.replicated_table (d Date, k UInt64, i32 Int32) ENGINE=ReplicatedMergeTree ORDER BY k PARTITION BY toYYYYMM(d);") expected = "CREATE TABLE testdb.replicated_table\\n(\\n `d` Date,\\n `k` UInt64,\\n `i32` Int32\\n)\\n" \ @@ -47,6 +48,7 @@ def test_create_replicated_table(started_cluster): @pytest.mark.parametrize("engine", ['MergeTree', 'ReplicatedMergeTree']) def test_simple_alter_table(started_cluster, engine): + # test_simple_alter_table name = "testdb.alter_test_{}".format(engine) main_node.query("CREATE TABLE {} " "(CounterID UInt32, StartDate Date, UserID UInt32, VisitID UInt32, NestedColumn Nested(A UInt8, S String), ToDrop UInt32) " @@ -69,10 +71,7 @@ def test_simple_alter_table(started_cluster, engine): assert_create_query([main_node, dummy_node], name, expected) - -@pytest.mark.dependency(depends=['test_simple_alter_table']) -@pytest.mark.parametrize("engine", ['MergeTree', 'ReplicatedMergeTree']) -def test_create_replica_after_delay(started_cluster, engine): + # test_create_replica_after_delay competing_node.query("CREATE DATABASE IF NOT EXISTS testdb ENGINE = Replicated('/clickhouse/databases/test1', 'shard1', 'replica3');") name = "testdb.alter_test_{}".format(engine) @@ -90,13 +89,17 @@ def test_create_replica_after_delay(started_cluster, engine): assert_create_query([main_node, dummy_node, competing_node], name, expected) -@pytest.mark.dependency(depends=['test_create_replica_after_delay']) + def test_alters_from_different_replicas(started_cluster): + # test_alters_from_different_replicas + competing_node.query("CREATE DATABASE IF NOT EXISTS testdb ENGINE = Replicated('/clickhouse/databases/test1', 'shard1', 'replica3');") + main_node.query("CREATE TABLE testdb.concurrent_test " "(CounterID UInt32, StartDate Date, UserID UInt32, VisitID UInt32, NestedColumn Nested(A UInt8, S String), ToDrop UInt32) " "ENGINE = MergeTree(StartDate, intHash32(UserID), (CounterID, StartDate, intHash32(UserID), VisitID), 8192);") - time.sleep(1) #FIXME + main_node.query("CREATE TABLE testdb.dist AS testdb.concurrent_test ENGINE = Distributed(cluster, testdb, concurrent_test, CounterID)") + dummy_node.kill_clickhouse(stop_start_wait_sec=0) competing_node.query("ALTER TABLE testdb.concurrent_test ADD COLUMN Added0 UInt32;") @@ -115,50 +118,56 @@ def test_alters_from_different_replicas(started_cluster): assert_create_query([main_node, competing_node], "testdb.concurrent_test", expected) -@pytest.mark.dependency(depends=['test_alters_from_different_replicas']) -def test_drop_and_create_table(started_cluster): + # test_create_replica_after_delay main_node.query("DROP TABLE testdb.concurrent_test") main_node.query("CREATE TABLE testdb.concurrent_test " "(CounterID UInt32, StartDate Date, UserID UInt32, VisitID UInt32, NestedColumn Nested(A UInt8, S String), ToDrop UInt32) " - "ENGINE = MergeTree(StartDate, intHash32(UserID), (CounterID, StartDate, intHash32(UserID), VisitID), 8192);") + "ENGINE = ReplicatedMergeTree ORDER BY CounterID;") expected = "CREATE TABLE testdb.concurrent_test\\n(\\n `CounterID` UInt32,\\n `StartDate` Date,\\n `UserID` UInt32,\\n" \ " `VisitID` UInt32,\\n `NestedColumn.A` Array(UInt8),\\n `NestedColumn.S` Array(String),\\n `ToDrop` UInt32\\n)\\n" \ - "ENGINE = 
MergeTree(StartDate, intHash32(UserID), (CounterID, StartDate, intHash32(UserID), VisitID), 8192)" + "ENGINE = ReplicatedMergeTree(\\'/clickhouse/tables/uuid/{shard}\\', \\'{replica}\\')\\nORDER BY CounterID\\nSETTINGS index_granularity = 8192" assert_create_query([main_node, competing_node], "testdb.concurrent_test", expected) -@pytest.mark.dependency(depends=['test_drop_and_create_table']) -def test_replica_restart(started_cluster): + main_node.query("INSERT INTO testdb.dist (CounterID, StartDate, UserID) SELECT number, addDays(toDate('2020-02-02'), number), intHash32(number) FROM numbers(10)") + + # test_replica_restart main_node.restart_clickhouse() expected = "CREATE TABLE testdb.concurrent_test\\n(\\n `CounterID` UInt32,\\n `StartDate` Date,\\n `UserID` UInt32,\\n" \ " `VisitID` UInt32,\\n `NestedColumn.A` Array(UInt8),\\n `NestedColumn.S` Array(String),\\n `ToDrop` UInt32\\n)\\n" \ - "ENGINE = MergeTree(StartDate, intHash32(UserID), (CounterID, StartDate, intHash32(UserID), VisitID), 8192)" - - assert_create_query([main_node, competing_node], "testdb.concurrent_test", expected) + "ENGINE = ReplicatedMergeTree(\\'/clickhouse/tables/uuid/{shard}\\', \\'{replica}\\')\\nORDER BY CounterID\\nSETTINGS index_granularity = 8192" -@pytest.mark.dependency(depends=['test_replica_restart']) -def test_snapshot_and_snapshot_recover(started_cluster): - snapshotting_node.query("CREATE DATABASE testdb ENGINE = Replicated('/clickhouse/databases/test1', 'shard1', 'replica4');") - snapshot_recovering_node.query("CREATE DATABASE testdb ENGINE = Replicated('/clickhouse/databases/test1', 'shard1', 'replica5');") + # test_snapshot_and_snapshot_recover + snapshotting_node.query("CREATE DATABASE testdb ENGINE = Replicated('/clickhouse/databases/test1', 'shard2', 'replica1');") + snapshot_recovering_node.query("CREATE DATABASE testdb ENGINE = Replicated('/clickhouse/databases/test1', 'shard2', 'replica2');") + assert_create_query(all_nodes, "testdb.concurrent_test", expected) - assert_eq_with_retry(snapshotting_node, "select count() from system.tables where name like 'alter_test_%'", "2\n") - assert_eq_with_retry(snapshot_recovering_node, "select count() from system.tables where name like 'alter_test_%'", "2\n") - assert snapshotting_node.query("desc table testdb.alter_test_MergeTree") == snapshot_recovering_node.query("desc table testdb.alter_test_MergeTree") - assert snapshotting_node.query("desc table testdb.alter_test_ReplicatedMergeTree") == snapshot_recovering_node.query("desc table testdb.alter_test_ReplicatedMergeTree") + main_node.query("SYSTEM FLUSH DISTRIBUTED testdb.dist") + main_node.query("ALTER TABLE testdb.concurrent_test UPDATE StartDate = addYears(StartDate, 1) WHERE 1") + main_node.query("ALTER TABLE testdb.concurrent_test DELETE WHERE UserID % 2") -@pytest.mark.dependency(depends=['test_replica_restart']) -def test_drop_and_create_replica(started_cluster): + # test_drop_and_create_replica main_node.query("DROP DATABASE testdb") main_node.query("CREATE DATABASE testdb ENGINE = Replicated('/clickhouse/databases/test1', 'shard1', 'replica1');") expected = "CREATE TABLE testdb.concurrent_test\\n(\\n `CounterID` UInt32,\\n `StartDate` Date,\\n `UserID` UInt32,\\n" \ " `VisitID` UInt32,\\n `NestedColumn.A` Array(UInt8),\\n `NestedColumn.S` Array(String),\\n `ToDrop` UInt32\\n)\\n" \ - "ENGINE = MergeTree(StartDate, intHash32(UserID), (CounterID, StartDate, intHash32(UserID), VisitID), 8192)" + "ENGINE = ReplicatedMergeTree(\\'/clickhouse/tables/uuid/{shard}\\', \\'{replica}\\')\\nORDER BY 
CounterID\\nSETTINGS index_granularity = 8192" assert_create_query([main_node, competing_node], "testdb.concurrent_test", expected) + assert_create_query(all_nodes, "testdb.concurrent_test", expected) -#TODO tests with Distributed + for node in all_nodes: + node.query("SYSTEM SYNC REPLICA testdb.concurrent_test") + + expected = "0\t2021-02-02\t4249604106\n" \ + "1\t2021-02-03\t1343103100\n" \ + "4\t2021-02-06\t3902320246\n" \ + "7\t2021-02-09\t3844986530\n" \ + "9\t2021-02-11\t1241149650\n" + + assert_eq_with_retry(dummy_node, "SELECT CounterID, StartDate, UserID FROM testdb.dist ORDER BY CounterID", expected) From 066fb4c82bd33744dc8a99d34d88674d83764ba1 Mon Sep 17 00:00:00 2001 From: Alexander Tokmakov Date: Wed, 3 Feb 2021 23:02:37 +0300 Subject: [PATCH 073/381] fix --- src/Databases/DatabaseReplicatedWorker.cpp | 2 +- src/Interpreters/DDLWorker.cpp | 9 +- src/Interpreters/DDLWorker.h | 2 +- src/Interpreters/DatabaseCatalog.cpp | 8 +- src/Interpreters/executeDDLQueryOnCluster.cpp | 2 +- .../0_stateless/01238_http_memory_tracking.sh | 3 + .../01281_group_by_limit_memory_tracking.sh | 3 + .../01541_max_memory_usage_for_user.sh | 3 + tests/queries/skip_list.json | 128 +++++++++++++++++- 9 files changed, 147 insertions(+), 13 deletions(-) diff --git a/src/Databases/DatabaseReplicatedWorker.cpp b/src/Databases/DatabaseReplicatedWorker.cpp index a1cdff204c7..5af216c3d0d 100644 --- a/src/Databases/DatabaseReplicatedWorker.cpp +++ b/src/Databases/DatabaseReplicatedWorker.cpp @@ -93,7 +93,7 @@ String DatabaseReplicatedDDLWorker::tryEnqueueAndExecuteEntry(DDLLogEntry & entr LOG_DEBUG(log, "Waiting for worker thread to process all entries before {}", entry_name); { std::unique_lock lock{mutex}; - wait_current_task_change.wait(lock, [&]() { assert(current_task <= entry_name); return zookeeper->expired() || current_task == entry_name; }); + wait_current_task_change.wait(lock, [&]() { assert(zookeeper->expired() || current_task <= entry_name); return zookeeper->expired() || current_task == entry_name; }); } if (zookeeper->expired()) diff --git a/src/Interpreters/DDLWorker.cpp b/src/Interpreters/DDLWorker.cpp index 4470a3649c5..545e00296e8 100644 --- a/src/Interpreters/DDLWorker.cpp +++ b/src/Interpreters/DDLWorker.cpp @@ -357,7 +357,7 @@ void DDLWorker::scheduleTasks() if (!task) { LOG_DEBUG(log, "Will not execute task {}: {}", entry_name, reason); - updateMaxDDLEntryID(*task); + updateMaxDDLEntryID(entry_name); continue; } @@ -449,9 +449,9 @@ bool DDLWorker::tryExecuteQuery(const String & query, DDLTaskBase & task) return true; } -void DDLWorker::updateMaxDDLEntryID(const DDLTaskBase & task) +void DDLWorker::updateMaxDDLEntryID(const String & entry_name) { - DB::ReadBufferFromString in(task.entry_name); + DB::ReadBufferFromString in(entry_name); DB::assertString("query-", in); UInt64 id; readText(id, in); @@ -511,6 +511,7 @@ void DDLWorker::processTask(DDLTaskBase & task) if (task.execute_on_leader) { + tryExecuteQueryOnLeaderReplica(task, storage, rewritten_query, task.entry_path, zookeeper); } else { @@ -549,7 +550,7 @@ void DDLWorker::processTask(DDLTaskBase & task) task.was_executed = true; } - updateMaxDDLEntryID(task); + updateMaxDDLEntryID(task.entry_name); /// FIXME: if server fails right here, the task will be executed twice. We need WAL here. /// If ZooKeeper connection is lost here, we will try again to write query status. 
diff --git a/src/Interpreters/DDLWorker.h b/src/Interpreters/DDLWorker.h index 6124e5ee8ec..d9fd4e58cb6 100644 --- a/src/Interpreters/DDLWorker.h +++ b/src/Interpreters/DDLWorker.h @@ -73,7 +73,7 @@ protected: virtual DDLTaskPtr initAndCheckTask(const String & entry_name, String & out_reason, const ZooKeeperPtr & zookeeper); void processTask(DDLTaskBase & task); - void updateMaxDDLEntryID(const DDLTaskBase & task); + void updateMaxDDLEntryID(const String & entry_name); /// Check that query should be executed on leader replica only static bool taskShouldBeExecutedOnLeader(const ASTPtr ast_ddl, StoragePtr storage); diff --git a/src/Interpreters/DatabaseCatalog.cpp b/src/Interpreters/DatabaseCatalog.cpp index 4ab3fb28785..6313da7132d 100644 --- a/src/Interpreters/DatabaseCatalog.cpp +++ b/src/Interpreters/DatabaseCatalog.cpp @@ -976,12 +976,10 @@ void DDLGuard::releaseTableLock() noexcept table_lock_removed = true; guards_lock.lock(); - --it->second.counter; - if (!it->second.counter) - { - table_lock.unlock(); + UInt32 counter = --it->second.counter; + table_lock.unlock(); + if (counter == 0) map.erase(it); - } guards_lock.unlock(); } diff --git a/src/Interpreters/executeDDLQueryOnCluster.cpp b/src/Interpreters/executeDDLQueryOnCluster.cpp index fb155e82926..a0148316610 100644 --- a/src/Interpreters/executeDDLQueryOnCluster.cpp +++ b/src/Interpreters/executeDDLQueryOnCluster.cpp @@ -33,7 +33,7 @@ bool isSupportedAlterType(int type) { assert(type != ASTAlterCommand::NO_TYPE); static const std::unordered_set unsupported_alter_types{ - /// It's dangerous, because it may duplicate data if executed on multiple replicas + /// It's dangerous, because it may duplicate data if executed on multiple replicas. We can allow it after #18978 ASTAlterCommand::ATTACH_PARTITION, /// Usually followed by ATTACH PARTITION ASTAlterCommand::FETCH_PARTITION, diff --git a/tests/queries/0_stateless/01238_http_memory_tracking.sh b/tests/queries/0_stateless/01238_http_memory_tracking.sh index 90a7611c7c7..8c900e4c208 100755 --- a/tests/queries/0_stateless/01238_http_memory_tracking.sh +++ b/tests/queries/0_stateless/01238_http_memory_tracking.sh @@ -18,3 +18,6 @@ yes 'SELECT 1' 2>/dev/null | { } | grep -x -c 1 wait + +# Reset max_memory_usage_for_user, so it will not affect other tests +${CLICKHOUSE_CLIENT} --max_memory_usage_for_user=0 -q "SELECT 1 FORMAT Null" diff --git a/tests/queries/0_stateless/01281_group_by_limit_memory_tracking.sh b/tests/queries/0_stateless/01281_group_by_limit_memory_tracking.sh index 285e2ab8dad..222f7edd787 100755 --- a/tests/queries/0_stateless/01281_group_by_limit_memory_tracking.sh +++ b/tests/queries/0_stateless/01281_group_by_limit_memory_tracking.sh @@ -42,3 +42,6 @@ execute_group_by # if memory accounting will be incorrect, the second query will be failed with MEMORY_LIMIT_EXCEEDED execute_group_by wait + +# Reset max_memory_usage_for_user, so it will not affect other tests +${CLICKHOUSE_CLIENT} --max_memory_usage_for_user=0 -q "SELECT 1 FORMAT Null" diff --git a/tests/queries/0_stateless/01541_max_memory_usage_for_user.sh b/tests/queries/0_stateless/01541_max_memory_usage_for_user.sh index c81bd1a6ce4..32877bfd0fe 100755 --- a/tests/queries/0_stateless/01541_max_memory_usage_for_user.sh +++ b/tests/queries/0_stateless/01541_max_memory_usage_for_user.sh @@ -66,4 +66,7 @@ echo 'OK' ${CLICKHOUSE_CLIENT} --query "DROP USER test_01541"; +# Reset max_memory_usage_for_user, so it will not affect other tests +${CLICKHOUSE_CLIENT} --max_memory_usage_for_user=0 -q "SELECT 1 FORMAT Null" + 
exit 0 diff --git a/tests/queries/skip_list.json b/tests/queries/skip_list.json index 3311eb3882d..273e00c8a23 100644 --- a/tests/queries/skip_list.json +++ b/tests/queries/skip_list.json @@ -100,7 +100,133 @@ "00604_show_create_database", "00609_mv_index_in_in", "00510_materizlized_view_and_deduplication_zookeeper", - "00738_lock_for_inner_table" + "memory_tracking", /// FIXME remove it before merge + "memory_tracking", + "memory_usage", + "00738_lock_for_inner_table", + "01666_blns", + "01652_ignore_and_low_cardinality", + "01651_map_functions", + "01650_fetch_patition_with_macro_in_zk_path", + "01648_mutations_and_escaping", + "01640_marks_corruption_regression", + "01622_byte_size", + "01611_string_to_low_cardinality_key_alter", + "01602_show_create_view", + "01600_log_queries_with_extensive_info", + "01560_ttl_remove_empty_parts", + "01554_bloom_filter_index_big_integer_uuid", + "01550_type_map_formats_input", + "01550_type_map_formats", + "01550_create_map_type", + "01532_primary_key_without_order_by_zookeeper", + "01511_alter_version_versioned_collapsing_merge_tree_zookeeper", + "01509_parallel_quorum_insert_no_replicas", + "01504_compression_multiple_streams", + "01494_storage_join_persistency", + "01493_storage_set_persistency", + "01493_alter_remove_properties_zookeeper", + "01475_read_subcolumns_storages", + "01475_read_subcolumns", + "01463_test_alter_live_view_refresh", + "01451_replicated_detach_drop_part", + "01451_detach_drop_part", + "01440_big_int_exotic_casts", + "01430_modify_sample_by_zookeeper", + "01417_freeze_partition_verbose_zookeeper", + "01417_freeze_partition_verbose", + "01396_inactive_replica_cleanup_nodes_zookeeper", + "01375_compact_parts_codecs", + "01357_version_collapsing_attach_detach_zookeeper", + "01355_alter_column_with_order", + "01291_geo_types", + "01270_optimize_skip_unused_shards_low_cardinality", + "01237_live_view_over_distributed_with_subquery_select_table_alias", + "01236_distributed_over_live_view_over_distributed", + "01235_live_view_over_distributed", + "01182_materialized_view_different_structure", + "01150_ddl_guard_rwr", + "01148_zookeeper_path_macros_unfolding", + "01135_default_and_alter_zookeeper", + "01130_in_memory_parts_partitons", + "01127_month_partitioning_consistency_select", + "01114_database_atomic", + "01083_expressions_in_engine_arguments", + "01073_attach_if_not_exists", + "01072_optimize_skip_unused_shards_const_expr_eval", + "01071_prohibition_secondary_index_with_old_format_merge_tree", + "01071_live_view_detach_dependency", + "01062_alter_on_mutataion_zookeeper", + "01060_shutdown_table_after_detach", + "01056_create_table_as", + "01035_avg", + "01021_only_tuple_columns", + "01019_alter_materialized_view_query", + "01019_alter_materialized_view_consistent", + "01019_alter_materialized_view_atomic", + "01015_attach_part", + "00989_parallel_parts_loading", + "00980_zookeeper_merge_tree_alter_settings", + "00980_merge_alter_settings", + "00980_create_temporary_live_view", + "00978_live_view_watch", + "00977_live_view_watch_events", + "00976_live_view_select_version", + "00975_live_view_create", + "00974_live_view_select_with_aggregation", + "00973_live_view_with_subquery_select_with_aggregation_in_subquery", + "00973_live_view_with_subquery_select_with_aggregation", + "00973_live_view_with_subquery_select_table_alias", + "00973_live_view_with_subquery_select_nested_with_aggregation_table_alias", + "00973_live_view_with_subquery_select_nested_with_aggregation", + "00973_live_view_with_subquery_select_nested", + 
"00973_live_view_with_subquery_select_join_no_alias", + "00973_live_view_with_subquery_select_join", + "00973_live_view_with_subquery_select", + "00973_live_view_select_prewhere", + "00973_live_view_select", + "00972_live_view_select_1", + "00969_live_view_watch_format_jsoneachrowwithprogress", + "00968_live_view_select_format_jsoneachrowwithprogress", + "00961_temporary_live_view_watch", + "00955_test_final_mark", + "00933_reserved_word", + "00926_zookeeper_adaptive_index_granularity_replicated_merge_tree", + "00926_adaptive_index_granularity_replacing_merge_tree", + "00926_adaptive_index_granularity_merge_tree", + "00925_zookeeper_empty_replicated_merge_tree_optimize_final", + "00800_low_cardinality_distinct_numeric", + "00754_alter_modify_order_by_replicated_zookeeper", + "00751_low_cardinality_nullable_group_by", + "00751_default_databasename_for_view", + "00719_parallel_ddl_table", + "00718_low_cardinaliry_alter", + "00717_low_cardinaliry_distributed_group_by", + "00688_low_cardinality_syntax", + "00688_low_cardinality_nullable_cast", + "00688_low_cardinality_in", + "00652_replicated_mutations_zookeeper", + "00634_rename_view", + "00626_replace_partition_from_table", + "00625_arrays_in_nested", + "00623_replicated_truncate_table_zookeeper", + "00619_union_highlite", + "00599_create_view_with_subquery", + "00571_non_exist_database_when_create_materializ_view", + "00553_buff_exists_materlized_column", + "00516_deduplication_after_drop_partition_zookeeper", + "00508_materialized_view_to", + "00446_clear_column_in_partition_concurrent_zookeeper", + "00423_storage_log_single_thread", + "00311_array_primary_key", + "00236_replicated_drop_on_non_leader_zookeeper", + "00226_zookeeper_deduplication_and_unexpected_parts", + "00215_primary_key_order_zookeeper", + "00180_attach_materialized_view", + "00121_drop_column_zookeeper", + "00116_storage_set", + "00083_create_merge_tree_zookeeper", + "00062_replicated_merge_tree_alter_zookeeper" ], "polymorphic-parts": [ "01508_partition_pruning", /// bug, shoud be fixed From 18f6b5bbad353431e5f7494103756264b0f2ca79 Mon Sep 17 00:00:00 2001 From: Alexander Tokmakov Date: Thu, 4 Feb 2021 22:41:44 +0300 Subject: [PATCH 074/381] add timeouts --- src/Databases/DatabaseReplicated.cpp | 40 +++++---- src/Databases/DatabaseReplicated.h | 3 +- src/Databases/DatabaseReplicatedWorker.cpp | 90 ++++++++++++++++----- src/Databases/DatabaseReplicatedWorker.h | 2 +- src/Interpreters/DDLTask.cpp | 4 - src/Interpreters/DDLTask.h | 2 +- src/Interpreters/DDLWorker.cpp | 18 ++--- src/Interpreters/DDLWorker.h | 2 +- src/Interpreters/DatabaseCatalog.cpp | 2 +- src/Interpreters/DatabaseCatalog.h | 4 +- src/Interpreters/InterpreterAlterQuery.cpp | 2 +- src/Interpreters/InterpreterCreateQuery.cpp | 2 +- src/Interpreters/InterpreterDropQuery.cpp | 4 +- src/Interpreters/InterpreterRenameQuery.cpp | 2 +- tests/queries/skip_list.json | 21 +++++ 15 files changed, 139 insertions(+), 59 deletions(-) diff --git a/src/Databases/DatabaseReplicated.cpp b/src/Databases/DatabaseReplicated.cpp index 44746cd5716..5a11787331c 100644 --- a/src/Databases/DatabaseReplicated.cpp +++ b/src/Databases/DatabaseReplicated.cpp @@ -35,6 +35,7 @@ namespace ErrorCodes extern const int DATABASE_REPLICATION_FAILED; extern const int UNKNOWN_DATABASE; extern const int NOT_IMPLEMENTED; + extern const int INCORRECT_QUERY; } zkutil::ZooKeeperPtr DatabaseReplicated::getZooKeeper() const @@ -121,8 +122,8 @@ bool DatabaseReplicated::createDatabaseNodesInZooKeeper(const zkutil::ZooKeeperP 
ops.emplace_back(zkutil::makeCreateRequest(zookeeper_path + "/counter/cnt-", "", zkutil::CreateMode::Persistent)); ops.emplace_back(zkutil::makeRemoveRequest(zookeeper_path + "/counter/cnt-", -1)); ops.emplace_back(zkutil::makeCreateRequest(zookeeper_path + "/metadata", "", zkutil::CreateMode::Persistent)); - ops.emplace_back(zkutil::makeCreateRequest(zookeeper_path + "/min_log_ptr", "1", zkutil::CreateMode::Persistent)); ops.emplace_back(zkutil::makeCreateRequest(zookeeper_path + "/max_log_ptr", "1", zkutil::CreateMode::Persistent)); + ops.emplace_back(zkutil::makeCreateRequest(zookeeper_path + "/logs_to_keep", "1000", zkutil::CreateMode::Persistent)); Coordination::Responses responses; auto res = current_zookeeper->tryMulti(ops, responses); @@ -194,7 +195,7 @@ void DatabaseReplicated::onUnexpectedLogEntry(const String & entry_name, const Z throw Exception(ErrorCodes::LOGICAL_ERROR, "Entry {} already executed, current pointer is {}", entry_number, log_entry_to_execute); /// Entry name is valid. Let's get min log pointer to check if replica is staled. - UInt32 min_snapshot = parse(zookeeper->get(zookeeper_path + "/min_log_ptr")); + UInt32 min_snapshot = parse(zookeeper->get(zookeeper_path + "/min_log_ptr")); // FIXME if (log_entry_to_execute < min_snapshot) { @@ -207,13 +208,15 @@ void DatabaseReplicated::onUnexpectedLogEntry(const String & entry_name, const Z } -BlockIO DatabaseReplicated::propose(const ASTPtr & query) +BlockIO DatabaseReplicated::propose(const ASTPtr & query, const Context & query_context) { + if (query_context.getClientInfo().query_kind != ClientInfo::QueryKind::INITIAL_QUERY) + throw Exception(ErrorCodes::INCORRECT_QUERY, "It's not initial query. ON CLUSTER is not allowed for Replicated database."); + if (const auto * query_alter = query->as()) { for (const auto & command : query_alter->command_list->children) { - //FIXME allow all types of queries (maybe we should execute ATTACH an similar queries on leader) if (!isSupportedAlterType(command->as().type)) throw Exception("Unsupported type of ALTER query", ErrorCodes::NOT_IMPLEMENTED); } @@ -225,17 +228,16 @@ BlockIO DatabaseReplicated::propose(const ASTPtr & query) DDLLogEntry entry; entry.query = queryToString(query); entry.initiator = ddl_worker->getCommonHostID(); - String node_path = ddl_worker->tryEnqueueAndExecuteEntry(entry); + String node_path = ddl_worker->tryEnqueueAndExecuteEntry(entry, query_context); BlockIO io; - //FIXME use query context - if (global_context.getSettingsRef().distributed_ddl_task_timeout == 0) + if (query_context.getSettingsRef().distributed_ddl_task_timeout == 0) return io; //FIXME need list of all replicas, we can obtain it from zk Strings hosts_to_wait; hosts_to_wait.emplace_back(getFullReplicaName()); - auto stream = std::make_shared(node_path, entry, global_context, hosts_to_wait); + auto stream = std::make_shared(node_path, entry, query_context, hosts_to_wait); io.in = std::move(stream); return io; } @@ -295,17 +297,20 @@ void DatabaseReplicated::drop(const Context & context_) { auto current_zookeeper = getZooKeeper(); current_zookeeper->set(replica_path, "DROPPED"); - current_zookeeper->tryRemoveRecursive(replica_path); DatabaseAtomic::drop(context_); + current_zookeeper->tryRemoveRecursive(replica_path); +} + +void DatabaseReplicated::stopReplication() +{ + if (ddl_worker) + ddl_worker->shutdown(); } void DatabaseReplicated::shutdown() { - if (ddl_worker) - { - ddl_worker->shutdown(); - ddl_worker = nullptr; - } + stopReplication(); + ddl_worker = nullptr; 
DatabaseAtomic::shutdown(); } @@ -330,10 +335,15 @@ void DatabaseReplicated::renameTable(const Context & context, const String & tab if (txn->is_initial_query) { + if (!isTableExist(table_name, context)) + throw Exception(ErrorCodes::UNKNOWN_TABLE, "Table {} does not exist", table_name); + if (exchange && !to_database.isTableExist(to_table_name, context)) + throw Exception(ErrorCodes::UNKNOWN_TABLE, "Table {} does not exist", to_table_name); + String statement; String statement_to; { - //FIXME It's not atomic (however we have only one thread) + /// NOTE It's not atomic (however, we have only one thread) ReadBufferFromFile in(getObjectMetadataPath(table_name), 4096); readStringUntilEOF(statement, in); if (exchange) diff --git a/src/Databases/DatabaseReplicated.h b/src/Databases/DatabaseReplicated.h index 586f381c962..a866a61558c 100644 --- a/src/Databases/DatabaseReplicated.h +++ b/src/Databases/DatabaseReplicated.h @@ -60,8 +60,9 @@ public: String getEngineName() const override { return "Replicated"; } - BlockIO propose(const ASTPtr & query); + BlockIO propose(const ASTPtr & query, const Context & query_context); + void stopReplication(); void shutdown() override; void loadStoredObjects(Context & context, bool has_force_restore_data_flag, bool force_attach) override; diff --git a/src/Databases/DatabaseReplicatedWorker.cpp b/src/Databases/DatabaseReplicatedWorker.cpp index 5af216c3d0d..1c000a8f0a7 100644 --- a/src/Databases/DatabaseReplicatedWorker.cpp +++ b/src/Databases/DatabaseReplicatedWorker.cpp @@ -9,6 +9,8 @@ namespace ErrorCodes { extern const int LOGICAL_ERROR; extern const int DATABASE_REPLICATION_FAILED; + extern const int NOT_A_LEADER; + extern const int UNFINISHED; } DatabaseReplicatedDDLWorker::DatabaseReplicatedDDLWorker(DatabaseReplicated * db, const Context & context_) @@ -22,7 +24,7 @@ DatabaseReplicatedDDLWorker::DatabaseReplicatedDDLWorker(DatabaseReplicated * db void DatabaseReplicatedDDLWorker::initializeMainThread() { - do + while (!initialized && !stop_flag) { try { @@ -36,17 +38,17 @@ void DatabaseReplicatedDDLWorker::initializeMainThread() sleepForSeconds(5); } } - while (!initialized && !stop_flag); } void DatabaseReplicatedDDLWorker::initializeReplication() { /// Check if we need to recover replica. - /// Invariant: replica is lost if it's log_ptr value is less then min_log_ptr value. + /// Invariant: replica is lost if it's log_ptr value is less then max_log_ptr - logs_to_keep. UInt32 our_log_ptr = parse(current_zookeeper->get(database->replica_path + "/log_ptr")); - UInt32 min_log_ptr = parse(current_zookeeper->get(database->zookeeper_path + "/min_log_ptr")); - if (our_log_ptr < min_log_ptr) + UInt32 max_log_ptr = parse(current_zookeeper->get(database->zookeeper_path + "/max_log_ptr")); + UInt32 logs_to_keep = parse(current_zookeeper->get(database->zookeeper_path + "/logs_to_keep")); + if (our_log_ptr + logs_to_keep < max_log_ptr) database->recoverLostReplica(current_zookeeper, 0); } @@ -75,10 +77,19 @@ String DatabaseReplicatedDDLWorker::enqueueQuery(DDLLogEntry & entry) return node_path; } -String DatabaseReplicatedDDLWorker::tryEnqueueAndExecuteEntry(DDLLogEntry & entry) +String DatabaseReplicatedDDLWorker::tryEnqueueAndExecuteEntry(DDLLogEntry & entry, const Context & query_context) { + /// NOTE Possibly it would be better to execute initial query on the most up-to-date node, + /// but it requires more complex logic around /try node. 
+ auto zookeeper = getAndSetZooKeeper(); - // TODO do not enqueue query if we have big replication lag + UInt32 our_log_ptr = parse(zookeeper->get(database->replica_path + "/log_ptr")); + UInt32 max_log_ptr = parse(zookeeper->get(database->zookeeper_path + "/max_log_ptr")); + assert(our_log_ptr <= max_log_ptr); + constexpr UInt32 max_replication_lag = 16; + if (max_replication_lag < max_log_ptr - our_log_ptr) + throw Exception(ErrorCodes::NOT_A_LEADER, "Cannot enqueue query on this replica, " + "because it has replication lag of {} queries. Try other replica.", max_log_ptr - our_log_ptr); String entry_path = enqueueQuery(entry); auto try_node = zkutil::EphemeralNodeHolder::existing(entry_path + "/try", *zookeeper); @@ -91,9 +102,18 @@ String DatabaseReplicatedDDLWorker::tryEnqueueAndExecuteEntry(DDLLogEntry & entr task->is_initial_query = true; LOG_DEBUG(log, "Waiting for worker thread to process all entries before {}", entry_name); + UInt64 timeout = query_context.getSettingsRef().distributed_ddl_task_timeout; { std::unique_lock lock{mutex}; - wait_current_task_change.wait(lock, [&]() { assert(zookeeper->expired() || current_task <= entry_name); return zookeeper->expired() || current_task == entry_name; }); + bool processed = wait_current_task_change.wait_for(lock, std::chrono::seconds(timeout), [&]() + { + assert(zookeeper->expired() || current_task <= entry_name); + return zookeeper->expired() || current_task == entry_name || stop_flag; + }); + + if (!processed) + throw Exception(ErrorCodes::UNFINISHED, "Timeout: Cannot enqueue query on this replica," + "most likely because replica is busy with previous queue entries"); } if (zookeeper->expired()) @@ -116,8 +136,11 @@ DDLTaskPtr DatabaseReplicatedDDLWorker::initAndCheckTask(const String & entry_na { { std::lock_guard lock{mutex}; - current_task = entry_name; - wait_current_task_change.notify_all(); + if (current_task < entry_name) + { + current_task = entry_name; + wait_current_task_change.notify_all(); + } } UInt32 our_log_ptr = parse(current_zookeeper->get(database->replica_path + "/log_ptr")); @@ -135,18 +158,50 @@ DDLTaskPtr DatabaseReplicatedDDLWorker::initAndCheckTask(const String & entry_na String initiator_name; zkutil::EventPtr wait_committed_or_failed = std::make_shared(); - if (zookeeper->tryGet(entry_path + "/try", initiator_name, nullptr, wait_committed_or_failed)) + String try_node_path = entry_path + "/try"; + if (zookeeper->tryGet(try_node_path, initiator_name, nullptr, wait_committed_or_failed)) { task->is_initial_query = initiator_name == task->host_id_str; + /// Query is not committed yet. We cannot just skip it and execute next one, because reordering may break replication. - //FIXME add some timeouts LOG_TRACE(log, "Waiting for initiator {} to commit or rollback entry {}", initiator_name, entry_path); - wait_committed_or_failed->wait(); + constexpr size_t wait_time_ms = 1000; + constexpr size_t max_iterations = 3600; + size_t iteration = 0; + + while (!wait_committed_or_failed->tryWait(wait_time_ms)) + { + if (stop_flag) + { + /// We cannot return task to process and we cannot return nullptr too, + /// because nullptr means "task should not be executed". + /// We can only exit by exception. + throw Exception(ErrorCodes::UNFINISHED, "Replication was stopped"); + } + + if (max_iterations <= ++iteration) + { + /// What can we do if initiator hangs for some reason? Seems like we can remove /try node. + /// Initiator will fail to commit entry to ZK (including ops for replicated table) if /try does not exist. 
+ /// But it's questionable. + + /// We use tryRemove(...) because multiple hosts (including initiator) may try to do it concurrently. + auto code = zookeeper->tryRemove(try_node_path); + if (code != Coordination::Error::ZOK && code != Coordination::Error::ZNONODE) + throw Coordination::Exception(code, try_node_path); + + if (!zookeeper->exists(entry_path + "/committed")) + { + out_reason = fmt::format("Entry {} was forcefully cancelled due to timeout", entry_name); + return {}; + } + } + } } if (!zookeeper->exists(entry_path + "/committed")) { - out_reason = "Entry " + entry_name + " hasn't been committed"; + out_reason = fmt::format("Entry {} hasn't been committed", entry_name); return {}; } @@ -154,7 +209,7 @@ DDLTaskPtr DatabaseReplicatedDDLWorker::initAndCheckTask(const String & entry_na { assert(!zookeeper->exists(entry_path + "/try")); assert(zookeeper->exists(entry_path + "/committed") == (zookeeper->get(task->getFinishedNodePath()) == "0")); - out_reason = "Entry " + entry_name + " has been executed as initial query"; + out_reason = fmt::format("Entry {} has been executed as initial query", entry_name); return {}; } @@ -169,8 +224,7 @@ DDLTaskPtr DatabaseReplicatedDDLWorker::initAndCheckTask(const String & entry_na if (task->entry.query.empty()) { - //TODO better way to determine special entries - out_reason = "It's dummy task"; + out_reason = fmt::format("Entry {} is a dummy task", entry_name); return {}; } @@ -178,7 +232,7 @@ DDLTaskPtr DatabaseReplicatedDDLWorker::initAndCheckTask(const String & entry_na if (zookeeper->exists(task->getFinishedNodePath())) { - out_reason = "Task has been already processed"; + out_reason = fmt::format("Task {} has been already processed", entry_name); return {}; } diff --git a/src/Databases/DatabaseReplicatedWorker.h b/src/Databases/DatabaseReplicatedWorker.h index 6e29e48469b..e3fd58c4305 100644 --- a/src/Databases/DatabaseReplicatedWorker.h +++ b/src/Databases/DatabaseReplicatedWorker.h @@ -13,7 +13,7 @@ public: String enqueueQuery(DDLLogEntry & entry) override; - String tryEnqueueAndExecuteEntry(DDLLogEntry & entry); + String tryEnqueueAndExecuteEntry(DDLLogEntry & entry, const Context & query_context); private: void initializeMainThread() override; diff --git a/src/Interpreters/DDLTask.cpp b/src/Interpreters/DDLTask.cpp index 55e613648ae..9737167fa4c 100644 --- a/src/Interpreters/DDLTask.cpp +++ b/src/Interpreters/DDLTask.cpp @@ -309,13 +309,9 @@ std::unique_ptr DatabaseReplicatedTask::makeQueryContext(Context & from { txn->ops.emplace_back(zkutil::makeRemoveRequest(entry_path + "/try", -1)); txn->ops.emplace_back(zkutil::makeCreateRequest(entry_path + "/committed", host_id_str, zkutil::CreateMode::Persistent)); - //txn->ops.emplace_back(zkutil::makeRemoveRequest(getActiveNodePath(), -1)); txn->ops.emplace_back(zkutil::makeSetRequest(database->zookeeper_path + "/max_log_ptr", toString(getLogEntryNumber(entry_name)), -1)); } - //if (execute_on_leader) - // txn->ops.emplace_back(zkutil::makeCreateRequest(getShardNodePath() + "/executed", host_id_str, zkutil::CreateMode::Persistent)); - //txn->ops.emplace_back(zkutil::makeCreateRequest(getFinishedNodePath(), execution_status.serializeText(), zkutil::CreateMode::Persistent)); txn->ops.emplace_back(zkutil::makeSetRequest(database->replica_path + "/log_ptr", toString(getLogEntryNumber(entry_name)), -1)); std::move(ops.begin(), ops.end(), std::back_inserter(txn->ops)); diff --git a/src/Interpreters/DDLTask.h b/src/Interpreters/DDLTask.h index 49f6d74a931..552f4919765 100644 --- 
a/src/Interpreters/DDLTask.h +++ b/src/Interpreters/DDLTask.h @@ -81,7 +81,6 @@ struct DDLTaskBase bool is_circular_replicated = false; bool execute_on_leader = false; - //MetadataTransactionPtr txn; Coordination::Requests ops; ExecutionStatus execution_status; bool was_executed = false; @@ -163,6 +162,7 @@ struct MetadataTransaction void commit(); + ~MetadataTransaction() { assert(state != CREATED || std::uncaught_exception()); } }; } diff --git a/src/Interpreters/DDLWorker.cpp b/src/Interpreters/DDLWorker.cpp index 545e00296e8..da2e878541d 100644 --- a/src/Interpreters/DDLWorker.cpp +++ b/src/Interpreters/DDLWorker.cpp @@ -341,7 +341,8 @@ void DDLWorker::scheduleTasks() { /// We will recheck status of last executed tasks. It's useful if main thread was just restarted. auto & min_task = *std::min_element(current_tasks.begin(), current_tasks.end()); - begin_node = std::upper_bound(queue_nodes.begin(), queue_nodes.end(), min_task->entry_name); + String min_entry_name = last_skipped_entry_name ? std::min(min_task->entry_name, *last_skipped_entry_name) : min_task->entry_name; + begin_node = std::upper_bound(queue_nodes.begin(), queue_nodes.end(), min_entry_name); current_tasks.clear(); } @@ -358,6 +359,7 @@ void DDLWorker::scheduleTasks() { LOG_DEBUG(log, "Will not execute task {}: {}", entry_name, reason); updateMaxDDLEntryID(entry_name); + last_skipped_entry_name.emplace(entry_name); continue; } @@ -500,10 +502,7 @@ void DDLWorker::processTask(DDLTaskBase & task) { /// It's not CREATE DATABASE auto table_id = context.tryResolveStorageID(*query_with_table, Context::ResolveOrdinary); - DatabasePtr database; - std::tie(database, storage) = DatabaseCatalog::instance().tryGetDatabaseAndTable(table_id, context); - if (database && database->getEngineName() == "Replicated" && !typeid_cast(&task)) - throw Exception(ErrorCodes::INCORRECT_QUERY, "ON CLUSTER queries are not allowed for Replicated databases"); + storage = DatabaseCatalog::instance().tryGetTable(table_id, context); } task.execute_on_leader = storage && taskShouldBeExecutedOnLeader(task.query, storage) && !task.is_circular_replicated; @@ -553,7 +552,8 @@ void DDLWorker::processTask(DDLTaskBase & task) updateMaxDDLEntryID(task.entry_name); /// FIXME: if server fails right here, the task will be executed twice. We need WAL here. - /// If ZooKeeper connection is lost here, we will try again to write query status. + /// NOTE: If ZooKeeper connection is lost here, we will try again to write query status. + /// NOTE: If both table and database are replicated, task is executed in single ZK transaction. bool status_written = task.ops.empty(); if (!status_written) @@ -959,12 +959,6 @@ void DDLWorker::runMainThread() initialized = false; LOG_INFO(log, "Lost ZooKeeper connection, will try to connect again: {}", getCurrentExceptionMessage(true)); } - else if (e.code == Coordination::Error::ZNONODE) - { - // TODO add comment: when it happens and why it's expected? 
- // maybe because cleanup thread may remove nodes inside queue entry which are currently processed - LOG_ERROR(log, "ZooKeeper error: {}", getCurrentExceptionMessage(true)); - } else { LOG_ERROR(log, "Unexpected ZooKeeper error, will try to restart main thread: {}", getCurrentExceptionMessage(true)); diff --git a/src/Interpreters/DDLWorker.h b/src/Interpreters/DDLWorker.h index d9fd4e58cb6..706face3885 100644 --- a/src/Interpreters/DDLWorker.h +++ b/src/Interpreters/DDLWorker.h @@ -115,7 +115,7 @@ protected: ZooKeeperPtr current_zookeeper; /// Save state of executed task to avoid duplicate execution on ZK error - //std::optional last_entry_name; + std::optional last_skipped_entry_name; std::list current_tasks; std::shared_ptr queue_updated_event = std::make_shared(); diff --git a/src/Interpreters/DatabaseCatalog.cpp b/src/Interpreters/DatabaseCatalog.cpp index 6313da7132d..f27fb93b2d4 100644 --- a/src/Interpreters/DatabaseCatalog.cpp +++ b/src/Interpreters/DatabaseCatalog.cpp @@ -609,7 +609,7 @@ DatabaseCatalog::updateDependency(const StorageID & old_from, const StorageID & view_dependencies[{new_from.getDatabaseName(), new_from.getTableName()}].insert(new_where); } -std::unique_ptr DatabaseCatalog::getDDLGuard(const String & database, const String & table) +DDLGuardPtr DatabaseCatalog::getDDLGuard(const String & database, const String & table) { std::unique_lock lock(ddl_guards_mutex); auto db_guard_iter = ddl_guards.try_emplace(database).first; diff --git a/src/Interpreters/DatabaseCatalog.h b/src/Interpreters/DatabaseCatalog.h index c9f031ef678..bb82dbfc440 100644 --- a/src/Interpreters/DatabaseCatalog.h +++ b/src/Interpreters/DatabaseCatalog.h @@ -67,6 +67,8 @@ private: bool is_database_guard = false; }; +using DDLGuardPtr = std::unique_ptr; + /// Creates temporary table in `_temporary_and_external_tables` with randomly generated unique StorageID. /// Such table can be accessed from everywhere by its ID. @@ -120,7 +122,7 @@ public: void loadDatabases(); /// Get an object that protects the table from concurrently executing multiple DDL operations. 
- std::unique_ptr getDDLGuard(const String & database, const String & table); + DDLGuardPtr getDDLGuard(const String & database, const String & table); /// Get an object that protects the database from concurrent DDL queries all tables in the database std::unique_lock getExclusiveDDLGuardForDatabase(const String & database); diff --git a/src/Interpreters/InterpreterAlterQuery.cpp b/src/Interpreters/InterpreterAlterQuery.cpp index 612f9833af5..cee9b9083ea 100644 --- a/src/Interpreters/InterpreterAlterQuery.cpp +++ b/src/Interpreters/InterpreterAlterQuery.cpp @@ -53,7 +53,7 @@ BlockIO InterpreterAlterQuery::execute() { auto guard = DatabaseCatalog::instance().getDDLGuard(table_id.database_name, table_id.table_name); guard->releaseTableLock(); - return typeid_cast(database.get())->propose(query_ptr); + return typeid_cast(database.get())->propose(query_ptr, context); } StoragePtr table = DatabaseCatalog::instance().getTable(table_id, context); diff --git a/src/Interpreters/InterpreterCreateQuery.cpp b/src/Interpreters/InterpreterCreateQuery.cpp index 8d344545c8a..6af212172b2 100644 --- a/src/Interpreters/InterpreterCreateQuery.cpp +++ b/src/Interpreters/InterpreterCreateQuery.cpp @@ -886,7 +886,7 @@ BlockIO InterpreterCreateQuery::createTable(ASTCreateQuery & create) { assertOrSetUUID(create, database); guard->releaseTableLock(); - return typeid_cast(database.get())->propose(query_ptr); + return typeid_cast(database.get())->propose(query_ptr, context); } } diff --git a/src/Interpreters/InterpreterDropQuery.cpp b/src/Interpreters/InterpreterDropQuery.cpp index db2f463893e..b22d46358f9 100644 --- a/src/Interpreters/InterpreterDropQuery.cpp +++ b/src/Interpreters/InterpreterDropQuery.cpp @@ -139,7 +139,7 @@ BlockIO InterpreterDropQuery::executeToTableImpl(const ASTDropQuery & query, Dat ddl_guard->releaseTableLock(); table.reset(); - return typeid_cast(database.get())->propose(query.clone()); + return typeid_cast(database.get())->propose(query.clone(), context); } if (query.kind == ASTDropQuery::Kind::Detach) @@ -325,6 +325,8 @@ BlockIO InterpreterDropQuery::executeToDatabaseImpl(const ASTDropQuery & query, if (database->getEngineName() == "MaterializeMySQL") stopDatabaseSynchronization(database); #endif + if (auto * replicated = typeid_cast(database.get())) + replicated->stopReplication(); if (database->shouldBeEmptyOnDetach()) { diff --git a/src/Interpreters/InterpreterRenameQuery.cpp b/src/Interpreters/InterpreterRenameQuery.cpp index d2f79ba071c..5bfc144e014 100644 --- a/src/Interpreters/InterpreterRenameQuery.cpp +++ b/src/Interpreters/InterpreterRenameQuery.cpp @@ -90,7 +90,7 @@ BlockIO InterpreterRenameQuery::executeToTables(const ASTRenameQuery & rename, c UniqueTableName to(elem.to_database_name, elem.to_table_name); ddl_guards[from]->releaseTableLock(); ddl_guards[to]->releaseTableLock(); - return typeid_cast(database.get())->propose(query_ptr); + return typeid_cast(database.get())->propose(query_ptr, context); } else { diff --git a/tests/queries/skip_list.json b/tests/queries/skip_list.json index 273e00c8a23..adee777f900 100644 --- a/tests/queries/skip_list.json +++ b/tests/queries/skip_list.json @@ -103,6 +103,27 @@ "memory_tracking", /// FIXME remove it before merge "memory_tracking", "memory_usage", + "01533_multiple_nested", + "01575_disable_detach_table_of_dictionary", + "01457_create_as_table_function_structure", + "01415_inconsistent_merge_tree_settings", + "01413_allow_non_metadata_alters", + "01378_alter_rename_with_ttl_zookeeper", + "01349_mutation_datetime_key", + 
"01325_freeze_mutation_stuck", + "01272_suspicious_codecs", + "01181_db_atomic_drop_on_cluster", + "00957_delta_diff_bug", + "00910_zookeeper_custom_compression_codecs_replicated", + "00899_long_attach_memory_limit", + "00804_test_custom_compression_codes_log_storages", + "00804_test_alter_compression_codecs", + "00804_test_delta_codec_no_type_alter", + "00804_test_custom_compression_codecs", + "00753_alter_attach", + "00715_fetch_merged_or_mutated_part_zookeeper", + "00688_low_cardinality_serialization", + "01575_disable_detach_table_of_dictionary", "00738_lock_for_inner_table", "01666_blns", "01652_ignore_and_low_cardinality", From 9a9138d0380ddf67cceda85eb26f8c4d2c978b63 Mon Sep 17 00:00:00 2001 From: romanzhukov Date: Fri, 5 Feb 2021 01:37:59 +0300 Subject: [PATCH 075/381] DOCSUP-5266: Fix ticket comments. --- .../functions/type-conversion-functions.md | 119 ++++++++++---- .../functions/type-conversion-functions.md | 149 +++++++++++++----- 2 files changed, 194 insertions(+), 74 deletions(-) diff --git a/docs/en/sql-reference/functions/type-conversion-functions.md b/docs/en/sql-reference/functions/type-conversion-functions.md index 047b3b1cbea..b2ede6ba6ec 100644 --- a/docs/en/sql-reference/functions/type-conversion-functions.md +++ b/docs/en/sql-reference/functions/type-conversion-functions.md @@ -36,10 +36,14 @@ The behavior of functions for the [NaN and Inf](../../sql-reference/data-types/f **Example** +Query: + ``` sql -SELECT toInt64(nan), toInt32(32), toInt16('16'), toInt8(8.8) +SELECT toInt64(nan), toInt32(32), toInt16('16'), toInt8(8.8); ``` +Result: + ``` text ┌─────────toInt64(nan)─┬─toInt32(32)─┬─toInt16('16')─┬─toInt8(8.8)─┐ │ -9223372036854775808 │ 32 │ 16 │ 8 │ @@ -52,10 +56,14 @@ It takes an argument of type String and tries to parse it into Int (8 \| 16 \| 3 **Example** +Query: + ``` sql -select toInt64OrZero('123123'), toInt8OrZero('123qwe123') +select toInt64OrZero('123123'), toInt8OrZero('123qwe123'); ``` +Result: + ``` text ┌─toInt64OrZero('123123')─┬─toInt8OrZero('123qwe123')─┐ │ 123123 │ 0 │ @@ -68,10 +76,14 @@ It takes an argument of type String and tries to parse it into Int (8 \| 16 \| 3 **Example** +Query: + ``` sql -select toInt64OrNull('123123'), toInt8OrNull('123qwe123') +select toInt64OrNull('123123'), toInt8OrNull('123qwe123'); ``` +String: + ``` text ┌─toInt64OrNull('123123')─┬─toInt8OrNull('123qwe123')─┐ │ 123123 │ ᴺᵁᴸᴸ │ @@ -102,10 +114,14 @@ The behavior of functions for negative agruments and for the [NaN and Inf](../.. **Example** +Query: + ``` sql -SELECT toUInt64(nan), toUInt32(-32), toUInt16('16'), toUInt8(8.8) +SELECT toUInt64(nan), toUInt32(-32), toUInt16('16'), toUInt8(8.8); ``` +Result: + ``` text ┌───────toUInt64(nan)─┬─toUInt32(-32)─┬─toUInt16('16')─┬─toUInt8(8.8)─┐ │ 9223372036854775808 │ 4294967264 │ 16 │ 8 │ @@ -168,20 +184,28 @@ A value in the `Nullable(Decimal(P,S))` data type. 
The value contains: **Examples** +Query: + ``` sql -SELECT toDecimal32OrNull(toString(-1.111), 5) AS val, toTypeName(val) +SELECT toDecimal32OrNull(toString(-1.111), 5) AS val, toTypeName(val); ``` +Result: + ``` text ┌──────val─┬─toTypeName(toDecimal32OrNull(toString(-1.111), 5))─┐ │ -1.11100 │ Nullable(Decimal(9, 5)) │ └──────────┴────────────────────────────────────────────────────┘ ``` +Query: + ``` sql -SELECT toDecimal32OrNull(toString(-1.111), 2) AS val, toTypeName(val) +SELECT toDecimal32OrNull(toString(-1.111), 2) AS val, toTypeName(val); ``` +Result: + ``` text ┌──val─┬─toTypeName(toDecimal32OrNull(toString(-1.111), 2))─┐ │ ᴺᵁᴸᴸ │ Nullable(Decimal(9, 2)) │ @@ -348,7 +372,7 @@ String to UUID. Query: ``` sql -SELECT reinterpretAsUUID(reverse(unhex('000102030405060708090a0b0c0d0e0f'))) +SELECT reinterpretAsUUID(reverse(unhex('000102030405060708090a0b0c0d0e0f'))); ``` Result: @@ -381,9 +405,11 @@ Result: ## CAST(x, T) {#type_conversion_function-cast} -Converts ‘x’ to the ‘t’ data type. The syntax CAST(x AS t) is also supported. +Converts unput value `x` to the `T` data type. The syntax `CAST(x AS t)` is also supported. -Example: +**Example** + +Query: ``` sql SELECT @@ -394,6 +420,8 @@ SELECT CAST(timestamp, 'FixedString(22)') AS fixed_string ``` +Result: + ``` text ┌─timestamp───────────┬────────────datetime─┬───────date─┬─string──────────────┬─fixed_string──────────────┐ │ 2016-06-15 23:00:00 │ 2016-06-15 23:00:00 │ 2016-06-15 │ 2016-06-15 23:00:00 │ 2016-06-15 23:00:00\0\0\0 │ @@ -402,12 +430,18 @@ SELECT Conversion to FixedString(N) only works for arguments of type String or FixedString(N). -Type conversion to [Nullable](../../sql-reference/data-types/nullable.md) and back is supported. Example: +Type conversion to [Nullable](../../sql-reference/data-types/nullable.md) and back is supported. + +**Examples** + +Query: ``` sql -SELECT toTypeName(x) FROM t_null +SELECT toTypeName(x) FROM t_null; ``` +Result: + ``` text ┌─toTypeName(x)─┐ │ Int8 │ @@ -415,10 +449,14 @@ SELECT toTypeName(x) FROM t_null └───────────────┘ ``` +Query: + ``` sql -SELECT toTypeName(CAST(x, 'Nullable(UInt16)')) FROM t_null +SELECT toTypeName(CAST(x, 'Nullable(UInt16)')) FROM t_null; ``` +Result: + ``` text ┌─toTypeName(CAST(x, 'Nullable(UInt16)'))─┐ │ Nullable(UInt16) │ @@ -432,15 +470,18 @@ SELECT toTypeName(CAST(x, 'Nullable(UInt16)')) FROM t_null ## accurateCast(x, T) {#type_conversion_function-accurate-cast} -Converts ‘x’ to the ‘t’ data type. The differente from cast(x, T) is that accurateCast -does not allow overflow of numeric types during cast if type value x does not fit -bounds of type T. +Converts `x` to the `T` data type. The differente from [cast(x, T)](#type_conversion_function-cast) is that `accurateCast` +does not allow overflow of numeric types during cast if type value `x` does not fit bounds of type `T`. + +**Examples** + +Query: -Example ``` sql -SELECT cast(-1, 'UInt8') as uint8; +SELECT cast(-1, 'UInt8') as uint8; ``` +Result: ``` text ┌─uint8─┐ @@ -448,10 +489,14 @@ SELECT cast(-1, 'UInt8') as uint8; └───────┘ ``` +Query: + ```sql SELECT accurateCast(-1, 'UInt8') as uint8; ``` +Result: + ``` text Code: 70. DB::Exception: Received from localhost:9000. DB::Exception: Value in column Int8 cannot be safely converted into type UInt8: While processing accurateCast(-1, 'UInt8') AS uint8. @@ -472,7 +517,7 @@ accurateCastOrNull(x, T) - `x` — Input value. - `T` — The name of the returned data type. 
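The practical difference from `accurateCast` is what happens on failure: an out-of-range or unparsable input is expected to come back as `NULL` rather than as an exception. A minimal sketch of that behaviour (assuming a stock ClickHouse server):

``` sql
-- -1 does not fit UInt8, so NULL typed as Nullable(UInt8) is the expected result
SELECT accurateCastOrNull(-1, 'UInt8') AS value, toTypeName(value);
```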
-**Example** +**Examples** Query: @@ -488,6 +533,8 @@ Result: └────────────────────────────────────────────┘ ``` +Query: + ``` sql SELECT cast(-1, 'UInt8') as uint8, @@ -530,6 +577,8 @@ toIntervalYear(number) **Example** +Query: + ``` sql WITH toDate('2019-01-01') AS date, @@ -537,9 +586,11 @@ WITH toIntervalWeek(1) AS interval_to_week SELECT date + interval_week, - date + interval_to_week + date + interval_to_week; ``` +Result: + ``` text ┌─plus(date, interval_week)─┬─plus(date, interval_to_week)─┐ │ 2019-01-08 │ 2019-01-08 │ @@ -598,7 +649,7 @@ Query: ``` sql SELECT parseDateTimeBestEffort('Sat, 18 Aug 2018 07:22:16 GMT', 'Europe/Moscow') -AS parseDateTimeBestEffort +AS parseDateTimeBestEffort; ``` Result: @@ -613,7 +664,7 @@ Query: ``` sql SELECT parseDateTimeBestEffort('1284101485') -AS parseDateTimeBestEffort +AS parseDateTimeBestEffort; ``` Result: @@ -628,7 +679,7 @@ Query: ``` sql SELECT parseDateTimeBestEffort('2018-12-12 10:12:12') -AS parseDateTimeBestEffort +AS parseDateTimeBestEffort; ``` Result: @@ -642,7 +693,7 @@ Result: Query: ``` sql -SELECT parseDateTimeBestEffort('10 20:19') +SELECT parseDateTimeBestEffort('10 20:19'); ``` Result: @@ -667,7 +718,7 @@ This function is similar to [‘parseDateTimeBestEffort’](#parsedatetimebestef **Syntax** ``` sql -parseDateTimeBestEffortUS(time_string [, time_zone]); +parseDateTimeBestEffortUS(time_string [, time_zone]) ``` **Parameters** @@ -769,7 +820,7 @@ Type: `LowCardinality(expr_result_type)` Query: ``` sql -SELECT toLowCardinality('1') +SELECT toLowCardinality('1'); ``` Result: @@ -808,7 +859,7 @@ Query: ``` sql WITH toDateTime64('2019-09-16 19:20:12.345678910', 6) AS dt64 -SELECT toUnixTimestamp64Milli(dt64) +SELECT toUnixTimestamp64Milli(dt64); ``` Result: @@ -819,9 +870,11 @@ Result: └──────────────────────────────┘ ``` +Query: + ``` sql WITH toDateTime64('2019-09-16 19:20:12.345678910', 6) AS dt64 -SELECT toUnixTimestamp64Nano(dt64) +SELECT toUnixTimestamp64Nano(dt64); ``` Result: @@ -855,13 +908,17 @@ fromUnixTimestamp64Milli(value [, ti]) - `value` converted to the `DateTime64` data type. 
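Since `toUnixTimestamp64Milli` covers the opposite direction, a round trip makes a handy sanity check. The sketch below reuses the timestamp from the example that follows (the literal values are taken from that example, not from a fresh run):

``` sql
-- Converting the DateTime64 value back to its millisecond tick count
WITH toDateTime64('2009-02-13 23:31:31.011', 3, 'UTC') AS dt64
SELECT toUnixTimestamp64Milli(dt64); -- expected: 1234567891011
```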
-**Examples** +**Example** + +Query: ``` sql WITH CAST(1234567891011, 'Int64') AS i64 -SELECT fromUnixTimestamp64Milli(i64, 'UTC') +SELECT fromUnixTimestamp64Milli(i64, 'UTC'); ``` +Result: + ``` text ┌─fromUnixTimestamp64Milli(i64, 'UTC')─┐ │ 2009-02-13 23:31:31.011 │ @@ -893,7 +950,7 @@ Query: ``` sql SELECT formatRow('CSV', number, 'good') -FROM numbers(3) +FROM numbers(3); ``` Result: @@ -934,7 +991,7 @@ Query: ``` sql SELECT formatRowNoNewline('CSV', number, 'good') -FROM numbers(3) +FROM numbers(3); ``` Result: diff --git a/docs/ru/sql-reference/functions/type-conversion-functions.md b/docs/ru/sql-reference/functions/type-conversion-functions.md index 40fdbc6f5a0..ee3e8583504 100644 --- a/docs/ru/sql-reference/functions/type-conversion-functions.md +++ b/docs/ru/sql-reference/functions/type-conversion-functions.md @@ -36,10 +36,14 @@ toc_title: "\u0424\u0443\u043d\u043a\u0446\u0438\u0438\u0020\u043f\u0440\u0435\u **Пример** +Запрос: + ``` sql -SELECT toInt64(nan), toInt32(32), toInt16('16'), toInt8(8.8) +SELECT toInt64(nan), toInt32(32), toInt16('16'), toInt8(8.8); ``` +Результат: + ``` text ┌─────────toInt64(nan)─┬─toInt32(32)─┬─toInt16('16')─┬─toInt8(8.8)─┐ │ -9223372036854775808 │ 32 │ 16 │ 8 │ @@ -52,10 +56,14 @@ SELECT toInt64(nan), toInt32(32), toInt16('16'), toInt8(8.8) **Пример** +Запрос: + ``` sql -select toInt64OrZero('123123'), toInt8OrZero('123qwe123') +select toInt64OrZero('123123'), toInt8OrZero('123qwe123'); ``` +Результат: + ``` text ┌─toInt64OrZero('123123')─┬─toInt8OrZero('123qwe123')─┐ │ 123123 │ 0 │ @@ -68,10 +76,14 @@ select toInt64OrZero('123123'), toInt8OrZero('123qwe123') **Пример** +Запрос: + ``` sql -select toInt64OrNull('123123'), toInt8OrNull('123qwe123') +select toInt64OrNull('123123'), toInt8OrNull('123qwe123'); ``` +Результат: + ``` text ┌─toInt64OrNull('123123')─┬─toInt8OrNull('123qwe123')─┐ │ 123123 │ ᴺᵁᴸᴸ │ @@ -102,10 +114,14 @@ select toInt64OrNull('123123'), toInt8OrNull('123qwe123') **Пример** +Запрос: + ``` sql -SELECT toUInt64(nan), toUInt32(-32), toUInt16('16'), toUInt8(8.8) +SELECT toUInt64(nan), toUInt32(-32), toUInt16('16'), toUInt8(8.8); ``` +Результат: + ``` text ┌───────toUInt64(nan)─┬─toUInt32(-32)─┬─toUInt16('16')─┬─toUInt8(8.8)─┐ │ 9223372036854775808 │ 4294967264 │ 16 │ 8 │ @@ -168,20 +184,28 @@ SELECT toUInt64(nan), toUInt32(-32), toUInt16('16'), toUInt8(8.8) **Примеры** +Запрос: + ``` sql -SELECT toDecimal32OrNull(toString(-1.111), 5) AS val, toTypeName(val) +SELECT toDecimal32OrNull(toString(-1.111), 5) AS val, toTypeName(val); ``` +Результат: + ``` text ┌──────val─┬─toTypeName(toDecimal32OrNull(toString(-1.111), 5))─┐ │ -1.11100 │ Nullable(Decimal(9, 5)) │ └──────────┴────────────────────────────────────────────────────┘ ``` +Запрос: + ``` sql -SELECT toDecimal32OrNull(toString(-1.111), 2) AS val, toTypeName(val) +SELECT toDecimal32OrNull(toString(-1.111), 2) AS val, toTypeName(val); ``` +Результат: + ``` text ┌──val─┬─toTypeName(toDecimal32OrNull(toString(-1.111), 2))─┐ │ ᴺᵁᴸᴸ │ Nullable(Decimal(9, 2)) │ @@ -211,22 +235,30 @@ SELECT toDecimal32OrNull(toString(-1.111), 2) AS val, toTypeName(val) - Число с `S` десятичными знаками, если ClickHouse распознал число во входной строке. - 0 c `S` десятичными знаками, если ClickHouse не смог распознать число во входной строке или входное число содержит больше чем `S` десятичных знаков. 
-**Пример** +**Примеры** + +Запрос: ``` sql -SELECT toDecimal32OrZero(toString(-1.111), 5) AS val, toTypeName(val) +SELECT toDecimal32OrZero(toString(-1.111), 5) AS val, toTypeName(val); ``` +Результат: + ``` text ┌──────val─┬─toTypeName(toDecimal32OrZero(toString(-1.111), 5))─┐ │ -1.11100 │ Decimal(9, 5) │ └──────────┴────────────────────────────────────────────────────┘ ``` +Запрос: + ``` sql -SELECT toDecimal32OrZero(toString(-1.111), 2) AS val, toTypeName(val) +SELECT toDecimal32OrZero(toString(-1.111), 2) AS val, toTypeName(val); ``` +Результат: + ``` text ┌──val─┬─toTypeName(toDecimal32OrZero(toString(-1.111), 2))─┐ │ 0.00 │ Decimal(9, 2) │ @@ -258,12 +290,18 @@ YYYY-MM-DD hh:mm:ss Дополнительно, функция toString от аргумента типа DateTime может принимать второй аргумент String - имя тайм-зоны. Пример: `Asia/Yekaterinburg` В этом случае, форматирование времени производится согласно указанной тайм-зоне. +**Пример** + +Запрос: + ``` sql SELECT now() AS now_local, - toString(now(), 'Asia/Yekaterinburg') AS now_yekat + toString(now(), 'Asia/Yekaterinburg') AS now_yekat; ``` +Результат: + ``` text ┌───────────now_local─┬─now_yekat───────────┐ │ 2016-06-15 00:11:21 │ 2016-06-15 02:11:21 │ @@ -281,22 +319,30 @@ SELECT Принимает аргумент типа String или FixedString. Возвращает String, вырезая содержимое строки до первого найденного нулевого байта. -Пример: +**Примеры** + +Запрос: ``` sql -SELECT toFixedString('foo', 8) AS s, toStringCutToZero(s) AS s_cut +SELECT toFixedString('foo', 8) AS s, toStringCutToZero(s) AS s_cut; ``` +Результат: + ``` text ┌─s─────────────┬─s_cut─┐ │ foo\0\0\0\0\0 │ foo │ └───────────────┴───────┘ ``` +Запрос: + ``` sql -SELECT toFixedString('foo\0bar', 8) AS s, toStringCutToZero(s) AS s_cut +SELECT toFixedString('foo\0bar', 8) AS s, toStringCutToZero(s) AS s_cut; ``` +Результат: + ``` text ┌─s──────────┬─s_cut─┐ │ foo\0bar\0 │ foo │ @@ -344,7 +390,7 @@ reinterpretAsUUID(fixed_string) Запрос: ``` sql -SELECT reinterpretAsUUID(reverse(unhex('000102030405060708090a0b0c0d0e0f'))) +SELECT reinterpretAsUUID(reverse(unhex('000102030405060708090a0b0c0d0e0f'))); ``` Результат: @@ -377,10 +423,11 @@ SELECT uuid = uuid2; ## CAST(x, T) {#type_conversion_function-cast} -Преобразует x в тип данных t. -Поддерживается также синтаксис CAST(x AS t). +Преобразует входное значение `x` в тип данных `T`. Поддерживается также синтаксис `CAST(x AS t)`. -Пример: +**Пример** + +Запрос: ``` sql SELECT @@ -388,9 +435,11 @@ SELECT CAST(timestamp AS DateTime) AS datetime, CAST(timestamp AS Date) AS date, CAST(timestamp, 'String') AS string, - CAST(timestamp, 'FixedString(22)') AS fixed_string + CAST(timestamp, 'FixedString(22)') AS fixed_string; ``` +Результат: + ``` text ┌─timestamp───────────┬────────────datetime─┬───────date─┬─string──────────────┬─fixed_string──────────────┐ │ 2016-06-15 23:00:00 │ 2016-06-15 23:00:00 │ 2016-06-15 │ 2016-06-15 23:00:00 │ 2016-06-15 23:00:00\0\0\0 │ @@ -399,12 +448,18 @@ SELECT Преобразование в FixedString(N) работает только для аргументов типа String или FixedString(N). -Поддержано преобразование к типу [Nullable](../../sql-reference/functions/type-conversion-functions.md) и обратно. Пример: +Поддерживается преобразование к типу [Nullable](../../sql-reference/functions/type-conversion-functions.md) и обратно. 
+ +**Примеры** + +Запрос: ``` sql -SELECT toTypeName(x) FROM t_null +SELECT toTypeName(x) FROM t_null; ``` +Результат: + ``` text ┌─toTypeName(x)─┐ │ Int8 │ @@ -412,10 +467,14 @@ SELECT toTypeName(x) FROM t_null └───────────────┘ ``` +Запрос: + ``` sql -SELECT toTypeName(CAST(x, 'Nullable(UInt16)')) FROM t_null +SELECT toTypeName(CAST(x, 'Nullable(UInt16)')) FROM t_null; ``` +Результат: + ``` text ┌─toTypeName(CAST(x, 'Nullable(UInt16)'))─┐ │ Nullable(UInt16) │ @@ -442,7 +501,7 @@ accurateCastOrNull(x, T) - `x` — входное значение. - `T` — имя возвращаемого типа данных. -**Пример** +**Примеры** Запрос: @@ -502,6 +561,8 @@ toIntervalYear(number) **Пример** +Запрос: + ``` sql WITH toDate('2019-01-01') AS date, @@ -509,9 +570,11 @@ WITH toIntervalWeek(1) AS interval_to_week SELECT date + interval_week, - date + interval_to_week + date + interval_to_week; ``` +Результат: + ``` text ┌─plus(date, interval_week)─┬─plus(date, interval_to_week)─┐ │ 2019-01-08 │ 2019-01-08 │ @@ -527,7 +590,7 @@ SELECT **Синтаксис** ``` sql -parseDateTimeBestEffort(time_string[, time_zone]); +parseDateTimeBestEffort(time_string[, time_zone]) ``` **Параметры** @@ -570,7 +633,7 @@ AS parseDateTimeBestEffort; ``` sql SELECT parseDateTimeBestEffort('Sat, 18 Aug 2018 07:22:16 GMT', 'Europe/Moscow') -AS parseDateTimeBestEffort +AS parseDateTimeBestEffort; ``` Результат: @@ -585,7 +648,7 @@ AS parseDateTimeBestEffort ``` sql SELECT parseDateTimeBestEffort('1284101485') -AS parseDateTimeBestEffort +AS parseDateTimeBestEffort; ``` Результат: @@ -600,7 +663,7 @@ AS parseDateTimeBestEffort ``` sql SELECT parseDateTimeBestEffort('2018-12-12 10:12:12') -AS parseDateTimeBestEffort +AS parseDateTimeBestEffort; ``` Результат: @@ -614,7 +677,7 @@ AS parseDateTimeBestEffort Запрос: ``` sql -SELECT parseDateTimeBestEffort('10 20:19') +SELECT parseDateTimeBestEffort('10 20:19'); ``` Результат: @@ -639,7 +702,7 @@ SELECT parseDateTimeBestEffort('10 20:19') **Синтаксис** ``` sql -parseDateTimeBestEffortUS(time_string [, time_zone]); +parseDateTimeBestEffortUS(time_string [, time_zone]) ``` **Параметры** @@ -668,7 +731,7 @@ SELECT parseDateTimeBestEffortUS('09/12/2020 12:12:57') AS parseDateTimeBestEffortUS; ``` -Ответ: +Результат: ``` text ┌─parseDateTimeBestEffortUS─┐ @@ -683,7 +746,7 @@ SELECT parseDateTimeBestEffortUS('09-12-2020 12:12:57') AS parseDateTimeBestEffortUS; ``` -Ответ: +Результат: ``` text ┌─parseDateTimeBestEffortUS─┐ @@ -698,7 +761,7 @@ SELECT parseDateTimeBestEffortUS('09.12.2020 12:12:57') AS parseDateTimeBestEffortUS; ``` -Ответ: +Результат: ``` text ┌─parseDateTimeBestEffortUS─┐ @@ -733,10 +796,10 @@ toUnixTimestamp64Milli(value) ``` sql WITH toDateTime64('2019-09-16 19:20:12.345678910', 6) AS dt64 -SELECT toUnixTimestamp64Milli(dt64) +SELECT toUnixTimestamp64Milli(dt64); ``` -Ответ: +Результат: ``` text ┌─toUnixTimestamp64Milli(dt64)─┐ @@ -748,10 +811,10 @@ SELECT toUnixTimestamp64Milli(dt64) ``` sql WITH toDateTime64('2019-09-16 19:20:12.345678910', 6) AS dt64 -SELECT toUnixTimestamp64Nano(dt64) +SELECT toUnixTimestamp64Nano(dt64); ``` -Ответ: +Результат: ``` text ┌─toUnixTimestamp64Nano(dt64)─┐ @@ -786,10 +849,10 @@ fromUnixTimestamp64Milli(value [, ti]) ``` sql WITH CAST(1234567891011, 'Int64') AS i64 -SELECT fromUnixTimestamp64Milli(i64, 'UTC') +SELECT fromUnixTimestamp64Milli(i64, 'UTC'); ``` -Ответ: +Результат: ``` text ┌─fromUnixTimestamp64Milli(i64, 'UTC')─┐ @@ -820,7 +883,7 @@ toLowCardinality(expr) Тип: `LowCardinality(expr_result_type)` -**Example** +**Пример** Запрос: @@ -861,10 +924,10 @@ 
formatRow(format, x, y, ...) ``` sql SELECT formatRow('CSV', number, 'good') -FROM numbers(3) +FROM numbers(3); ``` -Ответ: +Результат: ``` text ┌─formatRow('CSV', number, 'good')─┐ @@ -902,10 +965,10 @@ formatRowNoNewline(format, x, y, ...) ``` sql SELECT formatRowNoNewline('CSV', number, 'good') -FROM numbers(3) +FROM numbers(3); ``` -Ответ: +Результат: ``` text ┌─formatRowNoNewline('CSV', number, 'good')─┐ From a3721ef0ac77046bc0db336b0bb71aa274b2fe97 Mon Sep 17 00:00:00 2001 From: romanzhukov Date: Fri, 5 Feb 2021 01:44:18 +0300 Subject: [PATCH 076/381] Revert "DOCSUP-5266: Fix ticket comments." This reverts commit 9a9138d0380ddf67cceda85eb26f8c4d2c978b63. --- .../functions/type-conversion-functions.md | 119 ++++---------- .../functions/type-conversion-functions.md | 149 +++++------------- 2 files changed, 74 insertions(+), 194 deletions(-) diff --git a/docs/en/sql-reference/functions/type-conversion-functions.md b/docs/en/sql-reference/functions/type-conversion-functions.md index b2ede6ba6ec..047b3b1cbea 100644 --- a/docs/en/sql-reference/functions/type-conversion-functions.md +++ b/docs/en/sql-reference/functions/type-conversion-functions.md @@ -36,14 +36,10 @@ The behavior of functions for the [NaN and Inf](../../sql-reference/data-types/f **Example** -Query: - ``` sql -SELECT toInt64(nan), toInt32(32), toInt16('16'), toInt8(8.8); +SELECT toInt64(nan), toInt32(32), toInt16('16'), toInt8(8.8) ``` -Result: - ``` text ┌─────────toInt64(nan)─┬─toInt32(32)─┬─toInt16('16')─┬─toInt8(8.8)─┐ │ -9223372036854775808 │ 32 │ 16 │ 8 │ @@ -56,14 +52,10 @@ It takes an argument of type String and tries to parse it into Int (8 \| 16 \| 3 **Example** -Query: - ``` sql -select toInt64OrZero('123123'), toInt8OrZero('123qwe123'); +select toInt64OrZero('123123'), toInt8OrZero('123qwe123') ``` -Result: - ``` text ┌─toInt64OrZero('123123')─┬─toInt8OrZero('123qwe123')─┐ │ 123123 │ 0 │ @@ -76,14 +68,10 @@ It takes an argument of type String and tries to parse it into Int (8 \| 16 \| 3 **Example** -Query: - ``` sql -select toInt64OrNull('123123'), toInt8OrNull('123qwe123'); +select toInt64OrNull('123123'), toInt8OrNull('123qwe123') ``` -String: - ``` text ┌─toInt64OrNull('123123')─┬─toInt8OrNull('123qwe123')─┐ │ 123123 │ ᴺᵁᴸᴸ │ @@ -114,14 +102,10 @@ The behavior of functions for negative agruments and for the [NaN and Inf](../.. **Example** -Query: - ``` sql -SELECT toUInt64(nan), toUInt32(-32), toUInt16('16'), toUInt8(8.8); +SELECT toUInt64(nan), toUInt32(-32), toUInt16('16'), toUInt8(8.8) ``` -Result: - ``` text ┌───────toUInt64(nan)─┬─toUInt32(-32)─┬─toUInt16('16')─┬─toUInt8(8.8)─┐ │ 9223372036854775808 │ 4294967264 │ 16 │ 8 │ @@ -184,28 +168,20 @@ A value in the `Nullable(Decimal(P,S))` data type. The value contains: **Examples** -Query: - ``` sql -SELECT toDecimal32OrNull(toString(-1.111), 5) AS val, toTypeName(val); +SELECT toDecimal32OrNull(toString(-1.111), 5) AS val, toTypeName(val) ``` -Result: - ``` text ┌──────val─┬─toTypeName(toDecimal32OrNull(toString(-1.111), 5))─┐ │ -1.11100 │ Nullable(Decimal(9, 5)) │ └──────────┴────────────────────────────────────────────────────┘ ``` -Query: - ``` sql -SELECT toDecimal32OrNull(toString(-1.111), 2) AS val, toTypeName(val); +SELECT toDecimal32OrNull(toString(-1.111), 2) AS val, toTypeName(val) ``` -Result: - ``` text ┌──val─┬─toTypeName(toDecimal32OrNull(toString(-1.111), 2))─┐ │ ᴺᵁᴸᴸ │ Nullable(Decimal(9, 2)) │ @@ -372,7 +348,7 @@ String to UUID. 
Query: ``` sql -SELECT reinterpretAsUUID(reverse(unhex('000102030405060708090a0b0c0d0e0f'))); +SELECT reinterpretAsUUID(reverse(unhex('000102030405060708090a0b0c0d0e0f'))) ``` Result: @@ -405,11 +381,9 @@ Result: ## CAST(x, T) {#type_conversion_function-cast} -Converts unput value `x` to the `T` data type. The syntax `CAST(x AS t)` is also supported. +Converts ‘x’ to the ‘t’ data type. The syntax CAST(x AS t) is also supported. -**Example** - -Query: +Example: ``` sql SELECT @@ -420,8 +394,6 @@ SELECT CAST(timestamp, 'FixedString(22)') AS fixed_string ``` -Result: - ``` text ┌─timestamp───────────┬────────────datetime─┬───────date─┬─string──────────────┬─fixed_string──────────────┐ │ 2016-06-15 23:00:00 │ 2016-06-15 23:00:00 │ 2016-06-15 │ 2016-06-15 23:00:00 │ 2016-06-15 23:00:00\0\0\0 │ @@ -430,18 +402,12 @@ Result: Conversion to FixedString(N) only works for arguments of type String or FixedString(N). -Type conversion to [Nullable](../../sql-reference/data-types/nullable.md) and back is supported. - -**Examples** - -Query: +Type conversion to [Nullable](../../sql-reference/data-types/nullable.md) and back is supported. Example: ``` sql -SELECT toTypeName(x) FROM t_null; +SELECT toTypeName(x) FROM t_null ``` -Result: - ``` text ┌─toTypeName(x)─┐ │ Int8 │ @@ -449,14 +415,10 @@ Result: └───────────────┘ ``` -Query: - ``` sql -SELECT toTypeName(CAST(x, 'Nullable(UInt16)')) FROM t_null; +SELECT toTypeName(CAST(x, 'Nullable(UInt16)')) FROM t_null ``` -Result: - ``` text ┌─toTypeName(CAST(x, 'Nullable(UInt16)'))─┐ │ Nullable(UInt16) │ @@ -470,18 +432,15 @@ Result: ## accurateCast(x, T) {#type_conversion_function-accurate-cast} -Converts `x` to the `T` data type. The differente from [cast(x, T)](#type_conversion_function-cast) is that `accurateCast` -does not allow overflow of numeric types during cast if type value `x` does not fit bounds of type `T`. - -**Examples** - -Query: +Converts ‘x’ to the ‘t’ data type. The differente from cast(x, T) is that accurateCast +does not allow overflow of numeric types during cast if type value x does not fit +bounds of type T. +Example ``` sql -SELECT cast(-1, 'UInt8') as uint8; +SELECT cast(-1, 'UInt8') as uint8; ``` -Result: ``` text ┌─uint8─┐ @@ -489,14 +448,10 @@ Result: └───────┘ ``` -Query: - ```sql SELECT accurateCast(-1, 'UInt8') as uint8; ``` -Result: - ``` text Code: 70. DB::Exception: Received from localhost:9000. DB::Exception: Value in column Int8 cannot be safely converted into type UInt8: While processing accurateCast(-1, 'UInt8') AS uint8. @@ -517,7 +472,7 @@ accurateCastOrNull(x, T) - `x` — Input value. - `T` — The name of the returned data type. 
-**Examples** +**Example** Query: @@ -533,8 +488,6 @@ Result: └────────────────────────────────────────────┘ ``` -Query: - ``` sql SELECT cast(-1, 'UInt8') as uint8, @@ -577,8 +530,6 @@ toIntervalYear(number) **Example** -Query: - ``` sql WITH toDate('2019-01-01') AS date, @@ -586,11 +537,9 @@ WITH toIntervalWeek(1) AS interval_to_week SELECT date + interval_week, - date + interval_to_week; + date + interval_to_week ``` -Result: - ``` text ┌─plus(date, interval_week)─┬─plus(date, interval_to_week)─┐ │ 2019-01-08 │ 2019-01-08 │ @@ -649,7 +598,7 @@ Query: ``` sql SELECT parseDateTimeBestEffort('Sat, 18 Aug 2018 07:22:16 GMT', 'Europe/Moscow') -AS parseDateTimeBestEffort; +AS parseDateTimeBestEffort ``` Result: @@ -664,7 +613,7 @@ Query: ``` sql SELECT parseDateTimeBestEffort('1284101485') -AS parseDateTimeBestEffort; +AS parseDateTimeBestEffort ``` Result: @@ -679,7 +628,7 @@ Query: ``` sql SELECT parseDateTimeBestEffort('2018-12-12 10:12:12') -AS parseDateTimeBestEffort; +AS parseDateTimeBestEffort ``` Result: @@ -693,7 +642,7 @@ Result: Query: ``` sql -SELECT parseDateTimeBestEffort('10 20:19'); +SELECT parseDateTimeBestEffort('10 20:19') ``` Result: @@ -718,7 +667,7 @@ This function is similar to [‘parseDateTimeBestEffort’](#parsedatetimebestef **Syntax** ``` sql -parseDateTimeBestEffortUS(time_string [, time_zone]) +parseDateTimeBestEffortUS(time_string [, time_zone]); ``` **Parameters** @@ -820,7 +769,7 @@ Type: `LowCardinality(expr_result_type)` Query: ``` sql -SELECT toLowCardinality('1'); +SELECT toLowCardinality('1') ``` Result: @@ -859,7 +808,7 @@ Query: ``` sql WITH toDateTime64('2019-09-16 19:20:12.345678910', 6) AS dt64 -SELECT toUnixTimestamp64Milli(dt64); +SELECT toUnixTimestamp64Milli(dt64) ``` Result: @@ -870,11 +819,9 @@ Result: └──────────────────────────────┘ ``` -Query: - ``` sql WITH toDateTime64('2019-09-16 19:20:12.345678910', 6) AS dt64 -SELECT toUnixTimestamp64Nano(dt64); +SELECT toUnixTimestamp64Nano(dt64) ``` Result: @@ -908,17 +855,13 @@ fromUnixTimestamp64Milli(value [, ti]) - `value` converted to the `DateTime64` data type. 
-**Example** - -Query: +**Examples** ``` sql WITH CAST(1234567891011, 'Int64') AS i64 -SELECT fromUnixTimestamp64Milli(i64, 'UTC'); +SELECT fromUnixTimestamp64Milli(i64, 'UTC') ``` -Result: - ``` text ┌─fromUnixTimestamp64Milli(i64, 'UTC')─┐ │ 2009-02-13 23:31:31.011 │ @@ -950,7 +893,7 @@ Query: ``` sql SELECT formatRow('CSV', number, 'good') -FROM numbers(3); +FROM numbers(3) ``` Result: @@ -991,7 +934,7 @@ Query: ``` sql SELECT formatRowNoNewline('CSV', number, 'good') -FROM numbers(3); +FROM numbers(3) ``` Result: diff --git a/docs/ru/sql-reference/functions/type-conversion-functions.md b/docs/ru/sql-reference/functions/type-conversion-functions.md index ee3e8583504..40fdbc6f5a0 100644 --- a/docs/ru/sql-reference/functions/type-conversion-functions.md +++ b/docs/ru/sql-reference/functions/type-conversion-functions.md @@ -36,14 +36,10 @@ toc_title: "\u0424\u0443\u043d\u043a\u0446\u0438\u0438\u0020\u043f\u0440\u0435\u **Пример** -Запрос: - ``` sql -SELECT toInt64(nan), toInt32(32), toInt16('16'), toInt8(8.8); +SELECT toInt64(nan), toInt32(32), toInt16('16'), toInt8(8.8) ``` -Результат: - ``` text ┌─────────toInt64(nan)─┬─toInt32(32)─┬─toInt16('16')─┬─toInt8(8.8)─┐ │ -9223372036854775808 │ 32 │ 16 │ 8 │ @@ -56,14 +52,10 @@ SELECT toInt64(nan), toInt32(32), toInt16('16'), toInt8(8.8); **Пример** -Запрос: - ``` sql -select toInt64OrZero('123123'), toInt8OrZero('123qwe123'); +select toInt64OrZero('123123'), toInt8OrZero('123qwe123') ``` -Результат: - ``` text ┌─toInt64OrZero('123123')─┬─toInt8OrZero('123qwe123')─┐ │ 123123 │ 0 │ @@ -76,14 +68,10 @@ select toInt64OrZero('123123'), toInt8OrZero('123qwe123'); **Пример** -Запрос: - ``` sql -select toInt64OrNull('123123'), toInt8OrNull('123qwe123'); +select toInt64OrNull('123123'), toInt8OrNull('123qwe123') ``` -Результат: - ``` text ┌─toInt64OrNull('123123')─┬─toInt8OrNull('123qwe123')─┐ │ 123123 │ ᴺᵁᴸᴸ │ @@ -114,14 +102,10 @@ select toInt64OrNull('123123'), toInt8OrNull('123qwe123'); **Пример** -Запрос: - ``` sql -SELECT toUInt64(nan), toUInt32(-32), toUInt16('16'), toUInt8(8.8); +SELECT toUInt64(nan), toUInt32(-32), toUInt16('16'), toUInt8(8.8) ``` -Результат: - ``` text ┌───────toUInt64(nan)─┬─toUInt32(-32)─┬─toUInt16('16')─┬─toUInt8(8.8)─┐ │ 9223372036854775808 │ 4294967264 │ 16 │ 8 │ @@ -184,28 +168,20 @@ SELECT toUInt64(nan), toUInt32(-32), toUInt16('16'), toUInt8(8.8); **Примеры** -Запрос: - ``` sql -SELECT toDecimal32OrNull(toString(-1.111), 5) AS val, toTypeName(val); +SELECT toDecimal32OrNull(toString(-1.111), 5) AS val, toTypeName(val) ``` -Результат: - ``` text ┌──────val─┬─toTypeName(toDecimal32OrNull(toString(-1.111), 5))─┐ │ -1.11100 │ Nullable(Decimal(9, 5)) │ └──────────┴────────────────────────────────────────────────────┘ ``` -Запрос: - ``` sql -SELECT toDecimal32OrNull(toString(-1.111), 2) AS val, toTypeName(val); +SELECT toDecimal32OrNull(toString(-1.111), 2) AS val, toTypeName(val) ``` -Результат: - ``` text ┌──val─┬─toTypeName(toDecimal32OrNull(toString(-1.111), 2))─┐ │ ᴺᵁᴸᴸ │ Nullable(Decimal(9, 2)) │ @@ -235,30 +211,22 @@ SELECT toDecimal32OrNull(toString(-1.111), 2) AS val, toTypeName(val); - Число с `S` десятичными знаками, если ClickHouse распознал число во входной строке. - 0 c `S` десятичными знаками, если ClickHouse не смог распознать число во входной строке или входное число содержит больше чем `S` десятичных знаков. 
-**Примеры** - -Запрос: +**Пример** ``` sql -SELECT toDecimal32OrZero(toString(-1.111), 5) AS val, toTypeName(val); +SELECT toDecimal32OrZero(toString(-1.111), 5) AS val, toTypeName(val) ``` -Результат: - ``` text ┌──────val─┬─toTypeName(toDecimal32OrZero(toString(-1.111), 5))─┐ │ -1.11100 │ Decimal(9, 5) │ └──────────┴────────────────────────────────────────────────────┘ ``` -Запрос: - ``` sql -SELECT toDecimal32OrZero(toString(-1.111), 2) AS val, toTypeName(val); +SELECT toDecimal32OrZero(toString(-1.111), 2) AS val, toTypeName(val) ``` -Результат: - ``` text ┌──val─┬─toTypeName(toDecimal32OrZero(toString(-1.111), 2))─┐ │ 0.00 │ Decimal(9, 2) │ @@ -290,18 +258,12 @@ YYYY-MM-DD hh:mm:ss Дополнительно, функция toString от аргумента типа DateTime может принимать второй аргумент String - имя тайм-зоны. Пример: `Asia/Yekaterinburg` В этом случае, форматирование времени производится согласно указанной тайм-зоне. -**Пример** - -Запрос: - ``` sql SELECT now() AS now_local, - toString(now(), 'Asia/Yekaterinburg') AS now_yekat; + toString(now(), 'Asia/Yekaterinburg') AS now_yekat ``` -Результат: - ``` text ┌───────────now_local─┬─now_yekat───────────┐ │ 2016-06-15 00:11:21 │ 2016-06-15 02:11:21 │ @@ -319,30 +281,22 @@ SELECT Принимает аргумент типа String или FixedString. Возвращает String, вырезая содержимое строки до первого найденного нулевого байта. -**Примеры** - -Запрос: +Пример: ``` sql -SELECT toFixedString('foo', 8) AS s, toStringCutToZero(s) AS s_cut; +SELECT toFixedString('foo', 8) AS s, toStringCutToZero(s) AS s_cut ``` -Результат: - ``` text ┌─s─────────────┬─s_cut─┐ │ foo\0\0\0\0\0 │ foo │ └───────────────┴───────┘ ``` -Запрос: - ``` sql -SELECT toFixedString('foo\0bar', 8) AS s, toStringCutToZero(s) AS s_cut; +SELECT toFixedString('foo\0bar', 8) AS s, toStringCutToZero(s) AS s_cut ``` -Результат: - ``` text ┌─s──────────┬─s_cut─┐ │ foo\0bar\0 │ foo │ @@ -390,7 +344,7 @@ reinterpretAsUUID(fixed_string) Запрос: ``` sql -SELECT reinterpretAsUUID(reverse(unhex('000102030405060708090a0b0c0d0e0f'))); +SELECT reinterpretAsUUID(reverse(unhex('000102030405060708090a0b0c0d0e0f'))) ``` Результат: @@ -423,11 +377,10 @@ SELECT uuid = uuid2; ## CAST(x, T) {#type_conversion_function-cast} -Преобразует входное значение `x` в тип данных `T`. Поддерживается также синтаксис `CAST(x AS t)`. +Преобразует x в тип данных t. +Поддерживается также синтаксис CAST(x AS t). -**Пример** - -Запрос: +Пример: ``` sql SELECT @@ -435,11 +388,9 @@ SELECT CAST(timestamp AS DateTime) AS datetime, CAST(timestamp AS Date) AS date, CAST(timestamp, 'String') AS string, - CAST(timestamp, 'FixedString(22)') AS fixed_string; + CAST(timestamp, 'FixedString(22)') AS fixed_string ``` -Результат: - ``` text ┌─timestamp───────────┬────────────datetime─┬───────date─┬─string──────────────┬─fixed_string──────────────┐ │ 2016-06-15 23:00:00 │ 2016-06-15 23:00:00 │ 2016-06-15 │ 2016-06-15 23:00:00 │ 2016-06-15 23:00:00\0\0\0 │ @@ -448,18 +399,12 @@ SELECT Преобразование в FixedString(N) работает только для аргументов типа String или FixedString(N). -Поддерживается преобразование к типу [Nullable](../../sql-reference/functions/type-conversion-functions.md) и обратно. - -**Примеры** - -Запрос: +Поддержано преобразование к типу [Nullable](../../sql-reference/functions/type-conversion-functions.md) и обратно. 
Пример: ``` sql -SELECT toTypeName(x) FROM t_null; +SELECT toTypeName(x) FROM t_null ``` -Результат: - ``` text ┌─toTypeName(x)─┐ │ Int8 │ @@ -467,14 +412,10 @@ SELECT toTypeName(x) FROM t_null; └───────────────┘ ``` -Запрос: - ``` sql -SELECT toTypeName(CAST(x, 'Nullable(UInt16)')) FROM t_null; +SELECT toTypeName(CAST(x, 'Nullable(UInt16)')) FROM t_null ``` -Результат: - ``` text ┌─toTypeName(CAST(x, 'Nullable(UInt16)'))─┐ │ Nullable(UInt16) │ @@ -501,7 +442,7 @@ accurateCastOrNull(x, T) - `x` — входное значение. - `T` — имя возвращаемого типа данных. -**Примеры** +**Пример** Запрос: @@ -561,8 +502,6 @@ toIntervalYear(number) **Пример** -Запрос: - ``` sql WITH toDate('2019-01-01') AS date, @@ -570,11 +509,9 @@ WITH toIntervalWeek(1) AS interval_to_week SELECT date + interval_week, - date + interval_to_week; + date + interval_to_week ``` -Результат: - ``` text ┌─plus(date, interval_week)─┬─plus(date, interval_to_week)─┐ │ 2019-01-08 │ 2019-01-08 │ @@ -590,7 +527,7 @@ SELECT **Синтаксис** ``` sql -parseDateTimeBestEffort(time_string[, time_zone]) +parseDateTimeBestEffort(time_string[, time_zone]); ``` **Параметры** @@ -633,7 +570,7 @@ AS parseDateTimeBestEffort; ``` sql SELECT parseDateTimeBestEffort('Sat, 18 Aug 2018 07:22:16 GMT', 'Europe/Moscow') -AS parseDateTimeBestEffort; +AS parseDateTimeBestEffort ``` Результат: @@ -648,7 +585,7 @@ AS parseDateTimeBestEffort; ``` sql SELECT parseDateTimeBestEffort('1284101485') -AS parseDateTimeBestEffort; +AS parseDateTimeBestEffort ``` Результат: @@ -663,7 +600,7 @@ AS parseDateTimeBestEffort; ``` sql SELECT parseDateTimeBestEffort('2018-12-12 10:12:12') -AS parseDateTimeBestEffort; +AS parseDateTimeBestEffort ``` Результат: @@ -677,7 +614,7 @@ AS parseDateTimeBestEffort; Запрос: ``` sql -SELECT parseDateTimeBestEffort('10 20:19'); +SELECT parseDateTimeBestEffort('10 20:19') ``` Результат: @@ -702,7 +639,7 @@ SELECT parseDateTimeBestEffort('10 20:19'); **Синтаксис** ``` sql -parseDateTimeBestEffortUS(time_string [, time_zone]) +parseDateTimeBestEffortUS(time_string [, time_zone]); ``` **Параметры** @@ -731,7 +668,7 @@ SELECT parseDateTimeBestEffortUS('09/12/2020 12:12:57') AS parseDateTimeBestEffortUS; ``` -Результат: +Ответ: ``` text ┌─parseDateTimeBestEffortUS─┐ @@ -746,7 +683,7 @@ SELECT parseDateTimeBestEffortUS('09-12-2020 12:12:57') AS parseDateTimeBestEffortUS; ``` -Результат: +Ответ: ``` text ┌─parseDateTimeBestEffortUS─┐ @@ -761,7 +698,7 @@ SELECT parseDateTimeBestEffortUS('09.12.2020 12:12:57') AS parseDateTimeBestEffortUS; ``` -Результат: +Ответ: ``` text ┌─parseDateTimeBestEffortUS─┐ @@ -796,10 +733,10 @@ toUnixTimestamp64Milli(value) ``` sql WITH toDateTime64('2019-09-16 19:20:12.345678910', 6) AS dt64 -SELECT toUnixTimestamp64Milli(dt64); +SELECT toUnixTimestamp64Milli(dt64) ``` -Результат: +Ответ: ``` text ┌─toUnixTimestamp64Milli(dt64)─┐ @@ -811,10 +748,10 @@ SELECT toUnixTimestamp64Milli(dt64); ``` sql WITH toDateTime64('2019-09-16 19:20:12.345678910', 6) AS dt64 -SELECT toUnixTimestamp64Nano(dt64); +SELECT toUnixTimestamp64Nano(dt64) ``` -Результат: +Ответ: ``` text ┌─toUnixTimestamp64Nano(dt64)─┐ @@ -849,10 +786,10 @@ fromUnixTimestamp64Milli(value [, ti]) ``` sql WITH CAST(1234567891011, 'Int64') AS i64 -SELECT fromUnixTimestamp64Milli(i64, 'UTC'); +SELECT fromUnixTimestamp64Milli(i64, 'UTC') ``` -Результат: +Ответ: ``` text ┌─fromUnixTimestamp64Milli(i64, 'UTC')─┐ @@ -883,7 +820,7 @@ toLowCardinality(expr) Тип: `LowCardinality(expr_result_type)` -**Пример** +**Example** Запрос: @@ -924,10 +861,10 @@ formatRow(format, x, y, 
...) ``` sql SELECT formatRow('CSV', number, 'good') -FROM numbers(3); +FROM numbers(3) ``` -Результат: +Ответ: ``` text ┌─formatRow('CSV', number, 'good')─┐ @@ -965,10 +902,10 @@ formatRowNoNewline(format, x, y, ...) ``` sql SELECT formatRowNoNewline('CSV', number, 'good') -FROM numbers(3); +FROM numbers(3) ``` -Результат: +Ответ: ``` text ┌─formatRowNoNewline('CSV', number, 'good')─┐ From cc17edbc99ed060f870331f5eb9da93baf5e1e03 Mon Sep 17 00:00:00 2001 From: romanzhukov Date: Fri, 5 Feb 2021 13:29:31 +0300 Subject: [PATCH 077/381] DOCSUP-5822: Add function documentation and fix all file examples. --- .../functions/type-conversion-functions.md | 154 +++++++++++++----- .../functions/type-conversion-functions.md | 146 ++++++++++++----- 2 files changed, 220 insertions(+), 80 deletions(-) diff --git a/docs/en/sql-reference/functions/type-conversion-functions.md b/docs/en/sql-reference/functions/type-conversion-functions.md index 047b3b1cbea..1742f6b8888 100644 --- a/docs/en/sql-reference/functions/type-conversion-functions.md +++ b/docs/en/sql-reference/functions/type-conversion-functions.md @@ -36,10 +36,14 @@ The behavior of functions for the [NaN and Inf](../../sql-reference/data-types/f **Example** +Query: + ``` sql -SELECT toInt64(nan), toInt32(32), toInt16('16'), toInt8(8.8) +SELECT toInt64(nan), toInt32(32), toInt16('16'), toInt8(8.8); ``` +Result: + ``` text ┌─────────toInt64(nan)─┬─toInt32(32)─┬─toInt16('16')─┬─toInt8(8.8)─┐ │ -9223372036854775808 │ 32 │ 16 │ 8 │ @@ -52,10 +56,14 @@ It takes an argument of type String and tries to parse it into Int (8 \| 16 \| 3 **Example** +Query: + ``` sql -select toInt64OrZero('123123'), toInt8OrZero('123qwe123') +SELECT toInt64OrZero('123123'), toInt8OrZero('123qwe123'); ``` +Result: + ``` text ┌─toInt64OrZero('123123')─┬─toInt8OrZero('123qwe123')─┐ │ 123123 │ 0 │ @@ -68,10 +76,14 @@ It takes an argument of type String and tries to parse it into Int (8 \| 16 \| 3 **Example** +Query: + ``` sql -select toInt64OrNull('123123'), toInt8OrNull('123qwe123') +SELECT toInt64OrNull('123123'), toInt8OrNull('123qwe123'); ``` +Result: + ``` text ┌─toInt64OrNull('123123')─┬─toInt8OrNull('123qwe123')─┐ │ 123123 │ ᴺᵁᴸᴸ │ @@ -102,10 +114,14 @@ The behavior of functions for negative agruments and for the [NaN and Inf](../.. **Example** +Query: + ``` sql -SELECT toUInt64(nan), toUInt32(-32), toUInt16('16'), toUInt8(8.8) +SELECT toUInt64(nan), toUInt32(-32), toUInt16('16'), toUInt8(8.8); ``` +Result: + ``` text ┌───────toUInt64(nan)─┬─toUInt32(-32)─┬─toUInt16('16')─┬─toUInt8(8.8)─┐ │ 9223372036854775808 │ 4294967264 │ 16 │ 8 │ @@ -168,20 +184,28 @@ A value in the `Nullable(Decimal(P,S))` data type. The value contains: **Examples** +Query: + ``` sql -SELECT toDecimal32OrNull(toString(-1.111), 5) AS val, toTypeName(val) +SELECT toDecimal32OrNull(toString(-1.111), 5) AS val, toTypeName(val); ``` +Result: + ``` text ┌──────val─┬─toTypeName(toDecimal32OrNull(toString(-1.111), 5))─┐ │ -1.11100 │ Nullable(Decimal(9, 5)) │ └──────────┴────────────────────────────────────────────────────┘ ``` +Query: + ``` sql -SELECT toDecimal32OrNull(toString(-1.111), 2) AS val, toTypeName(val) +SELECT toDecimal32OrNull(toString(-1.111), 2) AS val, toTypeName(val); ``` +Result: + ``` text ┌──val─┬─toTypeName(toDecimal32OrNull(toString(-1.111), 2))─┐ │ ᴺᵁᴸᴸ │ Nullable(Decimal(9, 2)) │ @@ -213,20 +237,28 @@ A value in the `Nullable(Decimal(P,S))` data type. 
The value contains: **Example** +Query: + ``` sql -SELECT toDecimal32OrZero(toString(-1.111), 5) AS val, toTypeName(val) +SELECT toDecimal32OrZero(toString(-1.111), 5) AS val, toTypeName(val); ``` +Result: + ``` text ┌──────val─┬─toTypeName(toDecimal32OrZero(toString(-1.111), 5))─┐ │ -1.11100 │ Decimal(9, 5) │ └──────────┴────────────────────────────────────────────────────┘ ``` +Query: + ``` sql -SELECT toDecimal32OrZero(toString(-1.111), 2) AS val, toTypeName(val) +SELECT toDecimal32OrZero(toString(-1.111), 2) AS val, toTypeName(val); ``` +Result: + ``` text ┌──val─┬─toTypeName(toDecimal32OrZero(toString(-1.111), 2))─┐ │ 0.00 │ Decimal(9, 2) │ @@ -258,12 +290,18 @@ Conversion between numeric types uses the same rules as assignments between diff Additionally, the toString function of the DateTime argument can take a second String argument containing the name of the time zone. Example: `Asia/Yekaterinburg` In this case, the time is formatted according to the specified time zone. +**Example** + +Query: + ``` sql SELECT now() AS now_local, - toString(now(), 'Asia/Yekaterinburg') AS now_yekat + toString(now(), 'Asia/Yekaterinburg') AS now_yekat; ``` +Result: + ``` text ┌───────────now_local─┬─now_yekat───────────┐ │ 2016-06-15 00:11:21 │ 2016-06-15 02:11:21 │ @@ -281,22 +319,30 @@ If the string has fewer bytes than N, it is padded with null bytes to the right. Accepts a String or FixedString argument. Returns the String with the content truncated at the first zero byte found. -Example: +**Example** + +Query: ``` sql -SELECT toFixedString('foo', 8) AS s, toStringCutToZero(s) AS s_cut +SELECT toFixedString('foo', 8) AS s, toStringCutToZero(s) AS s_cut; ``` +Result: + ``` text ┌─s─────────────┬─s_cut─┐ │ foo\0\0\0\0\0 │ foo │ └───────────────┴───────┘ ``` +Query: + ``` sql -SELECT toFixedString('foo\0bar', 8) AS s, toStringCutToZero(s) AS s_cut +SELECT toFixedString('foo\0bar', 8) AS s, toStringCutToZero(s) AS s_cut; ``` +Result: + ``` text ┌─s──────────┬─s_cut─┐ │ foo\0bar\0 │ foo │ @@ -348,7 +394,7 @@ String to UUID. Query: ``` sql -SELECT reinterpretAsUUID(reverse(unhex('000102030405060708090a0b0c0d0e0f'))) +SELECT reinterpretAsUUID(reverse(unhex('000102030405060708090a0b0c0d0e0f'))); ``` Result: @@ -381,9 +427,11 @@ Result: ## CAST(x, T) {#type_conversion_function-cast} -Converts ‘x’ to the ‘t’ data type. The syntax CAST(x AS t) is also supported. +Converts input value `x` to the `T` data type. The syntax `CAST(x AS t)` is also supported. -Example: +**Example** + +Query: ``` sql SELECT @@ -391,9 +439,11 @@ SELECT CAST(timestamp AS DateTime) AS datetime, CAST(timestamp AS Date) AS date, CAST(timestamp, 'String') AS string, - CAST(timestamp, 'FixedString(22)') AS fixed_string + CAST(timestamp, 'FixedString(22)') AS fixed_string; ``` +Result: + ``` text ┌─timestamp───────────┬────────────datetime─┬───────date─┬─string──────────────┬─fixed_string──────────────┐ │ 2016-06-15 23:00:00 │ 2016-06-15 23:00:00 │ 2016-06-15 │ 2016-06-15 23:00:00 │ 2016-06-15 23:00:00\0\0\0 │ @@ -402,12 +452,18 @@ SELECT Conversion to FixedString(N) only works for arguments of type String or FixedString(N). -Type conversion to [Nullable](../../sql-reference/data-types/nullable.md) and back is supported. Example: +Type conversion to [Nullable](../../sql-reference/data-types/nullable.md) and back is supported. 
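One caveat: the direction back from `Nullable(T)` to plain `T` presumably only succeeds while no actual `NULL` values are involved; unwrapping a `NULL` is expected to raise an error (the exact message may vary between versions):

``` sql
-- Wrapping into Nullable works, unwrapping a NULL should fail
SELECT CAST(CAST(NULL, 'Nullable(UInt16)'), 'UInt16');
```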
+ +**Example** + +Query: ``` sql -SELECT toTypeName(x) FROM t_null +SELECT toTypeName(x) FROM t_null; ``` +Result: + ``` text ┌─toTypeName(x)─┐ │ Int8 │ @@ -415,10 +471,14 @@ SELECT toTypeName(x) FROM t_null └───────────────┘ ``` +Query: + ``` sql -SELECT toTypeName(CAST(x, 'Nullable(UInt16)')) FROM t_null +SELECT toTypeName(CAST(x, 'Nullable(UInt16)')) FROM t_null; ``` +Result: + ``` text ┌─toTypeName(CAST(x, 'Nullable(UInt16)'))─┐ │ Nullable(UInt16) │ @@ -432,15 +492,18 @@ SELECT toTypeName(CAST(x, 'Nullable(UInt16)')) FROM t_null ## accurateCast(x, T) {#type_conversion_function-accurate-cast} -Converts ‘x’ to the ‘t’ data type. The differente from cast(x, T) is that accurateCast -does not allow overflow of numeric types during cast if type value x does not fit -bounds of type T. +Converts `x` to the `T` data type. The differente from [cast(x, T)](#type_conversion_function-cast) is that `accurateCast` +does not allow overflow of numeric types during cast if type value `x` does not fit bounds of type `T`. + +**Example** + +Query: -Example ``` sql SELECT cast(-1, 'UInt8') as uint8; ``` +Result: ``` text ┌─uint8─┐ @@ -448,13 +511,16 @@ SELECT cast(-1, 'UInt8') as uint8; └───────┘ ``` +Query: + ```sql SELECT accurateCast(-1, 'UInt8') as uint8; ``` +Result: + ``` text Code: 70. DB::Exception: Received from localhost:9000. DB::Exception: Value in column Int8 cannot be safely converted into type UInt8: While processing accurateCast(-1, 'UInt8') AS uint8. - ``` ## accurateCastOrNull(x, T) {#type_conversion_function-accurate-cast_or_null} @@ -488,6 +554,8 @@ Result: └────────────────────────────────────────────┘ ``` +Query: + ``` sql SELECT cast(-1, 'UInt8') as uint8, @@ -530,6 +598,8 @@ toIntervalYear(number) **Example** +Query: + ``` sql WITH toDate('2019-01-01') AS date, @@ -537,9 +607,11 @@ WITH toIntervalWeek(1) AS interval_to_week SELECT date + interval_week, - date + interval_to_week + date + interval_to_week; ``` +Result: + ``` text ┌─plus(date, interval_week)─┬─plus(date, interval_to_week)─┐ │ 2019-01-08 │ 2019-01-08 │ @@ -555,7 +627,7 @@ The function parses [ISO 8601](https://en.wikipedia.org/wiki/ISO_8601), [RFC 112 **Syntax** ``` sql -parseDateTimeBestEffort(time_string [, time_zone]); +parseDateTimeBestEffort(time_string [, time_zone]) ``` **Parameters** @@ -598,7 +670,7 @@ Query: ``` sql SELECT parseDateTimeBestEffort('Sat, 18 Aug 2018 07:22:16 GMT', 'Europe/Moscow') -AS parseDateTimeBestEffort +AS parseDateTimeBestEffort; ``` Result: @@ -613,7 +685,7 @@ Query: ``` sql SELECT parseDateTimeBestEffort('1284101485') -AS parseDateTimeBestEffort +AS parseDateTimeBestEffort; ``` Result: @@ -628,7 +700,7 @@ Query: ``` sql SELECT parseDateTimeBestEffort('2018-12-12 10:12:12') -AS parseDateTimeBestEffort +AS parseDateTimeBestEffort; ``` Result: @@ -642,7 +714,7 @@ Result: Query: ``` sql -SELECT parseDateTimeBestEffort('10 20:19') +SELECT parseDateTimeBestEffort('10 20:19'); ``` Result: @@ -662,12 +734,12 @@ Result: ## parseDateTimeBestEffortUS {#parsedatetimebesteffortUS} -This function is similar to [‘parseDateTimeBestEffort’](#parsedatetimebesteffort), the only difference is that this function prefers US date format (`MM/DD/YYYY` etc.) in case of ambiguity. +This function is similar to [parseDateTimeBestEffort](#parsedatetimebesteffort), the only difference is that this function prefers US date format (`MM/DD/YYYY` etc.) in case of ambiguity. 
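The difference only matters for ambiguous inputs such as `09/12/2020`: the US variant is expected to read it month-first as September 12, while the plain function reads the same string day-first. The second column below is an assumption worth verifying on the server version at hand:

``` sql
SELECT
    parseDateTimeBestEffortUS('09/12/2020 12:12:57') AS us_style,  -- expected: 2020-09-12 12:12:57
    parseDateTimeBestEffort('09/12/2020 12:12:57') AS day_first;   -- presumably: 2020-12-09 12:12:57
```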
**Syntax** ``` sql -parseDateTimeBestEffortUS(time_string [, time_zone]); +parseDateTimeBestEffortUS(time_string [, time_zone]) ``` **Parameters** @@ -769,7 +841,7 @@ Type: `LowCardinality(expr_result_type)` Query: ``` sql -SELECT toLowCardinality('1') +SELECT toLowCardinality('1'); ``` Result: @@ -808,7 +880,7 @@ Query: ``` sql WITH toDateTime64('2019-09-16 19:20:12.345678910', 6) AS dt64 -SELECT toUnixTimestamp64Milli(dt64) +SELECT toUnixTimestamp64Milli(dt64); ``` Result: @@ -821,7 +893,7 @@ Result: ``` sql WITH toDateTime64('2019-09-16 19:20:12.345678910', 6) AS dt64 -SELECT toUnixTimestamp64Nano(dt64) +SELECT toUnixTimestamp64Nano(dt64); ``` Result: @@ -855,13 +927,17 @@ fromUnixTimestamp64Milli(value [, ti]) - `value` converted to the `DateTime64` data type. -**Examples** +**Example** + +Query: ``` sql WITH CAST(1234567891011, 'Int64') AS i64 -SELECT fromUnixTimestamp64Milli(i64, 'UTC') +SELECT fromUnixTimestamp64Milli(i64, 'UTC'); ``` +Result: + ``` text ┌─fromUnixTimestamp64Milli(i64, 'UTC')─┐ │ 2009-02-13 23:31:31.011 │ @@ -893,7 +969,7 @@ Query: ``` sql SELECT formatRow('CSV', number, 'good') -FROM numbers(3) +FROM numbers(3); ``` Result: @@ -934,7 +1010,7 @@ Query: ``` sql SELECT formatRowNoNewline('CSV', number, 'good') -FROM numbers(3) +FROM numbers(3); ``` Result: diff --git a/docs/ru/sql-reference/functions/type-conversion-functions.md b/docs/ru/sql-reference/functions/type-conversion-functions.md index 40fdbc6f5a0..aa55e015c61 100644 --- a/docs/ru/sql-reference/functions/type-conversion-functions.md +++ b/docs/ru/sql-reference/functions/type-conversion-functions.md @@ -36,10 +36,14 @@ toc_title: "\u0424\u0443\u043d\u043a\u0446\u0438\u0438\u0020\u043f\u0440\u0435\u **Пример** +Запрос: + ``` sql -SELECT toInt64(nan), toInt32(32), toInt16('16'), toInt8(8.8) +SELECT toInt64(nan), toInt32(32), toInt16('16'), toInt8(8.8); ``` +Результат: + ``` text ┌─────────toInt64(nan)─┬─toInt32(32)─┬─toInt16('16')─┬─toInt8(8.8)─┐ │ -9223372036854775808 │ 32 │ 16 │ 8 │ @@ -52,10 +56,14 @@ SELECT toInt64(nan), toInt32(32), toInt16('16'), toInt8(8.8) **Пример** +Запрос: + ``` sql -select toInt64OrZero('123123'), toInt8OrZero('123qwe123') +SELECT toInt64OrZero('123123'), toInt8OrZero('123qwe123'); ``` +Результат: + ``` text ┌─toInt64OrZero('123123')─┬─toInt8OrZero('123qwe123')─┐ │ 123123 │ 0 │ @@ -68,10 +76,14 @@ select toInt64OrZero('123123'), toInt8OrZero('123qwe123') **Пример** +Запрос: + ``` sql -select toInt64OrNull('123123'), toInt8OrNull('123qwe123') +SELECT toInt64OrNull('123123'), toInt8OrNull('123qwe123'); ``` +Результат: + ``` text ┌─toInt64OrNull('123123')─┬─toInt8OrNull('123qwe123')─┐ │ 123123 │ ᴺᵁᴸᴸ │ @@ -102,10 +114,14 @@ select toInt64OrNull('123123'), toInt8OrNull('123qwe123') **Пример** +Запрос: + ``` sql -SELECT toUInt64(nan), toUInt32(-32), toUInt16('16'), toUInt8(8.8) +SELECT toUInt64(nan), toUInt32(-32), toUInt16('16'), toUInt8(8.8); ``` +Результат: + ``` text ┌───────toUInt64(nan)─┬─toUInt32(-32)─┬─toUInt16('16')─┬─toUInt8(8.8)─┐ │ 9223372036854775808 │ 4294967264 │ 16 │ 8 │ @@ -168,20 +184,28 @@ SELECT toUInt64(nan), toUInt32(-32), toUInt16('16'), toUInt8(8.8) **Примеры** +Запрос: + ``` sql -SELECT toDecimal32OrNull(toString(-1.111), 5) AS val, toTypeName(val) +SELECT toDecimal32OrNull(toString(-1.111), 5) AS val, toTypeName(val); ``` +Результат: + ``` text ┌──────val─┬─toTypeName(toDecimal32OrNull(toString(-1.111), 5))─┐ │ -1.11100 │ Nullable(Decimal(9, 5)) │ └──────────┴────────────────────────────────────────────────────┘ ``` +Запрос: + ``` sql -SELECT 
toDecimal32OrNull(toString(-1.111), 2) AS val, toTypeName(val) +SELECT toDecimal32OrNull(toString(-1.111), 2) AS val, toTypeName(val); ``` +Результат: + ``` text ┌──val─┬─toTypeName(toDecimal32OrNull(toString(-1.111), 2))─┐ │ ᴺᵁᴸᴸ │ Nullable(Decimal(9, 2)) │ @@ -213,20 +237,28 @@ SELECT toDecimal32OrNull(toString(-1.111), 2) AS val, toTypeName(val) **Пример** +Запрос: + ``` sql -SELECT toDecimal32OrZero(toString(-1.111), 5) AS val, toTypeName(val) +SELECT toDecimal32OrZero(toString(-1.111), 5) AS val, toTypeName(val); ``` +Результат: + ``` text ┌──────val─┬─toTypeName(toDecimal32OrZero(toString(-1.111), 5))─┐ │ -1.11100 │ Decimal(9, 5) │ └──────────┴────────────────────────────────────────────────────┘ ``` +Запрос: + ``` sql -SELECT toDecimal32OrZero(toString(-1.111), 2) AS val, toTypeName(val) +SELECT toDecimal32OrZero(toString(-1.111), 2) AS val, toTypeName(val); ``` +Результат: + ``` text ┌──val─┬─toTypeName(toDecimal32OrZero(toString(-1.111), 2))─┐ │ 0.00 │ Decimal(9, 2) │ @@ -258,12 +290,18 @@ YYYY-MM-DD hh:mm:ss Дополнительно, функция toString от аргумента типа DateTime может принимать второй аргумент String - имя тайм-зоны. Пример: `Asia/Yekaterinburg` В этом случае, форматирование времени производится согласно указанной тайм-зоне. +**Пример** + +Запрос: + ``` sql SELECT now() AS now_local, - toString(now(), 'Asia/Yekaterinburg') AS now_yekat + toString(now(), 'Asia/Yekaterinburg') AS now_yekat; ``` +Результат: + ``` text ┌───────────now_local─┬─now_yekat───────────┐ │ 2016-06-15 00:11:21 │ 2016-06-15 02:11:21 │ @@ -281,22 +319,30 @@ SELECT Принимает аргумент типа String или FixedString. Возвращает String, вырезая содержимое строки до первого найденного нулевого байта. -Пример: +**Примеры** + +Запрос: ``` sql -SELECT toFixedString('foo', 8) AS s, toStringCutToZero(s) AS s_cut +SELECT toFixedString('foo', 8) AS s, toStringCutToZero(s) AS s_cut; ``` +Результат: + ``` text ┌─s─────────────┬─s_cut─┐ │ foo\0\0\0\0\0 │ foo │ └───────────────┴───────┘ ``` +Запрос: + ``` sql -SELECT toFixedString('foo\0bar', 8) AS s, toStringCutToZero(s) AS s_cut +SELECT toFixedString('foo\0bar', 8) AS s, toStringCutToZero(s) AS s_cut; ``` +Результат: + ``` text ┌─s──────────┬─s_cut─┐ │ foo\0bar\0 │ foo │ @@ -344,7 +390,7 @@ reinterpretAsUUID(fixed_string) Запрос: ``` sql -SELECT reinterpretAsUUID(reverse(unhex('000102030405060708090a0b0c0d0e0f'))) +SELECT reinterpretAsUUID(reverse(unhex('000102030405060708090a0b0c0d0e0f'))); ``` Результат: @@ -380,7 +426,9 @@ SELECT uuid = uuid2; Преобразует x в тип данных t. Поддерживается также синтаксис CAST(x AS t). -Пример: +**Пример** + +Запрос: ``` sql SELECT @@ -388,9 +436,11 @@ SELECT CAST(timestamp AS DateTime) AS datetime, CAST(timestamp AS Date) AS date, CAST(timestamp, 'String') AS string, - CAST(timestamp, 'FixedString(22)') AS fixed_string + CAST(timestamp, 'FixedString(22)') AS fixed_string; ``` +Результат: + ``` text ┌─timestamp───────────┬────────────datetime─┬───────date─┬─string──────────────┬─fixed_string──────────────┐ │ 2016-06-15 23:00:00 │ 2016-06-15 23:00:00 │ 2016-06-15 │ 2016-06-15 23:00:00 │ 2016-06-15 23:00:00\0\0\0 │ @@ -399,12 +449,18 @@ SELECT Преобразование в FixedString(N) работает только для аргументов типа String или FixedString(N). -Поддержано преобразование к типу [Nullable](../../sql-reference/functions/type-conversion-functions.md) и обратно. Пример: +Поддержано преобразование к типу [Nullable](../../sql-reference/functions/type-conversion-functions.md) и обратно. 
+ +**Примеры** + +Запрос: ``` sql -SELECT toTypeName(x) FROM t_null +SELECT toTypeName(x) FROM t_null; ``` +Результат: + ``` text ┌─toTypeName(x)─┐ │ Int8 │ @@ -412,10 +468,14 @@ SELECT toTypeName(x) FROM t_null └───────────────┘ ``` +Запрос: + ``` sql -SELECT toTypeName(CAST(x, 'Nullable(UInt16)')) FROM t_null +SELECT toTypeName(CAST(x, 'Nullable(UInt16)')) FROM t_null; ``` +Результат: + ``` text ┌─toTypeName(CAST(x, 'Nullable(UInt16)'))─┐ │ Nullable(UInt16) │ @@ -442,7 +502,7 @@ accurateCastOrNull(x, T) - `x` — входное значение. - `T` — имя возвращаемого типа данных. -**Пример** +**Примеры** Запрос: @@ -502,6 +562,8 @@ toIntervalYear(number) **Пример** +Запрос: + ``` sql WITH toDate('2019-01-01') AS date, @@ -509,9 +571,11 @@ WITH toIntervalWeek(1) AS interval_to_week SELECT date + interval_week, - date + interval_to_week + date + interval_to_week; ``` +Результат: + ``` text ┌─plus(date, interval_week)─┬─plus(date, interval_to_week)─┐ │ 2019-01-08 │ 2019-01-08 │ @@ -527,7 +591,7 @@ SELECT **Синтаксис** ``` sql -parseDateTimeBestEffort(time_string[, time_zone]); +parseDateTimeBestEffort(time_string[, time_zone]) ``` **Параметры** @@ -570,7 +634,7 @@ AS parseDateTimeBestEffort; ``` sql SELECT parseDateTimeBestEffort('Sat, 18 Aug 2018 07:22:16 GMT', 'Europe/Moscow') -AS parseDateTimeBestEffort +AS parseDateTimeBestEffort; ``` Результат: @@ -585,7 +649,7 @@ AS parseDateTimeBestEffort ``` sql SELECT parseDateTimeBestEffort('1284101485') -AS parseDateTimeBestEffort +AS parseDateTimeBestEffort; ``` Результат: @@ -600,7 +664,7 @@ AS parseDateTimeBestEffort ``` sql SELECT parseDateTimeBestEffort('2018-12-12 10:12:12') -AS parseDateTimeBestEffort +AS parseDateTimeBestEffort; ``` Результат: @@ -614,7 +678,7 @@ AS parseDateTimeBestEffort Запрос: ``` sql -SELECT parseDateTimeBestEffort('10 20:19') +SELECT parseDateTimeBestEffort('10 20:19'); ``` Результат: @@ -639,7 +703,7 @@ SELECT parseDateTimeBestEffort('10 20:19') **Синтаксис** ``` sql -parseDateTimeBestEffortUS(time_string [, time_zone]); +parseDateTimeBestEffortUS(time_string [, time_zone]) ``` **Параметры** @@ -668,7 +732,7 @@ SELECT parseDateTimeBestEffortUS('09/12/2020 12:12:57') AS parseDateTimeBestEffortUS; ``` -Ответ: +Результат: ``` text ┌─parseDateTimeBestEffortUS─┐ @@ -683,7 +747,7 @@ SELECT parseDateTimeBestEffortUS('09-12-2020 12:12:57') AS parseDateTimeBestEffortUS; ``` -Ответ: +Результат: ``` text ┌─parseDateTimeBestEffortUS─┐ @@ -698,7 +762,7 @@ SELECT parseDateTimeBestEffortUS('09.12.2020 12:12:57') AS parseDateTimeBestEffortUS; ``` -Ответ: +Результат: ``` text ┌─parseDateTimeBestEffortUS─┐ @@ -733,10 +797,10 @@ toUnixTimestamp64Milli(value) ``` sql WITH toDateTime64('2019-09-16 19:20:12.345678910', 6) AS dt64 -SELECT toUnixTimestamp64Milli(dt64) +SELECT toUnixTimestamp64Milli(dt64); ``` -Ответ: +Результат: ``` text ┌─toUnixTimestamp64Milli(dt64)─┐ @@ -748,10 +812,10 @@ SELECT toUnixTimestamp64Milli(dt64) ``` sql WITH toDateTime64('2019-09-16 19:20:12.345678910', 6) AS dt64 -SELECT toUnixTimestamp64Nano(dt64) +SELECT toUnixTimestamp64Nano(dt64); ``` -Ответ: +Результат: ``` text ┌─toUnixTimestamp64Nano(dt64)─┐ @@ -786,10 +850,10 @@ fromUnixTimestamp64Milli(value [, ti]) ``` sql WITH CAST(1234567891011, 'Int64') AS i64 -SELECT fromUnixTimestamp64Milli(i64, 'UTC') +SELECT fromUnixTimestamp64Milli(i64, 'UTC'); ``` -Ответ: +Результат: ``` text ┌─fromUnixTimestamp64Milli(i64, 'UTC')─┐ @@ -820,12 +884,12 @@ toLowCardinality(expr) Тип: `LowCardinality(expr_result_type)` -**Example** +**Пример** Запрос: ```sql -SELECT 
toLowCardinality('1') +SELECT toLowCardinality('1'); ``` Результат: @@ -861,10 +925,10 @@ formatRow(format, x, y, ...) ``` sql SELECT formatRow('CSV', number, 'good') -FROM numbers(3) +FROM numbers(3); ``` -Ответ: +Результат: ``` text ┌─formatRow('CSV', number, 'good')─┐ @@ -902,10 +966,10 @@ formatRowNoNewline(format, x, y, ...) ``` sql SELECT formatRowNoNewline('CSV', number, 'good') -FROM numbers(3) +FROM numbers(3); ``` -Ответ: +Результат: ``` text ┌─formatRowNoNewline('CSV', number, 'good')─┐ From 5472eb5bd99aa712821a30b4e6aa1a73dfb6d40b Mon Sep 17 00:00:00 2001 From: Nicolae Vartolomei Date: Fri, 5 Feb 2021 10:39:58 +0000 Subject: [PATCH 078/381] Allow to drop readonly tables This check doesn't seem to be necessary. There seem to be a deadlock due to a logical race of drop with restarting thread. Seen in https://clickhouse-test-reports.s3.yandex.net/20088/4ebb44bb9936ed1daa330cb38f343664ca83751c/integration_tests_flaky_check_(asan).html#fail1 --- src/Storages/StorageReplicatedMergeTree.cpp | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/Storages/StorageReplicatedMergeTree.cpp b/src/Storages/StorageReplicatedMergeTree.cpp index 69cbe0d7062..cb5f4dd5185 100644 --- a/src/Storages/StorageReplicatedMergeTree.cpp +++ b/src/Storages/StorageReplicatedMergeTree.cpp @@ -751,7 +751,7 @@ void StorageReplicatedMergeTree::drop() auto zookeeper = global_context.getZooKeeper(); /// If probably there is metadata in ZooKeeper, we don't allow to drop the table. - if (is_readonly || !zookeeper) + if (!zookeeper) throw Exception("Can't drop readonly replicated table (need to drop data in ZooKeeper as well)", ErrorCodes::TABLE_IS_READ_ONLY); shutdown(); From e051423584855ef75bbe7d41d1b6db8a649f7bee Mon Sep 17 00:00:00 2001 From: Evgeniia Sudarikova Date: Fri, 5 Feb 2021 22:14:52 +0300 Subject: [PATCH 079/381] add RU docs --- .../example-datasets/brown-benchmark.md | 416 ++++++++++++++++++ .../functions/array-functions.md | 150 +++++++ .../ru/sql-reference/table-functions/mysql.md | 63 ++- 3 files changed, 605 insertions(+), 24 deletions(-) create mode 100644 docs/ru/getting-started/example-datasets/brown-benchmark.md diff --git a/docs/ru/getting-started/example-datasets/brown-benchmark.md b/docs/ru/getting-started/example-datasets/brown-benchmark.md new file mode 100644 index 00000000000..b3f2285093a --- /dev/null +++ b/docs/ru/getting-started/example-datasets/brown-benchmark.md @@ -0,0 +1,416 @@ +--- +toc_priority: 20 +toc_title: Brown University Benchmark +--- + +# Brown University Benchmark + +`MgBench` — это новый аналитический бенчмарк для сгенерированного журнала событий, разработанный [Andrew Crotty](http://cs.brown.edu/people/acrotty/). 
+ +Скачать данные: +``` +wget https://datasets.clickhouse.tech/mgbench{1..3}.csv.xz +``` + +Распаковать данные: +``` +xz -v -d mgbench{1..3}.csv.xz +``` + +Создание таблиц: +``` +CREATE DATABASE mgbench; + + +CREATE TABLE mgbench.logs1 ( + log_time DateTime, + machine_name LowCardinality(String), + machine_group LowCardinality(String), + cpu_idle Nullable(Float32), + cpu_nice Nullable(Float32), + cpu_system Nullable(Float32), + cpu_user Nullable(Float32), + cpu_wio Nullable(Float32), + disk_free Nullable(Float32), + disk_total Nullable(Float32), + part_max_used Nullable(Float32), + load_fifteen Nullable(Float32), + load_five Nullable(Float32), + load_one Nullable(Float32), + mem_buffers Nullable(Float32), + mem_cached Nullable(Float32), + mem_free Nullable(Float32), + mem_shared Nullable(Float32), + swap_free Nullable(Float32), + bytes_in Nullable(Float32), + bytes_out Nullable(Float32) +) +ENGINE = MergeTree() +ORDER BY (machine_group, machine_name, log_time); + + +CREATE TABLE mgbench.logs2 ( + log_time DateTime, + client_ip IPv4, + request String, + status_code UInt16, + object_size UInt64 +) +ENGINE = MergeTree() +ORDER BY log_time; + + +CREATE TABLE mgbench.logs3 ( + log_time DateTime64, + device_id FixedString(15), + device_name LowCardinality(String), + device_type LowCardinality(String), + device_floor UInt8, + event_type LowCardinality(String), + event_unit FixedString(1), + event_value Nullable(Float32) +) +ENGINE = MergeTree() +ORDER BY (event_type, log_time); +``` + +Insert data: + +``` +clickhouse-client --query "INSERT INTO mgbench.logs1 FORMAT CSVWithNames" < mgbench1.csv +clickhouse-client --query "INSERT INTO mgbench.logs2 FORMAT CSVWithNames" < mgbench2.csv +clickhouse-client --query "INSERT INTO mgbench.logs3 FORMAT CSVWithNames" < mgbench3.csv +``` + +Run benchmark queries: +``` +-- Q1.1: What is the CPU/network utilization for each web server since midnight? + +SELECT machine_name, + MIN(cpu) AS cpu_min, + MAX(cpu) AS cpu_max, + AVG(cpu) AS cpu_avg, + MIN(net_in) AS net_in_min, + MAX(net_in) AS net_in_max, + AVG(net_in) AS net_in_avg, + MIN(net_out) AS net_out_min, + MAX(net_out) AS net_out_max, + AVG(net_out) AS net_out_avg +FROM ( + SELECT machine_name, + COALESCE(cpu_user, 0.0) AS cpu, + COALESCE(bytes_in, 0.0) AS net_in, + COALESCE(bytes_out, 0.0) AS net_out + FROM logs1 + WHERE machine_name IN ('anansi','aragog','urd') + AND log_time >= TIMESTAMP '2017-01-11 00:00:00' +) AS r +GROUP BY machine_name; + + +-- Q1.2: Which computer lab machines have been offline in the past day? + +SELECT machine_name, + log_time +FROM logs1 +WHERE (machine_name LIKE 'cslab%' OR + machine_name LIKE 'mslab%') + AND load_one IS NULL + AND log_time >= TIMESTAMP '2017-01-10 00:00:00' +ORDER BY machine_name, + log_time; + + +-- Q1.3: What are the hourly average metrics during the past 10 days for a specific workstation? 
+ +SELECT dt, + hr, + AVG(load_fifteen) AS load_fifteen_avg, + AVG(load_five) AS load_five_avg, + AVG(load_one) AS load_one_avg, + AVG(mem_free) AS mem_free_avg, + AVG(swap_free) AS swap_free_avg +FROM ( + SELECT CAST(log_time AS DATE) AS dt, + EXTRACT(HOUR FROM log_time) AS hr, + load_fifteen, + load_five, + load_one, + mem_free, + swap_free + FROM logs1 + WHERE machine_name = 'babbage' + AND load_fifteen IS NOT NULL + AND load_five IS NOT NULL + AND load_one IS NOT NULL + AND mem_free IS NOT NULL + AND swap_free IS NOT NULL + AND log_time >= TIMESTAMP '2017-01-01 00:00:00' +) AS r +GROUP BY dt, + hr +ORDER BY dt, + hr; + + +-- Q1.4: Over 1 month, how often was each server blocked on disk I/O? + +SELECT machine_name, + COUNT(*) AS spikes +FROM logs1 +WHERE machine_group = 'Servers' + AND cpu_wio > 0.99 + AND log_time >= TIMESTAMP '2016-12-01 00:00:00' + AND log_time < TIMESTAMP '2017-01-01 00:00:00' +GROUP BY machine_name +ORDER BY spikes DESC +LIMIT 10; + + +-- Q1.5: Which externally reachable VMs have run low on memory? + +SELECT machine_name, + dt, + MIN(mem_free) AS mem_free_min +FROM ( + SELECT machine_name, + CAST(log_time AS DATE) AS dt, + mem_free + FROM logs1 + WHERE machine_group = 'DMZ' + AND mem_free IS NOT NULL +) AS r +GROUP BY machine_name, + dt +HAVING MIN(mem_free) < 10000 +ORDER BY machine_name, + dt; + + +-- Q1.6: What is the total hourly network traffic across all file servers? + +SELECT dt, + hr, + SUM(net_in) AS net_in_sum, + SUM(net_out) AS net_out_sum, + SUM(net_in) + SUM(net_out) AS both_sum +FROM ( + SELECT CAST(log_time AS DATE) AS dt, + EXTRACT(HOUR FROM log_time) AS hr, + COALESCE(bytes_in, 0.0) / 1000000000.0 AS net_in, + COALESCE(bytes_out, 0.0) / 1000000000.0 AS net_out + FROM logs1 + WHERE machine_name IN ('allsorts','andes','bigred','blackjack','bonbon', + 'cadbury','chiclets','cotton','crows','dove','fireball','hearts','huey', + 'lindt','milkduds','milkyway','mnm','necco','nerds','orbit','peeps', + 'poprocks','razzles','runts','smarties','smuggler','spree','stride', + 'tootsie','trident','wrigley','york') +) AS r +GROUP BY dt, + hr +ORDER BY both_sum DESC +LIMIT 10; + + +-- Q2.1: Which requests have caused server errors within the past 2 weeks? + +SELECT * +FROM logs2 +WHERE status_code >= 500 + AND log_time >= TIMESTAMP '2012-12-18 00:00:00' +ORDER BY log_time; + + +-- Q2.2: During a specific 2-week period, was the user password file leaked? + +SELECT * +FROM logs2 +WHERE status_code >= 200 + AND status_code < 300 + AND request LIKE '%/etc/passwd%' + AND log_time >= TIMESTAMP '2012-05-06 00:00:00' + AND log_time < TIMESTAMP '2012-05-20 00:00:00'; + + +-- Q2.3: What was the average path depth for top-level requests in the past month? + +SELECT top_level, + AVG(LENGTH(request) - LENGTH(REPLACE(request, '/', ''))) AS depth_avg +FROM ( + SELECT SUBSTRING(request FROM 1 FOR len) AS top_level, + request + FROM ( + SELECT POSITION(SUBSTRING(request FROM 2), '/') AS len, + request + FROM logs2 + WHERE status_code >= 200 + AND status_code < 300 + AND log_time >= TIMESTAMP '2012-12-01 00:00:00' + ) AS r + WHERE len > 0 +) AS s +WHERE top_level IN ('/about','/courses','/degrees','/events', + '/grad','/industry','/news','/people', + '/publications','/research','/teaching','/ugrad') +GROUP BY top_level +ORDER BY top_level; + + +-- Q2.4: During the last 3 months, which clients have made an excessive number of requests? 
+ +SELECT client_ip, + COUNT(*) AS num_requests +FROM logs2 +WHERE log_time >= TIMESTAMP '2012-10-01 00:00:00' +GROUP BY client_ip +HAVING COUNT(*) >= 100000 +ORDER BY num_requests DESC; + + +-- Q2.5: What are the daily unique visitors? + +SELECT dt, + COUNT(DISTINCT client_ip) +FROM ( + SELECT CAST(log_time AS DATE) AS dt, + client_ip + FROM logs2 +) AS r +GROUP BY dt +ORDER BY dt; + + +-- Q2.6: What are the average and maximum data transfer rates (Gbps)? + +SELECT AVG(transfer) / 125000000.0 AS transfer_avg, + MAX(transfer) / 125000000.0 AS transfer_max +FROM ( + SELECT log_time, + SUM(object_size) AS transfer + FROM logs2 + GROUP BY log_time +) AS r; + + +-- Q3.1: Did the indoor temperature reach freezing over the weekend? + +SELECT * +FROM logs3 +WHERE event_type = 'temperature' + AND event_value <= 32.0 + AND log_time >= '2019-11-29 17:00:00.000'; + + +-- Q3.4: Over the past 6 months, how frequently were each door opened? + +SELECT device_name, + device_floor, + COUNT(*) AS ct +FROM logs3 +WHERE event_type = 'door_open' + AND log_time >= '2019-06-01 00:00:00.000' +GROUP BY device_name, + device_floor +ORDER BY ct DESC; + + +-- Q3.5: Where in the building do large temperature variations occur in winter and summer? + +WITH temperature AS ( + SELECT dt, + device_name, + device_type, + device_floor + FROM ( + SELECT dt, + hr, + device_name, + device_type, + device_floor, + AVG(event_value) AS temperature_hourly_avg + FROM ( + SELECT CAST(log_time AS DATE) AS dt, + EXTRACT(HOUR FROM log_time) AS hr, + device_name, + device_type, + device_floor, + event_value + FROM logs3 + WHERE event_type = 'temperature' + ) AS r + GROUP BY dt, + hr, + device_name, + device_type, + device_floor + ) AS s + GROUP BY dt, + device_name, + device_type, + device_floor + HAVING MAX(temperature_hourly_avg) - MIN(temperature_hourly_avg) >= 25.0 +) +SELECT DISTINCT device_name, + device_type, + device_floor, + 'WINTER' +FROM temperature +WHERE dt >= DATE '2018-12-01' + AND dt < DATE '2019-03-01' +UNION +SELECT DISTINCT device_name, + device_type, + device_floor, + 'SUMMER' +FROM temperature +WHERE dt >= DATE '2019-06-01' + AND dt < DATE '2019-09-01'; + + +-- Q3.6: For each device category, what are the monthly power consumption metrics? 
+ +SELECT yr, + mo, + SUM(coffee_hourly_avg) AS coffee_monthly_sum, + AVG(coffee_hourly_avg) AS coffee_monthly_avg, + SUM(printer_hourly_avg) AS printer_monthly_sum, + AVG(printer_hourly_avg) AS printer_monthly_avg, + SUM(projector_hourly_avg) AS projector_monthly_sum, + AVG(projector_hourly_avg) AS projector_monthly_avg, + SUM(vending_hourly_avg) AS vending_monthly_sum, + AVG(vending_hourly_avg) AS vending_monthly_avg +FROM ( + SELECT dt, + yr, + mo, + hr, + AVG(coffee) AS coffee_hourly_avg, + AVG(printer) AS printer_hourly_avg, + AVG(projector) AS projector_hourly_avg, + AVG(vending) AS vending_hourly_avg + FROM ( + SELECT CAST(log_time AS DATE) AS dt, + EXTRACT(YEAR FROM log_time) AS yr, + EXTRACT(MONTH FROM log_time) AS mo, + EXTRACT(HOUR FROM log_time) AS hr, + CASE WHEN device_name LIKE 'coffee%' THEN event_value END AS coffee, + CASE WHEN device_name LIKE 'printer%' THEN event_value END AS printer, + CASE WHEN device_name LIKE 'projector%' THEN event_value END AS projector, + CASE WHEN device_name LIKE 'vending%' THEN event_value END AS vending + FROM logs3 + WHERE device_type = 'meter' + ) AS r + GROUP BY dt, + yr, + mo, + hr +) AS s +GROUP BY yr, + mo +ORDER BY yr, + mo; +``` + +Данные также доступны для работы с интерактивными запросами через [Playground](https://gh-api.clickhouse.tech/play?user=play), [пример](https://gh-api.clickhouse.tech/play?user=play#U0VMRUNUIG1hY2hpbmVfbmFtZSwKICAgICAgIE1JTihjcHUpIEFTIGNwdV9taW4sCiAgICAgICBNQVgoY3B1KSBBUyBjcHVfbWF4LAogICAgICAgQVZHKGNwdSkgQVMgY3B1X2F2ZywKICAgICAgIE1JTihuZXRfaW4pIEFTIG5ldF9pbl9taW4sCiAgICAgICBNQVgobmV0X2luKSBBUyBuZXRfaW5fbWF4LAogICAgICAgQVZHKG5ldF9pbikgQVMgbmV0X2luX2F2ZywKICAgICAgIE1JTihuZXRfb3V0KSBBUyBuZXRfb3V0X21pbiwKICAgICAgIE1BWChuZXRfb3V0KSBBUyBuZXRfb3V0X21heCwKICAgICAgIEFWRyhuZXRfb3V0KSBBUyBuZXRfb3V0X2F2ZwpGUk9NICgKICBTRUxFQ1QgbWFjaGluZV9uYW1lLAogICAgICAgICBDT0FMRVNDRShjcHVfdXNlciwgMC4wKSBBUyBjcHUsCiAgICAgICAgIENPQUxFU0NFKGJ5dGVzX2luLCAwLjApIEFTIG5ldF9pbiwKICAgICAgICAgQ09BTEVTQ0UoYnl0ZXNfb3V0LCAwLjApIEFTIG5ldF9vdXQKICBGUk9NIG1nYmVuY2gubG9nczEKICBXSEVSRSBtYWNoaW5lX25hbWUgSU4gKCdhbmFuc2knLCdhcmFnb2cnLCd1cmQnKQogICAgQU5EIGxvZ190aW1lID49IFRJTUVTVEFNUCAnMjAxNy0wMS0xMSAwMDowMDowMCcKKSBBUyByCkdST1VQIEJZIG1hY2hpbmVfbmFtZQ==). + +[Оригинальная статья](https://clickhouse.tech/docs/ru/getting_started/example_datasets/brown-benchmark/) diff --git a/docs/ru/sql-reference/functions/array-functions.md b/docs/ru/sql-reference/functions/array-functions.md index 015d14b9de5..7afd9da471e 100644 --- a/docs/ru/sql-reference/functions/array-functions.md +++ b/docs/ru/sql-reference/functions/array-functions.md @@ -1135,12 +1135,162 @@ SELECT Функция `arrayFirstIndex` является [функцией высшего порядка](../../sql-reference/functions/index.md#higher-order-functions) — в качестве первого аргумента ей нужно передать лямбда-функцию, и этот аргумент не может быть опущен. +## arrayMin(\[func,\] arr1, …) {#array-min} + +Возвращает минимальное значение функции `func`. Если функция не указана, возвращает минимальный из элементов массива. + +Функция `arrayMin` является [функцией высшего порядка](../../sql-reference/functions/index.md#higher-order-functions) — в качестве первого аргумента ей нужно передать лямбда-функцию. + +**Синтаксис** + +``` sql +arrayMin(arr) +``` + +**Возвращаемое значение** + +- Число. + +Тип: [Int](../../sql-reference/data-types/int-uint.md) или [Float](../../sql-reference/data-types/float.md). + +**Параметры** + +- `arr` — [Массив](../../sql-reference/data-types/array.md). 
+ +**Примеры** + +Запрос: + +``` sql +SELECT arrayMin([1, 2, 4]) AS res +``` + +Результат: + +``` text +┌─res─┐ +│ 1 │ +└─────┘ +``` + +Запрос: + +``` sql +SELECT arrayMin(x -> (-x), [1, 2, 4]) AS res +``` + +Результат: + +``` text +┌─res─┐ +│ -4 │ +└─────┘ +``` + +## arrayMax(\[func,\] arr1, …) {#array-max} + +Возвращает максимальное значение функции `func`. Если функция не указана, возвращает максимальный из элементов массива. + +Функция `arrayMax` является [функцией высшего порядка](../../sql-reference/functions/index.md#higher-order-functions) — в качестве первого аргумента ей нужно передать лямбда-функцию. + +**Синтаксис** + +``` sql +arrayMax(arr) +``` + +**Возвращаемое значение** + +- Число. + +Тип: [Int](../../sql-reference/data-types/int-uint.md) или [Float](../../sql-reference/data-types/float.md). + +**Параметры** + +- `arr` — [Массив](../../sql-reference/data-types/array.md). + +**Примеры** + +Запрос: + +```sql +SELECT arrayMax([1, 2, 4]) AS res +``` + +Результат: + +``` text +┌─res─┐ +│ 4 │ +└─────┘ +``` + +Запрос: + +``` sql +SELECT arrayMax(x -> (-x), [1, 2, 4]) AS res +``` + +Результат: + +``` text +┌─res─┐ +│ -1 │ +└─────┘ +``` + ## arraySum(\[func,\] arr1, …) {#array-sum} Возвращает сумму значений функции `func`. Если функция не указана - просто возвращает сумму элементов массива. Функция `arraySum` является [функцией высшего порядка](../../sql-reference/functions/index.md#higher-order-functions) - в качестве первого аргумента ей можно передать лямбда-функцию. +**Синтаксис** + +``` sql +arraySum(arr) +``` + +**Возвращаемое значение** + +- Число. + +Тип: [Int](../../sql-reference/data-types/int-uint.md) или [Float](../../sql-reference/data-types/float.md). + +**Параметры** + +- `arr` — [Массив](../../sql-reference/data-types/array.md). + +**Примеры** + +Запрос: + +```sql +SELECT arraySum([2,3]) AS res +``` + +Результат: + +``` text +┌─res─┐ +│ 5 │ +└─────┘ +``` + +Запрос: + +``` sql +SELECT arraySum(x -> x*x, [2, 3]) AS res +``` + +Результат: + +``` text +┌─res─┐ +│ 13 │ +└─────┘ +``` + ## arrayCumSum(\[func,\] arr1, …) {#arraycumsumfunc-arr1} Возвращает массив из частичных сумм элементов исходного массива (сумма с накоплением). Если указана функция `func`, то значения элементов массива преобразуются этой функцией перед суммированием. diff --git a/docs/ru/sql-reference/table-functions/mysql.md b/docs/ru/sql-reference/table-functions/mysql.md index 21841eee67a..18b34d0bf6c 100644 --- a/docs/ru/sql-reference/table-functions/mysql.md +++ b/docs/ru/sql-reference/table-functions/mysql.md @@ -7,6 +7,8 @@ toc_title: mysql Позволяет выполнять запросы `SELECT` над данными, хранящимися на удалённом MySQL сервере. +**Синтаксис** + ``` sql mysql('host:port', 'database', 'table', 'user', 'password'[, replace_query, 'on_duplicate_clause']); ``` @@ -23,13 +25,13 @@ mysql('host:port', 'database', 'table', 'user', 'password'[, replace_query, 'on_ - `password` — пароль пользователя. -- `replace_query` — флаг, отвечающий за преобразование запросов `INSERT INTO` в `REPLACE INTO`. Если `replace_query=1`, то запрос заменяется. +- `replace_query` — флаг, отвечающий за преобразование запросов `INSERT INTO` в `REPLACE INTO`. Возможные значения: + - `0` - выполняется запрос `INSERT INTO`. + - `1` - выполняется запрос `REPLACE INTO`. -- `on_duplicate_clause` — выражение `ON DUPLICATE KEY on_duplicate_clause`, добавляемое в запрос `INSERT`. +- `on_duplicate_clause` — выражение `ON DUPLICATE KEY on_duplicate_clause`, добавляемое в запрос `INSERT`. 
Может быть передано только с помощью `replace_query = 0` (если вы одновременно передадите `replace_query = 1` и `on_duplicate_clause`, будет сгенерировано исключение). - Пример: `INSERT INTO t (c1,c2) VALUES ('a', 2) ON DUPLICATE KEY UPDATE c2 = c2 + 1`, где `on_duplicate_clause` это `UPDATE c2 = c2 + 1`. Чтобы узнать какие `on_duplicate_clause` можно использовать с секцией `ON DUPLICATE KEY` обратитесь к документации MySQL. - - Чтобы указать `'on_duplicate_clause'` необходимо передать `0` в параметр `replace_query`. Если одновременно передать `replace_query = 1` и `'on_duplicate_clause'`, то ClickHouse сгенерирует исключение. + Пример: `INSERT INTO t (c1,c2) VALUES ('a', 2) ON DUPLICATE KEY UPDATE c2 = c2 + 1`, где `on_duplicate_clause` это `UPDATE c2 = c2 + 1;` Простые условия `WHERE` такие как `=, !=, >, >=, <, =` выполняются на стороне сервера MySQL. @@ -39,46 +41,59 @@ mysql('host:port', 'database', 'table', 'user', 'password'[, replace_query, 'on_ Объект таблицы с теми же столбцами, что и в исходной таблице MySQL. -## Пример использования {#primer-ispolzovaniia} +!!! note "Примечание" + Чтобы отличить табличную функцию `mysql (...)` в запросе `INSERT` от имени таблицы со списком имен столбцов, используйте ключевые слова `FUNCTION` или `TABLE FUNCTION`. См. примеры ниже. + +**Примеры** Таблица в MySQL: ``` text mysql> CREATE TABLE `test`.`test` ( -> `int_id` INT NOT NULL AUTO_INCREMENT, - -> `int_nullable` INT NULL DEFAULT NULL, -> `float` FLOAT NOT NULL, - -> `float_nullable` FLOAT NULL DEFAULT NULL, -> PRIMARY KEY (`int_id`)); -Query OK, 0 rows affected (0,09 sec) -mysql> insert into test (`int_id`, `float`) VALUES (1,2); -Query OK, 1 row affected (0,00 sec) +mysql> INSERT INTO test (`int_id`, `float`) VALUES (1,2); -mysql> select * from test; -+--------+--------------+-------+----------------+ -| int_id | int_nullable | float | float_nullable | -+--------+--------------+-------+----------------+ -| 1 | NULL | 2 | NULL | -+--------+--------------+-------+----------------+ -1 row in set (0,00 sec) +mysql> SELECT * FROM test; ++--------+-------+ +| int_id | float | ++--------+-------+ +| 1 | 2 | ++--------+-------+ ``` Получение данных в ClickHouse: ``` sql -SELECT * FROM mysql('localhost:3306', 'test', 'test', 'bayonet', '123') +SELECT * FROM mysql('localhost:3306', 'test', 'test', 'bayonet', '123'); ``` ``` text -┌─int_id─┬─int_nullable─┬─float─┬─float_nullable─┐ -│ 1 │ ᴺᵁᴸᴸ │ 2 │ ᴺᵁᴸᴸ │ -└────────┴──────────────┴───────┴────────────────┘ +┌─int_id─┬─float─┐ +│ 1 │ 2 │ +└────────┴───────┘ ``` -## Смотрите также {#smotrite-takzhe} +Замена и вставка: + +```sql +INSERT INTO FUNCTION mysql('localhost:3306', 'test', 'test', 'bayonet', '123', 1) (int_id, float) VALUES (1, 3); +INSERT INTO TABLE FUNCTION mysql('localhost:3306', 'test', 'test', 'bayonet', '123', 0, 'UPDATE int_id = int_id + 1') (int_id, float) VALUES (1, 4); +SELECT * FROM mysql('localhost:3306', 'test', 'test', 'bayonet', '123'); +``` + +``` text +┌─int_id─┬─float─┐ +│ 1 │ 3 │ +│ 2 │ 4 │ +└────────┴───────┘ +``` + +**Смотрите также** - [Движок таблиц ‘MySQL’](../../sql-reference/table-functions/mysql.md) - [Использование MySQL как источника данных для внешнего словаря](../../sql-reference/table-functions/mysql.md#dicts-external_dicts_dict_sources-mysql) -[Оригинальная статья](https://clickhouse.tech/docs/ru/query_language/table_functions/mysql/) +[Оригинальная статья](https://clickhouse.tech/docs/ru/sql-reference/table_functions/mysql/) From c285dafb5d2f6655fdf62febd0c9177f0bee5c1e Mon Sep 17 00:00:00 2001 From: Evgeniia 
Sudarikova Date: Fri, 5 Feb 2021 22:20:07 +0300 Subject: [PATCH 080/381] edited brown benchmark --- docs/en/getting-started/example-datasets/brown-benchmark.md | 2 ++ 1 file changed, 2 insertions(+) diff --git a/docs/en/getting-started/example-datasets/brown-benchmark.md b/docs/en/getting-started/example-datasets/brown-benchmark.md index effae6d5adb..c9b74a84a54 100644 --- a/docs/en/getting-started/example-datasets/brown-benchmark.md +++ b/docs/en/getting-started/example-datasets/brown-benchmark.md @@ -412,3 +412,5 @@ ORDER BY yr, ``` The data is also available for interactive queries in the [Playground](https://gh-api.clickhouse.tech/play?user=play), [example](https://gh-api.clickhouse.tech/play?user=play#U0VMRUNUIG1hY2hpbmVfbmFtZSwKICAgICAgIE1JTihjcHUpIEFTIGNwdV9taW4sCiAgICAgICBNQVgoY3B1KSBBUyBjcHVfbWF4LAogICAgICAgQVZHKGNwdSkgQVMgY3B1X2F2ZywKICAgICAgIE1JTihuZXRfaW4pIEFTIG5ldF9pbl9taW4sCiAgICAgICBNQVgobmV0X2luKSBBUyBuZXRfaW5fbWF4LAogICAgICAgQVZHKG5ldF9pbikgQVMgbmV0X2luX2F2ZywKICAgICAgIE1JTihuZXRfb3V0KSBBUyBuZXRfb3V0X21pbiwKICAgICAgIE1BWChuZXRfb3V0KSBBUyBuZXRfb3V0X21heCwKICAgICAgIEFWRyhuZXRfb3V0KSBBUyBuZXRfb3V0X2F2ZwpGUk9NICgKICBTRUxFQ1QgbWFjaGluZV9uYW1lLAogICAgICAgICBDT0FMRVNDRShjcHVfdXNlciwgMC4wKSBBUyBjcHUsCiAgICAgICAgIENPQUxFU0NFKGJ5dGVzX2luLCAwLjApIEFTIG5ldF9pbiwKICAgICAgICAgQ09BTEVTQ0UoYnl0ZXNfb3V0LCAwLjApIEFTIG5ldF9vdXQKICBGUk9NIG1nYmVuY2gubG9nczEKICBXSEVSRSBtYWNoaW5lX25hbWUgSU4gKCdhbmFuc2knLCdhcmFnb2cnLCd1cmQnKQogICAgQU5EIGxvZ190aW1lID49IFRJTUVTVEFNUCAnMjAxNy0wMS0xMSAwMDowMDowMCcKKSBBUyByCkdST1VQIEJZIG1hY2hpbmVfbmFtZQ==). + +[Original article](https://clickhouse.tech/docs/en/getting_started/example_datasets/brown-benchmark/) From 44714c3fa895d0b827f771e0e3b9fcd876651d81 Mon Sep 17 00:00:00 2001 From: Evgeniia Sudarikova Date: Fri, 5 Feb 2021 22:34:26 +0300 Subject: [PATCH 081/381] edited RU brown benchmark --- docs/ru/getting-started/example-datasets/brown-benchmark.md | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/docs/ru/getting-started/example-datasets/brown-benchmark.md b/docs/ru/getting-started/example-datasets/brown-benchmark.md index b3f2285093a..e4fe00ace93 100644 --- a/docs/ru/getting-started/example-datasets/brown-benchmark.md +++ b/docs/ru/getting-started/example-datasets/brown-benchmark.md @@ -5,7 +5,7 @@ toc_title: Brown University Benchmark # Brown University Benchmark -`MgBench` — это новый аналитический бенчмарк для сгенерированного журнала событий, разработанный [Andrew Crotty](http://cs.brown.edu/people/acrotty/). +`MgBench` — это новый аналитический бенчмарк для данных журнала событий, сгенерированных машиной. Бенчмарк разработан [Andrew Crotty](http://cs.brown.edu/people/acrotty/). 
Скачать данные: ``` From 7ce0ef2561deda64192a2a0531dcc054b6ea1c60 Mon Sep 17 00:00:00 2001 From: Alexander Tokmakov Date: Mon, 8 Feb 2021 12:14:17 +0300 Subject: [PATCH 082/381] show clusters for replicated db --- src/Databases/DatabaseReplicated.cpp | 108 +++++++++++++++++- src/Databases/DatabaseReplicated.h | 8 +- src/Databases/DatabaseReplicatedWorker.cpp | 2 +- src/Interpreters/DDLWorker.cpp | 2 +- src/Interpreters/InterpreterCreateQuery.cpp | 29 +++-- src/Storages/System/StorageSystemClusters.cpp | 66 ++++++----- src/Storages/System/StorageSystemClusters.h | 3 + tests/queries/skip_list.json | 12 ++ 8 files changed, 186 insertions(+), 44 deletions(-) diff --git a/src/Databases/DatabaseReplicated.cpp b/src/Databases/DatabaseReplicated.cpp index 5a11787331c..43568379632 100644 --- a/src/Databases/DatabaseReplicated.cpp +++ b/src/Databases/DatabaseReplicated.cpp @@ -36,8 +36,11 @@ namespace ErrorCodes extern const int UNKNOWN_DATABASE; extern const int NOT_IMPLEMENTED; extern const int INCORRECT_QUERY; + extern const int ALL_CONNECTION_TRIES_FAILED; } +static constexpr const char * DROPPED_MARK = "DROPPED"; + zkutil::ZooKeeperPtr DatabaseReplicated::getZooKeeper() const { return global_context.getZooKeeper(); @@ -68,6 +71,8 @@ DatabaseReplicated::DatabaseReplicated( throw Exception("ZooKeeper path, shard and replica names must be non-empty", ErrorCodes::BAD_ARGUMENTS); if (shard_name.find('/') != std::string::npos || replica_name.find('/') != std::string::npos) throw Exception("Shard and replica names should not contain '/'", ErrorCodes::BAD_ARGUMENTS); + if (shard_name.find('|') != std::string::npos || replica_name.find('|') != std::string::npos) + throw Exception("Shard and replica names should not contain '|'", ErrorCodes::BAD_ARGUMENTS); if (zookeeper_path.back() == '/') zookeeper_path.resize(zookeeper_path.size() - 1); @@ -90,7 +95,7 @@ DatabaseReplicated::DatabaseReplicated( createDatabaseNodesInZooKeeper(current_zookeeper); } - replica_path = zookeeper_path + "/replicas/" + shard_name + "/" + replica_name; + replica_path = zookeeper_path + "/replicas/" + getFullReplicaName(); String replica_host_id; if (current_zookeeper->tryGet(replica_path, replica_host_id)) @@ -110,6 +115,93 @@ DatabaseReplicated::DatabaseReplicated( } } +String DatabaseReplicated::getFullReplicaName() const +{ + return shard_name + '|' + replica_name; +} + +std::pair DatabaseReplicated::parseFullReplicaName(const String & name) +{ + String shard; + String replica; + auto pos = name.find('|'); + if (pos == std::string::npos || name.find('|', pos + 1) != std::string::npos) + throw Exception(ErrorCodes::LOGICAL_ERROR, "Incorrect replica identifier: {}", name); + shard = name.substr(0, pos); + replica = name.substr(pos + 1); + return {shard, replica}; +} + +ClusterPtr DatabaseReplicated::getCluster() const +{ + Strings hosts; + Strings host_ids; + + auto zookeeper = global_context.getZooKeeper(); + constexpr int max_retries = 10; + int iteration = 0; + bool success = false; + while (++iteration <= max_retries) + { + host_ids.resize(0); + Coordination::Stat stat; + hosts = zookeeper->getChildren(zookeeper_path + "/replicas", &stat); + if (hosts.empty()) + throw Exception(ErrorCodes::LOGICAL_ERROR, "No hosts found"); + Int32 cver = stat.cversion; + + std::vector futures; + futures.reserve(hosts.size()); + host_ids.reserve(hosts.size()); + for (const auto & host : hosts) + futures.emplace_back(zookeeper->asyncTryGet(zookeeper_path + "/replicas/" + host)); + + success = true; + for (auto & future : futures) + { + auto 
res = future.get(); + if (res.error != Coordination::Error::ZOK) + success = false; + host_ids.emplace_back(res.data); + } + + zookeeper->get(zookeeper_path + "/replicas", &stat); + if (success && cver == stat.version) + break; + } + if (!success) + throw Exception(ErrorCodes::ALL_CONNECTION_TRIES_FAILED, "Cannot get consistent cluster snapshot"); + + assert(!hosts.empty()); + assert(hosts.size() == host_ids.size()); + std::sort(hosts.begin(), hosts.end()); + String current_shard = parseFullReplicaName(hosts.front()).first; + std::vector shards; + shards.emplace_back(); + for (size_t i = 0; i < hosts.size(); ++i) + { + const auto & id = host_ids[i]; + if (id == DROPPED_MARK) + continue; + auto [shard, replica] = parseFullReplicaName(hosts[i]); + auto pos = id.find(':'); + String host = id.substr(0, pos); + if (shard != current_shard) + { + current_shard = shard; + if (!shards.back().empty()) + shards.emplace_back(); + } + shards.back().emplace_back(unescapeForFileName(host)); + } + + /// TODO make it configurable + String username = "default"; + String password; + + return std::make_shared(global_context.getSettingsRef(), shards, username, password, global_context.getTCPPort(), false); +} + bool DatabaseReplicated::createDatabaseNodesInZooKeeper(const zkutil::ZooKeeperPtr & current_zookeeper) { current_zookeeper->createAncestors(zookeeper_path); @@ -139,8 +231,6 @@ bool DatabaseReplicated::createDatabaseNodesInZooKeeper(const zkutil::ZooKeeperP void DatabaseReplicated::createReplicaNodesInZooKeeper(const zkutil::ZooKeeperPtr & current_zookeeper) { - current_zookeeper->createAncestors(replica_path); - /// When creating new replica, use latest snapshot version as initial value of log_pointer //log_entry_to_execute = 0; //FIXME @@ -296,9 +386,15 @@ ASTPtr DatabaseReplicated::parseQueryFromMetadataInZooKeeper(const String & node void DatabaseReplicated::drop(const Context & context_) { auto current_zookeeper = getZooKeeper(); - current_zookeeper->set(replica_path, "DROPPED"); + current_zookeeper->set(replica_path, DROPPED_MARK); DatabaseAtomic::drop(context_); current_zookeeper->tryRemoveRecursive(replica_path); + /// TODO it may leave garbage in ZooKeeper if the last node lost connection here + if (current_zookeeper->tryRemove(zookeeper_path + "/replicas") == Coordination::Error::ZOK) + { + /// It was the last replica, remove all metadata + current_zookeeper->tryRemoveRecursive(zookeeper_path); + } } void DatabaseReplicated::stopReplication() @@ -318,7 +414,7 @@ void DatabaseReplicated::shutdown() void DatabaseReplicated::dropTable(const Context & context, const String & table_name, bool no_delay) { auto txn = context.getMetadataTransaction(); - //assert(!ddl_worker->isCurrentlyActive() || txn /*|| called from DROP DATABASE */); + assert(!ddl_worker->isCurrentlyActive() || txn); if (txn && txn->is_initial_query) { String metadata_zk_path = zookeeper_path + "/metadata/" + escapeForFileName(table_name); @@ -335,6 +431,8 @@ void DatabaseReplicated::renameTable(const Context & context, const String & tab if (txn->is_initial_query) { + if (this != &to_database) + throw Exception(ErrorCodes::NOT_IMPLEMENTED, "Moving tables between databases is not supported for Replicated engine"); if (!isTableExist(table_name, context)) throw Exception(ErrorCodes::UNKNOWN_TABLE, "Table {} does not exist", table_name); if (exchange && !to_database.isTableExist(to_table_name, context)) diff --git a/src/Databases/DatabaseReplicated.h b/src/Databases/DatabaseReplicated.h index a866a61558c..0f500b16470 100644 --- 
a/src/Databases/DatabaseReplicated.h +++ b/src/Databases/DatabaseReplicated.h @@ -15,6 +15,9 @@ namespace DB class DatabaseReplicatedDDLWorker; using ZooKeeperPtr = std::shared_ptr; +class Cluster; +using ClusterPtr = std::shared_ptr; + /** DatabaseReplicated engine * supports replication of metadata * via DDL log being written to ZooKeeper @@ -67,7 +70,10 @@ public: void loadStoredObjects(Context & context, bool has_force_restore_data_flag, bool force_attach) override; - String getFullReplicaName() const { return shard_name + '|' + replica_name; } + String getFullReplicaName() const; + static std::pair parseFullReplicaName(const String & name); + + ClusterPtr getCluster() const; //FIXME friend struct DatabaseReplicatedTask; diff --git a/src/Databases/DatabaseReplicatedWorker.cpp b/src/Databases/DatabaseReplicatedWorker.cpp index 1c000a8f0a7..748305922b7 100644 --- a/src/Databases/DatabaseReplicatedWorker.cpp +++ b/src/Databases/DatabaseReplicatedWorker.cpp @@ -208,7 +208,7 @@ DDLTaskPtr DatabaseReplicatedDDLWorker::initAndCheckTask(const String & entry_na if (task->is_initial_query) { assert(!zookeeper->exists(entry_path + "/try")); - assert(zookeeper->exists(entry_path + "/committed") == (zookeeper->get(task->getFinishedNodePath()) == "0")); + assert(zookeeper->exists(entry_path + "/committed") == (zookeeper->get(task->getFinishedNodePath()) == ExecutionStatus(0).serializeText())); out_reason = fmt::format("Entry {} has been executed as initial query", entry_name); return {}; } diff --git a/src/Interpreters/DDLWorker.cpp b/src/Interpreters/DDLWorker.cpp index da2e878541d..f0cc3370211 100644 --- a/src/Interpreters/DDLWorker.cpp +++ b/src/Interpreters/DDLWorker.cpp @@ -488,7 +488,7 @@ void DDLWorker::processTask(DDLTaskBase & task) /// updating metadata in Replicated database), so we make create request for finished_node_path with status "0", /// which means that query executed successfully. 
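        /// The success status is written as ExecutionStatus(0).serializeText() rather than a bare "0",
        /// matching the comparison in DatabaseReplicatedDDLWorker::initAndCheckTask above, so successful and
        /// failed executions store the finished node payload in the same serialized format.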
task.ops.emplace_back(zkutil::makeRemoveRequest(active_node_path, -1)); - task.ops.emplace_back(zkutil::makeCreateRequest(finished_node_path, "0", zkutil::CreateMode::Persistent)); + task.ops.emplace_back(zkutil::makeCreateRequest(finished_node_path, ExecutionStatus(0).serializeText(), zkutil::CreateMode::Persistent)); try { diff --git a/src/Interpreters/InterpreterCreateQuery.cpp b/src/Interpreters/InterpreterCreateQuery.cpp index 6af212172b2..be241339ef7 100644 --- a/src/Interpreters/InterpreterCreateQuery.cpp +++ b/src/Interpreters/InterpreterCreateQuery.cpp @@ -827,17 +827,28 @@ BlockIO InterpreterCreateQuery::createTable(ASTCreateQuery & create) if (create.attach_from_path) { - fs::path data_path = fs::path(*create.attach_from_path).lexically_normal(); fs::path user_files = fs::path(context.getUserFilesPath()).lexically_normal(); - if (data_path.is_relative()) - data_path = (user_files / data_path).lexically_normal(); - if (!startsWith(data_path, user_files)) - throw Exception(ErrorCodes::PATH_ACCESS_DENIED, - "Data directory {} must be inside {} to attach it", String(data_path), String(user_files)); - fs::path root_path = fs::path(context.getPath()).lexically_normal(); - /// Data path must be relative to root_path - create.attach_from_path = fs::relative(data_path, root_path) / ""; + + if (context.getClientInfo().query_kind == ClientInfo::QueryKind::INITIAL_QUERY) + { + fs::path data_path = fs::path(*create.attach_from_path).lexically_normal(); + if (data_path.is_relative()) + data_path = (user_files / data_path).lexically_normal(); + if (!startsWith(data_path, user_files)) + throw Exception(ErrorCodes::PATH_ACCESS_DENIED, + "Data directory {} must be inside {} to attach it", String(data_path), String(user_files)); + + /// Data path must be relative to root_path + create.attach_from_path = fs::relative(data_path, root_path) / ""; + } + else + { + fs::path data_path = (root_path / *create.attach_from_path).lexically_normal(); + if (!startsWith(data_path, user_files)) + throw Exception(ErrorCodes::PATH_ACCESS_DENIED, + "Data directory {} must be inside {} to attach it", String(data_path), String(user_files)); + } } else if (create.attach && !create.attach_short_syntax && context.getClientInfo().query_kind != ClientInfo::QueryKind::REPLICATED_LOG_QUERY) { diff --git a/src/Storages/System/StorageSystemClusters.cpp b/src/Storages/System/StorageSystemClusters.cpp index ae8bcca2804..62ad1c5150f 100644 --- a/src/Storages/System/StorageSystemClusters.cpp +++ b/src/Storages/System/StorageSystemClusters.cpp @@ -3,6 +3,7 @@ #include #include #include +#include namespace DB { @@ -26,40 +27,51 @@ NamesAndTypesList StorageSystemClusters::getNamesAndTypes() }; } + void StorageSystemClusters::fillData(MutableColumns & res_columns, const Context & context, const SelectQueryInfo &) const { for (const auto & name_and_cluster : context.getClusters().getContainer()) + writeCluster(res_columns, name_and_cluster); + + const auto databases = DatabaseCatalog::instance().getDatabases(); + for (const auto & name_and_database : databases) { - const String & cluster_name = name_and_cluster.first; - const ClusterPtr & cluster = name_and_cluster.second; - const auto & shards_info = cluster->getShardsInfo(); - const auto & addresses_with_failover = cluster->getShardsAddresses(); + if (const auto * replicated = typeid_cast(name_and_database.second.get())) + writeCluster(res_columns, {name_and_database.first, replicated->getCluster()}); + } +} - for (size_t shard_index = 0; shard_index < shards_info.size(); 
++shard_index) +void StorageSystemClusters::writeCluster(MutableColumns & res_columns, const NameAndCluster & name_and_cluster) const +{ + const String & cluster_name = name_and_cluster.first; + const ClusterPtr & cluster = name_and_cluster.second; + const auto & shards_info = cluster->getShardsInfo(); + const auto & addresses_with_failover = cluster->getShardsAddresses(); + + for (size_t shard_index = 0; shard_index < shards_info.size(); ++shard_index) + { + const auto & shard_info = shards_info[shard_index]; + const auto & shard_addresses = addresses_with_failover[shard_index]; + const auto pool_status = shard_info.pool->getStatus(); + + for (size_t replica_index = 0; replica_index < shard_addresses.size(); ++replica_index) { - const auto & shard_info = shards_info[shard_index]; - const auto & shard_addresses = addresses_with_failover[shard_index]; - const auto pool_status = shard_info.pool->getStatus(); + size_t i = 0; + const auto & address = shard_addresses[replica_index]; - for (size_t replica_index = 0; replica_index < shard_addresses.size(); ++replica_index) - { - size_t i = 0; - const auto & address = shard_addresses[replica_index]; - - res_columns[i++]->insert(cluster_name); - res_columns[i++]->insert(shard_info.shard_num); - res_columns[i++]->insert(shard_info.weight); - res_columns[i++]->insert(replica_index + 1); - res_columns[i++]->insert(address.host_name); - auto resolved = address.getResolvedAddress(); - res_columns[i++]->insert(resolved ? resolved->host().toString() : String()); - res_columns[i++]->insert(address.port); - res_columns[i++]->insert(address.is_local); - res_columns[i++]->insert(address.user); - res_columns[i++]->insert(address.default_database); - res_columns[i++]->insert(pool_status[replica_index].error_count); - res_columns[i++]->insert(pool_status[replica_index].estimated_recovery_time.count()); - } + res_columns[i++]->insert(cluster_name); + res_columns[i++]->insert(shard_info.shard_num); + res_columns[i++]->insert(shard_info.weight); + res_columns[i++]->insert(replica_index + 1); + res_columns[i++]->insert(address.host_name); + auto resolved = address.getResolvedAddress(); + res_columns[i++]->insert(resolved ? 
resolved->host().toString() : String()); + res_columns[i++]->insert(address.port); + res_columns[i++]->insert(address.is_local); + res_columns[i++]->insert(address.user); + res_columns[i++]->insert(address.default_database); + res_columns[i++]->insert(pool_status[replica_index].error_count); + res_columns[i++]->insert(pool_status[replica_index].estimated_recovery_time.count()); } } } diff --git a/src/Storages/System/StorageSystemClusters.h b/src/Storages/System/StorageSystemClusters.h index 4cda7c372b2..68282f1b1fe 100644 --- a/src/Storages/System/StorageSystemClusters.h +++ b/src/Storages/System/StorageSystemClusters.h @@ -10,6 +10,7 @@ namespace DB { class Context; +class Cluster; /** Implements system table 'clusters' * that allows to obtain information about available clusters @@ -25,8 +26,10 @@ public: protected: using IStorageSystemOneBlock::IStorageSystemOneBlock; + using NameAndCluster = std::pair>; void fillData(MutableColumns & res_columns, const Context & context, const SelectQueryInfo & query_info) const override; + void writeCluster(MutableColumns & res_columns, const NameAndCluster & name_and_cluster) const; }; } diff --git a/tests/queries/skip_list.json b/tests/queries/skip_list.json index adee777f900..4c6927f575a 100644 --- a/tests/queries/skip_list.json +++ b/tests/queries/skip_list.json @@ -103,7 +103,19 @@ "memory_tracking", /// FIXME remove it before merge "memory_tracking", "memory_usage", + "01686_rocksdb", + "01550_mutation_subquery", + "01070_mutations_with_dependencies", + "01070_materialize_ttl", + "01055_compact_parts", + "01017_mutations_with_nondeterministic_functions_zookeeper", + "00926_adaptive_index_granularity_pk", + "00910_zookeeper_test_alter_compression_codecs", + "00908_bloom_filter_index", + "00616_final_single_part", + "00446_clear_column_in_partition_zookeeper", "01533_multiple_nested", + "01213_alter_rename_column_zookeeper", "01575_disable_detach_table_of_dictionary", "01457_create_as_table_function_structure", "01415_inconsistent_merge_tree_settings", From 91d0924665401514396ed30ef6c01c8212b0b4bb Mon Sep 17 00:00:00 2001 From: Alexander Tokmakov Date: Mon, 8 Feb 2021 12:46:30 +0300 Subject: [PATCH 083/381] write dictionaries metadata to zk --- src/Databases/DatabaseReplicated.cpp | 30 +++++++++++++++++++++ src/Databases/DatabaseReplicated.h | 4 +++ src/Databases/DatabaseWithDictionaries.cpp | 12 ++++++++- src/Interpreters/InterpreterCreateQuery.cpp | 7 +++++ src/Interpreters/InterpreterDropQuery.cpp | 13 +++++++++ 5 files changed, 65 insertions(+), 1 deletion(-) diff --git a/src/Databases/DatabaseReplicated.cpp b/src/Databases/DatabaseReplicated.cpp index 43568379632..a134ba5dec7 100644 --- a/src/Databases/DatabaseReplicated.cpp +++ b/src/Databases/DatabaseReplicated.cpp @@ -303,6 +303,9 @@ BlockIO DatabaseReplicated::propose(const ASTPtr & query, const Context & query_ if (query_context.getClientInfo().query_kind != ClientInfo::QueryKind::INITIAL_QUERY) throw Exception(ErrorCodes::INCORRECT_QUERY, "It's not initial query. 
ON CLUSTER is not allowed for Replicated database."); + if (auto * ddl_query = query->as()) + ddl_query->database.clear(); + if (const auto * query_alter = query->as()) { for (const auto & command : query_alter->command_list->children) @@ -493,4 +496,31 @@ void DatabaseReplicated::commitAlterTable(const StorageID & table_id, DatabaseAtomic::commitAlterTable(table_id, table_metadata_tmp_path, table_metadata_path, statement, query_context); } +void DatabaseReplicated::createDictionary(const Context & context, + const String & dictionary_name, + const ASTPtr & query) +{ + auto txn = context.getMetadataTransaction(); + assert(!ddl_worker->isCurrentlyActive() || txn); + if (txn && txn->is_initial_query) + { + String metadata_zk_path = txn->zookeeper_path + "/metadata/" + escapeForFileName(dictionary_name); + String statement = getObjectDefinitionFromCreateQuery(query->clone()); + txn->ops.emplace_back(zkutil::makeCreateRequest(metadata_zk_path, statement, zkutil::CreateMode::Persistent)); + } + DatabaseAtomic::createDictionary(context, dictionary_name, query); +} + +void DatabaseReplicated::removeDictionary(const Context & context, const String & dictionary_name) +{ + auto txn = context.getMetadataTransaction(); + assert(!ddl_worker->isCurrentlyActive() || txn); + if (txn && txn->is_initial_query) + { + String metadata_zk_path = zookeeper_path + "/metadata/" + escapeForFileName(dictionary_name); + txn->ops.emplace_back(zkutil::makeRemoveRequest(metadata_zk_path, -1)); + } + DatabaseAtomic::removeDictionary(context, dictionary_name); +} + } diff --git a/src/Databases/DatabaseReplicated.h b/src/Databases/DatabaseReplicated.h index 0f500b16470..c39321f0caa 100644 --- a/src/Databases/DatabaseReplicated.h +++ b/src/Databases/DatabaseReplicated.h @@ -58,6 +58,10 @@ public: void commitAlterTable(const StorageID & table_id, const String & table_metadata_tmp_path, const String & table_metadata_path, const String & statement, const Context & query_context) override; + void createDictionary(const Context & context, + const String & dictionary_name, + const ASTPtr & query) override; + void removeDictionary(const Context & context, const String & dictionary_name) override; void drop(const Context & /*context*/) override; diff --git a/src/Databases/DatabaseWithDictionaries.cpp b/src/Databases/DatabaseWithDictionaries.cpp index ee16f4ae15e..7ce5de56b64 100644 --- a/src/Databases/DatabaseWithDictionaries.cpp +++ b/src/Databases/DatabaseWithDictionaries.cpp @@ -4,6 +4,7 @@ #include #include #include +#include #include #include #include @@ -193,6 +194,10 @@ void DatabaseWithDictionaries::createDictionary(const Context & context, const S detachDictionary(dictionary_name); }); + auto txn = context.getMetadataTransaction(); + if (txn && !context.isInternalSubquery()) + txn->commit(); /// Commit point (a sort of) for Replicated database + /// If it was ATTACH query and file with dictionary metadata already exist /// (so, ATTACH is done after DETACH), then rename atomically replaces old file with new one. 
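    /// Note: for Replicated databases the definition was already committed to ZooKeeper above (txn->commit()),
    /// so a failure between that commit and this rename leaves the dictionary recorded in ZooKeeper
    /// but not yet present in the local metadata directory.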
Poco::File(dictionary_metadata_tmp_path).renameTo(dictionary_metadata_path); @@ -205,7 +210,7 @@ void DatabaseWithDictionaries::createDictionary(const Context & context, const S succeeded = true; } -void DatabaseWithDictionaries::removeDictionary(const Context &, const String & dictionary_name) +void DatabaseWithDictionaries::removeDictionary(const Context & context, const String & dictionary_name) { DictionaryAttachInfo attach_info; detachDictionaryImpl(dictionary_name, attach_info); @@ -213,6 +218,11 @@ void DatabaseWithDictionaries::removeDictionary(const Context &, const String & try { String dictionary_metadata_path = getObjectMetadataPath(dictionary_name); + + auto txn = context.getMetadataTransaction(); + if (txn && !context.isInternalSubquery()) + txn->commit(); /// Commit point (a sort of) for Replicated database + Poco::File(dictionary_metadata_path).remove(); CurrentStatusInfo::unset(CurrentStatusInfo::DictionaryStatus, StorageID(attach_info.create_query).getInternalDictionaryName()); diff --git a/src/Interpreters/InterpreterCreateQuery.cpp b/src/Interpreters/InterpreterCreateQuery.cpp index be241339ef7..376bf8417ff 100644 --- a/src/Interpreters/InterpreterCreateQuery.cpp +++ b/src/Interpreters/InterpreterCreateQuery.cpp @@ -1107,6 +1107,13 @@ BlockIO InterpreterCreateQuery::createDictionary(ASTCreateQuery & create) auto guard = DatabaseCatalog::instance().getDDLGuard(database_name, dictionary_name); DatabasePtr database = DatabaseCatalog::instance().getDatabase(database_name); + if (typeid_cast(database.get()) && context.getClientInfo().query_kind != ClientInfo::QueryKind::REPLICATED_LOG_QUERY) + { + assertOrSetUUID(create, database); + guard->releaseTableLock(); + return typeid_cast(database.get())->propose(query_ptr, context); + } + if (database->isDictionaryExist(dictionary_name)) { /// TODO Check structure of dictionary diff --git a/src/Interpreters/InterpreterDropQuery.cpp b/src/Interpreters/InterpreterDropQuery.cpp index b22d46358f9..e6943f06e06 100644 --- a/src/Interpreters/InterpreterDropQuery.cpp +++ b/src/Interpreters/InterpreterDropQuery.cpp @@ -212,6 +212,19 @@ BlockIO InterpreterDropQuery::executeToDictionary( DatabasePtr database = tryGetDatabase(database_name, if_exists); + bool is_drop_or_detach_database = query_ptr->as()->table.empty(); + bool is_replicated_ddl_query = typeid_cast(database.get()) && + context.getClientInfo().query_kind != ClientInfo::QueryKind::REPLICATED_LOG_QUERY && + !is_drop_or_detach_database; + if (is_replicated_ddl_query) + { + if (kind == ASTDropQuery::Kind::Detach) + throw Exception(ErrorCodes::INCORRECT_QUERY, "DETACH DICTIONARY is not allowed for Replicated databases."); + + ddl_guard->releaseTableLock(); + return typeid_cast(database.get())->propose(query_ptr, context); + } + if (!database || !database->isDictionaryExist(dictionary_name)) { if (!if_exists) From 8efee9ed9a5db0d4cc773b7bf60760160bb8b79c Mon Sep 17 00:00:00 2001 From: romanzhukov Date: Mon, 8 Feb 2021 15:40:23 +0300 Subject: [PATCH 084/381] DOCSUP-5822: IN oper - supports diff types. --- docs/en/sql-reference/operators/in.md | 20 +++++++++++++++++++- docs/ru/sql-reference/operators/in.md | 18 +++++++++++++++++- 2 files changed, 36 insertions(+), 2 deletions(-) diff --git a/docs/en/sql-reference/operators/in.md b/docs/en/sql-reference/operators/in.md index bfa8b3d1003..5f928f12024 100644 --- a/docs/en/sql-reference/operators/in.md +++ b/docs/en/sql-reference/operators/in.md @@ -13,10 +13,28 @@ SELECT (CounterID, UserID) IN ((34, 123), (101500, 456)) FROM ... 
If the left side is a single column that is in the index, and the right side is a set of constants, the system uses the index for processing the query. -Don’t list too many values explicitly (i.e. millions). If a data set is large, put it in a temporary table (for example, see the section “External data for query processing”), then use a subquery. +Don’t list too many values explicitly (i.e. millions). If a data set is large, put it in a temporary table (for example, see the section [External data for query processing](../../engines/table-engines/special/external-data.md)), then use a subquery. The right side of the operator can be a set of constant expressions, a set of tuples with constant expressions (shown in the examples above), or the name of a database table or SELECT subquery in brackets. +ClickHouse allows different types inside `IN` subquery. For left hand side it applies type conversion to the type of right hand side. + +**Example** + +Query: + +``` sql +SELECT '1' IN (SELECT 1); +``` + +Result: + +``` text +┌─in('1', _subquery49)─┐ +│ 1 │ +└──────────────────────┘ +``` + If the right side of the operator is the name of a table (for example, `UserID IN users`), this is equivalent to the subquery `UserID IN (SELECT * FROM users)`. Use this when working with external data that is sent along with the query. For example, the query can be sent together with a set of user IDs loaded to the ‘users’ temporary table, which should be filtered. If the right side of the operator is a table name that has the Set engine (a prepared data set that is always in RAM), the data set will not be created over again for each query. diff --git a/docs/ru/sql-reference/operators/in.md b/docs/ru/sql-reference/operators/in.md index 4c1290df166..5a4fe95f108 100644 --- a/docs/ru/sql-reference/operators/in.md +++ b/docs/ru/sql-reference/operators/in.md @@ -13,10 +13,26 @@ SELECT (CounterID, UserID) IN ((34, 123), (101500, 456)) FROM ... Если слева стоит один столбец, входящий в индекс, а справа - множество констант, то при выполнении запроса, система воспользуется индексом. -Не перечисляйте слишком большое количество значений (миллионы) явно. Если множество большое - лучше загрузить его во временную таблицу (например, смотрите раздел «Внешние данные для обработки запроса»), и затем воспользоваться подзапросом. +Не перечисляйте слишком большое количество значений (миллионы) явно. Если множество большое - лучше загрузить его во временную таблицу (например, смотрите раздел [Внешние данные для обработки запроса](../../engines/table-engines/special/external-data.md)), и затем воспользоваться подзапросом. В качестве правой части оператора может быть множество константных выражений, множество кортежей с константными выражениями (показано в примерах выше), а также имя таблицы или подзапрос SELECT в скобках. +**Пример** + +Запрос: + +``` sql +SELECT '1' IN (SELECT 1); +``` + +Результат: + +``` text +┌─in('1', _subquery49)─┐ +│ 1 │ +└──────────────────────┘ +``` + Если в качестве правой части оператора указано имя таблицы (например, `UserID IN users`), то это эквивалентно подзапросу `UserID IN (SELECT * FROM users)`. Это используется при работе с внешними данными, отправляемым вместе с запросом. Например, вместе с запросом может быть отправлено множество идентификаторов посетителей, загруженное во временную таблицу users, по которому следует выполнить фильтрацию. 
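Ниже — набросок того, как это может выглядеть при передаче внешних данных через `clickhouse-client`; имя файла `user_ids.tsv`, структура столбца и таблица `hits` здесь условные:

``` bash
# user_ids.tsv содержит по одному значению UserID в строке
clickhouse-client \
    --query "SELECT count() FROM hits WHERE UserID IN users" \
    --external --file=user_ids.tsv --name=users --structure="UserID UInt64"
```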
Если в качестве правой части оператора, указано имя таблицы, имеющий движок Set (подготовленное множество, постоянно находящееся в оперативке), то множество не будет создаваться заново при каждом запросе. From 5647f0eb8c25fc302179661d77e27e8d5e7bf479 Mon Sep 17 00:00:00 2001 From: romanzhukov Date: Mon, 8 Feb 2021 15:51:33 +0300 Subject: [PATCH 085/381] DOCSUP-5822: IN oper - supports diff types. --- docs/en/sql-reference/operators/in.md | 2 +- docs/ru/sql-reference/operators/in.md | 2 ++ 2 files changed, 3 insertions(+), 1 deletion(-) diff --git a/docs/en/sql-reference/operators/in.md b/docs/en/sql-reference/operators/in.md index 5f928f12024..1b6531a57f8 100644 --- a/docs/en/sql-reference/operators/in.md +++ b/docs/en/sql-reference/operators/in.md @@ -17,7 +17,7 @@ Don’t list too many values explicitly (i.e. millions). If a data set is large The right side of the operator can be a set of constant expressions, a set of tuples with constant expressions (shown in the examples above), or the name of a database table or SELECT subquery in brackets. -ClickHouse allows different types inside `IN` subquery. For left hand side it applies type conversion to the type of right hand side. +ClickHouse allows different types inside `IN` subquery. For left hand side it applies type conversion to the type of right hand side with [accurateCastOrNull](../functions/type-conversion-functions.md#type_conversion_function-accurate-cast_or_null). **Example** diff --git a/docs/ru/sql-reference/operators/in.md b/docs/ru/sql-reference/operators/in.md index 5a4fe95f108..d86d6f9ec57 100644 --- a/docs/ru/sql-reference/operators/in.md +++ b/docs/ru/sql-reference/operators/in.md @@ -17,6 +17,8 @@ SELECT (CounterID, UserID) IN ((34, 123), (101500, 456)) FROM ... В качестве правой части оператора может быть множество константных выражений, множество кортежей с константными выражениями (показано в примерах выше), а также имя таблицы или подзапрос SELECT в скобках. +ClickHouse допускает различные типы внутри подзапроса `IN`. Для левой стороны он применяет преобразование к типу правой стороны с помощью [accurateCastOrNull](../functions/type-conversion-functions.md#type_conversion_function-accurate-cast_or_null). + **Пример** Запрос: From 78f5f416171a192c4c6dbad4dd79d069be389a43 Mon Sep 17 00:00:00 2001 From: romanzhukov Date: Mon, 8 Feb 2021 15:55:53 +0300 Subject: [PATCH 086/381] DOCSUP-5822: Minor text fix. --- docs/en/sql-reference/operators/in.md | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/docs/en/sql-reference/operators/in.md b/docs/en/sql-reference/operators/in.md index 1b6531a57f8..a0dd0455c4d 100644 --- a/docs/en/sql-reference/operators/in.md +++ b/docs/en/sql-reference/operators/in.md @@ -17,7 +17,7 @@ Don’t list too many values explicitly (i.e. millions). If a data set is large The right side of the operator can be a set of constant expressions, a set of tuples with constant expressions (shown in the examples above), or the name of a database table or SELECT subquery in brackets. -ClickHouse allows different types inside `IN` subquery. For left hand side it applies type conversion to the type of right hand side with [accurateCastOrNull](../functions/type-conversion-functions.md#type_conversion_function-accurate-cast_or_null). +ClickHouse allows different types inside `IN` subquery. For left hand side it applies conversion to the type of right hand side with [accurateCastOrNull](../functions/type-conversion-functions.md#type_conversion_function-accurate-cast_or_null). 
**Example** From 78c1d69b8c55a651f77f630e34e582dabb006f1f Mon Sep 17 00:00:00 2001 From: Alexander Tokmakov Date: Mon, 8 Feb 2021 22:36:17 +0300 Subject: [PATCH 087/381] better code --- src/Common/CurrentMetrics.cpp | 1 - src/Databases/DatabaseOnDisk.cpp | 54 +++++++++++++++++++ src/Databases/DatabaseOnDisk.h | 2 + src/Databases/DatabaseOrdinary.cpp | 50 +---------------- src/Databases/DatabaseReplicated.cpp | 13 ++--- src/Databases/DatabaseReplicatedWorker.cpp | 2 +- src/Interpreters/ClientInfo.h | 1 - src/Interpreters/Context.cpp | 1 - src/Interpreters/Context.h | 1 - src/Interpreters/DDLTask.cpp | 4 +- src/Interpreters/DDLTask.h | 5 +- src/Interpreters/DDLWorker.cpp | 2 - src/Interpreters/InterpreterAlterQuery.cpp | 13 +++-- src/Interpreters/InterpreterCreateQuery.cpp | 41 +++++--------- src/Interpreters/InterpreterCreateQuery.h | 3 ++ src/Interpreters/InterpreterDropQuery.cpp | 13 ++++- src/Interpreters/InterpreterRenameQuery.cpp | 2 +- .../MergeTree/registerStorageMergeTree.cpp | 2 +- src/Storages/StorageReplicatedMergeTree.cpp | 32 +++++------ src/Storages/StorageReplicatedMergeTree.h | 4 +- src/Storages/System/StorageSystemClusters.cpp | 2 +- src/Storages/System/StorageSystemClusters.h | 2 +- .../test_replicated_database/test.py | 11 +++- ...8_ddl_dictionaries_concurrent_requrests.sh | 4 +- tests/queries/skip_list.json | 6 +++ 25 files changed, 146 insertions(+), 125 deletions(-) diff --git a/src/Common/CurrentMetrics.cpp b/src/Common/CurrentMetrics.cpp index c524467d8ca..4fb2709c8e4 100644 --- a/src/Common/CurrentMetrics.cpp +++ b/src/Common/CurrentMetrics.cpp @@ -15,7 +15,6 @@ M(BackgroundSchedulePoolTask, "Number of active tasks in BackgroundSchedulePool. This pool is used for periodic ReplicatedMergeTree tasks, like cleaning old data parts, altering data parts, replica re-initialization, etc.") \ M(BackgroundBufferFlushSchedulePoolTask, "Number of active tasks in BackgroundBufferFlushSchedulePool. This pool is used for periodic Buffer flushes") \ M(BackgroundDistributedSchedulePoolTask, "Number of active tasks in BackgroundDistributedSchedulePool. This pool is used for distributed sends that is done in background.") \ - M(BackgroundReplicatedSchedulePoolTask, "Number of active tasks in BackgroundReplicatedSchedulePoolTask. The pool is used by replicated database for executing DDL log coming from other replicas. 
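+    /// For extended storage definitions, the block below copies the current sorting/primary/sampling keys,
+    /// table TTL and storage settings from the in-memory metadata back into the stored CREATE query.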
One task corresponds to one replicated database") \ M(BackgroundMessageBrokerSchedulePoolTask, "Number of active tasks in BackgroundProcessingPool for message streaming") \ M(CacheDictionaryUpdateQueueBatches, "Number of 'batches' (a set of keys) in update queue in CacheDictionaries.") \ M(CacheDictionaryUpdateQueueKeys, "Exact number of keys in update queue in CacheDictionaries.") \ diff --git a/src/Databases/DatabaseOnDisk.cpp b/src/Databases/DatabaseOnDisk.cpp index 275f5bd3976..a03cb33591c 100644 --- a/src/Databases/DatabaseOnDisk.cpp +++ b/src/Databases/DatabaseOnDisk.cpp @@ -129,6 +129,60 @@ String getObjectDefinitionFromCreateQuery(const ASTPtr & query) return statement_buf.str(); } +void applyMetadataChangesToCreateQuery(const ASTPtr & query, const StorageInMemoryMetadata & metadata) +{ + auto & ast_create_query = query->as(); + + bool has_structure = ast_create_query.columns_list && ast_create_query.columns_list->columns; + if (ast_create_query.as_table_function && !has_structure) + throw Exception(ErrorCodes::NOT_IMPLEMENTED, "Cannot alter table {} because it was created AS table function" + " and doesn't have structure in metadata", backQuote(ast_create_query.table)); + + assert(has_structure); + ASTPtr new_columns = InterpreterCreateQuery::formatColumns(metadata.columns); + ASTPtr new_indices = InterpreterCreateQuery::formatIndices(metadata.secondary_indices); + ASTPtr new_constraints = InterpreterCreateQuery::formatConstraints(metadata.constraints); + + ast_create_query.columns_list->replace(ast_create_query.columns_list->columns, new_columns); + ast_create_query.columns_list->setOrReplace(ast_create_query.columns_list->indices, new_indices); + ast_create_query.columns_list->setOrReplace(ast_create_query.columns_list->constraints, new_constraints); + + if (metadata.select.select_query) + { + query->replace(ast_create_query.select, metadata.select.select_query); + } + + /// MaterializedView is one type of CREATE query without storage. 
+ if (ast_create_query.storage) + { + ASTStorage & storage_ast = *ast_create_query.storage; + + bool is_extended_storage_def + = storage_ast.partition_by || storage_ast.primary_key || storage_ast.order_by || storage_ast.sample_by || storage_ast.settings; + + if (is_extended_storage_def) + { + if (metadata.sorting_key.definition_ast) + storage_ast.set(storage_ast.order_by, metadata.sorting_key.definition_ast); + + if (metadata.primary_key.definition_ast) + storage_ast.set(storage_ast.primary_key, metadata.primary_key.definition_ast); + + if (metadata.sampling_key.definition_ast) + storage_ast.set(storage_ast.sample_by, metadata.sampling_key.definition_ast); + + if (metadata.table_ttl.definition_ast) + storage_ast.set(storage_ast.ttl_table, metadata.table_ttl.definition_ast); + else if (storage_ast.ttl_table != nullptr) /// TTL was removed + storage_ast.ttl_table = nullptr; + + if (metadata.settings_changes) + storage_ast.set(storage_ast.settings, metadata.settings_changes); + } + } +} + + DatabaseOnDisk::DatabaseOnDisk( const String & name, const String & metadata_path_, diff --git a/src/Databases/DatabaseOnDisk.h b/src/Databases/DatabaseOnDisk.h index b8cc1f60e66..60a50ac4539 100644 --- a/src/Databases/DatabaseOnDisk.h +++ b/src/Databases/DatabaseOnDisk.h @@ -25,6 +25,8 @@ std::pair createTableFromAST( */ String getObjectDefinitionFromCreateQuery(const ASTPtr & query); +void applyMetadataChangesToCreateQuery(const ASTPtr & query, const StorageInMemoryMetadata & metadata); + /* Class to provide basic operations with tables when metadata is stored on disk in .sql files. */ diff --git a/src/Databases/DatabaseOrdinary.cpp b/src/Databases/DatabaseOrdinary.cpp index 49bec28e4a1..d859578eb46 100644 --- a/src/Databases/DatabaseOrdinary.cpp +++ b/src/Databases/DatabaseOrdinary.cpp @@ -272,55 +272,7 @@ void DatabaseOrdinary::alterTable(const Context & context, const StorageID & tab 0, context.getSettingsRef().max_parser_depth); - auto & ast_create_query = ast->as(); - - bool has_structure = ast_create_query.columns_list && ast_create_query.columns_list->columns; - if (ast_create_query.as_table_function && !has_structure) - throw Exception(ErrorCodes::NOT_IMPLEMENTED, "Cannot alter table {} because it was created AS table function" - " and doesn't have structure in metadata", backQuote(table_name)); - - assert(has_structure); - ASTPtr new_columns = InterpreterCreateQuery::formatColumns(metadata.columns); - ASTPtr new_indices = InterpreterCreateQuery::formatIndices(metadata.secondary_indices); - ASTPtr new_constraints = InterpreterCreateQuery::formatConstraints(metadata.constraints); - - ast_create_query.columns_list->replace(ast_create_query.columns_list->columns, new_columns); - ast_create_query.columns_list->setOrReplace(ast_create_query.columns_list->indices, new_indices); - ast_create_query.columns_list->setOrReplace(ast_create_query.columns_list->constraints, new_constraints); - - if (metadata.select.select_query) - { - ast->replace(ast_create_query.select, metadata.select.select_query); - } - - /// MaterializedView is one type of CREATE query without storage. 
- if (ast_create_query.storage) - { - ASTStorage & storage_ast = *ast_create_query.storage; - - bool is_extended_storage_def - = storage_ast.partition_by || storage_ast.primary_key || storage_ast.order_by || storage_ast.sample_by || storage_ast.settings; - - if (is_extended_storage_def) - { - if (metadata.sorting_key.definition_ast) - storage_ast.set(storage_ast.order_by, metadata.sorting_key.definition_ast); - - if (metadata.primary_key.definition_ast) - storage_ast.set(storage_ast.primary_key, metadata.primary_key.definition_ast); - - if (metadata.sampling_key.definition_ast) - storage_ast.set(storage_ast.sample_by, metadata.sampling_key.definition_ast); - - if (metadata.table_ttl.definition_ast) - storage_ast.set(storage_ast.ttl_table, metadata.table_ttl.definition_ast); - else if (storage_ast.ttl_table != nullptr) /// TTL was removed - storage_ast.ttl_table = nullptr; - - if (metadata.settings_changes) - storage_ast.set(storage_ast.settings, metadata.settings_changes); - } - } + applyMetadataChangesToCreateQuery(ast, metadata); statement = getObjectDefinitionFromCreateQuery(ast); { diff --git a/src/Databases/DatabaseReplicated.cpp b/src/Databases/DatabaseReplicated.cpp index a134ba5dec7..4a6058afcd0 100644 --- a/src/Databases/DatabaseReplicated.cpp +++ b/src/Databases/DatabaseReplicated.cpp @@ -134,6 +134,7 @@ std::pair DatabaseReplicated::parseFullReplicaName(const String ClusterPtr DatabaseReplicated::getCluster() const { + /// TODO Maintain up-to-date Cluster and allow to use it in Distributed tables Strings hosts; Strings host_ids; @@ -149,6 +150,7 @@ ClusterPtr DatabaseReplicated::getCluster() const if (hosts.empty()) throw Exception(ErrorCodes::LOGICAL_ERROR, "No hosts found"); Int32 cver = stat.cversion; + std::sort(hosts.begin(), hosts.end()); std::vector futures; futures.reserve(hosts.size()); @@ -174,7 +176,6 @@ ClusterPtr DatabaseReplicated::getCluster() const assert(!hosts.empty()); assert(hosts.size() == host_ids.size()); - std::sort(hosts.begin(), hosts.end()); String current_shard = parseFullReplicaName(hosts.front()).first; std::vector shards; shards.emplace_back(); @@ -327,9 +328,7 @@ BlockIO DatabaseReplicated::propose(const ASTPtr & query, const Context & query_ if (query_context.getSettingsRef().distributed_ddl_task_timeout == 0) return io; - //FIXME need list of all replicas, we can obtain it from zk - Strings hosts_to_wait; - hosts_to_wait.emplace_back(getFullReplicaName()); + Strings hosts_to_wait = getZooKeeper()->getChildren(zookeeper_path + "/replicas"); auto stream = std::make_shared(node_path, entry, query_context, hosts_to_wait); io.in = std::move(stream); return io; @@ -338,7 +337,7 @@ BlockIO DatabaseReplicated::propose(const ASTPtr & query, const Context & query_ void DatabaseReplicated::recoverLostReplica(const ZooKeeperPtr & current_zookeeper, UInt32 from_snapshot) { - LOG_WARNING(log, "Will recover replica"); + //LOG_WARNING(log, "Will recover replica"); //FIXME drop old tables @@ -355,7 +354,7 @@ void DatabaseReplicated::recoverLostReplica(const ZooKeeperPtr & current_zookeep Context query_context = global_context; query_context.makeQueryContext(); - query_context.getClientInfo().query_kind = ClientInfo::QueryKind::REPLICATED_LOG_QUERY; + query_context.getClientInfo().query_kind = ClientInfo::QueryKind::SECONDARY_QUERY; query_context.setCurrentDatabase(database_name); query_context.setCurrentQueryId(""); // generate random query_id @@ -436,6 +435,8 @@ void DatabaseReplicated::renameTable(const Context & context, const String & tab { if (this != 
&to_database) throw Exception(ErrorCodes::NOT_IMPLEMENTED, "Moving tables between databases is not supported for Replicated engine"); + if (table_name == to_table_name) + throw Exception(ErrorCodes::INCORRECT_QUERY, "Cannot rename table to itself"); if (!isTableExist(table_name, context)) throw Exception(ErrorCodes::UNKNOWN_TABLE, "Table {} does not exist", table_name); if (exchange && !to_database.isTableExist(to_table_name, context)) diff --git a/src/Databases/DatabaseReplicatedWorker.cpp b/src/Databases/DatabaseReplicatedWorker.cpp index 748305922b7..dd9dc322f9d 100644 --- a/src/Databases/DatabaseReplicatedWorker.cpp +++ b/src/Databases/DatabaseReplicatedWorker.cpp @@ -48,7 +48,7 @@ void DatabaseReplicatedDDLWorker::initializeReplication() UInt32 our_log_ptr = parse(current_zookeeper->get(database->replica_path + "/log_ptr")); UInt32 max_log_ptr = parse(current_zookeeper->get(database->zookeeper_path + "/max_log_ptr")); UInt32 logs_to_keep = parse(current_zookeeper->get(database->zookeeper_path + "/logs_to_keep")); - if (our_log_ptr + logs_to_keep < max_log_ptr) + if (our_log_ptr == 0 || our_log_ptr + logs_to_keep < max_log_ptr) database->recoverLostReplica(current_zookeeper, 0); } diff --git a/src/Interpreters/ClientInfo.h b/src/Interpreters/ClientInfo.h index cacbed44c42..d2b7beb7d8c 100644 --- a/src/Interpreters/ClientInfo.h +++ b/src/Interpreters/ClientInfo.h @@ -42,7 +42,6 @@ public: NO_QUERY = 0, /// Uninitialized object. INITIAL_QUERY = 1, SECONDARY_QUERY = 2, /// Query that was initiated by another query for distributed or ON CLUSTER query execution. - REPLICATED_LOG_QUERY = 3, /// Query from replicated DDL log. }; diff --git a/src/Interpreters/Context.cpp b/src/Interpreters/Context.cpp index 83804125cd4..10619e3ad9a 100644 --- a/src/Interpreters/Context.cpp +++ b/src/Interpreters/Context.cpp @@ -79,7 +79,6 @@ namespace CurrentMetrics extern const Metric BackgroundSchedulePoolTask; extern const Metric BackgroundBufferFlushSchedulePoolTask; extern const Metric BackgroundDistributedSchedulePoolTask; - extern const Metric BackgroundReplicatedSchedulePoolTask; extern const Metric BackgroundMessageBrokerSchedulePoolTask; } diff --git a/src/Interpreters/Context.h b/src/Interpreters/Context.h index 906efcc6dba..636255d6190 100644 --- a/src/Interpreters/Context.h +++ b/src/Interpreters/Context.h @@ -622,7 +622,6 @@ public: BackgroundSchedulePool & getSchedulePool() const; BackgroundSchedulePool & getMessageBrokerSchedulePool() const; BackgroundSchedulePool & getDistributedSchedulePool() const; - BackgroundSchedulePool & getReplicatedSchedulePool() const; /// Has distributed_ddl configuration or not. bool hasDistributedDDL() const; diff --git a/src/Interpreters/DDLTask.cpp b/src/Interpreters/DDLTask.cpp index 9737167fa4c..9e379443364 100644 --- a/src/Interpreters/DDLTask.cpp +++ b/src/Interpreters/DDLTask.cpp @@ -296,7 +296,7 @@ String DatabaseReplicatedTask::getShardID() const std::unique_ptr DatabaseReplicatedTask::makeQueryContext(Context & from_context) { auto query_context = DDLTaskBase::makeQueryContext(from_context); - query_context->getClientInfo().query_kind = ClientInfo::QueryKind::REPLICATED_LOG_QUERY; //FIXME why do we need separate query kind? 
+ query_context->getClientInfo().query_kind = ClientInfo::QueryKind::SECONDARY_QUERY; query_context->setCurrentDatabase(database->getDatabaseName()); auto txn = std::make_shared(); @@ -340,7 +340,7 @@ void MetadataTransaction::commit() assert(state == CREATED); state = FAILED; current_zookeeper->multi(ops); - state = COMMITED; + state = COMMITTED; } } diff --git a/src/Interpreters/DDLTask.h b/src/Interpreters/DDLTask.h index 552f4919765..43d9fa1c0ae 100644 --- a/src/Interpreters/DDLTask.h +++ b/src/Interpreters/DDLTask.h @@ -144,7 +144,7 @@ struct MetadataTransaction enum State { CREATED, - COMMITED, + COMMITTED, FAILED }; @@ -154,10 +154,11 @@ struct MetadataTransaction bool is_initial_query; Coordination::Requests ops; - void addOps(Coordination::Requests & other_ops) + void moveOpsTo(Coordination::Requests & other_ops) { std::move(ops.begin(), ops.end(), std::back_inserter(other_ops)); ops.clear(); + state = COMMITTED; } void commit(); diff --git a/src/Interpreters/DDLWorker.cpp b/src/Interpreters/DDLWorker.cpp index f0cc3370211..665bacf9d6d 100644 --- a/src/Interpreters/DDLWorker.cpp +++ b/src/Interpreters/DDLWorker.cpp @@ -42,7 +42,6 @@ namespace DB namespace ErrorCodes { - extern const int NOT_IMPLEMENTED; extern const int LOGICAL_ERROR; extern const int TIMEOUT_EXCEEDED; extern const int UNFINISHED; @@ -51,7 +50,6 @@ namespace ErrorCodes extern const int CANNOT_ASSIGN_ALTER; extern const int CANNOT_ALLOCATE_MEMORY; extern const int MEMORY_LIMIT_EXCEEDED; - extern const int INCORRECT_QUERY; } diff --git a/src/Interpreters/InterpreterAlterQuery.cpp b/src/Interpreters/InterpreterAlterQuery.cpp index cee9b9083ea..402f05895bc 100644 --- a/src/Interpreters/InterpreterAlterQuery.cpp +++ b/src/Interpreters/InterpreterAlterQuery.cpp @@ -28,6 +28,7 @@ namespace ErrorCodes { extern const int LOGICAL_ERROR; extern const int INCORRECT_QUERY; + extern const int NOT_IMPLEMENTED; } @@ -49,7 +50,7 @@ BlockIO InterpreterAlterQuery::execute() auto table_id = context.resolveStorageID(alter, Context::ResolveOrdinary); DatabasePtr database = DatabaseCatalog::instance().getDatabase(table_id.database_name); - if (typeid_cast(database.get()) && context.getClientInfo().query_kind != ClientInfo::QueryKind::REPLICATED_LOG_QUERY) + if (typeid_cast(database.get()) && context.getClientInfo().query_kind != ClientInfo::QueryKind::SECONDARY_QUERY) { auto guard = DatabaseCatalog::instance().getDDLGuard(table_id.database_name, table_id.table_name); guard->releaseTableLock(); @@ -60,8 +61,6 @@ BlockIO InterpreterAlterQuery::execute() auto alter_lock = table->lockForAlter(context.getCurrentQueryId(), context.getSettingsRef().lock_acquire_timeout); auto metadata_snapshot = table->getInMemoryMetadataPtr(); - //FIXME commit MetadataTransaction for all ALTER kinds. Now its' implemented only for metadata alter. - /// Add default database to table identifiers that we can encounter in e.g. default expressions, /// mutation expression, etc. 
AddDefaultDatabaseVisitor visitor(table_id.getDatabaseName()); @@ -95,6 +94,14 @@ BlockIO InterpreterAlterQuery::execute() throw Exception("Wrong parameter type in ALTER query", ErrorCodes::LOGICAL_ERROR); } + if (typeid_cast(database.get())) + { + int command_types_count = !mutation_commands.empty() + !partition_commands.empty() + !live_view_commands.empty() + !alter_commands.empty(); + if (1 < command_types_count) + throw Exception(ErrorCodes::NOT_IMPLEMENTED, "For Replicated databases it's not allowed " + "to execute ALTERs of different types in single query"); + } + if (!mutation_commands.empty()) { MutationsInterpreter(table, metadata_snapshot, mutation_commands, context, false).validate(); diff --git a/src/Interpreters/InterpreterCreateQuery.cpp b/src/Interpreters/InterpreterCreateQuery.cpp index 376bf8417ff..bbe8526ae5b 100644 --- a/src/Interpreters/InterpreterCreateQuery.cpp +++ b/src/Interpreters/InterpreterCreateQuery.cpp @@ -149,7 +149,7 @@ BlockIO InterpreterCreateQuery::createDatabase(ASTCreateQuery & create) engine = makeASTFunction("Replicated", std::make_shared(fmt::format("/clickhouse/db/{}/", create.database)), std::make_shared("s1"), - std::make_shared("r1")); + std::make_shared("r" + toString(getpid()))); } engine->no_empty_args = true; @@ -573,8 +573,9 @@ InterpreterCreateQuery::TableProperties InterpreterCreateQuery::setProperties(AS /// Set the table engine if it was not specified explicitly. setEngine(create); - create.as_database.clear(); - create.as_table.clear(); + assert(as_database_saved.empty() && as_table_saved.empty()); + std::swap(create.as_database, as_database_saved); + std::swap(create.as_table, as_table_saved); return properties; } @@ -722,7 +723,7 @@ void InterpreterCreateQuery::assertOrSetUUID(ASTCreateQuery & create, const Data const auto * kind = create.is_dictionary ? "Dictionary" : "Table"; const auto * kind_upper = create.is_dictionary ? "DICTIONARY" : "TABLE"; - if (database->getEngineName() == "Replicated" && context.getClientInfo().query_kind == ClientInfo::QueryKind::REPLICATED_LOG_QUERY && !internal) + if (database->getEngineName() == "Replicated" && context.getClientInfo().query_kind == ClientInfo::QueryKind::SECONDARY_QUERY && !internal) { if (create.uuid == UUIDHelpers::Nil) throw Exception("Table UUID is not specified in DDL log", ErrorCodes::LOGICAL_ERROR); @@ -753,7 +754,6 @@ void InterpreterCreateQuery::assertOrSetUUID(ASTCreateQuery & create, const Data } else { - assert(context.getClientInfo().query_kind != ClientInfo::QueryKind::REPLICATED_LOG_QUERY); bool is_on_cluster = context.getClientInfo().query_kind == ClientInfo::QueryKind::SECONDARY_QUERY; if (create.uuid != UUIDHelpers::Nil && !is_on_cluster) throw Exception(ErrorCodes::INCORRECT_QUERY, @@ -850,7 +850,7 @@ BlockIO InterpreterCreateQuery::createTable(ASTCreateQuery & create) "Data directory {} must be inside {} to attach it", String(data_path), String(user_files)); } } - else if (create.attach && !create.attach_short_syntax && context.getClientInfo().query_kind != ClientInfo::QueryKind::REPLICATED_LOG_QUERY) + else if (create.attach && !create.attach_short_syntax && context.getClientInfo().query_kind != ClientInfo::QueryKind::SECONDARY_QUERY) { auto * log = &Poco::Logger::get("InterpreterCreateQuery"); LOG_WARNING(log, "ATTACH TABLE query with full table definition is not recommended: " @@ -874,16 +874,6 @@ BlockIO InterpreterCreateQuery::createTable(ASTCreateQuery & create) /// Set and retrieve list of columns, indices and constraints. Set table engine if needed. 
Rewrite query in canonical way. TableProperties properties = setProperties(create); - /// DDL log for replicated databases can not - /// contain the right database name for every replica - /// therefore for such queries the AST database - /// field is modified right before an actual execution - if (context.getClientInfo().query_kind == ClientInfo::QueryKind::REPLICATED_LOG_QUERY) - { - create.database = current_database; - } - - //TODO make code better if possible DatabasePtr database; bool need_add_to_database = !create.temporary; if (need_add_to_database) @@ -893,7 +883,7 @@ BlockIO InterpreterCreateQuery::createTable(ASTCreateQuery & create) { auto guard = DatabaseCatalog::instance().getDDLGuard(create.database, create.table); database = DatabaseCatalog::instance().getDatabase(create.database); - if (typeid_cast(database.get()) && context.getClientInfo().query_kind != ClientInfo::QueryKind::REPLICATED_LOG_QUERY) + if (typeid_cast(database.get()) && context.getClientInfo().query_kind != ClientInfo::QueryKind::SECONDARY_QUERY) { assertOrSetUUID(create, database); guard->releaseTableLock(); @@ -930,9 +920,6 @@ bool InterpreterCreateQuery::doCreateTable(ASTCreateQuery & create, guard = DatabaseCatalog::instance().getDDLGuard(create.database, create.table); database = DatabaseCatalog::instance().getDatabase(create.database); - //TODO do we need it? - if (database->getEngineName() == "Replicated" && context.getClientInfo().query_kind != ClientInfo::QueryKind::REPLICATED_LOG_QUERY) - throw Exception(ErrorCodes::UNKNOWN_DATABASE, "Database was renamed"); assertOrSetUUID(create, database); /// Table can be created before or it can be created concurrently in another thread, while we were waiting in DDLGuard. @@ -1107,9 +1094,10 @@ BlockIO InterpreterCreateQuery::createDictionary(ASTCreateQuery & create) auto guard = DatabaseCatalog::instance().getDDLGuard(database_name, dictionary_name); DatabasePtr database = DatabaseCatalog::instance().getDatabase(database_name); - if (typeid_cast(database.get()) && context.getClientInfo().query_kind != ClientInfo::QueryKind::REPLICATED_LOG_QUERY) + if (typeid_cast(database.get()) && context.getClientInfo().query_kind != ClientInfo::QueryKind::SECONDARY_QUERY) { - assertOrSetUUID(create, database); + if (!create.attach) + assertOrSetUUID(create, database); guard->releaseTableLock(); return typeid_cast(database.get())->propose(query_ptr, context); } @@ -1266,15 +1254,14 @@ AccessRightsElements InterpreterCreateQuery::getRequiredAccess() const return required_access; } -void InterpreterCreateQuery::extendQueryLogElemImpl(QueryLogElement & elem, const ASTPtr & ast, const Context &) const +void InterpreterCreateQuery::extendQueryLogElemImpl(QueryLogElement & elem, const ASTPtr &, const Context &) const { - const auto & create = ast->as(); elem.query_kind = "Create"; - if (!create.as_table.empty()) + if (!as_table_saved.empty()) { - String database = backQuoteIfNeed(create.as_database.empty() ? context.getCurrentDatabase() : create.as_database); + String database = backQuoteIfNeed(as_database_saved.empty() ? context.getCurrentDatabase() : as_database_saved); elem.query_databases.insert(database); - elem.query_tables.insert(database + "." + backQuoteIfNeed(create.as_table)); + elem.query_tables.insert(database + "." 
+ backQuoteIfNeed(as_table_saved)); } } diff --git a/src/Interpreters/InterpreterCreateQuery.h b/src/Interpreters/InterpreterCreateQuery.h index c109b0b7760..d88357fe412 100644 --- a/src/Interpreters/InterpreterCreateQuery.h +++ b/src/Interpreters/InterpreterCreateQuery.h @@ -95,5 +95,8 @@ private: /// Is this an internal query - not from the user. bool internal = false; bool force_attach = false; + + mutable String as_database_saved; + mutable String as_table_saved; }; } diff --git a/src/Interpreters/InterpreterDropQuery.cpp b/src/Interpreters/InterpreterDropQuery.cpp index e6943f06e06..ae76e8efd46 100644 --- a/src/Interpreters/InterpreterDropQuery.cpp +++ b/src/Interpreters/InterpreterDropQuery.cpp @@ -129,7 +129,7 @@ BlockIO InterpreterDropQuery::executeToTableImpl(const ASTDropQuery & query, Dat /// Prevents recursive drop from drop database query. The original query must specify a table. bool is_drop_or_detach_database = query_ptr->as()->table.empty(); bool is_replicated_ddl_query = typeid_cast(database.get()) && - context.getClientInfo().query_kind != ClientInfo::QueryKind::REPLICATED_LOG_QUERY && + context.getClientInfo().query_kind != ClientInfo::QueryKind::SECONDARY_QUERY && !is_drop_or_detach_database; if (is_replicated_ddl_query) { @@ -137,6 +137,13 @@ BlockIO InterpreterDropQuery::executeToTableImpl(const ASTDropQuery & query, Dat throw Exception(ErrorCodes::INCORRECT_QUERY, "DETACH TABLE is not allowed for Replicated databases. " "Use DETACH TABLE PERMANENTLY or SYSTEM RESTART REPLICA"); + if (query.kind == ASTDropQuery::Kind::Detach) + context.checkAccess(table->isView() ? AccessType::DROP_VIEW : AccessType::DROP_TABLE, table_id); + else if (query.kind == ASTDropQuery::Kind::Truncate) + context.checkAccess(AccessType::TRUNCATE, table_id); + else if (query.kind == ASTDropQuery::Kind::Drop) + context.checkAccess(table->isView() ? 
AccessType::DROP_VIEW : AccessType::DROP_TABLE, table_id); + ddl_guard->releaseTableLock(); table.reset(); return typeid_cast(database.get())->propose(query.clone(), context); @@ -214,13 +221,15 @@ BlockIO InterpreterDropQuery::executeToDictionary( bool is_drop_or_detach_database = query_ptr->as()->table.empty(); bool is_replicated_ddl_query = typeid_cast(database.get()) && - context.getClientInfo().query_kind != ClientInfo::QueryKind::REPLICATED_LOG_QUERY && + context.getClientInfo().query_kind != ClientInfo::QueryKind::SECONDARY_QUERY && !is_drop_or_detach_database; if (is_replicated_ddl_query) { if (kind == ASTDropQuery::Kind::Detach) throw Exception(ErrorCodes::INCORRECT_QUERY, "DETACH DICTIONARY is not allowed for Replicated databases."); + context.checkAccess(AccessType::DROP_DICTIONARY, database_name, dictionary_name); + ddl_guard->releaseTableLock(); return typeid_cast(database.get())->propose(query_ptr, context); } diff --git a/src/Interpreters/InterpreterRenameQuery.cpp b/src/Interpreters/InterpreterRenameQuery.cpp index 5bfc144e014..b9d7faac73c 100644 --- a/src/Interpreters/InterpreterRenameQuery.cpp +++ b/src/Interpreters/InterpreterRenameQuery.cpp @@ -80,7 +80,7 @@ BlockIO InterpreterRenameQuery::executeToTables(const ASTRenameQuery & rename, c database_catalog.assertTableDoesntExist(StorageID(elem.to_database_name, elem.to_table_name), context); DatabasePtr database = database_catalog.getDatabase(elem.from_database_name); - if (typeid_cast(database.get()) && context.getClientInfo().query_kind != ClientInfo::QueryKind::REPLICATED_LOG_QUERY) + if (typeid_cast(database.get()) && context.getClientInfo().query_kind != ClientInfo::QueryKind::SECONDARY_QUERY) { if (1 < descriptions.size()) throw Exception(ErrorCodes::NOT_IMPLEMENTED, "Database {} is Replicated, " diff --git a/src/Storages/MergeTree/registerStorageMergeTree.cpp b/src/Storages/MergeTree/registerStorageMergeTree.cpp index 1d68f788a42..8377e37b07a 100644 --- a/src/Storages/MergeTree/registerStorageMergeTree.cpp +++ b/src/Storages/MergeTree/registerStorageMergeTree.cpp @@ -454,7 +454,7 @@ static StoragePtr create(const StorageFactory::Arguments & args) /// Allow implicit {uuid} macros only for zookeeper_path in ON CLUSTER queries bool is_on_cluster = args.local_context.getClientInfo().query_kind == ClientInfo::QueryKind::SECONDARY_QUERY; - bool is_replicated_database = args.local_context.getClientInfo().query_kind == ClientInfo::QueryKind::REPLICATED_LOG_QUERY && + bool is_replicated_database = args.local_context.getClientInfo().query_kind == ClientInfo::QueryKind::SECONDARY_QUERY && DatabaseCatalog::instance().getDatabase(args.table_id.database_name)->getEngineName() == "Replicated"; bool allow_uuid_macro = is_on_cluster || is_replicated_database || args.query.attach; diff --git a/src/Storages/StorageReplicatedMergeTree.cpp b/src/Storages/StorageReplicatedMergeTree.cpp index a4b83e365d1..3295be311d1 100644 --- a/src/Storages/StorageReplicatedMergeTree.cpp +++ b/src/Storages/StorageReplicatedMergeTree.cpp @@ -4284,24 +4284,12 @@ void StorageReplicatedMergeTree::alter( if (auto txn = query_context.getMetadataTransaction()) { - txn->addOps(ops); + txn->moveOpsTo(ops); /// NOTE: IDatabase::alterTable(...) is called when executing ALTER_METADATA queue entry without query context, /// so we have to update metadata of DatabaseReplicated here. - /// It also may cause "Table columns structure in ZooKeeper is different" error on server startup - /// even for Ordinary and Atomic databases. 
String metadata_zk_path = txn->zookeeper_path + "/metadata/" + escapeForFileName(table_id.table_name); auto ast = DatabaseCatalog::instance().getDatabase(table_id.database_name)->getCreateTableQuery(table_id.table_name, query_context); - auto & ast_create_query = ast->as(); - - //FIXME copy-paste - ASTPtr new_columns = InterpreterCreateQuery::formatColumns(future_metadata.columns); - ASTPtr new_indices = InterpreterCreateQuery::formatIndices(future_metadata.secondary_indices); - ASTPtr new_constraints = InterpreterCreateQuery::formatConstraints(future_metadata.constraints); - - ast_create_query.columns_list->replace(ast_create_query.columns_list->columns, new_columns); - ast_create_query.columns_list->setOrReplace(ast_create_query.columns_list->indices, new_indices); - ast_create_query.columns_list->setOrReplace(ast_create_query.columns_list->constraints, new_constraints); - + applyMetadataChangesToCreateQuery(ast, future_metadata); ops.emplace_back(zkutil::makeSetRequest(metadata_zk_path, getObjectDefinitionFromCreateQuery(ast), -1)); } @@ -4450,7 +4438,7 @@ void StorageReplicatedMergeTree::dropPartition(const ASTPtr & partition, bool de else { String partition_id = getPartitionIDFromQuery(partition, query_context); - did_drop = dropAllPartsInPartition(*zookeeper, partition_id, entry, detach); + did_drop = dropAllPartsInPartition(*zookeeper, partition_id, entry, query_context, detach); } if (did_drop) @@ -4474,7 +4462,7 @@ void StorageReplicatedMergeTree::dropPartition(const ASTPtr & partition, bool de void StorageReplicatedMergeTree::truncate( - const ASTPtr &, const StorageMetadataPtr &, const Context &, TableExclusiveLockHolder & table_lock) + const ASTPtr &, const StorageMetadataPtr &, const Context & query_context, TableExclusiveLockHolder & table_lock) { table_lock.release(); /// Truncate is done asynchronously. 
@@ -4490,7 +4478,7 @@ void StorageReplicatedMergeTree::truncate( { LogEntry entry; - if (dropAllPartsInPartition(*zookeeper, partition_id, entry, false)) + if (dropAllPartsInPartition(*zookeeper, partition_id, entry, query_context, false)) waitForAllReplicasToProcessLogEntry(entry); } } @@ -5274,6 +5262,9 @@ void StorageReplicatedMergeTree::mutate(const MutationCommands & commands, const requests.emplace_back(zkutil::makeCreateRequest( mutations_path + "/", mutation_entry.toString(), zkutil::CreateMode::PersistentSequential)); + if (auto txn = query_context.getMetadataTransaction()) + txn->moveOpsTo(requests); + Coordination::Responses responses; Coordination::Error rc = zookeeper->tryMulti(requests, responses); @@ -5775,6 +5766,9 @@ void StorageReplicatedMergeTree::replacePartitionFrom( } } + if (auto txn = context.getMetadataTransaction()) + txn->moveOpsTo(ops); + ops.emplace_back(zkutil::makeSetRequest(zookeeper_path + "/log", "", -1)); /// Just update version ops.emplace_back(zkutil::makeCreateRequest(zookeeper_path + "/log/log-", entry.toString(), zkutil::CreateMode::PersistentSequential)); @@ -6243,7 +6237,7 @@ bool StorageReplicatedMergeTree::dropPart( } bool StorageReplicatedMergeTree::dropAllPartsInPartition( - zkutil::ZooKeeper & zookeeper, String & partition_id, LogEntry & entry, bool detach) + zkutil::ZooKeeper & zookeeper, String & partition_id, LogEntry & entry, const Context & query_context, bool detach) { MergeTreePartInfo drop_range_info; if (!getFakePartCoveringAllPartsInPartition(partition_id, drop_range_info)) @@ -6275,6 +6269,8 @@ bool StorageReplicatedMergeTree::dropAllPartsInPartition( Coordination::Requests ops; ops.emplace_back(zkutil::makeCreateRequest(zookeeper_path + "/log/log-", entry.toString(), zkutil::CreateMode::PersistentSequential)); ops.emplace_back(zkutil::makeSetRequest(zookeeper_path + "/log", "", -1)); /// Just update version. 
+ if (auto txn = query_context.getMetadataTransaction()) + txn->moveOpsTo(ops); Coordination::Responses responses = zookeeper.multi(ops); String log_znode_path = dynamic_cast(*responses.front()).path_created; diff --git a/src/Storages/StorageReplicatedMergeTree.h b/src/Storages/StorageReplicatedMergeTree.h index 6db05294b63..a1a70ada9b2 100644 --- a/src/Storages/StorageReplicatedMergeTree.h +++ b/src/Storages/StorageReplicatedMergeTree.h @@ -134,7 +134,7 @@ public: */ void drop() override; - void truncate(const ASTPtr &, const StorageMetadataPtr &, const Context &, TableExclusiveLockHolder &) override; + void truncate(const ASTPtr &, const StorageMetadataPtr &, const Context & query_context, TableExclusiveLockHolder &) override; void checkTableCanBeRenamed() const override; @@ -577,7 +577,7 @@ private: bool dropPart(zkutil::ZooKeeperPtr & zookeeper, String part_name, LogEntry & entry, bool detach, bool throw_if_noop); bool dropAllPartsInPartition( - zkutil::ZooKeeper & zookeeper, String & partition_id, LogEntry & entry, bool detach); + zkutil::ZooKeeper & zookeeper, String & partition_id, LogEntry & entry, const Context & query_context, bool detach); // Partition helpers void dropPartition(const ASTPtr & partition, bool detach, bool drop_part, const Context & query_context, bool throw_if_noop) override; diff --git a/src/Storages/System/StorageSystemClusters.cpp b/src/Storages/System/StorageSystemClusters.cpp index 62ad1c5150f..7e16deb6d22 100644 --- a/src/Storages/System/StorageSystemClusters.cpp +++ b/src/Storages/System/StorageSystemClusters.cpp @@ -41,7 +41,7 @@ void StorageSystemClusters::fillData(MutableColumns & res_columns, const Context } } -void StorageSystemClusters::writeCluster(MutableColumns & res_columns, const NameAndCluster & name_and_cluster) const +void StorageSystemClusters::writeCluster(MutableColumns & res_columns, const NameAndCluster & name_and_cluster) { const String & cluster_name = name_and_cluster.first; const ClusterPtr & cluster = name_and_cluster.second; diff --git a/src/Storages/System/StorageSystemClusters.h b/src/Storages/System/StorageSystemClusters.h index 68282f1b1fe..4f2a843999f 100644 --- a/src/Storages/System/StorageSystemClusters.h +++ b/src/Storages/System/StorageSystemClusters.h @@ -29,7 +29,7 @@ protected: using NameAndCluster = std::pair>; void fillData(MutableColumns & res_columns, const Context & context, const SelectQueryInfo & query_info) const override; - void writeCluster(MutableColumns & res_columns, const NameAndCluster & name_and_cluster) const; + static void writeCluster(MutableColumns & res_columns, const NameAndCluster & name_and_cluster); }; } diff --git a/tests/integration/test_replicated_database/test.py b/tests/integration/test_replicated_database/test.py index 2471228b55e..2a5a7f4716e 100644 --- a/tests/integration/test_replicated_database/test.py +++ b/tests/integration/test_replicated_database/test.py @@ -147,7 +147,16 @@ def test_alters_from_different_replicas(started_cluster): main_node.query("SYSTEM FLUSH DISTRIBUTED testdb.dist") main_node.query("ALTER TABLE testdb.concurrent_test UPDATE StartDate = addYears(StartDate, 1) WHERE 1") - main_node.query("ALTER TABLE testdb.concurrent_test DELETE WHERE UserID % 2") + res = main_node.query("ALTER TABLE testdb.concurrent_test DELETE WHERE UserID % 2") + assert "shard1|replica1" in res and "shard1|replica2" in res and "shard1|replica3" in res + assert "shard2|replica1" in res and "shard2|replica2" in res + + expected = "1\t1\tmain_node\n" \ + "1\t2\tdummy_node\n" \ + 
"1\t3\tcompeting_node\n" \ + "2\t1\tsnapshotting_node\n" \ + "2\t2\tsnapshot_recovering_node\n" + assert main_node.query("SELECT shard_num, replica_num, host_name FROM system.clusters WHERE cluster='testdb'") == expected # test_drop_and_create_replica main_node.query("DROP DATABASE testdb") diff --git a/tests/queries/0_stateless/01018_ddl_dictionaries_concurrent_requrests.sh b/tests/queries/0_stateless/01018_ddl_dictionaries_concurrent_requrests.sh index bc13e44934a..025fe51e2a9 100755 --- a/tests/queries/0_stateless/01018_ddl_dictionaries_concurrent_requrests.sh +++ b/tests/queries/0_stateless/01018_ddl_dictionaries_concurrent_requrests.sh @@ -113,8 +113,8 @@ timeout $TIMEOUT bash -c thread7 2> /dev/null & wait $CLICKHOUSE_CLIENT -q "SELECT 'Still alive'" -$CLICKHOUSE_CLIENT -q "ATTACH DICTIONARY database_for_dict.dict1" -$CLICKHOUSE_CLIENT -q "ATTACH DICTIONARY database_for_dict.dict2" +$CLICKHOUSE_CLIENT -q "ATTACH DICTIONARY IF NOT EXISTS database_for_dict.dict1" +$CLICKHOUSE_CLIENT -q "ATTACH DICTIONARY IF NOT EXISTS database_for_dict.dict2" $CLICKHOUSE_CLIENT -n -q " DROP TABLE table_for_dict1; diff --git a/tests/queries/skip_list.json b/tests/queries/skip_list.json index 4c6927f575a..1c5136b6bde 100644 --- a/tests/queries/skip_list.json +++ b/tests/queries/skip_list.json @@ -103,6 +103,12 @@ "memory_tracking", /// FIXME remove it before merge "memory_tracking", "memory_usage", + "01188_attach_table_from_pat", + "01110_dictionary_layout_without_arguments", + "01018_ddl_dictionaries_create", + "01018_ddl_dictionaries_select", + "01414_freeze_does_not_prevent_alters", + "01018_ddl_dictionaries_bad_queries", "01686_rocksdb", "01550_mutation_subquery", "01070_mutations_with_dependencies", From fd396d1d36600acb6efedb8bdb957e3359454ef7 Mon Sep 17 00:00:00 2001 From: Vitaliy Zakaznikov Date: Mon, 8 Feb 2021 16:08:32 -0500 Subject: [PATCH 088/381] Starting to add documentation for live views. --- .../sql-reference/statements/create/view.md | 79 +++++++++++++++++++ docs/en/sql-reference/statements/watch.md | 68 ++++++++++++++++ 2 files changed, 147 insertions(+) create mode 100644 docs/en/sql-reference/statements/watch.md diff --git a/docs/en/sql-reference/statements/create/view.md b/docs/en/sql-reference/statements/create/view.md index 4370735b8d9..a9fe48ed6ac 100644 --- a/docs/en/sql-reference/statements/create/view.md +++ b/docs/en/sql-reference/statements/create/view.md @@ -62,3 +62,82 @@ The execution of [ALTER](../../../sql-reference/statements/alter/index.md) queri Views look the same as normal tables. For example, they are listed in the result of the `SHOW TABLES` query. There isn’t a separate query for deleting views. To delete a view, use [DROP TABLE](../../../sql-reference/statements/drop.md). + +## Live View (Experimental) {#live-view) + +!!! important "Important" + This is an experimental feature that may change in backwards-incompatible ways in the future releases. + Enable usage of live views and `WATCH` query using `set allow_experimental_live_view = 1`. + + +```sql +CREATE LIVE VIEW [IF NOT EXISTS] [db.]table_name [WITH [TIMEOUT [value_in_sec] [AND]] [REFRESH [value_in_sec]]] AS SELECT ... +``` + +Live views store result of the corresponding [SELECT](../../../sql-reference/statements/select/index.md) query +and are updated any time the result of the query changes. Query result as well as partial result +needed to combine with new data are stored in memory providing increased performance +for repeated queries. 
Live views can provide push notifications +when query result changes using the [WATCH](../../../sql-reference/statements/watch.md) query. + +Live views are triggered by insert into the innermost table specified in the query. + +!!! info "Note" + [Table function](../../../sql-reference/table-functions/index.md) is not supported as the innermost table. + +!!! info "Note" + Tables that do not have inserts such as a [dictionary](../../../sql-reference/dictionaries/index.md) + or a [system table](../../../operations/system-tables/index.md) + will not trigger a live view. See [WITH REFRESH](#live-view-with-refresh) to enable periodic + updates of a live view. + +Live views work similarly to how a query in a distributed table works. But instead of combining partial results +from different servers they combine partial result from current data with partial result from the new data. +When a live view query includes a subquery then the cached partial result is only stored for the innermost subquery. + +!!! info "Note" + Only queries where one can combine partial result from the old data plus partial result from the new data will work. + Live view will not work for queries that require the complete data set to compute the final result. + +You can execute [SELECT](../../../sql-reference/statements/select/index.md) query on a live view +in the same way as for any regular view or a table. If the query result is cached +it will return the result immediately without running the stored query on the underlying tables. + +### Force Refresh {#live-view-alter-refresh} + +You can force live view refresh using the `ALTER LIVE VIEW [db.]table_name REFRESH` statement. + +### With Timeout {#live-view-with-timeout} + +When a live view is create with a `WITH TIMEOUT` clause then the live view will be dropped automatically after the specified +number of seconds elapse since the end of the last [WATCH](../../../sql-reference/statements/watch.md) query. + +```sql +CREATE LIVE VIEW [db.]table_name WITH TIMEOUT value_in_sec AS SELECT ... +``` + +### With Refresh {#live-view-with-refresh} + +When a live view is created with a `WITH REFRESH` clause then it will be automatically refreshed +after the specified number of seconds elapse since the last refresh or trigger. + +```sql +CREATE LIVE VIEW [db.]table_name WITH REFRESH value_in_sec AS SELECT ... +``` + +You can combine `WITH TIMEOUT` and `WITH REFRESH` clauses using an `AND`. + +```sql +CREATE LIVE VIEW [db.]table_name WITH TIMEOUT value_in_sec AND REFRESH value_in_sec AS SELECT ... +``` + +### Settings {#live-view-settings} + +You can use the following settings to control the behaviour of live views. + +- `allow_experimental_live_view` - enable live views. Default `0`. +- `live_view_heartbeat_interval` - the heartbeat interval in seconds to indicate live query is alive +- `max_live_view_insert_blocks_before_refresh` - maximum number of inserted blocks after which + mergeable blocks are dropped and query is re-executed. Default `64`. +- `temporary_live_view_timeout` - interval after which live view with timeout is deleted. Default `0`. +- `periodic_live_view_refresh` - interval after which periodically refreshed live view is forced to refresh. Default `0`. diff --git a/docs/en/sql-reference/statements/watch.md b/docs/en/sql-reference/statements/watch.md new file mode 100644 index 00000000000..b09147f15eb --- /dev/null +++ b/docs/en/sql-reference/statements/watch.md @@ -0,0 +1,68 @@ +--- +toc_priority: 53 +toc_title: WATCH +--- + +# WATCH Statement {#watch} + +!!! 
important "Important" + This is an experimental feature that may change in backwards-incompatible ways in the future releases. + Enable live views and `WATCH` query using `set allow_experimental_live_view = 1`. + + +``` sql +WATCH [db.]live_view +[EVENTS] +[LIMIT n] +[FORMAT format] +``` + +The `WATCH` query performs continuous data retrieval from a [live view](./create/view.md#live-view) table. +Unless the `LIMIT` clause is specified it provides an infinite stream of query results +from a live view. + +```sql +WATCH [db.]live_view +``` + +The virtual `_version` column in the query result indicates the current result version. + +By default, the requested data is returned to the client, while in conjunction with [INSERT INTO](../../../sql-reference/statements/insert-into.md) +it can be forwarded to a different table. + +```sql +INSERT INTO [db.]table WATCH [db.]live_view ... +``` + +## EVENTS Clause + +The `EVENTS` clause can be used to obtain a short form of the `WATCH` query +where instead of the query result, you will just get the latest query +result version. + +```sql +WATCH [db.]live_view EVENTS LIMIT 1 +``` + +## LIMIT Clause {#limit-clause} + +The `LIMIT n` clause species the number of updates the `WATCH` query should wait +for before terminating. The value of `0` +indicates that the `WATCH` query should not wait for any new query results +and therefore will return immediately once query is evaluated. + +```sql +WATCH [db.]live_view LIMIT 1 +``` + +## FORMAT Clause {#format-clause} + +The `FORMAT` clause works the same way as for the [SELECT](./select/index.md#format-clause). + +### JSONEachRowWithProgress + +The `JSONEachRowWithProgress` format should be used when watching [live view](./create/view.md#live-view) +tables over the HTTP interface. The progress messages will be added to the output +to keep the long-lived HTTP connection alive until the query result changes. +The interval between progress messages is controlled using the [live_view_heartbeat_interval](./create/view.md#live-view-settings) setting. + From 2e113a0faf9f264853289d9e2ba61ea7913a4d4a Mon Sep 17 00:00:00 2001 From: Vitaliy Zakaznikov Date: Mon, 8 Feb 2021 16:24:05 -0500 Subject: [PATCH 089/381] Update to live view docs. --- .../en/sql-reference/statements/create/view.md | 8 ++++---- docs/en/sql-reference/statements/watch.md | 18 +++++++++--------- 2 files changed, 13 insertions(+), 13 deletions(-) diff --git a/docs/en/sql-reference/statements/create/view.md b/docs/en/sql-reference/statements/create/view.md index a9fe48ed6ac..381dbbfe08a 100644 --- a/docs/en/sql-reference/statements/create/view.md +++ b/docs/en/sql-reference/statements/create/view.md @@ -136,8 +136,8 @@ CREATE LIVE VIEW [db.]table_name WITH TIMEOUT value_in_sec AND REFRESH value_in_ You can use the following settings to control the behaviour of live views. - `allow_experimental_live_view` - enable live views. Default `0`. -- `live_view_heartbeat_interval` - the heartbeat interval in seconds to indicate live query is alive +- `live_view_heartbeat_interval` - the heartbeat interval in seconds to indicate live query is alive. Default `15` seconds. - `max_live_view_insert_blocks_before_refresh` - maximum number of inserted blocks after which - mergeable blocks are dropped and query is re-executed. Default `64`. -- `temporary_live_view_timeout` - interval after which live view with timeout is deleted. Default `0`. -- `periodic_live_view_refresh` - interval after which periodically refreshed live view is forced to refresh. Default `0`. 
+ mergeable blocks are dropped and query is re-executed. Default `64` inserts. +- `temporary_live_view_timeout` - interval after which live view with timeout is deleted. Default `5` seconds. +- `periodic_live_view_refresh` - interval after which periodically refreshed live view is forced to refresh. Default `60` seconds. diff --git a/docs/en/sql-reference/statements/watch.md b/docs/en/sql-reference/statements/watch.md index b09147f15eb..5cf10cdd5a0 100644 --- a/docs/en/sql-reference/statements/watch.md +++ b/docs/en/sql-reference/statements/watch.md @@ -27,7 +27,7 @@ WATCH [db.]live_view The virtual `_version` column in the query result indicates the current result version. -By default, the requested data is returned to the client, while in conjunction with [INSERT INTO](../../../sql-reference/statements/insert-into.md) +By default, the requested data is returned to the client, while in conjunction with [INSERT INTO](../../sql-reference/statements/insert-into.md) it can be forwarded to a different table. ```sql @@ -37,7 +37,7 @@ INSERT INTO [db.]table WATCH [db.]live_view ... ## EVENTS Clause The `EVENTS` clause can be used to obtain a short form of the `WATCH` query -where instead of the query result, you will just get the latest query +where instead of the query result you will just get the latest query result version. ```sql @@ -47,7 +47,8 @@ WATCH [db.]live_view EVENTS LIMIT 1 ## LIMIT Clause {#limit-clause} The `LIMIT n` clause species the number of updates the `WATCH` query should wait -for before terminating. The value of `0` +for before terminating. By default there is no limit on the number of updates and therefore +the query will not terminate. The value of `0` indicates that the `WATCH` query should not wait for any new query results and therefore will return immediately once query is evaluated. @@ -59,10 +60,9 @@ WATCH [db.]live_view LIMIT 1 The `FORMAT` clause works the same way as for the [SELECT](./select/index.md#format-clause). -### JSONEachRowWithProgress - -The `JSONEachRowWithProgress` format should be used when watching [live view](./create/view.md#live-view) -tables over the HTTP interface. The progress messages will be added to the output -to keep the long-lived HTTP connection alive until the query result changes. -The interval between progress messages is controlled using the [live_view_heartbeat_interval](./create/view.md#live-view-settings) setting. +!!! info "Note" + The [JSONEachRowWithProgress](../../interfaces/formats/#jsoneachrowwithprogress) format should be used when watching [live view](./create/view.md#live-view) + tables over the HTTP interface. The progress messages will be added to the output + to keep the long-lived HTTP connection alive until the query result changes. + The interval between progress messages is controlled using the [live_view_heartbeat_interval](./create/view.md#live-view-settings) setting. From d7f5ea784096ae0fe0049c9e2dcefff1ca059cfc Mon Sep 17 00:00:00 2001 From: Vitaliy Zakaznikov Date: Mon, 8 Feb 2021 16:25:07 -0500 Subject: [PATCH 090/381] Adding experimental note to the watch query. 
--- docs/en/sql-reference/statements/watch.md | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/docs/en/sql-reference/statements/watch.md b/docs/en/sql-reference/statements/watch.md index 5cf10cdd5a0..b89cc63375c 100644 --- a/docs/en/sql-reference/statements/watch.md +++ b/docs/en/sql-reference/statements/watch.md @@ -3,7 +3,7 @@ toc_priority: 53 toc_title: WATCH --- -# WATCH Statement {#watch} +# WATCH Statement (Experimental) {#watch} !!! important "Important" This is an experimental feature that may change in backwards-incompatible ways in the future releases. From cd097e250b1544cceb487f4e950243a1c039269d Mon Sep 17 00:00:00 2001 From: Vitaliy Zakaznikov Date: Mon, 8 Feb 2021 16:29:47 -0500 Subject: [PATCH 091/381] Fix type in live view reference. --- docs/en/sql-reference/statements/create/view.md | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/docs/en/sql-reference/statements/create/view.md b/docs/en/sql-reference/statements/create/view.md index 381dbbfe08a..0fdb36249ac 100644 --- a/docs/en/sql-reference/statements/create/view.md +++ b/docs/en/sql-reference/statements/create/view.md @@ -63,7 +63,7 @@ Views look the same as normal tables. For example, they are listed in the result There isn’t a separate query for deleting views. To delete a view, use [DROP TABLE](../../../sql-reference/statements/drop.md). -## Live View (Experimental) {#live-view) +## Live View (Experimental) {#live-view} !!! important "Important" This is an experimental feature that may change in backwards-incompatible ways in the future releases. From 52e9b9d73974d3f4b277fb0f37d14b1a0c29e1e9 Mon Sep 17 00:00:00 2001 From: Vitaliy Zakaznikov Date: Mon, 8 Feb 2021 16:56:25 -0500 Subject: [PATCH 092/381] Minor updates to the live view docs. --- .../sql-reference/statements/create/view.md | 29 ++++++++++++------- 1 file changed, 19 insertions(+), 10 deletions(-) diff --git a/docs/en/sql-reference/statements/create/view.md b/docs/en/sql-reference/statements/create/view.md index 0fdb36249ac..5a5c77534fb 100644 --- a/docs/en/sql-reference/statements/create/view.md +++ b/docs/en/sql-reference/statements/create/view.md @@ -103,6 +103,10 @@ You can execute [SELECT](../../../sql-reference/statements/select/index.md) quer in the same way as for any regular view or a table. If the query result is cached it will return the result immediately without running the stored query on the underlying tables. +```sql +SELECT * FROM [db.]live_view WHERE ... +``` + ### Force Refresh {#live-view-alter-refresh} You can force live view refresh using the `ALTER LIVE VIEW [db.]table_name REFRESH` statement. @@ -110,34 +114,39 @@ You can force live view refresh using the `ALTER LIVE VIEW [db.]table_name REFRE ### With Timeout {#live-view-with-timeout} When a live view is create with a `WITH TIMEOUT` clause then the live view will be dropped automatically after the specified -number of seconds elapse since the end of the last [WATCH](../../../sql-reference/statements/watch.md) query. +number of seconds elapse since the end of the last [WATCH](../../../sql-reference/statements/watch.md) query +that was watching the live view. ```sql -CREATE LIVE VIEW [db.]table_name WITH TIMEOUT value_in_sec AS SELECT ... +CREATE LIVE VIEW [db.]table_name WITH TIMEOUT [value_in_sec] AS SELECT ... ``` +If the timeout value is not specified then the value specified by the `temporary_live_view_timeout` setting is used. 
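As a minimal usage sketch (the `lv` view name and the `visits` table with its `Sign` column are hypothetical, not part of the patch), a live view with an explicit five-minute timeout could look like this:

```sql
-- Live views are experimental and must be enabled per session.
SET allow_experimental_live_view = 1;

-- The view caches the aggregate in memory; if no WATCH query touches it
-- for 300 seconds, the server drops it automatically.
CREATE LIVE VIEW lv WITH TIMEOUT 300 AS SELECT sum(Sign) FROM visits;
```

Each time a `WATCH` query on the view finishes, the countdown starts again from the end of that query.
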
+ ### With Refresh {#live-view-with-refresh} When a live view is created with a `WITH REFRESH` clause then it will be automatically refreshed after the specified number of seconds elapse since the last refresh or trigger. ```sql -CREATE LIVE VIEW [db.]table_name WITH REFRESH value_in_sec AS SELECT ... +CREATE LIVE VIEW [db.]table_name WITH REFRESH [value_in_sec] AS SELECT ... ``` -You can combine `WITH TIMEOUT` and `WITH REFRESH` clauses using an `AND`. +If the refresh value is not specified then the value specified by the `periodic_live_view_refresh` setting is used. + +You can combine `WITH TIMEOUT` and `WITH REFRESH` clauses using an `AND` clause. ```sql -CREATE LIVE VIEW [db.]table_name WITH TIMEOUT value_in_sec AND REFRESH value_in_sec AS SELECT ... +CREATE LIVE VIEW [db.]table_name WITH TIMEOUT [value_in_sec] AND REFRESH [value_in_sec] AS SELECT ... ``` ### Settings {#live-view-settings} You can use the following settings to control the behaviour of live views. -- `allow_experimental_live_view` - enable live views. Default `0`. -- `live_view_heartbeat_interval` - the heartbeat interval in seconds to indicate live query is alive. Default `15` seconds. +- `allow_experimental_live_view` - enable live views. Default is `0`. +- `live_view_heartbeat_interval` - the heartbeat interval in seconds to indicate live query is alive. Default is `15` seconds. - `max_live_view_insert_blocks_before_refresh` - maximum number of inserted blocks after which - mergeable blocks are dropped and query is re-executed. Default `64` inserts. -- `temporary_live_view_timeout` - interval after which live view with timeout is deleted. Default `5` seconds. -- `periodic_live_view_refresh` - interval after which periodically refreshed live view is forced to refresh. Default `60` seconds. + mergeable blocks are dropped and query is re-executed. Default is `64` inserts. +- `temporary_live_view_timeout` - interval after which live view with timeout is deleted. Default is `5` seconds. +- `periodic_live_view_refresh` - interval after which periodically refreshed live view is forced to refresh. Default is `60` seconds. From d737ffbe0c448d77be6f40fd812fea1bb6c6c55c Mon Sep 17 00:00:00 2001 From: Vitaliy Zakaznikov Date: Mon, 8 Feb 2021 16:59:39 -0500 Subject: [PATCH 093/381] Adding event clause reference. --- docs/en/sql-reference/statements/watch.md | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/docs/en/sql-reference/statements/watch.md b/docs/en/sql-reference/statements/watch.md index b89cc63375c..480841cf1b9 100644 --- a/docs/en/sql-reference/statements/watch.md +++ b/docs/en/sql-reference/statements/watch.md @@ -34,7 +34,7 @@ it can be forwarded to a different table. INSERT INTO [db.]table WATCH [db.]live_view ... ``` -## EVENTS Clause +## EVENTS Clause {#events-clause} The `EVENTS` clause can be used to obtain a short form of the `WATCH` query where instead of the query result you will just get the latest query From 0270b96ffb48d305ea2125aca995c5046fff842f Mon Sep 17 00:00:00 2001 From: Vitaliy Zakaznikov Date: Mon, 8 Feb 2021 17:18:37 -0500 Subject: [PATCH 094/381] Adding example of using WATCH and WATCH ... EVENTS to live view description. 
--- docs/en/sql-reference/statements/create/view.md | 12 ++++++++++++ 1 file changed, 12 insertions(+) diff --git a/docs/en/sql-reference/statements/create/view.md b/docs/en/sql-reference/statements/create/view.md index 5a5c77534fb..3544ad93aa5 100644 --- a/docs/en/sql-reference/statements/create/view.md +++ b/docs/en/sql-reference/statements/create/view.md @@ -99,6 +99,18 @@ When a live view query includes a subquery then the cached partial result is onl Only queries where one can combine partial result from the old data plus partial result from the new data will work. Live view will not work for queries that require the complete data set to compute the final result. +You can watch for changes in the live view query result using the [WATCH](../../../sql-reference/statements/watch.md) query + +```sql +WATCH [db.]live_view +``` + +or add [EVENTS](../../../sql-reference/statements/watch.md#events-clause) clause to just get change events. + +```sql +WATCH [db.]live_view EVENTS +``` + You can execute [SELECT](../../../sql-reference/statements/select/index.md) query on a live view in the same way as for any regular view or a table. If the query result is cached it will return the result immediately without running the stored query on the underlying tables. From 5769822c53aeca7ba772b8966322235a5e5192fe Mon Sep 17 00:00:00 2001 From: Vitaliy Zakaznikov Date: Mon, 8 Feb 2021 17:28:31 -0500 Subject: [PATCH 095/381] Fixing rendering. --- .../sql-reference/statements/create/view.md | 36 +++++-------------- docs/en/sql-reference/statements/watch.md | 24 ++++--------- 2 files changed, 15 insertions(+), 45 deletions(-) diff --git a/docs/en/sql-reference/statements/create/view.md b/docs/en/sql-reference/statements/create/view.md index 3544ad93aa5..1d6621ff67d 100644 --- a/docs/en/sql-reference/statements/create/view.md +++ b/docs/en/sql-reference/statements/create/view.md @@ -74,30 +74,17 @@ There isn’t a separate query for deleting views. To delete a view, use [DROP T CREATE LIVE VIEW [IF NOT EXISTS] [db.]table_name [WITH [TIMEOUT [value_in_sec] [AND]] [REFRESH [value_in_sec]]] AS SELECT ... ``` -Live views store result of the corresponding [SELECT](../../../sql-reference/statements/select/index.md) query -and are updated any time the result of the query changes. Query result as well as partial result -needed to combine with new data are stored in memory providing increased performance -for repeated queries. Live views can provide push notifications -when query result changes using the [WATCH](../../../sql-reference/statements/watch.md) query. +Live views store result of the corresponding [SELECT](../../../sql-reference/statements/select/index.md) query and are updated any time the result of the query changes. Query result as well as partial result needed to combine with new data are stored in memory providing increased performance +for repeated queries. Live views can provide push notifications when query result changes using the [WATCH](../../../sql-reference/statements/watch.md) query. Live views are triggered by insert into the innermost table specified in the query. -!!! info "Note" - [Table function](../../../sql-reference/table-functions/index.md) is not supported as the innermost table. +Live views work similarly to how a query in a distributed table works. But instead of combining partial results from different servers they combine partial result from current data with partial result from the new data. 
When a live view query includes a subquery then the cached partial result is only stored for the innermost subquery. !!! info "Note" - Tables that do not have inserts such as a [dictionary](../../../sql-reference/dictionaries/index.md) - or a [system table](../../../operations/system-tables/index.md) - will not trigger a live view. See [WITH REFRESH](#live-view-with-refresh) to enable periodic - updates of a live view. - -Live views work similarly to how a query in a distributed table works. But instead of combining partial results -from different servers they combine partial result from current data with partial result from the new data. -When a live view query includes a subquery then the cached partial result is only stored for the innermost subquery. - -!!! info "Note" - Only queries where one can combine partial result from the old data plus partial result from the new data will work. - Live view will not work for queries that require the complete data set to compute the final result. + - [Table function](../../../sql-reference/table-functions/index.md) is not supported as the innermost table. + - Tables that do not have inserts such as a [dictionary](../../../sql-reference/dictionaries/index.md) or a [system table](../../../operations/system-tables/index.md) will not trigger a live view. See [WITH REFRESH](#live-view-with-refresh) to enable periodic updates of a live view. + - Only queries where one can combine partial result from the old data plus partial result from the new data will work. Live view will not work for queries that require the complete data set to compute the final result. You can watch for changes in the live view query result using the [WATCH](../../../sql-reference/statements/watch.md) query @@ -111,9 +98,7 @@ or add [EVENTS](../../../sql-reference/statements/watch.md#events-clause) clause WATCH [db.]live_view EVENTS ``` -You can execute [SELECT](../../../sql-reference/statements/select/index.md) query on a live view -in the same way as for any regular view or a table. If the query result is cached -it will return the result immediately without running the stored query on the underlying tables. +You can execute [SELECT](../../../sql-reference/statements/select/index.md) query on a live view in the same way as for any regular view or a table. If the query result is cached it will return the result immediately without running the stored query on the underlying tables. ```sql SELECT * FROM [db.]live_view WHERE ... @@ -125,9 +110,7 @@ You can force live view refresh using the `ALTER LIVE VIEW [db.]table_name REFRE ### With Timeout {#live-view-with-timeout} -When a live view is create with a `WITH TIMEOUT` clause then the live view will be dropped automatically after the specified -number of seconds elapse since the end of the last [WATCH](../../../sql-reference/statements/watch.md) query -that was watching the live view. +When a live view is create with a `WITH TIMEOUT` clause then the live view will be dropped automatically after the specified number of seconds elapse since the end of the last [WATCH](../../../sql-reference/statements/watch.md) query that was watching the live view. ```sql CREATE LIVE VIEW [db.]table_name WITH TIMEOUT [value_in_sec] AS SELECT ... 
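-- A concrete, hypothetical instance of the syntax above (names are placeholders):
-- drop the view 300 seconds after the last WATCH query on it has ended.
CREATE LIVE VIEW lv_tmp WITH TIMEOUT 300 AS SELECT count() FROM visits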
@@ -137,8 +120,7 @@ If the timeout value is not specified then the value specified by the `temporary ### With Refresh {#live-view-with-refresh} -When a live view is created with a `WITH REFRESH` clause then it will be automatically refreshed -after the specified number of seconds elapse since the last refresh or trigger. +When a live view is created with a `WITH REFRESH` clause then it will be automatically refreshed after the specified number of seconds elapse since the last refresh or trigger. ```sql CREATE LIVE VIEW [db.]table_name WITH REFRESH [value_in_sec] AS SELECT ... diff --git a/docs/en/sql-reference/statements/watch.md b/docs/en/sql-reference/statements/watch.md index 480841cf1b9..10d2a2715fb 100644 --- a/docs/en/sql-reference/statements/watch.md +++ b/docs/en/sql-reference/statements/watch.md @@ -17,9 +17,7 @@ WATCH [db.]live_view [FORMAT format] ``` -The `WATCH` query performs continuous data retrieval from a [live view](./create/view.md#live-view) table. -Unless the `LIMIT` clause is specified it provides an infinite stream of query results -from a live view. +The `WATCH` query performs continuous data retrieval from a [live view](./create/view.md#live-view) table. Unless the `LIMIT` clause is specified it provides an infinite stream of query results from a live view. ```sql WATCH [db.]live_view @@ -27,8 +25,7 @@ WATCH [db.]live_view The virtual `_version` column in the query result indicates the current result version. -By default, the requested data is returned to the client, while in conjunction with [INSERT INTO](../../sql-reference/statements/insert-into.md) -it can be forwarded to a different table. +By default, the requested data is returned to the client, while in conjunction with [INSERT INTO](../../sql-reference/statements/insert-into.md) it can be forwarded to a different table. ```sql INSERT INTO [db.]table WATCH [db.]live_view ... @@ -36,9 +33,7 @@ INSERT INTO [db.]table WATCH [db.]live_view ... ## EVENTS Clause {#events-clause} -The `EVENTS` clause can be used to obtain a short form of the `WATCH` query -where instead of the query result you will just get the latest query -result version. +The `EVENTS` clause can be used to obtain a short form of the `WATCH` query where instead of the query result you will just get the latest query result version. ```sql WATCH [db.]live_view EVENTS LIMIT 1 @@ -46,14 +41,10 @@ WATCH [db.]live_view EVENTS LIMIT 1 ## LIMIT Clause {#limit-clause} -The `LIMIT n` clause species the number of updates the `WATCH` query should wait -for before terminating. By default there is no limit on the number of updates and therefore -the query will not terminate. The value of `0` -indicates that the `WATCH` query should not wait for any new query results -and therefore will return immediately once query is evaluated. +The `LIMIT n` clause species the number of updates the `WATCH` query should wait for before terminating. By default there is no limit on the number of updates and therefore the query will not terminate. The value of `0` indicates that the `WATCH` query should not wait for any new query results and therefore will return immediately once query is evaluated. ```sql -WATCH [db.]live_view LIMIT 1 +WATCH [db.]live_view LIMIT 2 ``` ## FORMAT Clause {#format-clause} @@ -61,8 +52,5 @@ WATCH [db.]live_view LIMIT 1 The `FORMAT` clause works the same way as for the [SELECT](./select/index.md#format-clause). !!! 
info "Note" - The [JSONEachRowWithProgress](../../interfaces/formats/#jsoneachrowwithprogress) format should be used when watching [live view](./create/view.md#live-view) - tables over the HTTP interface. The progress messages will be added to the output - to keep the long-lived HTTP connection alive until the query result changes. - The interval between progress messages is controlled using the [live_view_heartbeat_interval](./create/view.md#live-view-settings) setting. + The [JSONEachRowWithProgress](../../interfaces/formats/#jsoneachrowwithprogress) format should be used when watching [live view](./create/view.md#live-view) tables over the HTTP interface. The progress messages will be added to the output to keep the long-lived HTTP connection alive until the query result changes. The interval between progress messages is controlled using the [live_view_heartbeat_interval](./create/view.md#live-view-settings) setting. From a56ffcee1830e3452eaf064696cc8b8508b28ac5 Mon Sep 17 00:00:00 2001 From: Vitaliy Zakaznikov Date: Mon, 8 Feb 2021 17:53:15 -0500 Subject: [PATCH 096/381] Fixing links in WATCH query docs. --- docs/en/sql-reference/statements/watch.md | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/docs/en/sql-reference/statements/watch.md b/docs/en/sql-reference/statements/watch.md index 10d2a2715fb..71f26d71e85 100644 --- a/docs/en/sql-reference/statements/watch.md +++ b/docs/en/sql-reference/statements/watch.md @@ -17,7 +17,7 @@ WATCH [db.]live_view [FORMAT format] ``` -The `WATCH` query performs continuous data retrieval from a [live view](./create/view.md#live-view) table. Unless the `LIMIT` clause is specified it provides an infinite stream of query results from a live view. +The `WATCH` query performs continuous data retrieval from a [live view](./create/view.md#live-view) table. Unless the `LIMIT` clause is specified it provides an infinite stream of query results from a [live view](./create/view.md#live-view). ```sql WATCH [db.]live_view @@ -49,8 +49,8 @@ WATCH [db.]live_view LIMIT 2 ## FORMAT Clause {#format-clause} -The `FORMAT` clause works the same way as for the [SELECT](./select/index.md#format-clause). +The `FORMAT` clause works the same way as for the [SELECT](./select/format.md). !!! info "Note" - The [JSONEachRowWithProgress](../../interfaces/formats/#jsoneachrowwithprogress) format should be used when watching [live view](./create/view.md#live-view) tables over the HTTP interface. The progress messages will be added to the output to keep the long-lived HTTP connection alive until the query result changes. The interval between progress messages is controlled using the [live_view_heartbeat_interval](./create/view.md#live-view-settings) setting. + The [JSONEachRowWithProgress](../../../interfaces/formats/#jsoneachrowwithprogress) format should be used when watching [live view](./create/view.md#live-view) tables over the HTTP interface. The progress messages will be added to the output to keep the long-lived HTTP connection alive until the query result changes. The interval between progress messages is controlled using the [live_view_heartbeat_interval](./create/view.md#live-view-settings) setting. From 3d2788e1b5b622f96fd15dd4636eba30984d39fb Mon Sep 17 00:00:00 2001 From: Vitaliy Zakaznikov Date: Mon, 8 Feb 2021 19:23:32 -0500 Subject: [PATCH 097/381] Fixes and updates to live view docs. 
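As a hedged illustration of the `LIMIT` and `FORMAT` clauses covered above (the `lv` view name is a placeholder):

```sql
-- Stop after two result versions; the progress messages keep a long-lived HTTP connection alive.
WATCH lv LIMIT 2 FORMAT JSONEachRowWithProgress;
```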
--- .../sql-reference/statements/create/view.md | 21 ++++++++++++++----- docs/en/sql-reference/statements/watch.md | 2 +- 2 files changed, 17 insertions(+), 6 deletions(-) diff --git a/docs/en/sql-reference/statements/create/view.md b/docs/en/sql-reference/statements/create/view.md index 1d6621ff67d..662a4b54754 100644 --- a/docs/en/sql-reference/statements/create/view.md +++ b/docs/en/sql-reference/statements/create/view.md @@ -74,17 +74,20 @@ There isn’t a separate query for deleting views. To delete a view, use [DROP T CREATE LIVE VIEW [IF NOT EXISTS] [db.]table_name [WITH [TIMEOUT [value_in_sec] [AND]] [REFRESH [value_in_sec]]] AS SELECT ... ``` -Live views store result of the corresponding [SELECT](../../../sql-reference/statements/select/index.md) query and are updated any time the result of the query changes. Query result as well as partial result needed to combine with new data are stored in memory providing increased performance -for repeated queries. Live views can provide push notifications when query result changes using the [WATCH](../../../sql-reference/statements/watch.md) query. +Live views store result of the corresponding [SELECT](../../../sql-reference/statements/select/index.md) query and are updated any time the result of the query changes. Query result as well as partial result needed to combine with new data are stored in memory providing increased performance for repeated queries. Live views can provide push notifications when query result changes using the [WATCH](../../../sql-reference/statements/watch.md) query. Live views are triggered by insert into the innermost table specified in the query. Live views work similarly to how a query in a distributed table works. But instead of combining partial results from different servers they combine partial result from current data with partial result from the new data. When a live view query includes a subquery then the cached partial result is only stored for the innermost subquery. -!!! info "Note" +!!! info "Limitations" - [Table function](../../../sql-reference/table-functions/index.md) is not supported as the innermost table. - - Tables that do not have inserts such as a [dictionary](../../../sql-reference/dictionaries/index.md) or a [system table](../../../operations/system-tables/index.md) will not trigger a live view. See [WITH REFRESH](#live-view-with-refresh) to enable periodic updates of a live view. - - Only queries where one can combine partial result from the old data plus partial result from the new data will work. Live view will not work for queries that require the complete data set to compute the final result. + - Tables that do not have inserts such as a [dictionary](../../../sql-reference/dictionaries/index.md), [system table](../../../operations/system-tables/index.md), a [normal view](#normal), or a [materialized view](#materialized) will not trigger a live view. + - Only queries where one can combine partial result from the old data plus partial result from the new data will work. Live view will not work for queries that require the complete data set to compute the final result or aggregations where the state of the aggregation must be preserved. + - Does not work with replicated or distributed tables where inserts are performed on different nodes. + - Can't be triggered by multiple tables. + + See [WITH REFRESH](#live-view-with-refresh) to force periodic updates of a live view that in some cases can be used as a workaround. 
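A hedged sketch of that workaround: since system tables never trigger a live view, a periodic refresh can be used instead (the 5 second interval is an arbitrary choice):

```sql
CREATE LIVE VIEW lv_events WITH REFRESH 5 AS SELECT * FROM system.events;
WATCH lv_events;
```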
You can watch for changes in the live view query result using the [WATCH](../../../sql-reference/statements/watch.md) query @@ -133,6 +136,14 @@ You can combine `WITH TIMEOUT` and `WITH REFRESH` clauses using an `AND` clause. ```sql CREATE LIVE VIEW [db.]table_name WITH TIMEOUT [value_in_sec] AND REFRESH [value_in_sec] AS SELECT ... ``` +### Usage + +Most common uses of live view tables include: + +- Providing push notifications for query result changes to avoid polling. +- Caching results of most frequent queries to provide immediate query results. +- Watching for table changes and triggering a follow-up select queries. +- Watching metrics from system tables using periodic refresh. ### Settings {#live-view-settings} diff --git a/docs/en/sql-reference/statements/watch.md b/docs/en/sql-reference/statements/watch.md index 71f26d71e85..07b050d4c4e 100644 --- a/docs/en/sql-reference/statements/watch.md +++ b/docs/en/sql-reference/statements/watch.md @@ -49,7 +49,7 @@ WATCH [db.]live_view LIMIT 2 ## FORMAT Clause {#format-clause} -The `FORMAT` clause works the same way as for the [SELECT](./select/format.md). +The `FORMAT` clause works the same way as for the [SELECT](../../sql-reference/statements/select/format.md#format-clause). !!! info "Note" The [JSONEachRowWithProgress](../../../interfaces/formats/#jsoneachrowwithprogress) format should be used when watching [live view](./create/view.md#live-view) tables over the HTTP interface. The progress messages will be added to the output to keep the long-lived HTTP connection alive until the query result changes. The interval between progress messages is controlled using the [live_view_heartbeat_interval](./create/view.md#live-view-settings) setting. From be3be85fa2167beb909ec75a6180ae0a63421186 Mon Sep 17 00:00:00 2001 From: Roman Bug Date: Tue, 9 Feb 2021 13:57:41 +0300 Subject: [PATCH 098/381] Update docs/en/sql-reference/functions/type-conversion-functions.md Co-authored-by: olgarev <56617294+olgarev@users.noreply.github.com> --- docs/en/sql-reference/functions/type-conversion-functions.md | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) diff --git a/docs/en/sql-reference/functions/type-conversion-functions.md b/docs/en/sql-reference/functions/type-conversion-functions.md index 1742f6b8888..cab71f46bf5 100644 --- a/docs/en/sql-reference/functions/type-conversion-functions.md +++ b/docs/en/sql-reference/functions/type-conversion-functions.md @@ -492,8 +492,9 @@ Result: ## accurateCast(x, T) {#type_conversion_function-accurate-cast} -Converts `x` to the `T` data type. The differente from [cast(x, T)](#type_conversion_function-cast) is that `accurateCast` -does not allow overflow of numeric types during cast if type value `x` does not fit bounds of type `T`. +Converts `x` to the `T` data type. + +The difference from [cast(x, T)](#type_conversion_function-cast) is that `accurateCast` does not allow overflow of numeric types during cast if type value `x` does not fit the bounds of type `T`. For example, `accurateCast(-1, 'UInt8')` throws an exception. 
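A hedged contrast with the plain `cast` described earlier (the overflow value shown is what ordinary integer wrap-around would produce):

``` sql
SELECT cast(-1, 'UInt8');          -- overflows and returns 255
SELECT accurateCast(-1, 'UInt8');  -- throws an exception instead
```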
**Example** From b676f63f1dec7b606f4f5559f910f02098f9c135 Mon Sep 17 00:00:00 2001 From: Roman Bug Date: Tue, 9 Feb 2021 13:58:22 +0300 Subject: [PATCH 099/381] Update docs/ru/sql-reference/functions/type-conversion-functions.md Co-authored-by: olgarev <56617294+olgarev@users.noreply.github.com> --- docs/ru/sql-reference/functions/type-conversion-functions.md | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/docs/ru/sql-reference/functions/type-conversion-functions.md b/docs/ru/sql-reference/functions/type-conversion-functions.md index aa55e015c61..d95a5279716 100644 --- a/docs/ru/sql-reference/functions/type-conversion-functions.md +++ b/docs/ru/sql-reference/functions/type-conversion-functions.md @@ -497,7 +497,7 @@ SELECT toTypeName(CAST(x, 'Nullable(UInt16)')) FROM t_null; accurateCastOrNull(x, T) ``` -**Parameters** +**Параметры** - `x` — входное значение. - `T` — имя возвращаемого типа данных. From c22412b775b36009f3ceba36fb82a595a5d49075 Mon Sep 17 00:00:00 2001 From: Roman Bug Date: Tue, 9 Feb 2021 13:58:47 +0300 Subject: [PATCH 100/381] Update docs/en/sql-reference/operators/in.md Co-authored-by: olgarev <56617294+olgarev@users.noreply.github.com> --- docs/en/sql-reference/operators/in.md | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/docs/en/sql-reference/operators/in.md b/docs/en/sql-reference/operators/in.md index a0dd0455c4d..4796c0f6bc0 100644 --- a/docs/en/sql-reference/operators/in.md +++ b/docs/en/sql-reference/operators/in.md @@ -17,7 +17,7 @@ Don’t list too many values explicitly (i.e. millions). If a data set is large The right side of the operator can be a set of constant expressions, a set of tuples with constant expressions (shown in the examples above), or the name of a database table or SELECT subquery in brackets. -ClickHouse allows different types inside `IN` subquery. For left hand side it applies conversion to the type of right hand side with [accurateCastOrNull](../functions/type-conversion-functions.md#type_conversion_function-accurate-cast_or_null). +ClickHouse allows different types in the left and right parts of `IN` subquery. In this case it converts the left hand side to the type of the right hand side as if the [accurateCastOrNull](../functions/type-conversion-functions.md#type_conversion_function-accurate-cast_or_null) function is applied. 
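A hedged illustration of that conversion (per the sentence above, the string on the left is cast to the numeric type of the subquery's result):

``` sql
SELECT '1' IN (SELECT 1);
```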
**Example** From df123e91e650c9f4dd11d12dff78753df58bbe6d Mon Sep 17 00:00:00 2001 From: Roman Bug Date: Tue, 9 Feb 2021 13:59:58 +0300 Subject: [PATCH 101/381] Update docs/en/sql-reference/functions/type-conversion-functions.md Co-authored-by: olgarev <56617294+olgarev@users.noreply.github.com> --- .../en/sql-reference/functions/type-conversion-functions.md | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/docs/en/sql-reference/functions/type-conversion-functions.md b/docs/en/sql-reference/functions/type-conversion-functions.md index cab71f46bf5..83cbad6f53b 100644 --- a/docs/en/sql-reference/functions/type-conversion-functions.md +++ b/docs/en/sql-reference/functions/type-conversion-functions.md @@ -559,9 +559,9 @@ Query: ``` sql SELECT - cast(-1, 'UInt8') as uint8, - cast(128, 'Int8') as int8, - cast('Test', 'FixedString(2)') as fixed_string; + accurateCastOrNull(-1, 'UInt8') as uint8, + accurateCastOrNull(128, 'Int8') as int8, + accurateCastOrNull('Test', 'FixedString(2)') as fixed_string; ``` Result: From 2c6a0e74fb90d2cd5c8b988c4e9f3eebf60366c8 Mon Sep 17 00:00:00 2001 From: Alexander Tokmakov Date: Tue, 9 Feb 2021 18:14:20 +0300 Subject: [PATCH 102/381] better replica creation --- src/Databases/DatabaseReplicated.cpp | 119 ++++++++++-------- src/Databases/DatabaseReplicated.h | 6 +- src/Databases/DatabaseReplicatedWorker.cpp | 16 ++- src/Databases/DatabaseReplicatedWorker.h | 2 + src/Interpreters/DDLTask.cpp | 4 +- src/Interpreters/DDLTask.h | 5 +- src/Interpreters/DDLWorker.cpp | 39 +++--- src/Interpreters/DDLWorker.h | 6 + src/Interpreters/executeDDLQueryOnCluster.cpp | 1 - .../test_replicated_database/test.py | 9 +- 10 files changed, 115 insertions(+), 92 deletions(-) diff --git a/src/Databases/DatabaseReplicated.cpp b/src/Databases/DatabaseReplicated.cpp index 4a6058afcd0..a3da271a597 100644 --- a/src/Databases/DatabaseReplicated.cpp +++ b/src/Databases/DatabaseReplicated.cpp @@ -3,7 +3,6 @@ #include #include #include -#include #include #include #include @@ -105,8 +104,6 @@ DatabaseReplicated::DatabaseReplicated( throw Exception(ErrorCodes::REPLICA_IS_ALREADY_EXIST, "Replica {} of shard {} of replicated database at {} already exists. Replica host ID: '{}', current host ID: '{}'", replica_name, shard_name, zookeeper_path, replica_host_id, host_id); - - log_entry_to_execute = parse(current_zookeeper->get(replica_path + "/log_ptr")); } else { @@ -232,9 +229,6 @@ bool DatabaseReplicated::createDatabaseNodesInZooKeeper(const zkutil::ZooKeeperP void DatabaseReplicated::createReplicaNodesInZooKeeper(const zkutil::ZooKeeperPtr & current_zookeeper) { - /// When creating new replica, use latest snapshot version as initial value of log_pointer - //log_entry_to_execute = 0; //FIXME - /// Write host name to replica_path, it will protect from multiple replicas with the same name auto host_id = getHostID(global_context, db_uuid); @@ -265,40 +259,6 @@ void DatabaseReplicated::loadStoredObjects(Context & context, bool has_force_res ddl_worker->startup(); } -void DatabaseReplicated::onUnexpectedLogEntry(const String & entry_name, const ZooKeeperPtr & zookeeper) -{ - /// We cannot execute next entry of replication log. Possible reasons: - /// 1. Replica is staled, some entries were removed by log cleanup process. - /// In this case we should recover replica from the last snapshot. - /// 2. Replication log is broken due to manual operations with ZooKeeper or logical error. 
- /// In this case we just stop replication without any attempts to recover it automatically, - /// because such attempts may lead to unexpected data removal. - - constexpr const char * name = "query-"; - if (!startsWith(entry_name, name)) - throw Exception(ErrorCodes::DATABASE_REPLICATION_FAILED, "Unexpected entry in replication log: {}", entry_name); - - UInt32 entry_number; - if (!tryParse(entry_number, entry_name.substr(strlen(name)))) - throw Exception(ErrorCodes::DATABASE_REPLICATION_FAILED, "Cannot parse number of replication log entry {}", entry_name); - - if (entry_number < log_entry_to_execute) - throw Exception(ErrorCodes::LOGICAL_ERROR, "Entry {} already executed, current pointer is {}", entry_number, log_entry_to_execute); - - /// Entry name is valid. Let's get min log pointer to check if replica is staled. - UInt32 min_snapshot = parse(zookeeper->get(zookeeper_path + "/min_log_ptr")); // FIXME - - if (log_entry_to_execute < min_snapshot) - { - recoverLostReplica(zookeeper, 0); //FIXME log_pointer - return; - } - - throw Exception(ErrorCodes::LOGICAL_ERROR, "Cannot recover replica, probably it's a bug. " - "Got log entry '{}' when expected entry number {}"); -} - - BlockIO DatabaseReplicated::propose(const ASTPtr & query, const Context & query_context) { if (query_context.getClientInfo().query_kind != ClientInfo::QueryKind::INITIAL_QUERY) @@ -335,22 +295,25 @@ BlockIO DatabaseReplicated::propose(const ASTPtr & query, const Context & query_ } -void DatabaseReplicated::recoverLostReplica(const ZooKeeperPtr & current_zookeeper, UInt32 from_snapshot) +void DatabaseReplicated::recoverLostReplica(const ZooKeeperPtr & current_zookeeper, UInt32 our_log_ptr, UInt32 max_log_ptr) { - //LOG_WARNING(log, "Will recover replica"); + bool new_replica = our_log_ptr == 0; + if (new_replica) + LOG_INFO(log, "Will create new replica from log pointer {}", max_log_ptr); + else + LOG_WARNING(log, "Will recover replica with staled log pointer {} from log pointer {}", our_log_ptr, max_log_ptr); - //FIXME drop old tables + if (new_replica && !empty()) + throw Exception(ErrorCodes::LOGICAL_ERROR, "It's new replica, but database is not empty"); - String snapshot_metadata_path = zookeeper_path + "/metadata"; - Strings tables_in_snapshot = current_zookeeper->getChildren(snapshot_metadata_path); - snapshot_metadata_path += '/'; - from_snapshot = parse(current_zookeeper->get(zookeeper_path + "/max_log_ptr")); + if (!new_replica) + throw Exception(ErrorCodes::NOT_IMPLEMENTED, "Automatic replica recovery is not implemented"); - for (const auto & table_name : tables_in_snapshot) + auto table_name_to_metadata = tryGetConsistentMetadataSnapshot(current_zookeeper, max_log_ptr); + + for (const auto & name_and_meta : table_name_to_metadata) { - //FIXME It's not atomic. We need multiget here (available since ZooKeeper 3.6.0). 
- String query_text = current_zookeeper->get(snapshot_metadata_path + table_name); - auto query_ast = parseQueryFromMetadataInZooKeeper(table_name, query_text); + auto query_ast = parseQueryFromMetadataInZooKeeper(name_and_meta.first, name_and_meta.second); Context query_context = global_context; query_context.makeQueryContext(); @@ -358,14 +321,60 @@ void DatabaseReplicated::recoverLostReplica(const ZooKeeperPtr & current_zookeep query_context.setCurrentDatabase(database_name); query_context.setCurrentQueryId(""); // generate random query_id - //FIXME - DatabaseCatalog::instance().waitTableFinallyDropped(query_ast->as()->uuid); - LOG_INFO(log, "Executing {}", serializeAST(*query_ast)); InterpreterCreateQuery(query_ast, query_context).execute(); } - current_zookeeper->set(replica_path + "/log_ptr", toString(from_snapshot)); + current_zookeeper->set(replica_path + "/log_ptr", toString(max_log_ptr)); +} + +std::map DatabaseReplicated::tryGetConsistentMetadataSnapshot(const ZooKeeperPtr & zookeeper, UInt32 & max_log_ptr) +{ + std::map table_name_to_metadata; + constexpr int max_retries = 10; + int iteration = 0; + while (++iteration <= max_retries) + { + table_name_to_metadata.clear(); + LOG_DEBUG(log, "Trying to get consistent metadata snapshot for log pointer {}", max_log_ptr); + Strings table_names = zookeeper->getChildren(zookeeper_path + "/metadata"); + + std::vector futures; + futures.reserve(table_names.size()); + for (const auto & table : table_names) + futures.emplace_back(zookeeper->asyncTryGet(zookeeper_path + "/metadata/" + table)); + + for (size_t i = 0; i < table_names.size(); ++i) + { + auto res = futures[i].get(); + if (res.error != Coordination::Error::ZOK) + break; + table_name_to_metadata.emplace(table_names[i], res.data); + } + + UInt32 new_max_log_ptr = parse(zookeeper->get(zookeeper_path + "/max_log_ptr")); + if (new_max_log_ptr == max_log_ptr && table_names.size() == table_name_to_metadata.size()) + break; + + if (max_log_ptr < new_max_log_ptr) + { + LOG_DEBUG(log, "Log pointer moved from {} to {}, will retry", max_log_ptr, new_max_log_ptr); + max_log_ptr = new_max_log_ptr; + } + else + { + assert(max_log_ptr == new_max_log_ptr); + assert(table_names.size() != table_name_to_metadata.size()); + LOG_DEBUG(log, "Cannot get metadata of some tables due to ZooKeeper error, will retry"); + } + } + + if (max_retries < iteration) + throw Exception(ErrorCodes::DATABASE_REPLICATION_FAILED, "Cannot get consistent metadata snapshot"); + + LOG_DEBUG(log, "Got consistent metadata snapshot for log pointer {}", max_log_ptr); + + return table_name_to_metadata; } ASTPtr DatabaseReplicated::parseQueryFromMetadataInZooKeeper(const String & node_name, const String & query) diff --git a/src/Databases/DatabaseReplicated.h b/src/Databases/DatabaseReplicated.h index c39321f0caa..fffc2b5c98a 100644 --- a/src/Databases/DatabaseReplicated.h +++ b/src/Databases/DatabaseReplicated.h @@ -86,8 +86,8 @@ private: bool createDatabaseNodesInZooKeeper(const ZooKeeperPtr & current_zookeeper); void createReplicaNodesInZooKeeper(const ZooKeeperPtr & current_zookeeper); - void onUnexpectedLogEntry(const String & entry_name, const ZooKeeperPtr & zookeeper); - void recoverLostReplica(const ZooKeeperPtr & current_zookeeper, UInt32 from_snapshot); + void recoverLostReplica(const ZooKeeperPtr & current_zookeeper, UInt32 our_log_ptr, UInt32 max_log_ptr); + std::map tryGetConsistentMetadataSnapshot(const ZooKeeperPtr & zookeeper, UInt32 & max_log_ptr); ASTPtr parseQueryFromMetadataInZooKeeper(const String & node_name, 
const String & query); @@ -96,8 +96,6 @@ private: String replica_name; String replica_path; - UInt32 log_entry_to_execute; - zkutil::ZooKeeperPtr getZooKeeper() const; std::unique_ptr ddl_worker; diff --git a/src/Databases/DatabaseReplicatedWorker.cpp b/src/Databases/DatabaseReplicatedWorker.cpp index dd9dc322f9d..3162979e787 100644 --- a/src/Databases/DatabaseReplicatedWorker.cpp +++ b/src/Databases/DatabaseReplicatedWorker.cpp @@ -45,11 +45,14 @@ void DatabaseReplicatedDDLWorker::initializeReplication() /// Check if we need to recover replica. /// Invariant: replica is lost if it's log_ptr value is less then max_log_ptr - logs_to_keep. - UInt32 our_log_ptr = parse(current_zookeeper->get(database->replica_path + "/log_ptr")); + String log_ptr_str = current_zookeeper->get(database->replica_path + "/log_ptr"); + UInt32 our_log_ptr = parse(log_ptr_str); UInt32 max_log_ptr = parse(current_zookeeper->get(database->zookeeper_path + "/max_log_ptr")); - UInt32 logs_to_keep = parse(current_zookeeper->get(database->zookeeper_path + "/logs_to_keep")); + logs_to_keep = parse(current_zookeeper->get(database->zookeeper_path + "/logs_to_keep")); if (our_log_ptr == 0 || our_log_ptr + logs_to_keep < max_log_ptr) - database->recoverLostReplica(current_zookeeper, 0); + database->recoverLostReplica(current_zookeeper, our_log_ptr, max_log_ptr); + else + last_skipped_entry_name.emplace(log_ptr_str); } String DatabaseReplicatedDDLWorker::enqueueQuery(DDLLogEntry & entry) @@ -239,4 +242,11 @@ DDLTaskPtr DatabaseReplicatedDDLWorker::initAndCheckTask(const String & entry_na return task; } +bool DatabaseReplicatedDDLWorker::canRemoveQueueEntry(const String & entry_name, const Coordination::Stat &) +{ + UInt32 entry_number = DDLTaskBase::getLogEntryNumber(entry_name); + UInt32 max_log_ptr = parse(getAndSetZooKeeper()->get(database->zookeeper_path + "/max_log_ptr")); + return entry_number + logs_to_keep < max_log_ptr; +} + } diff --git a/src/Databases/DatabaseReplicatedWorker.h b/src/Databases/DatabaseReplicatedWorker.h index e3fd58c4305..33806df88ba 100644 --- a/src/Databases/DatabaseReplicatedWorker.h +++ b/src/Databases/DatabaseReplicatedWorker.h @@ -20,11 +20,13 @@ private: void initializeReplication(); DDLTaskPtr initAndCheckTask(const String & entry_name, String & out_reason, const ZooKeeperPtr & zookeeper) override; + bool canRemoveQueueEntry(const String & entry_name, const Coordination::Stat & stat) override; DatabaseReplicated * const database; mutable std::mutex mutex; std::condition_variable wait_current_task_change; String current_task; + UInt32 logs_to_keep = std::numeric_limits::max(); }; } diff --git a/src/Interpreters/DDLTask.cpp b/src/Interpreters/DDLTask.cpp index 9e379443364..7f47f0a6659 100644 --- a/src/Interpreters/DDLTask.cpp +++ b/src/Interpreters/DDLTask.cpp @@ -320,7 +320,7 @@ std::unique_ptr DatabaseReplicatedTask::makeQueryContext(Context & from return query_context; } -String DatabaseReplicatedTask::getLogEntryName(UInt32 log_entry_number) +String DDLTaskBase::getLogEntryName(UInt32 log_entry_number) { constexpr size_t seq_node_digits = 10; String number = toString(log_entry_number); @@ -328,7 +328,7 @@ String DatabaseReplicatedTask::getLogEntryName(UInt32 log_entry_number) return name; } -UInt32 DatabaseReplicatedTask::getLogEntryNumber(const String & log_entry_name) +UInt32 DDLTaskBase::getLogEntryNumber(const String & log_entry_name) { constexpr const char * name = "query-"; assert(startsWith(log_entry_name, name)); diff --git a/src/Interpreters/DDLTask.h b/src/Interpreters/DDLTask.h 
index 43d9fa1c0ae..f02e17103aa 100644 --- a/src/Interpreters/DDLTask.h +++ b/src/Interpreters/DDLTask.h @@ -101,6 +101,8 @@ struct DDLTaskBase inline String getFinishedNodePath() const { return entry_path + "/finished/" + host_id_str; } inline String getShardNodePath() const { return entry_path + "/shards/" + getShardID(); } + static String getLogEntryName(UInt32 log_entry_number); + static UInt32 getLogEntryNumber(const String & log_entry_name); }; struct DDLTask : public DDLTaskBase @@ -132,9 +134,6 @@ struct DatabaseReplicatedTask : public DDLTaskBase String getShardID() const override; std::unique_ptr makeQueryContext(Context & from_context) override; - static String getLogEntryName(UInt32 log_entry_number); - static UInt32 getLogEntryNumber(const String & log_entry_name); - DatabaseReplicated * database; }; diff --git a/src/Interpreters/DDLWorker.cpp b/src/Interpreters/DDLWorker.cpp index 665bacf9d6d..efaacabf4de 100644 --- a/src/Interpreters/DDLWorker.cpp +++ b/src/Interpreters/DDLWorker.cpp @@ -451,10 +451,7 @@ bool DDLWorker::tryExecuteQuery(const String & query, DDLTaskBase & task) void DDLWorker::updateMaxDDLEntryID(const String & entry_name) { - DB::ReadBufferFromString in(entry_name); - DB::assertString("query-", in); - UInt64 id; - readText(id, in); + UInt64 id = DDLTaskBase::getLogEntryNumber(entry_name); auto prev_id = max_id.load(std::memory_order_relaxed); while (prev_id < id) { @@ -744,16 +741,13 @@ bool DDLWorker::tryExecuteQueryOnLeaderReplica( } -void DDLWorker::cleanupQueue(Int64 current_time_seconds, const ZooKeeperPtr & zookeeper) +void DDLWorker::cleanupQueue(Int64, const ZooKeeperPtr & zookeeper) { LOG_DEBUG(log, "Cleaning queue"); Strings queue_nodes = zookeeper->getChildren(queue_dir); filterAndSortQueueNodes(queue_nodes); - size_t num_outdated_nodes = (queue_nodes.size() > max_tasks_in_queue) ? 
queue_nodes.size() - max_tasks_in_queue : 0; - auto first_non_outdated_node = queue_nodes.begin() + num_outdated_nodes; - for (auto it = queue_nodes.cbegin(); it < queue_nodes.cend(); ++it) { if (stop_flag) @@ -772,15 +766,7 @@ void DDLWorker::cleanupQueue(Int64 current_time_seconds, const ZooKeeperPtr & zo if (!zookeeper->exists(node_path, &stat)) continue; - /// Delete node if its lifetime is expired (according to task_max_lifetime parameter) - constexpr UInt64 zookeeper_time_resolution = 1000; - Int64 zookeeper_time_seconds = stat.ctime / zookeeper_time_resolution; - bool node_lifetime_is_expired = zookeeper_time_seconds + task_max_lifetime < current_time_seconds; - - /// If too many nodes in task queue (> max_tasks_in_queue), delete oldest one - bool node_is_outside_max_window = it < first_non_outdated_node; - - if (!node_lifetime_is_expired && !node_is_outside_max_window) + if (!canRemoveQueueEntry(node_name, stat)) continue; /// Skip if there are active nodes (it is weak guard) @@ -799,10 +785,7 @@ void DDLWorker::cleanupQueue(Int64 current_time_seconds, const ZooKeeperPtr & zo continue; } - if (node_lifetime_is_expired) - LOG_INFO(log, "Lifetime of task {} is expired, deleting it", node_name); - else if (node_is_outside_max_window) - LOG_INFO(log, "Task {} is outdated, deleting it", node_name); + LOG_INFO(log, "Task {} is outdated, deleting it", node_name); /// Deleting { @@ -827,6 +810,19 @@ void DDLWorker::cleanupQueue(Int64 current_time_seconds, const ZooKeeperPtr & zo } } +bool DDLWorker::canRemoveQueueEntry(const String & entry_name, const Coordination::Stat & stat) +{ + /// Delete node if its lifetime is expired (according to task_max_lifetime parameter) + constexpr UInt64 zookeeper_time_resolution = 1000; + Int64 zookeeper_time_seconds = stat.ctime / zookeeper_time_resolution; + bool node_lifetime_is_expired = zookeeper_time_seconds + task_max_lifetime < Poco::Timestamp().epochTime(); + + /// If too many nodes in task queue (> max_tasks_in_queue), delete oldest one + UInt32 entry_number = DDLTaskBase::getLogEntryNumber(entry_name); + bool node_is_outside_max_window = entry_number < max_id.load(std::memory_order_relaxed) - max_tasks_in_queue; + + return node_lifetime_is_expired || node_is_outside_max_window; +} /// Try to create nonexisting "status" dirs for a node void DDLWorker::createStatusDirs(const std::string & node_path, const ZooKeeperPtr & zookeeper) @@ -927,6 +923,7 @@ void DDLWorker::runMainThread() worker_pool = std::make_unique(pool_size); /// Clear other in-memory state, like server just started. 
current_tasks.clear(); + last_skipped_entry_name.reset(); max_id = 0; }; diff --git a/src/Interpreters/DDLWorker.h b/src/Interpreters/DDLWorker.h index 706face3885..1ae4f815b44 100644 --- a/src/Interpreters/DDLWorker.h +++ b/src/Interpreters/DDLWorker.h @@ -24,6 +24,11 @@ namespace Poco namespace Util { class AbstractConfiguration; } } +namespace Coordination +{ + struct Stat; +} + namespace DB { class ASTAlterQuery; @@ -94,6 +99,7 @@ protected: /// Checks and cleanups queue's nodes void cleanupQueue(Int64 current_time_seconds, const ZooKeeperPtr & zookeeper); + virtual bool canRemoveQueueEntry(const String & entry_name, const Coordination::Stat & stat); /// Init task node static void createStatusDirs(const std::string & node_path, const ZooKeeperPtr & zookeeper); diff --git a/src/Interpreters/executeDDLQueryOnCluster.cpp b/src/Interpreters/executeDDLQueryOnCluster.cpp index a0148316610..2774f78663e 100644 --- a/src/Interpreters/executeDDLQueryOnCluster.cpp +++ b/src/Interpreters/executeDDLQueryOnCluster.cpp @@ -277,7 +277,6 @@ Block DDLQueryStatusInputStream::readImpl() status.tryDeserializeText(status_data); } - //FIXME String host = host_id; UInt16 port = 0; if (by_hostname) diff --git a/tests/integration/test_replicated_database/test.py b/tests/integration/test_replicated_database/test.py index 2a5a7f4716e..04646507ed7 100644 --- a/tests/integration/test_replicated_database/test.py +++ b/tests/integration/test_replicated_database/test.py @@ -8,7 +8,7 @@ from helpers.test_tools import assert_eq_with_retry cluster = ClickHouseCluster(__file__) main_node = cluster.add_instance('main_node', main_configs=['configs/config.xml'], with_zookeeper=True, stay_alive=True, macros={"shard": 1, "replica": 1}) -dummy_node = cluster.add_instance('dummy_node', main_configs=['configs/config.xml'], with_zookeeper=True, macros={"shard": 1, "replica": 2}) +dummy_node = cluster.add_instance('dummy_node', main_configs=['configs/config.xml'], with_zookeeper=True, stay_alive=True, macros={"shard": 1, "replica": 2}) competing_node = cluster.add_instance('competing_node', main_configs=['configs/config.xml'], with_zookeeper=True, macros={"shard": 1, "replica": 3}) snapshotting_node = cluster.add_instance('snapshotting_node', main_configs=['configs/config.xml'], with_zookeeper=True, macros={"shard": 2, "replica": 1}) snapshot_recovering_node = cluster.add_instance('snapshot_recovering_node', main_configs=['configs/config.xml'], with_zookeeper=True, macros={"shard": 2, "replica": 2}) @@ -100,9 +100,12 @@ def test_alters_from_different_replicas(started_cluster): main_node.query("CREATE TABLE testdb.dist AS testdb.concurrent_test ENGINE = Distributed(cluster, testdb, concurrent_test, CounterID)") - dummy_node.kill_clickhouse(stop_start_wait_sec=0) + dummy_node.stop_clickhouse(kill=True) - competing_node.query("ALTER TABLE testdb.concurrent_test ADD COLUMN Added0 UInt32;") + settings = {"distributed_ddl_task_timeout": 10} + assert "There are 1 unfinished hosts (0 of them are currently active)" in \ + competing_node.query_and_get_error("ALTER TABLE testdb.concurrent_test ADD COLUMN Added0 UInt32;", settings=settings) + dummy_node.start_clickhouse() main_node.query("ALTER TABLE testdb.concurrent_test ADD COLUMN Added2 UInt32;") competing_node.query("ALTER TABLE testdb.concurrent_test ADD COLUMN Added1 UInt32 AFTER Added0;") main_node.query("ALTER TABLE testdb.concurrent_test ADD COLUMN AddedNested1 Nested(A UInt32, B UInt64) AFTER Added2;") From e87e71ee43550f0f3a59abf227d20ce661a3bf4f Mon Sep 17 00:00:00 2001 From: 
Dmitriy Date: Wed, 10 Feb 2021 21:59:28 +0300 Subject: [PATCH 103/381] Document two functions MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Задокументировал две функции. --- .../functions/type-conversion-functions.md | 180 ++++++++++++++++++ 1 file changed, 180 insertions(+) diff --git a/docs/en/sql-reference/functions/type-conversion-functions.md b/docs/en/sql-reference/functions/type-conversion-functions.md index 3ca36f41c78..2116e55e3ef 100644 --- a/docs/en/sql-reference/functions/type-conversion-functions.md +++ b/docs/en/sql-reference/functions/type-conversion-functions.md @@ -689,6 +689,186 @@ Same as for [parseDateTimeBestEffort](#parsedatetimebesteffort) except that it r Same as for [parseDateTimeBestEffort](#parsedatetimebesteffort) except that it returns zero date or zero date time when it encounters a date format that cannot be processed. +## parseDateTimeBestEffortUSOrNull {#parsedatetimebesteffortusornull} + +Same as for [parseDateTimeBestEffortUS](#parsedatetimebesteffortUS) except that it returns null when it encounters a date format that cannot be processed. + +**Syntax** + +``` sql +parseDateTimeBestEffortUSOrNull(time_string [, time_zone]); +``` + +**Parameters** + +- `time_string` — String containing a date and time to convert. [String](../../sql-reference/data-types/string.md). +- `time_zone` — Time zone. The function parses `time_string` according to the time zone. [String](../../sql-reference/data-types/string.md). + +**Supported non-standard formats** + +- A string containing 9..10 digit [unix timestamp](https://en.wikipedia.org/wiki/Unix_time). +- A string with a date and a time component: `YYYYMMDDhhmmss`, `MM/DD/YYYY hh:mm:ss`, `MM-DD-YY hh:mm`, `YYYY-MM-DD hh:mm:ss`, etc. +- A string with a date, but no time component: `YYYY`, `YYYYMM`, `YYYY*MM`, `MM/DD/YYYY`, `MM-DD-YY` etc. +- A string with a day and time: `DD`, `DD hh`, `DD hh:mm`. In this case, `YYYY-MM` are substituted as `2000-01`. +- A string that includes the date and time along with time zone offset information: `YYYY-MM-DD hh:mm:ss ±h:mm`, etc. For example, `2020-12-12 17:36:00 -5:00`. + +**Returned values** + +- `time_string` converted to the `DateTime` data type. +- `NULL`. + +**Examples** + +Query: + +``` sql +SELECT parseDateTimeBestEffortUSOrNull('02/10/2021 21:12:57') +AS parseDateTimeBestEffortUSOrNull; +``` + +Result: + +``` text +┌─parseDateTimeBestEffortUSOrNull─┐ +│ 2021-02-10 21:12:57 │ +└─────────────────────────────────┘ +``` + +Query: + +``` sql +SELECT parseDateTimeBestEffortUSOrNull('02-10-2021 21:12:57') +AS parseDateTimeBestEffortUSOrNull; +``` + +Result: + +``` text +┌─parseDateTimeBestEffortUSOrNull─┐ +│ 2021-02-10 21:12:57 │ +└─────────────────────────────────┘ +``` + +Query: + +``` sql +SELECT parseDateTimeBestEffortUSOrNull('02.10.2021 21:12:57') +AS parseDateTimeBestEffortUSOrNull; +``` + +Result: + +``` text +┌─parseDateTimeBestEffortUSOrNull─┐ +│ 2021-02-10 21:12:57 │ +└─────────────────────────────────┘ +``` + +Query: + +``` sql +SELECT parseDateTimeBestEffortUSOrNull('02.2021 21:12:57') +AS parseDateTimeBestEffortUSOrNull; +``` + +Result: + +``` text +┌─parseDateTimeBestEffortUSOrNull─┐ +│ ᴺᵁᴸᴸ │ +└─────────────────────────────────┘ +``` + +## parseDateTimeBestEffortUSOrZero {#parsedatetimebesteffortusorzero} + +Same as for [parseDateTimeBestEffortUS](#parsedatetimebesteffortUS) except that it returns zero date or zero date time when it encounters a date format that cannot be processed. 
+ +**Syntax** + +``` sql +parseDateTimeBestEffortUSOrZero(time_string [, time_zone]); +``` + +**Parameters** + +- `time_string` — String containing a date and time to convert. [String](../../sql-reference/data-types/string.md). +- `time_zone` — Time zone. The function parses `time_string` according to the time zone. [String](../../sql-reference/data-types/string.md). + +**Supported non-standard formats** + +- A string containing 9..10 digit [unix timestamp](https://en.wikipedia.org/wiki/Unix_time). +- A string with a date and a time component: `YYYYMMDDhhmmss`, `MM/DD/YYYY hh:mm:ss`, `MM-DD-YY hh:mm`, `YYYY-MM-DD hh:mm:ss`, etc. +- A string with a date, but no time component: `YYYY`, `YYYYMM`, `YYYY*MM`, `MM/DD/YYYY`, `MM-DD-YY` etc. +- A string with a day and time: `DD`, `DD hh`, `DD hh:mm`. In this case, `YYYY-MM` are substituted as `2000-01`. +- A string that includes the date and time along with time zone offset information: `YYYY-MM-DD hh:mm:ss ±h:mm`, etc. For example, `2020-12-12 17:36:00 -5:00`. + +**Returned value** + +- `time_string` converted to the `DateTime` data type. +- `zero date time`. + +**Examples** + +Query: + +``` sql +SELECT parseDateTimeBestEffortUSOrZero('02/10/2021 21:12:57') +AS parseDateTimeBestEffortUSOrZero; +``` + +Result: + +``` text +┌─parseDateTimeBestEffortUSOrZero─┐ +│ 2021-02-10 21:12:57 │ +└─────────────────────────────────┘ +``` + +Query: + +``` sql +SELECT parseDateTimeBestEffortUSOrZero('02-10-2021 21:12:57') +AS parseDateTimeBestEffortUSOrZero; +``` + +Result: + +``` text +┌─parseDateTimeBestEffortUSOrZero─┐ +│ 2021-02-10 21:12:57 │ +└─────────────────────────────────┘ +``` + +Query: + +``` sql +SELECT parseDateTimeBestEffortUSOrZero('02.10.2021 21:12:57') +AS parseDateTimeBestEffortUS; +``` + +Result: + +``` text +┌─parseDateTimeBestEffortUSOrZero─┐ +│ 2021-02-10 21:12:57 │ +└─────────────────────────────────┘ +``` + +Query: + +``` sql +SELECT parseDateTimeBestEffortUSOrZero('02.2021 21:12:57') +AS parseDateTimeBestEffortUSOrZero; +``` + +Result: + +``` text +┌─parseDateTimeBestEffortUSOrZero─┐ +│ 1970-01-01 00:00:00 │ +└─────────────────────────────────┘ +``` + ## toLowCardinality {#tolowcardinality} Converts input parameter to the [LowCardianlity](../../sql-reference/data-types/lowcardinality.md) version of same data type. 
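A hedged side-by-side of the two `parseDateTimeBestEffortUS*` variants documented in the patch above, reusing the unparsable input from its examples:

``` sql
SELECT
    parseDateTimeBestEffortUSOrNull('02.2021 21:12:57') AS as_null,  -- NULL
    parseDateTimeBestEffortUSOrZero('02.2021 21:12:57') AS as_zero;  -- 1970-01-01 00:00:00
```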
From 15256d86e59613d36d13c93bbdec960ededcf81e Mon Sep 17 00:00:00 2001 From: Alexander Tokmakov Date: Wed, 10 Feb 2021 23:30:40 +0300 Subject: [PATCH 104/381] better replica recovery and queue cleanup --- src/Common/ZooKeeper/IKeeper.cpp | 2 +- src/Common/ZooKeeper/ZooKeeper.cpp | 21 ++-- src/Common/ZooKeeper/ZooKeeper.h | 11 +- src/Databases/DatabaseOnDisk.cpp | 2 +- src/Databases/DatabaseOnDisk.h | 2 +- src/Databases/DatabaseReplicated.cpp | 109 +++++++++++++++--- src/Databases/DatabaseReplicated.h | 2 + src/Databases/DatabaseReplicatedWorker.cpp | 3 +- src/Databases/IDatabase.h | 2 +- .../MySQL/DatabaseConnectionMySQL.cpp | 6 +- src/Databases/MySQL/DatabaseConnectionMySQL.h | 4 +- src/Interpreters/DDLWorker.cpp | 87 ++++++-------- src/Interpreters/InterpreterDropQuery.cpp | 2 +- .../test_distributed_ddl/cluster.py | 4 +- 14 files changed, 165 insertions(+), 92 deletions(-) diff --git a/src/Common/ZooKeeper/IKeeper.cpp b/src/Common/ZooKeeper/IKeeper.cpp index ad18fdd992a..94fd291bd12 100644 --- a/src/Common/ZooKeeper/IKeeper.cpp +++ b/src/Common/ZooKeeper/IKeeper.cpp @@ -59,7 +59,7 @@ static void addRootPath(String & path, const String & root_path) throw Exception("Path cannot be empty", Error::ZBADARGUMENTS); if (path[0] != '/') - throw Exception("Path must begin with /", Error::ZBADARGUMENTS); + throw Exception("Path must begin with /, got " + path, Error::ZBADARGUMENTS); if (root_path.empty()) return; diff --git a/src/Common/ZooKeeper/ZooKeeper.cpp b/src/Common/ZooKeeper/ZooKeeper.cpp index 7a64609dc22..dc6abca6892 100644 --- a/src/Common/ZooKeeper/ZooKeeper.cpp +++ b/src/Common/ZooKeeper/ZooKeeper.cpp @@ -610,7 +610,7 @@ void ZooKeeper::removeChildren(const std::string & path) } -void ZooKeeper::removeChildrenRecursive(const std::string & path) +void ZooKeeper::removeChildrenRecursive(const std::string & path, const String & keep_child_node) { Strings children = getChildren(path); while (!children.empty()) @@ -619,14 +619,15 @@ void ZooKeeper::removeChildrenRecursive(const std::string & path) for (size_t i = 0; i < MULTI_BATCH_SIZE && !children.empty(); ++i) { removeChildrenRecursive(path + "/" + children.back()); - ops.emplace_back(makeRemoveRequest(path + "/" + children.back(), -1)); + if (likely(keep_child_node.empty() || keep_child_node != children.back())) + ops.emplace_back(makeRemoveRequest(path + "/" + children.back(), -1)); children.pop_back(); } multi(ops); } } -void ZooKeeper::tryRemoveChildrenRecursive(const std::string & path) +void ZooKeeper::tryRemoveChildrenRecursive(const std::string & path, const String & keep_child_node) { Strings children; if (tryGetChildren(path, children) != Coordination::Error::ZOK) @@ -637,14 +638,14 @@ void ZooKeeper::tryRemoveChildrenRecursive(const std::string & path) Strings batch; for (size_t i = 0; i < MULTI_BATCH_SIZE && !children.empty(); ++i) { - batch.push_back(path + "/" + children.back()); + String child_path = path + "/" + children.back(); + tryRemoveChildrenRecursive(child_path); + if (likely(keep_child_node.empty() || keep_child_node != children.back())) + { + batch.push_back(child_path); + ops.emplace_back(zkutil::makeRemoveRequest(child_path, -1)); + } children.pop_back(); - tryRemoveChildrenRecursive(batch.back()); - - Coordination::RemoveRequest request; - request.path = batch.back(); - - ops.emplace_back(std::make_shared(std::move(request))); } /// Try to remove the children with a faster method - in bulk. 
If this fails, diff --git a/src/Common/ZooKeeper/ZooKeeper.h b/src/Common/ZooKeeper/ZooKeeper.h index d532da10f2f..fbe1bede91a 100644 --- a/src/Common/ZooKeeper/ZooKeeper.h +++ b/src/Common/ZooKeeper/ZooKeeper.h @@ -184,6 +184,12 @@ public: /// result would be the same as for the single call. void tryRemoveRecursive(const std::string & path); + /// Similar to removeRecursive(...) and tryRemoveRecursive(...), but does not remove path itself. + /// If keep_child_node is not empty, this method will not remove path/keep_child_node (but will remove its subtree). + /// It can be useful to keep some child node as a flag which indicates that path is currently removing. + void removeChildrenRecursive(const std::string & path, const String & keep_child_node = {}); + void tryRemoveChildrenRecursive(const std::string & path, const String & keep_child_node = {}); + /// Remove all children nodes (non recursive). void removeChildren(const std::string & path); @@ -247,9 +253,6 @@ private: void init(const std::string & implementation_, const std::string & hosts_, const std::string & identity_, int32_t session_timeout_ms_, int32_t operation_timeout_ms_, const std::string & chroot_); - void removeChildrenRecursive(const std::string & path); - void tryRemoveChildrenRecursive(const std::string & path); - /// The following methods don't throw exceptions but return error codes. Coordination::Error createImpl(const std::string & path, const std::string & data, int32_t mode, std::string & path_created); Coordination::Error removeImpl(const std::string & path, int32_t version); @@ -328,7 +331,7 @@ public: catch (...) { ProfileEvents::increment(ProfileEvents::CannotRemoveEphemeralNode); - DB::tryLogCurrentException(__PRETTY_FUNCTION__); + DB::tryLogCurrentException(__PRETTY_FUNCTION__, "Cannot remove " + path + ": "); } } diff --git a/src/Databases/DatabaseOnDisk.cpp b/src/Databases/DatabaseOnDisk.cpp index a03cb33591c..195f57d1bda 100644 --- a/src/Databases/DatabaseOnDisk.cpp +++ b/src/Databases/DatabaseOnDisk.cpp @@ -311,7 +311,7 @@ void DatabaseOnDisk::commitCreateTable(const ASTCreateQuery & query, const Stora } } -void DatabaseOnDisk::detachTablePermanently(const String & table_name) +void DatabaseOnDisk::detachTablePermanently(const Context &, const String & table_name) { auto table = detachTable(table_name); diff --git a/src/Databases/DatabaseOnDisk.h b/src/Databases/DatabaseOnDisk.h index 60a50ac4539..fefe6e91606 100644 --- a/src/Databases/DatabaseOnDisk.h +++ b/src/Databases/DatabaseOnDisk.h @@ -41,7 +41,7 @@ public: const StoragePtr & table, const ASTPtr & query) override; - void detachTablePermanently(const String & table_name) override; + void detachTablePermanently(const Context & context, const String & table_name) override; void dropTable( const Context & context, diff --git a/src/Databases/DatabaseReplicated.cpp b/src/Databases/DatabaseReplicated.cpp index a3da271a597..0ac71793e5d 100644 --- a/src/Databases/DatabaseReplicated.cpp +++ b/src/Databases/DatabaseReplicated.cpp @@ -39,6 +39,8 @@ namespace ErrorCodes } static constexpr const char * DROPPED_MARK = "DROPPED"; +static constexpr const char * BROKEN_TABLE_PREFIX = "_broken_"; + zkutil::ZooKeeperPtr DatabaseReplicated::getZooKeeper() const { @@ -306,13 +308,76 @@ void DatabaseReplicated::recoverLostReplica(const ZooKeeperPtr & current_zookeep if (new_replica && !empty()) throw Exception(ErrorCodes::LOGICAL_ERROR, "It's new replica, but database is not empty"); - if (!new_replica) - throw Exception(ErrorCodes::NOT_IMPLEMENTED, "Automatic replica 
recovery is not implemented"); - auto table_name_to_metadata = tryGetConsistentMetadataSnapshot(current_zookeeper, max_log_ptr); + Strings tables_to_detach; + size_t total_tables = 0; + auto existing_tables_it = getTablesIterator(global_context, [&](const String & name) { return !startsWith(name, BROKEN_TABLE_PREFIX); }); + while (existing_tables_it->isValid()) + { + String name = existing_tables_it->name(); + auto in_zk = table_name_to_metadata.find(name); + String local_metadata = readMetadataFile(name); + if (in_zk == table_name_to_metadata.end() || in_zk->second != local_metadata) + { + bool should_detach = true; + bool looks_like_replicated = in_zk->second.find("ReplicatedMergeTree") != std::string::npos; + + if (looks_like_replicated) + { + ParserCreateQuery parser; + auto size = global_context.getSettingsRef().max_query_size; + auto depth = global_context.getSettingsRef().max_parser_depth; + ASTPtr local_create = parseQuery(parser, local_metadata, size, depth); + ASTPtr zk_create = parseQuery(parser, in_zk->second, size, depth); + if (local_create->as()->uuid == zk_create->as()->uuid) + { + /// For ReplicatedMergeTree tables we can compare only UUIDs to ensure that it's tha same table. + /// Metadata can be different, it's handled on table replication level. + /// TODO maybe we should also compare MergeTree SETTINGS? + should_detach = false; + } + } + + if (should_detach) + tables_to_detach.emplace_back(std::move(name)); + } + existing_tables_it->next(); + ++total_tables; + } + + if (total_tables < tables_to_detach.size() * 2) + throw Exception(ErrorCodes::DATABASE_REPLICATION_FAILED, "Too many tables to detach: {} of {}", tables_to_detach.size(), total_tables); + else if (!tables_to_detach.empty()) + LOG_WARNING(log, "Will DETACH PERMANENTLY {} broken tables to recover replica", tables_to_detach.size()); + + auto db_guard = DatabaseCatalog::instance().getDDLGuard(getDatabaseName(), ""); + for (const auto & table_name : tables_to_detach) + { + String to_name = fmt::format("{}_{}_{}_{}", BROKEN_TABLE_PREFIX, table_name, max_log_ptr, thread_local_rng() % 1000); + DDLGuardPtr table_guard = DatabaseCatalog::instance().getDDLGuard(getDatabaseName(), std::min(table_name, to_name)); + DDLGuardPtr to_table_guard = DatabaseCatalog::instance().getDDLGuard(getDatabaseName(), std::max(table_name, to_name)); + + if (isDictionaryExist(table_name)) + { + /// TODO implement DETACH DICTIONARY PERMANENTLY + DatabaseAtomic::removeDictionary(global_context, table_name); + } + else + { + DatabaseAtomic::renameTable(global_context, table_name, *this, to_name, false, false); + DatabaseAtomic::detachTablePermanently(global_context, to_name); + } + } + for (const auto & name_and_meta : table_name_to_metadata) { + if (isTableExist(name_and_meta.first, global_context)) + { + assert(name_and_meta.second == readMetadataFile(name_and_meta.first)); + continue; + } + auto query_ast = parseQueryFromMetadataInZooKeeper(name_and_meta.first, name_and_meta.second); Context query_context = global_context; @@ -349,7 +414,7 @@ std::map DatabaseReplicated::tryGetConsistentMetadataSnapshot(co auto res = futures[i].get(); if (res.error != Coordination::Error::ZOK) break; - table_name_to_metadata.emplace(table_names[i], res.data); + table_name_to_metadata.emplace(unescapeForFileName(table_names[i]), res.data); } UInt32 new_max_log_ptr = parse(zookeeper->get(zookeeper_path + "/max_log_ptr")); @@ -451,18 +516,8 @@ void DatabaseReplicated::renameTable(const Context & context, const String & tab if (exchange && 
!to_database.isTableExist(to_table_name, context)) throw Exception(ErrorCodes::UNKNOWN_TABLE, "Table {} does not exist", to_table_name); - String statement; - String statement_to; - { - /// NOTE It's not atomic (however, we have only one thread) - ReadBufferFromFile in(getObjectMetadataPath(table_name), 4096); - readStringUntilEOF(statement, in); - if (exchange) - { - ReadBufferFromFile in_to(to_database.getObjectMetadataPath(to_table_name), 4096); - readStringUntilEOF(statement_to, in_to); - } - } + String statement = readMetadataFile(table_name); + String statement_to = readMetadataFile(to_table_name); String metadata_zk_path = txn->zookeeper_path + "/metadata/" + escapeForFileName(table_name); String metadata_zk_path_to = txn->zookeeper_path + "/metadata/" + escapeForFileName(to_table_name); txn->ops.emplace_back(zkutil::makeRemoveRequest(metadata_zk_path, -1)); @@ -481,6 +536,8 @@ void DatabaseReplicated::commitCreateTable(const ASTCreateQuery & query, const S const String & table_metadata_tmp_path, const String & table_metadata_path, const Context & query_context) { + if (startsWith(query.table, BROKEN_TABLE_PREFIX)) + throw Exception(ErrorCodes::INCORRECT_QUERY, "It's not allowed to attach broken tables"); auto txn = query_context.getMetadataTransaction(); assert(!ddl_worker->isCurrentlyActive() || txn); if (txn && txn->is_initial_query) @@ -533,4 +590,24 @@ void DatabaseReplicated::removeDictionary(const Context & context, const String DatabaseAtomic::removeDictionary(context, dictionary_name); } +void DatabaseReplicated::detachTablePermanently(const Context & context, const String & table_name) +{ + auto txn = context.getMetadataTransaction(); + assert(!ddl_worker->isCurrentlyActive() || txn); + if (txn && txn->is_initial_query) + { + String metadata_zk_path = zookeeper_path + "/metadata/" + escapeForFileName(table_name); + txn->ops.emplace_back(zkutil::makeRemoveRequest(metadata_zk_path, -1)); + } + DatabaseAtomic::detachTablePermanently(context, table_name); +} + +String DatabaseReplicated::readMetadataFile(const String & table_name) const +{ + String statement; + ReadBufferFromFile in(getObjectMetadataPath(table_name), 4096); + readStringUntilEOF(statement, in); + return statement; +} + } diff --git a/src/Databases/DatabaseReplicated.h b/src/Databases/DatabaseReplicated.h index fffc2b5c98a..2c998a8bc97 100644 --- a/src/Databases/DatabaseReplicated.h +++ b/src/Databases/DatabaseReplicated.h @@ -62,6 +62,7 @@ public: const String & dictionary_name, const ASTPtr & query) override; void removeDictionary(const Context & context, const String & dictionary_name) override; + void detachTablePermanently(const Context & context, const String & table_name) override; void drop(const Context & /*context*/) override; @@ -90,6 +91,7 @@ private: std::map tryGetConsistentMetadataSnapshot(const ZooKeeperPtr & zookeeper, UInt32 & max_log_ptr); ASTPtr parseQueryFromMetadataInZooKeeper(const String & node_name, const String & query); + String readMetadataFile(const String & table_name) const; String zookeeper_path; String shard_name; diff --git a/src/Databases/DatabaseReplicatedWorker.cpp b/src/Databases/DatabaseReplicatedWorker.cpp index 3162979e787..b29a8822c0c 100644 --- a/src/Databases/DatabaseReplicatedWorker.cpp +++ b/src/Databases/DatabaseReplicatedWorker.cpp @@ -24,13 +24,14 @@ DatabaseReplicatedDDLWorker::DatabaseReplicatedDDLWorker(DatabaseReplicated * db void DatabaseReplicatedDDLWorker::initializeMainThread() { - while (!initialized && !stop_flag) + while (!stop_flag) { try { auto 
zookeeper = getAndSetZooKeeper(); initializeReplication(); initialized = true; + return; } catch (...) { diff --git a/src/Databases/IDatabase.h b/src/Databases/IDatabase.h index fc821fcab30..3a196f827b7 100644 --- a/src/Databases/IDatabase.h +++ b/src/Databases/IDatabase.h @@ -249,7 +249,7 @@ public: /// Forget about the table without deleting it's data, but rename metadata file to prevent reloading it /// with next restart. The database may not support this method. - virtual void detachTablePermanently(const String & /*name*/) + virtual void detachTablePermanently(const Context & /*context*/, const String & /*name*/) { throw Exception("There is no DETACH TABLE PERMANENTLY query for Database" + getEngineName(), ErrorCodes::NOT_IMPLEMENTED); } diff --git a/src/Databases/MySQL/DatabaseConnectionMySQL.cpp b/src/Databases/MySQL/DatabaseConnectionMySQL.cpp index 35b016f255b..eeea12ae8f3 100644 --- a/src/Databases/MySQL/DatabaseConnectionMySQL.cpp +++ b/src/Databases/MySQL/DatabaseConnectionMySQL.cpp @@ -395,7 +395,7 @@ void DatabaseConnectionMySQL::loadStoredObjects(Context &, bool, bool /*force_at } } -void DatabaseConnectionMySQL::detachTablePermanently(const String & table_name) +void DatabaseConnectionMySQL::detachTablePermanently(const Context &, const String & table_name) { std::lock_guard lock{mutex}; @@ -429,9 +429,9 @@ void DatabaseConnectionMySQL::detachTablePermanently(const String & table_name) table_iter->second.second->is_dropped = true; } -void DatabaseConnectionMySQL::dropTable(const Context &, const String & table_name, bool /*no_delay*/) +void DatabaseConnectionMySQL::dropTable(const Context & context, const String & table_name, bool /*no_delay*/) { - detachTablePermanently(table_name); + detachTablePermanently(context, table_name); } DatabaseConnectionMySQL::~DatabaseConnectionMySQL() diff --git a/src/Databases/MySQL/DatabaseConnectionMySQL.h b/src/Databases/MySQL/DatabaseConnectionMySQL.h index 3e305fcb20d..d0a5c041d7b 100644 --- a/src/Databases/MySQL/DatabaseConnectionMySQL.h +++ b/src/Databases/MySQL/DatabaseConnectionMySQL.h @@ -72,9 +72,9 @@ public: StoragePtr detachTable(const String & table_name) override; - void detachTablePermanently(const String & table_name) override; + void detachTablePermanently(const Context & context, const String & table_name) override; - void dropTable(const Context &, const String & table_name, bool no_delay) override; + void dropTable(const Context & context, const String & table_name, bool no_delay) override; void attachTable(const String & table_name, const StoragePtr & storage, const String & relative_table_path) override; diff --git a/src/Interpreters/DDLWorker.cpp b/src/Interpreters/DDLWorker.cpp index efaacabf4de..975eaeaca1b 100644 --- a/src/Interpreters/DDLWorker.cpp +++ b/src/Interpreters/DDLWorker.cpp @@ -315,11 +315,10 @@ void DDLWorker::scheduleTasks() { /// Main thread of DDLWorker was restarted, probably due to lost connection with ZooKeeper. /// We have some unfinished tasks. To avoid duplication of some queries, try to write execution status. 
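        /// The reworked check below appears to make restart recovery idempotent: rather than
        /// inferring "status already written" from task->ops being empty, it asks ZooKeeper
        /// whether the finished node exists, and re-runs processTask() only for entries that
        /// were executed but never had their status recorded.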
- bool status_written = task->ops.empty(); bool task_still_exists = zookeeper->exists(task->entry_path); + bool status_written = zookeeper->exists(task->getFinishedNodePath()); if (task->was_executed && !status_written && task_still_exists) { - assert(!zookeeper->exists(task->getFinishedNodePath())); processTask(*task); } } @@ -472,9 +471,16 @@ void DDLWorker::processTask(DDLTaskBase & task) String active_node_path = task.getActiveNodePath(); String finished_node_path = task.getFinishedNodePath(); - String dummy; - zookeeper->createAncestors(active_node_path); - auto active_node = zkutil::EphemeralNodeHolder::create(active_node_path, *zookeeper, ""); + auto create_active_res = zookeeper->tryCreate(active_node_path, {}, zkutil::CreateMode::Ephemeral); + if (create_active_res != Coordination::Error::ZOK) + { + if (create_active_res == Coordination::Error::ZNONODE) + throw Coordination::Exception(create_active_res, active_node_path); + createStatusDirs(task.entry_path, zookeeper); + zookeeper->create(active_node_path, {}, zkutil::CreateMode::Ephemeral); + + } + auto active_node = zkutil::EphemeralNodeHolder::existing(active_node_path, *zookeeper); if (!task.was_executed) { @@ -755,7 +761,6 @@ void DDLWorker::cleanupQueue(Int64, const ZooKeeperPtr & zookeeper) String node_name = *it; String node_path = fs::path(queue_dir) / node_name; - String lock_path = fs::path(node_path) / "lock"; Coordination::Stat stat; String dummy; @@ -769,39 +774,29 @@ void DDLWorker::cleanupQueue(Int64, const ZooKeeperPtr & zookeeper) if (!canRemoveQueueEntry(node_name, stat)) continue; - /// Skip if there are active nodes (it is weak guard) - if (zookeeper->exists(fs::path(node_path) / "active", &stat) && stat.numChildren > 0) + /// At first we remove entry/active node to prevent staled hosts from executing entry concurrently + auto rm_active_res = zookeeper->tryRemove(fs::path(node_path) / "active"); + if (rm_active_res != Coordination::Error::ZOK && rm_active_res != Coordination::Error::ZNONODE) { - LOG_INFO(log, "Task {} should be deleted, but there are active workers. Skipping it.", node_name); - continue; - } - - /// Usage of the lock is not necessary now (tryRemoveRecursive correctly removes node in a presence of concurrent cleaners) - /// But the lock will be required to implement system.distributed_ddl_queue table - auto lock = createSimpleZooKeeperLock(zookeeper, node_path, "lock", host_fqdn_id); - if (!lock->tryLock()) - { - LOG_INFO(log, "Task {} should be deleted, but it is locked. Skipping it.", node_name); + if (rm_active_res == Coordination::Error::ZNOTEMPTY) + LOG_DEBUG(log, "Task {} should be deleted, but there are active workers. 
Skipping it.", node_name); + else + LOG_WARNING(log, "Unexpected status code {} on attempt to remove {}/active", rm_active_res, node_name); continue; } + /// Now we can safely delete entry LOG_INFO(log, "Task {} is outdated, deleting it", node_name); - /// Deleting - { - Strings children = zookeeper->getChildren(node_path); - for (const String & child : children) - { - if (child != "lock") - zookeeper->tryRemoveRecursive(fs::path(node_path) / child); - } + /// We recursively delete all nodes except entry/finished to prevent staled hosts from + /// creating entry/active node (see createStatusDirs(...)) + zookeeper->tryRemoveChildrenRecursive(node_path, "finished"); - /// Remove the lock node and its parent atomically - Coordination::Requests ops; - ops.emplace_back(zkutil::makeRemoveRequest(lock_path, -1)); - ops.emplace_back(zkutil::makeRemoveRequest(node_path, -1)); - zookeeper->multi(ops); - } + /// And then we remove entry and entry/finished in a single transaction + Coordination::Requests ops; + ops.emplace_back(zkutil::makeRemoveRequest(fs::path(node_path) / "finished", -1)); + ops.emplace_back(zkutil::makeRemoveRequest(node_path, -1)); + zookeeper->multi(ops); } catch (...) { @@ -819,7 +814,7 @@ bool DDLWorker::canRemoveQueueEntry(const String & entry_name, const Coordinatio /// If too many nodes in task queue (> max_tasks_in_queue), delete oldest one UInt32 entry_number = DDLTaskBase::getLogEntryNumber(entry_name); - bool node_is_outside_max_window = entry_number < max_id.load(std::memory_order_relaxed) - max_tasks_in_queue; + bool node_is_outside_max_window = entry_number + max_tasks_in_queue < max_id.load(std::memory_order_relaxed); return node_lifetime_is_expired || node_is_outside_max_window; } @@ -828,21 +823,17 @@ bool DDLWorker::canRemoveQueueEntry(const String & entry_name, const Coordinatio void DDLWorker::createStatusDirs(const std::string & node_path, const ZooKeeperPtr & zookeeper) { Coordination::Requests ops; - { - Coordination::CreateRequest request; - request.path = fs::path(node_path) / "active"; - ops.emplace_back(std::make_shared(std::move(request))); - } - { - Coordination::CreateRequest request; - request.path = fs::path(node_path) / "finished"; - ops.emplace_back(std::make_shared(std::move(request))); - } + ops.emplace_back(zkutil::makeCreateRequest(fs::path(node_path) / "active", {}, zkutil::CreateMode::Persistent)); + ops.emplace_back(zkutil::makeCreateRequest(fs::path(node_path) / "finished", {}, zkutil::CreateMode::Persistent)); + Coordination::Responses responses; Coordination::Error code = zookeeper->tryMulti(ops, responses); - if (code != Coordination::Error::ZOK - && code != Coordination::Error::ZNODEEXISTS) - throw Coordination::Exception(code); + bool both_created = code == Coordination::Error::ZOK; + bool both_already_exists = responses.size() == 2 && responses[0]->error == Coordination::Error::ZNODEEXISTS + && responses[1]->error == Coordination::Error::ZNODEEXISTS; + if (both_created || both_already_exists) + return; + throw Coordination::Exception(code); } @@ -877,8 +868,6 @@ String DDLWorker::enqueueQuery(DDLLogEntry & entry) void DDLWorker::initializeMainThread() { assert(!initialized); - assert(max_id == 0); - assert(current_tasks.empty()); setThreadName("DDLWorker"); LOG_DEBUG(log, "Started DDLWorker thread"); @@ -896,7 +885,7 @@ void DDLWorker::initializeMainThread() if (!Coordination::isHardwareError(e.code)) { /// A logical error. - LOG_ERROR(log, "ZooKeeper error: {}. 
Failed to start DDLWorker.",getCurrentExceptionMessage(true)); + LOG_ERROR(log, "ZooKeeper error: {}. Failed to start DDLWorker.", getCurrentExceptionMessage(true)); assert(false); /// Catch such failures in tests with debug build } diff --git a/src/Interpreters/InterpreterDropQuery.cpp b/src/Interpreters/InterpreterDropQuery.cpp index ae76e8efd46..9e63c647f71 100644 --- a/src/Interpreters/InterpreterDropQuery.cpp +++ b/src/Interpreters/InterpreterDropQuery.cpp @@ -162,7 +162,7 @@ BlockIO InterpreterDropQuery::executeToTableImpl(const ASTDropQuery & query, Dat if (query.permanently) { /// Drop table from memory, don't touch data, metadata file renamed and will be skipped during server restart - database->detachTablePermanently(table_id.table_name); + database->detachTablePermanently(context, table_id.table_name); } else { diff --git a/tests/integration/test_distributed_ddl/cluster.py b/tests/integration/test_distributed_ddl/cluster.py index 811eb94bad4..45a159ed2b9 100644 --- a/tests/integration/test_distributed_ddl/cluster.py +++ b/tests/integration/test_distributed_ddl/cluster.py @@ -104,8 +104,8 @@ class ClickHouseClusterWithDDLHelpers(ClickHouseCluster): def ddl_check_there_are_no_dublicates(instance): query = "SELECT max(c), argMax(q, c) FROM (SELECT lower(query) AS q, count() AS c FROM system.query_log WHERE type=2 AND q LIKE '/* ddl_entry=query-%' GROUP BY query)" rows = instance.query(query) - assert len(rows) > 0 and rows[0][0] == "1", "dublicates on {} {}, query {}".format(instance.name, - instance.ip_address, query) + assert len(rows) > 0 and rows[0][0] == "1", "dublicates on {} {}: {}".format(instance.name, + instance.ip_address, rows) @staticmethod def insert_reliable(instance, query_insert): From 537b372c32732ddecc9a5f7414c23ea1722ec2fc Mon Sep 17 00:00:00 2001 From: Dmitriy Date: Thu, 11 Feb 2021 00:16:23 +0300 Subject: [PATCH 105/381] Update type-conversion-functions.md MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Исправил null на NULL. --- docs/en/sql-reference/functions/type-conversion-functions.md | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/docs/en/sql-reference/functions/type-conversion-functions.md b/docs/en/sql-reference/functions/type-conversion-functions.md index 2116e55e3ef..f752bb9f6cb 100644 --- a/docs/en/sql-reference/functions/type-conversion-functions.md +++ b/docs/en/sql-reference/functions/type-conversion-functions.md @@ -691,7 +691,7 @@ Same as for [parseDateTimeBestEffort](#parsedatetimebesteffort) except that it r ## parseDateTimeBestEffortUSOrNull {#parsedatetimebesteffortusornull} -Same as for [parseDateTimeBestEffortUS](#parsedatetimebesteffortUS) except that it returns null when it encounters a date format that cannot be processed. +Same as for [parseDateTimeBestEffortUS](#parsedatetimebesteffortUS) except that it returns `NULL` when it encounters a date format that cannot be processed. **Syntax** From 59752cbf27104d76fa7a0c9b669f5dbe3b423c3e Mon Sep 17 00:00:00 2001 From: Roman Bug Date: Thu, 11 Feb 2021 01:50:13 +0300 Subject: [PATCH 106/381] Update type-conversion-functions.md Fix changes from EN review. 
--- .../functions/type-conversion-functions.md | 50 ++++++++++++++++--- 1 file changed, 44 insertions(+), 6 deletions(-) diff --git a/docs/ru/sql-reference/functions/type-conversion-functions.md b/docs/ru/sql-reference/functions/type-conversion-functions.md index d95a5279716..3a6d2bd9ca0 100644 --- a/docs/ru/sql-reference/functions/type-conversion-functions.md +++ b/docs/ru/sql-reference/functions/type-conversion-functions.md @@ -423,8 +423,11 @@ SELECT uuid = uuid2; ## CAST(x, T) {#type_conversion_function-cast} -Преобразует x в тип данных t. -Поддерживается также синтаксис CAST(x AS t). +Преобразует вхожное значение `x` в указананный тип данных `T`. + +Поддерживается также синтаксис `CAST(x AS t)`. + +Обратите внимание, что если значение `x` не соответствует границам типа `T`, функция переполняется. Например, `CAST(-1, 'UInt8')` возвращает 255. **Пример** @@ -487,9 +490,44 @@ SELECT toTypeName(CAST(x, 'Nullable(UInt16)')) FROM t_null; - Настройка [cast_keep_nullable](../../operations/settings/settings.md#cast_keep_nullable) +## accurateCast(x, T) {#type_conversion_function-accurate-cast} + +Преобразует входное значение `x` в указанный тип данных `T`. + +Отличие от [cast(x, T)](#type_conversion_function-cast) в том, что `accurateCast` не допускает переполнения числовых типов, если значение типа `x` не соответствует границам типа `T`. Например, `accurateCast(-1, 'UInt8')` вернет ошибку. + +**Примеры** + +Запрос: + +``` sql +SELECT cast(-1, 'UInt8') as uint8; +``` + +Результат: + +``` text +┌─uint8─┐ +│ 255 │ +└───── + +Запрос: + +```sql +SELECT accurateCast(-1, 'UInt8') as uint8; +``` + +Результат: + +``` text +Code: 70. DB::Exception: Received from localhost:9000. DB::Exception: Value in column Int8 cannot be safely converted into type UInt8: While processing accurateCast(-1, 'UInt8') AS uint8. +``` + ## accurateCastOrNull(x, T) {#type_conversion_function-accurate-cast_or_null} -Преобразует входное значение `x` в указанный тип данных `T`. Всегда возвращает тип [Nullable](../../sql-reference/data-types/nullable.md), и возвращает [NULL](../../sql-reference/syntax.md#null-literal), если приведенное значение не может быть представлено в целевом типе. +Преобразует входное значение `x` в указанный тип данных `T`. + +Всегда возвращает тип [Nullable](../../sql-reference/data-types/nullable.md), и возвращает [NULL](../../sql-reference/syntax.md#null-literal), если приведенное значение не может быть представлено в целевом типе. 
**Синтаксис** @@ -522,9 +560,9 @@ SELECT toTypeName(accurateCastOrNull(5, 'UInt8')); ``` sql SELECT - cast(-1, 'UInt8') as uint8, - cast(128, 'Int8') as int8, - cast('Test', 'FixedString(2)') as fixed_string; + accurateCastOrNull(-1, 'UInt8') as uint8, + accurateCastOrNull(128, 'Int8') as int8, + accurateCastOrNull('Test', 'FixedString(2)') as fixed_string; ``` Результат: From d4580f9fb4b18d4bb9ec1e2870a8d35db06fa6ef Mon Sep 17 00:00:00 2001 From: Roman Bug Date: Thu, 11 Feb 2021 01:51:19 +0300 Subject: [PATCH 107/381] Update type-conversion-functions.md --- .../sql-reference/functions/type-conversion-functions.md | 7 ++++++- 1 file changed, 6 insertions(+), 1 deletion(-) diff --git a/docs/en/sql-reference/functions/type-conversion-functions.md b/docs/en/sql-reference/functions/type-conversion-functions.md index 83cbad6f53b..b452adbde60 100644 --- a/docs/en/sql-reference/functions/type-conversion-functions.md +++ b/docs/en/sql-reference/functions/type-conversion-functions.md @@ -427,7 +427,12 @@ Result: ## CAST(x, T) {#type_conversion_function-cast} -Converts input value `x` to the `T` data type. The syntax `CAST(x AS t)` is also supported. +Converts input value `x` to the `T` data type. + +The syntax `CAST(x AS t)` is also supported. + +Note, that if value `x` does not fit the bounds of type T, the function overflows. For example, CAST(-1, 'UInt8') returns 255. + **Example** From d4bd82c6c98eb2c4942ce80a42a8f543fd3865e9 Mon Sep 17 00:00:00 2001 From: Roman Bug Date: Thu, 11 Feb 2021 01:56:12 +0300 Subject: [PATCH 108/381] Update in.md Updates in IN from EN comments. --- docs/ru/sql-reference/operators/in.md | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/docs/ru/sql-reference/operators/in.md b/docs/ru/sql-reference/operators/in.md index d86d6f9ec57..c2d88a729be 100644 --- a/docs/ru/sql-reference/operators/in.md +++ b/docs/ru/sql-reference/operators/in.md @@ -17,7 +17,8 @@ SELECT (CounterID, UserID) IN ((34, 123), (101500, 456)) FROM ... В качестве правой части оператора может быть множество константных выражений, множество кортежей с константными выражениями (показано в примерах выше), а также имя таблицы или подзапрос SELECT в скобках. -ClickHouse допускает различные типы внутри подзапроса `IN`. Для левой стороны он применяет преобразование к типу правой стороны с помощью [accurateCastOrNull](../functions/type-conversion-functions.md#type_conversion_function-accurate-cast_or_null). +ClickHouse допускает различные типы в левой и правой частях подзапроса `IN`. +В этом случае он преобразует левую сторону в тип правой стороны, применяя функцию [accurateCastOrNull](../functions/type-conversion-functions.md#type_conversion_function-accurate-cast_or_null). **Пример** From 60f9f2e913fed325c4747fecbe0e1291265bc666 Mon Sep 17 00:00:00 2001 From: Roman Bug Date: Thu, 11 Feb 2021 02:03:23 +0300 Subject: [PATCH 109/381] Update type-conversion-functions.md Add Returned values --- docs/en/sql-reference/functions/type-conversion-functions.md | 4 ++++ 1 file changed, 4 insertions(+) diff --git a/docs/en/sql-reference/functions/type-conversion-functions.md b/docs/en/sql-reference/functions/type-conversion-functions.md index b452adbde60..268a7565b81 100644 --- a/docs/en/sql-reference/functions/type-conversion-functions.md +++ b/docs/en/sql-reference/functions/type-conversion-functions.md @@ -544,6 +544,10 @@ accurateCastOrNull(x, T) - `x` — Input value. - `T` — The name of the returned data type. +**Returned value** + +- The value in specified data type `T`. 
+ **Example** Query: From 37979c8b87d4747816446b1939248911a40ea081 Mon Sep 17 00:00:00 2001 From: Roman Bug Date: Thu, 11 Feb 2021 02:03:36 +0300 Subject: [PATCH 110/381] Update type-conversion-functions.md Add Returned values --- docs/ru/sql-reference/functions/type-conversion-functions.md | 4 ++++ 1 file changed, 4 insertions(+) diff --git a/docs/ru/sql-reference/functions/type-conversion-functions.md b/docs/ru/sql-reference/functions/type-conversion-functions.md index 3a6d2bd9ca0..e16fa438aed 100644 --- a/docs/ru/sql-reference/functions/type-conversion-functions.md +++ b/docs/ru/sql-reference/functions/type-conversion-functions.md @@ -540,6 +540,10 @@ accurateCastOrNull(x, T) - `x` — входное значение. - `T` — имя возвращаемого типа данных. +**Возвращаемое значение** + +- Значение, преобразованное в указанный тип `T`. + **Примеры** Запрос: From 3feded8d0cb562b7d0ed7a8c4bd4939f2524301c Mon Sep 17 00:00:00 2001 From: Roman Bug Date: Thu, 11 Feb 2021 02:03:53 +0300 Subject: [PATCH 111/381] Create type-conversion-functions.md Add Returned values From d539948fe72f3ee7c7e90a49cdffbc93d0a3749c Mon Sep 17 00:00:00 2001 From: Alexey Milovidov Date: Sun, 7 Feb 2021 04:41:31 +0300 Subject: [PATCH 112/381] In memory compression: a prototype --- src/Columns/ColumnVector.cpp | 51 ++++++++++++++++++ src/Columns/ColumnVector.h | 2 + src/Columns/IColumn.h | 11 ++++ src/Storages/MemorySettings.cpp | 36 +++++++++++++ src/Storages/MemorySettings.h | 26 +++++++++ src/Storages/StorageMemory.cpp | 96 +++++++++++++++++++++------------ src/Storages/StorageMemory.h | 16 +++++- src/Storages/StorageSet.cpp | 11 ++-- 8 files changed, 207 insertions(+), 42 deletions(-) create mode 100644 src/Storages/MemorySettings.cpp create mode 100644 src/Storages/MemorySettings.h diff --git a/src/Columns/ColumnVector.cpp b/src/Columns/ColumnVector.cpp index a075c10a8a9..59c8b5cf33b 100644 --- a/src/Columns/ColumnVector.cpp +++ b/src/Columns/ColumnVector.cpp @@ -16,6 +16,9 @@ #include #include #include +#include +#include +#include #include #include @@ -32,6 +35,8 @@ namespace ErrorCodes extern const int PARAMETER_OUT_OF_BOUND; extern const int SIZES_OF_COLUMNS_DOESNT_MATCH; extern const int LOGICAL_ERROR; + extern const int CANNOT_COMPRESS; + extern const int CANNOT_DECOMPRESS; } template @@ -520,6 +525,52 @@ void ColumnVector::getExtremes(Field & min, Field & max) const max = NearestFieldType(cur_max); } + +#pragma GCC diagnostic ignored "-Wold-style-cast" + +template +LazyColumn ColumnVector::compress() const +{ + size_t source_size = data.size() * sizeof(T); + size_t max_dest_size = LZ4_COMPRESSBOUND(source_size); + + if (max_dest_size > std::numeric_limits::max()) + throw Exception(ErrorCodes::CANNOT_COMPRESS, "Cannot compress column of size {}", formatReadableSizeWithBinarySuffix(source_size)); + + auto compressed = std::make_shared>(max_dest_size); + + auto compressed_size = LZ4_compress_default( + reinterpret_cast(data.data()), + compressed->data(), + source_size, + max_dest_size); + + if (compressed_size <= 0) + throw Exception(ErrorCodes::CANNOT_COMPRESS, "Cannot compress column"); + + /// If compression is inefficient. + if (static_cast(compressed_size) * 2 > source_size) + return IColumn::compress(); + + /// Shrink to fit. 
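    /// The destination buffer above was sized with LZ4_COMPRESSBOUND (the worst case), so
    /// typically only a prefix of it is used; copying that prefix into an exactly sized
    /// buffer avoids keeping the unused slack allocated for the lifetime of the compressed column.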
+ auto shrank = std::make_shared>(compressed_size); + memcpy(shrank->data(), compressed->data(), compressed_size); + + return [compressed = std::move(shrank), column_size = data.size()] + { + auto res = ColumnVector::create(column_size); + auto processed_size = LZ4_decompress_fast( + compressed->data(), + reinterpret_cast(res->getData().data()), + column_size * sizeof(T)); + + if (processed_size <= 0) + throw Exception(ErrorCodes::CANNOT_DECOMPRESS, "Cannot decompress column"); + + return res; + }; +} + /// Explicit template instantiations - to avoid code bloat in headers. template class ColumnVector; template class ColumnVector; diff --git a/src/Columns/ColumnVector.h b/src/Columns/ColumnVector.h index 1b13859bdee..4f1cbcafcbc 100644 --- a/src/Columns/ColumnVector.h +++ b/src/Columns/ColumnVector.h @@ -298,6 +298,8 @@ public: return typeid(rhs) == typeid(ColumnVector); } + LazyColumn compress() const override; + /// Replace elements that match the filter with zeroes. If inverted replaces not matched elements. void applyZeroMap(const IColumn::Filter & filt, bool inverted = false); diff --git a/src/Columns/IColumn.h b/src/Columns/IColumn.h index 824b5411744..d441e9f7c4e 100644 --- a/src/Columns/IColumn.h +++ b/src/Columns/IColumn.h @@ -357,6 +357,14 @@ public: throw Exception("Method structureEquals is not supported for " + getName(), ErrorCodes::NOT_IMPLEMENTED); } + /// Compress column in memory to some representation that allows to decompress it back. + using Lazy = std::function; + virtual Lazy compress() const + { + /// No compression by default, just wrap the object. + return [column = getPtr()] { return column; }; + } + static MutablePtr mutate(Ptr ptr) { @@ -462,6 +470,9 @@ using MutableColumns = std::vector; using ColumnRawPtrs = std::vector; //using MutableColumnRawPtrs = std::vector; +using LazyColumn = IColumn::Lazy; +using LazyColumns = std::vector; + template struct IsMutableColumns; diff --git a/src/Storages/MemorySettings.cpp b/src/Storages/MemorySettings.cpp new file mode 100644 index 00000000000..f5e182b3484 --- /dev/null +++ b/src/Storages/MemorySettings.cpp @@ -0,0 +1,36 @@ +#include +#include +#include +#include +#include + + +namespace DB +{ + +namespace ErrorCodes +{ + extern const int UNKNOWN_SETTING; +} + +IMPLEMENT_SETTINGS_TRAITS(memorySettingsTraits, MEMORY_SETTINGS) + +void MemorySettings::loadFromQuery(ASTStorage & storage_def) +{ + if (storage_def.settings) + { + try + { + applyChanges(storage_def.settings->changes); + } + catch (Exception & e) + { + if (e.code() == ErrorCodes::UNKNOWN_SETTING) + e.addMessage("for storage " + storage_def.engine->name); + throw; + } + } +} + +} + diff --git a/src/Storages/MemorySettings.h b/src/Storages/MemorySettings.h new file mode 100644 index 00000000000..4a1ba57475f --- /dev/null +++ b/src/Storages/MemorySettings.h @@ -0,0 +1,26 @@ +#pragma once + +#include + + +namespace DB +{ +class ASTStorage; + + +#define MEMORY_SETTINGS(M) \ + M(Bool, compress, true, "Compress data in memory", 0) \ + +DECLARE_SETTINGS_TRAITS(memorySettingsTraits, MEMORY_SETTINGS) + + +/** Settings for the Memory engine. + * Could be loaded from a CREATE TABLE query (SETTINGS clause). 
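  * A hypothetical usage example (table and column names are illustrative only):
  *     CREATE TABLE t (x UInt64) ENGINE = Memory SETTINGS compress = 1;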
+ */ +struct MemorySettings : public BaseSettings +{ + void loadFromQuery(ASTStorage & storage_def); +}; + +} + diff --git a/src/Storages/StorageMemory.cpp b/src/Storages/StorageMemory.cpp index 4530d93c274..a67eea0f28a 100644 --- a/src/Storages/StorageMemory.cpp +++ b/src/Storages/StorageMemory.cpp @@ -6,6 +6,7 @@ #include #include #include +#include #include #include @@ -23,7 +24,7 @@ namespace ErrorCodes class MemorySource : public SourceWithProgress { - using InitializerFunc = std::function &)>; + using InitializerFunc = std::function &)>; public: /// Blocks are stored in std::list which may be appended in another thread. /// We use pointer to the beginning of the list and its current size. @@ -34,7 +35,7 @@ public: Names column_names_, const StorageMemory & storage, const StorageMetadataPtr & metadata_snapshot, - std::shared_ptr data_, + std::shared_ptr data_, std::shared_ptr> parallel_execution_index_, InitializerFunc initializer_func_ = {}) : SourceWithProgress(metadata_snapshot->getSampleBlockForColumns(column_names_, storage.getVirtuals(), storage.getStorageID())) @@ -43,6 +44,8 @@ public: , parallel_execution_index(parallel_execution_index_) , initializer_func(std::move(initializer_func_)) { + for (const auto & elem : column_names_and_types) + column_positions.push_back(metadata_snapshot->getSampleBlock().getPositionByName(elem.getNameInStorage())); } String getName() const override { return "Memory"; } @@ -63,21 +66,25 @@ protected: return {}; } - const Block & src = (*data)[current_index]; + const LazyBlock & src = (*data)[current_index]; Columns columns; columns.reserve(columns.size()); /// Add only required columns to `res`. + size_t i = 0; for (const auto & elem : column_names_and_types) { - auto current_column = src.getByName(elem.getNameInStorage()).column; + auto current_column = src[column_positions[i]](); if (elem.isSubcolumn()) columns.emplace_back(elem.getTypeInStorage()->getSubcolumn(elem.getSubcolumnName(), *current_column)); else columns.emplace_back(std::move(current_column)); + + ++i; } - return Chunk(std::move(columns), src.rows()); + size_t rows = columns.at(0)->size(); + return Chunk(std::move(columns), rows); } private: @@ -95,9 +102,10 @@ private: const NamesAndTypesList column_names_and_types; size_t execution_index = 0; - std::shared_ptr data; + std::shared_ptr data; std::shared_ptr> parallel_execution_index; InitializerFunc initializer_func; + std::vector column_positions; }; @@ -149,8 +157,12 @@ private: }; -StorageMemory::StorageMemory(const StorageID & table_id_, ColumnsDescription columns_description_, ConstraintsDescription constraints_) - : IStorage(table_id_), data(std::make_unique()) +StorageMemory::StorageMemory( + const StorageID & table_id_, + ColumnsDescription columns_description_, + ConstraintsDescription constraints_, + bool compress_) + : IStorage(table_id_), data(std::make_unique()), compress(compress_) { StorageInMemoryMetadata storage_metadata; storage_metadata.setColumns(std::move(columns_description_)); @@ -186,7 +198,7 @@ Pipe StorageMemory::read( metadata_snapshot, nullptr /* data */, nullptr /* parallel execution index */, - [this](std::shared_ptr & data_to_initialize) + [this](std::shared_ptr & data_to_initialize) { data_to_initialize = data.get(); })); @@ -219,18 +231,18 @@ BlockOutputStreamPtr StorageMemory::write(const ASTPtr & /*query*/, const Storag void StorageMemory::drop() { - data.set(std::make_unique()); + data.set(std::make_unique()); total_size_bytes.store(0, std::memory_order_relaxed); total_size_rows.store(0, 
std::memory_order_relaxed); } -static inline void updateBlockData(Block & old_block, const Block & new_block) +static inline void updateBlockData(LazyBlock & old_block, const LazyBlock & new_block, const Block & old_header, const Block & new_header) { - for (const auto & it : new_block) + size_t i = 0; + for (const auto & it : new_header) { - auto col_name = it.name; - auto & col_with_type_name = old_block.getByName(col_name); - col_with_type_name.column = it.column; + old_block[old_header.getPositionByName(it.name)] = new_block[i]; + ++i; } } @@ -242,36 +254,47 @@ void StorageMemory::mutate(const MutationCommands & commands, const Context & co auto storage_ptr = DatabaseCatalog::instance().getTable(storage, context); auto interpreter = std::make_unique(storage_ptr, metadata_snapshot, commands, context, true); auto in = interpreter->execute(); + Block old_header = metadata_snapshot->getSampleBlock(); + Block mutation_header = in->getHeader(); in->readPrefix(); - Blocks out; - Block block; - while ((block = in->read())) + LazyBlocks out; + while (Block block = in->read()) { - out.push_back(block); + LazyColumns lazy_columns; + + for (const auto & elem : block) + { + if (compress) + lazy_columns.emplace_back(elem.column->compress()); + else + lazy_columns.emplace_back([=]{ return elem.column; }); + } + + out.emplace_back(std::move(lazy_columns)); } in->readSuffix(); - std::unique_ptr new_data; + std::unique_ptr new_data; - // all column affected + /// All columns affected. if (interpreter->isAffectingAllColumns()) { - new_data = std::make_unique(out); + new_data = std::make_unique(out); } else { - /// just some of the column affected, we need update it with new column - new_data = std::make_unique(*(data.get())); + /// Just some of the columns affected, we need update it with new column. + new_data = std::make_unique(*(data.get())); auto data_it = new_data->begin(); auto out_it = out.begin(); while (data_it != new_data->end()) { - /// Mutation does not change the number of blocks + /// Mutation does not change the number of blocks. 
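            /// Hence the stored blocks and the mutation output can be walked in lockstep,
            /// patching only the affected columns of each stored block via updateBlockData().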
assert(out_it != out.end()); - updateBlockData(*data_it, *out_it); + updateBlockData(*data_it, *out_it, old_header, mutation_header); ++data_it; ++out_it; } @@ -279,7 +302,7 @@ void StorageMemory::mutate(const MutationCommands & commands, const Context & co assert(out_it == out.end()); } - size_t rows = 0; +/* size_t rows = 0; size_t bytes = 0; for (const auto & buffer : *new_data) { @@ -287,7 +310,8 @@ void StorageMemory::mutate(const MutationCommands & commands, const Context & co bytes += buffer.bytes(); } total_size_bytes.store(rows, std::memory_order_relaxed); - total_size_rows.store(bytes, std::memory_order_relaxed); + total_size_rows.store(bytes, std::memory_order_relaxed);*/ + data.set(std::move(new_data)); } @@ -295,7 +319,7 @@ void StorageMemory::mutate(const MutationCommands & commands, const Context & co void StorageMemory::truncate( const ASTPtr &, const StorageMetadataPtr &, const Context &, TableExclusiveLockHolder &) { - data.set(std::make_unique()); + data.set(std::make_unique()); total_size_bytes.store(0, std::memory_order_relaxed); total_size_rows.store(0, std::memory_order_relaxed); } @@ -317,13 +341,19 @@ void registerStorageMemory(StorageFactory & factory) factory.registerStorage("Memory", [](const StorageFactory::Arguments & args) { if (!args.engine_args.empty()) - throw Exception( - "Engine " + args.engine_name + " doesn't support any arguments (" + toString(args.engine_args.size()) + " given)", - ErrorCodes::NUMBER_OF_ARGUMENTS_DOESNT_MATCH); + throw Exception(ErrorCodes::NUMBER_OF_ARGUMENTS_DOESNT_MATCH, + "Engine {} doesn't support any arguments ({} given)", + args.engine_name, args.engine_args.size()); - return StorageMemory::create(args.table_id, args.columns, args.constraints); + bool has_settings = args.storage_def->settings; + MemorySettings settings; + if (has_settings) + settings.loadFromQuery(*args.storage_def); + + return StorageMemory::create(args.table_id, args.columns, args.constraints, settings.compress); }, { + .supports_settings = true, .supports_parallel_insert = true, }); } diff --git a/src/Storages/StorageMemory.h b/src/Storages/StorageMemory.h index dc695427156..97ddfa93d9a 100644 --- a/src/Storages/StorageMemory.h +++ b/src/Storages/StorageMemory.h @@ -15,6 +15,11 @@ namespace DB { +/// Lazy block contains possibly compressed columns. LazyColumn is std::function that reconstructs Column on call. +using LazyBlock = LazyColumns; +using LazyBlocks = std::vector; + + /** Implements storage in the RAM. * Suitable for temporary data. * It does not support keys. @@ -95,7 +100,8 @@ public: private: /// MultiVersion data storage, so that we can copy the list of blocks to readers. 
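    /// Readers grab a snapshot pointer to the current list of blocks and keep using it,
    /// while writers build a new list and swap it in under the mutex below, so reads
    /// presumably never block on insertions or mutations.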
- MultiVersion data; + + MultiVersion data; mutable std::mutex mutex; @@ -104,8 +110,14 @@ private: std::atomic total_size_bytes = 0; std::atomic total_size_rows = 0; + bool compress; + protected: - StorageMemory(const StorageID & table_id_, ColumnsDescription columns_description_, ConstraintsDescription constraints_); + StorageMemory( + const StorageID & table_id_, + ColumnsDescription columns_description_, + ConstraintsDescription constraints_, + bool compress_ = false); }; } diff --git a/src/Storages/StorageSet.cpp b/src/Storages/StorageSet.cpp index e518c7da0e4..d64042f0c1e 100644 --- a/src/Storages/StorageSet.cpp +++ b/src/Storages/StorageSet.cpp @@ -242,15 +242,12 @@ void registerStorageSet(StorageFactory & factory) ErrorCodes::NUMBER_OF_ARGUMENTS_DOESNT_MATCH); bool has_settings = args.storage_def->settings; - - auto set_settings = std::make_unique(); + SetSettings set_settings; if (has_settings) - { - set_settings->loadFromQuery(*args.storage_def); - } + set_settings.loadFromQuery(*args.storage_def); - DiskPtr disk = args.context.getDisk(set_settings->disk); - return StorageSet::create(disk, args.relative_data_path, args.table_id, args.columns, args.constraints, set_settings->persistent); + DiskPtr disk = args.context.getDisk(set_settings.disk); + return StorageSet::create(disk, args.relative_data_path, args.table_id, args.columns, args.constraints, set_settings.persistent); }, StorageFactory::StorageFeatures{ .supports_settings = true, }); } From 280f459f71513752696a2fcc9753aae4a7e342b2 Mon Sep 17 00:00:00 2001 From: Alexey Milovidov Date: Sun, 7 Feb 2021 05:40:06 +0300 Subject: [PATCH 113/381] Fix quadratic INSERT --- src/Storages/StorageMemory.cpp | 37 ++++++++++++++++++++++------------ 1 file changed, 24 insertions(+), 13 deletions(-) diff --git a/src/Storages/StorageMemory.cpp b/src/Storages/StorageMemory.cpp index a67eea0f28a..20c8a44efd4 100644 --- a/src/Storages/StorageMemory.cpp +++ b/src/Storages/StorageMemory.cpp @@ -125,23 +125,32 @@ public: void write(const Block & block) override { metadata_snapshot->check(block, true); - new_blocks.emplace_back(block); + + inserted_bytes += block.allocatedBytes(); + inserted_rows += block.rows(); + + Block sample = metadata_snapshot->getSampleBlock(); + + LazyColumns lazy_columns; + lazy_columns.reserve(sample.columns()); + + for (const auto & elem : sample) + { + const ColumnPtr & column = block.getByName(elem.name).column; + + if (storage.compress) + lazy_columns.emplace_back(column->compress()); + else + lazy_columns.emplace_back([=]{ return column; }); + } + + new_blocks.emplace_back(std::move(lazy_columns)); } void writeSuffix() override { - size_t inserted_bytes = 0; - size_t inserted_rows = 0; - - for (const auto & block : new_blocks) - { - inserted_bytes += block.allocatedBytes(); - inserted_rows += block.rows(); - } - std::lock_guard lock(storage.mutex); - - auto new_data = std::make_unique(*(storage.data.get())); + auto new_data = std::make_unique(*(storage.data.get())); new_data->insert(new_data->end(), new_blocks.begin(), new_blocks.end()); storage.data.set(std::move(new_data)); @@ -150,7 +159,9 @@ public: } private: - Blocks new_blocks; + LazyBlocks new_blocks; + size_t inserted_bytes = 0; + size_t inserted_rows = 0; StorageMemory & storage; StorageMetadataPtr metadata_snapshot; From 58f1d4d910a2b6d34f484ff742df85e421276391 Mon Sep 17 00:00:00 2001 From: Alexey Milovidov Date: Sun, 7 Feb 2021 06:00:31 +0300 Subject: [PATCH 114/381] Add comment to config --- programs/server/config.xml | 5 +++++ 1 file changed, 5 
insertions(+) diff --git a/programs/server/config.xml b/programs/server/config.xml index 849d3dc32ba..571a8c6cf75 100644 --- a/programs/server/config.xml +++ b/programs/server/config.xml @@ -284,6 +284,11 @@ In bytes. Cache is single for server. Memory is allocated only on demand. Cache is used when 'use_uncompressed_cache' user setting turned on (off by default). Uncompressed cache is advantageous only for very short queries and in rare cases. + + Note: uncompressed cache is pointless for lz4, because memory bandwidth is slower than multi-core decompression. + Enabling it will only make queries slower. + If number of CPU cores is in order of 100 and memory bandwidth is in range of 100-200 GB/sec, + there is a chance it is also being pointless for zstd. --> 8589934592 From 4d650a2a5621723f4466db263a8602cb04e6d40b Mon Sep 17 00:00:00 2001 From: Alexey Milovidov Date: Sun, 7 Feb 2021 06:03:13 +0300 Subject: [PATCH 115/381] Adjust config --- programs/server/users.xml | 3 --- src/Core/Settings.h | 2 +- 2 files changed, 1 insertion(+), 4 deletions(-) diff --git a/programs/server/users.xml b/programs/server/users.xml index 3223d855651..ef66891a6a0 100644 --- a/programs/server/users.xml +++ b/programs/server/users.xml @@ -7,9 +7,6 @@ 10000000000 - - 0 - From d9b85874c0139a3936cc15d85c3869ec22959a36 Mon Sep 17 00:00:00 2001 From: lehasm Date: Fri, 12 Feb 2021 22:52:02 +0300 Subject: [PATCH 126/381] welchttest, mannwhitneyutest markup fixed --- .../aggregate-functions/reference/mannwhitneyutest.md | 1 + .../en/sql-reference/aggregate-functions/reference/welchttest.md | 1 + 2 files changed, 2 insertions(+) diff --git a/docs/en/sql-reference/aggregate-functions/reference/mannwhitneyutest.md b/docs/en/sql-reference/aggregate-functions/reference/mannwhitneyutest.md index 012df7052aa..bc808ab0a9e 100644 --- a/docs/en/sql-reference/aggregate-functions/reference/mannwhitneyutest.md +++ b/docs/en/sql-reference/aggregate-functions/reference/mannwhitneyutest.md @@ -30,6 +30,7 @@ The null hypothesis is that two populations are stochastically equal. Also one-s **Returned values** [Tuple](../../../sql-reference/data-types/tuple.md) with two elements: + - calculated U-statistic. [Float64](../../../sql-reference/data-types/float.md). - calculated p-value. [Float64](../../../sql-reference/data-types/float.md). diff --git a/docs/en/sql-reference/aggregate-functions/reference/welchttest.md b/docs/en/sql-reference/aggregate-functions/reference/welchttest.md index 3fe1c9d58b9..44c320c4565 100644 --- a/docs/en/sql-reference/aggregate-functions/reference/welchttest.md +++ b/docs/en/sql-reference/aggregate-functions/reference/welchttest.md @@ -24,6 +24,7 @@ The null hypothesis is that means of populations are equal. Normal distribution **Returned values** [Tuple](../../../sql-reference/data-types/tuple.md) with two elements: + - calculated t-statistic. [Float64](../../../sql-reference/data-types/float.md). - calculated p-value. [Float64](../../../sql-reference/data-types/float.md). 
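For context, a minimal query of the kind these reference pages describe could look as follows; the table and column names are hypothetical and not taken from the patch, and the snippet is only a sketch of how the returned tuple is consumed:

``` sql
SELECT welchTTest(sample_data, sample_index) AS result
FROM table_with_samples;

-- `result` is a tuple: (t-statistic, p-value)
```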
From 00ac1e691abbae0f656a4d913ac489d52ad9c3e4 Mon Sep 17 00:00:00 2001 From: lehasm Date: Fri, 12 Feb 2021 23:01:47 +0300 Subject: [PATCH 127/381] studentttest, welchttest, mannwhitneyutest markup fixed (ru) --- .../aggregate-functions/reference/mannwhitneyutest.md | 1 + .../sql-reference/aggregate-functions/reference/studentttest.md | 1 + .../ru/sql-reference/aggregate-functions/reference/welchttest.md | 1 + 3 files changed, 3 insertions(+) diff --git a/docs/ru/sql-reference/aggregate-functions/reference/mannwhitneyutest.md b/docs/ru/sql-reference/aggregate-functions/reference/mannwhitneyutest.md index fb73fff5f00..a4647ecfb34 100644 --- a/docs/ru/sql-reference/aggregate-functions/reference/mannwhitneyutest.md +++ b/docs/ru/sql-reference/aggregate-functions/reference/mannwhitneyutest.md @@ -31,6 +31,7 @@ mannWhitneyUTest[(alternative[, continuity_correction])](sample_data, sample_ind **Возвращаемые значения** [Кортеж](../../../sql-reference/data-types/tuple.md) с двумя элементами: + - вычисленное значение критерия Манна — Уитни. [Float64](../../../sql-reference/data-types/float.md). - вычисленное p-значение. [Float64](../../../sql-reference/data-types/float.md). diff --git a/docs/ru/sql-reference/aggregate-functions/reference/studentttest.md b/docs/ru/sql-reference/aggregate-functions/reference/studentttest.md index 5361e06c5e2..77378de95d1 100644 --- a/docs/ru/sql-reference/aggregate-functions/reference/studentttest.md +++ b/docs/ru/sql-reference/aggregate-functions/reference/studentttest.md @@ -24,6 +24,7 @@ studentTTest(sample_data, sample_index) **Возвращаемые значения** [Кортеж](../../../sql-reference/data-types/tuple.md) с двумя элементами: + - вычисленное значение критерия Стьюдента. [Float64](../../../sql-reference/data-types/float.md). - вычисленное p-значение. [Float64](../../../sql-reference/data-types/float.md). diff --git a/docs/ru/sql-reference/aggregate-functions/reference/welchttest.md b/docs/ru/sql-reference/aggregate-functions/reference/welchttest.md index 1f36b2d04ee..16c122d1b49 100644 --- a/docs/ru/sql-reference/aggregate-functions/reference/welchttest.md +++ b/docs/ru/sql-reference/aggregate-functions/reference/welchttest.md @@ -24,6 +24,7 @@ welchTTest(sample_data, sample_index) **Возвращаемые значения** [Кортеж](../../../sql-reference/data-types/tuple.md) с двумя элементами: + - вычисленное значение критерия Уэлча. [Float64](../../../sql-reference/data-types/float.md). - вычисленное p-значение. [Float64](../../../sql-reference/data-types/float.md). 
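The one-sided form documented in the Russian page above would, under the same assumptions (hypothetical table and column names), be invoked along these lines:

``` sql
SELECT mannWhitneyUTest('greater')(sample_data, sample_index) AS result
FROM table_with_samples;

-- `result` is a tuple: (U-statistic, p-value)
```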
From 1c656830fc32606cbc52699beb775f80b7094243 Mon Sep 17 00:00:00 2001 From: Alexey Milovidov Date: Sat, 13 Feb 2021 00:26:12 +0300 Subject: [PATCH 128/381] Fix clang-tidy --- src/Storages/StorageMemory.cpp | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/src/Storages/StorageMemory.cpp b/src/Storages/StorageMemory.cpp index 01f70db5edd..d7b0ae055ab 100644 --- a/src/Storages/StorageMemory.cpp +++ b/src/Storages/StorageMemory.cpp @@ -124,7 +124,7 @@ public: if (storage.compress) { Block compressed_block; - for (auto & elem : block) + for (const auto & elem : block) compressed_block.insert({ elem.column->compress(), elem.type, elem.name }); new_blocks.emplace_back(compressed_block); @@ -351,6 +351,7 @@ void registerStorageMemory(StorageFactory & factory) return StorageMemory::create(args.table_id, args.columns, args.constraints, settings.compress); }, { + .supports_settings = true, .supports_parallel_insert = true, }); } From 453450985f9b5452779b6b4a7ec6c0a44105e3dc Mon Sep 17 00:00:00 2001 From: Alexey Milovidov Date: Sat, 13 Feb 2021 00:26:25 +0300 Subject: [PATCH 129/381] Performance improvement by Nikolai Kochetov --- src/Storages/StorageMemory.h | 2 ++ 1 file changed, 2 insertions(+) diff --git a/src/Storages/StorageMemory.h b/src/Storages/StorageMemory.h index 91cf616c57d..db71c13ca99 100644 --- a/src/Storages/StorageMemory.h +++ b/src/Storages/StorageMemory.h @@ -45,6 +45,8 @@ public: /// Smaller blocks (e.g. 64K rows) are better for CPU cache. bool prefersLargeBlocks() const override { return false; } + bool hasEvenlyDistributedRead() const override { return true; } + BlockOutputStreamPtr write(const ASTPtr & query, const StorageMetadataPtr & metadata_snapshot, const Context & context) override; void drop() override; From b5826121db6379acb5eb54e800ba73bd8cf0cd06 Mon Sep 17 00:00:00 2001 From: Alexey Milovidov Date: Sat, 13 Feb 2021 00:29:06 +0300 Subject: [PATCH 130/381] Fix Arcadia --- src/Columns/ya.make | 1 + src/Columns/ya.make.in | 1 + 2 files changed, 2 insertions(+) diff --git a/src/Columns/ya.make b/src/Columns/ya.make index def9dfd4cb7..061391b5214 100644 --- a/src/Columns/ya.make +++ b/src/Columns/ya.make @@ -13,6 +13,7 @@ PEERDIR( clickhouse/src/Common contrib/libs/icu contrib/libs/pdqsort + contrib/libs/lz4 ) SRCS( diff --git a/src/Columns/ya.make.in b/src/Columns/ya.make.in index 677a5bcbd70..4422d222ce1 100644 --- a/src/Columns/ya.make.in +++ b/src/Columns/ya.make.in @@ -12,6 +12,7 @@ PEERDIR( clickhouse/src/Common contrib/libs/icu contrib/libs/pdqsort + contrib/libs/lz4 ) SRCS( From 10d773d67154d67c2fa975f5c8d46c8f9ccfb5a6 Mon Sep 17 00:00:00 2001 From: lehasm Date: Sat, 13 Feb 2021 22:35:53 +0300 Subject: [PATCH 131/381] HTTP compression info updated xz compression method added. Text rearranged and edited. Examples improved. --- docs/en/interfaces/http.md | 38 +++++++++++++++++++++++++++----------- 1 file changed, 27 insertions(+), 11 deletions(-) diff --git a/docs/en/interfaces/http.md b/docs/en/interfaces/http.md index 310286e3d44..84c1e268e07 100644 --- a/docs/en/interfaces/http.md +++ b/docs/en/interfaces/http.md @@ -148,25 +148,41 @@ $ echo 'DROP TABLE t' | curl 'http://localhost:8123/' --data-binary @- For successful requests that don’t return a data table, an empty response body is returned. -You can use the internal ClickHouse compression format when transmitting data. 
The compressed data has a non-standard format, and you will need to use the special `clickhouse-compressor` program to work with it (it is installed with the `clickhouse-client` package). To increase the efficiency of data insertion, you can disable server-side checksum verification by using the [http_native_compression_disable_checksumming_on_decompress](../operations/settings/settings.md#settings-http_native_compression_disable_checksumming_on_decompress) setting. -If you specified `compress=1` in the URL, the server compresses the data it sends you. -If you specified `decompress=1` in the URL, the server decompresses the same data that you pass in the `POST` method. +## Compression {#compression} -You can also choose to use [HTTP compression](https://en.wikipedia.org/wiki/HTTP_compression). To send a compressed `POST` request, append the request header `Content-Encoding: compression_method`. In order for ClickHouse to compress the response, you must append `Accept-Encoding: compression_method`. ClickHouse supports `gzip`, `br`, and `deflate` [compression methods](https://en.wikipedia.org/wiki/HTTP_compression#Content-Encoding_tokens). To enable HTTP compression, you must use the ClickHouse [enable_http_compression](../operations/settings/settings.md#settings-enable_http_compression) setting. You can configure the data compression level in the [http_zlib_compression_level](#settings-http_zlib_compression_level) setting for all the compression methods. +You can use compression to reduce network traffic when transmitting a large amount of data or for creating dumps that are immediately compressed. -You can use this to reduce network traffic when transmitting a large amount of data, or for creating dumps that are immediately compressed. +You can use the internal ClickHouse compression format when transmitting data. The compressed data has a non-standard format, and you need `clickhouse-compressor` program to work with it. It is installed with the `clickhouse-client` package. To increase the efficiency of data insertion, you can disable server-side checksum verification by using the [http_native_compression_disable_checksumming_on_decompress](../operations/settings/settings.md#settings-http_native_compression_disable_checksumming_on_decompress) setting. -Examples of sending data with compression: +If you specify `compress=1` in the URL, the server will compress the data it sends to you. +If you specify `decompress=1` in the URL, the server will decompress the data which you pass in the `POST` method. +You can also choose to use [HTTP compression](https://en.wikipedia.org/wiki/HTTP_compression). ClickHouse supports the following [compression methods](https://en.wikipedia.org/wiki/HTTP_compression#Content-Encoding_tokens): + +- `gzip` +- `br` +- `deflate` +- `xz` + +To send a compressed `POST` request, append the request header `Content-Encoding: compression_method`. 
Example: ``` bash -#Sending data to the server: -$ curl -vsS "http://localhost:8123/?enable_http_compression=1" -d 'SELECT number FROM system.numbers LIMIT 10' -H 'Accept-Encoding: gzip' - -#Sending data to the client: -$ echo "SELECT 1" | gzip -c | curl -sS --data-binary @- -H 'Content-Encoding: gzip' 'http://localhost:8123/' +$ echo "SELECT 1" | gzip -c | \ + curl -sS --data-binary @- -H 'Content-Encoding: gzip' 'http://localhost:8123/' ``` +In order for ClickHouse to compress the response, enable compression with [enable_http_compression](../operations/settings/settings.md#settings-enable_http_compression) setting and append `Accept-Encoding: compression_method` header to the request. You can configure the data compression level in the [http_zlib_compression_level](../operations/settings/settings.md#settings-http_zlib_compression_level) setting for all compression methods. +``` bash +$ curl -vsS "http://localhost:8123/?enable_http_compression=1" \ + -H 'Accept-Encoding: gzip' --output result.gz -d 'SELECT number FROM system.numbers LIMIT 3' +$ zcat result.gz +0 +1 +2 +``` + +## Default Database {#default-database} + !!! note "Note" Some HTTP clients might decompress data from the server by default (with `gzip` and `deflate`) and you might get decompressed data even if you use the compression settings correctly. From d4ba07c5c6737f2c978331969d6b7c4ce535613c Mon Sep 17 00:00:00 2001 From: lehasm Date: Sat, 13 Feb 2021 23:26:56 +0300 Subject: [PATCH 132/381] Fix missplaced header --- docs/en/interfaces/http.md | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) diff --git a/docs/en/interfaces/http.md b/docs/en/interfaces/http.md index 84c1e268e07..d82d8baeb75 100644 --- a/docs/en/interfaces/http.md +++ b/docs/en/interfaces/http.md @@ -181,11 +181,12 @@ $ zcat result.gz 2 ``` -## Default Database {#default-database} - !!! note "Note" Some HTTP clients might decompress data from the server by default (with `gzip` and `deflate`) and you might get decompressed data even if you use the compression settings correctly. + +## Default Database {#default-database} + You can use the ‘database’ URL parameter or the ‘X-ClickHouse-Database’ header to specify the default database. ``` bash From b13d1f31422fe52f944ca95fe11276791434815d Mon Sep 17 00:00:00 2001 From: Alexey Milovidov Date: Sun, 14 Feb 2021 04:34:42 +0300 Subject: [PATCH 133/381] Fix integration test --- tests/integration/test_settings_profile/test.py | 10 +++++----- 1 file changed, 5 insertions(+), 5 deletions(-) diff --git a/tests/integration/test_settings_profile/test.py b/tests/integration/test_settings_profile/test.py index 3ceef9f25cf..1945875bf53 100644 --- a/tests/integration/test_settings_profile/test.py +++ b/tests/integration/test_settings_profile/test.py @@ -46,7 +46,7 @@ def reset_after_test(): def test_smoke(): - # Set settings and constraints via CREATE SETTINGS PROFILE ... TO user + # Set settings and constraints via CREATE SETTINGS PROFILE ... 
TO user instance.query( "CREATE SETTINGS PROFILE xyz SETTINGS max_memory_usage = 100000001 MIN 90000000 MAX 110000000 TO robin") assert instance.query( @@ -194,13 +194,13 @@ def test_show_profiles(): assert instance.query("SHOW CREATE PROFILE xyz") == "CREATE SETTINGS PROFILE xyz\n" assert instance.query( - "SHOW CREATE SETTINGS PROFILE default") == "CREATE SETTINGS PROFILE default SETTINGS max_memory_usage = 10000000000, use_uncompressed_cache = 0, load_balancing = \\'random\\'\n" + "SHOW CREATE SETTINGS PROFILE default") == "CREATE SETTINGS PROFILE default SETTINGS max_memory_usage = 10000000000, load_balancing = \\'random\\'\n" assert instance.query( - "SHOW CREATE PROFILES") == "CREATE SETTINGS PROFILE default SETTINGS max_memory_usage = 10000000000, use_uncompressed_cache = 0, load_balancing = \\'random\\'\n" \ + "SHOW CREATE PROFILES") == "CREATE SETTINGS PROFILE default SETTINGS max_memory_usage = 10000000000, load_balancing = \\'random\\'\n" \ "CREATE SETTINGS PROFILE readonly SETTINGS readonly = 1\n" \ "CREATE SETTINGS PROFILE xyz\n" - expected_access = "CREATE SETTINGS PROFILE default SETTINGS max_memory_usage = 10000000000, use_uncompressed_cache = 0, load_balancing = \\'random\\'\n" \ + expected_access = "CREATE SETTINGS PROFILE default SETTINGS max_memory_usage = 10000000000, load_balancing = \\'random\\'\n" \ "CREATE SETTINGS PROFILE readonly SETTINGS readonly = 1\n" \ "CREATE SETTINGS PROFILE xyz\n" assert expected_access in instance.query("SHOW ACCESS") @@ -210,7 +210,7 @@ def test_allow_ddl(): assert "it's necessary to have grant" in instance.query_and_get_error("CREATE TABLE tbl(a Int32) ENGINE=Log", user="robin") assert "it's necessary to have grant" in instance.query_and_get_error("GRANT CREATE ON tbl TO robin", user="robin") assert "DDL queries are prohibited" in instance.query_and_get_error("CREATE TABLE tbl(a Int32) ENGINE=Log", settings={"allow_ddl": 0}) - + instance.query("GRANT CREATE ON tbl TO robin") instance.query("CREATE TABLE tbl(a Int32) ENGINE=Log", user="robin") instance.query("DROP TABLE tbl") From 96dc69609c9def6dc5f457e67529e106f55ffccd Mon Sep 17 00:00:00 2001 From: Alexey Milovidov Date: Mon, 15 Feb 2021 00:00:18 +0300 Subject: [PATCH 134/381] Fix Arcadia --- src/Columns/ya.make | 1 + src/Columns/ya.make.in | 1 + 2 files changed, 2 insertions(+) diff --git a/src/Columns/ya.make b/src/Columns/ya.make index 061391b5214..54dd02609ff 100644 --- a/src/Columns/ya.make +++ b/src/Columns/ya.make @@ -7,6 +7,7 @@ ADDINCL( contrib/libs/icu/common contrib/libs/icu/i18n contrib/libs/pdqsort + contrib/libs/lz4 ) PEERDIR( diff --git a/src/Columns/ya.make.in b/src/Columns/ya.make.in index 4422d222ce1..846e2c6c3bd 100644 --- a/src/Columns/ya.make.in +++ b/src/Columns/ya.make.in @@ -6,6 +6,7 @@ ADDINCL( contrib/libs/icu/common contrib/libs/icu/i18n contrib/libs/pdqsort + contrib/libs/lz4 ) PEERDIR( From c24221b04f1bc511cc0a9524e6e2388c03d08246 Mon Sep 17 00:00:00 2001 From: Roman Bug Date: Mon, 15 Feb 2021 01:53:44 +0300 Subject: [PATCH 135/381] Update docs/en/sql-reference/functions/type-conversion-functions.md Co-authored-by: olgarev <56617294+olgarev@users.noreply.github.com> --- docs/en/sql-reference/functions/type-conversion-functions.md | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/docs/en/sql-reference/functions/type-conversion-functions.md b/docs/en/sql-reference/functions/type-conversion-functions.md index 268a7565b81..c80f8934f72 100644 --- a/docs/en/sql-reference/functions/type-conversion-functions.md +++ 
b/docs/en/sql-reference/functions/type-conversion-functions.md @@ -546,7 +546,7 @@ accurateCastOrNull(x, T) **Returned value** -- The value in specified data type `T`. +- The value, converted to the specified data type `T`. **Example** From cdac3cf9ce17391479681444b48e005dc24327d7 Mon Sep 17 00:00:00 2001 From: Roman Bug Date: Mon, 15 Feb 2021 01:53:51 +0300 Subject: [PATCH 136/381] Update docs/ru/sql-reference/functions/type-conversion-functions.md Co-authored-by: olgarev <56617294+olgarev@users.noreply.github.com> --- docs/ru/sql-reference/functions/type-conversion-functions.md | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/docs/ru/sql-reference/functions/type-conversion-functions.md b/docs/ru/sql-reference/functions/type-conversion-functions.md index e16fa438aed..985dd16c231 100644 --- a/docs/ru/sql-reference/functions/type-conversion-functions.md +++ b/docs/ru/sql-reference/functions/type-conversion-functions.md @@ -423,7 +423,7 @@ SELECT uuid = uuid2; ## CAST(x, T) {#type_conversion_function-cast} -Преобразует вхожное значение `x` в указананный тип данных `T`. +Преобразует входное значение `x` в указанный тип данных `T`. Поддерживается также синтаксис `CAST(x AS t)`. From cda9dc7600880ee35582cfe1d98d15bd4df43c28 Mon Sep 17 00:00:00 2001 From: Roman Bug Date: Mon, 15 Feb 2021 01:54:02 +0300 Subject: [PATCH 137/381] Update docs/ru/sql-reference/functions/type-conversion-functions.md Co-authored-by: olgarev <56617294+olgarev@users.noreply.github.com> --- docs/ru/sql-reference/functions/type-conversion-functions.md | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/docs/ru/sql-reference/functions/type-conversion-functions.md b/docs/ru/sql-reference/functions/type-conversion-functions.md index 985dd16c231..3c9d3993120 100644 --- a/docs/ru/sql-reference/functions/type-conversion-functions.md +++ b/docs/ru/sql-reference/functions/type-conversion-functions.md @@ -427,7 +427,7 @@ SELECT uuid = uuid2; Поддерживается также синтаксис `CAST(x AS t)`. -Обратите внимание, что если значение `x` не соответствует границам типа `T`, функция переполняется. Например, `CAST(-1, 'UInt8')` возвращает 255. +Обратите внимание, что если значение `x` не может быть преобразовано к типу `T`, возникает переполнение. Например, `CAST(-1, 'UInt8')` возвращает 255. **Пример** From b82bf79c5245092fea0a866f3cae2934262d66d6 Mon Sep 17 00:00:00 2001 From: Roman Bug Date: Mon, 15 Feb 2021 01:54:10 +0300 Subject: [PATCH 138/381] Update docs/ru/sql-reference/functions/type-conversion-functions.md Co-authored-by: olgarev <56617294+olgarev@users.noreply.github.com> --- docs/ru/sql-reference/functions/type-conversion-functions.md | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/docs/ru/sql-reference/functions/type-conversion-functions.md b/docs/ru/sql-reference/functions/type-conversion-functions.md index 3c9d3993120..16e52efceec 100644 --- a/docs/ru/sql-reference/functions/type-conversion-functions.md +++ b/docs/ru/sql-reference/functions/type-conversion-functions.md @@ -494,7 +494,7 @@ SELECT toTypeName(CAST(x, 'Nullable(UInt16)')) FROM t_null; Преобразует входное значение `x` в указанный тип данных `T`. -Отличие от [cast(x, T)](#type_conversion_function-cast) в том, что `accurateCast` не допускает переполнения числовых типов, если значение типа `x` не соответствует границам типа `T`. Например, `accurateCast(-1, 'UInt8')` вернет ошибку. 
+В отличие от функции [cast(x, T)](#type_conversion_function-cast), `accurateCast` не допускает переполнения при преобразовании числовых типов. Например, `accurateCast(-1, 'UInt8')` вызовет исключение. **Примеры** From 82701ecbeccf88f38a73ccb0ea556267d2fa99a0 Mon Sep 17 00:00:00 2001 From: Roman Bug Date: Mon, 15 Feb 2021 01:54:15 +0300 Subject: [PATCH 139/381] Update docs/ru/sql-reference/functions/type-conversion-functions.md Co-authored-by: olgarev <56617294+olgarev@users.noreply.github.com> --- docs/ru/sql-reference/functions/type-conversion-functions.md | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/docs/ru/sql-reference/functions/type-conversion-functions.md b/docs/ru/sql-reference/functions/type-conversion-functions.md index 16e52efceec..0723ed2c752 100644 --- a/docs/ru/sql-reference/functions/type-conversion-functions.md +++ b/docs/ru/sql-reference/functions/type-conversion-functions.md @@ -527,7 +527,7 @@ Code: 70. DB::Exception: Received from localhost:9000. DB::Exception: Value in c Преобразует входное значение `x` в указанный тип данных `T`. -Всегда возвращает тип [Nullable](../../sql-reference/data-types/nullable.md), и возвращает [NULL](../../sql-reference/syntax.md#null-literal), если приведенное значение не может быть представлено в целевом типе. +Всегда возвращает тип [Nullable](../../sql-reference/data-types/nullable.md). Если исходное значение не может быть преобразовано к целевому типу, возвращает [NULL](../../sql-reference/syntax.md#null-literal). **Синтаксис** From 994b998df9863e772b438a858a2cdabdb2ce27ea Mon Sep 17 00:00:00 2001 From: Roman Bug Date: Mon, 15 Feb 2021 01:54:20 +0300 Subject: [PATCH 140/381] Update docs/ru/sql-reference/operators/in.md Co-authored-by: olgarev <56617294+olgarev@users.noreply.github.com> --- docs/ru/sql-reference/operators/in.md | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/docs/ru/sql-reference/operators/in.md b/docs/ru/sql-reference/operators/in.md index c2d88a729be..e0412747898 100644 --- a/docs/ru/sql-reference/operators/in.md +++ b/docs/ru/sql-reference/operators/in.md @@ -17,8 +17,7 @@ SELECT (CounterID, UserID) IN ((34, 123), (101500, 456)) FROM ... В качестве правой части оператора может быть множество константных выражений, множество кортежей с константными выражениями (показано в примерах выше), а также имя таблицы или подзапрос SELECT в скобках. -ClickHouse допускает различные типы в левой и правой частях подзапроса `IN`. -В этом случае он преобразует левую сторону в тип правой стороны, применяя функцию [accurateCastOrNull](../functions/type-conversion-functions.md#type_conversion_function-accurate-cast_or_null). +Если типы данных в левой и правой частях подзапроса `IN` различаются, ClickHouse преобразует значение в левой части к типу данных из правой части. Преобразование выполняется по аналогии с функцией [accurateCastOrNull](../functions/type-conversion-functions.md#type_conversion_function-accurate-cast_or_null), т.е. тип данных становится [Nullable](../../sql-reference/data-types/nullable.md), а если преобразование не может быть выполнено, возвращается значение [NULL](../../sql-reference/syntax.md#null-literal). 
**Пример** From 2a71053c695ee6deb84d8583c51dec0cc74dcdb1 Mon Sep 17 00:00:00 2001 From: Roman Bug Date: Mon, 15 Feb 2021 01:54:25 +0300 Subject: [PATCH 141/381] Update docs/en/sql-reference/operators/in.md Co-authored-by: olgarev <56617294+olgarev@users.noreply.github.com> --- docs/en/sql-reference/operators/in.md | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/docs/en/sql-reference/operators/in.md b/docs/en/sql-reference/operators/in.md index 4796c0f6bc0..34866f3d09a 100644 --- a/docs/en/sql-reference/operators/in.md +++ b/docs/en/sql-reference/operators/in.md @@ -17,7 +17,7 @@ Don’t list too many values explicitly (i.e. millions). If a data set is large The right side of the operator can be a set of constant expressions, a set of tuples with constant expressions (shown in the examples above), or the name of a database table or SELECT subquery in brackets. -ClickHouse allows different types in the left and right parts of `IN` subquery. In this case it converts the left hand side to the type of the right hand side as if the [accurateCastOrNull](../functions/type-conversion-functions.md#type_conversion_function-accurate-cast_or_null) function is applied. +ClickHouse allows types to differ in the left and the right parts of `IN` subquery. In this case it converts the left side value to the type of the right side, as if the [accurateCastOrNull](../functions/type-conversion-functions.md#type_conversion_function-accurate-cast_or_null) function is applied. That means, that the data type becomes [Nullable](../../sql-reference/data-types/nullable.md), and if the conversion cannot be performed, it returns [NULL](../../sql-reference/syntax.md#null-literal). **Example** From 320fd6b264db77de1ef335c0025c5487868e9ddb Mon Sep 17 00:00:00 2001 From: Alexander Tokmakov Date: Mon, 15 Feb 2021 03:04:46 +0300 Subject: [PATCH 142/381] startup without zk --- src/Databases/DatabaseReplicated.cpp | 169 ++++++++++++------ src/Databases/DatabaseReplicated.h | 2 + src/Databases/DatabaseReplicatedWorker.cpp | 2 + src/Interpreters/DDLWorker.cpp | 2 +- .../test_replicated_database/test.py | 49 ++++- 5 files changed, 156 insertions(+), 68 deletions(-) diff --git a/src/Databases/DatabaseReplicated.cpp b/src/Databases/DatabaseReplicated.cpp index d365ea24bbf..24a193d9134 100644 --- a/src/Databases/DatabaseReplicated.cpp +++ b/src/Databases/DatabaseReplicated.cpp @@ -82,37 +82,6 @@ DatabaseReplicated::DatabaseReplicated( /// If zookeeper chroot prefix is used, path should start with '/', because chroot concatenates without it. if (zookeeper_path.front() != '/') zookeeper_path = "/" + zookeeper_path; - - if (!context_.hasZooKeeper()) - { - throw Exception("Can't create replicated database without ZooKeeper", ErrorCodes::NO_ZOOKEEPER); - } - //FIXME it will fail on startup if zk is not available - - auto current_zookeeper = global_context.getZooKeeper(); - - if (!current_zookeeper->exists(zookeeper_path)) - { - /// Create new database, multiple nodes can execute it concurrently - createDatabaseNodesInZooKeeper(current_zookeeper); - } - - replica_path = zookeeper_path + "/replicas/" + getFullReplicaName(); - - String replica_host_id; - if (current_zookeeper->tryGet(replica_path, replica_host_id)) - { - String host_id = getHostID(global_context, db_uuid); - if (replica_host_id != host_id) - throw Exception(ErrorCodes::REPLICA_IS_ALREADY_EXIST, - "Replica {} of shard {} of replicated database at {} already exists. 
Replica host ID: '{}', current host ID: '{}'", - replica_name, shard_name, zookeeper_path, replica_host_id, host_id); - } - else - { - /// Throws if replica with the same name was created concurrently - createReplicaNodesInZooKeeper(current_zookeeper); - } } String DatabaseReplicated::getFullReplicaName() const @@ -203,6 +172,50 @@ ClusterPtr DatabaseReplicated::getCluster() const return std::make_shared(global_context.getSettingsRef(), shards, username, password, global_context.getTCPPort(), false); } +void DatabaseReplicated::tryConnectToZooKeeper(bool force_attach) +{ + try + { + if (!global_context.hasZooKeeper()) + { + throw Exception("Can't create replicated database without ZooKeeper", ErrorCodes::NO_ZOOKEEPER); + } + + auto current_zookeeper = global_context.getZooKeeper(); + + if (!current_zookeeper->exists(zookeeper_path)) + { + /// Create new database, multiple nodes can execute it concurrently + createDatabaseNodesInZooKeeper(current_zookeeper); + } + + replica_path = zookeeper_path + "/replicas/" + getFullReplicaName(); + + String replica_host_id; + if (current_zookeeper->tryGet(replica_path, replica_host_id)) + { + String host_id = getHostID(global_context, db_uuid); + if (replica_host_id != host_id) + throw Exception(ErrorCodes::REPLICA_IS_ALREADY_EXIST, + "Replica {} of shard {} of replicated database at {} already exists. Replica host ID: '{}', current host ID: '{}'", + replica_name, shard_name, zookeeper_path, replica_host_id, host_id); + } + else + { + /// Throws if replica with the same name already exists + createReplicaNodesInZooKeeper(current_zookeeper); + } + + is_readonly = false; + } + catch(...) + { + if (!force_attach) + throw; + tryLogCurrentException(log); + } +} + bool DatabaseReplicated::createDatabaseNodesInZooKeeper(const zkutil::ZooKeeperPtr & current_zookeeper) { current_zookeeper->createAncestors(zookeeper_path); @@ -256,6 +269,8 @@ void DatabaseReplicated::createReplicaNodesInZooKeeper(const zkutil::ZooKeeperPt void DatabaseReplicated::loadStoredObjects(Context & context, bool has_force_restore_data_flag, bool force_attach) { + tryConnectToZooKeeper(force_attach); + DatabaseAtomic::loadStoredObjects(context, has_force_restore_data_flag, force_attach); ddl_worker = std::make_unique(this, global_context); @@ -264,6 +279,9 @@ void DatabaseReplicated::loadStoredObjects(Context & context, bool has_force_res BlockIO DatabaseReplicated::propose(const ASTPtr & query, const Context & query_context) { + if (is_readonly) + throw Exception(ErrorCodes::NO_ZOOKEEPER, "Database is in readonly mode, because it cannot connect to ZooKeeper"); + if (query_context.getClientInfo().query_kind != ClientInfo::QueryKind::INITIAL_QUERY) throw Exception(ErrorCodes::INCORRECT_QUERY, "It's not initial query. 
ON CLUSTER is not allowed for Replicated database."); @@ -297,6 +315,24 @@ BlockIO DatabaseReplicated::propose(const ASTPtr & query, const Context & query_ return io; } +static UUID getTableUUIDIfReplicated(const String & metadata, const Context & context) +{ + bool looks_like_replicated = metadata.find("ReplicatedMergeTree") != std::string::npos; + if (!looks_like_replicated) + return UUIDHelpers::Nil; + + ParserCreateQuery parser; + auto size = context.getSettingsRef().max_query_size; + auto depth = context.getSettingsRef().max_parser_depth; + ASTPtr query = parseQuery(parser, metadata, size, depth); + const ASTCreateQuery & create = query->as(); + if (!create.storage || !create.storage->engine) + return UUIDHelpers::Nil; + if (!startsWith(create.storage->engine->name, "Replicated") || !endsWith(create.storage->engine->name, "MergeTree")) + return UUIDHelpers::Nil; + assert(create.uuid != UUIDHelpers::Nil); + return create.uuid; +} void DatabaseReplicated::recoverLostReplica(const ZooKeeperPtr & current_zookeeper, UInt32 our_log_ptr, UInt32 max_log_ptr) { @@ -311,42 +347,44 @@ void DatabaseReplicated::recoverLostReplica(const ZooKeeperPtr & current_zookeep auto table_name_to_metadata = tryGetConsistentMetadataSnapshot(current_zookeeper, max_log_ptr); + /// For ReplicatedMergeTree tables we can compare only UUIDs to ensure that it's the same table. + /// Metadata can be different, it's handled on table replication level. + /// We need to handle only renamed tables. + /// TODO maybe we should also update MergeTree SETTINGS if required? + std::unordered_map zk_replicated_id_to_name; + for (const auto & zk_table : table_name_to_metadata) + { + UUID zk_replicated_id = getTableUUIDIfReplicated(zk_table.second, global_context); + if (zk_replicated_id != UUIDHelpers::Nil) + zk_replicated_id_to_name.emplace(zk_replicated_id, zk_table.first); + } + Strings tables_to_detach; + std::vector> replicated_tables_to_rename; size_t total_tables = 0; - auto existing_tables_it = getTablesIterator(global_context, {}); - while (existing_tables_it->isValid()) + std::vector replicated_ids; + for (auto existing_tables_it = getTablesIterator(global_context, {}); existing_tables_it->isValid(); existing_tables_it->next(), ++total_tables) { String name = existing_tables_it->name(); - auto in_zk = table_name_to_metadata.find(name); - String local_metadata = readMetadataFile(name); - if (in_zk == table_name_to_metadata.end() || in_zk->second != local_metadata) + UUID local_replicated_id = UUIDHelpers::Nil; + if (existing_tables_it->table()->supportsReplication()) { - bool should_detach = true; - bool looks_like_replicated = in_zk->second.find("ReplicatedMergeTree") != std::string::npos; - - if (looks_like_replicated) + local_replicated_id = existing_tables_it->table()->getStorageID().uuid; + auto it = zk_replicated_id_to_name.find(local_replicated_id); + if (it != zk_replicated_id_to_name.end()) { - ParserCreateQuery parser; - auto size = global_context.getSettingsRef().max_query_size; - auto depth = global_context.getSettingsRef().max_parser_depth; - ASTPtr local_create = parseQuery(parser, local_metadata, size, depth); - ASTPtr zk_create = parseQuery(parser, in_zk->second, size, depth); - if (local_create->as()->uuid == zk_create->as()->uuid) - { - /// For ReplicatedMergeTree tables we can compare only UUIDs to ensure that it's the same table. - /// Metadata can be different, it's handled on table replication level. - /// TODO maybe we should also compare MergeTree SETTINGS? 
- should_detach = false; - } + if (name != it->second) + replicated_tables_to_rename.emplace_back(name, it->second); + continue; } + } - if (should_detach) + auto in_zk = table_name_to_metadata.find(name); + if (in_zk == table_name_to_metadata.end() || in_zk->second != readMetadataFile(name)) + { tables_to_detach.emplace_back(std::move(name)); } - existing_tables_it->next(); - ++total_tables; } - existing_tables_it.reset(); String db_name = getDatabaseName(); String to_db_name = getDatabaseName() + BROKEN_TABLES_SUFFIX; @@ -375,17 +413,18 @@ void DatabaseReplicated::recoverLostReplica(const ZooKeeperPtr & current_zookeep if (getDatabaseName() != db_name) throw Exception(ErrorCodes::UNKNOWN_DATABASE, "Database was renamed, will retry"); + auto table = tryGetTable(table_name, global_context); if (isDictionaryExist(table_name)) { LOG_DEBUG(log, "Will DROP DICTIONARY {}", backQuoteIfNeed(table_name)); DatabaseAtomic::removeDictionary(global_context, table_name); ++dropped_dicts; } - else if (!tryGetTable(table_name, global_context)->storesDataOnDisk()) + else if (!table->storesDataOnDisk()) { LOG_DEBUG(log, "Will DROP TABLE {}, because it does not store data on disk and can be safely dropped", backQuoteIfNeed(table_name)); dropped_tables.push_back(tryGetTableUUID(table_name)); - tryGetTable(table_name, global_context)->shutdown(); + table->shutdown(); DatabaseAtomic::dropTable(global_context, table_name, true); } else @@ -401,6 +440,20 @@ void DatabaseReplicated::recoverLostReplica(const ZooKeeperPtr & current_zookeep LOG_WARNING(log, "Cleaned {} outdated objects: dropped {} dictionaries and {} tables, moved {} tables", tables_to_detach.size(), dropped_dicts, dropped_tables.size(), moved_tables); + /// Now database is cleared from outdated tables, let's rename ReplicatedMergeTree tables to actual names + for (const auto & old_to_new : replicated_tables_to_rename) + { + const String & from = old_to_new.first; + const String & to = old_to_new.second; + + LOG_DEBUG(log, "Will RENAME TABLE {} TO {}", backQuoteIfNeed(from), backQuoteIfNeed(to)); + /// TODO Maybe we should do it in two steps: rename all tables to temporary names and then rename them to actual names? 
+ DDLGuardPtr table_guard = DatabaseCatalog::instance().getDDLGuard(db_name, std::min(from, to)); + DDLGuardPtr to_table_guard = DatabaseCatalog::instance().getDDLGuard(db_name, std::max(from, to)); + DatabaseAtomic::renameTable(global_context, from, *this, to, false, false); + } + + for (const auto & id : dropped_tables) DatabaseCatalog::instance().waitTableFinallyDropped(id); diff --git a/src/Databases/DatabaseReplicated.h b/src/Databases/DatabaseReplicated.h index 2c998a8bc97..43a6ce15376 100644 --- a/src/Databases/DatabaseReplicated.h +++ b/src/Databases/DatabaseReplicated.h @@ -84,6 +84,7 @@ public: friend struct DatabaseReplicatedTask; friend class DatabaseReplicatedDDLWorker; private: + void tryConnectToZooKeeper(bool force_attach); bool createDatabaseNodesInZooKeeper(const ZooKeeperPtr & current_zookeeper); void createReplicaNodesInZooKeeper(const ZooKeeperPtr & current_zookeeper); @@ -100,6 +101,7 @@ private: zkutil::ZooKeeperPtr getZooKeeper() const; + std::atomic_bool is_readonly = true; std::unique_ptr ddl_worker; }; diff --git a/src/Databases/DatabaseReplicatedWorker.cpp b/src/Databases/DatabaseReplicatedWorker.cpp index 521ba5b7cb2..8751c125383 100644 --- a/src/Databases/DatabaseReplicatedWorker.cpp +++ b/src/Databases/DatabaseReplicatedWorker.cpp @@ -29,6 +29,8 @@ void DatabaseReplicatedDDLWorker::initializeMainThread() try { auto zookeeper = getAndSetZooKeeper(); + if (database->is_readonly) + database->tryConnectToZooKeeper(false); initializeReplication(); initialized = true; return; diff --git a/src/Interpreters/DDLWorker.cpp b/src/Interpreters/DDLWorker.cpp index 1f4c7932329..ac365dbb8d4 100644 --- a/src/Interpreters/DDLWorker.cpp +++ b/src/Interpreters/DDLWorker.cpp @@ -211,7 +211,7 @@ void DDLWorker::shutdown() DDLWorker::~DDLWorker() { - shutdown(); + DDLWorker::shutdown(); } diff --git a/tests/integration/test_replicated_database/test.py b/tests/integration/test_replicated_database/test.py index faeb436f279..0db6884fbb7 100644 --- a/tests/integration/test_replicated_database/test.py +++ b/tests/integration/test_replicated_database/test.py @@ -196,14 +196,16 @@ def test_recover_staled_replica(started_cluster): dummy_node.query("CREATE TABLE recover.mt2 (n int) ENGINE=MergeTree order by n", settings=settings) main_node.query("CREATE TABLE recover.rmt1 (n int) ENGINE=ReplicatedMergeTree order by n", settings=settings) dummy_node.query("CREATE TABLE recover.rmt2 (n int) ENGINE=ReplicatedMergeTree order by n", settings=settings) + main_node.query("CREATE TABLE recover.rmt3 (n int) ENGINE=ReplicatedMergeTree order by n", settings=settings) + dummy_node.query("CREATE TABLE recover.rmt5 (n int) ENGINE=ReplicatedMergeTree order by n", settings=settings) main_node.query("CREATE DICTIONARY recover.d1 (n int DEFAULT 0, m int DEFAULT 1) PRIMARY KEY n SOURCE(CLICKHOUSE(HOST 'localhost' PORT 9000 USER 'default' TABLE 'rmt1' PASSWORD '' DB 'recover')) LIFETIME(MIN 1 MAX 10) LAYOUT(FLAT())") dummy_node.query("CREATE DICTIONARY recover.d2 (n int DEFAULT 0, m int DEFAULT 1) PRIMARY KEY n SOURCE(CLICKHOUSE(HOST 'localhost' PORT 9000 USER 'default' TABLE 'rmt2' PASSWORD '' DB 'recover')) LIFETIME(MIN 1 MAX 10) LAYOUT(FLAT())") - for table in ['t1', 't2', 'mt1', 'mt2', 'rmt1', 'rmt2']: + for table in ['t1', 't2', 'mt1', 'mt2', 'rmt1', 'rmt2', 'rmt3', 'rmt5']: main_node.query("INSERT INTO recover.{} VALUES (42)".format(table)) for table in ['t1', 't2', 'mt1', 'mt2']: dummy_node.query("INSERT INTO recover.{} VALUES (42)".format(table)) - for table in ['rmt1', 'rmt2']: + for table in 
['rmt1', 'rmt2', 'rmt3', 'rmt5']: main_node.query("SYSTEM SYNC REPLICA recover.{}".format(table)) with PartitionManager() as pm: @@ -212,6 +214,8 @@ def test_recover_staled_replica(started_cluster): main_node.query("RENAME TABLE recover.t1 TO recover.m1", settings=settings) main_node.query("ALTER TABLE recover.mt1 ADD COLUMN m int", settings=settings) main_node.query("ALTER TABLE recover.rmt1 ADD COLUMN m int", settings=settings) + main_node.query("RENAME TABLE recover.rmt3 TO recover.rmt4", settings=settings) + main_node.query("DROP TABLE recover.rmt5", settings=settings) main_node.query("DROP DICTIONARY recover.d2", settings=settings) main_node.query("CREATE DICTIONARY recover.d2 (n int DEFAULT 0, m int DEFAULT 1) PRIMARY KEY n SOURCE(CLICKHOUSE(HOST 'localhost' PORT 9000 USER 'default' TABLE 'rmt1' PASSWORD '' DB 'recover')) LIFETIME(MIN 1 MAX 10) LAYOUT(FLAT());", settings=settings) @@ -223,25 +227,52 @@ def test_recover_staled_replica(started_cluster): main_node.query("DROP TABLE recover.tmp", settings=settings) main_node.query("CREATE TABLE recover.tmp AS recover.m1", settings=settings) - assert main_node.query("SELECT name FROM system.tables WHERE database='recover' ORDER BY name") == "d1\nd2\nm1\nmt1\nmt2\nrmt1\nrmt2\nt2\ntmp\n" + assert main_node.query("SELECT name FROM system.tables WHERE database='recover' ORDER BY name") == "d1\nd2\nm1\nmt1\nmt2\nrmt1\nrmt2\nrmt4\nt2\ntmp\n" query = "SELECT name, uuid, create_table_query FROM system.tables WHERE database='recover' ORDER BY name" expected = main_node.query(query) assert_eq_with_retry(dummy_node, query, expected) - for table in ['m1', 't2', 'mt1', 'mt2', 'rmt1', 'rmt2', 'd1', 'd2']: + for table in ['m1', 't2', 'mt1', 'mt2', 'rmt1', 'rmt2', 'rmt4', 'd1', 'd2']: assert main_node.query("SELECT (*,).1 FROM recover.{}".format(table)) == "42\n" - for table in ['t2', 'rmt1', 'rmt2', 'd1', 'd2', 'mt2']: + for table in ['t2', 'rmt1', 'rmt2', 'rmt4', 'd1', 'd2', 'mt2']: assert dummy_node.query("SELECT (*,).1 FROM recover.{}".format(table)) == "42\n" for table in ['m1', 'mt1']: assert dummy_node.query("SELECT count() FROM recover.{}".format(table)) == "0\n" - assert dummy_node.query("SELECT count() FROM system.tables WHERE database='recover_broken_tables'") == "1\n" - table = dummy_node.query("SHOW TABLES FROM recover_broken_tables").strip() - assert "mt1_22_" in table + assert dummy_node.query("SELECT count() FROM system.tables WHERE database='recover_broken_tables'") == "2\n" + table = dummy_node.query("SHOW TABLES FROM recover_broken_tables LIKE 'mt1_26_%'").strip() + assert dummy_node.query("SELECT (*,).1 FROM recover_broken_tables.{}".format(table)) == "42\n" + table = dummy_node.query("SHOW TABLES FROM recover_broken_tables LIKE 'rmt5_26_%'").strip() assert dummy_node.query("SELECT (*,).1 FROM recover_broken_tables.{}".format(table)) == "42\n" - expected = "Cleaned 3 outdated objects: dropped 1 dictionaries and 1 tables, moved 1 tables" + expected = "Cleaned 4 outdated objects: dropped 1 dictionaries and 1 tables, moved 2 tables" assert_logs_contain(dummy_node, expected) dummy_node.query("DROP TABLE recover.tmp") + assert_eq_with_retry(main_node, "SELECT count() FROM system.tables WHERE database='recover' AND name='tmp'", "0\n") +def test_startup_without_zk(started_cluster): + main_node.query("DROP DATABASE IF EXISTS testdb SYNC") + main_node.query("DROP DATABASE IF EXISTS recover SYNC") + with PartitionManager() as pm: + pm.drop_instance_zk_connections(main_node) + err = main_node.query_and_get_error("CREATE DATABASE startup ENGINE = 
Replicated('/clickhouse/databases/startup', 'shard1', 'replica1');") + assert "ZooKeeper" in err + main_node.query("CREATE DATABASE startup ENGINE = Replicated('/clickhouse/databases/startup', 'shard1', 'replica1');") + #main_node.query("CREATE TABLE startup.rmt (n int) ENGINE=ReplicatedMergeTree order by n") + main_node.query("CREATE TABLE startup.rmt (n int) ENGINE=MergeTree order by n") + main_node.query("INSERT INTO startup.rmt VALUES (42)") + with PartitionManager() as pm: + pm.drop_instance_zk_connections(main_node) + main_node.restart_clickhouse(stop_start_wait_sec=30) + assert main_node.query("SELECT (*,).1 FROM startup.rmt") == "42\n" + + for _ in range(10): + try: + main_node.query("CREATE TABLE startup.m (n int) ENGINE=Memory") + break + except: + time.sleep(1) + + main_node.query("EXCHANGE TABLES startup.rmt AND startup.m") + assert main_node.query("SELECT (*,).1 FROM startup.m") == "42\n" From 9c7cf9e92e8c75bc670abf070397c3aacbcf3193 Mon Sep 17 00:00:00 2001 From: Alexander Tokmakov Date: Mon, 15 Feb 2021 13:26:34 +0300 Subject: [PATCH 143/381] remove some debug code --- docker/test/stateful/run.sh | 4 +++ docker/test/stateless/run.sh | 4 +++ programs/server/Server.cpp | 4 ++- src/Core/Settings.h | 3 ++ src/Databases/DatabaseReplicated.cpp | 3 +- src/Databases/DatabaseReplicated.h | 1 - src/Databases/DatabaseReplicatedWorker.cpp | 4 +-- src/Interpreters/DDLWorker.cpp | 15 ++++----- src/Interpreters/DDLWorker.h | 5 +-- src/Interpreters/InterpreterCreateQuery.cpp | 21 ++++-------- src/Interpreters/executeDDLQueryOnCluster.cpp | 12 +------ tests/ci/ci_config.json | 24 ++++++++++++++ tests/clickhouse-test | 17 +++++++--- tests/config/install.sh | 3 ++ tests/config/users.d/database_replicated.xml | 10 ++++++ .../test_materialize_mysql_database/test.py | 2 +- .../configs/settings.xml | 12 +++++++ .../test_replicated_database/test.py | 10 +++--- tests/queries/skip_list.json | 33 ++++--------------- 19 files changed, 109 insertions(+), 78 deletions(-) create mode 100644 tests/config/users.d/database_replicated.xml create mode 100644 tests/integration/test_replicated_database/configs/settings.xml diff --git a/docker/test/stateful/run.sh b/docker/test/stateful/run.sh index f2fcefd604f..7779f0e9dc2 100755 --- a/docker/test/stateful/run.sh +++ b/docker/test/stateful/run.sh @@ -60,4 +60,8 @@ fi # more idiologically correct. 
read -ra ADDITIONAL_OPTIONS <<< "${ADDITIONAL_OPTIONS:-}" +if [[ -n "$USE_DATABASE_REPLICATED" ]] && [[ "$USE_DATABASE_REPLICATED" -eq 1 ]]; then + ADDITIONAL_OPTIONS+=('--replicated-database') +fi + clickhouse-test --testname --shard --zookeeper --no-stateless --hung-check --print-time "$SKIP_LIST_OPT" "${ADDITIONAL_OPTIONS[@]}" "$SKIP_TESTS_OPTION" 2>&1 | ts '%Y-%m-%d %H:%M:%S' | tee test_output/test_result.txt diff --git a/docker/test/stateless/run.sh b/docker/test/stateless/run.sh index 575be721a54..d078f3739fd 100755 --- a/docker/test/stateless/run.sh +++ b/docker/test/stateless/run.sh @@ -57,6 +57,10 @@ function run_tests() ADDITIONAL_OPTIONS+=('4') fi + if [[ -n "$USE_DATABASE_REPLICATED" ]] && [[ "$USE_DATABASE_REPLICATED" -eq 1 ]]; then + ADDITIONAL_OPTIONS+=('--replicated-database') + fi + clickhouse-test --testname --shard --zookeeper --hung-check --print-time \ --test-runs "$NUM_TRIES" \ "$SKIP_LIST_OPT" "${ADDITIONAL_OPTIONS[@]}" 2>&1 \ diff --git a/programs/server/Server.cpp b/programs/server/Server.cpp index 2bb5181d348..400796981d5 100644 --- a/programs/server/Server.cpp +++ b/programs/server/Server.cpp @@ -100,6 +100,7 @@ namespace CurrentMetrics extern const Metric Revision; extern const Metric VersionInteger; extern const Metric MemoryTracking; + extern const Metric MaxDDLEntryID; } @@ -997,7 +998,8 @@ int Server::main(const std::vector & /*args*/) int pool_size = config().getInt("distributed_ddl.pool_size", 1); if (pool_size < 1) throw Exception("distributed_ddl.pool_size should be greater then 0", ErrorCodes::ARGUMENT_OUT_OF_BOUND); - global_context->setDDLWorker(std::make_unique(pool_size, ddl_zookeeper_path, *global_context, &config(), "distributed_ddl")); + global_context->setDDLWorker(std::make_unique(pool_size, ddl_zookeeper_path, *global_context, &config(), + "distributed_ddl", "DDLWorker", &CurrentMetrics::MaxDDLEntryID)); } std::unique_ptr dns_cache_updater; diff --git a/src/Core/Settings.h b/src/Core/Settings.h index 96571cedd3f..ba4fcdda48c 100644 --- a/src/Core/Settings.h +++ b/src/Core/Settings.h @@ -420,6 +420,9 @@ class IColumn; M(Bool, optimize_rewrite_sum_if_to_count_if, true, "Rewrite sumIf() and sum(if()) function countIf() function when logically equivalent", 0) \ M(UInt64, insert_shard_id, 0, "If non zero, when insert into a distributed table, the data will be inserted into the shard `insert_shard_id` synchronously. Possible values range from 1 to `shards_number` of corresponding distributed table", 0) \ M(Bool, allow_experimental_query_deduplication, false, "Allow sending parts' UUIDs for a query in order to deduplicate data parts if any", 0) \ + M(Bool, allow_experimental_database_replicated, false, "Allow to create databases with Replicated engine", 0) \ + M(UInt64, database_replicated_initial_query_timeout_sec, 300, "How long initial DDL query should wait for Replicated database to precess previous DDL queue entries", 0) \ + M(Bool, database_replicated_ddl_output, true, "Return table with query execution status as a result of DDL query", 0) \ \ /** Obsolete settings that do nothing but left for compatibility reasons. Remove each one after half a year of obsolescence. 
*/ \ \ diff --git a/src/Databases/DatabaseReplicated.cpp b/src/Databases/DatabaseReplicated.cpp index 24a193d9134..dc1203e8cc9 100644 --- a/src/Databases/DatabaseReplicated.cpp +++ b/src/Databases/DatabaseReplicated.cpp @@ -311,7 +311,8 @@ BlockIO DatabaseReplicated::propose(const ASTPtr & query, const Context & query_ Strings hosts_to_wait = getZooKeeper()->getChildren(zookeeper_path + "/replicas"); auto stream = std::make_shared(node_path, entry, query_context, hosts_to_wait); - io.in = std::move(stream); + if (query_context.getSettingsRef().database_replicated_ddl_output) + io.in = std::move(stream); return io; } diff --git a/src/Databases/DatabaseReplicated.h b/src/Databases/DatabaseReplicated.h index 43a6ce15376..2ae97b0d82a 100644 --- a/src/Databases/DatabaseReplicated.h +++ b/src/Databases/DatabaseReplicated.h @@ -80,7 +80,6 @@ public: ClusterPtr getCluster() const; - //FIXME friend struct DatabaseReplicatedTask; friend class DatabaseReplicatedDDLWorker; private: diff --git a/src/Databases/DatabaseReplicatedWorker.cpp b/src/Databases/DatabaseReplicatedWorker.cpp index 8751c125383..ff15878b136 100644 --- a/src/Databases/DatabaseReplicatedWorker.cpp +++ b/src/Databases/DatabaseReplicatedWorker.cpp @@ -89,7 +89,7 @@ String DatabaseReplicatedDDLWorker::enqueueQuery(DDLLogEntry & entry) return node_path; } -String DatabaseReplicatedDDLWorker::tryEnqueueAndExecuteEntry(DDLLogEntry & entry, const Context & /*query_context*/) +String DatabaseReplicatedDDLWorker::tryEnqueueAndExecuteEntry(DDLLogEntry & entry, const Context & query_context) { /// NOTE Possibly it would be better to execute initial query on the most up-to-date node, /// but it requires more complex logic around /try node. @@ -114,7 +114,7 @@ String DatabaseReplicatedDDLWorker::tryEnqueueAndExecuteEntry(DDLLogEntry & entr task->is_initial_query = true; LOG_DEBUG(log, "Waiting for worker thread to process all entries before {}", entry_name); - UInt64 timeout = 600; + UInt64 timeout = query_context.getSettingsRef().database_replicated_initial_query_timeout_sec; { std::unique_lock lock{mutex}; bool processed = wait_current_task_change.wait_for(lock, std::chrono::seconds(timeout), [&]() diff --git a/src/Interpreters/DDLWorker.cpp b/src/Interpreters/DDLWorker.cpp index ac365dbb8d4..f08f47b1c0e 100644 --- a/src/Interpreters/DDLWorker.cpp +++ b/src/Interpreters/DDLWorker.cpp @@ -32,10 +32,6 @@ namespace fs = std::filesystem; -namespace CurrentMetrics -{ - extern const Metric MaxDDLEntryID; -} namespace DB { @@ -152,12 +148,14 @@ std::unique_ptr createSimpleZooKeeperLock( DDLWorker::DDLWorker(int pool_size_, const std::string & zk_root_dir, const Context & context_, const Poco::Util::AbstractConfiguration * config, const String & prefix, - const String & logger_name) + const String & logger_name, const CurrentMetrics::Metric * max_entry_metric_) : context(context_) , log(&Poco::Logger::get(logger_name)) , pool_size(pool_size_) + , max_entry_metric(max_entry_metric_) { - CurrentMetrics::set(CurrentMetrics::MaxDDLEntryID, 0); + if (max_entry_metric) + CurrentMetrics::set(*max_entry_metric, 0); if (1 < pool_size) { @@ -456,7 +454,8 @@ void DDLWorker::updateMaxDDLEntryID(const String & entry_name) { if (max_id.compare_exchange_weak(prev_id, id)) { - CurrentMetrics::set(CurrentMetrics::MaxDDLEntryID, id); + if (max_entry_metric) + CurrentMetrics::set(*max_entry_metric, id); break; } } @@ -596,7 +595,7 @@ void DDLWorker::processTask(DDLTaskBase & task) } -bool DDLWorker::taskShouldBeExecutedOnLeader(const ASTPtr ast_ddl, const StoragePtr 
storage) +bool DDLWorker::taskShouldBeExecutedOnLeader(const ASTPtr & ast_ddl, const StoragePtr storage) { /// Pure DROP queries have to be executed on each node separately if (auto * query = ast_ddl->as(); query && query->kind != ASTDropQuery::Kind::Truncate) diff --git a/src/Interpreters/DDLWorker.h b/src/Interpreters/DDLWorker.h index 03c80e3f669..0985884eef7 100644 --- a/src/Interpreters/DDLWorker.h +++ b/src/Interpreters/DDLWorker.h @@ -43,7 +43,7 @@ class DDLWorker { public: DDLWorker(int pool_size_, const std::string & zk_root_dir, const Context & context_, const Poco::Util::AbstractConfiguration * config, const String & prefix, - const String & logger_name = "DDLWorker"); + const String & logger_name = "DDLWorker", const CurrentMetrics::Metric * max_entry_metric_ = nullptr); virtual ~DDLWorker(); /// Pushes query into DDL queue, returns path to created node @@ -81,7 +81,7 @@ protected: void updateMaxDDLEntryID(const String & entry_name); /// Check that query should be executed on leader replica only - static bool taskShouldBeExecutedOnLeader(const ASTPtr ast_ddl, StoragePtr storage); + static bool taskShouldBeExecutedOnLeader(const ASTPtr & ast_ddl, StoragePtr storage); /// Executes query only on leader replica in case of replicated table. /// Queries like TRUNCATE/ALTER .../OPTIMIZE have to be executed only on one node of shard. @@ -144,6 +144,7 @@ protected: size_t max_tasks_in_queue = 1000; std::atomic max_id = 0; + const CurrentMetrics::Metric * max_entry_metric; }; diff --git a/src/Interpreters/InterpreterCreateQuery.cpp b/src/Interpreters/InterpreterCreateQuery.cpp index bbe8526ae5b..2021c1f1d60 100644 --- a/src/Interpreters/InterpreterCreateQuery.cpp +++ b/src/Interpreters/InterpreterCreateQuery.cpp @@ -138,20 +138,7 @@ BlockIO InterpreterCreateQuery::createDatabase(ASTCreateQuery & create) bool old_style_database = context.getSettingsRef().default_database_engine.value == DefaultDatabaseEngine::Ordinary; auto engine = std::make_shared(); auto storage = std::make_shared(); - - //FIXME revert it before merge - engine->name = "Atomic"; - if (old_style_database) - { - if (database_name == "test") - engine->name = "Ordinary"; // for stateful tests - else - engine = makeASTFunction("Replicated", - std::make_shared(fmt::format("/clickhouse/db/{}/", create.database)), - std::make_shared("s1"), - std::make_shared("r" + toString(getpid()))); - } - + engine->name = old_style_database ? "Ordinary" : "Atomic"; engine->no_empty_args = true; storage->set(storage->engine, engine); create.set(create.storage, storage); @@ -221,6 +208,12 @@ BlockIO InterpreterCreateQuery::createDatabase(ASTCreateQuery & create) "Enable allow_experimental_database_materialize_mysql to use it.", ErrorCodes::UNKNOWN_DATABASE_ENGINE); } + if (create.storage->engine->name == "Replicated" && !context.getSettingsRef().allow_experimental_database_replicated && !internal) + { + throw Exception("Replicated is an experimental database engine. 
" + "Enable allow_experimental_database_replicated to use it.", ErrorCodes::UNKNOWN_DATABASE_ENGINE); + } + DatabasePtr database = DatabaseFactory::get(create, metadata_path / "", context); if (create.uuid != UUIDHelpers::Nil) diff --git a/src/Interpreters/executeDDLQueryOnCluster.cpp b/src/Interpreters/executeDDLQueryOnCluster.cpp index 2774f78663e..1937fbaf905 100644 --- a/src/Interpreters/executeDDLQueryOnCluster.cpp +++ b/src/Interpreters/executeDDLQueryOnCluster.cpp @@ -205,10 +205,6 @@ DDLQueryStatusInputStream::DDLQueryStatusInputStream(const String & zk_node_path addTotalRowsApprox(waiting_hosts.size()); timeout_seconds = context.getSettingsRef().distributed_ddl_task_timeout; - - //FIXME revert it before merge - if (context.getSettingsRef().default_database_engine.value == DefaultDatabaseEngine::Ordinary) - timeout_seconds = 10; } Block DDLQueryStatusInputStream::readImpl() @@ -252,7 +248,6 @@ Block DDLQueryStatusInputStream::readImpl() sleepForMilliseconds(std::min(1000, 50 * (try_number + 1))); } - /// TODO: add shared lock if (!zookeeper->exists(node_path)) { throw Exception(ErrorCodes::UNFINISHED, @@ -301,12 +296,7 @@ Block DDLQueryStatusInputStream::readImpl() res = sample.cloneWithColumns(std::move(columns)); } - //FIXME revert it before merge - bool is_functional_tests = !by_hostname && context.getSettingsRef().default_database_engine.value == DefaultDatabaseEngine::Ordinary; - if (is_functional_tests) - return {}; - else - return res; + return res; } Strings DDLQueryStatusInputStream::getChildrenAllowNoNode(const std::shared_ptr & zookeeper, const String & node_path) diff --git a/tests/ci/ci_config.json b/tests/ci/ci_config.json index 44b35d61601..0e467319285 100644 --- a/tests/ci/ci_config.json +++ b/tests/ci/ci_config.json @@ -261,6 +261,18 @@ "with_coverage": false } }, + "Functional stateful tests (release, DatabaseReplicated)": { + "required_build_properties": { + "compiler": "clang-11", + "package_type": "deb", + "build_type": "relwithdebuginfo", + "sanitizer": "none", + "bundled": "bundled", + "splitted": "unsplitted", + "clang-tidy": "disable", + "with_coverage": false + } + }, "Functional stateless tests (address)": { "required_build_properties": { "compiler": "clang-11", @@ -381,6 +393,18 @@ "with_coverage": false } }, + "Functional stateless tests (release, DatabaseReplicated)": { + "required_build_properties": { + "compiler": "clang-11", + "package_type": "deb", + "build_type": "relwithdebuginfo", + "sanitizer": "none", + "bundled": "bundled", + "splitted": "unsplitted", + "clang-tidy": "disable", + "with_coverage": false + } + }, "Stress test (address)": { "required_build_properties": { "compiler": "clang-11", diff --git a/tests/clickhouse-test b/tests/clickhouse-test index b2f3f73b6c0..64a93416c41 100755 --- a/tests/clickhouse-test +++ b/tests/clickhouse-test @@ -105,7 +105,9 @@ def remove_control_characters(s): s = re.sub(r"[\x00-\x08\x0b\x0e-\x1f\x7f]", "", s) return s -def get_db_engine(args): +def get_db_engine(args, database_name): + if args.replicated_database: + return " ENGINE=Replicated('/test/clickhouse/db/{}', 's1', 'r1')".format(database_name) if args.db_engine: return " ENGINE=" + args.db_engine return "" # Will use default engine @@ -128,7 +130,7 @@ def run_single_test(args, ext, server_logs_level, client_options, case_file, std clickhouse_proc_create = Popen(shlex.split(args.client), stdin=PIPE, stdout=PIPE, stderr=PIPE, universal_newlines=True) try: - clickhouse_proc_create.communicate(("CREATE DATABASE " + database + get_db_engine(args)), 
timeout=args.timeout) + clickhouse_proc_create.communicate(("CREATE DATABASE " + database + get_db_engine(args, database)), timeout=args.timeout) except TimeoutExpired: total_time = (datetime.now() - start_time).total_seconds() return clickhouse_proc_create, "", "Timeout creating database {} before test".format(database), total_time @@ -532,6 +534,8 @@ class BuildFlags(): RELEASE = 'release-build' DATABASE_ORDINARY = 'database-ordinary' POLYMORPHIC_PARTS = 'polymorphic-parts' + ANTLR = 'antlr' + DATABASE_REPLICATED = 'database-replicated' def collect_build_flags(client): @@ -613,7 +617,9 @@ def main(args): build_flags = collect_build_flags(args.client) if args.antlr: - build_flags.append('antlr') + build_flags.append(BuildFlags.ANTLR) + if args.replicated_database: + build_flags.append(BuildFlags.DATABASE_REPLICATED) if args.use_skip_list: tests_to_skip_from_list = collect_tests_to_skip(args.skip_list_path, build_flags) @@ -666,10 +672,10 @@ def main(args): if args.database and args.database != "test": clickhouse_proc_create = Popen(shlex.split(args.client), stdin=PIPE, stdout=PIPE, stderr=PIPE, universal_newlines=True) - clickhouse_proc_create.communicate(("CREATE DATABASE IF NOT EXISTS " + args.database + get_db_engine(args))) + clickhouse_proc_create.communicate(("CREATE DATABASE IF NOT EXISTS " + args.database + get_db_engine(args, args.database))) clickhouse_proc_create = Popen(shlex.split(args.client), stdin=PIPE, stdout=PIPE, stderr=PIPE, universal_newlines=True) - clickhouse_proc_create.communicate(("CREATE DATABASE IF NOT EXISTS test" + get_db_engine(args))) + clickhouse_proc_create.communicate(("CREATE DATABASE IF NOT EXISTS test" + get_db_engine(args, 'test'))) def is_test_from_dir(suite_dir, case): case_file = os.path.join(suite_dir, case) @@ -923,6 +929,7 @@ if __name__ == '__main__': parser.add_argument('--skip-list-path', help="Path to skip-list file") parser.add_argument('--use-skip-list', action='store_true', default=False, help="Use skip list to skip tests if found") parser.add_argument('--db-engine', help='Database engine name') + parser.add_argument('--replicated-database', action='store_true', default=False, help='Run tests with Replicated database engine') parser.add_argument('--antlr', action='store_true', default=False, dest='antlr', help='Use new ANTLR parser in tests') parser.add_argument('--no-stateless', action='store_true', help='Disable all stateless tests') diff --git a/tests/config/install.sh b/tests/config/install.sh index 9965e1fb1ad..de6ba2a7a09 100755 --- a/tests/config/install.sh +++ b/tests/config/install.sh @@ -61,5 +61,8 @@ fi if [[ -n "$USE_DATABASE_ORDINARY" ]] && [[ "$USE_DATABASE_ORDINARY" -eq 1 ]]; then ln -sf $SRC_PATH/users.d/database_ordinary.xml $DEST_SERVER_PATH/users.d/ fi +if [[ -n "$USE_DATABASE_REPLICATED" ]] && [[ "$USE_DATABASE_REPLICATED" -eq 1 ]]; then + ln -sf $SRC_PATH/users.d/database_replicated.xml $DEST_SERVER_PATH/users.d/ +fi ln -sf $SRC_PATH/client_config.xml $DEST_CLIENT_PATH/config.xml diff --git a/tests/config/users.d/database_replicated.xml b/tests/config/users.d/database_replicated.xml new file mode 100644 index 00000000000..23801d00154 --- /dev/null +++ b/tests/config/users.d/database_replicated.xml @@ -0,0 +1,10 @@ + + + + 1 + 0 + 30 + 30 + + + diff --git a/tests/integration/test_materialize_mysql_database/test.py b/tests/integration/test_materialize_mysql_database/test.py index 0175ec78587..e55772d9e1d 100644 --- a/tests/integration/test_materialize_mysql_database/test.py +++ 
b/tests/integration/test_materialize_mysql_database/test.py @@ -14,7 +14,7 @@ DOCKER_COMPOSE_PATH = get_docker_compose_path() cluster = ClickHouseCluster(__file__) -node_db_ordinary = cluster.add_instance('node1', user_configs=["configs/users.xml"], with_mysql=False, stay_alive=True, with_zookeeper=True) #FIXME +node_db_ordinary = cluster.add_instance('node1', user_configs=["configs/users.xml"], with_mysql=False, stay_alive=True) node_db_atomic = cluster.add_instance('node2', user_configs=["configs/users_db_atomic.xml"], with_mysql=False, stay_alive=True) diff --git a/tests/integration/test_replicated_database/configs/settings.xml b/tests/integration/test_replicated_database/configs/settings.xml new file mode 100644 index 00000000000..e0f7e8691e6 --- /dev/null +++ b/tests/integration/test_replicated_database/configs/settings.xml @@ -0,0 +1,12 @@ + + + + 1 + + + + + default + + + diff --git a/tests/integration/test_replicated_database/test.py b/tests/integration/test_replicated_database/test.py index 0db6884fbb7..99e7d6077f8 100644 --- a/tests/integration/test_replicated_database/test.py +++ b/tests/integration/test_replicated_database/test.py @@ -8,11 +8,11 @@ from helpers.network import PartitionManager cluster = ClickHouseCluster(__file__) -main_node = cluster.add_instance('main_node', main_configs=['configs/config.xml'], with_zookeeper=True, stay_alive=True, macros={"shard": 1, "replica": 1}) -dummy_node = cluster.add_instance('dummy_node', main_configs=['configs/config.xml'], with_zookeeper=True, stay_alive=True, macros={"shard": 1, "replica": 2}) -competing_node = cluster.add_instance('competing_node', main_configs=['configs/config.xml'], with_zookeeper=True, macros={"shard": 1, "replica": 3}) -snapshotting_node = cluster.add_instance('snapshotting_node', main_configs=['configs/config.xml'], with_zookeeper=True, macros={"shard": 2, "replica": 1}) -snapshot_recovering_node = cluster.add_instance('snapshot_recovering_node', main_configs=['configs/config.xml'], with_zookeeper=True, macros={"shard": 2, "replica": 2}) +main_node = cluster.add_instance('main_node', main_configs=['configs/config.xml'], user_configs=['configs/settings.xml'], with_zookeeper=True, stay_alive=True, macros={"shard": 1, "replica": 1}) +dummy_node = cluster.add_instance('dummy_node', main_configs=['configs/config.xml'], user_configs=['configs/settings.xml'], with_zookeeper=True, stay_alive=True, macros={"shard": 1, "replica": 2}) +competing_node = cluster.add_instance('competing_node', main_configs=['configs/config.xml'], user_configs=['configs/settings.xml'], with_zookeeper=True, macros={"shard": 1, "replica": 3}) +snapshotting_node = cluster.add_instance('snapshotting_node', main_configs=['configs/config.xml'], user_configs=['configs/settings.xml'], with_zookeeper=True, macros={"shard": 2, "replica": 1}) +snapshot_recovering_node = cluster.add_instance('snapshot_recovering_node', main_configs=['configs/config.xml'], user_configs=['configs/settings.xml'], with_zookeeper=True, macros={"shard": 2, "replica": 2}) all_nodes = [main_node, dummy_node, competing_node, snapshotting_node, snapshot_recovering_node] diff --git a/tests/queries/skip_list.json b/tests/queries/skip_list.json index 2317cdcecac..db7b0631b97 100644 --- a/tests/queries/skip_list.json +++ b/tests/queries/skip_list.json @@ -100,10 +100,15 @@ "00604_show_create_database", "00609_mv_index_in_in", "00510_materizlized_view_and_deduplication_zookeeper", - "memory_tracking", /// FIXME remove it before merge + "00738_lock_for_inner_table" + ], + 
"database-replicated": [ "memory_tracking", "memory_usage", + "live_view", "01188_attach_table_from_pat", + "01415_sticking_mutations", + "01130_in_memory_parts", "01110_dictionary_layout_without_arguments", "01018_ddl_dictionaries_create", "01018_ddl_dictionaries_select", @@ -167,7 +172,6 @@ "01493_alter_remove_properties_zookeeper", "01475_read_subcolumns_storages", "01475_read_subcolumns", - "01463_test_alter_live_view_refresh", "01451_replicated_detach_drop_part", "01451_detach_drop_part", "01440_big_int_exotic_casts", @@ -180,9 +184,6 @@ "01355_alter_column_with_order", "01291_geo_types", "01270_optimize_skip_unused_shards_low_cardinality", - "01237_live_view_over_distributed_with_subquery_select_table_alias", - "01236_distributed_over_live_view_over_distributed", - "01235_live_view_over_distributed", "01182_materialized_view_different_structure", "01150_ddl_guard_rwr", "01148_zookeeper_path_macros_unfolding", @@ -194,7 +195,6 @@ "01073_attach_if_not_exists", "01072_optimize_skip_unused_shards_const_expr_eval", "01071_prohibition_secondary_index_with_old_format_merge_tree", - "01071_live_view_detach_dependency", "01062_alter_on_mutataion_zookeeper", "01060_shutdown_table_after_detach", "01056_create_table_as", @@ -207,27 +207,6 @@ "00989_parallel_parts_loading", "00980_zookeeper_merge_tree_alter_settings", "00980_merge_alter_settings", - "00980_create_temporary_live_view", - "00978_live_view_watch", - "00977_live_view_watch_events", - "00976_live_view_select_version", - "00975_live_view_create", - "00974_live_view_select_with_aggregation", - "00973_live_view_with_subquery_select_with_aggregation_in_subquery", - "00973_live_view_with_subquery_select_with_aggregation", - "00973_live_view_with_subquery_select_table_alias", - "00973_live_view_with_subquery_select_nested_with_aggregation_table_alias", - "00973_live_view_with_subquery_select_nested_with_aggregation", - "00973_live_view_with_subquery_select_nested", - "00973_live_view_with_subquery_select_join_no_alias", - "00973_live_view_with_subquery_select_join", - "00973_live_view_with_subquery_select", - "00973_live_view_select_prewhere", - "00973_live_view_select", - "00972_live_view_select_1", - "00969_live_view_watch_format_jsoneachrowwithprogress", - "00968_live_view_select_format_jsoneachrowwithprogress", - "00961_temporary_live_view_watch", "00955_test_final_mark", "00933_reserved_word", "00926_zookeeper_adaptive_index_granularity_replicated_merge_tree", From 3ce33603795d0649ae4fca41ae11aa9918d8b143 Mon Sep 17 00:00:00 2001 From: alesapin Date: Mon, 15 Feb 2021 18:36:25 +0300 Subject: [PATCH 144/381] Some initial code --- src/Coordination/Changelog.cpp | 315 ++++++++++++++++++++++++++ src/Coordination/Changelog.h | 81 +++++++ src/Coordination/InMemoryLogStore.cpp | 8 +- src/Coordination/NuKeeperLogStore.h | 24 ++ 4 files changed, 424 insertions(+), 4 deletions(-) create mode 100644 src/Coordination/Changelog.cpp create mode 100644 src/Coordination/Changelog.h create mode 100644 src/Coordination/NuKeeperLogStore.h diff --git a/src/Coordination/Changelog.cpp b/src/Coordination/Changelog.cpp new file mode 100644 index 00000000000..a38f039fa40 --- /dev/null +++ b/src/Coordination/Changelog.cpp @@ -0,0 +1,315 @@ +#include +#include +#include +#include + +namespace DB +{ + +namespace ErrorCodes +{ + extern const int CHECKSUM_DOESNT_MATCH; + extern const int CORRUPTED_DATA; + extern const int UNKNOWN_FORMAT_VERSION; + extern const int LOGICAL_ERROR; + extern const int UNIMPLEMENTED; +} + + +std::string toString(const ChangelogVersion & 
version) +{ + if (version == ChangelogVersion::V0) + return "V0"; + + throw Exception(ErrorCodes::UNKNOWN_FORMAT_VERSION, "Unknown chagelog version {}", static_cast(version)); +} + +ChangelogVersion fromString(const std::string & version_str) +{ + if (version == "V0") + return ChangelogVersion::V0; + + throw Exception(ErrorCodes::UNKNOWN_FORMAT_VERSION, "Unknown chagelog version {}", version_str); +} + +namespace +{ + +static constexpr auto DEFAULT_PREFIX = "changelog"; + +struct ChangelogName +{ + std::string prefix; + ChangelogVersion version; + size_t from_log_idx; + size_t to_log_idx; +}; + +std::string formatChangelogPath(const std::string & prefix, const ChangelogVersion & version, const ChangelogName & name) +{ + std::filesystem::path path(prefix); + path /= std::filesystem::path(name.prefix + "_" + toString(version) + "_" + std::to_string(name.from_log_idx) + "_" + std::to_string(name.to_log_idx) + ".log"); + return path.to_string(); +} + +ChangelogName getChangelogName(const std::string & path_str) +{ + std::filesystem::path path(path_str); + std:string filename = path.stem(); + Strings filename_parts; + boost::split(filename_parts, filename, boost::is_any_of("_")); + if (filename_parts.size() < 4) + throw Exception(ErrorCodes::CORRUPTED_DATA, "Invalid changelog {}", path_str); + + ChangelogName result; + result.prefix = filename_parts[0]; + result.version = fromString(filename_parts[1]); + result.form_log_idx = parse(filename_parts[2]); + result.to_log_idx = parse(filename_parts[3]); + return result; +} + +} + +class ChangelogWriter +{ +public: + ChangelogWriter(const std::string & filepath_, WriteMode mode, size_t start_index_) + : filepath(filepath_) + , plain_buf(filepath, DBMS_DEFAULT_BUFFER_SIZE, mode == WriteMode::Rewrite ? -1 : (O_APPEND | O_CREAT | O_WRONLY)) + , start_index(start_index_) + {} + + + off_t appendRecord(ChangelogRecord && record, bool sync) + { + off_t result = plain_buf.count(); + writeIntBinary(record.header.version, plain_buf); + writeIntBinary(record.header.index, plain_buf); + writeIntBinary(record.header.term, plain_buf); + writeIntBinary(record.header.value_type, plain_buf); + writeIntBinary(record.header.blob_size, plain_buf); + writeIntBinary(record.header.blob_checksum, plain_buf); + + if (record.blob_size != 0) + plain_buf.write(reinterpret_cast(record.blob->data_begin()), record.blob->size()); + + entries_written++; + + if (sync) + plain_buf.sync(); + reeturn result; + } + + void truncateToLength(off_t new_length) + { + flush(); + plain_buf.truncate(new_length); + } + + void flush() + { + plain_buf.sync(); + } + + size_t getEntriesWritten() const + { + return entries_written; + } + + size_t setEntriesWritten(size_t entries_written_) + { + entries_written = entries_written_; + } + + size_t getStartIndex() const + { + return start_index; + } + + void setStartIndex(size_t start_index_) + { + start_index = start_index_; + } + +private: + std::string filepath; + WriteBufferFromFile plain_buf; + size_t entries_written = 0; + size_t start_index; +}; + + +class ChangelogReader +{ +public: + explicit ChangelogReader(const std::string & filepath_) + : filepath(filepath_) + , read_buf(filepath) + {} + + size_t readChangelog(Changelog & changelog, IndexToOffset & index_to_offset) + { + size_t total_read = 0; + while (!read_buf.eof()) + { + total_read += 1; + off_t pos = read_buf.count(); + ChangelogRecord record; + readIntBinary(record.header.version, read_buf); + readIntBinary(record.header.index, read_buf); + readIntBinary(record.header.term, read_buf); + 
readIntBinary(record.header.value_type, read_buf); + readIntBinary(record.header.blob_size, read_buf); + readIntBinary(record.header.blob_checksum, read_buf); + auto buffer = nuraft::buffer::alloc(record.header.blob_size); + auto buffer_begin = reinterpret_cast(buffer->data_begin()); + read_buf.readStrict(buffer_begin, record.header.blob_size); + index_to_offset[record.header.index] = pos; + + Checksum checksum = CityHash_v1_0_2::CityHash128(buffer_begin, record.header.blob_size); + if (checksum != record.header.blob_checksum) + { + throw Exception(ErrorCodes::CHECKSUM_DOESNT_MATCH, + "Checksums doesn't match for log {} (version {}), index {}, blob_size {}", + filepath, record.header.version, record.header.index, record.header.blob_size); + } + + if (changlog.start_idx == 0) + changelog.start_idx = record.header.index; + + if (!changelog.try_emplace(record.header.index, buffer).second) + throw Exception(ErrorCodes::CORRUPTED_DATA, "Duplicated index id {} in log {}", record.header.index, filename); + } + return total_read; + } +private: + std::string filepath; + ReadBufferFromFile read_buf; +}; + +ChangelogOnDiskHelper::ChangelogOnDiskHelper(const std::string & changelogs_dir, size_t rotate_interval_) + : changelogs_dir(chagelogs_dir_) + , rotate_interval(rotate_interval_) +{ + namespace fs = std::filesystem; + for(const auto & p : fs::directory_iterator(changelogs_dir)) + existing_changelogs.push_back(p.path()); +} + +Changelog ChangelogOnDiskHelper::readChangelogAndInitWriter(size_t from_log_idx) +{ + Changelog result; + size_t read_from_last = 0; + for (const std::string & changelog_file : existing_changelogs) + { + ChangelogName parsed_name = getChangelogName(changelog_file); + if (parsed_name.to_log_idx >= from_log_idx) + { + ChangelogReader reader(changelog_file); + read_from_last = reader.readChangelog(result, index_to_start_pos); + } + } + if (existing_changelogs.size() > 0 && read_from_last < rotate_interval) + { + auto parsed_name = getChangelogName(existing_changelogs.back()); + current_writer = std::make_unique(existing_changelogs.back(), WriteMode::Append, parsed_name.from_log_idx); + current_writer->setEntriesWritten(read_from_last); + } + else + { + rotate(from_log_idx); + } + return result; +} + +void ChangelogOnDiskHelper::rotate(size_t new_start_log_idx) +{ + if (current_writer) + current_writer->flush(); + + ChangelogName new_name; + new_name.prefix = changelogs_dir; + new_name.version = CURRENT_CHANGELOG_VERSION; + new_name.from_log_idx = new_start_log_idx; + new_name.to_log_idx = new_start_log_idx; + + auto new_log_path = formatChagelogPath(changelogs_dir, CURRENT_CHANGELOG_VERSION, new_name); + existing_changelogs.push_back(new_log_path); + current_writer = std::make_unique(existing_changelogs.back(), WriteMode::Rewrite, new_start_log_idx); +} + +ChangelogRecord ChangelogOnDiskHelper::buildRecord(size_t index, nuraft::ptr log_entry) const +{ + ChangelogRecordHeader header; + header.index = index; + header.term = log_entry->get_term(); + header.value_type = log_entry->get_val_type(); + auto buffer = log_entry->get_buf_ptr(); + if (buffer) + { + header.blob_size = buffer->size(); + header.blob_checksum = CityHash_v1_0_2::CityHash128(reinterpret_cast(buffer->data_begin()), buffer->size()); + } + else + { + header.blob_size = 0; + header.blob_checksum = 0; + } + + return ChangelogRecord{header, buffer}; +} + +void ChangelogOnDiskHelper::appendRecord(size_t index, nuraft::ptr log_entry) +{ + if (!current_writer) + throw Exception(ErrorCodes::LOGICAL_ERROR, 
"ChangelogOnDiskHelper must be initialized before appending records"); + + if (current_writer->getEntriesWritten() == rotate_interval) + rotate(index); + + auto offset = current_writer->appendRecord(buildRecord(index, log_entry), true); + if (!index_to_start_pos.try_emplace(index, offset).second) + throw Exception(ErrorCodes::LOGICAL_ERROR, "Record with index {} already exists", index); + +} + +void ChangelogOnDiskHelper::writeAt(size_t index, nuraft::ptr log_entry) +{ + if (index < current_writer->getStartIndex()) + throw Exception(ErrorCodes::UNIMPLEMENTED, "Currently cannot overwrite index from previous file"); + + auto entries_written = current_writer->getEntriesWritten(); + current_writer->truncateToLength(index_to_start_pos(index)); + for (auto itr = index_to_start_pos.begin(); itr != index_to_start_pos.end();) + { + if (itr->first >= index) + { + entries_written--; + itr = index_to_start_pos.erase(itr); + } + else + itr++; + } + + current_writer->setEntriesWritten(entries_written); + + appendRecord(index, log_entry); +} + +void ChangelogOnDiskHelper::compact(size_t up_to_log_idx) +{ + for (auto itr = existing_changelogs.begin(); itr != existing_changelogs.end();) + { + ChangelogName parsed_name = getChangelogName(*itr); + if (parsed_name.to_log_idx <= up_to_log_idx) + { + std::filesystem::remove(itr); + itr = existing_changelogs.erase(itr); + for (size_t idx = parsed_name.from_log_idx; idx <= parsed_name.to_log_idx; ++idx) + index_to_start_pos.erase(idx); + } + } +} + +} diff --git a/src/Coordination/Changelog.h b/src/Coordination/Changelog.h new file mode 100644 index 00000000000..ffcd2a353bb --- /dev/null +++ b/src/Coordination/Changelog.h @@ -0,0 +1,81 @@ +#pragma once + +#include // Y_IGNORE +#include +#include +#include +#include +#include + +namespace DB +{ + +using Checksum = CityHash_v1_0_2::uint128; + +enum class ChangelogVersion : uint8_t +{ + V0 = 0, +}; + +std::string toString(const ChangelogVersion & version); +ChangelogVersion fromString(const std::string & version_str); + +static constexpr auto CURRENT_CHANGELOG_VERSION = ChangeLogVersion::V0; + +struct ChangelogRecordHeader +{ + ChangelogVersion version = CURRENT_CHANGELOG_VERSION; + size_t index; + size_t term; + nuraft::log_val_type value_type; + size_t blob_size; + Checksum blob_checksum; +}; + +struct ChangelogRecord +{ + ChangelogRecordHeader header; + nuraft::ptr blob; +}; + +using IndexToOffset = std::unordered_map; +using IndexToLogEntry = std::map>; + +struct Changelog +{ +public: +private: + IndexToLogEntry logs; + size_t start_idx = 0; +}; + +class ChangelogWriter; + +class ChangelogOnDiskHelper +{ + +public: + ChangelogOnDiskHelper(const std::string & changelogs_dir_, size_t rotate_interval_); + + Changelog readChangelogAndInitWriter(size_t from_log_idx); + + void appendRecord(size_t index, nuraft::ptr log_entry); + + void writeAt(size_t index, nuraft::ptr log_entry); + + void compact(size_t up_to_log_idx); + +private: + void rotate(size_t new_start_log_idex); + + ChangelogRecord buildRecord(size_t index, nuraft::ptr log_entry) const; + +private: + std::string changelogs_dir; + std::deque existing_changelogs; + std::unique_ptr current_writer; + IndexToOffset index_to_start_pos; + const size_t rotate_interval; +}; + +} diff --git a/src/Coordination/InMemoryLogStore.cpp b/src/Coordination/InMemoryLogStore.cpp index 101458891e7..877c8a60a2a 100644 --- a/src/Coordination/InMemoryLogStore.cpp +++ b/src/Coordination/InMemoryLogStore.cpp @@ -72,12 +72,12 @@ nuraft::ptr>> InMemoryLogStore::log_e ret->resize(end - 
start); size_t cc = 0; - for (size_t ii = start; ii < end; ++ii) + for (size_t i = start; i < end; ++i) { nuraft::ptr src = nullptr; { std::lock_guard l(logs_lock); - auto entry = logs.find(ii); + auto entry = logs.find(i); if (entry == logs.end()) { entry = logs.find(0); @@ -152,9 +152,9 @@ void InMemoryLogStore::apply_pack(size_t index, nuraft::buffer & pack) pack.pos(0); Int32 num_logs = pack.get_int(); - for (Int32 ii = 0; ii < num_logs; ++ii) + for (Int32 i = 0; i < num_logs; ++i) { - size_t cur_idx = index + ii; + size_t cur_idx = index + i; Int32 buf_size = pack.get_int(); nuraft::ptr buf_local = nuraft::buffer::alloc(buf_size); diff --git a/src/Coordination/NuKeeperLogStore.h b/src/Coordination/NuKeeperLogStore.h new file mode 100644 index 00000000000..2d066ac3e3a --- /dev/null +++ b/src/Coordination/NuKeeperLogStore.h @@ -0,0 +1,24 @@ +#pragma once +#include // Y_IGNORE +#include +#include +#include +#include + +namespace DB +{ + +class NuKeeperLogStore : public nuraft::log_store +{ +public: + NuKeeperLogStore(const std::string & changelogs_path, size_t rotate_interval_); + + +private: + mutable std::mutex logs_lock; + std::atomic start_idx; + Changelog in_memory_changelog; + ChangelogOnDiskHelper on_disk_changelog_helper; +}; + +} From d38198dade3b79bcfecbee338d719e38d2c68501 Mon Sep 17 00:00:00 2001 From: lehasm Date: Mon, 15 Feb 2021 18:58:46 +0300 Subject: [PATCH 145/381] ru translation --- .../functions/string-functions.md | 43 +++++++++++++++++++ 1 file changed, 43 insertions(+) diff --git a/docs/ru/sql-reference/functions/string-functions.md b/docs/ru/sql-reference/functions/string-functions.md index aeb0652cc18..b1c4012e9f9 100644 --- a/docs/ru/sql-reference/functions/string-functions.md +++ b/docs/ru/sql-reference/functions/string-functions.md @@ -597,4 +597,47 @@ Hello, "world"! 'foo' ``` + +## decodeXMLComponent {#decode-xml-component} + +Заменяет символами предопределенные мнемоники XML: `"` `&` `'` `>` `<` +Также эта функция заменяет числовые ссылки соответствующими символами юникод. +Поддерживаются десятичная (например, `✓`) и шестнадцатеричная (`✓`) формы. + +**Синтаксис** + +``` sql +decodeXMLComponent(x) +``` + +**Параметры** + +- `x` — последовательность символов. [String](../../sql-reference/data-types/string.md). + +**Возвращаемое значение** + +- Строка с произведенными заменами. + +Тип: [String](../../sql-reference/data-types/string.md). 
+ +**Пример** + +Запрос: + +``` sql +SELECT decodeXMLComponent(''foo''); +SELECT decodeXMLComponent('< Σ >'); +``` + +Результат: + +``` text +'foo' +< Σ > +``` + +**Смотрите также** + +- [Мнемоники в HTML](https://ru.wikipedia.org/wiki/%D0%9C%D0%BD%D0%B5%D0%BC%D0%BE%D0%BD%D0%B8%D0%BA%D0%B8_%D0%B2_HTML) + [Оригинальная статья](https://clickhouse.tech/docs/ru/query_language/functions/string_functions/) From 5401116988b83cee6e4cf136d95843494c5523f0 Mon Sep 17 00:00:00 2001 From: alesapin Date: Mon, 15 Feb 2021 20:59:40 +0300 Subject: [PATCH 146/381] Compileable code --- src/Coordination/Changelog.cpp | 183 +++++++++++++++++---- src/Coordination/Changelog.h | 54 ++++-- src/Coordination/NuKeeperLogStore.h | 31 +++- src/Coordination/tests/gtest_for_build.cpp | 26 ++- 4 files changed, 238 insertions(+), 56 deletions(-) diff --git a/src/Coordination/Changelog.cpp b/src/Coordination/Changelog.cpp index a38f039fa40..f06185124da 100644 --- a/src/Coordination/Changelog.cpp +++ b/src/Coordination/Changelog.cpp @@ -1,7 +1,11 @@ #include #include #include +#include #include +#include +#include +#include namespace DB { @@ -26,7 +30,7 @@ std::string toString(const ChangelogVersion & version) ChangelogVersion fromString(const std::string & version_str) { - if (version == "V0") + if (version_str == "V0") return ChangelogVersion::V0; throw Exception(ErrorCodes::UNKNOWN_FORMAT_VERSION, "Unknown chagelog version {}", version_str); @@ -49,13 +53,13 @@ std::string formatChangelogPath(const std::string & prefix, const ChangelogVersi { std::filesystem::path path(prefix); path /= std::filesystem::path(name.prefix + "_" + toString(version) + "_" + std::to_string(name.from_log_idx) + "_" + std::to_string(name.to_log_idx) + ".log"); - return path.to_string(); + return path; } ChangelogName getChangelogName(const std::string & path_str) { std::filesystem::path path(path_str); - std:string filename = path.stem(); + std::string filename = path.stem(); Strings filename_parts; boost::split(filename_parts, filename, boost::is_any_of("_")); if (filename_parts.size() < 4) @@ -64,11 +68,16 @@ ChangelogName getChangelogName(const std::string & path_str) ChangelogName result; result.prefix = filename_parts[0]; result.version = fromString(filename_parts[1]); - result.form_log_idx = parse(filename_parts[2]); + result.from_log_idx = parse(filename_parts[2]); result.to_log_idx = parse(filename_parts[3]); return result; } +LogEntryPtr makeClone(const LogEntryPtr & entry) +{ + return cs_new(entry->get_term(), nuraft::buffer::clone(entry->get_buf()), entry->get_val_type()); +} + } class ChangelogWriter @@ -91,14 +100,14 @@ public: writeIntBinary(record.header.blob_size, plain_buf); writeIntBinary(record.header.blob_checksum, plain_buf); - if (record.blob_size != 0) + if (record.header.blob_size != 0) plain_buf.write(reinterpret_cast(record.blob->data_begin()), record.blob->size()); entries_written++; if (sync) plain_buf.sync(); - reeturn result; + return result; } void truncateToLength(off_t new_length) @@ -117,7 +126,7 @@ public: return entries_written; } - size_t setEntriesWritten(size_t entries_written_) + void setEntriesWritten(size_t entries_written_) { entries_written = entries_written_; } @@ -148,7 +157,7 @@ public: , read_buf(filepath) {} - size_t readChangelog(Changelog & changelog, IndexToOffset & index_to_offset) + size_t readChangelog(IndexToLogEntry & logs, size_t start_log_idx, IndexToOffset & index_to_offset) { size_t total_read = 0; while (!read_buf.eof()) @@ -174,12 +183,12 @@ public: "Checksums doesn't match for log {} 
(version {}), index {}, blob_size {}", filepath, record.header.version, record.header.index, record.header.blob_size); } + if (record.header.index < start_log_idx) + continue; - if (changlog.start_idx == 0) - changelog.start_idx = record.header.index; - - if (!changelog.try_emplace(record.header.index, buffer).second) - throw Exception(ErrorCodes::CORRUPTED_DATA, "Duplicated index id {} in log {}", record.header.index, filename); + auto log_entry = nuraft::cs_new(record.header.term, buffer, record.header.value_type); + if (!logs.try_emplace(record.header.index, log_entry).second) + throw Exception(ErrorCodes::CORRUPTED_DATA, "Duplicated index id {} in log {}", record.header.index, filepath); } return total_read; } @@ -188,8 +197,8 @@ private: ReadBufferFromFile read_buf; }; -ChangelogOnDiskHelper::ChangelogOnDiskHelper(const std::string & changelogs_dir, size_t rotate_interval_) - : changelogs_dir(chagelogs_dir_) +Changelog::Changelog(const std::string & changelogs_dir_, size_t rotate_interval_) + : changelogs_dir(changelogs_dir_) , rotate_interval(rotate_interval_) { namespace fs = std::filesystem; @@ -197,9 +206,8 @@ ChangelogOnDiskHelper::ChangelogOnDiskHelper(const std::string & changelogs_dir, existing_changelogs.push_back(p.path()); } -Changelog ChangelogOnDiskHelper::readChangelogAndInitWriter(size_t from_log_idx) +void Changelog::readChangelogAndInitWriter(size_t from_log_idx) { - Changelog result; size_t read_from_last = 0; for (const std::string & changelog_file : existing_changelogs) { @@ -207,9 +215,12 @@ Changelog ChangelogOnDiskHelper::readChangelogAndInitWriter(size_t from_log_idx) if (parsed_name.to_log_idx >= from_log_idx) { ChangelogReader reader(changelog_file); - read_from_last = reader.readChangelog(result, index_to_start_pos); + read_from_last = reader.readChangelog(logs, from_log_idx, index_to_start_pos); } } + + start_index = from_log_idx == 0 ? 
1 : from_log_idx; + if (existing_changelogs.size() > 0 && read_from_last < rotate_interval) { auto parsed_name = getChangelogName(existing_changelogs.back()); @@ -220,26 +231,25 @@ Changelog ChangelogOnDiskHelper::readChangelogAndInitWriter(size_t from_log_idx) { rotate(from_log_idx); } - return result; } -void ChangelogOnDiskHelper::rotate(size_t new_start_log_idx) +void Changelog::rotate(size_t new_start_log_idx) { if (current_writer) current_writer->flush(); ChangelogName new_name; - new_name.prefix = changelogs_dir; + new_name.prefix = DEFAULT_PREFIX; new_name.version = CURRENT_CHANGELOG_VERSION; new_name.from_log_idx = new_start_log_idx; new_name.to_log_idx = new_start_log_idx; - auto new_log_path = formatChagelogPath(changelogs_dir, CURRENT_CHANGELOG_VERSION, new_name); + auto new_log_path = formatChangelogPath(changelogs_dir, CURRENT_CHANGELOG_VERSION, new_name); existing_changelogs.push_back(new_log_path); current_writer = std::make_unique(existing_changelogs.back(), WriteMode::Rewrite, new_start_log_idx); } -ChangelogRecord ChangelogOnDiskHelper::buildRecord(size_t index, nuraft::ptr log_entry) const +ChangelogRecord Changelog::buildRecord(size_t index, nuraft::ptr log_entry) const { ChangelogRecordHeader header; header.index = index; @@ -254,16 +264,16 @@ ChangelogRecord ChangelogOnDiskHelper::buildRecord(size_t index, nuraft::ptr log_entry) +void Changelog::appendEntry(size_t index, nuraft::ptr log_entry) { if (!current_writer) - throw Exception(ErrorCodes::LOGICAL_ERROR, "ChangelogOnDiskHelper must be initialized before appending records"); + throw Exception(ErrorCodes::LOGICAL_ERROR, "Changelog must be initialized before appending records"); if (current_writer->getEntriesWritten() == rotate_interval) rotate(index); @@ -271,16 +281,19 @@ void ChangelogOnDiskHelper::appendRecord(size_t index, nuraft::ptrappendRecord(buildRecord(index, log_entry), true); if (!index_to_start_pos.try_emplace(index, offset).second) throw Exception(ErrorCodes::LOGICAL_ERROR, "Record with index {} already exists", index); - + logs[index] = makeClone(log_entry); } -void ChangelogOnDiskHelper::writeAt(size_t index, nuraft::ptr log_entry) +void Changelog::writeAt(size_t index, nuraft::ptr log_entry) { if (index < current_writer->getStartIndex()) throw Exception(ErrorCodes::UNIMPLEMENTED, "Currently cannot overwrite index from previous file"); + if (index_to_start_pos.count(index) == 0) + throw Exception(ErrorCodes::LOGICAL_ERROR, "Cannot write at index {} because changelog doesn't contain it", index); + auto entries_written = current_writer->getEntriesWritten(); - current_writer->truncateToLength(index_to_start_pos(index)); + current_writer->truncateToLength(index_to_start_pos[index]); for (auto itr = index_to_start_pos.begin(); itr != index_to_start_pos.end();) { if (itr->first >= index) @@ -294,22 +307,128 @@ void ChangelogOnDiskHelper::writeAt(size_t index, nuraft::ptr current_writer->setEntriesWritten(entries_written); - appendRecord(index, log_entry); + auto itr = logs.lower_bound(index); + while (itr != logs.end()) + itr = logs.erase(itr); + + appendEntry(index, log_entry); } -void ChangelogOnDiskHelper::compact(size_t up_to_log_idx) +void Changelog::compact(size_t up_to_log_idx) { for (auto itr = existing_changelogs.begin(); itr != existing_changelogs.end();) { ChangelogName parsed_name = getChangelogName(*itr); if (parsed_name.to_log_idx <= up_to_log_idx) { - std::filesystem::remove(itr); + std::filesystem::remove(*itr); itr = existing_changelogs.erase(itr); for (size_t idx = 
parsed_name.from_log_idx; idx <= parsed_name.to_log_idx; ++idx) + { + auto logs_itr = logs.find(idx); + if (logs_itr != logs.end()) + logs.erase(idx); + else + break; index_to_start_pos.erase(idx); + } } } } +LogEntryPtr Changelog::getLastEntry() const +{ + + static LogEntryPtr fake_entry = nuraft::cs_new(0, nuraft::buffer::alloc(sizeof(size_t))); + + size_t next_idx = getNextEntryIndex() - 1; + auto entry = logs.find(next_idx); + if (entry == logs.end()) + return fake_entry; + + return makeClone(entry->second); +} + +LogEntriesPtr Changelog::getLogEntriesBetween(size_t start, size_t end) +{ + LogEntriesPtr ret = nuraft::cs_new>>(); + + ret->resize(end - start); + size_t result_pos = 0; + for (size_t i = start; i < end; ++i) + { + (*ret)[result_pos] = entryAt(i); + result_pos++; + } + return ret; +} + +LogEntryPtr Changelog::entryAt(size_t idx) +{ + nuraft::ptr src = nullptr; + auto entry = logs.find(idx); + if (entry == logs.end()) + return nullptr; + + src = entry->second; + return makeClone(src); +} + +nuraft::ptr Changelog::serializeEntriesToBuffer(size_t index, int32_t cnt) +{ + std::vector> returned_logs; + + size_t size_total = 0; + for (size_t i = index; i < index + cnt; ++i) + { + auto entry = logs.find(i); + if (entry == logs.end()) + throw Exception(ErrorCodes::LOGICAL_ERROR, "Don't have log entry {}", i); + + nuraft::ptr buf = entry->second->serialize(); + size_total += buf->size(); + returned_logs.push_back(buf); + } + + nuraft::ptr buf_out = nuraft::buffer::alloc(sizeof(int32_t) + cnt * sizeof(int32_t) + size_total); + buf_out->pos(0); + buf_out->put(static_cast(cnt)); + + for (auto & entry : returned_logs) + { + nuraft::ptr & bb = entry; + buf_out->put(static_cast(bb->size())); + buf_out->put(*bb); + } + return buf_out; +} + +void Changelog::applyEntriesFromBuffer(size_t index, nuraft::buffer & buffer) +{ + buffer.pos(0); + int num_logs = buffer.get_int(); + + for (int i = 0; i < num_logs; ++i) + { + size_t cur_idx = index + i; + int buf_size = buffer.get_int(); + + nuraft::ptr buf_local = nuraft::buffer::alloc(buf_size); + buffer.get(buf_local); + + LogEntryPtr log_entry = nuraft::log_entry::deserialize(*buf_local); + if (i == 0 && logs.count(cur_idx)) + writeAt(cur_idx, log_entry); + else + appendEntry(cur_idx, log_entry); + } +} + +void Changelog::flush() +{ + current_writer->flush(); +} + +Changelog::~Changelog() = default; + } diff --git a/src/Coordination/Changelog.h b/src/Coordination/Changelog.h index ffcd2a353bb..c58f35cb4a1 100644 --- a/src/Coordination/Changelog.h +++ b/src/Coordination/Changelog.h @@ -12,6 +12,13 @@ namespace DB using Checksum = CityHash_v1_0_2::uint128; +using LogEntryPtr = nuraft::ptr; +using LogEntries = std::vector; +using LogEntriesPtr = nuraft::ptr; + +using IndexToOffset = std::unordered_map; +using IndexToLogEntry = std::map; + enum class ChangelogVersion : uint8_t { V0 = 0, @@ -20,7 +27,7 @@ enum class ChangelogVersion : uint8_t std::string toString(const ChangelogVersion & version); ChangelogVersion fromString(const std::string & version_str); -static constexpr auto CURRENT_CHANGELOG_VERSION = ChangeLogVersion::V0; +static constexpr auto CURRENT_CHANGELOG_VERSION = ChangelogVersion::V0; struct ChangelogRecordHeader { @@ -38,33 +45,48 @@ struct ChangelogRecord nuraft::ptr blob; }; -using IndexToOffset = std::unordered_map; -using IndexToLogEntry = std::map>; -struct Changelog -{ -public: -private: - IndexToLogEntry logs; - size_t start_idx = 0; -}; class ChangelogWriter; -class ChangelogOnDiskHelper +class Changelog { public: - 
ChangelogOnDiskHelper(const std::string & changelogs_dir_, size_t rotate_interval_); + Changelog(const std::string & changelogs_dir_, size_t rotate_interval_); - Changelog readChangelogAndInitWriter(size_t from_log_idx); + void readChangelogAndInitWriter(size_t from_log_idx); - void appendRecord(size_t index, nuraft::ptr log_entry); + void appendEntry(size_t index, LogEntryPtr log_entry); - void writeAt(size_t index, nuraft::ptr log_entry); + void writeAt(size_t index, LogEntryPtr log_entry); void compact(size_t up_to_log_idx); + size_t getNextEntryIndex() const + { + return start_index + logs.size() - 1; + } + + size_t getStartIndex() const + { + return start_index; + } + + LogEntryPtr getLastEntry() const; + + LogEntriesPtr getLogEntriesBetween(size_t start_index, size_t end_idx); + + LogEntryPtr entryAt(size_t idx); + + nuraft::ptr serializeEntriesToBuffer(size_t index, Int32 cnt); + + void applyEntriesFromBuffer(size_t index, nuraft::buffer & buffer); + + void flush(); + + ~Changelog(); + private: void rotate(size_t new_start_log_idex); @@ -76,6 +98,8 @@ private: std::unique_ptr current_writer; IndexToOffset index_to_start_pos; const size_t rotate_interval; + IndexToLogEntry logs; + size_t start_index = 0; }; } diff --git a/src/Coordination/NuKeeperLogStore.h b/src/Coordination/NuKeeperLogStore.h index 2d066ac3e3a..981dc3f24e7 100644 --- a/src/Coordination/NuKeeperLogStore.h +++ b/src/Coordination/NuKeeperLogStore.h @@ -13,12 +13,35 @@ class NuKeeperLogStore : public nuraft::log_store public: NuKeeperLogStore(const std::string & changelogs_path, size_t rotate_interval_); + void init(size_t from_log_idx); + + size_t start_index() const override; + + size_t next_slot() const override; + + nuraft::ptr last_entry() const override; + + size_t append(nuraft::ptr & entry) override; + + void write_at(size_t index, nuraft::ptr & entry) override; + + nuraft::ptr>> log_entries(size_t start, size_t end) override; + + nuraft::ptr entry_at(size_t index) override; + + size_t term_at(size_t index) override; + + nuraft::ptr pack(size_t index, int32_t cnt) override; + + void apply_pack(size_t index, nuraft::buffer & pack) override; + + bool compact(size_t last_log_index) override; + + bool flush() override; private: - mutable std::mutex logs_lock; - std::atomic start_idx; - Changelog in_memory_changelog; - ChangelogOnDiskHelper on_disk_changelog_helper; + mutable std::mutex changelog_lock; + Changelog changelog; }; } diff --git a/src/Coordination/tests/gtest_for_build.cpp b/src/Coordination/tests/gtest_for_build.cpp index ed9777350c5..6142ee0b5c0 100644 --- a/src/Coordination/tests/gtest_for_build.cpp +++ b/src/Coordination/tests/gtest_for_build.cpp @@ -22,6 +22,8 @@ #include #include // Y_IGNORE #include +#include +#include TEST(CoordinationTest, BuildTest) @@ -134,7 +136,7 @@ struct SimpliestRaftServer using SummingRaftServer = SimpliestRaftServer; -nuraft::ptr getLogEntry(int64_t number) +nuraft::ptr getBuffer(int64_t number) { nuraft::ptr ret = nuraft::buffer::alloc(sizeof(number)); nuraft::buffer_serializer bs(ret); @@ -151,7 +153,7 @@ TEST(CoordinationTest, TestSummingRaft1) /// Single node is leader EXPECT_EQ(s1.raft_instance->get_leader(), 1); - auto entry1 = getLogEntry(143); + auto entry1 = getBuffer(143); auto ret = s1.raft_instance->append_entries({entry1}); EXPECT_TRUE(ret->get_accepted()) << "failed to replicate: entry 1" << ret->get_result_code(); EXPECT_EQ(ret->get_result_code(), nuraft::cmd_result_code::OK) << "failed to replicate: entry 1" << ret->get_result_code(); @@ -209,7 +211,7 @@ 
TEST(CoordinationTest, TestSummingRaft3) EXPECT_EQ(s3.raft_instance->get_leader(), 2); std::cerr << "Starting to add entries\n"; - auto entry = getLogEntry(1); + auto entry = getBuffer(1); auto ret = s2.raft_instance->append_entries({entry}); EXPECT_TRUE(ret->get_accepted()) << "failed to replicate: entry 1" << ret->get_result_code(); EXPECT_EQ(ret->get_result_code(), nuraft::cmd_result_code::OK) << "failed to replicate: entry 1" << ret->get_result_code(); @@ -236,7 +238,7 @@ TEST(CoordinationTest, TestSummingRaft3) EXPECT_EQ(s2.state_machine->getValue(), 1); EXPECT_EQ(s3.state_machine->getValue(), 1); - auto non_leader_entry = getLogEntry(3); + auto non_leader_entry = getBuffer(3); auto ret_non_leader1 = s1.raft_instance->append_entries({non_leader_entry}); EXPECT_FALSE(ret_non_leader1->get_accepted()); @@ -245,7 +247,7 @@ TEST(CoordinationTest, TestSummingRaft3) EXPECT_FALSE(ret_non_leader3->get_accepted()); - auto leader_entry = getLogEntry(77); + auto leader_entry = getBuffer(77); auto ret_leader = s2.raft_instance->append_entries({leader_entry}); EXPECT_TRUE(ret_leader->get_accepted()) << "failed to replicate: entry 78" << ret_leader->get_result_code(); EXPECT_EQ(ret_leader->get_result_code(), nuraft::cmd_result_code::OK) << "failed to replicate: entry 78" << ret_leader->get_result_code(); @@ -333,4 +335,18 @@ TEST(CoordinationTest, TestStorageSerialization) EXPECT_EQ(new_storage.ephemerals[1].size(), 1); } +DB::LogEntryPtr getLogEntry(const std::string & s) +{ + DB::WriteBufferFromNuraftBuffer bufwriter; + writeText(s, bufwriter); + return nuraft::cs_new(0, bufwriter.getBuffer()); +} + +TEST(CoordinationTest, ChangelogTestSimple) +{ + DB::Changelog changelog("./logs", 5); + auto entry = getLogEntry("hello world"); + changelog.appendEntry(1, entry); +} + #endif From ed9f2b5eb99335471c9f0b60bf9633e1d75a5204 Mon Sep 17 00:00:00 2001 From: alesapin Date: Mon, 15 Feb 2021 21:01:01 +0300 Subject: [PATCH 147/381] Linkable code --- src/Coordination/Changelog.cpp | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/src/Coordination/Changelog.cpp b/src/Coordination/Changelog.cpp index f06185124da..d3ba176f209 100644 --- a/src/Coordination/Changelog.cpp +++ b/src/Coordination/Changelog.cpp @@ -16,7 +16,7 @@ namespace ErrorCodes extern const int CORRUPTED_DATA; extern const int UNKNOWN_FORMAT_VERSION; extern const int LOGICAL_ERROR; - extern const int UNIMPLEMENTED; + extern const int NOT_IMPLEMENTED; } @@ -287,7 +287,7 @@ void Changelog::appendEntry(size_t index, nuraft::ptr log_ent void Changelog::writeAt(size_t index, nuraft::ptr log_entry) { if (index < current_writer->getStartIndex()) - throw Exception(ErrorCodes::UNIMPLEMENTED, "Currently cannot overwrite index from previous file"); + throw Exception(ErrorCodes::NOT_IMPLEMENTED, "Currently cannot overwrite index from previous file"); if (index_to_start_pos.count(index) == 0) throw Exception(ErrorCodes::LOGICAL_ERROR, "Cannot write at index {} because changelog doesn't contain it", index); From 6734df2a014fd8b3b587592ecfe21244f06ef0c4 Mon Sep 17 00:00:00 2001 From: lehasm Date: Mon, 15 Feb 2021 21:25:32 +0300 Subject: [PATCH 148/381] Unnecessary new lines removed --- docs/en/sql-reference/functions/string-functions.md | 6 ++---- docs/ru/sql-reference/functions/string-functions.md | 3 +-- 2 files changed, 3 insertions(+), 6 deletions(-) diff --git a/docs/en/sql-reference/functions/string-functions.md b/docs/en/sql-reference/functions/string-functions.md index fa9c84fa9af..03f6237bfe8 100644 --- 
a/docs/en/sql-reference/functions/string-functions.md +++ b/docs/en/sql-reference/functions/string-functions.md @@ -602,10 +602,8 @@ Hello, "world"! ## decodeXMLComponent {#decode-xml-component} -Replaces XML predefined entities with characters. -Predefined entities are `"` `&` `'` `>` `<` -This function also replaces numeric character references with Unicode characters. -Both decimal (like `✓`) and hexadecimal (`✓`) forms are supported. +Replaces XML predefined entities with characters. Predefined entities are `"` `&` `'` `>` `<` +This function also replaces numeric character references with Unicode characters. Both decimal (like `✓`) and hexadecimal (`✓`) forms are supported. **Syntax** diff --git a/docs/ru/sql-reference/functions/string-functions.md b/docs/ru/sql-reference/functions/string-functions.md index b1c4012e9f9..236583c211a 100644 --- a/docs/ru/sql-reference/functions/string-functions.md +++ b/docs/ru/sql-reference/functions/string-functions.md @@ -601,8 +601,7 @@ Hello, "world"! ## decodeXMLComponent {#decode-xml-component} Заменяет символами предопределенные мнемоники XML: `"` `&` `'` `>` `<` -Также эта функция заменяет числовые ссылки соответствующими символами юникод. -Поддерживаются десятичная (например, `✓`) и шестнадцатеричная (`✓`) формы. +Также эта функция заменяет числовые ссылки соответствующими символами юникод. Поддерживаются десятичная (например, `✓`) и шестнадцатеричная (`✓`) формы. **Синтаксис** From cf57c3b4a2b1741a8f12ee41ddb29659e06876de Mon Sep 17 00:00:00 2001 From: Alexander Tokmakov Date: Mon, 15 Feb 2021 23:00:59 +0300 Subject: [PATCH 149/381] update comments --- src/Common/ZooKeeper/ZooKeeper.cpp | 8 ------ src/Databases/DatabaseFactory.cpp | 12 ++++++--- src/Databases/DatabaseReplicated.cpp | 35 +++++++++++++++++------- src/Databases/DatabaseReplicated.h | 40 ++++++++-------------------- tests/queries/skip_list.json | 1 + 5 files changed, 46 insertions(+), 50 deletions(-) diff --git a/src/Common/ZooKeeper/ZooKeeper.cpp b/src/Common/ZooKeeper/ZooKeeper.cpp index dc6abca6892..a1c6eb9b481 100644 --- a/src/Common/ZooKeeper/ZooKeeper.cpp +++ b/src/Common/ZooKeeper/ZooKeeper.cpp @@ -551,14 +551,6 @@ Coordination::Error ZooKeeper::trySet(const std::string & path, const std::strin Coordination::Error ZooKeeper::multiImpl(const Coordination::Requests & requests, Coordination::Responses & responses) { - String desc; - for (const auto & r : requests) - { - auto & r_ref = *r; - desc += String(typeid(r_ref).name()) + "\t" + r->getPath() + "\n"; - } - LOG_TRACE(&Poco::Logger::get("ZKTX"), "zk multi {}", desc); - if (requests.empty()) return Coordination::Error::ZOK; diff --git a/src/Databases/DatabaseFactory.cpp b/src/Databases/DatabaseFactory.cpp index cbe1b8bb02a..ca2b9bb083e 100644 --- a/src/Databases/DatabaseFactory.cpp +++ b/src/Databases/DatabaseFactory.cpp @@ -14,6 +14,7 @@ #include #include #include +#include #if !defined(ARCADIA_BUILD) # include "config_core.h" @@ -196,10 +197,13 @@ DatabasePtr DatabaseFactory::getImpl(const ASTCreateQuery & create, const String const auto & arguments = engine->arguments->children; - //TODO allow macros in arguments - const auto & zookeeper_path = safeGetLiteralValue(arguments[0], "Replicated"); - const auto & shard_name = safeGetLiteralValue(arguments[1], "Replicated"); - const auto & replica_name = safeGetLiteralValue(arguments[2], "Replicated"); + String zookeeper_path = safeGetLiteralValue(arguments[0], "Replicated"); + String shard_name = safeGetLiteralValue(arguments[1], "Replicated"); + String replica_name = 
safeGetLiteralValue(arguments[2], "Replicated"); + + zookeeper_path = context.getMacros()->expand(zookeeper_path); + shard_name = context.getMacros()->expand(shard_name); + replica_name = context.getMacros()->expand(replica_name); return std::make_shared(database_name, metadata_path, uuid, zookeeper_path, shard_name, replica_name, context); } diff --git a/src/Databases/DatabaseReplicated.cpp b/src/Databases/DatabaseReplicated.cpp index dc1203e8cc9..441880ae616 100644 --- a/src/Databases/DatabaseReplicated.cpp +++ b/src/Databases/DatabaseReplicated.cpp @@ -208,10 +208,13 @@ void DatabaseReplicated::tryConnectToZooKeeper(bool force_attach) is_readonly = false; } - catch(...) + catch (...) { if (!force_attach) throw; + + /// It's server startup, ignore error. + /// Worker thread will try to setup ZooKeeper connection tryLogCurrentException(log); } } @@ -234,10 +237,11 @@ bool DatabaseReplicated::createDatabaseNodesInZooKeeper(const zkutil::ZooKeeperP Coordination::Responses responses; auto res = current_zookeeper->tryMulti(ops, responses); if (res == Coordination::Error::ZOK) - return true; + return true; /// Created new database (it's the first replica) if (res == Coordination::Error::ZNODEEXISTS) - return false; + return false; /// Database exists, we will add new replica + /// Other codes are unexpected, will throw zkutil::KeeperMultiException::check(res, ops, responses); assert(false); __builtin_unreachable(); @@ -285,6 +289,7 @@ BlockIO DatabaseReplicated::propose(const ASTPtr & query, const Context & query_ if (query_context.getClientInfo().query_kind != ClientInfo::QueryKind::INITIAL_QUERY) throw Exception(ErrorCodes::INCORRECT_QUERY, "It's not initial query. ON CLUSTER is not allowed for Replicated database."); + /// Replicas will set correct name of current database in query context (database name can be different on replicas) if (auto * ddl_query = query->as()) ddl_query->database.clear(); @@ -337,6 +342,11 @@ static UUID getTableUUIDIfReplicated(const String & metadata, const Context & co void DatabaseReplicated::recoverLostReplica(const ZooKeeperPtr & current_zookeeper, UInt32 our_log_ptr, UInt32 max_log_ptr) { + /// Let's compare local (possibly outdated) metadata with (most actual) metadata stored in ZooKeeper + /// and try to update the set of local tables. + /// We could drop all local tables and create the new ones just like it's new replica. + /// But it will cause all ReplicatedMergeTree tables to fetch all data parts again and data in other tables will be lost. + bool new_replica = our_log_ptr == 0; if (new_replica) LOG_INFO(log, "Will create new replica from log pointer {}", max_log_ptr); @@ -350,7 +360,7 @@ void DatabaseReplicated::recoverLostReplica(const ZooKeeperPtr & current_zookeep /// For ReplicatedMergeTree tables we can compare only UUIDs to ensure that it's the same table. /// Metadata can be different, it's handled on table replication level. - /// We need to handle only renamed tables. + /// We need to handle renamed tables only. /// TODO maybe we should also update MergeTree SETTINGS if required? 
std::unordered_map zk_replicated_id_to_name; for (const auto & zk_table : table_name_to_metadata) @@ -360,6 +370,7 @@ void DatabaseReplicated::recoverLostReplica(const ZooKeeperPtr & current_zookeep zk_replicated_id_to_name.emplace(zk_replicated_id, zk_table.first); } + /// We will drop or move tables which exist only in local metadata Strings tables_to_detach; std::vector> replicated_tables_to_rename; size_t total_tables = 0; @@ -370,12 +381,16 @@ void DatabaseReplicated::recoverLostReplica(const ZooKeeperPtr & current_zookeep UUID local_replicated_id = UUIDHelpers::Nil; if (existing_tables_it->table()->supportsReplication()) { + /// Check if replicated tables have the same UUID local_replicated_id = existing_tables_it->table()->getStorageID().uuid; auto it = zk_replicated_id_to_name.find(local_replicated_id); if (it != zk_replicated_id_to_name.end()) { if (name != it->second) + { + /// Need just update table name replicated_tables_to_rename.emplace_back(name, it->second); + } continue; } } @@ -383,7 +398,8 @@ void DatabaseReplicated::recoverLostReplica(const ZooKeeperPtr & current_zookeep auto in_zk = table_name_to_metadata.find(name); if (in_zk == table_name_to_metadata.end() || in_zk->second != readMetadataFile(name)) { - tables_to_detach.emplace_back(std::move(name)); + /// Local table does not exits in ZooKeeper or has different metadata + tables_to_detach.emplace_back(std::move(name)); } } @@ -407,16 +423,14 @@ void DatabaseReplicated::recoverLostReplica(const ZooKeeperPtr & current_zookeep std::vector dropped_tables; for (const auto & table_name : tables_to_detach) { - String to_name = fmt::format("{}_{}_{}", table_name, max_log_ptr, thread_local_rng() % 1000); - assert(db_name < to_db_name); DDLGuardPtr table_guard = DatabaseCatalog::instance().getDDLGuard(db_name, table_name); - DDLGuardPtr to_table_guard = DatabaseCatalog::instance().getDDLGuard(to_db_name, to_name); if (getDatabaseName() != db_name) throw Exception(ErrorCodes::UNKNOWN_DATABASE, "Database was renamed, will retry"); auto table = tryGetTable(table_name, global_context); if (isDictionaryExist(table_name)) { + /// We can safely drop any dictionaries because they do not store data LOG_DEBUG(log, "Will DROP DICTIONARY {}", backQuoteIfNeed(table_name)); DatabaseAtomic::removeDictionary(global_context, table_name); ++dropped_dicts; @@ -430,7 +444,11 @@ void DatabaseReplicated::recoverLostReplica(const ZooKeeperPtr & current_zookeep } else { + /// Table probably stores some data. Let's move it to another database. 
+ String to_name = fmt::format("{}_{}_{}", table_name, max_log_ptr, thread_local_rng() % 1000); LOG_DEBUG(log, "Will RENAME TABLE {} TO {}.{}", backQuoteIfNeed(table_name), backQuoteIfNeed(to_db_name), backQuoteIfNeed(to_name)); + assert(db_name < to_db_name); + DDLGuardPtr to_table_guard = DatabaseCatalog::instance().getDDLGuard(to_db_name, to_name); auto to_db_ptr = DatabaseCatalog::instance().getDatabase(to_db_name); DatabaseAtomic::renameTable(global_context, table_name, *to_db_ptr, to_name, false, false); ++moved_tables; @@ -454,7 +472,6 @@ void DatabaseReplicated::recoverLostReplica(const ZooKeeperPtr & current_zookeep DatabaseAtomic::renameTable(global_context, from, *this, to, false, false); } - for (const auto & id : dropped_tables) DatabaseCatalog::instance().waitTableFinallyDropped(id); diff --git a/src/Databases/DatabaseReplicated.h b/src/Databases/DatabaseReplicated.h index 2ae97b0d82a..83efb24a49d 100644 --- a/src/Databases/DatabaseReplicated.h +++ b/src/Databases/DatabaseReplicated.h @@ -18,28 +18,6 @@ using ZooKeeperPtr = std::shared_ptr; class Cluster; using ClusterPtr = std::shared_ptr; -/** DatabaseReplicated engine - * supports replication of metadata - * via DDL log being written to ZooKeeper - * and executed on all of the replicas - * for a given database. - * - * One Clickhouse server can have multiple - * replicated databases running and updating - * at the same time. - * - * The engine has two parameters ZooKeeper path and - * replica name. - * The same ZooKeeper path corresponds to the same - * database. Replica names MUST be different for all replicas - * of the same database. - * - * Using this engine, creation of Replicated tables - * requires no ZooKeeper path and replica name parameters. - * Table's replica name is the same as database replica name. - * Table's ZooKeeper path is a concatenation of database - * ZooKeeper path, /tables/, and UUID of the table. - */ class DatabaseReplicated : public DatabaseAtomic { public: @@ -49,6 +27,9 @@ public: ~DatabaseReplicated() override; + String getEngineName() const override { return "Replicated"; } + + /// If current query is initial, then the following methods add metadata updating ZooKeeper operations to current MetadataTransaction. void dropTable(const Context &, const String & table_name, bool no_delay) override; void renameTable(const Context & context, const String & table_name, IDatabase & to_database, const String & to_table_name, bool exchange, bool dictionary) override; @@ -64,22 +45,23 @@ public: void removeDictionary(const Context & context, const String & dictionary_name) override; void detachTablePermanently(const Context & context, const String & table_name) override; - void drop(const Context & /*context*/) override; - - String getEngineName() const override { return "Replicated"; } - + /// Try to execute DLL query on current host as initial query. If query is succeed, + /// then it will be executed on all replicas. 
BlockIO propose(const ASTPtr & query, const Context & query_context); void stopReplication(); - void shutdown() override; - - void loadStoredObjects(Context & context, bool has_force_restore_data_flag, bool force_attach) override; String getFullReplicaName() const; static std::pair parseFullReplicaName(const String & name); + /// Returns cluster consisting of database replicas ClusterPtr getCluster() const; + void drop(const Context & /*context*/) override; + + void loadStoredObjects(Context & context, bool has_force_restore_data_flag, bool force_attach) override; + void shutdown() override; + friend struct DatabaseReplicatedTask; friend class DatabaseReplicatedDDLWorker; private: diff --git a/tests/queries/skip_list.json b/tests/queries/skip_list.json index db7b0631b97..f28e2dd7226 100644 --- a/tests/queries/skip_list.json +++ b/tests/queries/skip_list.json @@ -296,6 +296,7 @@ "01015_attach_part", "01015_database_bad_tables", "01017_uniqCombined_memory_usage", + "01018_ddl_dictionaries_concurrent_requrests", /// Cannot parse ATTACH DICTIONARY IF NOT EXISTS "01019_alter_materialized_view_atomic", "01019_alter_materialized_view_consistent", "01019_alter_materialized_view_query", From e7bbb6cb23446791cabdd1ab315d29107e857324 Mon Sep 17 00:00:00 2001 From: sevirov <72220289+sevirov@users.noreply.github.com> Date: Mon, 15 Feb 2021 23:09:06 +0300 Subject: [PATCH 150/381] Update docs/en/sql-reference/functions/type-conversion-functions.md Co-authored-by: olgarev <56617294+olgarev@users.noreply.github.com> --- docs/en/sql-reference/functions/type-conversion-functions.md | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/docs/en/sql-reference/functions/type-conversion-functions.md b/docs/en/sql-reference/functions/type-conversion-functions.md index f752bb9f6cb..189cf74049c 100644 --- a/docs/en/sql-reference/functions/type-conversion-functions.md +++ b/docs/en/sql-reference/functions/type-conversion-functions.md @@ -701,7 +701,7 @@ parseDateTimeBestEffortUSOrNull(time_string [, time_zone]); **Parameters** -- `time_string` — String containing a date and time to convert. [String](../../sql-reference/data-types/string.md). +- `time_string` — String containing a date or date with time to convert. The date must be in the US date format (`MM/DD/YYYY`). [String](../../sql-reference/data-types/string.md). - `time_zone` — Time zone. The function parses `time_string` according to the time zone. [String](../../sql-reference/data-types/string.md). **Supported non-standard formats** From 5eda6169902306fb4e9f07e28327aff9531b3052 Mon Sep 17 00:00:00 2001 From: sevirov <72220289+sevirov@users.noreply.github.com> Date: Mon, 15 Feb 2021 23:14:01 +0300 Subject: [PATCH 151/381] Update docs/en/sql-reference/functions/type-conversion-functions.md Co-authored-by: olgarev <56617294+olgarev@users.noreply.github.com> --- docs/en/sql-reference/functions/type-conversion-functions.md | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/docs/en/sql-reference/functions/type-conversion-functions.md b/docs/en/sql-reference/functions/type-conversion-functions.md index 189cf74049c..06ac64646ae 100644 --- a/docs/en/sql-reference/functions/type-conversion-functions.md +++ b/docs/en/sql-reference/functions/type-conversion-functions.md @@ -702,7 +702,7 @@ parseDateTimeBestEffortUSOrNull(time_string [, time_zone]); **Parameters** - `time_string` — String containing a date or date with time to convert. The date must be in the US date format (`MM/DD/YYYY`). [String](../../sql-reference/data-types/string.md). 
-- `time_zone` — Time zone. The function parses `time_string` according to the time zone. [String](../../sql-reference/data-types/string.md). +- `time_zone` — [Timezone](../../operations/server-configuration-parameters/settings.md#server_configuration_parameters-timezone). The function parses `time_string` according to the timezone. Optional. [String](../../sql-reference/data-types/string.md). **Supported non-standard formats** From a09c9be48b6ba4d42029459486639b3c6b504429 Mon Sep 17 00:00:00 2001 From: sevirov <72220289+sevirov@users.noreply.github.com> Date: Mon, 15 Feb 2021 23:30:39 +0300 Subject: [PATCH 152/381] Update docs/en/sql-reference/functions/type-conversion-functions.md Co-authored-by: olgarev <56617294+olgarev@users.noreply.github.com> --- .../en/sql-reference/functions/type-conversion-functions.md | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/docs/en/sql-reference/functions/type-conversion-functions.md b/docs/en/sql-reference/functions/type-conversion-functions.md index 06ac64646ae..24ac8d91d22 100644 --- a/docs/en/sql-reference/functions/type-conversion-functions.md +++ b/docs/en/sql-reference/functions/type-conversion-functions.md @@ -707,10 +707,10 @@ parseDateTimeBestEffortUSOrNull(time_string [, time_zone]); **Supported non-standard formats** - A string containing 9..10 digit [unix timestamp](https://en.wikipedia.org/wiki/Unix_time). -- A string with a date and a time component: `YYYYMMDDhhmmss`, `MM/DD/YYYY hh:mm:ss`, `MM-DD-YY hh:mm`, `YYYY-MM-DD hh:mm:ss`, etc. +- A string with a date and a time components: `YYYYMMDDhhmmss`, `MM/DD/YYYY hh:mm:ss`, `MM-DD-YY hh:mm`, `YYYY-MM-DD hh:mm:ss`, etc. - A string with a date, but no time component: `YYYY`, `YYYYMM`, `YYYY*MM`, `MM/DD/YYYY`, `MM-DD-YY` etc. -- A string with a day and time: `DD`, `DD hh`, `DD hh:mm`. In this case, `YYYY-MM` are substituted as `2000-01`. -- A string that includes the date and time along with time zone offset information: `YYYY-MM-DD hh:mm:ss ±h:mm`, etc. For example, `2020-12-12 17:36:00 -5:00`. +- A string with a day and time: `DD`, `DD hh`, `DD hh:mm`. In this case, `YYYY-MM` are substituted with `2000-01`. +- A string that includes date and time along with timezone offset information: `YYYY-MM-DD hh:mm:ss ±h:mm`, etc. For example, `2020-12-12 17:36:00 -5:00`. **Returned values** From f6cbad65e82267b6c6e9bc0fcc672f0802085384 Mon Sep 17 00:00:00 2001 From: sevirov <72220289+sevirov@users.noreply.github.com> Date: Mon, 15 Feb 2021 23:33:35 +0300 Subject: [PATCH 153/381] Update docs/en/sql-reference/functions/type-conversion-functions.md Co-authored-by: olgarev <56617294+olgarev@users.noreply.github.com> --- .../en/sql-reference/functions/type-conversion-functions.md | 6 ++++-- 1 file changed, 4 insertions(+), 2 deletions(-) diff --git a/docs/en/sql-reference/functions/type-conversion-functions.md b/docs/en/sql-reference/functions/type-conversion-functions.md index 24ac8d91d22..6cc0fe52442 100644 --- a/docs/en/sql-reference/functions/type-conversion-functions.md +++ b/docs/en/sql-reference/functions/type-conversion-functions.md @@ -714,8 +714,10 @@ parseDateTimeBestEffortUSOrNull(time_string [, time_zone]); **Returned values** -- `time_string` converted to the `DateTime` data type. -- `NULL`. +Possible values: + +- `time_string` converted to the [DateTime](../../sql-reference/data-types/datetime.md) data type. +- `NULL` if the input string cannot be converted to the `DateTime` data type. 
**Examples** From c9a6b21fc8c20f08c4abbe62398d635deb5de3d4 Mon Sep 17 00:00:00 2001 From: Dmitriy Date: Mon, 15 Feb 2021 23:47:12 +0300 Subject: [PATCH 154/381] Fix the English version MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Поправил английскую версию согласно комментариям в PR. --- .../functions/type-conversion-functions.md | 52 ++++++++----------- 1 file changed, 23 insertions(+), 29 deletions(-) diff --git a/docs/en/sql-reference/functions/type-conversion-functions.md b/docs/en/sql-reference/functions/type-conversion-functions.md index 6cc0fe52442..08e83771af7 100644 --- a/docs/en/sql-reference/functions/type-conversion-functions.md +++ b/docs/en/sql-reference/functions/type-conversion-functions.md @@ -691,12 +691,12 @@ Same as for [parseDateTimeBestEffort](#parsedatetimebesteffort) except that it r ## parseDateTimeBestEffortUSOrNull {#parsedatetimebesteffortusornull} -Same as for [parseDateTimeBestEffortUS](#parsedatetimebesteffortUS) except that it returns `NULL` when it encounters a date format that cannot be processed. +Same as [parseDateTimeBestEffortUS](#parsedatetimebesteffortUS) function except that it returns `NULL` when it encounters a date format that cannot be processed. **Syntax** ``` sql -parseDateTimeBestEffortUSOrNull(time_string [, time_zone]); +parseDateTimeBestEffortUSOrNull(time_string[, time_zone]) ``` **Parameters** @@ -716,16 +716,15 @@ parseDateTimeBestEffortUSOrNull(time_string [, time_zone]); Possible values: -- `time_string` converted to the [DateTime](../../sql-reference/data-types/datetime.md) data type. -- `NULL` if the input string cannot be converted to the `DateTime` data type. +- `time_string` converted to the [DateTime](../../sql-reference/data-types/datetime.md) data type. +- `NULL` if the input string cannot be converted to the `DateTime` data type. **Examples** Query: ``` sql -SELECT parseDateTimeBestEffortUSOrNull('02/10/2021 21:12:57') -AS parseDateTimeBestEffortUSOrNull; +SELECT parseDateTimeBestEffortUSOrNull('02/10/2021 21:12:57') AS parseDateTimeBestEffortUSOrNull; ``` Result: @@ -739,8 +738,7 @@ Result: Query: ``` sql -SELECT parseDateTimeBestEffortUSOrNull('02-10-2021 21:12:57') -AS parseDateTimeBestEffortUSOrNull; +SELECT parseDateTimeBestEffortUSOrNull('02-10-2021 21:12:57') AS parseDateTimeBestEffortUSOrNull; ``` Result: @@ -754,8 +752,7 @@ Result: Query: ``` sql -SELECT parseDateTimeBestEffortUSOrNull('02.10.2021 21:12:57') -AS parseDateTimeBestEffortUSOrNull; +SELECT parseDateTimeBestEffortUSOrNull('02.10.2021 21:12:57') AS parseDateTimeBestEffortUSOrNull; ``` Result: @@ -769,8 +766,7 @@ Result: Query: ``` sql -SELECT parseDateTimeBestEffortUSOrNull('02.2021 21:12:57') -AS parseDateTimeBestEffortUSOrNull; +SELECT parseDateTimeBestEffortUSOrNull('02.10.2021') AS parseDateTimeBestEffortUSOrNull; ``` Result: @@ -783,30 +779,32 @@ Result: ## parseDateTimeBestEffortUSOrZero {#parsedatetimebesteffortusorzero} -Same as for [parseDateTimeBestEffortUS](#parsedatetimebesteffortUS) except that it returns zero date or zero date time when it encounters a date format that cannot be processed. +Same as [parseDateTimeBestEffortUS](#parsedatetimebesteffortUS) function except that it returns zero date or zero date with time when it encounters a date format that cannot be processed. **Syntax** ``` sql -parseDateTimeBestEffortUSOrZero(time_string [, time_zone]); +parseDateTimeBestEffortUSOrZero(time_string[, time_zone]) ``` **Parameters** -- `time_string` — String containing a date and time to convert. 
[String](../../sql-reference/data-types/string.md). -- `time_zone` — Time zone. The function parses `time_string` according to the time zone. [String](../../sql-reference/data-types/string.md). +- `time_string` — String containing a date or date with time to convert. The date must be in the US date format (`MM/DD/YYYY`). [String](../../sql-reference/data-types/string.md). +- `time_zone` — [Timezone](../../operations/server-configuration-parameters/settings.md#server_configuration_parameters-timezone). The function parses `time_string` according to the timezone. Optional. [String](../../sql-reference/data-types/string.md). **Supported non-standard formats** - A string containing 9..10 digit [unix timestamp](https://en.wikipedia.org/wiki/Unix_time). -- A string with a date and a time component: `YYYYMMDDhhmmss`, `MM/DD/YYYY hh:mm:ss`, `MM-DD-YY hh:mm`, `YYYY-MM-DD hh:mm:ss`, etc. +- A string with a date and a time components: `YYYYMMDDhhmmss`, `MM/DD/YYYY hh:mm:ss`, `MM-DD-YY hh:mm`, `YYYY-MM-DD hh:mm:ss`, etc. - A string with a date, but no time component: `YYYY`, `YYYYMM`, `YYYY*MM`, `MM/DD/YYYY`, `MM-DD-YY` etc. -- A string with a day and time: `DD`, `DD hh`, `DD hh:mm`. In this case, `YYYY-MM` are substituted as `2000-01`. -- A string that includes the date and time along with time zone offset information: `YYYY-MM-DD hh:mm:ss ±h:mm`, etc. For example, `2020-12-12 17:36:00 -5:00`. +- A string with a day and time: `DD`, `DD hh`, `DD hh:mm`. In this case, `YYYY-MM` are substituted with `2000-01`. +- A string that includes date and time along with timezone offset information: `YYYY-MM-DD hh:mm:ss ±h:mm`, etc. For example, `2020-12-12 17:36:00 -5:00`. -**Returned value** +**Returned values** -- `time_string` converted to the `DateTime` data type. +Possible values: + +- `time_string` converted to the [DateTime](../../sql-reference/data-types/datetime.md) data type. - `zero date time`. **Examples** @@ -814,8 +812,7 @@ parseDateTimeBestEffortUSOrZero(time_string [, time_zone]); Query: ``` sql -SELECT parseDateTimeBestEffortUSOrZero('02/10/2021 21:12:57') -AS parseDateTimeBestEffortUSOrZero; +SELECT parseDateTimeBestEffortUSOrZero('02/10/2021 21:12:57') AS parseDateTimeBestEffortUSOrZero; ``` Result: @@ -829,8 +826,7 @@ Result: Query: ``` sql -SELECT parseDateTimeBestEffortUSOrZero('02-10-2021 21:12:57') -AS parseDateTimeBestEffortUSOrZero; +SELECT parseDateTimeBestEffortUSOrZero('02-10-2021 21:12:57') AS parseDateTimeBestEffortUSOrZero; ``` Result: @@ -844,8 +840,7 @@ Result: Query: ``` sql -SELECT parseDateTimeBestEffortUSOrZero('02.10.2021 21:12:57') -AS parseDateTimeBestEffortUS; +SELECT parseDateTimeBestEffortUSOrZero('02.10.2021 21:12:57') AS parseDateTimeBestEffortUS; ``` Result: @@ -859,8 +854,7 @@ Result: Query: ``` sql -SELECT parseDateTimeBestEffortUSOrZero('02.2021 21:12:57') -AS parseDateTimeBestEffortUSOrZero; +SELECT parseDateTimeBestEffortUSOrZero('02.2021 21:12:57') AS parseDateTimeBestEffortUSOrZero; ``` Result: From 21f80a9367760528b12c0639d3c4faacf7c100e0 Mon Sep 17 00:00:00 2001 From: Dmitriy Date: Tue, 16 Feb 2021 00:42:16 +0300 Subject: [PATCH 155/381] Add examples MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Добавил примеры. 
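For reference, a minimal query contrasting the two fallback variants documented above, on the same kind of unparsable input used in the examples of this patch (hypothetical output; the rendered zero value assumes the Europe/Moscow server timezone used in the other examples):

``` sql
SELECT
    parseDateTimeBestEffortUSOrNull('10.2021') AS or_null,  -- unparsable input, returns NULL
    parseDateTimeBestEffortUSOrZero('10.2021') AS or_zero;  -- unparsable input, returns the zero DateTime (rendered as 1970-01-01 03:00:00 in Europe/Moscow)
```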
--- .../functions/type-conversion-functions.md | 42 +++++++++---------- 1 file changed, 21 insertions(+), 21 deletions(-) diff --git a/docs/en/sql-reference/functions/type-conversion-functions.md b/docs/en/sql-reference/functions/type-conversion-functions.md index 08e83771af7..81b5649db32 100644 --- a/docs/en/sql-reference/functions/type-conversion-functions.md +++ b/docs/en/sql-reference/functions/type-conversion-functions.md @@ -738,28 +738,14 @@ Result: Query: ``` sql -SELECT parseDateTimeBestEffortUSOrNull('02-10-2021 21:12:57') AS parseDateTimeBestEffortUSOrNull; +SELECT parseDateTimeBestEffortUSOrNull('02-10-2021 21:12:57 GMT', 'Europe/Moscow') AS parseDateTimeBestEffortUSOrNull; ``` Result: ``` text ┌─parseDateTimeBestEffortUSOrNull─┐ -│ 2021-02-10 21:12:57 │ -└─────────────────────────────────┘ -``` - -Query: - -``` sql -SELECT parseDateTimeBestEffortUSOrNull('02.10.2021 21:12:57') AS parseDateTimeBestEffortUSOrNull; -``` - -Result: - -``` text -┌─parseDateTimeBestEffortUSOrNull─┐ -│ 2021-02-10 21:12:57 │ +│ 2021-02-11 00:12:57 │ └─────────────────────────────────┘ ``` @@ -771,6 +757,20 @@ SELECT parseDateTimeBestEffortUSOrNull('02.10.2021') AS parseDateTimeBestEffortU Result: +``` text +┌─parseDateTimeBestEffortUSOrNull─┐ +│ 2021-02-10 00:00:00 │ +└─────────────────────────────────┘ +``` + +Query: + +``` sql +SELECT parseDateTimeBestEffortUSOrNull('10.2021') AS parseDateTimeBestEffortUSOrNull; +``` + +Result: + ``` text ┌─parseDateTimeBestEffortUSOrNull─┐ │ ᴺᵁᴸᴸ │ @@ -826,35 +826,35 @@ Result: Query: ``` sql -SELECT parseDateTimeBestEffortUSOrZero('02-10-2021 21:12:57') AS parseDateTimeBestEffortUSOrZero; +SELECT parseDateTimeBestEffortUSOrZero('02-10-2021 21:12:57 GMT', 'Europe/Moscow') AS parseDateTimeBestEffortUSOrZero; ``` Result: ``` text ┌─parseDateTimeBestEffortUSOrZero─┐ -│ 2021-02-10 21:12:57 │ +│ 2021-02-11 00:12:57 │ └─────────────────────────────────┘ ``` Query: ``` sql -SELECT parseDateTimeBestEffortUSOrZero('02.10.2021 21:12:57') AS parseDateTimeBestEffortUS; +SELECT parseDateTimeBestEffortUSOrZero('02.10.2021') AS parseDateTimeBestEffortUSOrZero; ``` Result: ``` text ┌─parseDateTimeBestEffortUSOrZero─┐ -│ 2021-02-10 21:12:57 │ +│ 2021-02-10 00:00:00 │ └─────────────────────────────────┘ ``` Query: ``` sql -SELECT parseDateTimeBestEffortUSOrZero('02.2021 21:12:57') AS parseDateTimeBestEffortUSOrZero; +SELECT parseDateTimeBestEffortUSOrZero('02.2021') AS parseDateTimeBestEffortUSOrZero; ``` Result: From a6322800118f9f9c27b3c239d78707af1025e97d Mon Sep 17 00:00:00 2001 From: George Date: Tue, 16 Feb 2021 13:53:44 +0300 Subject: [PATCH 156/381] added alias for nulls --- docs/en/sql-reference/functions/functions-for-nulls.md | 2 ++ docs/ru/sql-reference/functions/functions-for-nulls.md | 2 ++ 2 files changed, 4 insertions(+) diff --git a/docs/en/sql-reference/functions/functions-for-nulls.md b/docs/en/sql-reference/functions/functions-for-nulls.md index c32af7194fb..fbbda2c0ecc 100644 --- a/docs/en/sql-reference/functions/functions-for-nulls.md +++ b/docs/en/sql-reference/functions/functions-for-nulls.md @@ -13,6 +13,8 @@ Checks whether the argument is [NULL](../../sql-reference/syntax.md#null-literal isNull(x) ``` +Alias: `ISNULL`. + **Parameters** - `x` — A value with a non-compound data type. 
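A quick sanity check for the alias documented above, assuming `ISNULL` resolves to the same implementation as `isNull`, as this patch states:

``` sql
SELECT isNull(NULL) AS via_function, ISNULL(1) AS via_alias;
-- via_function = 1 (the argument is NULL), via_alias = 0 (the argument is not NULL)
```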
diff --git a/docs/ru/sql-reference/functions/functions-for-nulls.md b/docs/ru/sql-reference/functions/functions-for-nulls.md index 17da1ea9194..0db55847631 100644 --- a/docs/ru/sql-reference/functions/functions-for-nulls.md +++ b/docs/ru/sql-reference/functions/functions-for-nulls.md @@ -13,6 +13,8 @@ toc_title: "\u0424\u0443\u043d\u043a\u0446\u0438\u0438\u0020\u0434\u043b\u044f\u isNull(x) ``` +Синоним: `ISNULL`. + **Параметры** - `x` — значение с не составным типом данных. From bc6fdc7d4b09f290a57f7da39ba4abae2532d7c6 Mon Sep 17 00:00:00 2001 From: George Date: Tue, 16 Feb 2021 14:12:12 +0300 Subject: [PATCH 157/381] added aliases for date-time functions --- .../functions/date-time-functions.md | 18 ++++++++++++++++++ .../functions/date-time-functions.md | 18 ++++++++++++++++++ 2 files changed, 36 insertions(+) diff --git a/docs/en/sql-reference/functions/date-time-functions.md b/docs/en/sql-reference/functions/date-time-functions.md index 4a73bdb2546..a0c89ecb035 100644 --- a/docs/en/sql-reference/functions/date-time-functions.md +++ b/docs/en/sql-reference/functions/date-time-functions.md @@ -61,40 +61,58 @@ int32samoa: 1546300800 Converts a date or date with time to a UInt16 number containing the year number (AD). +Alias: `Year`. + ## toQuarter {#toquarter} Converts a date or date with time to a UInt8 number containing the quarter number. +Alias: `QUARTER`. + ## toMonth {#tomonth} Converts a date or date with time to a UInt8 number containing the month number (1-12). +Alias: `MONTH`. + ## toDayOfYear {#todayofyear} Converts a date or date with time to a UInt16 number containing the number of the day of the year (1-366). +Alias: `DAYOFYEAR`. + ## toDayOfMonth {#todayofmonth} Converts a date or date with time to a UInt8 number containing the number of the day of the month (1-31). +Aliases: `DAYOFMONTH`, `DAY`. + ## toDayOfWeek {#todayofweek} Converts a date or date with time to a UInt8 number containing the number of the day of the week (Monday is 1, and Sunday is 7). +Alias: `DAYOFWEEK`. + ## toHour {#tohour} Converts a date with time to a UInt8 number containing the number of the hour in 24-hour time (0-23). This function assumes that if clocks are moved ahead, it is by one hour and occurs at 2 a.m., and if clocks are moved back, it is by one hour and occurs at 3 a.m. (which is not always true – even in Moscow the clocks were twice changed at a different time). +Alias: `HOUR`. + ## toMinute {#tominute} Converts a date with time to a UInt8 number containing the number of the minute of the hour (0-59). +Alias: `MINUTE`. + ## toSecond {#tosecond} Converts a date with time to a UInt8 number containing the number of the second in the minute (0-59). Leap seconds are not accounted for. +Alias: `SECOND`. + ## toUnixTimestamp {#to-unix-timestamp} For DateTime argument: converts value to the number with type UInt32 -- Unix Timestamp (https://en.wikipedia.org/wiki/Unix_time). diff --git a/docs/ru/sql-reference/functions/date-time-functions.md b/docs/ru/sql-reference/functions/date-time-functions.md index 31482cde77f..add47e9dad1 100644 --- a/docs/ru/sql-reference/functions/date-time-functions.md +++ b/docs/ru/sql-reference/functions/date-time-functions.md @@ -63,40 +63,58 @@ int32samoa: 1546300800 Переводит дату или дату-с-временем в число типа UInt16, содержащее номер года (AD). +Синоним: `Year`. + ## toQuarter {#toquarter} Переводит дату или дату-с-временем в число типа UInt8, содержащее номер квартала. +Синоним: `QUARTER`. 
+ ## toMonth {#tomonth} Переводит дату или дату-с-временем в число типа UInt8, содержащее номер месяца (1-12). +Синоним: `MONTH`. + ## toDayOfYear {#todayofyear} Переводит дату или дату-с-временем в число типа UInt16, содержащее номер дня года (1-366). +Синоним: `DAYOFYEAR`. + ## toDayOfMonth {#todayofmonth} Переводит дату или дату-с-временем в число типа UInt8, содержащее номер дня в месяце (1-31). +Синонимы: `DAYOFMONTH`, `DAY`. + ## toDayOfWeek {#todayofweek} Переводит дату или дату-с-временем в число типа UInt8, содержащее номер дня в неделе (понедельник - 1, воскресенье - 7). +Синоним: `DAYOFWEEK`. + ## toHour {#tohour} Переводит дату-с-временем в число типа UInt8, содержащее номер часа в сутках (0-23). Функция исходит из допущения, что перевод стрелок вперёд, если осуществляется, то на час, в два часа ночи, а перевод стрелок назад, если осуществляется, то на час, в три часа ночи (что, в общем, не верно - даже в Москве два раза перевод стрелок был осуществлён в другое время). +Синоним: `HOUR`. + ## toMinute {#tominute} Переводит дату-с-временем в число типа UInt8, содержащее номер минуты в часе (0-59). +Синоним: `MINUTE`. + ## toSecond {#tosecond} Переводит дату-с-временем в число типа UInt8, содержащее номер секунды в минуте (0-59). Секунды координации не учитываются. +Синоним: `SECOND`. + ## toUnixTimestamp {#to-unix-timestamp} Переводит дату-с-временем в число типа UInt32 -- Unix Timestamp (https://en.wikipedia.org/wiki/Unix_time). From 33e12f7b4a628fdd63f3a30e070cedbb0449473a Mon Sep 17 00:00:00 2001 From: George Date: Tue, 16 Feb 2021 14:12:44 +0300 Subject: [PATCH 158/381] added aliases for encoding functions --- docs/en/sql-reference/functions/encoding-functions.md | 2 ++ docs/ru/sql-reference/functions/encoding-functions.md | 2 ++ 2 files changed, 4 insertions(+) diff --git a/docs/en/sql-reference/functions/encoding-functions.md b/docs/en/sql-reference/functions/encoding-functions.md index bc3f5ca4345..3ec6c8ec3dd 100644 --- a/docs/en/sql-reference/functions/encoding-functions.md +++ b/docs/en/sql-reference/functions/encoding-functions.md @@ -75,6 +75,8 @@ Result: Returns a string containing the argument’s hexadecimal representation. +Alias: `HEX`. + **Syntax** ``` sql diff --git a/docs/ru/sql-reference/functions/encoding-functions.md b/docs/ru/sql-reference/functions/encoding-functions.md index 6f1c2aad6cb..8c3065e5a77 100644 --- a/docs/ru/sql-reference/functions/encoding-functions.md +++ b/docs/ru/sql-reference/functions/encoding-functions.md @@ -75,6 +75,8 @@ SELECT char(0xE4, 0xBD, 0xA0, 0xE5, 0xA5, 0xBD) AS hello; Returns a string containing the argument’s hexadecimal representation. +Синоним: `HEX`. + **Syntax** ``` sql From 1bd1a97716264f668659a972861c3f172e3b1cef Mon Sep 17 00:00:00 2001 From: George Date: Tue, 16 Feb 2021 14:13:01 +0300 Subject: [PATCH 159/381] added aliases for string functions --- docs/en/sql-reference/functions/string-functions.md | 4 ++++ docs/ru/sql-reference/functions/string-functions.md | 4 ++++ 2 files changed, 8 insertions(+) diff --git a/docs/en/sql-reference/functions/string-functions.md b/docs/en/sql-reference/functions/string-functions.md index 2b93dd924a3..c1f3625c14d 100644 --- a/docs/en/sql-reference/functions/string-functions.md +++ b/docs/en/sql-reference/functions/string-functions.md @@ -276,10 +276,14 @@ Returns the string ‘s’ that was converted from the encoding in ‘from’ to Encodes ‘s’ string into base64 +Alias: `TO_BASE64`. + ## base64Decode(s) {#base64decode} Decode base64-encoded string ‘s’ into original string. 
In case of failure raises an exception. +Alias: `FROM_BASE64`. + ## tryBase64Decode(s) {#trybase64decode} Similar to base64Decode, but in case of error an empty string would be returned. diff --git a/docs/ru/sql-reference/functions/string-functions.md b/docs/ru/sql-reference/functions/string-functions.md index aeb0652cc18..24edc3618fb 100644 --- a/docs/ru/sql-reference/functions/string-functions.md +++ b/docs/ru/sql-reference/functions/string-functions.md @@ -273,10 +273,14 @@ SELECT concat(key1, key2), sum(value) FROM key_val GROUP BY (key1, key2) Производит кодирование строки s в base64-представление. +Синоним: `TO_BASE64`. + ## base64Decode(s) {#base64decode} Декодирует base64-представление s в исходную строку. При невозможности декодирования выбрасывает исключение +Синоним: `FROM_BASE64`. + ## tryBase64Decode(s) {#trybase64decode} Функционал аналогичен base64Decode, но при невозможности декодирования возвращает пустую строку. From 3603fbd46a30e5a8f77877de5cac871ebec17564 Mon Sep 17 00:00:00 2001 From: George Date: Tue, 16 Feb 2021 14:13:17 +0300 Subject: [PATCH 160/381] added aliases for ip-address functions --- .../sql-reference/functions/ip-address-functions.md | 12 +++++++++++- .../sql-reference/functions/ip-address-functions.md | 12 +++++++++++- 2 files changed, 22 insertions(+), 2 deletions(-) diff --git a/docs/en/sql-reference/functions/ip-address-functions.md b/docs/en/sql-reference/functions/ip-address-functions.md index 0c1f675304b..8e2939e9272 100644 --- a/docs/en/sql-reference/functions/ip-address-functions.md +++ b/docs/en/sql-reference/functions/ip-address-functions.md @@ -9,10 +9,14 @@ toc_title: IP Addresses Takes a UInt32 number. Interprets it as an IPv4 address in big endian. Returns a string containing the corresponding IPv4 address in the format A.B.C.d (dot-separated numbers in decimal form). +Alias: `INET_NTOA`. + ## IPv4StringToNum(s) {#ipv4stringtonums} The reverse function of IPv4NumToString. If the IPv4 address has an invalid format, it returns 0. +Alias: `INET_ATON`. + ## IPv4NumToStringClassC(num) {#ipv4numtostringclasscnum} Similar to IPv4NumToString, but using xxx instead of the last octet. @@ -49,7 +53,11 @@ Since using ‘xxx’ is highly unusual, this may be changed in the future. We r ### IPv6NumToString(x) {#ipv6numtostringx} Accepts a FixedString(16) value containing the IPv6 address in binary format. Returns a string containing this address in text format. -IPv6-mapped IPv4 addresses are output in the format ::ffff:111.222.33.44. Examples: +IPv6-mapped IPv4 addresses are output in the format ::ffff:111.222.33.44. + +Alias: `INET6_NTOA`. + +Examples: ``` sql SELECT IPv6NumToString(toFixedString(unhex('2A0206B8000000000000000000000011'), 16)) AS addr @@ -119,6 +127,8 @@ The reverse function of IPv6NumToString. If the IPv6 address has an invalid form If the IP address is a valid IPv4 address then the IPv6 equivalent of the IPv4 address is returned. HEX can be uppercase or lowercase. +Alias: `INET6_ATON`. + ``` sql SELECT cutIPv6(IPv6StringToNum('127.0.0.1'), 0, 0); ``` diff --git a/docs/ru/sql-reference/functions/ip-address-functions.md b/docs/ru/sql-reference/functions/ip-address-functions.md index 52f0a92bc9f..3b7379e9a65 100644 --- a/docs/ru/sql-reference/functions/ip-address-functions.md +++ b/docs/ru/sql-reference/functions/ip-address-functions.md @@ -9,10 +9,14 @@ toc_title: "\u0424\u0443\u043d\u043a\u0446\u0438\u0438\u0020\u0434\u043b\u044f\u Принимает число типа UInt32. Интерпретирует его, как IPv4-адрес в big endian. 
Возвращает строку, содержащую соответствующий IPv4-адрес в формате A.B.C.D (числа в десятичной форме через точки). +Синоним: `INET_NTOA`. + ## IPv4StringToNum(s) {#ipv4stringtonums} Функция, обратная к IPv4NumToString. Если IPv4 адрес в неправильном формате, то возвращает 0. +Синоним: `INET_ATON`. + ## IPv4NumToStringClassC(num) {#ipv4numtostringclasscnum} Похоже на IPv4NumToString, но вместо последнего октета используется xxx. @@ -49,7 +53,11 @@ LIMIT 10 ### IPv6NumToString(x) {#ipv6numtostringx} Принимает значение типа FixedString(16), содержащее IPv6-адрес в бинарном виде. Возвращает строку, содержащую этот адрес в текстовом виде. -IPv6-mapped IPv4 адреса выводится в формате ::ffff:111.222.33.44. Примеры: +IPv6-mapped IPv4 адреса выводится в формате ::ffff:111.222.33.44. + +Примеры: `INET6_NTOA`. + +Примеры: ``` sql SELECT IPv6NumToString(toFixedString(unhex('2A0206B8000000000000000000000011'), 16)) AS addr @@ -118,6 +126,8 @@ LIMIT 10 Функция, обратная к IPv6NumToString. Если IPv6 адрес в неправильном формате, то возвращает строку из нулевых байт. HEX может быть в любом регистре. +Alias: `INET6_ATON`. + ## IPv4ToIPv6(x) {#ipv4toipv6x} Принимает число типа `UInt32`. Интерпретирует его, как IPv4-адрес в [big endian](https://en.wikipedia.org/wiki/Endianness). Возвращает значение `FixedString(16)`, содержащее адрес IPv6 в двоичном формате. Примеры: From c661760113164e74d7cb5ee5c394de3c57892d6c Mon Sep 17 00:00:00 2001 From: George Date: Tue, 16 Feb 2021 14:27:52 +0300 Subject: [PATCH 161/381] fixed a typo --- docs/ru/sql-reference/functions/date-time-functions.md | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/docs/ru/sql-reference/functions/date-time-functions.md b/docs/ru/sql-reference/functions/date-time-functions.md index add47e9dad1..85d7c275f27 100644 --- a/docs/ru/sql-reference/functions/date-time-functions.md +++ b/docs/ru/sql-reference/functions/date-time-functions.md @@ -63,7 +63,7 @@ int32samoa: 1546300800 Переводит дату или дату-с-временем в число типа UInt16, содержащее номер года (AD). -Синоним: `Year`. +Синоним: `YEAR`. ## toQuarter {#toquarter} From 8a7d59f0fef99281a935cad8e51f40ff8a7341bc Mon Sep 17 00:00:00 2001 From: George Date: Tue, 16 Feb 2021 14:31:24 +0300 Subject: [PATCH 162/381] Added aliases for string function --- docs/en/sql-reference/functions/string-functions.md | 2 ++ docs/ru/sql-reference/functions/string-functions.md | 2 ++ 2 files changed, 4 insertions(+) diff --git a/docs/en/sql-reference/functions/string-functions.md b/docs/en/sql-reference/functions/string-functions.md index c1f3625c14d..a4c127507b7 100644 --- a/docs/en/sql-reference/functions/string-functions.md +++ b/docs/en/sql-reference/functions/string-functions.md @@ -98,6 +98,8 @@ SELECT toValidUTF8('\x61\xF0\x80\x80\x80b') Repeats a string as many times as specified and concatenates the replicated values as a single string. +Alias: `REPEAT`. + **Syntax** ``` sql diff --git a/docs/ru/sql-reference/functions/string-functions.md b/docs/ru/sql-reference/functions/string-functions.md index 24edc3618fb..d01d12ac8d5 100644 --- a/docs/ru/sql-reference/functions/string-functions.md +++ b/docs/ru/sql-reference/functions/string-functions.md @@ -95,6 +95,8 @@ SELECT toValidUTF8('\x61\xF0\x80\x80\x80b') Повторяет строку определенное количество раз и объединяет повторяемые значения в одну строку. +Синоним: `REPEAT`. 
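A short usage sketch for the `REPEAT` alias added above (illustrative only, not taken from the patch; it assumes the alias maps to the documented `repeat` function):

``` sql
-- Sketch: the alias and the original name should give identical results
SELECT repeat('ab', 3) AS via_function, REPEAT('ab', 3) AS via_alias;
-- expected result in both columns: 'ababab'
```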
+ **Синтаксис** ``` sql From 4315cd8d26cb838553dc38a38ba35380e0eed767 Mon Sep 17 00:00:00 2001 From: George Date: Tue, 16 Feb 2021 14:34:24 +0300 Subject: [PATCH 163/381] fixed a typo --- docs/en/sql-reference/functions/date-time-functions.md | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/docs/en/sql-reference/functions/date-time-functions.md b/docs/en/sql-reference/functions/date-time-functions.md index a0c89ecb035..880942a02f9 100644 --- a/docs/en/sql-reference/functions/date-time-functions.md +++ b/docs/en/sql-reference/functions/date-time-functions.md @@ -61,7 +61,7 @@ int32samoa: 1546300800 Converts a date or date with time to a UInt16 number containing the year number (AD). -Alias: `Year`. +Alias: `YEAR`. ## toQuarter {#toquarter} From 243ca5fe58d7b12fee746784c2f8a2f36790ff1e Mon Sep 17 00:00:00 2001 From: George Date: Tue, 16 Feb 2021 14:48:28 +0300 Subject: [PATCH 164/381] Added aliases for type conversion functions --- docs/en/sql-reference/functions/type-conversion-functions.md | 2 ++ docs/ru/sql-reference/functions/type-conversion-functions.md | 2 ++ 2 files changed, 4 insertions(+) diff --git a/docs/en/sql-reference/functions/type-conversion-functions.md b/docs/en/sql-reference/functions/type-conversion-functions.md index 3ca36f41c78..6e21ee9774d 100644 --- a/docs/en/sql-reference/functions/type-conversion-functions.md +++ b/docs/en/sql-reference/functions/type-conversion-functions.md @@ -124,6 +124,8 @@ SELECT toUInt64(nan), toUInt32(-32), toUInt16('16'), toUInt8(8.8) ## toDate {#todate} +Alias: `DATE`. + ## toDateOrZero {#todateorzero} ## toDateOrNull {#todateornull} diff --git a/docs/ru/sql-reference/functions/type-conversion-functions.md b/docs/ru/sql-reference/functions/type-conversion-functions.md index 4a314bd22d8..022b4c3ebc7 100644 --- a/docs/ru/sql-reference/functions/type-conversion-functions.md +++ b/docs/ru/sql-reference/functions/type-conversion-functions.md @@ -124,6 +124,8 @@ SELECT toUInt64(nan), toUInt32(-32), toUInt16('16'), toUInt8(8.8) ## toDate {#todate} +Cиноним: `DATE`. 
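Similarly, a minimal sketch for the `DATE` alias documented above (hypothetical usage, assuming `DATE(...)` is parsed as a case-insensitive alias of `toDate`):

``` sql
SELECT toDate('2021-02-16') AS via_function, DATE('2021-02-16') AS via_alias;
-- both columns are expected to hold the Date value 2021-02-16
```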
+ ## toDateOrZero {#todateorzero} ## toDateOrNull {#todateornull} From 7b54b892b5eed13edfb0963dd02287fbe0d8881f Mon Sep 17 00:00:00 2001 From: Alexander Tokmakov Date: Tue, 16 Feb 2021 17:05:58 +0300 Subject: [PATCH 165/381] fix --- src/Databases/DatabaseOnDisk.cpp | 2 +- src/Interpreters/Context.cpp | 4 ++-- src/Interpreters/Context.h | 2 +- src/Interpreters/DDLWorker.cpp | 9 +++++++-- src/Interpreters/DDLWorker.h | 2 +- src/Storages/StorageMaterializedView.cpp | 19 +++++++++++++++---- tests/queries/skip_list.json | 7 +++++++ 7 files changed, 34 insertions(+), 11 deletions(-) diff --git a/src/Databases/DatabaseOnDisk.cpp b/src/Databases/DatabaseOnDisk.cpp index 24bab42cad2..e5d2b23ace0 100644 --- a/src/Databases/DatabaseOnDisk.cpp +++ b/src/Databases/DatabaseOnDisk.cpp @@ -460,7 +460,7 @@ void DatabaseOnDisk::renameTable( if (from_atomic_to_ordinary) { - auto & atomic_db = assert_cast(*this); + auto & atomic_db = dynamic_cast(*this); /// Special case: usually no actions with symlinks are required when detaching/attaching table, /// but not when moving from Atomic database to Ordinary if (table->storesDataOnDisk()) diff --git a/src/Interpreters/Context.cpp b/src/Interpreters/Context.cpp index d0a1e4d37bf..766b14dea42 100644 --- a/src/Interpreters/Context.cpp +++ b/src/Interpreters/Context.cpp @@ -2553,10 +2553,10 @@ StorageID Context::resolveStorageIDImpl(StorageID storage_id, StorageNamespace w return StorageID::createEmpty(); } -void Context::initMetadataTransaction(MetadataTransactionPtr txn) +void Context::initMetadataTransaction(MetadataTransactionPtr txn, [[maybe_unused]] bool attach_existing) { assert(!metadata_transaction); - assert(query_context == this); + assert(attach_existing || query_context == this); metadata_transaction = std::move(txn); } diff --git a/src/Interpreters/Context.h b/src/Interpreters/Context.h index f6ee28aca22..8b59b225480 100644 --- a/src/Interpreters/Context.h +++ b/src/Interpreters/Context.h @@ -746,7 +746,7 @@ public: IHostContextPtr & getHostContext(); const IHostContextPtr & getHostContext() const; - void initMetadataTransaction(MetadataTransactionPtr txn); + void initMetadataTransaction(MetadataTransactionPtr txn, bool attach_to_context = false); MetadataTransactionPtr getMetadataTransaction() const; struct MySQLWireContext diff --git a/src/Interpreters/DDLWorker.cpp b/src/Interpreters/DDLWorker.cpp index f08f47b1c0e..c342a994395 100644 --- a/src/Interpreters/DDLWorker.cpp +++ b/src/Interpreters/DDLWorker.cpp @@ -328,6 +328,8 @@ void DDLWorker::scheduleTasks() LOG_TRACE(log, "No tasks to schedule"); return; } + else if (max_tasks_in_queue < queue_nodes.size()) + cleanup_event->set(); bool server_startup = current_tasks.empty(); auto begin_node = queue_nodes.begin(); @@ -489,9 +491,8 @@ void DDLWorker::processTask(DDLTaskBase & task) if (create_active_res == Coordination::Error::ZNODEEXISTS) { - /// Connection has been lost and now we are retrying to write query status, + /// Connection has been lost and now we are retrying, /// but our previous ephemeral node still exists. 
- assert(task.was_executed); zkutil::EventPtr eph_node_disappeared = std::make_shared(); String dummy; if (zookeeper->tryGet(active_node_path, dummy, nullptr, eph_node_disappeared)) @@ -826,6 +827,7 @@ void DDLWorker::cleanupQueue(Int64, const ZooKeeperPtr & zookeeper) ops.emplace_back(zkutil::makeRemoveRequest(fs::path(node_path) / "finished", -1)); ops.emplace_back(zkutil::makeRemoveRequest(node_path, -1)); auto rm_entry_res = zookeeper->tryMulti(ops, res); + if (rm_entry_res == Coordination::Error::ZNONODE) { /// Most likely both node_path/finished and node_path were removed concurrently. @@ -888,8 +890,11 @@ void DDLWorker::createStatusDirs(const std::string & node_path, const ZooKeeperP return; if (is_currently_deleting) + { + cleanup_event->set(); throw Exception(ErrorCodes::UNFINISHED, "Cannot create status dirs for {}, " "most likely because someone is deleting it concurrently", node_path); + } /// Connection lost or entry was removed assert(Coordination::isHardwareError(code) || code == Coordination::Error::ZNONODE); diff --git a/src/Interpreters/DDLWorker.h b/src/Interpreters/DDLWorker.h index 0985884eef7..c39a832c098 100644 --- a/src/Interpreters/DDLWorker.h +++ b/src/Interpreters/DDLWorker.h @@ -102,7 +102,7 @@ protected: virtual bool canRemoveQueueEntry(const String & entry_name, const Coordination::Stat & stat); /// Init task node - static void createStatusDirs(const std::string & node_path, const ZooKeeperPtr & zookeeper); + void createStatusDirs(const std::string & node_path, const ZooKeeperPtr & zookeeper); virtual void initializeMainThread(); diff --git a/src/Storages/StorageMaterializedView.cpp b/src/Storages/StorageMaterializedView.cpp index fb75a933910..32317968fe5 100644 --- a/src/Storages/StorageMaterializedView.cpp +++ b/src/Storages/StorageMaterializedView.cpp @@ -194,9 +194,9 @@ BlockOutputStreamPtr StorageMaterializedView::write(const ASTPtr & query, const } -static void executeDropQuery(ASTDropQuery::Kind kind, const Context & global_context, const StorageID & target_table_id, bool no_delay) +static void executeDropQuery(ASTDropQuery::Kind kind, const Context & global_context, const Context & current_context, const StorageID & target_table_id, bool no_delay) { - if (DatabaseCatalog::instance().tryGetTable(target_table_id, global_context)) + if (DatabaseCatalog::instance().tryGetTable(target_table_id, current_context)) { /// We create and execute `drop` query for internal table. auto drop_query = std::make_shared(); @@ -206,7 +206,18 @@ static void executeDropQuery(ASTDropQuery::Kind kind, const Context & global_con drop_query->no_delay = no_delay; drop_query->if_exists = true; ASTPtr ast_drop_query = drop_query; + /// FIXME We have to use global context to execute DROP query for inner table + /// to avoid "Not enough privileges" error if current user has only DROP VIEW ON mat_view_name privilege + /// and not allowed to drop inner table explicitly. Allowing to drop inner table without explicit grant + /// looks like expected behaviour and we have tests for it. 
auto drop_context = Context(global_context); + drop_context.getClientInfo().query_kind = ClientInfo::QueryKind::SECONDARY_QUERY; + if (auto txn = current_context.getMetadataTransaction()) + { + /// For Replicated database + drop_context.setQueryContext(const_cast(current_context)); + drop_context.initMetadataTransaction(txn, true); + } InterpreterDropQuery drop_interpreter(ast_drop_query, drop_context); drop_interpreter.execute(); } @@ -226,13 +237,13 @@ void StorageMaterializedView::drop() void StorageMaterializedView::dropInnerTable(bool no_delay, const Context & context) { if (has_inner_table && tryGetTargetTable()) - executeDropQuery(ASTDropQuery::Kind::Drop, context, target_table_id, no_delay); + executeDropQuery(ASTDropQuery::Kind::Drop, global_context, context, target_table_id, no_delay); } void StorageMaterializedView::truncate(const ASTPtr &, const StorageMetadataPtr &, const Context & context, TableExclusiveLockHolder &) { if (has_inner_table) - executeDropQuery(ASTDropQuery::Kind::Truncate, context, target_table_id, true); + executeDropQuery(ASTDropQuery::Kind::Truncate, global_context, context, target_table_id, true); } void StorageMaterializedView::checkStatementCanBeForwarded() const diff --git a/tests/queries/skip_list.json b/tests/queries/skip_list.json index 5c75fc0300b..52cef210748 100644 --- a/tests/queries/skip_list.json +++ b/tests/queries/skip_list.json @@ -103,9 +103,16 @@ "00738_lock_for_inner_table" ], "database-replicated": [ + /// Tests with DETACH TABLE (it's not allowed) + /// and tests with SET (session and query settings are not supported) "memory_tracking", "memory_usage", "live_view", + "00152_insert_different_granularity", + "01715_background_checker_blather_zookeeper", + "01714_alter_drop_version", + "01114_materialize_clear_index_compact_parts", + "00814_replicated_minimalistic_part_header_zookeeper", "01188_attach_table_from_pat", "01415_sticking_mutations", "01130_in_memory_parts", From 75117389eccf862b1a08b93a32d4f839846715f6 Mon Sep 17 00:00:00 2001 From: M0r64n Date: Tue, 16 Feb 2021 18:50:11 +0400 Subject: [PATCH 166/381] Add a couple of QOL file engine settings --- docs/en/operations/settings/settings.md | 20 +++++++++++++++++++ src/Core/Settings.h | 2 ++ src/Storages/StorageFile.cpp | 12 ++++++++++- ..._engine_file_empty_if_not_exists.reference | 0 .../01720_engine_file_empty_if_not_exists.sql | 15 ++++++++++++++ ...1_engine_file_truncate_on_insert.reference | 13 ++++++++++++ .../01721_engine_file_truncate_on_insert.sql | 20 +++++++++++++++++++ 7 files changed, 81 insertions(+), 1 deletion(-) create mode 100644 tests/queries/0_stateless/01720_engine_file_empty_if_not_exists.reference create mode 100644 tests/queries/0_stateless/01720_engine_file_empty_if_not_exists.sql create mode 100644 tests/queries/0_stateless/01721_engine_file_truncate_on_insert.reference create mode 100644 tests/queries/0_stateless/01721_engine_file_truncate_on_insert.sql diff --git a/docs/en/operations/settings/settings.md b/docs/en/operations/settings/settings.md index 43519bfc8dc..6440f09bb40 100644 --- a/docs/en/operations/settings/settings.md +++ b/docs/en/operations/settings/settings.md @@ -2659,3 +2659,23 @@ Result: Note that this setting influences [Materialized view](../../sql-reference/statements/create/view.md#materialized) and [MaterializeMySQL](../../engines/database-engines/materialize-mysql.md) behaviour. 
[Original article](https://clickhouse.tech/docs/en/operations/settings/settings/) + +## engine_file_empty_if_not_exists {#engine-file-empty_if-not-exists} + +Allows to select data from a file engine table without file. + +Possible values: +- 0 — `SELECT` throws exception. +- 1 — `SELECT` returns empty result. + +Default value: `0`. + +## engine_file_truncate_on_insert {#engine-file-truncate-on-insert} + +Enables or disables truncate before insert in file engine tables. + +Possible values: +- 0 — Disabled. +- 1 — Enabled. + +Default value: `0`. diff --git a/src/Core/Settings.h b/src/Core/Settings.h index 9bb9ad30f15..98c3b9d1f85 100644 --- a/src/Core/Settings.h +++ b/src/Core/Settings.h @@ -421,6 +421,8 @@ class IColumn; M(Bool, optimize_rewrite_sum_if_to_count_if, true, "Rewrite sumIf() and sum(if()) function countIf() function when logically equivalent", 0) \ M(UInt64, insert_shard_id, 0, "If non zero, when insert into a distributed table, the data will be inserted into the shard `insert_shard_id` synchronously. Possible values range from 1 to `shards_number` of corresponding distributed table", 0) \ M(Bool, allow_experimental_query_deduplication, false, "Allow sending parts' UUIDs for a query in order to deduplicate data parts if any", 0) \ + M(Bool, engine_file_empty_if_not_exists, false, "Allows to select data from a file engine table without file", 0) \ + M(Bool, engine_file_truncate_on_insert, false, "Enables or disables truncate before insert in file engine tables", 0) \ \ /** Obsolete settings that do nothing but left for compatibility reasons. Remove each one after half a year of obsolescence. */ \ \ diff --git a/src/Storages/StorageFile.cpp b/src/Storages/StorageFile.cpp index a5935ba3bf4..856d03ea2ce 100644 --- a/src/Storages/StorageFile.cpp +++ b/src/Storages/StorageFile.cpp @@ -34,6 +34,7 @@ #include #include #include +#include #include namespace fs = std::filesystem; @@ -427,7 +428,12 @@ Pipe StorageFile::read( paths = {""}; /// when use fd, paths are empty else if (paths.size() == 1 && !Poco::File(paths[0]).exists()) - throw Exception("File " + paths[0] + " doesn't exist", ErrorCodes::FILE_DOESNT_EXIST); + { + if (context.getSettingsRef().engine_file_empty_if_not_exists) + return Pipe(std::make_shared(metadata_snapshot->getSampleBlockForColumns(column_names, getVirtuals(), getStorageID()))); + else + throw Exception("File " + paths[0] + " doesn't exist", ErrorCodes::FILE_DOESNT_EXIST); + } auto files_info = std::make_shared(); @@ -547,6 +553,10 @@ BlockOutputStreamPtr StorageFile::write( throw Exception("Method write is not implemented for Distributed format", ErrorCodes::NOT_IMPLEMENTED); std::string path; + if (context.getSettingsRef().engine_file_truncate_on_insert) + if (0 != ::truncate(paths[0].c_str(), 0)) + throwFromErrnoWithPath("Cannot truncate file " + paths[0], paths[0], ErrorCodes::CANNOT_TRUNCATE_FILE); + if (!paths.empty()) { path = paths[0]; diff --git a/tests/queries/0_stateless/01720_engine_file_empty_if_not_exists.reference b/tests/queries/0_stateless/01720_engine_file_empty_if_not_exists.reference new file mode 100644 index 00000000000..e69de29bb2d diff --git a/tests/queries/0_stateless/01720_engine_file_empty_if_not_exists.sql b/tests/queries/0_stateless/01720_engine_file_empty_if_not_exists.sql new file mode 100644 index 00000000000..c04e01ccc88 --- /dev/null +++ b/tests/queries/0_stateless/01720_engine_file_empty_if_not_exists.sql @@ -0,0 +1,15 @@ +DROP TABLE IF EXISTS file_engine_table; + +CREATE TABLE file_engine_table (id UInt32) ENGINE=File(TSV); + 
+SELECT * FROM file_engine_table; --{ serverError 107 } + +SET engine_file_empty_if_not_exists=0; + +SELECT * FROM file_engine_table; --{ serverError 107 } + +SET engine_file_empty_if_not_exists=1; + +SELECT * FROM file_engine_table; + +SET engine_file_empty_if_not_exists=0; diff --git a/tests/queries/0_stateless/01721_engine_file_truncate_on_insert.reference b/tests/queries/0_stateless/01721_engine_file_truncate_on_insert.reference new file mode 100644 index 00000000000..a25fb4f0e7e --- /dev/null +++ b/tests/queries/0_stateless/01721_engine_file_truncate_on_insert.reference @@ -0,0 +1,13 @@ +1 +2 +3 +4 +1 +2 +3 +4 +5 +6 +0 +1 +2 \ No newline at end of file diff --git a/tests/queries/0_stateless/01721_engine_file_truncate_on_insert.sql b/tests/queries/0_stateless/01721_engine_file_truncate_on_insert.sql new file mode 100644 index 00000000000..65246db7963 --- /dev/null +++ b/tests/queries/0_stateless/01721_engine_file_truncate_on_insert.sql @@ -0,0 +1,20 @@ +INSERT INTO TABLE FUNCTION file('01718_file/test/data.TSV', 'TSV', 'id UInt32') VALUES ('file', 42); +ATTACH TABLE test FROM '01718_file/test' (id UInt8) ENGINE=File(TSV); + +CREATE TABLE file_engine_table (id UInt32) ENGINE=File(TabSeparated); + +INSERT INTO file_engine_table VALUES (1), (2), (3); +INSERT INTO file_engine_table VALUES (4); +SELECT * FROM file_engine_table; + +SET engine_file_truncate_on_insert=0; + +INSERT INTO file_engine_table VALUES (5), (6); +SELECT * FROM file_engine_table; + +SET engine_file_truncate_on_insert=1; + +INSERT INTO file_engine_table VALUES (0), (1), (2); +SELECT * FROM file_engine_table; + +SET engine_file_truncate_on_insert=0; From 16bcd9d247877c55d27936e64a0d3c76dbe9cf7a Mon Sep 17 00:00:00 2001 From: alesapin Date: Tue, 16 Feb 2021 20:28:54 +0300 Subject: [PATCH 167/381] Add changelog tests --- src/Coordination/Changelog.cpp | 102 ++++--- src/Coordination/Changelog.h | 12 +- src/Coordination/tests/gtest_for_build.cpp | 325 ++++++++++++++++++++- 3 files changed, 396 insertions(+), 43 deletions(-) diff --git a/src/Coordination/Changelog.cpp b/src/Coordination/Changelog.cpp index d3ba176f209..6fa3e0e9e03 100644 --- a/src/Coordination/Changelog.cpp +++ b/src/Coordination/Changelog.cpp @@ -44,15 +44,14 @@ static constexpr auto DEFAULT_PREFIX = "changelog"; struct ChangelogName { std::string prefix; - ChangelogVersion version; size_t from_log_idx; size_t to_log_idx; }; -std::string formatChangelogPath(const std::string & prefix, const ChangelogVersion & version, const ChangelogName & name) +std::string formatChangelogPath(const std::string & prefix, const ChangelogName & name) { std::filesystem::path path(prefix); - path /= std::filesystem::path(name.prefix + "_" + toString(version) + "_" + std::to_string(name.from_log_idx) + "_" + std::to_string(name.to_log_idx) + ".log"); + path /= std::filesystem::path(name.prefix + "_" + std::to_string(name.from_log_idx) + "_" + std::to_string(name.to_log_idx) + ".bin"); return path; } @@ -62,14 +61,13 @@ ChangelogName getChangelogName(const std::string & path_str) std::string filename = path.stem(); Strings filename_parts; boost::split(filename_parts, filename, boost::is_any_of("_")); - if (filename_parts.size() < 4) + if (filename_parts.size() < 3) throw Exception(ErrorCodes::CORRUPTED_DATA, "Invalid changelog {}", path_str); ChangelogName result; result.prefix = filename_parts[0]; - result.version = fromString(filename_parts[1]); - result.from_log_idx = parse(filename_parts[2]); - result.to_log_idx = parse(filename_parts[3]); + result.from_log_idx = 
parse(filename_parts[1]); + result.to_log_idx = parse(filename_parts[2]); return result; } @@ -114,6 +112,7 @@ public: { flush(); plain_buf.truncate(new_length); + plain_buf.seek(new_length, SEEK_SET); } void flush() @@ -190,6 +189,7 @@ public: if (!logs.try_emplace(record.header.index, log_entry).second) throw Exception(ErrorCodes::CORRUPTED_DATA, "Duplicated index id {} in log {}", record.header.index, filepath); } + return total_read; } private: @@ -203,13 +203,16 @@ Changelog::Changelog(const std::string & changelogs_dir_, size_t rotate_interval { namespace fs = std::filesystem; for(const auto & p : fs::directory_iterator(changelogs_dir)) - existing_changelogs.push_back(p.path()); + { + auto name = getChangelogName(p.path()); + existing_changelogs[name.from_log_idx] = p.path(); + } } void Changelog::readChangelogAndInitWriter(size_t from_log_idx) { size_t read_from_last = 0; - for (const std::string & changelog_file : existing_changelogs) + for (const auto & [start_id, changelog_file] : existing_changelogs) { ChangelogName parsed_name = getChangelogName(changelog_file); if (parsed_name.to_log_idx >= from_log_idx) @@ -223,8 +226,9 @@ void Changelog::readChangelogAndInitWriter(size_t from_log_idx) if (existing_changelogs.size() > 0 && read_from_last < rotate_interval) { - auto parsed_name = getChangelogName(existing_changelogs.back()); - current_writer = std::make_unique(existing_changelogs.back(), WriteMode::Append, parsed_name.from_log_idx); + auto str_name = existing_changelogs.rbegin()->second; + auto parsed_name = getChangelogName(str_name); + current_writer = std::make_unique(str_name, WriteMode::Append, parsed_name.from_log_idx); current_writer->setEntriesWritten(read_from_last); } else @@ -240,13 +244,12 @@ void Changelog::rotate(size_t new_start_log_idx) ChangelogName new_name; new_name.prefix = DEFAULT_PREFIX; - new_name.version = CURRENT_CHANGELOG_VERSION; new_name.from_log_idx = new_start_log_idx; - new_name.to_log_idx = new_start_log_idx; + new_name.to_log_idx = new_start_log_idx + rotate_interval - 1; - auto new_log_path = formatChangelogPath(changelogs_dir, CURRENT_CHANGELOG_VERSION, new_name); - existing_changelogs.push_back(new_log_path); - current_writer = std::make_unique(existing_changelogs.back(), WriteMode::Rewrite, new_start_log_idx); + auto new_log_path = formatChangelogPath(changelogs_dir, new_name); + existing_changelogs[new_start_log_idx] = new_log_path; + current_writer = std::make_unique(new_log_path, WriteMode::Rewrite, new_start_log_idx); } ChangelogRecord Changelog::buildRecord(size_t index, nuraft::ptr log_entry) const @@ -275,42 +278,62 @@ void Changelog::appendEntry(size_t index, nuraft::ptr log_ent if (!current_writer) throw Exception(ErrorCodes::LOGICAL_ERROR, "Changelog must be initialized before appending records"); + if (logs.empty()) + start_index = index; + if (current_writer->getEntriesWritten() == rotate_interval) rotate(index); auto offset = current_writer->appendRecord(buildRecord(index, log_entry), true); if (!index_to_start_pos.try_emplace(index, offset).second) throw Exception(ErrorCodes::LOGICAL_ERROR, "Record with index {} already exists", index); + logs[index] = makeClone(log_entry); } void Changelog::writeAt(size_t index, nuraft::ptr log_entry) { - if (index < current_writer->getStartIndex()) - throw Exception(ErrorCodes::NOT_IMPLEMENTED, "Currently cannot overwrite index from previous file"); - if (index_to_start_pos.count(index) == 0) throw Exception(ErrorCodes::LOGICAL_ERROR, "Cannot write at index {} because changelog doesn't 
contain it", index); + bool need_rollback = index < current_writer->getStartIndex(); + if (need_rollback) + { + auto index_changelog = existing_changelogs.lower_bound(index); + std::string fname; + if (index_changelog->first == index) + fname = index_changelog->second; + else + fname = std::prev(index_changelog)->second; + + current_writer = std::make_unique(fname, WriteMode::Append, index_changelog->first); + auto formated_name = getChangelogName(fname); + current_writer->setEntriesWritten(formated_name.to_log_idx - formated_name.from_log_idx + 1); + } + auto entries_written = current_writer->getEntriesWritten(); current_writer->truncateToLength(index_to_start_pos[index]); - for (auto itr = index_to_start_pos.begin(); itr != index_to_start_pos.end();) + + if (need_rollback) { - if (itr->first >= index) + auto to_remove_itr = existing_changelogs.upper_bound(index); + for (auto itr = to_remove_itr; itr != existing_changelogs.end();) { - entries_written--; - itr = index_to_start_pos.erase(itr); + std::filesystem::remove(itr->second); + itr = existing_changelogs.erase(itr); } - else - itr++; + } + + /// Rollback in memory state + for (auto itr = logs.lower_bound(index); itr != logs.end();) + { + index_to_start_pos.erase(itr->first); + itr = logs.erase(itr); + entries_written--; } current_writer->setEntriesWritten(entries_written); - auto itr = logs.lower_bound(index); - while (itr != logs.end()) - itr = logs.erase(itr); - appendEntry(index, log_entry); } @@ -318,22 +341,27 @@ void Changelog::compact(size_t up_to_log_idx) { for (auto itr = existing_changelogs.begin(); itr != existing_changelogs.end();) { - ChangelogName parsed_name = getChangelogName(*itr); + ChangelogName parsed_name = getChangelogName(itr->second); if (parsed_name.to_log_idx <= up_to_log_idx) { - std::filesystem::remove(*itr); - itr = existing_changelogs.erase(itr); + for (size_t idx = parsed_name.from_log_idx; idx <= parsed_name.to_log_idx; ++idx) { - auto logs_itr = logs.find(idx); - if (logs_itr != logs.end()) - logs.erase(idx); - else + auto index_pos = index_to_start_pos.find(idx); + if (index_pos == index_to_start_pos.end()) break; - index_to_start_pos.erase(idx); + index_to_start_pos.erase(index_pos); } + std::filesystem::remove(itr->second); + itr = existing_changelogs.erase(itr); } + else + break; } + auto start = logs.begin(); + auto end = logs.upper_bound(up_to_log_idx); + logs.erase(start, end); + start_index = up_to_log_idx + 1; } LogEntryPtr Changelog::getLastEntry() const diff --git a/src/Coordination/Changelog.h b/src/Coordination/Changelog.h index c58f35cb4a1..97669d1aa19 100644 --- a/src/Coordination/Changelog.h +++ b/src/Coordination/Changelog.h @@ -65,7 +65,7 @@ public: size_t getNextEntryIndex() const { - return start_index + logs.size() - 1; + return start_index + logs.size(); } size_t getStartIndex() const @@ -79,22 +79,28 @@ public: LogEntryPtr entryAt(size_t idx); - nuraft::ptr serializeEntriesToBuffer(size_t index, Int32 cnt); + nuraft::ptr serializeEntriesToBuffer(size_t index, int32_t cnt); void applyEntriesFromBuffer(size_t index, nuraft::buffer & buffer); void flush(); + size_t size() const + { + return logs.size(); + } + ~Changelog(); private: + void rotate(size_t new_start_log_idex); ChangelogRecord buildRecord(size_t index, nuraft::ptr log_entry) const; private: std::string changelogs_dir; - std::deque existing_changelogs; + std::map existing_changelogs; std::unique_ptr current_writer; IndexToOffset index_to_start_pos; const size_t rotate_interval; diff --git 
a/src/Coordination/tests/gtest_for_build.cpp b/src/Coordination/tests/gtest_for_build.cpp index 6142ee0b5c0..6335df4b940 100644 --- a/src/Coordination/tests/gtest_for_build.cpp +++ b/src/Coordination/tests/gtest_for_build.cpp @@ -24,6 +24,7 @@ #include #include #include +#include TEST(CoordinationTest, BuildTest) @@ -335,18 +336,336 @@ TEST(CoordinationTest, TestStorageSerialization) EXPECT_EQ(new_storage.ephemerals[1].size(), 1); } -DB::LogEntryPtr getLogEntry(const std::string & s) +DB::LogEntryPtr getLogEntry(const std::string & s, size_t term) { DB::WriteBufferFromNuraftBuffer bufwriter; writeText(s, bufwriter); - return nuraft::cs_new(0, bufwriter.getBuffer()); + return nuraft::cs_new(term, bufwriter.getBuffer()); } +namespace fs = std::filesystem; +struct ChangelogDirTest +{ + std::string path; + bool drop; + ChangelogDirTest(std::string path_, bool drop_ = true) + : path(path_) + , drop(drop_) + { + if (fs::exists(path)) + EXPECT_TRUE(false) << "Path " << path << " already exists, remove it to run test"; + fs::create_directory(path); + } + + ~ChangelogDirTest() + { + if (fs::exists(path) && drop) + fs::remove_all(path); + } +}; + TEST(CoordinationTest, ChangelogTestSimple) { + ChangelogDirTest test("./logs"); DB::Changelog changelog("./logs", 5); - auto entry = getLogEntry("hello world"); + changelog.readChangelogAndInitWriter(1); + auto entry = getLogEntry("hello world", 77); changelog.appendEntry(1, entry); + EXPECT_EQ(changelog.getNextEntryIndex(), 2); + EXPECT_EQ(changelog.getStartIndex(), 1); + EXPECT_EQ(changelog.getLastEntry()->get_term(), 77); + EXPECT_EQ(changelog.entryAt(1)->get_term(), 77); + EXPECT_EQ(changelog.getLogEntriesBetween(1, 2)->size(), 1); +} + +TEST(CoordinationTest, ChangelogTestFile) +{ + ChangelogDirTest test("./logs"); + DB::Changelog changelog("./logs", 5); + changelog.readChangelogAndInitWriter(1); + auto entry = getLogEntry("hello world", 77); + changelog.appendEntry(1, entry); + EXPECT_TRUE(fs::exists("./logs/changelog_1_5.bin")); + for(const auto & p : fs::directory_iterator("./logs")) + EXPECT_EQ(p.path(), "./logs/changelog_1_5.bin"); + + changelog.appendEntry(2, entry); + changelog.appendEntry(3, entry); + changelog.appendEntry(4, entry); + changelog.appendEntry(5, entry); + changelog.appendEntry(6, entry); + + EXPECT_TRUE(fs::exists("./logs/changelog_1_5.bin")); + EXPECT_TRUE(fs::exists("./logs/changelog_6_10.bin")); +} + +TEST(CoordinationTest, ChangelogReadWrite) +{ + ChangelogDirTest test("./logs"); + DB::Changelog changelog("./logs", 1000); + changelog.readChangelogAndInitWriter(1); + for (size_t i = 0; i < 10; ++i) + { + auto entry = getLogEntry("hello world", i * 10); + changelog.appendEntry(changelog.getNextEntryIndex(), entry); + } + EXPECT_EQ(changelog.size(), 10); + DB::Changelog changelog_reader("./logs", 1000); + changelog_reader.readChangelogAndInitWriter(1); + EXPECT_EQ(changelog_reader.size(), 10); + EXPECT_EQ(changelog_reader.getLastEntry()->get_term(), changelog.getLastEntry()->get_term()); + EXPECT_EQ(changelog_reader.getStartIndex(), changelog.getStartIndex()); + EXPECT_EQ(changelog_reader.getNextEntryIndex(), changelog.getNextEntryIndex()); + + for (size_t i = 0; i < 10; ++i) + EXPECT_EQ(changelog_reader.entryAt(i + 1)->get_term(), changelog.entryAt(i + 1)->get_term()); + + auto entries_from_range_read = changelog_reader.getLogEntriesBetween(1, 11); + auto entries_from_range = changelog.getLogEntriesBetween(1, 11); + EXPECT_EQ(entries_from_range_read->size(), entries_from_range->size()); + EXPECT_EQ(10, 
entries_from_range->size()); +} + +TEST(CoordinationTest, ChangelogWriteAt) +{ + ChangelogDirTest test("./logs"); + DB::Changelog changelog("./logs", 1000); + changelog.readChangelogAndInitWriter(1); + for (size_t i = 0; i < 10; ++i) + { + auto entry = getLogEntry("hello world", i * 10); + changelog.appendEntry(changelog.getNextEntryIndex(), entry); + } + EXPECT_EQ(changelog.size(), 10); + + auto entry = getLogEntry("writer", 77); + changelog.writeAt(7, entry); + EXPECT_EQ(changelog.size(), 7); + EXPECT_EQ(changelog.getLastEntry()->get_term(), 77); + EXPECT_EQ(changelog.entryAt(7)->get_term(), 77); + EXPECT_EQ(changelog.getNextEntryIndex(), 8); + + DB::Changelog changelog_reader("./logs", 1000); + changelog_reader.readChangelogAndInitWriter(1); + + EXPECT_EQ(changelog_reader.size(), changelog.size()); + EXPECT_EQ(changelog_reader.getLastEntry()->get_term(), changelog.getLastEntry()->get_term()); + EXPECT_EQ(changelog_reader.getStartIndex(), changelog.getStartIndex()); + EXPECT_EQ(changelog_reader.getNextEntryIndex(), changelog.getNextEntryIndex()); +} + + +TEST(CoordinationTest, ChangelogTestAppendAfterRead) +{ + ChangelogDirTest test("./logs"); + DB::Changelog changelog("./logs", 5); + changelog.readChangelogAndInitWriter(1); + for (size_t i = 0; i < 7; ++i) + { + auto entry = getLogEntry("hello world", i * 10); + changelog.appendEntry(changelog.getNextEntryIndex(), entry); + } + + EXPECT_EQ(changelog.size(), 7); + EXPECT_TRUE(fs::exists("./logs/changelog_1_5.bin")); + EXPECT_TRUE(fs::exists("./logs/changelog_6_10.bin")); + + DB::Changelog changelog_reader("./logs", 5); + changelog_reader.readChangelogAndInitWriter(1); + + EXPECT_EQ(changelog_reader.size(), 7); + for (size_t i = 7; i < 10; ++i) + { + auto entry = getLogEntry("hello world", i * 10); + changelog_reader.appendEntry(changelog_reader.getNextEntryIndex(), entry); + } + EXPECT_EQ(changelog_reader.size(), 10); + EXPECT_TRUE(fs::exists("./logs/changelog_1_5.bin")); + EXPECT_TRUE(fs::exists("./logs/changelog_6_10.bin")); + + size_t logs_count = 0; + for(const auto & _ [[maybe_unused]]: fs::directory_iterator("./logs")) + logs_count++; + + EXPECT_EQ(logs_count, 2); + + auto entry = getLogEntry("someentry", 77); + changelog_reader.appendEntry(changelog_reader.getNextEntryIndex(), entry); + EXPECT_EQ(changelog_reader.size(), 11); + EXPECT_TRUE(fs::exists("./logs/changelog_1_5.bin")); + EXPECT_TRUE(fs::exists("./logs/changelog_6_10.bin")); + EXPECT_TRUE(fs::exists("./logs/changelog_11_15.bin")); + + logs_count = 0; + for(const auto & _ [[maybe_unused]]: fs::directory_iterator("./logs")) + logs_count++; + + EXPECT_EQ(logs_count, 3); +} + +TEST(CoordinationTest, ChangelogTestCompaction) +{ + ChangelogDirTest test("./logs"); + DB::Changelog changelog("./logs", 5); + changelog.readChangelogAndInitWriter(1); + + for (size_t i = 0; i < 3; ++i) + { + auto entry = getLogEntry("hello world", i * 10); + changelog.appendEntry(changelog.getNextEntryIndex(), entry); + } + + EXPECT_EQ(changelog.size(), 3); + + changelog.compact(2); + + EXPECT_EQ(changelog.size(), 1); + EXPECT_EQ(changelog.getStartIndex(), 3); + EXPECT_EQ(changelog.getNextEntryIndex(), 4); + EXPECT_EQ(changelog.getLastEntry()->get_term(), 20); + EXPECT_TRUE(fs::exists("./logs/changelog_1_5.bin")); + + changelog.appendEntry(changelog.getNextEntryIndex(), getLogEntry("hello world", 30)); + changelog.appendEntry(changelog.getNextEntryIndex(), getLogEntry("hello world", 40)); + changelog.appendEntry(changelog.getNextEntryIndex(), getLogEntry("hello world", 50)); + 
changelog.appendEntry(changelog.getNextEntryIndex(), getLogEntry("hello world", 60)); + + EXPECT_TRUE(fs::exists("./logs/changelog_1_5.bin")); + EXPECT_TRUE(fs::exists("./logs/changelog_6_10.bin")); + + changelog.compact(6); + + EXPECT_FALSE(fs::exists("./logs/changelog_1_5.bin")); + EXPECT_TRUE(fs::exists("./logs/changelog_6_10.bin")); + + EXPECT_EQ(changelog.size(), 1); + EXPECT_EQ(changelog.getStartIndex(), 7); + EXPECT_EQ(changelog.getNextEntryIndex(), 8); + EXPECT_EQ(changelog.getLastEntry()->get_term(), 60); + /// And we able to read it + DB::Changelog changelog_reader("./logs", 5); + changelog_reader.readChangelogAndInitWriter(7); + EXPECT_EQ(changelog_reader.size(), 1); + EXPECT_EQ(changelog_reader.getStartIndex(), 7); + EXPECT_EQ(changelog_reader.getNextEntryIndex(), 8); + EXPECT_EQ(changelog_reader.getLastEntry()->get_term(), 60); +} + +TEST(CoordinationTest, ChangelogTestBatchOperations) +{ + ChangelogDirTest test("./logs"); + DB::Changelog changelog("./logs", 100); + changelog.readChangelogAndInitWriter(1); + for (size_t i = 0; i < 10; ++i) + { + auto entry = getLogEntry(std::to_string(i) + "_hello_world", i * 10); + changelog.appendEntry(changelog.getNextEntryIndex(), entry); + } + + EXPECT_EQ(changelog.size(), 10); + + auto entries = changelog.serializeEntriesToBuffer(1, 5); + + DB::Changelog apply_changelog("./logs", 100); + apply_changelog.readChangelogAndInitWriter(1); + + for (size_t i = 0; i < 10; ++i) + { + EXPECT_EQ(apply_changelog.entryAt(i + 1)->get_term(), i * 10); + } + EXPECT_EQ(apply_changelog.size(), 10); + + apply_changelog.applyEntriesFromBuffer(8, *entries); + + EXPECT_EQ(apply_changelog.size(), 12); + EXPECT_EQ(apply_changelog.getStartIndex(), 1); + EXPECT_EQ(apply_changelog.getNextEntryIndex(), 13); + + for (size_t i = 0; i < 7; ++i) + { + EXPECT_EQ(apply_changelog.entryAt(i + 1)->get_term(), i * 10); + } + + EXPECT_EQ(apply_changelog.entryAt(8)->get_term(), 0); + EXPECT_EQ(apply_changelog.entryAt(9)->get_term(), 10); + EXPECT_EQ(apply_changelog.entryAt(10)->get_term(), 20); + EXPECT_EQ(apply_changelog.entryAt(11)->get_term(), 30); + EXPECT_EQ(apply_changelog.entryAt(12)->get_term(), 40); +} + +TEST(CoordinationTest, ChangelogTestBatchOperationsEmpty) +{ + ChangelogDirTest test("./logs"); + DB::Changelog changelog("./logs", 100); + changelog.readChangelogAndInitWriter(1); + for (size_t i = 0; i < 10; ++i) + { + auto entry = getLogEntry(std::to_string(i) + "_hello_world", i * 10); + changelog.appendEntry(changelog.getNextEntryIndex(), entry); + } + + EXPECT_EQ(changelog.size(), 10); + + auto entries = changelog.serializeEntriesToBuffer(5, 5); + + ChangelogDirTest test1("./logs1"); + DB::Changelog changelog_new("./logs1", 100); + changelog_new.readChangelogAndInitWriter(1); + EXPECT_EQ(changelog_new.size(), 0); + + changelog_new.applyEntriesFromBuffer(5, *entries); + + EXPECT_EQ(changelog_new.size(), 5); + EXPECT_EQ(changelog_new.getStartIndex(), 5); + EXPECT_EQ(changelog_new.getNextEntryIndex(), 10); + + for (size_t i = 4; i < 9; ++i) + EXPECT_EQ(changelog_new.entryAt(i + 1)->get_term(), i * 10); + + changelog_new.appendEntry(changelog_new.getNextEntryIndex(), getLogEntry("hello_world", 110)); + EXPECT_EQ(changelog_new.size(), 6); + EXPECT_EQ(changelog_new.getStartIndex(), 5); + EXPECT_EQ(changelog_new.getNextEntryIndex(), 11); + + DB::Changelog changelog_reader("./logs1", 100); + changelog_reader.readChangelogAndInitWriter(5); +} + + +TEST(CoordinationTest, ChangelogTestWriteAtPreviousFile) +{ + ChangelogDirTest test("./logs"); + DB::Changelog 
changelog("./logs", 5); + changelog.readChangelogAndInitWriter(1); + + for (size_t i = 0; i < 33; ++i) + { + auto entry = getLogEntry(std::to_string(i) + "_hello_world", i * 10); + changelog.appendEntry(changelog.getNextEntryIndex(), entry); + } + EXPECT_EQ(changelog.size(), 33); + + changelog.writeAt(7, getLogEntry("helloworld", 5555)); + EXPECT_EQ(changelog.size(), 7); + EXPECT_EQ(changelog.getStartIndex(), 1); + EXPECT_EQ(changelog.getNextEntryIndex(), 8); + EXPECT_EQ(changelog.getLastEntry()->get_term(), 5555); + + EXPECT_TRUE(fs::exists("./logs/changelog_1_5.bin")); + EXPECT_TRUE(fs::exists("./logs/changelog_6_10.bin")); + + EXPECT_FALSE(fs::exists("./logs/changelog_11_15.bin")); + EXPECT_FALSE(fs::exists("./logs/changelog_16_20.bin")); + EXPECT_FALSE(fs::exists("./logs/changelog_11_25.bin")); + EXPECT_FALSE(fs::exists("./logs/changelog_26_30.bin")); + EXPECT_FALSE(fs::exists("./logs/changelog_31_35.bin")); + + DB::Changelog changelog_read("./logs", 5); + changelog_read.readChangelogAndInitWriter(1); + EXPECT_EQ(changelog_read.size(), 7); + EXPECT_EQ(changelog_read.getStartIndex(), 1); + EXPECT_EQ(changelog_read.getNextEntryIndex(), 8); + EXPECT_EQ(changelog_read.getLastEntry()->get_term(), 5555); } #endif From b029f3e5cf4b03df444ee2da007040756cb46570 Mon Sep 17 00:00:00 2001 From: alesapin Date: Tue, 16 Feb 2021 20:32:35 +0300 Subject: [PATCH 168/381] Border test --- src/Coordination/tests/gtest_for_build.cpp | 36 ++++++++++++++++++++++ 1 file changed, 36 insertions(+) diff --git a/src/Coordination/tests/gtest_for_build.cpp b/src/Coordination/tests/gtest_for_build.cpp index 6335df4b940..f6139ea5de3 100644 --- a/src/Coordination/tests/gtest_for_build.cpp +++ b/src/Coordination/tests/gtest_for_build.cpp @@ -668,4 +668,40 @@ TEST(CoordinationTest, ChangelogTestWriteAtPreviousFile) EXPECT_EQ(changelog_read.getLastEntry()->get_term(), 5555); } +TEST(CoordinationTest, ChangelogTestWriteAtFileBorder) +{ + ChangelogDirTest test("./logs"); + DB::Changelog changelog("./logs", 5); + changelog.readChangelogAndInitWriter(1); + + for (size_t i = 0; i < 33; ++i) + { + auto entry = getLogEntry(std::to_string(i) + "_hello_world", i * 10); + changelog.appendEntry(changelog.getNextEntryIndex(), entry); + } + EXPECT_EQ(changelog.size(), 33); + + changelog.writeAt(11, getLogEntry("helloworld", 5555)); + EXPECT_EQ(changelog.size(), 11); + EXPECT_EQ(changelog.getStartIndex(), 1); + EXPECT_EQ(changelog.getNextEntryIndex(), 12); + EXPECT_EQ(changelog.getLastEntry()->get_term(), 5555); + + EXPECT_TRUE(fs::exists("./logs/changelog_1_5.bin")); + EXPECT_TRUE(fs::exists("./logs/changelog_6_10.bin")); + EXPECT_TRUE(fs::exists("./logs/changelog_11_15.bin")); + + EXPECT_FALSE(fs::exists("./logs/changelog_16_20.bin")); + EXPECT_FALSE(fs::exists("./logs/changelog_11_25.bin")); + EXPECT_FALSE(fs::exists("./logs/changelog_26_30.bin")); + EXPECT_FALSE(fs::exists("./logs/changelog_31_35.bin")); + + DB::Changelog changelog_read("./logs", 5); + changelog_read.readChangelogAndInitWriter(1); + EXPECT_EQ(changelog_read.size(), 11); + EXPECT_EQ(changelog_read.getStartIndex(), 1); + EXPECT_EQ(changelog_read.getNextEntryIndex(), 12); + EXPECT_EQ(changelog_read.getLastEntry()->get_term(), 5555); +} + #endif From b76b8013ba88b081362ab9f31c103a3b6c77bc27 Mon Sep 17 00:00:00 2001 From: alesapin Date: Tue, 16 Feb 2021 20:47:12 +0300 Subject: [PATCH 169/381] Fix tests --- src/Coordination/Changelog.cpp | 1 - src/Coordination/tests/gtest_for_build.cpp | 22 ++++++++++++++++++++-- 2 files changed, 20 insertions(+), 3 deletions(-) diff 
--git a/src/Coordination/Changelog.cpp b/src/Coordination/Changelog.cpp index 6fa3e0e9e03..5198382e731 100644 --- a/src/Coordination/Changelog.cpp +++ b/src/Coordination/Changelog.cpp @@ -344,7 +344,6 @@ void Changelog::compact(size_t up_to_log_idx) ChangelogName parsed_name = getChangelogName(itr->second); if (parsed_name.to_log_idx <= up_to_log_idx) { - for (size_t idx = parsed_name.from_log_idx; idx <= parsed_name.to_log_idx; ++idx) { auto index_pos = index_to_start_pos.find(idx); diff --git a/src/Coordination/tests/gtest_for_build.cpp b/src/Coordination/tests/gtest_for_build.cpp index f6139ea5de3..fa8ae8f8b82 100644 --- a/src/Coordination/tests/gtest_for_build.cpp +++ b/src/Coordination/tests/gtest_for_build.cpp @@ -643,6 +643,15 @@ TEST(CoordinationTest, ChangelogTestWriteAtPreviousFile) auto entry = getLogEntry(std::to_string(i) + "_hello_world", i * 10); changelog.appendEntry(changelog.getNextEntryIndex(), entry); } + + EXPECT_TRUE(fs::exists("./logs/changelog_1_5.bin")); + EXPECT_TRUE(fs::exists("./logs/changelog_6_10.bin")); + EXPECT_TRUE(fs::exists("./logs/changelog_11_15.bin")); + EXPECT_TRUE(fs::exists("./logs/changelog_16_20.bin")); + EXPECT_TRUE(fs::exists("./logs/changelog_21_25.bin")); + EXPECT_TRUE(fs::exists("./logs/changelog_26_30.bin")); + EXPECT_TRUE(fs::exists("./logs/changelog_31_35.bin")); + EXPECT_EQ(changelog.size(), 33); changelog.writeAt(7, getLogEntry("helloworld", 5555)); @@ -656,7 +665,7 @@ TEST(CoordinationTest, ChangelogTestWriteAtPreviousFile) EXPECT_FALSE(fs::exists("./logs/changelog_11_15.bin")); EXPECT_FALSE(fs::exists("./logs/changelog_16_20.bin")); - EXPECT_FALSE(fs::exists("./logs/changelog_11_25.bin")); + EXPECT_FALSE(fs::exists("./logs/changelog_21_25.bin")); EXPECT_FALSE(fs::exists("./logs/changelog_26_30.bin")); EXPECT_FALSE(fs::exists("./logs/changelog_31_35.bin")); @@ -679,6 +688,15 @@ TEST(CoordinationTest, ChangelogTestWriteAtFileBorder) auto entry = getLogEntry(std::to_string(i) + "_hello_world", i * 10); changelog.appendEntry(changelog.getNextEntryIndex(), entry); } + + EXPECT_TRUE(fs::exists("./logs/changelog_1_5.bin")); + EXPECT_TRUE(fs::exists("./logs/changelog_6_10.bin")); + EXPECT_TRUE(fs::exists("./logs/changelog_11_15.bin")); + EXPECT_TRUE(fs::exists("./logs/changelog_16_20.bin")); + EXPECT_TRUE(fs::exists("./logs/changelog_21_25.bin")); + EXPECT_TRUE(fs::exists("./logs/changelog_26_30.bin")); + EXPECT_TRUE(fs::exists("./logs/changelog_31_35.bin")); + EXPECT_EQ(changelog.size(), 33); changelog.writeAt(11, getLogEntry("helloworld", 5555)); @@ -692,7 +710,7 @@ TEST(CoordinationTest, ChangelogTestWriteAtFileBorder) EXPECT_TRUE(fs::exists("./logs/changelog_11_15.bin")); EXPECT_FALSE(fs::exists("./logs/changelog_16_20.bin")); - EXPECT_FALSE(fs::exists("./logs/changelog_11_25.bin")); + EXPECT_FALSE(fs::exists("./logs/changelog_21_25.bin")); EXPECT_FALSE(fs::exists("./logs/changelog_26_30.bin")); EXPECT_FALSE(fs::exists("./logs/changelog_31_35.bin")); From e93e1911ee0b11278e13a2deb8022bbb456ef15d Mon Sep 17 00:00:00 2001 From: Dmitriy Date: Tue, 16 Feb 2021 21:01:36 +0300 Subject: [PATCH 170/381] Translate to Russian MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Выполнил перевод на русский язык. 
--- .../functions/type-conversion-functions.md | 14 +- .../functions/type-conversion-functions.md | 172 ++++++++++++++++++ 2 files changed, 177 insertions(+), 9 deletions(-) diff --git a/docs/en/sql-reference/functions/type-conversion-functions.md b/docs/en/sql-reference/functions/type-conversion-functions.md index 81b5649db32..6795b31bd33 100644 --- a/docs/en/sql-reference/functions/type-conversion-functions.md +++ b/docs/en/sql-reference/functions/type-conversion-functions.md @@ -701,21 +701,19 @@ parseDateTimeBestEffortUSOrNull(time_string[, time_zone]) **Parameters** -- `time_string` — String containing a date or date with time to convert. The date must be in the US date format (`MM/DD/YYYY`). [String](../../sql-reference/data-types/string.md). +- `time_string` — String containing a date or date with time to convert. The date must be in the US date format (`MM/DD/YYYY`, etc). [String](../../sql-reference/data-types/string.md). - `time_zone` — [Timezone](../../operations/server-configuration-parameters/settings.md#server_configuration_parameters-timezone). The function parses `time_string` according to the timezone. Optional. [String](../../sql-reference/data-types/string.md). **Supported non-standard formats** - A string containing 9..10 digit [unix timestamp](https://en.wikipedia.org/wiki/Unix_time). - A string with a date and a time components: `YYYYMMDDhhmmss`, `MM/DD/YYYY hh:mm:ss`, `MM-DD-YY hh:mm`, `YYYY-MM-DD hh:mm:ss`, etc. -- A string with a date, but no time component: `YYYY`, `YYYYMM`, `YYYY*MM`, `MM/DD/YYYY`, `MM-DD-YY` etc. +- A string with a date, but no time component: `YYYY`, `YYYYMM`, `YYYY*MM`, `MM/DD/YYYY`, `MM-DD-YY`, etc. - A string with a day and time: `DD`, `DD hh`, `DD hh:mm`. In this case, `YYYY-MM` are substituted with `2000-01`. - A string that includes date and time along with timezone offset information: `YYYY-MM-DD hh:mm:ss ±h:mm`, etc. For example, `2020-12-12 17:36:00 -5:00`. **Returned values** -Possible values: - - `time_string` converted to the [DateTime](../../sql-reference/data-types/datetime.md) data type. - `NULL` if the input string cannot be converted to the `DateTime` data type. @@ -789,23 +787,21 @@ parseDateTimeBestEffortUSOrZero(time_string[, time_zone]) **Parameters** -- `time_string` — String containing a date or date with time to convert. The date must be in the US date format (`MM/DD/YYYY`). [String](../../sql-reference/data-types/string.md). +- `time_string` — String containing a date or date with time to convert. The date must be in the US date format (`MM/DD/YYYY`, etc). [String](../../sql-reference/data-types/string.md). - `time_zone` — [Timezone](../../operations/server-configuration-parameters/settings.md#server_configuration_parameters-timezone). The function parses `time_string` according to the timezone. Optional. [String](../../sql-reference/data-types/string.md). **Supported non-standard formats** - A string containing 9..10 digit [unix timestamp](https://en.wikipedia.org/wiki/Unix_time). - A string with a date and a time components: `YYYYMMDDhhmmss`, `MM/DD/YYYY hh:mm:ss`, `MM-DD-YY hh:mm`, `YYYY-MM-DD hh:mm:ss`, etc. -- A string with a date, but no time component: `YYYY`, `YYYYMM`, `YYYY*MM`, `MM/DD/YYYY`, `MM-DD-YY` etc. +- A string with a date, but no time component: `YYYY`, `YYYYMM`, `YYYY*MM`, `MM/DD/YYYY`, `MM-DD-YY`, etc. - A string with a day and time: `DD`, `DD hh`, `DD hh:mm`. In this case, `YYYY-MM` are substituted with `2000-01`. 
- A string that includes date and time along with timezone offset information: `YYYY-MM-DD hh:mm:ss ±h:mm`, etc. For example, `2020-12-12 17:36:00 -5:00`. **Returned values** -Possible values: - - `time_string` converted to the [DateTime](../../sql-reference/data-types/datetime.md) data type. -- `zero date time`. +- Zero date or zero date with time if the input string cannot be converted to the `DateTime` data type. **Examples** diff --git a/docs/ru/sql-reference/functions/type-conversion-functions.md b/docs/ru/sql-reference/functions/type-conversion-functions.md index 4a314bd22d8..92e674242df 100644 --- a/docs/ru/sql-reference/functions/type-conversion-functions.md +++ b/docs/ru/sql-reference/functions/type-conversion-functions.md @@ -658,6 +658,178 @@ AS parseDateTimeBestEffortUS; └─────────────────────────——┘ ``` +## parseDateTimeBestEffortUSOrNull {#parsedatetimebesteffortusornull} + +Похожа на функцию [parseDateTimeBestEffortUS](#parsedatetimebesteffortUS), но разница состоит в том, что возвращает `NULL`, если входная строка не может быть преобразована в тип данных [DateTime](../../sql-reference/data-types/datetime.md). + +**Синтаксис** + +``` sql +parseDateTimeBestEffortUSOrNull(time_string[, time_zone]) +``` + +**Параметры** + +- `time_string` — строка, содержащая дату или дату со временем для преобразования. Дата должна быть в американском формате (`MM/DD/YYYY` и т.д.). [String](../../sql-reference/data-types/string.md). +- `time_zone` — [часовой пояс](../../operations/server-configuration-parameters/settings.md#server_configuration_parameters-timezone). Функция анализирует `time_string` в соответствии с заданным часовым поясом. Опциональный параметр. [String](../../sql-reference/data-types/string.md). + +**Поддерживаемые нестандартные форматы** + +- Строка в формате [unix timestamp](https://en.wikipedia.org/wiki/Unix_time), содержащая 9-10 символов. +- Строка, содержащая дату и время: `YYYYMMDDhhmmss`, `MM/DD/YYYY hh:mm:ss`, `MM-DD-YY hh:mm`, `YYYY-MM-DD hh:mm:ss` и т.д. +- Строка, содержащая дату без времени: `YYYY`, `YYYYMM`, `YYYY*MM`, `MM/DD/YYYY`, `MM-DD-YY` и т.д. +- Строка, содержащая день и время: `DD`, `DD hh`, `DD hh:mm`. В этом случае `YYYY-MM` заменяется на `2000-01`. +- Строка, содержащая дату и время, а также информацию о часовом поясе: `YYYY-MM-DD hh:mm:ss ±h:mm` и т.д. Например, `2020-12-12 17:36:00 -5:00`. + +**Возвращаемые значения** + +- `time_string`, преобразованная в тип данных `DateTime`. +- `NULL`, если входная строка не может быть преобразована в тип данных `DateTime`. 
+ +**Примеры** + +Запрос: + +``` sql +SELECT parseDateTimeBestEffortUSOrNull('02/10/2021 21:12:57') AS parseDateTimeBestEffortUSOrNull; +``` + +Результат: + +``` text +┌─parseDateTimeBestEffortUSOrNull─┐ +│ 2021-02-10 21:12:57 │ +└─────────────────────────────────┘ +``` + +Запрос: + +``` sql +SELECT parseDateTimeBestEffortUSOrNull('02-10-2021 21:12:57 GMT', 'Europe/Moscow') AS parseDateTimeBestEffortUSOrNull; +``` + +Результат: + +``` text +┌─parseDateTimeBestEffortUSOrNull─┐ +│ 2021-02-11 00:12:57 │ +└─────────────────────────────────┘ +``` + +Запрос: + +``` sql +SELECT parseDateTimeBestEffortUSOrNull('02.10.2021') AS parseDateTimeBestEffortUSOrNull; +``` + +Результат: + +``` text +┌─parseDateTimeBestEffortUSOrNull─┐ +│ 2021-02-10 00:00:00 │ +└─────────────────────────────────┘ +``` + +Запрос: + +``` sql +SELECT parseDateTimeBestEffortUSOrNull('10.2021') AS parseDateTimeBestEffortUSOrNull; +``` + +Результат: + +``` text +┌─parseDateTimeBestEffortUSOrNull─┐ +│ ᴺᵁᴸᴸ │ +└─────────────────────────────────┘ +``` + +## parseDateTimeBestEffortUSOrZero {#parsedatetimebesteffortusorzero} + +Похожа на функцию [parseDateTimeBestEffortUS](#parsedatetimebesteffortUS), но разница в том, что возвращает нулевую дату или нулевую дату со временем, если входная строка не может быть преобразована в тип данных [DateTime](../../sql-reference/data-types/datetime.md). + +**Синтаксис** + +``` sql +parseDateTimeBestEffortUSOrZero(time_string[, time_zone]) +``` + +**Параметры** + +- `time_string` — строка, содержащая дату или дату со временем для преобразования. Дата должна быть в американском формате (`MM/DD/YYYY` и т.д.). [String](../../sql-reference/data-types/string.md). +- `time_zone` — [часовой пояс](../../operations/server-configuration-parameters/settings.md#server_configuration_parameters-timezone). Функция анализирует `time_string` в соответствии с заданным часовым поясом. Опциональный параметр. [String](../../sql-reference/data-types/string.md). + +**Поддерживаемые нестандартные форматы** + +- Строка в формате [unix timestamp](https://en.wikipedia.org/wiki/Unix_time), содержащая 9-10 символов. +- Строка, содержащая дату и время: `YYYYMMDDhhmmss`, `MM/DD/YYYY hh:mm:ss`, `MM-DD-YY hh:mm`, `YYYY-MM-DD hh:mm:ss` и т.д. +- Строка, содержащая дату без времени: `YYYY`, `YYYYMM`, `YYYY*MM`, `MM/DD/YYYY`, `MM-DD-YY` и т.д. +- Строка, содержащая день и время: `DD`, `DD hh`, `DD hh:mm`. В этом случае `YYYY-MM` заменяется на `2000-01`. +- Строка, содержащая дату и время, а также информацию о часовом поясе: `YYYY-MM-DD hh:mm:ss ±h:mm` и т.д. Например, `2020-12-12 17:36:00 -5:00`. + +**Возвращаемые значения** + +- `time_string`, преобразованная в тип данных `DateTime`. +- Нулевая дата или нулевая дата со временем, если входная строка не может быть преобразована в тип данных `DateTime`. 
+ +**Примеры** + +Запрос: + +``` sql +SELECT parseDateTimeBestEffortUSOrZero('02/10/2021 21:12:57') AS parseDateTimeBestEffortUSOrZero; +``` + +Результат: + +``` text +┌─parseDateTimeBestEffortUSOrZero─┐ +│ 2021-02-10 21:12:57 │ +└─────────────────────────────────┘ +``` + +Запрос: + +``` sql +SELECT parseDateTimeBestEffortUSOrZero('02-10-2021 21:12:57 GMT', 'Europe/Moscow') AS parseDateTimeBestEffortUSOrZero; +``` + +Результат: + +``` text +┌─parseDateTimeBestEffortUSOrZero─┐ +│ 2021-02-11 00:12:57 │ +└─────────────────────────────────┘ +``` + +Запрос: + +``` sql +SELECT parseDateTimeBestEffortUSOrZero('02.10.2021') AS parseDateTimeBestEffortUSOrZero; +``` + +Результат: + +``` text +┌─parseDateTimeBestEffortUSOrZero─┐ +│ 2021-02-10 00:00:00 │ +└─────────────────────────────────┘ +``` + +Запрос: + +``` sql +SELECT parseDateTimeBestEffortUSOrZero('02.2021') AS parseDateTimeBestEffortUSOrZero; +``` + +Результат: + +``` text +┌─parseDateTimeBestEffortUSOrZero─┐ +│ 1970-01-01 00:00:00 │ +└─────────────────────────────────┘ +``` + ## toUnixTimestamp64Milli ## toUnixTimestamp64Micro ## toUnixTimestamp64Nano From d3e87701d478c2f779eae5b892c040b1132d8b6c Mon Sep 17 00:00:00 2001 From: alesapin Date: Tue, 16 Feb 2021 22:02:18 +0300 Subject: [PATCH 171/381] Persistent storage --- src/Coordination/Changelog.cpp | 10 ++-- src/Coordination/Changelog.h | 2 - src/Coordination/CoordinationSettings.h | 3 +- src/Coordination/InMemoryStateManager.cpp | 21 ++++--- src/Coordination/InMemoryStateManager.h | 13 +++-- src/Coordination/NuKeeperServer.cpp | 12 +++- src/Coordination/tests/gtest_for_build.cpp | 67 +++++++++++----------- tests/config/config.d/test_keeper_port.xml | 1 + 8 files changed, 77 insertions(+), 52 deletions(-) diff --git a/src/Coordination/Changelog.cpp b/src/Coordination/Changelog.cpp index 5198382e731..e4d8b13ec37 100644 --- a/src/Coordination/Changelog.cpp +++ b/src/Coordination/Changelog.cpp @@ -16,10 +16,8 @@ namespace ErrorCodes extern const int CORRUPTED_DATA; extern const int UNKNOWN_FORMAT_VERSION; extern const int LOGICAL_ERROR; - extern const int NOT_IMPLEMENTED; } - std::string toString(const ChangelogVersion & version) { if (version == ChangelogVersion::V0) @@ -147,7 +145,6 @@ private: size_t start_index; }; - class ChangelogReader { public: @@ -202,7 +199,10 @@ Changelog::Changelog(const std::string & changelogs_dir_, size_t rotate_interval , rotate_interval(rotate_interval_) { namespace fs = std::filesystem; - for(const auto & p : fs::directory_iterator(changelogs_dir)) + if (!fs::exists(changelogs_dir)) + fs::create_directories(changelogs_dir); + + for (const auto & p : fs::directory_iterator(changelogs_dir)) { auto name = getChangelogName(p.path()); existing_changelogs[name.from_log_idx] = p.path(); @@ -233,7 +233,7 @@ void Changelog::readChangelogAndInitWriter(size_t from_log_idx) } else { - rotate(from_log_idx); + rotate(start_index); } } diff --git a/src/Coordination/Changelog.h b/src/Coordination/Changelog.h index 97669d1aa19..7c352e7a91b 100644 --- a/src/Coordination/Changelog.h +++ b/src/Coordination/Changelog.h @@ -45,8 +45,6 @@ struct ChangelogRecord nuraft::ptr blob; }; - - class ChangelogWriter; class Changelog diff --git a/src/Coordination/CoordinationSettings.h b/src/Coordination/CoordinationSettings.h index 441e1a5936f..0f1afb3fffe 100644 --- a/src/Coordination/CoordinationSettings.h +++ b/src/Coordination/CoordinationSettings.h @@ -28,7 +28,8 @@ struct Settings; M(Bool, auto_forwarding, true, "Allow to forward write requests from followers to leader", 0) \ 
M(Milliseconds, shutdown_timeout, 5000, "How many time we will until RAFT shutdown", 0) \ M(Milliseconds, startup_timeout, 30000, "How many time we will until RAFT to start", 0) \ - M(LogsLevel, raft_logs_level, LogsLevel::information, "Log internal RAFT logs into main server log level. Valid values: 'trace', 'debug', 'information', 'warning', 'error', 'fatal', 'none'", 0) + M(LogsLevel, raft_logs_level, LogsLevel::information, "Log internal RAFT logs into main server log level. Valid values: 'trace', 'debug', 'information', 'warning', 'error', 'fatal', 'none'", 0) \ + M(UInt64, rotate_log_storage_interval, 500000, "How many records will be stored in one log storage file", 0) DECLARE_SETTINGS_TRAITS(CoordinationSettingsTraits, LIST_OF_COORDINATION_SETTINGS) diff --git a/src/Coordination/InMemoryStateManager.cpp b/src/Coordination/InMemoryStateManager.cpp index 69e93578cc1..6c4e95b993a 100644 --- a/src/Coordination/InMemoryStateManager.cpp +++ b/src/Coordination/InMemoryStateManager.cpp @@ -9,10 +9,10 @@ namespace ErrorCodes extern const int RAFT_ERROR; } -InMemoryStateManager::InMemoryStateManager(int server_id_, const std::string & host, int port) +InMemoryStateManager::InMemoryStateManager(int server_id_, const std::string & host, int port, const std::string & logs_path) : my_server_id(server_id_) , my_port(port) - , log_store(nuraft::cs_new()) + , log_store(nuraft::cs_new(logs_path, 5000)) , cluster_config(nuraft::cs_new()) { auto peer_config = nuraft::cs_new(my_server_id, host + ":" + std::to_string(port)); @@ -22,17 +22,19 @@ InMemoryStateManager::InMemoryStateManager(int server_id_, const std::string & h InMemoryStateManager::InMemoryStateManager( int my_server_id_, const std::string & config_prefix, - const Poco::Util::AbstractConfiguration & config) + const Poco::Util::AbstractConfiguration & config, + const CoordinationSettingsPtr & coordination_settings) : my_server_id(my_server_id_) - , log_store(nuraft::cs_new()) + , log_store(nuraft::cs_new(config.getString(config_prefix + ".log_storage_path"), coordination_settings->rotate_log_storage_interval)) , cluster_config(nuraft::cs_new()) { + Poco::Util::AbstractConfiguration::Keys keys; - config.keys(config_prefix, keys); + config.keys(config_prefix + ".raft_configuration", keys); for (const auto & server_key : keys) { - std::string full_prefix = config_prefix + "." + server_key; + std::string full_prefix = config_prefix + ".raft_configuration." + server_key; int server_id = config.getInt(full_prefix + ".id"); std::string hostname = config.getString(full_prefix + ".hostname"); int port = config.getInt(full_prefix + ".port"); @@ -53,12 +55,17 @@ InMemoryStateManager::InMemoryStateManager( cluster_config->get_servers().push_back(peer_config); } if (!my_server_config) - throw Exception(ErrorCodes::RAFT_ERROR, "Our server id {} not found in raft_configuration section"); + throw Exception(ErrorCodes::RAFT_ERROR, "Our server id {} not found in raft_configuration section", my_server_id); if (start_as_follower_servers.size() == cluster_config->get_servers().size()) throw Exception(ErrorCodes::RAFT_ERROR, "At least one of servers should be able to start as leader (without )"); } +void InMemoryStateManager::loadLogStore(size_t start_log_index) +{ + log_store->init(start_log_index); +} + void InMemoryStateManager::save_config(const nuraft::cluster_config & config) { // Just keep in memory in this example. 
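Aside (editor's sketch): the constructor above reads its replica list from `<config_prefix>.raft_configuration` and the changelog directory from `<config_prefix>.log_storage_path`; `NuKeeperServer` passes `test_keeper_server` as that prefix (see its change below). A minimal configuration accepted by this code could look roughly like the sketch that follows — tag names not visible in the diff (`tcp_port`, `server_id`, `coordination_settings`) and all values are assumptions for illustration only.

``` xml
<!-- Illustrative sketch only: keys inferred from the config reads above; values are examples. -->
<test_keeper_server>
    <tcp_port>9181</tcp_port>
    <server_id>1</server_id>
    <log_storage_path>/var/lib/clickhouse/coordination/log</log_storage_path>
    <coordination_settings>
        <!-- how many records are stored in one changelog file before rotation -->
        <rotate_log_storage_interval>500000</rotate_log_storage_interval>
    </coordination_settings>
    <raft_configuration>
        <server>
            <id>1</id>
            <hostname>localhost</hostname>
            <port>44444</port>
        </server>
    </raft_configuration>
</test_keeper_server>
```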
diff --git a/src/Coordination/InMemoryStateManager.h b/src/Coordination/InMemoryStateManager.h index 2a5c2f00dba..8a7be7d0129 100644 --- a/src/Coordination/InMemoryStateManager.h +++ b/src/Coordination/InMemoryStateManager.h @@ -2,7 +2,8 @@ #include #include -#include +#include +#include #include // Y_IGNORE #include @@ -15,12 +16,16 @@ public: InMemoryStateManager( int server_id_, const std::string & config_prefix, - const Poco::Util::AbstractConfiguration & config); + const Poco::Util::AbstractConfiguration & config, + const CoordinationSettingsPtr & coordination_settings); InMemoryStateManager( int server_id_, const std::string & host, - int port); + int port, + const std::string & logs_path); + + void loadLogStore(size_t start_log_index); nuraft::ptr load_config() override { return cluster_config; } @@ -49,7 +54,7 @@ private: int my_server_id; int my_port; std::unordered_set start_as_follower_servers; - nuraft::ptr log_store; + nuraft::ptr log_store; nuraft::ptr my_server_config; nuraft::ptr cluster_config; nuraft::ptr server_state; diff --git a/src/Coordination/NuKeeperServer.cpp b/src/Coordination/NuKeeperServer.cpp index 7464a06e86f..a4582a5fbb8 100644 --- a/src/Coordination/NuKeeperServer.cpp +++ b/src/Coordination/NuKeeperServer.cpp @@ -26,13 +26,16 @@ NuKeeperServer::NuKeeperServer( : server_id(server_id_) , coordination_settings(coordination_settings_) , state_machine(nuraft::cs_new(responses_queue_, coordination_settings)) - , state_manager(nuraft::cs_new(server_id, "test_keeper_server.raft_configuration", config)) + , state_manager(nuraft::cs_new(server_id, "test_keeper_server", config, coordination_settings)) , responses_queue(responses_queue_) { } void NuKeeperServer::startup() { + + state_manager->loadLogStore(state_machine->last_commit_index()); + nuraft::raft_params params; params.heart_beat_interval_ = coordination_settings->heart_beat_interval_ms.totalMilliseconds(); params.election_timeout_lower_bound_ = coordination_settings->election_timeout_lower_bound_ms.totalMilliseconds(); @@ -172,6 +175,13 @@ void NuKeeperServer::waitInit() int64_t timeout = coordination_settings->startup_timeout.totalMilliseconds(); if (!initialized_cv.wait_for(lock, std::chrono::milliseconds(timeout), [&] { return initialized_flag; })) throw Exception(ErrorCodes::RAFT_ERROR, "Failed to wait RAFT initialization"); + + /// TODO FIXME somehow + while (isLeader() && raft_instance->get_committed_log_idx() != raft_instance->get_last_log_idx()) + { + LOG_WARNING(&Poco::Logger::get("NuKeeperServer"), "Loading from log store {}/{}", raft_instance->get_committed_log_idx(), raft_instance->get_last_log_idx()); + std::this_thread::sleep_for(std::chrono::milliseconds(100)); + } } std::unordered_set NuKeeperServer::getDeadSessions() diff --git a/src/Coordination/tests/gtest_for_build.cpp b/src/Coordination/tests/gtest_for_build.cpp index fa8ae8f8b82..6d91ba95111 100644 --- a/src/Coordination/tests/gtest_for_build.cpp +++ b/src/Coordination/tests/gtest_for_build.cpp @@ -26,6 +26,26 @@ #include #include +namespace fs = std::filesystem; +struct ChangelogDirTest +{ + std::string path; + bool drop; + ChangelogDirTest(std::string path_, bool drop_ = true) + : path(path_) + , drop(drop_) + { + if (fs::exists(path)) + EXPECT_TRUE(false) << "Path " << path << " already exists, remove it to run test"; + fs::create_directory(path); + } + + ~ChangelogDirTest() + { + if (fs::exists(path) && drop) + fs::remove_all(path); + } +}; TEST(CoordinationTest, BuildTest) { @@ -70,14 +90,15 @@ TEST(CoordinationTest, BufferSerde) 
template struct SimpliestRaftServer { - SimpliestRaftServer(int server_id_, const std::string & hostname_, int port_) + SimpliestRaftServer(int server_id_, const std::string & hostname_, int port_, const std::string & logs_path) : server_id(server_id_) , hostname(hostname_) , port(port_) , endpoint(hostname + ":" + std::to_string(port)) , state_machine(nuraft::cs_new()) - , state_manager(nuraft::cs_new(server_id, hostname, port)) + , state_manager(nuraft::cs_new(server_id, hostname, port, logs_path)) { + state_manager->loadLogStore(1); nuraft::raft_params params; params.heart_beat_interval_ = 100; params.election_timeout_lower_bound_ = 200; @@ -126,7 +147,7 @@ struct SimpliestRaftServer nuraft::ptr state_machine; // State manager. - nuraft::ptr state_manager; + nuraft::ptr state_manager; // Raft launcher. nuraft::raft_launcher launcher; @@ -141,7 +162,6 @@ nuraft::ptr getBuffer(int64_t number) { nuraft::ptr ret = nuraft::buffer::alloc(sizeof(number)); nuraft::buffer_serializer bs(ret); - // WARNING: We don't consider endian-safety in this example. bs.put_raw(&number, sizeof(number)); return ret; } @@ -149,7 +169,8 @@ nuraft::ptr getBuffer(int64_t number) TEST(CoordinationTest, TestSummingRaft1) { - SummingRaftServer s1(1, "localhost", 44444); + ChangelogDirTest test("./logs"); + SummingRaftServer s1(1, "localhost", 44444, "./logs"); /// Single node is leader EXPECT_EQ(s1.raft_instance->get_leader(), 1); @@ -172,9 +193,12 @@ TEST(CoordinationTest, TestSummingRaft1) TEST(CoordinationTest, TestSummingRaft3) { - SummingRaftServer s1(1, "localhost", 44444); - SummingRaftServer s2(2, "localhost", 44445); - SummingRaftServer s3(3, "localhost", 44446); + ChangelogDirTest test1("./logs1"); + SummingRaftServer s1(1, "localhost", 44444, "./logs1"); + ChangelogDirTest test2("./logs2"); + SummingRaftServer s2(2, "localhost", 44445, "./logs2"); + ChangelogDirTest test3("./logs3"); + SummingRaftServer s3(3, "localhost", 44446, "./logs3"); nuraft::srv_config first_config(1, "localhost:44444"); auto ret1 = s2.raft_instance->add_srv(first_config); @@ -343,27 +367,6 @@ DB::LogEntryPtr getLogEntry(const std::string & s, size_t term) return nuraft::cs_new(term, bufwriter.getBuffer()); } -namespace fs = std::filesystem; -struct ChangelogDirTest -{ - std::string path; - bool drop; - ChangelogDirTest(std::string path_, bool drop_ = true) - : path(path_) - , drop(drop_) - { - if (fs::exists(path)) - EXPECT_TRUE(false) << "Path " << path << " already exists, remove it to run test"; - fs::create_directory(path); - } - - ~ChangelogDirTest() - { - if (fs::exists(path) && drop) - fs::remove_all(path); - } -}; - TEST(CoordinationTest, ChangelogTestSimple) { ChangelogDirTest test("./logs"); @@ -386,7 +389,7 @@ TEST(CoordinationTest, ChangelogTestFile) auto entry = getLogEntry("hello world", 77); changelog.appendEntry(1, entry); EXPECT_TRUE(fs::exists("./logs/changelog_1_5.bin")); - for(const auto & p : fs::directory_iterator("./logs")) + for (const auto & p : fs::directory_iterator("./logs")) EXPECT_EQ(p.path(), "./logs/changelog_1_5.bin"); changelog.appendEntry(2, entry); @@ -484,7 +487,7 @@ TEST(CoordinationTest, ChangelogTestAppendAfterRead) EXPECT_TRUE(fs::exists("./logs/changelog_6_10.bin")); size_t logs_count = 0; - for(const auto & _ [[maybe_unused]]: fs::directory_iterator("./logs")) + for (const auto & _ [[maybe_unused]]: fs::directory_iterator("./logs")) logs_count++; EXPECT_EQ(logs_count, 2); @@ -497,7 +500,7 @@ TEST(CoordinationTest, ChangelogTestAppendAfterRead) 
EXPECT_TRUE(fs::exists("./logs/changelog_11_15.bin")); logs_count = 0; - for(const auto & _ [[maybe_unused]]: fs::directory_iterator("./logs")) + for (const auto & _ [[maybe_unused]]: fs::directory_iterator("./logs")) logs_count++; EXPECT_EQ(logs_count, 3); diff --git a/tests/config/config.d/test_keeper_port.xml b/tests/config/config.d/test_keeper_port.xml index 97c6d7c2e33..44123ffe9c1 100644 --- a/tests/config/config.d/test_keeper_port.xml +++ b/tests/config/config.d/test_keeper_port.xml @@ -2,6 +2,7 @@ 9181 1 + /var/lib/clickhouse/coordination/log 10000 From 8717dbd0e222536e6daf709820c3bee1ef395c05 Mon Sep 17 00:00:00 2001 From: alesapin Date: Tue, 16 Feb 2021 22:29:09 +0300 Subject: [PATCH 172/381] Missed configs --- .../test_testkeeper_back_to_back/configs/enable_test_keeper.xml | 1 + .../configs/enable_test_keeper1.xml | 1 + .../configs/enable_test_keeper2.xml | 1 + .../configs/enable_test_keeper3.xml | 1 + .../configs/enable_test_keeper1.xml | 1 + .../configs/enable_test_keeper2.xml | 1 + .../configs/enable_test_keeper3.xml | 1 + 7 files changed, 7 insertions(+) diff --git a/tests/integration/test_testkeeper_back_to_back/configs/enable_test_keeper.xml b/tests/integration/test_testkeeper_back_to_back/configs/enable_test_keeper.xml index 1a441909998..a8b8991f959 100644 --- a/tests/integration/test_testkeeper_back_to_back/configs/enable_test_keeper.xml +++ b/tests/integration/test_testkeeper_back_to_back/configs/enable_test_keeper.xml @@ -2,6 +2,7 @@ 9181 1 + /var/lib/clickhouse/coordination/log 5000 diff --git a/tests/integration/test_testkeeper_multinode_blocade_leader/configs/enable_test_keeper1.xml b/tests/integration/test_testkeeper_multinode_blocade_leader/configs/enable_test_keeper1.xml index 4ad76889d1e..a47e5eae09a 100644 --- a/tests/integration/test_testkeeper_multinode_blocade_leader/configs/enable_test_keeper1.xml +++ b/tests/integration/test_testkeeper_multinode_blocade_leader/configs/enable_test_keeper1.xml @@ -2,6 +2,7 @@ 9181 1 + /var/lib/clickhouse/coordination/log 5000 diff --git a/tests/integration/test_testkeeper_multinode_blocade_leader/configs/enable_test_keeper2.xml b/tests/integration/test_testkeeper_multinode_blocade_leader/configs/enable_test_keeper2.xml index a1954a1e639..18681f0dc95 100644 --- a/tests/integration/test_testkeeper_multinode_blocade_leader/configs/enable_test_keeper2.xml +++ b/tests/integration/test_testkeeper_multinode_blocade_leader/configs/enable_test_keeper2.xml @@ -2,6 +2,7 @@ 9181 2 + /var/lib/clickhouse/coordination/log 5000 diff --git a/tests/integration/test_testkeeper_multinode_blocade_leader/configs/enable_test_keeper3.xml b/tests/integration/test_testkeeper_multinode_blocade_leader/configs/enable_test_keeper3.xml index 88d2358138f..184d3724219 100644 --- a/tests/integration/test_testkeeper_multinode_blocade_leader/configs/enable_test_keeper3.xml +++ b/tests/integration/test_testkeeper_multinode_blocade_leader/configs/enable_test_keeper3.xml @@ -2,6 +2,7 @@ 9181 3 + /var/lib/clickhouse/coordination/log 5000 diff --git a/tests/integration/test_testkeeper_multinode_simple/configs/enable_test_keeper1.xml b/tests/integration/test_testkeeper_multinode_simple/configs/enable_test_keeper1.xml index 4ad76889d1e..a47e5eae09a 100644 --- a/tests/integration/test_testkeeper_multinode_simple/configs/enable_test_keeper1.xml +++ b/tests/integration/test_testkeeper_multinode_simple/configs/enable_test_keeper1.xml @@ -2,6 +2,7 @@ 9181 1 + /var/lib/clickhouse/coordination/log 5000 diff --git 
a/tests/integration/test_testkeeper_multinode_simple/configs/enable_test_keeper2.xml b/tests/integration/test_testkeeper_multinode_simple/configs/enable_test_keeper2.xml index a1954a1e639..18681f0dc95 100644 --- a/tests/integration/test_testkeeper_multinode_simple/configs/enable_test_keeper2.xml +++ b/tests/integration/test_testkeeper_multinode_simple/configs/enable_test_keeper2.xml @@ -2,6 +2,7 @@ 9181 2 + /var/lib/clickhouse/coordination/log 5000 diff --git a/tests/integration/test_testkeeper_multinode_simple/configs/enable_test_keeper3.xml b/tests/integration/test_testkeeper_multinode_simple/configs/enable_test_keeper3.xml index 88d2358138f..184d3724219 100644 --- a/tests/integration/test_testkeeper_multinode_simple/configs/enable_test_keeper3.xml +++ b/tests/integration/test_testkeeper_multinode_simple/configs/enable_test_keeper3.xml @@ -2,6 +2,7 @@ 9181 3 + /var/lib/clickhouse/coordination/log 5000 From fa200160915ee9c187e5e64a4a1e395d70430b7f Mon Sep 17 00:00:00 2001 From: Alexey Milovidov Date: Wed, 17 Feb 2021 09:53:18 +0300 Subject: [PATCH 173/381] Enable distributed_aggregation_memory_efficient by default --- src/Core/Settings.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/Core/Settings.h b/src/Core/Settings.h index 9bb9ad30f15..6c05d247037 100644 --- a/src/Core/Settings.h +++ b/src/Core/Settings.h @@ -100,7 +100,7 @@ class IColumn; M(UInt64, min_count_to_compile_expression, 3, "The number of identical expressions before they are JIT-compiled", 0) \ M(UInt64, group_by_two_level_threshold, 100000, "From what number of keys, a two-level aggregation starts. 0 - the threshold is not set.", 0) \ M(UInt64, group_by_two_level_threshold_bytes, 100000000, "From what size of the aggregation state in bytes, a two-level aggregation begins to be used. 0 - the threshold is not set. Two-level aggregation is used when at least one of the thresholds is triggered.", 0) \ - M(Bool, distributed_aggregation_memory_efficient, false, "Is the memory-saving mode of distributed aggregation enabled.", 0) \ + M(Bool, distributed_aggregation_memory_efficient, true, "Is the memory-saving mode of distributed aggregation enabled.", 0) \ M(UInt64, aggregation_memory_efficient_merge_threads, 0, "Number of threads to use for merge intermediate aggregation results in memory efficient mode. When bigger, then more memory is consumed. 0 means - same as 'max_threads'.", 0) \ \ M(UInt64, max_parallel_replicas, 1, "The maximum number of replicas of each shard used when the query is executed. For consistency (to get different parts of the same partition), this option only works for the specified sampling key. 
The lag of the replicas is not controlled.", 0) \ From dfaa79b88ed8bd5e67df1e510d1a91cb1644a6a5 Mon Sep 17 00:00:00 2001 From: alesapin Date: Wed, 17 Feb 2021 10:10:46 +0300 Subject: [PATCH 174/381] Add missed file --- src/Coordination/NuKeeperLogStore.cpp | 97 +++++++++++++++++++++++++++ 1 file changed, 97 insertions(+) create mode 100644 src/Coordination/NuKeeperLogStore.cpp diff --git a/src/Coordination/NuKeeperLogStore.cpp b/src/Coordination/NuKeeperLogStore.cpp new file mode 100644 index 00000000000..fa0631e14ad --- /dev/null +++ b/src/Coordination/NuKeeperLogStore.cpp @@ -0,0 +1,97 @@ +#include + +namespace DB +{ + +NuKeeperLogStore::NuKeeperLogStore(const std::string & changelogs_path, size_t rotate_interval_) + : changelog(changelogs_path, rotate_interval_) +{ +} + +size_t NuKeeperLogStore::start_index() const +{ + std::lock_guard lock(changelog_lock); + return changelog.getStartIndex(); +} + +void NuKeeperLogStore::init(size_t from_log_idx) +{ + std::lock_guard lock(changelog_lock); + changelog.readChangelogAndInitWriter(from_log_idx); +} + +size_t NuKeeperLogStore::next_slot() const +{ + std::lock_guard lock(changelog_lock); + return changelog.getNextEntryIndex(); +} + +nuraft::ptr NuKeeperLogStore::last_entry() const +{ + std::lock_guard lock(changelog_lock); + return changelog.getLastEntry(); +} + +size_t NuKeeperLogStore::append(nuraft::ptr & entry) +{ + std::lock_guard lock(changelog_lock); + size_t idx = changelog.getNextEntryIndex(); + changelog.appendEntry(idx, entry); + return idx; +} + + +void NuKeeperLogStore::write_at(size_t index, nuraft::ptr & entry) +{ + std::lock_guard lock(changelog_lock); + changelog.writeAt(index, entry); +} + +nuraft::ptr>> NuKeeperLogStore::log_entries(size_t start, size_t end) +{ + std::lock_guard lock(changelog_lock); + return changelog.getLogEntriesBetween(start, end); +} + +nuraft::ptr NuKeeperLogStore::entry_at(size_t index) +{ + std::lock_guard lock(changelog_lock); + return changelog.entryAt(index); +} + +size_t NuKeeperLogStore::term_at(size_t index) +{ + std::lock_guard lock(changelog_lock); + auto entry = changelog.entryAt(index); + if (entry) + return entry->get_term(); + return 0; +} + +nuraft::ptr NuKeeperLogStore::pack(size_t index, int32_t cnt) +{ + std::lock_guard lock(changelog_lock); + return changelog.serializeEntriesToBuffer(index, cnt); +} + +bool NuKeeperLogStore::compact(size_t last_log_index) +{ + std::lock_guard lock(changelog_lock); + changelog.compact(last_log_index); + return true; +} + +bool NuKeeperLogStore::flush() +{ + std::lock_guard lock(changelog_lock); + changelog.flush(); + return true; +} + +void NuKeeperLogStore::apply_pack(size_t index, nuraft::buffer & pack) +{ + std::lock_guard lock(changelog_lock); + changelog.applyEntriesFromBuffer(index, pack); +} + +} From af95db2fcf8ac6c974e9a3d546392419b1ba6a5f Mon Sep 17 00:00:00 2001 From: alesapin Date: Wed, 17 Feb 2021 11:00:17 +0300 Subject: [PATCH 175/381] Test log storage instead of changelog --- src/Coordination/Changelog.cpp | 7 +- src/Coordination/NuKeeperLogStore.cpp | 6 + src/Coordination/NuKeeperLogStore.h | 2 + src/Coordination/tests/gtest_for_build.cpp | 327 +++++++++++++-------- 4 files changed, 218 insertions(+), 124 deletions(-) diff --git a/src/Coordination/Changelog.cpp b/src/Coordination/Changelog.cpp index e4d8b13ec37..4f095974836 100644 --- a/src/Coordination/Changelog.cpp +++ b/src/Coordination/Changelog.cpp @@ -212,6 +212,8 @@ Changelog::Changelog(const std::string & changelogs_dir_, size_t rotate_interval void 
Changelog::readChangelogAndInitWriter(size_t from_log_idx) { size_t read_from_last = 0; + start_index = from_log_idx == 0 ? 1 : from_log_idx; + size_t total_read = 0; for (const auto & [start_id, changelog_file] : existing_changelogs) { ChangelogName parsed_name = getChangelogName(changelog_file); @@ -219,11 +221,10 @@ void Changelog::readChangelogAndInitWriter(size_t from_log_idx) { ChangelogReader reader(changelog_file); read_from_last = reader.readChangelog(logs, from_log_idx, index_to_start_pos); + total_read += read_from_last; } } - start_index = from_log_idx == 0 ? 1 : from_log_idx; - if (existing_changelogs.size() > 0 && read_from_last < rotate_interval) { auto str_name = existing_changelogs.rbegin()->second; @@ -233,7 +234,7 @@ void Changelog::readChangelogAndInitWriter(size_t from_log_idx) } else { - rotate(start_index); + rotate(start_index + total_read); } } diff --git a/src/Coordination/NuKeeperLogStore.cpp b/src/Coordination/NuKeeperLogStore.cpp index fa0631e14ad..fa8d6d6c299 100644 --- a/src/Coordination/NuKeeperLogStore.cpp +++ b/src/Coordination/NuKeeperLogStore.cpp @@ -94,4 +94,10 @@ void NuKeeperLogStore::apply_pack(size_t index, nuraft::buffer & pack) changelog.applyEntriesFromBuffer(index, pack); } +size_t NuKeeperLogStore::size() const +{ + std::lock_guard lock(changelog_lock); + return changelog.size(); +} + } diff --git a/src/Coordination/NuKeeperLogStore.h b/src/Coordination/NuKeeperLogStore.h index 981dc3f24e7..49d5dbfdf7c 100644 --- a/src/Coordination/NuKeeperLogStore.h +++ b/src/Coordination/NuKeeperLogStore.h @@ -39,6 +39,8 @@ public: bool flush() override; + size_t size() const; + private: mutable std::mutex changelog_lock; Changelog changelog; diff --git a/src/Coordination/tests/gtest_for_build.cpp b/src/Coordination/tests/gtest_for_build.cpp index 6d91ba95111..8328d93d9cf 100644 --- a/src/Coordination/tests/gtest_for_build.cpp +++ b/src/Coordination/tests/gtest_for_build.cpp @@ -114,10 +114,10 @@ struct SimpliestRaftServer if (!raft_instance) { - std::cerr << "Failed to initialize launcher (see the message " - "in the log file)." 
<< std::endl; + std::cerr << "Failed to initialize launcher" << std::endl; exit(-1); } + std::cout << "init Raft instance " << server_id; for (size_t ii = 0; ii < 20; ++ii) { @@ -370,33 +370,33 @@ DB::LogEntryPtr getLogEntry(const std::string & s, size_t term) TEST(CoordinationTest, ChangelogTestSimple) { ChangelogDirTest test("./logs"); - DB::Changelog changelog("./logs", 5); - changelog.readChangelogAndInitWriter(1); + DB::NuKeeperLogStore changelog("./logs", 5); + changelog.init(1); auto entry = getLogEntry("hello world", 77); - changelog.appendEntry(1, entry); - EXPECT_EQ(changelog.getNextEntryIndex(), 2); - EXPECT_EQ(changelog.getStartIndex(), 1); - EXPECT_EQ(changelog.getLastEntry()->get_term(), 77); - EXPECT_EQ(changelog.entryAt(1)->get_term(), 77); - EXPECT_EQ(changelog.getLogEntriesBetween(1, 2)->size(), 1); + changelog.append(entry); + EXPECT_EQ(changelog.next_slot(), 2); + EXPECT_EQ(changelog.start_index(), 1); + EXPECT_EQ(changelog.last_entry()->get_term(), 77); + EXPECT_EQ(changelog.entry_at(1)->get_term(), 77); + EXPECT_EQ(changelog.log_entries(1, 2)->size(), 1); } TEST(CoordinationTest, ChangelogTestFile) { ChangelogDirTest test("./logs"); - DB::Changelog changelog("./logs", 5); - changelog.readChangelogAndInitWriter(1); + DB::NuKeeperLogStore changelog("./logs", 5); + changelog.init(1); auto entry = getLogEntry("hello world", 77); - changelog.appendEntry(1, entry); + changelog.append(entry); EXPECT_TRUE(fs::exists("./logs/changelog_1_5.bin")); for (const auto & p : fs::directory_iterator("./logs")) EXPECT_EQ(p.path(), "./logs/changelog_1_5.bin"); - changelog.appendEntry(2, entry); - changelog.appendEntry(3, entry); - changelog.appendEntry(4, entry); - changelog.appendEntry(5, entry); - changelog.appendEntry(6, entry); + changelog.append(entry); + changelog.append(entry); + changelog.append(entry); + changelog.append(entry); + changelog.append(entry); EXPECT_TRUE(fs::exists("./logs/changelog_1_5.bin")); EXPECT_TRUE(fs::exists("./logs/changelog_6_10.bin")); @@ -405,26 +405,26 @@ TEST(CoordinationTest, ChangelogTestFile) TEST(CoordinationTest, ChangelogReadWrite) { ChangelogDirTest test("./logs"); - DB::Changelog changelog("./logs", 1000); - changelog.readChangelogAndInitWriter(1); + DB::NuKeeperLogStore changelog("./logs", 1000); + changelog.init(1); for (size_t i = 0; i < 10; ++i) { auto entry = getLogEntry("hello world", i * 10); - changelog.appendEntry(changelog.getNextEntryIndex(), entry); + changelog.append(entry); } EXPECT_EQ(changelog.size(), 10); - DB::Changelog changelog_reader("./logs", 1000); - changelog_reader.readChangelogAndInitWriter(1); + DB::NuKeeperLogStore changelog_reader("./logs", 1000); + changelog_reader.init(1); EXPECT_EQ(changelog_reader.size(), 10); - EXPECT_EQ(changelog_reader.getLastEntry()->get_term(), changelog.getLastEntry()->get_term()); - EXPECT_EQ(changelog_reader.getStartIndex(), changelog.getStartIndex()); - EXPECT_EQ(changelog_reader.getNextEntryIndex(), changelog.getNextEntryIndex()); + EXPECT_EQ(changelog_reader.last_entry()->get_term(), changelog.last_entry()->get_term()); + EXPECT_EQ(changelog_reader.start_index(), changelog.start_index()); + EXPECT_EQ(changelog_reader.next_slot(), changelog.next_slot()); for (size_t i = 0; i < 10; ++i) - EXPECT_EQ(changelog_reader.entryAt(i + 1)->get_term(), changelog.entryAt(i + 1)->get_term()); + EXPECT_EQ(changelog_reader.entry_at(i + 1)->get_term(), changelog.entry_at(i + 1)->get_term()); - auto entries_from_range_read = changelog_reader.getLogEntriesBetween(1, 11); - auto entries_from_range = 
changelog.getLogEntriesBetween(1, 11); + auto entries_from_range_read = changelog_reader.log_entries(1, 11); + auto entries_from_range = changelog.log_entries(1, 11); EXPECT_EQ(entries_from_range_read->size(), entries_from_range->size()); EXPECT_EQ(10, entries_from_range->size()); } @@ -432,55 +432,55 @@ TEST(CoordinationTest, ChangelogReadWrite) TEST(CoordinationTest, ChangelogWriteAt) { ChangelogDirTest test("./logs"); - DB::Changelog changelog("./logs", 1000); - changelog.readChangelogAndInitWriter(1); + DB::NuKeeperLogStore changelog("./logs", 1000); + changelog.init(1); for (size_t i = 0; i < 10; ++i) { auto entry = getLogEntry("hello world", i * 10); - changelog.appendEntry(changelog.getNextEntryIndex(), entry); + changelog.append(entry); } EXPECT_EQ(changelog.size(), 10); auto entry = getLogEntry("writer", 77); - changelog.writeAt(7, entry); + changelog.write_at(7, entry); EXPECT_EQ(changelog.size(), 7); - EXPECT_EQ(changelog.getLastEntry()->get_term(), 77); - EXPECT_EQ(changelog.entryAt(7)->get_term(), 77); - EXPECT_EQ(changelog.getNextEntryIndex(), 8); + EXPECT_EQ(changelog.last_entry()->get_term(), 77); + EXPECT_EQ(changelog.entry_at(7)->get_term(), 77); + EXPECT_EQ(changelog.next_slot(), 8); - DB::Changelog changelog_reader("./logs", 1000); - changelog_reader.readChangelogAndInitWriter(1); + DB::NuKeeperLogStore changelog_reader("./logs", 1000); + changelog_reader.init(1); EXPECT_EQ(changelog_reader.size(), changelog.size()); - EXPECT_EQ(changelog_reader.getLastEntry()->get_term(), changelog.getLastEntry()->get_term()); - EXPECT_EQ(changelog_reader.getStartIndex(), changelog.getStartIndex()); - EXPECT_EQ(changelog_reader.getNextEntryIndex(), changelog.getNextEntryIndex()); + EXPECT_EQ(changelog_reader.last_entry()->get_term(), changelog.last_entry()->get_term()); + EXPECT_EQ(changelog_reader.start_index(), changelog.start_index()); + EXPECT_EQ(changelog_reader.next_slot(), changelog.next_slot()); } TEST(CoordinationTest, ChangelogTestAppendAfterRead) { ChangelogDirTest test("./logs"); - DB::Changelog changelog("./logs", 5); - changelog.readChangelogAndInitWriter(1); + DB::NuKeeperLogStore changelog("./logs", 5); + changelog.init(1); for (size_t i = 0; i < 7; ++i) { auto entry = getLogEntry("hello world", i * 10); - changelog.appendEntry(changelog.getNextEntryIndex(), entry); + changelog.append(entry); } EXPECT_EQ(changelog.size(), 7); EXPECT_TRUE(fs::exists("./logs/changelog_1_5.bin")); EXPECT_TRUE(fs::exists("./logs/changelog_6_10.bin")); - DB::Changelog changelog_reader("./logs", 5); - changelog_reader.readChangelogAndInitWriter(1); + DB::NuKeeperLogStore changelog_reader("./logs", 5); + changelog_reader.init(1); EXPECT_EQ(changelog_reader.size(), 7); for (size_t i = 7; i < 10; ++i) { auto entry = getLogEntry("hello world", i * 10); - changelog_reader.appendEntry(changelog_reader.getNextEntryIndex(), entry); + changelog_reader.append(entry); } EXPECT_EQ(changelog_reader.size(), 10); EXPECT_TRUE(fs::exists("./logs/changelog_1_5.bin")); @@ -493,7 +493,7 @@ TEST(CoordinationTest, ChangelogTestAppendAfterRead) EXPECT_EQ(logs_count, 2); auto entry = getLogEntry("someentry", 77); - changelog_reader.appendEntry(changelog_reader.getNextEntryIndex(), entry); + changelog_reader.append(entry); EXPECT_EQ(changelog_reader.size(), 11); EXPECT_TRUE(fs::exists("./logs/changelog_1_5.bin")); EXPECT_TRUE(fs::exists("./logs/changelog_6_10.bin")); @@ -509,13 +509,13 @@ TEST(CoordinationTest, ChangelogTestAppendAfterRead) TEST(CoordinationTest, ChangelogTestCompaction) { ChangelogDirTest 
test("./logs"); - DB::Changelog changelog("./logs", 5); - changelog.readChangelogAndInitWriter(1); + DB::NuKeeperLogStore changelog("./logs", 5); + changelog.init(1); for (size_t i = 0; i < 3; ++i) { auto entry = getLogEntry("hello world", i * 10); - changelog.appendEntry(changelog.getNextEntryIndex(), entry); + changelog.append(entry); } EXPECT_EQ(changelog.size(), 3); @@ -523,15 +523,19 @@ TEST(CoordinationTest, ChangelogTestCompaction) changelog.compact(2); EXPECT_EQ(changelog.size(), 1); - EXPECT_EQ(changelog.getStartIndex(), 3); - EXPECT_EQ(changelog.getNextEntryIndex(), 4); - EXPECT_EQ(changelog.getLastEntry()->get_term(), 20); + EXPECT_EQ(changelog.start_index(), 3); + EXPECT_EQ(changelog.next_slot(), 4); + EXPECT_EQ(changelog.last_entry()->get_term(), 20); EXPECT_TRUE(fs::exists("./logs/changelog_1_5.bin")); - changelog.appendEntry(changelog.getNextEntryIndex(), getLogEntry("hello world", 30)); - changelog.appendEntry(changelog.getNextEntryIndex(), getLogEntry("hello world", 40)); - changelog.appendEntry(changelog.getNextEntryIndex(), getLogEntry("hello world", 50)); - changelog.appendEntry(changelog.getNextEntryIndex(), getLogEntry("hello world", 60)); + auto e1 = getLogEntry("hello world", 30); + changelog.append(e1); + auto e2 = getLogEntry("hello world", 40); + changelog.append(e2); + auto e3 = getLogEntry("hello world", 50); + changelog.append(e3); + auto e4 = getLogEntry("hello world", 60); + changelog.append(e4); EXPECT_TRUE(fs::exists("./logs/changelog_1_5.bin")); EXPECT_TRUE(fs::exists("./logs/changelog_6_10.bin")); @@ -542,109 +546,110 @@ TEST(CoordinationTest, ChangelogTestCompaction) EXPECT_TRUE(fs::exists("./logs/changelog_6_10.bin")); EXPECT_EQ(changelog.size(), 1); - EXPECT_EQ(changelog.getStartIndex(), 7); - EXPECT_EQ(changelog.getNextEntryIndex(), 8); - EXPECT_EQ(changelog.getLastEntry()->get_term(), 60); + EXPECT_EQ(changelog.start_index(), 7); + EXPECT_EQ(changelog.next_slot(), 8); + EXPECT_EQ(changelog.last_entry()->get_term(), 60); /// And we able to read it - DB::Changelog changelog_reader("./logs", 5); - changelog_reader.readChangelogAndInitWriter(7); + DB::NuKeeperLogStore changelog_reader("./logs", 5); + changelog_reader.init(7); EXPECT_EQ(changelog_reader.size(), 1); - EXPECT_EQ(changelog_reader.getStartIndex(), 7); - EXPECT_EQ(changelog_reader.getNextEntryIndex(), 8); - EXPECT_EQ(changelog_reader.getLastEntry()->get_term(), 60); + EXPECT_EQ(changelog_reader.start_index(), 7); + EXPECT_EQ(changelog_reader.next_slot(), 8); + EXPECT_EQ(changelog_reader.last_entry()->get_term(), 60); } TEST(CoordinationTest, ChangelogTestBatchOperations) { ChangelogDirTest test("./logs"); - DB::Changelog changelog("./logs", 100); - changelog.readChangelogAndInitWriter(1); + DB::NuKeeperLogStore changelog("./logs", 100); + changelog.init(1); for (size_t i = 0; i < 10; ++i) { auto entry = getLogEntry(std::to_string(i) + "_hello_world", i * 10); - changelog.appendEntry(changelog.getNextEntryIndex(), entry); + changelog.append(entry); } EXPECT_EQ(changelog.size(), 10); - auto entries = changelog.serializeEntriesToBuffer(1, 5); + auto entries = changelog.pack(1, 5); - DB::Changelog apply_changelog("./logs", 100); - apply_changelog.readChangelogAndInitWriter(1); + DB::NuKeeperLogStore apply_changelog("./logs", 100); + apply_changelog.init(1); for (size_t i = 0; i < 10; ++i) { - EXPECT_EQ(apply_changelog.entryAt(i + 1)->get_term(), i * 10); + EXPECT_EQ(apply_changelog.entry_at(i + 1)->get_term(), i * 10); } EXPECT_EQ(apply_changelog.size(), 10); - 
apply_changelog.applyEntriesFromBuffer(8, *entries); + apply_changelog.apply_pack(8, *entries); EXPECT_EQ(apply_changelog.size(), 12); - EXPECT_EQ(apply_changelog.getStartIndex(), 1); - EXPECT_EQ(apply_changelog.getNextEntryIndex(), 13); + EXPECT_EQ(apply_changelog.start_index(), 1); + EXPECT_EQ(apply_changelog.next_slot(), 13); for (size_t i = 0; i < 7; ++i) { - EXPECT_EQ(apply_changelog.entryAt(i + 1)->get_term(), i * 10); + EXPECT_EQ(apply_changelog.entry_at(i + 1)->get_term(), i * 10); } - EXPECT_EQ(apply_changelog.entryAt(8)->get_term(), 0); - EXPECT_EQ(apply_changelog.entryAt(9)->get_term(), 10); - EXPECT_EQ(apply_changelog.entryAt(10)->get_term(), 20); - EXPECT_EQ(apply_changelog.entryAt(11)->get_term(), 30); - EXPECT_EQ(apply_changelog.entryAt(12)->get_term(), 40); + EXPECT_EQ(apply_changelog.entry_at(8)->get_term(), 0); + EXPECT_EQ(apply_changelog.entry_at(9)->get_term(), 10); + EXPECT_EQ(apply_changelog.entry_at(10)->get_term(), 20); + EXPECT_EQ(apply_changelog.entry_at(11)->get_term(), 30); + EXPECT_EQ(apply_changelog.entry_at(12)->get_term(), 40); } TEST(CoordinationTest, ChangelogTestBatchOperationsEmpty) { ChangelogDirTest test("./logs"); - DB::Changelog changelog("./logs", 100); - changelog.readChangelogAndInitWriter(1); + DB::NuKeeperLogStore changelog("./logs", 100); + changelog.init(1); for (size_t i = 0; i < 10; ++i) { auto entry = getLogEntry(std::to_string(i) + "_hello_world", i * 10); - changelog.appendEntry(changelog.getNextEntryIndex(), entry); + changelog.append(entry); } EXPECT_EQ(changelog.size(), 10); - auto entries = changelog.serializeEntriesToBuffer(5, 5); + auto entries = changelog.pack(5, 5); ChangelogDirTest test1("./logs1"); - DB::Changelog changelog_new("./logs1", 100); - changelog_new.readChangelogAndInitWriter(1); + DB::NuKeeperLogStore changelog_new("./logs1", 100); + changelog_new.init(1); EXPECT_EQ(changelog_new.size(), 0); - changelog_new.applyEntriesFromBuffer(5, *entries); + changelog_new.apply_pack(5, *entries); EXPECT_EQ(changelog_new.size(), 5); - EXPECT_EQ(changelog_new.getStartIndex(), 5); - EXPECT_EQ(changelog_new.getNextEntryIndex(), 10); + EXPECT_EQ(changelog_new.start_index(), 5); + EXPECT_EQ(changelog_new.next_slot(), 10); for (size_t i = 4; i < 9; ++i) - EXPECT_EQ(changelog_new.entryAt(i + 1)->get_term(), i * 10); + EXPECT_EQ(changelog_new.entry_at(i + 1)->get_term(), i * 10); - changelog_new.appendEntry(changelog_new.getNextEntryIndex(), getLogEntry("hello_world", 110)); + auto e = getLogEntry("hello_world", 110); + changelog_new.append(e); EXPECT_EQ(changelog_new.size(), 6); - EXPECT_EQ(changelog_new.getStartIndex(), 5); - EXPECT_EQ(changelog_new.getNextEntryIndex(), 11); + EXPECT_EQ(changelog_new.start_index(), 5); + EXPECT_EQ(changelog_new.next_slot(), 11); - DB::Changelog changelog_reader("./logs1", 100); - changelog_reader.readChangelogAndInitWriter(5); + DB::NuKeeperLogStore changelog_reader("./logs1", 100); + changelog_reader.init(5); } TEST(CoordinationTest, ChangelogTestWriteAtPreviousFile) { ChangelogDirTest test("./logs"); - DB::Changelog changelog("./logs", 5); - changelog.readChangelogAndInitWriter(1); + DB::NuKeeperLogStore changelog("./logs", 5); + changelog.init(1); for (size_t i = 0; i < 33; ++i) { auto entry = getLogEntry(std::to_string(i) + "_hello_world", i * 10); - changelog.appendEntry(changelog.getNextEntryIndex(), entry); + changelog.append(entry); } EXPECT_TRUE(fs::exists("./logs/changelog_1_5.bin")); @@ -657,11 +662,12 @@ TEST(CoordinationTest, ChangelogTestWriteAtPreviousFile) EXPECT_EQ(changelog.size(), 
33); - changelog.writeAt(7, getLogEntry("helloworld", 5555)); + auto e1 = getLogEntry("helloworld", 5555); + changelog.write_at(7, e1); EXPECT_EQ(changelog.size(), 7); - EXPECT_EQ(changelog.getStartIndex(), 1); - EXPECT_EQ(changelog.getNextEntryIndex(), 8); - EXPECT_EQ(changelog.getLastEntry()->get_term(), 5555); + EXPECT_EQ(changelog.start_index(), 1); + EXPECT_EQ(changelog.next_slot(), 8); + EXPECT_EQ(changelog.last_entry()->get_term(), 5555); EXPECT_TRUE(fs::exists("./logs/changelog_1_5.bin")); EXPECT_TRUE(fs::exists("./logs/changelog_6_10.bin")); @@ -672,24 +678,24 @@ TEST(CoordinationTest, ChangelogTestWriteAtPreviousFile) EXPECT_FALSE(fs::exists("./logs/changelog_26_30.bin")); EXPECT_FALSE(fs::exists("./logs/changelog_31_35.bin")); - DB::Changelog changelog_read("./logs", 5); - changelog_read.readChangelogAndInitWriter(1); + DB::NuKeeperLogStore changelog_read("./logs", 5); + changelog_read.init(1); EXPECT_EQ(changelog_read.size(), 7); - EXPECT_EQ(changelog_read.getStartIndex(), 1); - EXPECT_EQ(changelog_read.getNextEntryIndex(), 8); - EXPECT_EQ(changelog_read.getLastEntry()->get_term(), 5555); + EXPECT_EQ(changelog_read.start_index(), 1); + EXPECT_EQ(changelog_read.next_slot(), 8); + EXPECT_EQ(changelog_read.last_entry()->get_term(), 5555); } TEST(CoordinationTest, ChangelogTestWriteAtFileBorder) { ChangelogDirTest test("./logs"); - DB::Changelog changelog("./logs", 5); - changelog.readChangelogAndInitWriter(1); + DB::NuKeeperLogStore changelog("./logs", 5); + changelog.init(1); for (size_t i = 0; i < 33; ++i) { auto entry = getLogEntry(std::to_string(i) + "_hello_world", i * 10); - changelog.appendEntry(changelog.getNextEntryIndex(), entry); + changelog.append(entry); } EXPECT_TRUE(fs::exists("./logs/changelog_1_5.bin")); @@ -702,11 +708,12 @@ TEST(CoordinationTest, ChangelogTestWriteAtFileBorder) EXPECT_EQ(changelog.size(), 33); - changelog.writeAt(11, getLogEntry("helloworld", 5555)); + auto e1 = getLogEntry("helloworld", 5555); + changelog.write_at(11, e1); EXPECT_EQ(changelog.size(), 11); - EXPECT_EQ(changelog.getStartIndex(), 1); - EXPECT_EQ(changelog.getNextEntryIndex(), 12); - EXPECT_EQ(changelog.getLastEntry()->get_term(), 5555); + EXPECT_EQ(changelog.start_index(), 1); + EXPECT_EQ(changelog.next_slot(), 12); + EXPECT_EQ(changelog.last_entry()->get_term(), 5555); EXPECT_TRUE(fs::exists("./logs/changelog_1_5.bin")); EXPECT_TRUE(fs::exists("./logs/changelog_6_10.bin")); @@ -717,12 +724,90 @@ TEST(CoordinationTest, ChangelogTestWriteAtFileBorder) EXPECT_FALSE(fs::exists("./logs/changelog_26_30.bin")); EXPECT_FALSE(fs::exists("./logs/changelog_31_35.bin")); - DB::Changelog changelog_read("./logs", 5); - changelog_read.readChangelogAndInitWriter(1); + DB::NuKeeperLogStore changelog_read("./logs", 5); + changelog_read.init(1); EXPECT_EQ(changelog_read.size(), 11); - EXPECT_EQ(changelog_read.getStartIndex(), 1); - EXPECT_EQ(changelog_read.getNextEntryIndex(), 12); - EXPECT_EQ(changelog_read.getLastEntry()->get_term(), 5555); + EXPECT_EQ(changelog_read.start_index(), 1); + EXPECT_EQ(changelog_read.next_slot(), 12); + EXPECT_EQ(changelog_read.last_entry()->get_term(), 5555); +} + +TEST(CoordinationTest, ChangelogTestWriteAtAllFiles) +{ + ChangelogDirTest test("./logs"); + DB::NuKeeperLogStore changelog("./logs", 5); + changelog.init(1); + + for (size_t i = 0; i < 33; ++i) + { + auto entry = getLogEntry(std::to_string(i) + "_hello_world", i * 10); + changelog.append(entry); + } + + EXPECT_TRUE(fs::exists("./logs/changelog_1_5.bin")); + 
EXPECT_TRUE(fs::exists("./logs/changelog_6_10.bin")); + EXPECT_TRUE(fs::exists("./logs/changelog_11_15.bin")); + EXPECT_TRUE(fs::exists("./logs/changelog_16_20.bin")); + EXPECT_TRUE(fs::exists("./logs/changelog_21_25.bin")); + EXPECT_TRUE(fs::exists("./logs/changelog_26_30.bin")); + EXPECT_TRUE(fs::exists("./logs/changelog_31_35.bin")); + + EXPECT_EQ(changelog.size(), 33); + + auto e1 = getLogEntry("helloworld", 5555); + changelog.write_at(1, e1); + EXPECT_EQ(changelog.size(), 1); + EXPECT_EQ(changelog.start_index(), 1); + EXPECT_EQ(changelog.next_slot(), 2); + EXPECT_EQ(changelog.last_entry()->get_term(), 5555); + + EXPECT_TRUE(fs::exists("./logs/changelog_1_5.bin")); + + EXPECT_FALSE(fs::exists("./logs/changelog_6_10.bin")); + EXPECT_FALSE(fs::exists("./logs/changelog_11_15.bin")); + EXPECT_FALSE(fs::exists("./logs/changelog_16_20.bin")); + EXPECT_FALSE(fs::exists("./logs/changelog_21_25.bin")); + EXPECT_FALSE(fs::exists("./logs/changelog_26_30.bin")); + EXPECT_FALSE(fs::exists("./logs/changelog_31_35.bin")); +} + +TEST(CoordinationTest, ChangelogTestStartNewLogAfterRead) +{ + ChangelogDirTest test("./logs"); + DB::NuKeeperLogStore changelog("./logs", 5); + changelog.init(1); + + for (size_t i = 0; i < 35; ++i) + { + auto entry = getLogEntry(std::to_string(i) + "_hello_world", i * 10); + changelog.append(entry); + } + EXPECT_EQ(changelog.size(), 35); + EXPECT_TRUE(fs::exists("./logs/changelog_1_5.bin")); + EXPECT_TRUE(fs::exists("./logs/changelog_6_10.bin")); + EXPECT_TRUE(fs::exists("./logs/changelog_11_15.bin")); + EXPECT_TRUE(fs::exists("./logs/changelog_16_20.bin")); + EXPECT_TRUE(fs::exists("./logs/changelog_21_25.bin")); + EXPECT_TRUE(fs::exists("./logs/changelog_26_30.bin")); + EXPECT_TRUE(fs::exists("./logs/changelog_31_35.bin")); + EXPECT_FALSE(fs::exists("./logs/changelog_36_40.bin")); + + + DB::NuKeeperLogStore changelog_reader("./logs", 5); + changelog_reader.init(1); + + auto entry = getLogEntry("36_hello_world", 360); + changelog_reader.append(entry); + + EXPECT_EQ(changelog_reader.size(), 36); + EXPECT_TRUE(fs::exists("./logs/changelog_1_5.bin")); + EXPECT_TRUE(fs::exists("./logs/changelog_6_10.bin")); + EXPECT_TRUE(fs::exists("./logs/changelog_11_15.bin")); + EXPECT_TRUE(fs::exists("./logs/changelog_16_20.bin")); + EXPECT_TRUE(fs::exists("./logs/changelog_21_25.bin")); + EXPECT_TRUE(fs::exists("./logs/changelog_26_30.bin")); + EXPECT_TRUE(fs::exists("./logs/changelog_31_35.bin")); + EXPECT_TRUE(fs::exists("./logs/changelog_36_40.bin")); } #endif From acf843a01a9ff7677188dfabbebd4a861a2a7d5a Mon Sep 17 00:00:00 2001 From: alesapin Date: Wed, 17 Feb 2021 12:00:12 +0300 Subject: [PATCH 176/381] Slightly more optimal --- src/Coordination/Changelog.cpp | 88 ++++++++++++---------- src/Coordination/Changelog.h | 11 ++- src/Coordination/tests/gtest_for_build.cpp | 57 ++++++++++++++ 3 files changed, 116 insertions(+), 40 deletions(-) diff --git a/src/Coordination/Changelog.cpp b/src/Coordination/Changelog.cpp index 4f095974836..9e1ed557430 100644 --- a/src/Coordination/Changelog.cpp +++ b/src/Coordination/Changelog.cpp @@ -39,21 +39,15 @@ namespace static constexpr auto DEFAULT_PREFIX = "changelog"; -struct ChangelogName -{ - std::string prefix; - size_t from_log_idx; - size_t to_log_idx; -}; - -std::string formatChangelogPath(const std::string & prefix, const ChangelogName & name) +std::string formatChangelogPath(const std::string & prefix, const ChangelogFileDescription & name) { std::filesystem::path path(prefix); path /= std::filesystem::path(name.prefix + "_" + 
std::to_string(name.from_log_idx) + "_" + std::to_string(name.to_log_idx) + ".bin"); return path; } -ChangelogName getChangelogName(const std::string & path_str) + +ChangelogFileDescription getChangelogFileDescription(const std::string & path_str) { std::filesystem::path path(path_str); std::string filename = path.stem(); @@ -62,10 +56,11 @@ ChangelogName getChangelogName(const std::string & path_str) if (filename_parts.size() < 3) throw Exception(ErrorCodes::CORRUPTED_DATA, "Invalid changelog {}", path_str); - ChangelogName result; + ChangelogFileDescription result; result.prefix = filename_parts[0]; result.from_log_idx = parse(filename_parts[1]); result.to_log_idx = parse(filename_parts[2]); + result.path = path_str; return result; } @@ -204,8 +199,8 @@ Changelog::Changelog(const std::string & changelogs_dir_, size_t rotate_interval for (const auto & p : fs::directory_iterator(changelogs_dir)) { - auto name = getChangelogName(p.path()); - existing_changelogs[name.from_log_idx] = p.path(); + auto file_description = getChangelogFileDescription(p.path()); + existing_changelogs[file_description.from_log_idx] = file_description; } } @@ -214,22 +209,40 @@ void Changelog::readChangelogAndInitWriter(size_t from_log_idx) size_t read_from_last = 0; start_index = from_log_idx == 0 ? 1 : from_log_idx; size_t total_read = 0; - for (const auto & [start_id, changelog_file] : existing_changelogs) + size_t entries_in_last = 0; + size_t incomplete_log_idx = 0; + for (const auto & [start_idx, changelog_description] : existing_changelogs) { - ChangelogName parsed_name = getChangelogName(changelog_file); - if (parsed_name.to_log_idx >= from_log_idx) + entries_in_last = changelog_description.to_log_idx - changelog_description.from_log_idx + 1; + + if (changelog_description.to_log_idx >= from_log_idx) { - ChangelogReader reader(changelog_file); + ChangelogReader reader(changelog_description.path); read_from_last = reader.readChangelog(logs, from_log_idx, index_to_start_pos); total_read += read_from_last; + + /// May happen after truncate and crash + if (read_from_last < entries_in_last) + { + incomplete_log_idx = start_idx; + break; + } } } - if (existing_changelogs.size() > 0 && read_from_last < rotate_interval) + if (incomplete_log_idx != 0) { - auto str_name = existing_changelogs.rbegin()->second; - auto parsed_name = getChangelogName(str_name); - current_writer = std::make_unique(str_name, WriteMode::Append, parsed_name.from_log_idx); + for (auto itr = existing_changelogs.upper_bound(incomplete_log_idx); itr != existing_changelogs.end();) + { + std::filesystem::remove(itr->second.path); + itr = existing_changelogs.erase(itr); + } + } + + if (existing_changelogs.size() > 0 && read_from_last < entries_in_last) + { + auto description = existing_changelogs.rbegin()->second; + current_writer = std::make_unique(description.path, WriteMode::Append, description.from_log_idx); current_writer->setEntriesWritten(read_from_last); } else @@ -243,14 +256,14 @@ void Changelog::rotate(size_t new_start_log_idx) if (current_writer) current_writer->flush(); - ChangelogName new_name; - new_name.prefix = DEFAULT_PREFIX; - new_name.from_log_idx = new_start_log_idx; - new_name.to_log_idx = new_start_log_idx + rotate_interval - 1; + ChangelogFileDescription new_description; + new_description.prefix = DEFAULT_PREFIX; + new_description.from_log_idx = new_start_log_idx; + new_description.to_log_idx = new_start_log_idx + rotate_interval - 1; - auto new_log_path = formatChangelogPath(changelogs_dir, new_name); - 
existing_changelogs[new_start_log_idx] = new_log_path; - current_writer = std::make_unique(new_log_path, WriteMode::Rewrite, new_start_log_idx); + new_description.path = formatChangelogPath(changelogs_dir, new_description); + existing_changelogs[new_start_log_idx] = new_description; + current_writer = std::make_unique(new_description.path, WriteMode::Rewrite, new_start_log_idx); } ChangelogRecord Changelog::buildRecord(size_t index, nuraft::ptr log_entry) const @@ -301,15 +314,14 @@ void Changelog::writeAt(size_t index, nuraft::ptr log_entry) if (need_rollback) { auto index_changelog = existing_changelogs.lower_bound(index); - std::string fname; + ChangelogFileDescription description; if (index_changelog->first == index) - fname = index_changelog->second; + description = index_changelog->second; else - fname = std::prev(index_changelog)->second; + description = std::prev(index_changelog)->second; - current_writer = std::make_unique(fname, WriteMode::Append, index_changelog->first); - auto formated_name = getChangelogName(fname); - current_writer->setEntriesWritten(formated_name.to_log_idx - formated_name.from_log_idx + 1); + current_writer = std::make_unique(description.path, WriteMode::Append, index_changelog->first); + current_writer->setEntriesWritten(description.to_log_idx - description.from_log_idx + 1); } auto entries_written = current_writer->getEntriesWritten(); @@ -320,7 +332,7 @@ void Changelog::writeAt(size_t index, nuraft::ptr log_entry) auto to_remove_itr = existing_changelogs.upper_bound(index); for (auto itr = to_remove_itr; itr != existing_changelogs.end();) { - std::filesystem::remove(itr->second); + std::filesystem::remove(itr->second.path); itr = existing_changelogs.erase(itr); } } @@ -342,17 +354,16 @@ void Changelog::compact(size_t up_to_log_idx) { for (auto itr = existing_changelogs.begin(); itr != existing_changelogs.end();) { - ChangelogName parsed_name = getChangelogName(itr->second); - if (parsed_name.to_log_idx <= up_to_log_idx) + if (itr->second.to_log_idx <= up_to_log_idx) { - for (size_t idx = parsed_name.from_log_idx; idx <= parsed_name.to_log_idx; ++idx) + for (size_t idx = itr->second.from_log_idx; idx <= itr->second.to_log_idx; ++idx) { auto index_pos = index_to_start_pos.find(idx); if (index_pos == index_to_start_pos.end()) break; index_to_start_pos.erase(index_pos); } - std::filesystem::remove(itr->second); + std::filesystem::remove(itr->second.path); itr = existing_changelogs.erase(itr); } else @@ -366,7 +377,6 @@ void Changelog::compact(size_t up_to_log_idx) LogEntryPtr Changelog::getLastEntry() const { - static LogEntryPtr fake_entry = nuraft::cs_new(0, nuraft::buffer::alloc(sizeof(size_t))); size_t next_idx = getNextEntryIndex() - 1; diff --git a/src/Coordination/Changelog.h b/src/Coordination/Changelog.h index 7c352e7a91b..e154c1c70c6 100644 --- a/src/Coordination/Changelog.h +++ b/src/Coordination/Changelog.h @@ -45,6 +45,15 @@ struct ChangelogRecord nuraft::ptr blob; }; +struct ChangelogFileDescription +{ + std::string prefix; + size_t from_log_idx; + size_t to_log_idx; + + std::string path; +}; + class ChangelogWriter; class Changelog @@ -98,7 +107,7 @@ private: private: std::string changelogs_dir; - std::map existing_changelogs; + std::map existing_changelogs; std::unique_ptr current_writer; IndexToOffset index_to_start_pos; const size_t rotate_interval; diff --git a/src/Coordination/tests/gtest_for_build.cpp b/src/Coordination/tests/gtest_for_build.cpp index 8328d93d9cf..76dd08a6d33 100644 --- a/src/Coordination/tests/gtest_for_build.cpp +++ 
b/src/Coordination/tests/gtest_for_build.cpp @@ -36,7 +36,9 @@ struct ChangelogDirTest , drop(drop_) { if (fs::exists(path)) + { EXPECT_TRUE(false) << "Path " << path << " already exists, remove it to run test"; + } fs::create_directory(path); } @@ -810,4 +812,59 @@ TEST(CoordinationTest, ChangelogTestStartNewLogAfterRead) EXPECT_TRUE(fs::exists("./logs/changelog_36_40.bin")); } + +TEST(CoordinationTest, ChangelogTestReadAfterBrokenTruncate) +{ + ChangelogDirTest test("./logs"); + + DB::NuKeeperLogStore changelog("./logs", 5); + changelog.init(1); + + for (size_t i = 0; i < 35; ++i) + { + auto entry = getLogEntry(std::to_string(i) + "_hello_world", i * 10); + changelog.append(entry); + } + EXPECT_EQ(changelog.size(), 35); + EXPECT_TRUE(fs::exists("./logs/changelog_1_5.bin")); + EXPECT_TRUE(fs::exists("./logs/changelog_6_10.bin")); + EXPECT_TRUE(fs::exists("./logs/changelog_11_15.bin")); + EXPECT_TRUE(fs::exists("./logs/changelog_16_20.bin")); + EXPECT_TRUE(fs::exists("./logs/changelog_21_25.bin")); + EXPECT_TRUE(fs::exists("./logs/changelog_26_30.bin")); + EXPECT_TRUE(fs::exists("./logs/changelog_31_35.bin")); + + DB::WriteBufferFromFile plain_buf("./logs/changelog_11_15.bin", DBMS_DEFAULT_BUFFER_SIZE, O_APPEND | O_CREAT | O_WRONLY); + plain_buf.truncate(0); + + DB::NuKeeperLogStore changelog_reader("./logs", 5); + changelog_reader.init(1); + + EXPECT_EQ(changelog_reader.size(), 10); + EXPECT_EQ(changelog_reader.last_entry()->get_term(), 90); + + EXPECT_TRUE(fs::exists("./logs/changelog_1_5.bin")); + EXPECT_TRUE(fs::exists("./logs/changelog_6_10.bin")); + EXPECT_TRUE(fs::exists("./logs/changelog_11_15.bin")); + + EXPECT_FALSE(fs::exists("./logs/changelog_16_20.bin")); + EXPECT_FALSE(fs::exists("./logs/changelog_21_25.bin")); + EXPECT_FALSE(fs::exists("./logs/changelog_26_30.bin")); + EXPECT_FALSE(fs::exists("./logs/changelog_31_35.bin")); + + auto entry = getLogEntry("h", 7777); + changelog_reader.append(entry); + EXPECT_EQ(changelog_reader.size(), 11); + EXPECT_EQ(changelog_reader.last_entry()->get_term(), 7777); + + EXPECT_TRUE(fs::exists("./logs/changelog_1_5.bin")); + EXPECT_TRUE(fs::exists("./logs/changelog_6_10.bin")); + EXPECT_TRUE(fs::exists("./logs/changelog_11_15.bin")); + + EXPECT_FALSE(fs::exists("./logs/changelog_16_20.bin")); + EXPECT_FALSE(fs::exists("./logs/changelog_21_25.bin")); + EXPECT_FALSE(fs::exists("./logs/changelog_26_30.bin")); + EXPECT_FALSE(fs::exists("./logs/changelog_31_35.bin")); +} + #endif From 9396bae2e2051e2d50faa0d8c1005465171db481 Mon Sep 17 00:00:00 2001 From: alesapin Date: Wed, 17 Feb 2021 14:53:47 +0300 Subject: [PATCH 177/381] More reliable test keeper tests --- src/Coordination/tests/gtest_for_build.cpp | 2 +- .../test_testkeeper_back_to_back/test.py | 536 +++++++++--------- .../__init__.py | 1 + .../configs/enable_test_keeper.xml | 21 + .../configs/logs_conf.xml | 12 + .../configs/use_test_keeper.xml | 8 + .../test_testkeeper_persistent_log/test.py | 124 ++++ 7 files changed, 444 insertions(+), 260 deletions(-) create mode 100644 tests/integration/test_testkeeper_persistent_log/__init__.py create mode 100644 tests/integration/test_testkeeper_persistent_log/configs/enable_test_keeper.xml create mode 100644 tests/integration/test_testkeeper_persistent_log/configs/logs_conf.xml create mode 100644 tests/integration/test_testkeeper_persistent_log/configs/use_test_keeper.xml create mode 100644 tests/integration/test_testkeeper_persistent_log/test.py diff --git a/src/Coordination/tests/gtest_for_build.cpp b/src/Coordination/tests/gtest_for_build.cpp 
index 76dd08a6d33..81e1751c08c 100644 --- a/src/Coordination/tests/gtest_for_build.cpp +++ b/src/Coordination/tests/gtest_for_build.cpp @@ -31,7 +31,7 @@ struct ChangelogDirTest { std::string path; bool drop; - ChangelogDirTest(std::string path_, bool drop_ = true) + explicit ChangelogDirTest(std::string path_, bool drop_ = true) : path(path_) , drop(drop_) { diff --git a/tests/integration/test_testkeeper_back_to_back/test.py b/tests/integration/test_testkeeper_back_to_back/test.py index 8ec54f1a883..dd4e1f98cfd 100644 --- a/tests/integration/test_testkeeper_back_to_back/test.py +++ b/tests/integration/test_testkeeper_back_to_back/test.py @@ -8,32 +8,23 @@ from multiprocessing.dummy import Pool cluster = ClickHouseCluster(__file__) node = cluster.add_instance('node', main_configs=['configs/enable_test_keeper.xml', 'configs/logs_conf.xml'], with_zookeeper=True) -from kazoo.client import KazooClient, KazooState - -_genuine_zk_instance = None -_fake_zk_instance = None +from kazoo.client import KazooClient, KazooState, KeeperState def get_genuine_zk(): - global _genuine_zk_instance - if not _genuine_zk_instance: - print("Zoo1", cluster.get_instance_ip("zoo1")) - _genuine_zk_instance = cluster.get_kazoo_client('zoo1') - return _genuine_zk_instance - + print("Zoo1", cluster.get_instance_ip("zoo1")) + return cluster.get_kazoo_client('zoo1') def get_fake_zk(): - global _fake_zk_instance - if not _fake_zk_instance: - print("node", cluster.get_instance_ip("node")) - _fake_zk_instance = KazooClient(hosts=cluster.get_instance_ip("node") + ":9181", timeout=30.0) - def reset_last_zxid_listener(state): - print("Fake zk callback called for state", state) - global _fake_zk_instance - if state != KazooState.CONNECTED: - _fake_zk_instance._reset() + print("node", cluster.get_instance_ip("node")) + _fake_zk_instance = KazooClient(hosts=cluster.get_instance_ip("node") + ":9181", timeout=30.0) + def reset_last_zxid_listener(state): + print("Fake zk callback called for state", state) + nonlocal _fake_zk_instance + if state != KazooState.CONNECTED: + _fake_zk_instance._reset() - _fake_zk_instance.add_listener(reset_last_zxid_listener) - _fake_zk_instance.start() + _fake_zk_instance.add_listener(reset_last_zxid_listener) + _fake_zk_instance.start() return _fake_zk_instance def random_string(length): @@ -44,6 +35,15 @@ def create_random_path(prefix="", depth=1): return prefix return create_random_path(os.path.join(prefix, random_string(3)), depth - 1) +def stop_zk(zk): + try: + if zk: + zk.stop() + zk.close() + except: + pass + + @pytest.fixture(scope="module") def started_cluster(): try: @@ -53,44 +53,46 @@ def started_cluster(): finally: cluster.shutdown() - if _genuine_zk_instance: - _genuine_zk_instance.stop() - _genuine_zk_instance.close() - if _fake_zk_instance: - _fake_zk_instance.stop() - _fake_zk_instance.close() def test_simple_commands(started_cluster): - genuine_zk = get_genuine_zk() - fake_zk = get_fake_zk() + try: + genuine_zk = get_genuine_zk() + fake_zk = get_fake_zk() - for zk in [genuine_zk, fake_zk]: - zk.create("/test_simple_commands", b"") - zk.create("/test_simple_commands/somenode1", b"hello") - zk.set("/test_simple_commands/somenode1", b"world") + for zk in [genuine_zk, fake_zk]: + zk.create("/test_simple_commands", b"") + zk.create("/test_simple_commands/somenode1", b"hello") + zk.set("/test_simple_commands/somenode1", b"world") - for zk in [genuine_zk, fake_zk]: - assert zk.exists("/test_simple_commands") - assert zk.exists("/test_simple_commands/somenode1") - 
print(zk.get("/test_simple_commands/somenode1")) - assert zk.get("/test_simple_commands/somenode1")[0] == b"world" + for zk in [genuine_zk, fake_zk]: + assert zk.exists("/test_simple_commands") + assert zk.exists("/test_simple_commands/somenode1") + print(zk.get("/test_simple_commands/somenode1")) + assert zk.get("/test_simple_commands/somenode1")[0] == b"world" + finally: + for zk in [genuine_zk, fake_zk]: + stop_zk(zk) def test_sequential_nodes(started_cluster): - genuine_zk = get_genuine_zk() - fake_zk = get_fake_zk() - genuine_zk.create("/test_sequential_nodes") - fake_zk.create("/test_sequential_nodes") - for i in range(1, 11): - genuine_zk.create("/test_sequential_nodes/" + ("a" * i) + "-", sequence=True) - genuine_zk.create("/test_sequential_nodes/" + ("b" * i)) - fake_zk.create("/test_sequential_nodes/" + ("a" * i) + "-", sequence=True) - fake_zk.create("/test_sequential_nodes/" + ("b" * i)) + try: + genuine_zk = get_genuine_zk() + fake_zk = get_fake_zk() + genuine_zk.create("/test_sequential_nodes") + fake_zk.create("/test_sequential_nodes") + for i in range(1, 11): + genuine_zk.create("/test_sequential_nodes/" + ("a" * i) + "-", sequence=True) + genuine_zk.create("/test_sequential_nodes/" + ("b" * i)) + fake_zk.create("/test_sequential_nodes/" + ("a" * i) + "-", sequence=True) + fake_zk.create("/test_sequential_nodes/" + ("b" * i)) - genuine_childs = list(sorted(genuine_zk.get_children("/test_sequential_nodes"))) - fake_childs = list(sorted(fake_zk.get_children("/test_sequential_nodes"))) - assert genuine_childs == fake_childs + genuine_childs = list(sorted(genuine_zk.get_children("/test_sequential_nodes"))) + fake_childs = list(sorted(fake_zk.get_children("/test_sequential_nodes"))) + assert genuine_childs == fake_childs + finally: + for zk in [genuine_zk, fake_zk]: + stop_zk(zk) def assert_eq_stats(stat1, stat2): @@ -102,130 +104,141 @@ def assert_eq_stats(stat1, stat2): assert stat1.numChildren == stat2.numChildren def test_stats(started_cluster): - genuine_zk = get_genuine_zk() - fake_zk = get_fake_zk() - genuine_zk.create("/test_stats_nodes") - fake_zk.create("/test_stats_nodes") - genuine_stats = genuine_zk.exists("/test_stats_nodes") - fake_stats = fake_zk.exists("/test_stats_nodes") - assert_eq_stats(genuine_stats, fake_stats) - for i in range(1, 11): - genuine_zk.create("/test_stats_nodes/" + ("a" * i) + "-", sequence=True) - genuine_zk.create("/test_stats_nodes/" + ("b" * i)) - fake_zk.create("/test_stats_nodes/" + ("a" * i) + "-", sequence=True) - fake_zk.create("/test_stats_nodes/" + ("b" * i)) + try: + genuine_zk = get_genuine_zk() + fake_zk = get_fake_zk() + genuine_zk.create("/test_stats_nodes") + fake_zk.create("/test_stats_nodes") + genuine_stats = genuine_zk.exists("/test_stats_nodes") + fake_stats = fake_zk.exists("/test_stats_nodes") + assert_eq_stats(genuine_stats, fake_stats) + for i in range(1, 11): + genuine_zk.create("/test_stats_nodes/" + ("a" * i) + "-", sequence=True) + genuine_zk.create("/test_stats_nodes/" + ("b" * i)) + fake_zk.create("/test_stats_nodes/" + ("a" * i) + "-", sequence=True) + fake_zk.create("/test_stats_nodes/" + ("b" * i)) - genuine_stats = genuine_zk.exists("/test_stats_nodes") - fake_stats = fake_zk.exists("/test_stats_nodes") - assert_eq_stats(genuine_stats, fake_stats) - for i in range(1, 11): - print("/test_stats_nodes/" + ("a" * i) + "-" + "{:010d}".format((i - 1) * 2)) - genuine_zk.delete("/test_stats_nodes/" + ("a" * i) + "-" + "{:010d}".format((i - 1) * 2)) - genuine_zk.delete("/test_stats_nodes/" + ("b" * i)) - 
fake_zk.delete("/test_stats_nodes/" + ("a" * i) + "-" + "{:010d}".format((i - 1) * 2)) - fake_zk.delete("/test_stats_nodes/" + ("b" * i)) + genuine_stats = genuine_zk.exists("/test_stats_nodes") + fake_stats = fake_zk.exists("/test_stats_nodes") + assert_eq_stats(genuine_stats, fake_stats) + for i in range(1, 11): + print("/test_stats_nodes/" + ("a" * i) + "-" + "{:010d}".format((i - 1) * 2)) + genuine_zk.delete("/test_stats_nodes/" + ("a" * i) + "-" + "{:010d}".format((i - 1) * 2)) + genuine_zk.delete("/test_stats_nodes/" + ("b" * i)) + fake_zk.delete("/test_stats_nodes/" + ("a" * i) + "-" + "{:010d}".format((i - 1) * 2)) + fake_zk.delete("/test_stats_nodes/" + ("b" * i)) - genuine_stats = genuine_zk.exists("/test_stats_nodes") - fake_stats = fake_zk.exists("/test_stats_nodes") - print(genuine_stats) - print(fake_stats) - assert_eq_stats(genuine_stats, fake_stats) - for i in range(100): - genuine_zk.set("/test_stats_nodes", ("q" * i).encode()) - fake_zk.set("/test_stats_nodes", ("q" * i).encode()) + genuine_stats = genuine_zk.exists("/test_stats_nodes") + fake_stats = fake_zk.exists("/test_stats_nodes") + print(genuine_stats) + print(fake_stats) + assert_eq_stats(genuine_stats, fake_stats) + for i in range(100): + genuine_zk.set("/test_stats_nodes", ("q" * i).encode()) + fake_zk.set("/test_stats_nodes", ("q" * i).encode()) - genuine_stats = genuine_zk.exists("/test_stats_nodes") - fake_stats = fake_zk.exists("/test_stats_nodes") - print(genuine_stats) - print(fake_stats) - assert_eq_stats(genuine_stats, fake_stats) + genuine_stats = genuine_zk.exists("/test_stats_nodes") + fake_stats = fake_zk.exists("/test_stats_nodes") + print(genuine_stats) + print(fake_stats) + assert_eq_stats(genuine_stats, fake_stats) + finally: + for zk in [genuine_zk, fake_zk]: + stop_zk(zk) def test_watchers(started_cluster): - genuine_zk = get_genuine_zk() - fake_zk = get_fake_zk() - genuine_zk.create("/test_data_watches") - fake_zk.create("/test_data_watches") - genuine_data_watch_data = None + try: + genuine_zk = get_genuine_zk() + fake_zk = get_fake_zk() + genuine_zk.create("/test_data_watches") + fake_zk.create("/test_data_watches") + genuine_data_watch_data = None - def genuine_callback(event): - print("Genuine data watch called") - nonlocal genuine_data_watch_data - genuine_data_watch_data = event + def genuine_callback(event): + print("Genuine data watch called") + nonlocal genuine_data_watch_data + genuine_data_watch_data = event - fake_data_watch_data = None - def fake_callback(event): - print("Fake data watch called") - nonlocal fake_data_watch_data - fake_data_watch_data = event + fake_data_watch_data = None + def fake_callback(event): + print("Fake data watch called") + nonlocal fake_data_watch_data + fake_data_watch_data = event - genuine_zk.get("/test_data_watches", watch=genuine_callback) - fake_zk.get("/test_data_watches", watch=fake_callback) + genuine_zk.get("/test_data_watches", watch=genuine_callback) + fake_zk.get("/test_data_watches", watch=fake_callback) - print("Calling set genuine") - genuine_zk.set("/test_data_watches", b"a") - print("Calling set fake") - fake_zk.set("/test_data_watches", b"a") - time.sleep(3) + print("Calling set genuine") + genuine_zk.set("/test_data_watches", b"a") + print("Calling set fake") + fake_zk.set("/test_data_watches", b"a") + time.sleep(3) - print("Genuine data", genuine_data_watch_data) - print("Fake data", fake_data_watch_data) - assert genuine_data_watch_data == fake_data_watch_data + print("Genuine data", genuine_data_watch_data) + print("Fake data", 
fake_data_watch_data) + assert genuine_data_watch_data == fake_data_watch_data - genuine_children = None - def genuine_child_callback(event): - print("Genuine child watch called") - nonlocal genuine_children - genuine_children = event + genuine_children = None + def genuine_child_callback(event): + print("Genuine child watch called") + nonlocal genuine_children + genuine_children = event - fake_children = None - def fake_child_callback(event): - print("Fake child watch called") - nonlocal fake_children - fake_children = event + fake_children = None + def fake_child_callback(event): + print("Fake child watch called") + nonlocal fake_children + fake_children = event - genuine_zk.get_children("/test_data_watches", watch=genuine_child_callback) - fake_zk.get_children("/test_data_watches", watch=fake_child_callback) + genuine_zk.get_children("/test_data_watches", watch=genuine_child_callback) + fake_zk.get_children("/test_data_watches", watch=fake_child_callback) - print("Calling genuine child") - genuine_zk.create("/test_data_watches/child", b"b") - print("Calling fake child") - fake_zk.create("/test_data_watches/child", b"b") + print("Calling genuine child") + genuine_zk.create("/test_data_watches/child", b"b") + print("Calling fake child") + fake_zk.create("/test_data_watches/child", b"b") - time.sleep(3) + time.sleep(3) - print("Genuine children", genuine_children) - print("Fake children", fake_children) - assert genuine_children == fake_children + print("Genuine children", genuine_children) + print("Fake children", fake_children) + assert genuine_children == fake_children + finally: + for zk in [genuine_zk, fake_zk]: + stop_zk(zk) def test_multitransactions(started_cluster): - genuine_zk = get_genuine_zk() - fake_zk = get_fake_zk() - for zk in [genuine_zk, fake_zk]: - zk.create('/test_multitransactions') - t = zk.transaction() - t.create('/test_multitransactions/freddy') - t.create('/test_multitransactions/fred', ephemeral=True) - t.create('/test_multitransactions/smith', sequence=True) - results = t.commit() - assert len(results) == 3 - assert results[0] == '/test_multitransactions/freddy' - assert results[2].startswith('/test_multitransactions/smith0') is True - - from kazoo.exceptions import RolledBackError, NoNodeError - for i, zk in enumerate([genuine_zk, fake_zk]): - print("Processing ZK", i) - t = zk.transaction() - t.create('/test_multitransactions/q') - t.delete('/test_multitransactions/a') - t.create('/test_multitransactions/x') - results = t.commit() - print("Results", results) - assert results[0].__class__ == RolledBackError - assert results[1].__class__ == NoNodeError - assert zk.exists('/test_multitransactions/q') is None - assert zk.exists('/test_multitransactions/a') is None - assert zk.exists('/test_multitransactions/x') is None + try: + genuine_zk = get_genuine_zk() + fake_zk = get_fake_zk() + for zk in [genuine_zk, fake_zk]: + zk.create('/test_multitransactions') + t = zk.transaction() + t.create('/test_multitransactions/freddy') + t.create('/test_multitransactions/fred', ephemeral=True) + t.create('/test_multitransactions/smith', sequence=True) + results = t.commit() + assert len(results) == 3 + assert results[0] == '/test_multitransactions/freddy' + assert results[2].startswith('/test_multitransactions/smith0') is True + from kazoo.exceptions import RolledBackError, NoNodeError + for i, zk in enumerate([genuine_zk, fake_zk]): + print("Processing ZK", i) + t = zk.transaction() + t.create('/test_multitransactions/q') + t.delete('/test_multitransactions/a') + 
t.create('/test_multitransactions/x') + results = t.commit() + print("Results", results) + assert results[0].__class__ == RolledBackError + assert results[1].__class__ == NoNodeError + assert zk.exists('/test_multitransactions/q') is None + assert zk.exists('/test_multitransactions/a') is None + assert zk.exists('/test_multitransactions/x') is None + finally: + for zk in [genuine_zk, fake_zk]: + stop_zk(zk) def exists(zk, path): result = zk.exists(path) @@ -278,13 +291,13 @@ class Request(object): arg_str = ', '.join([str(k) + "=" + str(v) for k, v in self.arguments.items()]) return "ZKRequest name {} with arguments {}".format(self.name, arg_str) -def generate_requests(iters=1): +def generate_requests(prefix="/", iters=1): requests = [] existing_paths = [] for i in range(iters): for _ in range(100): rand_length = random.randint(0, 10) - path = "/" + path = prefix for j in range(1, rand_length): path = create_random_path(path, 1) existing_paths.append(path) @@ -322,31 +335,43 @@ def generate_requests(iters=1): def test_random_requests(started_cluster): - requests = generate_requests(10) - genuine_zk = get_genuine_zk() - fake_zk = get_fake_zk() - for i, request in enumerate(requests): - genuine_throw = False - fake_throw = False - fake_result = None - genuine_result = None - try: - genuine_result = request.callback(genuine_zk) - except Exception as ex: - genuine_throw = True + try: + requests = generate_requests("/test_random_requests", 10) + print("Generated", len(requests), "requests") + genuine_zk = get_genuine_zk() + fake_zk = get_fake_zk() + genuine_zk.create("/test_random_requests") + fake_zk.create("/test_random_requests") + for i, request in enumerate(requests): + genuine_throw = False + fake_throw = False + fake_result = None + genuine_result = None + try: + genuine_result = request.callback(genuine_zk) + except Exception as ex: + print("i", i, "request", request) + print("Genuine exception", str(ex)) + genuine_throw = True - try: - fake_result = request.callback(fake_zk) - except Exception as ex: - fake_throw = True + try: + fake_result = request.callback(fake_zk) + except Exception as ex: + print("i", i, "request", request) + print("Fake exception", str(ex)) + fake_throw = True - assert fake_throw == genuine_throw, "Fake throw genuine not or vise versa" - assert fake_result == genuine_result, "Zookeeper results differ" - root_children_genuine = [elem for elem in list(sorted(genuine_zk.get_children("/"))) if elem not in ('clickhouse', 'zookeeper')] - root_children_fake = [elem for elem in list(sorted(fake_zk.get_children("/"))) if elem not in ('clickhouse', 'zookeeper')] - assert root_children_fake == root_children_genuine + assert fake_throw == genuine_throw, "Fake throw genuine not or vise versa request {}" + assert fake_result == genuine_result, "Zookeeper results differ" + root_children_genuine = [elem for elem in list(sorted(genuine_zk.get_children("/test_random_requests"))) if elem not in ('clickhouse', 'zookeeper')] + root_children_fake = [elem for elem in list(sorted(fake_zk.get_children("/test_random_requests"))) if elem not in ('clickhouse', 'zookeeper')] + assert root_children_fake == root_children_genuine + finally: + for zk in [genuine_zk, fake_zk]: + stop_zk(zk) def test_end_of_session(started_cluster): + fake_zk1 = None fake_zk2 = None genuine_zk1 = None @@ -401,13 +426,8 @@ def test_end_of_session(started_cluster): assert fake_ephemeral_event == genuine_ephemeral_event finally: - try: - for zk in [fake_zk1, fake_zk2, genuine_zk1, genuine_zk2]: - if zk: - zk.stop() - 
zk.close() - except: - pass + for zk in [fake_zk1, fake_zk2, genuine_zk1, genuine_zk2]: + stop_zk(zk) def test_end_of_watches_session(started_cluster): fake_zk1 = None @@ -442,91 +462,89 @@ def test_end_of_watches_session(started_cluster): assert dummy_set == 2 finally: - try: - for zk in [fake_zk1, fake_zk2]: - if zk: - zk.stop() - zk.close() - except: - pass + for zk in [fake_zk1, fake_zk2]: + stop_zk(zk) def test_concurrent_watches(started_cluster): - fake_zk = get_fake_zk() - fake_zk.restart() - global_path = "/test_concurrent_watches_0" - fake_zk.create(global_path) + try: + fake_zk = get_fake_zk() + fake_zk.restart() + global_path = "/test_concurrent_watches_0" + fake_zk.create(global_path) - dumb_watch_triggered_counter = 0 - all_paths_triggered = [] + dumb_watch_triggered_counter = 0 + all_paths_triggered = [] - existing_path = [] - all_paths_created = [] - watches_created = 0 - def create_path_and_watch(i): - nonlocal watches_created - nonlocal all_paths_created - fake_zk.ensure_path(global_path + "/" + str(i)) - # new function each time - def dumb_watch(event): - nonlocal dumb_watch_triggered_counter - dumb_watch_triggered_counter += 1 - nonlocal all_paths_triggered - all_paths_triggered.append(event.path) + existing_path = [] + all_paths_created = [] + watches_created = 0 + def create_path_and_watch(i): + nonlocal watches_created + nonlocal all_paths_created + fake_zk.ensure_path(global_path + "/" + str(i)) + # new function each time + def dumb_watch(event): + nonlocal dumb_watch_triggered_counter + dumb_watch_triggered_counter += 1 + nonlocal all_paths_triggered + all_paths_triggered.append(event.path) - fake_zk.get(global_path + "/" + str(i), watch=dumb_watch) - all_paths_created.append(global_path + "/" + str(i)) - watches_created += 1 - existing_path.append(i) + fake_zk.get(global_path + "/" + str(i), watch=dumb_watch) + all_paths_created.append(global_path + "/" + str(i)) + watches_created += 1 + existing_path.append(i) - trigger_called = 0 - def trigger_watch(i): - nonlocal trigger_called - trigger_called += 1 - fake_zk.set(global_path + "/" + str(i), b"somevalue") - try: - existing_path.remove(i) - except: - pass - - def call(total): - for i in range(total): - create_path_and_watch(random.randint(0, 1000)) - time.sleep(random.random() % 0.5) + trigger_called = 0 + def trigger_watch(i): + nonlocal trigger_called + trigger_called += 1 + fake_zk.set(global_path + "/" + str(i), b"somevalue") try: - rand_num = random.choice(existing_path) - trigger_watch(rand_num) - except: - pass - while existing_path: - try: - rand_num = random.choice(existing_path) - trigger_watch(rand_num) + existing_path.remove(i) except: pass - p = Pool(10) - arguments = [100] * 10 - watches_must_be_created = sum(arguments) - watches_trigger_must_be_called = sum(arguments) - watches_must_be_triggered = sum(arguments) - p.map(call, arguments) - p.close() + def call(total): + for i in range(total): + create_path_and_watch(random.randint(0, 1000)) + time.sleep(random.random() % 0.5) + try: + rand_num = random.choice(existing_path) + trigger_watch(rand_num) + except: + pass + while existing_path: + try: + rand_num = random.choice(existing_path) + trigger_watch(rand_num) + except: + pass - # waiting for late watches - for i in range(50): - if dumb_watch_triggered_counter == watches_must_be_triggered: - break + p = Pool(10) + arguments = [100] * 10 + watches_must_be_created = sum(arguments) + watches_trigger_must_be_called = sum(arguments) + watches_must_be_triggered = sum(arguments) + p.map(call, arguments) + 
p.close() - time.sleep(0.1) + # waiting for late watches + for i in range(50): + if dumb_watch_triggered_counter == watches_must_be_triggered: + break - assert watches_created == watches_must_be_created - assert trigger_called >= watches_trigger_must_be_called - assert len(existing_path) == 0 - if dumb_watch_triggered_counter != watches_must_be_triggered: - print("All created paths", all_paths_created) - print("All triggerred paths", all_paths_triggered) - print("All paths len", len(all_paths_created)) - print("All triggered len", len(all_paths_triggered)) - print("Diff", list(set(all_paths_created) - set(all_paths_triggered))) + time.sleep(0.1) - assert dumb_watch_triggered_counter == watches_must_be_triggered + assert watches_created == watches_must_be_created + assert trigger_called >= watches_trigger_must_be_called + assert len(existing_path) == 0 + if dumb_watch_triggered_counter != watches_must_be_triggered: + print("All created paths", all_paths_created) + print("All triggerred paths", all_paths_triggered) + print("All paths len", len(all_paths_created)) + print("All triggered len", len(all_paths_triggered)) + print("Diff", list(set(all_paths_created) - set(all_paths_triggered))) + + assert dumb_watch_triggered_counter == watches_must_be_triggered + finally: + stop_zk(fake_zk) diff --git a/tests/integration/test_testkeeper_persistent_log/__init__.py b/tests/integration/test_testkeeper_persistent_log/__init__.py new file mode 100644 index 00000000000..e5a0d9b4834 --- /dev/null +++ b/tests/integration/test_testkeeper_persistent_log/__init__.py @@ -0,0 +1 @@ +#!/usr/bin/env python3 diff --git a/tests/integration/test_testkeeper_persistent_log/configs/enable_test_keeper.xml b/tests/integration/test_testkeeper_persistent_log/configs/enable_test_keeper.xml new file mode 100644 index 00000000000..a8b8991f959 --- /dev/null +++ b/tests/integration/test_testkeeper_persistent_log/configs/enable_test_keeper.xml @@ -0,0 +1,21 @@ + + + 9181 + 1 + /var/lib/clickhouse/coordination/log + + + 5000 + 10000 + trace + + + + + 1 + localhost + 44444 + + + + diff --git a/tests/integration/test_testkeeper_persistent_log/configs/logs_conf.xml b/tests/integration/test_testkeeper_persistent_log/configs/logs_conf.xml new file mode 100644 index 00000000000..318a6bca95d --- /dev/null +++ b/tests/integration/test_testkeeper_persistent_log/configs/logs_conf.xml @@ -0,0 +1,12 @@ + + 3 + + trace + /var/log/clickhouse-server/log.log + /var/log/clickhouse-server/log.err.log + 1000M + 10 + /var/log/clickhouse-server/stderr.log + /var/log/clickhouse-server/stdout.log + + diff --git a/tests/integration/test_testkeeper_persistent_log/configs/use_test_keeper.xml b/tests/integration/test_testkeeper_persistent_log/configs/use_test_keeper.xml new file mode 100644 index 00000000000..12dc7fd9447 --- /dev/null +++ b/tests/integration/test_testkeeper_persistent_log/configs/use_test_keeper.xml @@ -0,0 +1,8 @@ + + + + node1 + 9181 + + + diff --git a/tests/integration/test_testkeeper_persistent_log/test.py b/tests/integration/test_testkeeper_persistent_log/test.py new file mode 100644 index 00000000000..71fee94088f --- /dev/null +++ b/tests/integration/test_testkeeper_persistent_log/test.py @@ -0,0 +1,124 @@ +#!/usr/bin/env python3 +import pytest +from helpers.cluster import ClickHouseCluster +import random +import string +import os +import time +from kazoo.client import KazooClient, KazooState + + +cluster = ClickHouseCluster(__file__) + +node = cluster.add_instance('node', main_configs=['configs/enable_test_keeper.xml', 
'configs/logs_conf.xml', 'configs/use_test_keeper.xml'], stay_alive=True) + + +def random_string(length): + return ''.join(random.choices(string.ascii_lowercase + string.digits, k=length)) + +def create_random_path(prefix="", depth=1): + if depth == 0: + return prefix + return create_random_path(os.path.join(prefix, random_string(3)), depth - 1) + +@pytest.fixture(scope="module") +def started_cluster(): + try: + cluster.start() + + yield cluster + + finally: + cluster.shutdown() + +def get_connection_zk(nodename, timeout=30.0): + _fake_zk_instance = KazooClient(hosts=cluster.get_instance_ip(nodename) + ":9181", timeout=timeout) + def reset_listener(state): + nonlocal _fake_zk_instance + print("Fake zk callback called for state", state) + if state != KazooState.CONNECTED: + _fake_zk_instance._reset() + + _fake_zk_instance.add_listener(reset_listener) + _fake_zk_instance.start() + return _fake_zk_instance + +def test_state_after_restart(started_cluster): + try: + node_zk = None + node_zk2 = None + node_zk = get_connection_zk("node") + + node_zk.create("/test_state_after_restart", b"somevalue") + strs = [] + for i in range(100): + strs.append(random_string(123).encode()) + node_zk.create("/test_state_after_restart/node" + str(i), strs[i]) + + for i in range(100): + if i % 7 == 0: + node_zk.delete("/test_state_after_restart/node" + str(i)) + + node.restart_clickhouse(kill=True) + + node_zk2 = get_connection_zk("node") + + assert node_zk2.get("/test_state_after_restart")[0] == b"somevalue" + for i in range(100): + if i % 7 == 0: + assert node_zk2.exists("/test_state_after_restart/node" + str(i)) is None + else: + assert len(node_zk2.get("/test_state_after_restart/node" + str(i))[0]) == 123 + assert node_zk2.get("/test_state_after_restart/node" + str(i))[0] == strs[i] + finally: + try: + if node_zk is not None: + node_zk.stop() + node_zk.close() + + if node_zk2 is not None: + node_zk2.stop() + node_zk2.close() + except: + pass + + +# http://zookeeper-user.578899.n2.nabble.com/Why-are-ephemeral-nodes-written-to-disk-tp7583403p7583418.html +def test_ephemeral_after_restart(started_cluster): + try: + node_zk = None + node_zk2 = None + node_zk = get_connection_zk("node") + + node_zk.create("/test_ephemeral_after_restart", b"somevalue") + strs = [] + for i in range(100): + strs.append(random_string(123).encode()) + node_zk.create("/test_ephemeral_after_restart/node" + str(i), strs[i], ephemeral=True) + + for i in range(100): + if i % 7 == 0: + node_zk.delete("/test_ephemeral_after_restart/node" + str(i)) + + node.restart_clickhouse(kill=True) + + node_zk2 = get_connection_zk("node") + + assert node_zk2.get("/test_ephemeral_after_restart")[0] == b"somevalue" + for i in range(100): + if i % 7 == 0: + assert node_zk2.exists("/test_ephemeral_after_restart/node" + str(i)) is None + else: + assert len(node_zk2.get("/test_ephemeral_after_restart/node" + str(i))[0]) == 123 + assert node_zk2.get("/test_ephemeral_after_restart/node" + str(i))[0] == strs[i] + finally: + try: + if node_zk is not None: + node_zk.stop() + node_zk.close() + + if node_zk2 is not None: + node_zk2.stop() + node_zk2.close() + except: + pass From e82bd824d7818279db000f2019f5d2c82fefbb38 Mon Sep 17 00:00:00 2001 From: alesapin Date: Wed, 17 Feb 2021 16:07:59 +0300 Subject: [PATCH 178/381] Fix restart replica in test --- .../test.py | 36 ++++++++++++++----- 1 file changed, 28 insertions(+), 8 deletions(-) diff --git a/tests/integration/test_testkeeper_multinode_blocade_leader/test.py 
b/tests/integration/test_testkeeper_multinode_blocade_leader/test.py index 3b2867ef3c7..a1fd066ab83 100644 --- a/tests/integration/test_testkeeper_multinode_blocade_leader/test.py +++ b/tests/integration/test_testkeeper_multinode_blocade_leader/test.py @@ -87,7 +87,7 @@ def test_blocade_leader(started_cluster): for i in range(100): try: - node2.query("SYSTEM RESTART REPLICA ordinary.t1") + restart_replica_for_sure(node2, "ordinary.t1", "/clickhouse/t1/replicas/2") node2.query("INSERT INTO ordinary.t1 SELECT rand() FROM numbers(100)") break except Exception as ex: @@ -104,7 +104,7 @@ def test_blocade_leader(started_cluster): for i in range(100): try: - node3.query("SYSTEM RESTART REPLICA ordinary.t1") + restart_replica_for_sure(node3, "ordinary.t1", "/clickhouse/t1/replicas/3") node3.query("INSERT INTO ordinary.t1 SELECT rand() FROM numbers(100)") break except Exception as ex: @@ -122,7 +122,7 @@ def test_blocade_leader(started_cluster): for n, node in enumerate([node1, node2, node3]): for i in range(100): try: - node.query("SYSTEM RESTART REPLICA ordinary.t1") + restart_replica_for_sure(node, "ordinary.t1", "/clickhouse/t1/replicas/{}".format(n + 1)) break except Exception as ex: try: @@ -150,7 +150,7 @@ def test_blocade_leader(started_cluster): for n, node in enumerate([node1, node2, node3]): for i in range(100): try: - node.query("SYSTEM RESTART REPLICA ordinary.t1") + restart_replica_for_sure(node, "ordinary.t1", "/clickhouse/t1/replicas/{}".format(n + 1)) node.query("SYSTEM SYNC REPLICA ordinary.t1", timeout=10) break except Exception as ex: @@ -188,6 +188,25 @@ def dump_zk(node, zk_path, replica_path): print("Parts") print(node.query("SELECT name FROM system.zookeeper WHERE path = '{}/parts' FORMAT Vertical".format(replica_path))) +def restart_replica_for_sure(node, table_name, zk_replica_path): + fake_zk = None + try: + node.query("DETACH TABLE {}".format(table_name)) + fake_zk = get_fake_zk(node.name) + if fake_zk.exists(zk_replica_path + "/is_active") is not None: + fake_zk.delete(zk_replica_path + "/is_active") + + node.query("ATTACH TABLE {}".format(table_name)) + except Exception as ex: + print("Exception", ex) + raise ex + finally: + if fake_zk: + fake_zk.stop() + fake_zk.close() + + + # in extremely rare case it can take more than 5 minutes in debug build with sanitizer @pytest.mark.timeout(600) def test_blocade_leader_twice(started_cluster): @@ -211,7 +230,7 @@ def test_blocade_leader_twice(started_cluster): for i in range(100): try: - node2.query("SYSTEM RESTART REPLICA ordinary.t2") + restart_replica_for_sure(node2, "ordinary.t2", "/clickhouse/t2/replicas/2") node2.query("INSERT INTO ordinary.t2 SELECT rand() FROM numbers(100)") break except Exception as ex: @@ -228,7 +247,8 @@ def test_blocade_leader_twice(started_cluster): for i in range(100): try: - node3.query("SYSTEM RESTART REPLICA ordinary.t2") + + restart_replica_for_sure(node3, "ordinary.t2", "/clickhouse/t2/replicas/3") node3.query("INSERT INTO ordinary.t2 SELECT rand() FROM numbers(100)") break except Exception as ex: @@ -265,7 +285,7 @@ def test_blocade_leader_twice(started_cluster): for n, node in enumerate([node1, node2, node3]): for i in range(100): try: - node.query("SYSTEM RESTART REPLICA ordinary.t2") + restart_replica_for_sure(node, "ordinary.t2", "/clickhouse/t2/replicas/{}".format(n + 1)) break except Exception as ex: try: @@ -296,7 +316,7 @@ def test_blocade_leader_twice(started_cluster): for n, node in enumerate([node1, node2, node3]): for i in range(100): try: - node.query("SYSTEM RESTART REPLICA 
ordinary.t2") + restart_replica_for_sure(node, "ordinary.t2", "/clickhouse/t2/replicas/{}".format(n + 1)) node.query("SYSTEM SYNC REPLICA ordinary.t2", timeout=10) break except Exception as ex: From ee4d3f7aa485f851831b9ce96c8d1b4b78f90589 Mon Sep 17 00:00:00 2001 From: Evgeniia Sudarikova Date: Wed, 17 Feb 2021 16:23:10 +0300 Subject: [PATCH 179/381] edited ; in queries, edited after review --- docs/en/sql-reference/functions/array-functions.md | 12 ++++++------ .../example-datasets/brown-benchmark.md | 6 +++--- docs/ru/sql-reference/functions/array-functions.md | 12 ++++++------ 3 files changed, 15 insertions(+), 15 deletions(-) diff --git a/docs/en/sql-reference/functions/array-functions.md b/docs/en/sql-reference/functions/array-functions.md index 48c5176f0e1..528d81b0a0b 100644 --- a/docs/en/sql-reference/functions/array-functions.md +++ b/docs/en/sql-reference/functions/array-functions.md @@ -1315,7 +1315,7 @@ Type: [Int](../../sql-reference/data-types/int-uint.md) or [Float](../../sql-ref Query: ``` sql -SELECT arrayMin([1, 2, 4]) AS res +SELECT arrayMin([1, 2, 4]) AS res; ``` Result: @@ -1329,7 +1329,7 @@ Result: Query: ``` sql -SELECT arrayMin(x -> (-x), [1, 2, 4]) AS res +SELECT arrayMin(x -> (-x), [1, 2, 4]) AS res; ``` Result: @@ -1367,7 +1367,7 @@ Type: [Int](../../sql-reference/data-types/int-uint.md) or [Float](../../sql-ref Query: ```sql -SELECT arrayMax([1, 2, 4]) AS res +SELECT arrayMax([1, 2, 4]) AS res; ``` Result: @@ -1381,7 +1381,7 @@ Result: Query: ``` sql -SELECT arrayMax(x -> (-x), [1, 2, 4]) AS res +SELECT arrayMax(x -> (-x), [1, 2, 4]) AS res; ``` Result: @@ -1419,7 +1419,7 @@ Type: [Int](../../sql-reference/data-types/int-uint.md) or [Float](../../sql-ref Query: ```sql -SELECT arraySum([2,3]) AS res +SELECT arraySum([2,3]) AS res; ``` Result: @@ -1433,7 +1433,7 @@ Result: Query: ``` sql -SELECT arraySum(x -> x*x, [2, 3]) AS res +SELECT arraySum(x -> x*x, [2, 3]) AS res; ``` Result: diff --git a/docs/ru/getting-started/example-datasets/brown-benchmark.md b/docs/ru/getting-started/example-datasets/brown-benchmark.md index e4fe00ace93..23702e07fcd 100644 --- a/docs/ru/getting-started/example-datasets/brown-benchmark.md +++ b/docs/ru/getting-started/example-datasets/brown-benchmark.md @@ -5,7 +5,7 @@ toc_title: Brown University Benchmark # Brown University Benchmark -`MgBench` — это новый аналитический бенчмарк для данных журнала событий, сгенерированных машиной. Бенчмарк разработан [Andrew Crotty](http://cs.brown.edu/people/acrotty/). +`MgBench` — это аналитический тест производительности для данных журнала событий, сгенерированных машиной. Бенчмарк разработан [Andrew Crotty](http://cs.brown.edu/people/acrotty/). Скачать данные: ``` @@ -74,7 +74,7 @@ ENGINE = MergeTree() ORDER BY (event_type, log_time); ``` -Insert data: +Вставка данных: ``` clickhouse-client --query "INSERT INTO mgbench.logs1 FORMAT CSVWithNames" < mgbench1.csv @@ -82,7 +82,7 @@ clickhouse-client --query "INSERT INTO mgbench.logs2 FORMAT CSVWithNames" < mgbe clickhouse-client --query "INSERT INTO mgbench.logs3 FORMAT CSVWithNames" < mgbench3.csv ``` -Run benchmark queries: +Запуск тестов производительности: ``` -- Q1.1: What is the CPU/network utilization for each web server since midnight? 
diff --git a/docs/ru/sql-reference/functions/array-functions.md b/docs/ru/sql-reference/functions/array-functions.md index 7afd9da471e..9702ab13d5e 100644 --- a/docs/ru/sql-reference/functions/array-functions.md +++ b/docs/ru/sql-reference/functions/array-functions.md @@ -1162,7 +1162,7 @@ arrayMin(arr) Запрос: ``` sql -SELECT arrayMin([1, 2, 4]) AS res +SELECT arrayMin([1, 2, 4]) AS res; ``` Результат: @@ -1176,7 +1176,7 @@ SELECT arrayMin([1, 2, 4]) AS res Запрос: ``` sql -SELECT arrayMin(x -> (-x), [1, 2, 4]) AS res +SELECT arrayMin(x -> (-x), [1, 2, 4]) AS res; ``` Результат: @@ -1214,7 +1214,7 @@ arrayMax(arr) Запрос: ```sql -SELECT arrayMax([1, 2, 4]) AS res +SELECT arrayMax([1, 2, 4]) AS res; ``` Результат: @@ -1228,7 +1228,7 @@ SELECT arrayMax([1, 2, 4]) AS res Запрос: ``` sql -SELECT arrayMax(x -> (-x), [1, 2, 4]) AS res +SELECT arrayMax(x -> (-x), [1, 2, 4]) AS res; ``` Результат: @@ -1266,7 +1266,7 @@ arraySum(arr) Запрос: ```sql -SELECT arraySum([2,3]) AS res +SELECT arraySum([2,3]) AS res; ``` Результат: @@ -1280,7 +1280,7 @@ SELECT arraySum([2,3]) AS res Запрос: ``` sql -SELECT arraySum(x -> x*x, [2, 3]) AS res +SELECT arraySum(x -> x*x, [2, 3]) AS res; ``` Результат: From 499c100b12233e3a6fbd31066a4bac3914a650e1 Mon Sep 17 00:00:00 2001 From: alesapin Date: Wed, 17 Feb 2021 16:41:43 +0300 Subject: [PATCH 180/381] Better test --- .../test.py | 37 +++++++++++-------- 1 file changed, 22 insertions(+), 15 deletions(-) diff --git a/tests/integration/test_testkeeper_multinode_blocade_leader/test.py b/tests/integration/test_testkeeper_multinode_blocade_leader/test.py index a1fd066ab83..49d86ab9fe8 100644 --- a/tests/integration/test_testkeeper_multinode_blocade_leader/test.py +++ b/tests/integration/test_testkeeper_multinode_blocade_leader/test.py @@ -55,7 +55,6 @@ def get_fake_zk(nodename, timeout=30.0): _fake_zk_instance = KazooClient(hosts=cluster.get_instance_ip(nodename) + ":9181", timeout=timeout) def reset_listener(state): nonlocal _fake_zk_instance - print("Fake zk callback called for state", state) if state != KazooState.CONNECTED: _fake_zk_instance._reset() @@ -247,8 +246,8 @@ def test_blocade_leader_twice(started_cluster): for i in range(100): try: - restart_replica_for_sure(node3, "ordinary.t2", "/clickhouse/t2/replicas/3") + node3.query("SYSTEM SYNC REPLICA ordinary.t2", timeout=10) node3.query("INSERT INTO ordinary.t2 SELECT rand() FROM numbers(100)") break except Exception as ex: @@ -263,6 +262,10 @@ def test_blocade_leader_twice(started_cluster): dump_zk(node, '/clickhouse/t2', '/clickhouse/t2/replicas/{}'.format(num + 1)) assert False, "Cannot reconnect for node3" + node2.query("SYSTEM SYNC REPLICA ordinary.t2", timeout=10) + + assert node2.query("SELECT COUNT() FROM ordinary.t2") == "210\n" + assert node3.query("SELECT COUNT() FROM ordinary.t2") == "210\n" # Total network partition pm.partition_instances(node3, node2) @@ -281,7 +284,6 @@ def test_blocade_leader_twice(started_cluster): except Exception as ex: time.sleep(0.5) - for n, node in enumerate([node1, node2, node3]): for i in range(100): try: @@ -313,24 +315,29 @@ def test_blocade_leader_twice(started_cluster): dump_zk(node, '/clickhouse/t2', '/clickhouse/t2/replicas/{}'.format(num + 1)) assert False, "Cannot reconnect for node{}".format(n + 1) - for n, node in enumerate([node1, node2, node3]): for i in range(100): - try: - restart_replica_for_sure(node, "ordinary.t2", "/clickhouse/t2/replicas/{}".format(n + 1)) - node.query("SYSTEM SYNC REPLICA ordinary.t2", timeout=10) - break - except Exception as ex: + 
all_done = True + for n, node in enumerate([node1, node2, node3]): try: - node.query("ATTACH TABLE ordinary.t2") - except Exception as attach_ex: - print("Got exception node{}".format(n + 1), smaller_exception(attach_ex)) + restart_replica_for_sure(node, "ordinary.t2", "/clickhouse/t2/replicas/{}".format(n + 1)) + node.query("SYSTEM SYNC REPLICA ordinary.t2", timeout=10) + break + except Exception as ex: + all_done = False + try: + node.query("ATTACH TABLE ordinary.t2") + except Exception as attach_ex: + print("Got exception node{}".format(n + 1), smaller_exception(attach_ex)) - print("Got exception node{}".format(n + 1), smaller_exception(ex)) - time.sleep(0.5) + print("Got exception node{}".format(n + 1), smaller_exception(ex)) + time.sleep(0.5) + + if all_done: + break else: for num, node in enumerate([node1, node2, node3]): dump_zk(node, '/clickhouse/t2', '/clickhouse/t2/replicas/{}'.format(num + 1)) - assert False, "Cannot reconnect for node{}".format(n + 1) + assert False, "Cannot reconnect in i {} retries".format(i) assert node1.query("SELECT COUNT() FROM ordinary.t2") == "510\n" if node2.query("SELECT COUNT() FROM ordinary.t2") != "510\n": From 8cecb533ca53038fe70a55fc4aa46e7ab2b0bef9 Mon Sep 17 00:00:00 2001 From: Marvin Taschenberger <45663148+Taschenbergerm@users.noreply.github.com> Date: Wed, 17 Feb 2021 15:03:09 +0100 Subject: [PATCH 181/381] Update argmax.md --- .../aggregate-functions/reference/argmax.md | 10 +++++----- 1 file changed, 5 insertions(+), 5 deletions(-) diff --git a/docs/en/sql-reference/aggregate-functions/reference/argmax.md b/docs/en/sql-reference/aggregate-functions/reference/argmax.md index 7639117042f..1af188ad026 100644 --- a/docs/en/sql-reference/aggregate-functions/reference/argmax.md +++ b/docs/en/sql-reference/aggregate-functions/reference/argmax.md @@ -17,12 +17,12 @@ argMax(arg, val) or ``` sql -argMax(tuple(arg, val)) +argMax(tuple(arg1, arg2), val) ``` **Arguments** -- `arg` — Argument. +- `arg{i}` — Argument. - `val` — Value. **Returned value** @@ -33,7 +33,7 @@ Type: matches `arg` type. For tuple in the input: -- Tuple `(arg, val)`, where `val` is the maximum value and `arg` is a corresponding value. +- Tuple `(arg1, arg2)`, where `arg1` and `arg2` are the corresponding values. Type: [Tuple](../../../sql-reference/data-types/tuple.md). @@ -52,13 +52,13 @@ Input table: Query: ``` sql -SELECT argMax(user, salary), argMax(tuple(user, salary)) FROM salary; +SELECT argMax(user, salary), argMax(tuple(user, salary), salary) FROM salary; ``` Result: ``` text -┌─argMax(user, salary)─┬─argMax(tuple(user, salary))─┐ +┌─argMax(user, salary)─┬─argMax(tuple(user, salary), salary)─┐ │ director │ ('director',5000) │ └──────────────────────┴─────────────────────────────┘ ``` From bb4ced05f9da997c987c7f520f423fd3892bb7d0 Mon Sep 17 00:00:00 2001 From: alesapin Date: Wed, 17 Feb 2021 17:52:32 +0300 Subject: [PATCH 182/381] Fix fast test --- docker/test/fasttest/run.sh | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/docker/test/fasttest/run.sh b/docker/test/fasttest/run.sh index 90663102f17..202e2f12a1a 100755 --- a/docker/test/fasttest/run.sh +++ b/docker/test/fasttest/run.sh @@ -361,7 +361,7 @@ function run_tests stop_server ||: # Clean the data so that there is no interference from the previous test run. 
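    # For clarity: after this fix the brace pattern below, {{meta,}data,user_files,coordination},
    # is expected to expand to "$FASTTEST_DATA"/metadata, .../data, .../user_files and .../coordination,
    # so the coordination log left by a previous run is cleaned up as well.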
- rm -rf "$FASTTEST_DATA"/{{meta,}data,user_files} ||: + rm -rf "$FASTTEST_DATA"/{{meta,}data,user_files,coordination} ||: start_server From acb5fb8179c2845890635582332790c94995df83 Mon Sep 17 00:00:00 2001 From: Alexander Kazakov Date: Wed, 17 Feb 2021 20:58:04 +0300 Subject: [PATCH 183/381] Randomly shuffle replicas withing the same priority --- base/mysqlxx/PoolWithFailover.cpp | 12 ++++++++++++ 1 file changed, 12 insertions(+) diff --git a/base/mysqlxx/PoolWithFailover.cpp b/base/mysqlxx/PoolWithFailover.cpp index 5bee75aab1b..e2d612d6bc4 100644 --- a/base/mysqlxx/PoolWithFailover.cpp +++ b/base/mysqlxx/PoolWithFailover.cpp @@ -1,3 +1,6 @@ +#include +#include + #include @@ -7,6 +10,8 @@ static bool startsWith(const std::string & s, const char * prefix) return s.size() >= strlen(prefix) && 0 == memcmp(s.data(), prefix, strlen(prefix)); } +/// This is thread-safe +std::random_device rd; using namespace mysqlxx; @@ -33,6 +38,13 @@ PoolWithFailover::PoolWithFailover(const Poco::Util::AbstractConfiguration & con std::make_shared(config_, replica_name, default_connections_, max_connections_, config_name_.c_str())); } } + + static thread_local std::mt19937 rnd_generator(rd()); + for (auto & [_, replicas] : replicas_by_priority) + { + if (replicas.size() > 1) + std::shuffle(replicas.begin(), replicas.end(), rnd_generator); + } } else { From 0296d7d026ab3fb1a335d1a97a5154add718ad89 Mon Sep 17 00:00:00 2001 From: Alexander Kazakov Date: Wed, 17 Feb 2021 21:51:05 +0300 Subject: [PATCH 184/381] Added some explanations on randomization --- base/mysqlxx/PoolWithFailover.cpp | 7 ++++++- 1 file changed, 6 insertions(+), 1 deletion(-) diff --git a/base/mysqlxx/PoolWithFailover.cpp b/base/mysqlxx/PoolWithFailover.cpp index e2d612d6bc4..9132773f727 100644 --- a/base/mysqlxx/PoolWithFailover.cpp +++ b/base/mysqlxx/PoolWithFailover.cpp @@ -10,7 +10,7 @@ static bool startsWith(const std::string & s, const char * prefix) return s.size() >= strlen(prefix) && 0 == memcmp(s.data(), prefix, strlen(prefix)); } -/// This is thread-safe +/// This reads from "/dev/urandom" and thus is thread-safe std::random_device rd; using namespace mysqlxx; @@ -39,6 +39,11 @@ PoolWithFailover::PoolWithFailover(const Poco::Util::AbstractConfiguration & con } } + /// PoolWithFailover objects are stored in a cache inside PoolFactory. + /// This cache is reset by ExternalDictionariesLoader after every SYSTEM RELOAD DICTIONAR{Y|IES} + /// which triggers massive re-constructing of connection pools. + /// The state of PRNDGs like std::mt19937 is considered to be quite heavy + /// thus here we attempt to optimize its construction. 
static thread_local std::mt19937 rnd_generator(rd()); for (auto & [_, replicas] : replicas_by_priority) { From 6e244e7bb1722e23a9e616c7e8048ac2c8306885 Mon Sep 17 00:00:00 2001 From: alesapin Date: Wed, 17 Feb 2021 22:32:38 +0300 Subject: [PATCH 185/381] Trying without fsync --- src/Coordination/Changelog.cpp | 2 +- src/Coordination/Changelog.h | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/src/Coordination/Changelog.cpp b/src/Coordination/Changelog.cpp index 9e1ed557430..a9693b2a47b 100644 --- a/src/Coordination/Changelog.cpp +++ b/src/Coordination/Changelog.cpp @@ -298,7 +298,7 @@ void Changelog::appendEntry(size_t index, nuraft::ptr log_ent if (current_writer->getEntriesWritten() == rotate_interval) rotate(index); - auto offset = current_writer->appendRecord(buildRecord(index, log_entry), true); + auto offset = current_writer->appendRecord(buildRecord(index, log_entry), false); if (!index_to_start_pos.try_emplace(index, offset).second) throw Exception(ErrorCodes::LOGICAL_ERROR, "Record with index {} already exists", index); diff --git a/src/Coordination/Changelog.h b/src/Coordination/Changelog.h index e154c1c70c6..5f38f68750e 100644 --- a/src/Coordination/Changelog.h +++ b/src/Coordination/Changelog.h @@ -101,7 +101,7 @@ public: private: - void rotate(size_t new_start_log_idex); + void rotate(size_t new_start_log_idx); ChangelogRecord buildRecord(size_t index, nuraft::ptr log_entry) const; From ff663dc511a5daf955e559cdff0d47fa6a07f104 Mon Sep 17 00:00:00 2001 From: alesapin Date: Wed, 17 Feb 2021 23:36:25 +0300 Subject: [PATCH 186/381] Fsync at server shutdown --- src/Coordination/Changelog.cpp | 13 ++++++++++++- src/Coordination/InMemoryStateManager.cpp | 5 +++++ src/Coordination/InMemoryStateManager.h | 2 ++ src/Coordination/NuKeeperServer.cpp | 1 + 4 files changed, 20 insertions(+), 1 deletion(-) diff --git a/src/Coordination/Changelog.cpp b/src/Coordination/Changelog.cpp index a9693b2a47b..2d1bbfb4440 100644 --- a/src/Coordination/Changelog.cpp +++ b/src/Coordination/Changelog.cpp @@ -467,6 +467,17 @@ void Changelog::flush() current_writer->flush(); } -Changelog::~Changelog() = default; +Changelog::~Changelog() +{ + try + { + if (current_writer) + current_writer->flush(); + } + catch (...) + { + tryLogCurrentException(__PRETTY_FUNCTION__); + } +} } diff --git a/src/Coordination/InMemoryStateManager.cpp b/src/Coordination/InMemoryStateManager.cpp index 6c4e95b993a..0423d2466f2 100644 --- a/src/Coordination/InMemoryStateManager.cpp +++ b/src/Coordination/InMemoryStateManager.cpp @@ -66,6 +66,11 @@ void InMemoryStateManager::loadLogStore(size_t start_log_index) log_store->init(start_log_index); } +void InMemoryStateManager::flushLogStore() +{ + log_store->flush(); +} + void InMemoryStateManager::save_config(const nuraft::cluster_config & config) { // Just keep in memory in this example. 
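Taken together with the previous commit ("Trying without fsync"), the intended pattern is that appends are buffered during normal operation and the changelog is forced to disk once, at shutdown. A minimal sketch reusing the NuKeeperLogStore API exercised by the unit tests above (the getLogEntry helper comes from those tests; the explicit flush() call is an assumption about how flushLogStore() reaches the writer):

    DB::NuKeeperLogStore log_store("./logs", 5);
    log_store.init(1);

    auto entry = getLogEntry("hello_world", 77);
    log_store.append(entry);   /// buffered: appendEntry() now calls appendRecord(..., /* sync = */ false)

    /// At server shutdown, NuKeeperServer::shutdown() calls state_manager->flushLogStore(),
    /// which flushes (and thus fsyncs) whatever the current writer still holds.
    log_store.flush();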
diff --git a/src/Coordination/InMemoryStateManager.h b/src/Coordination/InMemoryStateManager.h index 8a7be7d0129..c53f00702d4 100644 --- a/src/Coordination/InMemoryStateManager.h +++ b/src/Coordination/InMemoryStateManager.h @@ -27,6 +27,8 @@ public: void loadLogStore(size_t start_log_index); + void flushLogStore(); + nuraft::ptr load_config() override { return cluster_config; } void save_config(const nuraft::cluster_config & config) override; diff --git a/src/Coordination/NuKeeperServer.cpp b/src/Coordination/NuKeeperServer.cpp index a4582a5fbb8..8556fa85231 100644 --- a/src/Coordination/NuKeeperServer.cpp +++ b/src/Coordination/NuKeeperServer.cpp @@ -67,6 +67,7 @@ void NuKeeperServer::startup() void NuKeeperServer::shutdown() { state_machine->shutdownStorage(); + state_manager->flushLogStore(); if (!launcher.shutdown(coordination_settings->shutdown_timeout.totalSeconds())) LOG_WARNING(&Poco::Logger::get("NuKeeperServer"), "Failed to shutdown RAFT server in {} seconds", 5); } From 65f2b6a0449f19e0488c5c66e013e9002b4949d3 Mon Sep 17 00:00:00 2001 From: Azat Khuzhin Date: Mon, 15 Feb 2021 10:18:37 +0300 Subject: [PATCH 187/381] test/fasttest: add gdb into docker image --- docker/test/fasttest/Dockerfile | 1 + 1 file changed, 1 insertion(+) diff --git a/docker/test/fasttest/Dockerfile b/docker/test/fasttest/Dockerfile index 03b7b2fc53a..64be52d8e30 100644 --- a/docker/test/fasttest/Dockerfile +++ b/docker/test/fasttest/Dockerfile @@ -47,6 +47,7 @@ RUN apt-get update \ expect \ fakeroot \ git \ + gdb \ gperf \ lld-${LLVM_VERSION} \ llvm-${LLVM_VERSION} \ From ee18f6a7ec23304c7ebc5128882d163d510525e0 Mon Sep 17 00:00:00 2001 From: Azat Khuzhin Date: Sun, 14 Feb 2021 23:34:14 +0300 Subject: [PATCH 188/381] test/fasttest: collect diagnosis by attaching with gdb in background Otherwise sometimes stacktraces may be lost [1]: [1]: https://clickhouse-test-reports.s3.yandex.net/20477/8ad20fcee5aaa642c2a2dd873d02103692d554f4/fast_test.html#fail1 --- docker/test/fasttest/run.sh | 12 ++++++++++++ 1 file changed, 12 insertions(+) diff --git a/docker/test/fasttest/run.sh b/docker/test/fasttest/run.sh index e6294b5d74d..fbdad93a553 100755 --- a/docker/test/fasttest/run.sh +++ b/docker/test/fasttest/run.sh @@ -107,6 +107,18 @@ function start_server fi echo "ClickHouse server pid '$server_pid' started and responded" + + echo " +handle all noprint +handle SIGSEGV stop print +handle SIGBUS stop print +handle SIGABRT stop print +continue +thread apply all backtrace +continue +" > script.gdb + + gdb -batch -command script.gdb -p "$server_pid" & } function clone_root From 9b72255ca4fd4d1ec7fd090dd9b39ab16ec6965e Mon Sep 17 00:00:00 2001 From: Alexey Milovidov Date: Thu, 18 Feb 2021 02:09:46 +0300 Subject: [PATCH 189/381] Implement compression for all columns except LowCardinality --- src/Columns/ColumnArray.cpp | 25 +++++++++++++- src/Columns/ColumnArray.h | 5 ++- src/Columns/ColumnDecimal.cpp | 25 ++++++++++++++ src/Columns/ColumnDecimal.h | 2 ++ src/Columns/ColumnFixedString.cpp | 30 ++++++++++++++++- src/Columns/ColumnFixedString.h | 2 ++ src/Columns/ColumnMap.h | 2 ++ src/Columns/ColumnNullable.cpp | 15 +++++++++ src/Columns/ColumnNullable.h | 2 ++ src/Columns/ColumnString.cpp | 54 +++++++++++++++++++++++++++++++ src/Columns/ColumnString.h | 2 ++ src/Columns/ColumnTuple.cpp | 24 +++++++++++++- src/Columns/ColumnTuple.h | 1 + src/Columns/ColumnUnique.h | 5 +++ 14 files changed, 188 insertions(+), 6 deletions(-) diff --git a/src/Columns/ColumnArray.cpp b/src/Columns/ColumnArray.cpp index 
8c0e06424e7..e8a48672435 100644 --- a/src/Columns/ColumnArray.cpp +++ b/src/Columns/ColumnArray.cpp @@ -7,6 +7,7 @@ #include #include #include +#include #include #include @@ -369,8 +370,12 @@ void ColumnArray::compareColumn(const IColumn & rhs, size_t rhs_row_num, compare_results, direction, nan_direction_hint); } + +namespace +{ + template -struct ColumnArray::Cmp +struct Cmp { const ColumnArray & parent; int nan_direction_hint; @@ -390,6 +395,9 @@ struct ColumnArray::Cmp } }; +} + + void ColumnArray::reserve(size_t n) { getOffsets().reserve(n); @@ -912,6 +920,21 @@ void ColumnArray::updatePermutationWithCollation(const Collator & collator, bool updatePermutationImpl(limit, res, equal_range, Cmp(*this, nan_direction_hint, &collator)); } +ColumnPtr ColumnArray::compress() const +{ + ColumnPtr data_compressed = data->compress(); + ColumnPtr offsets_compressed = offsets->compress(); + + size_t byte_size = data_compressed->byteSize() + offsets_compressed->byteSize(); + + return ColumnCompressed::create(size(), byte_size, + [data_compressed = std::move(data_compressed), offsets_compressed = std::move(offsets_compressed)] + { + return ColumnArray::create(data_compressed->decompress(), offsets_compressed->decompress()); + }); +} + + ColumnPtr ColumnArray::replicate(const Offsets & replicate_offsets) const { if (replicate_offsets.empty()) diff --git a/src/Columns/ColumnArray.h b/src/Columns/ColumnArray.h index e81ecbc1ca0..1caaf672d49 100644 --- a/src/Columns/ColumnArray.h +++ b/src/Columns/ColumnArray.h @@ -123,6 +123,8 @@ public: void gather(ColumnGathererStream & gatherer_stream) override; + ColumnPtr compress() const override; + void forEachSubcolumn(ColumnCallback callback) override { callback(offsets); @@ -183,9 +185,6 @@ private: template void updatePermutationImpl(size_t limit, Permutation & res, EqualRanges & equal_range, Comparator cmp) const; - - template - struct Cmp; }; diff --git a/src/Columns/ColumnDecimal.cpp b/src/Columns/ColumnDecimal.cpp index ddc971032b6..bb61f60706e 100644 --- a/src/Columns/ColumnDecimal.cpp +++ b/src/Columns/ColumnDecimal.cpp @@ -14,6 +14,7 @@ #include #include +#include #include @@ -346,6 +347,30 @@ void ColumnDecimal::gather(ColumnGathererStream & gatherer) gatherer.gather(*this); } +template +ColumnPtr ColumnDecimal::compress() const +{ + size_t source_size = data.size() * sizeof(T); + + /// Don't compress small blocks. + if (source_size < 4096) /// A wild guess. 
+ return ColumnCompressed::wrap(this->getPtr()); + + auto compressed = ColumnCompressed::compressBuffer(data.data(), source_size); + + if (!compressed) + return ColumnCompressed::wrap(this->getPtr()); + + return ColumnCompressed::create(data.size(), compressed->size(), + [compressed = std::move(compressed), column_size = data.size(), scale = this->scale] + { + auto res = ColumnDecimal::create(column_size, scale); + ColumnCompressed::decompressBuffer( + compressed->data(), res->getData().data(), compressed->size(), column_size * sizeof(T)); + return res; + }); +} + template void ColumnDecimal::getExtremes(Field & min, Field & max) const { diff --git a/src/Columns/ColumnDecimal.h b/src/Columns/ColumnDecimal.h index ef841292a7d..5016ddca791 100644 --- a/src/Columns/ColumnDecimal.h +++ b/src/Columns/ColumnDecimal.h @@ -172,6 +172,8 @@ public: return false; } + ColumnPtr compress() const override; + void insertValue(const T value) { data.push_back(value); } Container & getData() { return data; } diff --git a/src/Columns/ColumnFixedString.cpp b/src/Columns/ColumnFixedString.cpp index 55e387ff2ee..278c2fef5f8 100644 --- a/src/Columns/ColumnFixedString.cpp +++ b/src/Columns/ColumnFixedString.cpp @@ -1,6 +1,7 @@ #include - #include +#include + #include #include #include @@ -446,4 +447,31 @@ void ColumnFixedString::getExtremes(Field & min, Field & max) const get(max_idx, max); } +ColumnPtr ColumnFixedString::compress() const +{ + size_t source_size = chars.size() * n; + + /// Don't compress small blocks. + if (source_size < 4096) /// A wild guess. + return ColumnCompressed::wrap(this->getPtr()); + + auto compressed = ColumnCompressed::compressBuffer(chars.data(), source_size); + + if (!compressed) + return ColumnCompressed::wrap(this->getPtr()); + + size_t column_size = size(); + + return ColumnCompressed::create(column_size, compressed->size(), + [compressed = std::move(compressed), column_size, n = n] + { + size_t chars_size = n * column_size; + auto res = ColumnFixedString::create(n); + res->getChars().resize(chars_size); + ColumnCompressed::decompressBuffer( + compressed->data(), res->getChars().data(), compressed->size(), chars_size); + return res; + }); +} + } diff --git a/src/Columns/ColumnFixedString.h b/src/Columns/ColumnFixedString.h index 286b3a752dc..1bb7f922f3e 100644 --- a/src/Columns/ColumnFixedString.h +++ b/src/Columns/ColumnFixedString.h @@ -156,6 +156,8 @@ public: void gather(ColumnGathererStream & gatherer_stream) override; + ColumnPtr compress() const override; + void reserve(size_t size) override { chars.reserve(n * size); diff --git a/src/Columns/ColumnMap.h b/src/Columns/ColumnMap.h index c1948491db5..a970f67bd46 100644 --- a/src/Columns/ColumnMap.h +++ b/src/Columns/ColumnMap.h @@ -91,6 +91,8 @@ public: const ColumnTuple & getNestedData() const { return assert_cast(getNestedColumn().getData()); } ColumnTuple & getNestedData() { return assert_cast(getNestedColumn().getData()); } + + ColumnPtr compress() const override { return nested->compress(); } }; } diff --git a/src/Columns/ColumnNullable.cpp b/src/Columns/ColumnNullable.cpp index 35ce005073a..4e5cc2b4cf7 100644 --- a/src/Columns/ColumnNullable.cpp +++ b/src/Columns/ColumnNullable.cpp @@ -7,6 +7,7 @@ #include #include #include +#include #include @@ -511,6 +512,20 @@ void ColumnNullable::protect() getNullMapColumn().protect(); } +ColumnPtr ColumnNullable::compress() const +{ + ColumnPtr nested_compressed = nested_column->compress(); + ColumnPtr null_map_compressed = null_map->compress(); + + size_t byte_size = 
nested_column->byteSize() + null_map->byteSize(); + + return ColumnCompressed::create(size(), byte_size, + [nested_column = std::move(nested_column), null_map = std::move(null_map)] + { + return ColumnNullable::create(nested_column->decompress(), null_map->decompress()); + }); +} + namespace { diff --git a/src/Columns/ColumnNullable.h b/src/Columns/ColumnNullable.h index ade2c106627..8d267de8644 100644 --- a/src/Columns/ColumnNullable.h +++ b/src/Columns/ColumnNullable.h @@ -117,6 +117,8 @@ public: void gather(ColumnGathererStream & gatherer_stream) override; + ColumnPtr compress() const override; + void forEachSubcolumn(ColumnCallback callback) override { callback(nested_column); diff --git a/src/Columns/ColumnString.cpp b/src/Columns/ColumnString.cpp index 00d6349408f..190517bfeb9 100644 --- a/src/Columns/ColumnString.cpp +++ b/src/Columns/ColumnString.cpp @@ -2,6 +2,7 @@ #include #include +#include #include #include #include @@ -525,6 +526,59 @@ void ColumnString::getExtremes(Field & min, Field & max) const } +ColumnPtr ColumnString::compress() const +{ + size_t source_chars_size = chars.size(); + size_t source_offsets_size = offsets.size() * sizeof(Offset); + + /// Don't compress small blocks. + if (source_chars_size < 4096) /// A wild guess. + return ColumnCompressed::wrap(this->getPtr()); + + auto chars_compressed = ColumnCompressed::compressBuffer(chars.data(), source_chars_size); + auto offsets_compressed = ColumnCompressed::compressBuffer(offsets.data(), source_offsets_size); + + /// Return original column if not compressable. + if (!chars_compressed && !offsets_compressed) + return ColumnCompressed::wrap(this->getPtr()); + + if (!chars_compressed) + { + chars_compressed = std::make_shared>(source_chars_size); + memcpy(chars_compressed->data(), chars.data(), source_chars_size); + } + + if (!offsets_compressed) + { + offsets_compressed = std::make_shared>(source_offsets_size); + memcpy(offsets_compressed->data(), offsets.data(), source_offsets_size); + } + + return ColumnCompressed::create(offsets.size(), chars_compressed->size() + offsets_compressed->size(), + [ + chars_compressed = std::move(chars_compressed), + offsets_compressed = std::move(offsets_compressed), + source_chars_size, + source_offsets_elements = offsets.size() + ] + { + auto res = ColumnString::create(); + + res->getChars().resize(source_chars_size); + res->getOffsets().resize(source_offsets_elements); + + ColumnCompressed::decompressBuffer( + chars_compressed->data(), res->getChars().data(), chars_compressed->size(), source_chars_size); + + ColumnCompressed::decompressBuffer( + offsets_compressed->data(), res->getOffsets().data(), offsets_compressed->size(), source_offsets_elements * sizeof(Offset)); + + return res; + }); + +} + + int ColumnString::compareAtWithCollation(size_t n, size_t m, const IColumn & rhs_, int, const Collator & collator) const { const ColumnString & rhs = assert_cast(rhs_); diff --git a/src/Columns/ColumnString.h b/src/Columns/ColumnString.h index c1e76c5e28e..843e445d1a0 100644 --- a/src/Columns/ColumnString.h +++ b/src/Columns/ColumnString.h @@ -261,6 +261,8 @@ public: void gather(ColumnGathererStream & gatherer_stream) override; + ColumnPtr compress() const override; + void reserve(size_t n) override; void getExtremes(Field & min, Field & max) const override; diff --git a/src/Columns/ColumnTuple.cpp b/src/Columns/ColumnTuple.cpp index fa5a15d0351..1d85c67e7c6 100644 --- a/src/Columns/ColumnTuple.cpp +++ b/src/Columns/ColumnTuple.cpp @@ -1,6 +1,7 @@ #include #include +#include #include 
#include #include @@ -486,7 +487,7 @@ bool ColumnTuple::structureEquals(const IColumn & rhs) const bool ColumnTuple::isCollationSupported() const { - for (const auto& column : columns) + for (const auto & column : columns) { if (column->isCollationSupported()) return true; @@ -495,4 +496,25 @@ bool ColumnTuple::isCollationSupported() const } +ColumnPtr ColumnTuple::compress() const +{ + size_t byte_size = 0; + Columns compressed; + compressed.reserve(columns.size()); + for (const auto & column : columns) + { + auto compressed_column = column->compress(); + byte_size += compressed_column->byteSize(); + compressed.emplace_back(std::move(compressed_column)); + } + + return ColumnCompressed::create(size(), byte_size, + [compressed = std::move(compressed)] + { + for (auto & column : compressed) + column = column->decompress(); + return ColumnTuple::create(compressed); + }); +} + } diff --git a/src/Columns/ColumnTuple.h b/src/Columns/ColumnTuple.h index f763ca3fcba..818b29937bd 100644 --- a/src/Columns/ColumnTuple.h +++ b/src/Columns/ColumnTuple.h @@ -89,6 +89,7 @@ public: void forEachSubcolumn(ColumnCallback callback) override; bool structureEquals(const IColumn & rhs) const override; bool isCollationSupported() const override; + ColumnPtr compress() const override; size_t tupleSize() const { return columns.size(); } diff --git a/src/Columns/ColumnUnique.h b/src/Columns/ColumnUnique.h index 5d58b2484e0..d1c4a4e1183 100644 --- a/src/Columns/ColumnUnique.h +++ b/src/Columns/ColumnUnique.h @@ -28,6 +28,11 @@ namespace ErrorCodes extern const int ILLEGAL_COLUMN; } +/** Stores another column with unique values + * and also an index that allows finding a position by value. + * + * This column is not used on its own but only as an implementation detail of ColumnLowCardinality. + */ template class ColumnUnique final : public COWHelper> { From 1781a64370c86c93be915db8673644cffe0e58df Mon Sep 17 00:00:00 2001 From: Alexey Milovidov Date: Thu, 18 Feb 2021 02:11:41 +0300 Subject: [PATCH 190/381] Whitespaces --- src/Columns/ColumnUnique.h | 2 +- src/Columns/ReverseIndex.h | 10 ++++++---- 2 files changed, 7 insertions(+), 5 deletions(-) diff --git a/src/Columns/ColumnUnique.h b/src/Columns/ColumnUnique.h index d1c4a4e1183..fbd3c3641b5 100644 --- a/src/Columns/ColumnUnique.h +++ b/src/Columns/ColumnUnique.h @@ -39,7 +39,7 @@ class ColumnUnique final : public COWHelper>; private: - explicit ColumnUnique(MutableColumnPtr && holder, bool is_nullable); + ColumnUnique(MutableColumnPtr && holder, bool is_nullable); explicit ColumnUnique(const IDataType & type); ColumnUnique(const ColumnUnique & other); diff --git a/src/Columns/ReverseIndex.h b/src/Columns/ReverseIndex.h index 154293acf99..35b0029fc7b 100644 --- a/src/Columns/ReverseIndex.h +++ b/src/Columns/ReverseIndex.h @@ -316,8 +316,8 @@ template class ReverseIndex { public: - explicit ReverseIndex(UInt64 num_prefix_rows_to_skip_, UInt64 base_index_) - : num_prefix_rows_to_skip(num_prefix_rows_to_skip_), base_index(base_index_), saved_hash_ptr(nullptr) {} + ReverseIndex(UInt64 num_prefix_rows_to_skip_, UInt64 base_index_) + : num_prefix_rows_to_skip(num_prefix_rows_to_skip_), base_index(base_index_), saved_hash_ptr(nullptr) {} void setColumn(ColumnType * column_); @@ -329,14 +329,16 @@ public: /// Returns the found data's index in the dictionary. If index is not built, builds it.
UInt64 getInsertionPoint(StringRef data) { - if (!index) buildIndex(); + if (!index) + buildIndex(); return getIndexImpl(data); } /// Returns the found data's index in the dictionary if the #index is built, otherwise, returns a std::nullopt. std::optional getIndex(StringRef data) const { - if (!index) return {}; + if (!index) + return {}; return getIndexImpl(data); } From b7011f4f9c2a6df4144e9dec4a45c12e7fa62ec8 Mon Sep 17 00:00:00 2001 From: Alexey Milovidov Date: Thu, 18 Feb 2021 02:52:07 +0300 Subject: [PATCH 191/381] Fix build --- src/Columns/ColumnTuple.cpp | 2 +- src/DataTypes/DataTypeLowCardinality.h | 2 ++ 2 files changed, 3 insertions(+), 1 deletion(-) diff --git a/src/Columns/ColumnTuple.cpp b/src/Columns/ColumnTuple.cpp index 1d85c67e7c6..c7c5f7b97c6 100644 --- a/src/Columns/ColumnTuple.cpp +++ b/src/Columns/ColumnTuple.cpp @@ -509,7 +509,7 @@ ColumnPtr ColumnTuple::compress() const } return ColumnCompressed::create(size(), byte_size, - [compressed = std::move(compressed)] + [compressed = std::move(compressed)]() mutable { for (auto & column : compressed) column = column->decompress(); diff --git a/src/DataTypes/DataTypeLowCardinality.h b/src/DataTypes/DataTypeLowCardinality.h index 6ed2b792ce3..fc28ce0a59d 100644 --- a/src/DataTypes/DataTypeLowCardinality.h +++ b/src/DataTypes/DataTypeLowCardinality.h @@ -1,7 +1,9 @@ #pragma once + #include #include + namespace DB { From 634be2b933d87926fe79ce54bc037b4740dcf7de Mon Sep 17 00:00:00 2001 From: Alexey Milovidov Date: Thu, 18 Feb 2021 03:52:09 +0300 Subject: [PATCH 192/381] Fix error --- src/Columns/ColumnCompressed.cpp | 4 ++-- src/Columns/ColumnCompressed.h | 5 +++-- src/Columns/ColumnDecimal.cpp | 2 +- src/Columns/ColumnFixedString.cpp | 4 ++-- src/Columns/ColumnString.cpp | 18 +++--------------- src/Columns/ColumnVector.cpp | 2 +- 6 files changed, 12 insertions(+), 23 deletions(-) diff --git a/src/Columns/ColumnCompressed.cpp b/src/Columns/ColumnCompressed.cpp index d7d30745868..292c6968b86 100644 --- a/src/Columns/ColumnCompressed.cpp +++ b/src/Columns/ColumnCompressed.cpp @@ -15,7 +15,7 @@ namespace ErrorCodes } -std::shared_ptr> ColumnCompressed::compressBuffer(const void * data, size_t data_size) +std::shared_ptr> ColumnCompressed::compressBuffer(const void * data, size_t data_size, bool always_compress) { size_t max_dest_size = LZ4_COMPRESSBOUND(data_size); @@ -34,7 +34,7 @@ std::shared_ptr> ColumnCompressed::compressBuffer(const void * data, si throw Exception(ErrorCodes::CANNOT_COMPRESS, "Cannot compress column"); /// If compression is inefficient. - if (static_cast(compressed_size) * 2 > data_size) + if (!always_compress && static_cast(compressed_size) * 2 > data_size) return {}; /// Shrink to fit. diff --git a/src/Columns/ColumnCompressed.h b/src/Columns/ColumnCompressed.h index bd70005ac5d..f6b6bf22177 100644 --- a/src/Columns/ColumnCompressed.h +++ b/src/Columns/ColumnCompressed.h @@ -65,8 +65,9 @@ public: /// Helper methods for compression. - /// If data is not worth to be compressed - returns nullptr. Note: shared_ptr is to allow to be captured by std::function. - static std::shared_ptr> compressBuffer(const void * data, size_t data_size); + /// If data is not worth to be compressed and not 'always_compress' - returns nullptr. + /// Note: shared_ptr is to allow to be captured by std::function. 
+ static std::shared_ptr> compressBuffer(const void * data, size_t data_size, bool always_compress); static void decompressBuffer( const void * compressed_data, void * decompressed_data, size_t compressed_size, size_t decompressed_size); diff --git a/src/Columns/ColumnDecimal.cpp b/src/Columns/ColumnDecimal.cpp index bb61f60706e..bad3a4c3402 100644 --- a/src/Columns/ColumnDecimal.cpp +++ b/src/Columns/ColumnDecimal.cpp @@ -356,7 +356,7 @@ ColumnPtr ColumnDecimal::compress() const if (source_size < 4096) /// A wild guess. return ColumnCompressed::wrap(this->getPtr()); - auto compressed = ColumnCompressed::compressBuffer(data.data(), source_size); + auto compressed = ColumnCompressed::compressBuffer(data.data(), source_size, false); if (!compressed) return ColumnCompressed::wrap(this->getPtr()); diff --git a/src/Columns/ColumnFixedString.cpp b/src/Columns/ColumnFixedString.cpp index 278c2fef5f8..84bd0561f01 100644 --- a/src/Columns/ColumnFixedString.cpp +++ b/src/Columns/ColumnFixedString.cpp @@ -449,13 +449,13 @@ void ColumnFixedString::getExtremes(Field & min, Field & max) const ColumnPtr ColumnFixedString::compress() const { - size_t source_size = chars.size() * n; + size_t source_size = chars.size(); /// Don't compress small blocks. if (source_size < 4096) /// A wild guess. return ColumnCompressed::wrap(this->getPtr()); - auto compressed = ColumnCompressed::compressBuffer(chars.data(), source_size); + auto compressed = ColumnCompressed::compressBuffer(chars.data(), source_size, false); if (!compressed) return ColumnCompressed::wrap(this->getPtr()); diff --git a/src/Columns/ColumnString.cpp b/src/Columns/ColumnString.cpp index 190517bfeb9..f46c96caf8c 100644 --- a/src/Columns/ColumnString.cpp +++ b/src/Columns/ColumnString.cpp @@ -535,24 +535,13 @@ ColumnPtr ColumnString::compress() const if (source_chars_size < 4096) /// A wild guess. return ColumnCompressed::wrap(this->getPtr()); - auto chars_compressed = ColumnCompressed::compressBuffer(chars.data(), source_chars_size); - auto offsets_compressed = ColumnCompressed::compressBuffer(offsets.data(), source_offsets_size); + auto chars_compressed = ColumnCompressed::compressBuffer(chars.data(), source_chars_size, false); /// Return original column if not compressable. - if (!chars_compressed && !offsets_compressed) + if (!chars_compressed) return ColumnCompressed::wrap(this->getPtr()); - if (!chars_compressed) - { - chars_compressed = std::make_shared>(source_chars_size); - memcpy(chars_compressed->data(), chars.data(), source_chars_size); - } - - if (!offsets_compressed) - { - offsets_compressed = std::make_shared>(source_offsets_size); - memcpy(offsets_compressed->data(), offsets.data(), source_offsets_size); - } + auto offsets_compressed = ColumnCompressed::compressBuffer(offsets.data(), source_offsets_size, true); return ColumnCompressed::create(offsets.size(), chars_compressed->size() + offsets_compressed->size(), [ @@ -575,7 +564,6 @@ ColumnPtr ColumnString::compress() const return res; }); - } diff --git a/src/Columns/ColumnVector.cpp b/src/Columns/ColumnVector.cpp index b8bfef7258e..19ba86c5120 100644 --- a/src/Columns/ColumnVector.cpp +++ b/src/Columns/ColumnVector.cpp @@ -533,7 +533,7 @@ ColumnPtr ColumnVector::compress() const if (source_size < 4096) /// A wild guess. 
return ColumnCompressed::wrap(this->getPtr()); - auto compressed = ColumnCompressed::compressBuffer(data.data(), source_size); + auto compressed = ColumnCompressed::compressBuffer(data.data(), source_size, false); if (!compressed) return ColumnCompressed::wrap(this->getPtr()); From 5007f7f0183f3cc6ce2b3580b99748ff7a3649ae Mon Sep 17 00:00:00 2001 From: Alexey Milovidov Date: Thu, 18 Feb 2021 05:57:15 +0300 Subject: [PATCH 193/381] Fix typo --- src/Columns/ColumnString.cpp | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/Columns/ColumnString.cpp b/src/Columns/ColumnString.cpp index f46c96caf8c..8fd22e85e10 100644 --- a/src/Columns/ColumnString.cpp +++ b/src/Columns/ColumnString.cpp @@ -537,7 +537,7 @@ ColumnPtr ColumnString::compress() const auto chars_compressed = ColumnCompressed::compressBuffer(chars.data(), source_chars_size, false); - /// Return original column if not compressable. + /// Return original column if not compressible. if (!chars_compressed) return ColumnCompressed::wrap(this->getPtr()); From 04cb91a0fd1e3dc0f3a1b00d752d93b19a116e97 Mon Sep 17 00:00:00 2001 From: Alexey Milovidov Date: Thu, 18 Feb 2021 06:02:41 +0300 Subject: [PATCH 194/381] Fix error --- src/Columns/ColumnMap.cpp | 10 ++++++++++ src/Columns/ColumnMap.h | 2 +- 2 files changed, 11 insertions(+), 1 deletion(-) diff --git a/src/Columns/ColumnMap.cpp b/src/Columns/ColumnMap.cpp index 1cfd7e6c4ef..cc2640a9cf6 100644 --- a/src/Columns/ColumnMap.cpp +++ b/src/Columns/ColumnMap.cpp @@ -1,4 +1,5 @@ #include +#include #include #include #include @@ -243,4 +244,13 @@ bool ColumnMap::structureEquals(const IColumn & rhs) const return false; } +ColumnPtr ColumnMap::compress() const +{ + auto compressed = nested->compress(); + return ColumnCompressed::create(size(), compressed->byteSize(), [compressed = std::move(compressed)] + { + return ColumnMap::create(compressed->decompress()); + }); +} + } diff --git a/src/Columns/ColumnMap.h b/src/Columns/ColumnMap.h index a970f67bd46..acae1574f4c 100644 --- a/src/Columns/ColumnMap.h +++ b/src/Columns/ColumnMap.h @@ -92,7 +92,7 @@ public: const ColumnTuple & getNestedData() const { return assert_cast(getNestedColumn().getData()); } ColumnTuple & getNestedData() { return assert_cast(getNestedColumn().getData()); } - ColumnPtr compress() const override { return nested->compress(); } + ColumnPtr compress() const override; }; } From adf5d24177b6d23d4788e531fa2267378c07aae6 Mon Sep 17 00:00:00 2001 From: M0r64n Date: Thu, 18 Feb 2021 11:36:17 +0400 Subject: [PATCH 195/381] Correct file engine settings tests --- .../01720_engine_file_empty_if_not_exists.sql | 1 + .../01721_engine_file_truncate_on_insert.sql | 21 ++++++++++--------- 2 files changed, 12 insertions(+), 10 deletions(-) diff --git a/tests/queries/0_stateless/01720_engine_file_empty_if_not_exists.sql b/tests/queries/0_stateless/01720_engine_file_empty_if_not_exists.sql index c04e01ccc88..d665dbc722f 100644 --- a/tests/queries/0_stateless/01720_engine_file_empty_if_not_exists.sql +++ b/tests/queries/0_stateless/01720_engine_file_empty_if_not_exists.sql @@ -13,3 +13,4 @@ SET engine_file_empty_if_not_exists=1; SELECT * FROM file_engine_table; SET engine_file_empty_if_not_exists=0; +DROP TABLE file_engine_table; diff --git a/tests/queries/0_stateless/01721_engine_file_truncate_on_insert.sql b/tests/queries/0_stateless/01721_engine_file_truncate_on_insert.sql index 65246db7963..42d935cc0dd 100644 --- a/tests/queries/0_stateless/01721_engine_file_truncate_on_insert.sql +++ 
b/tests/queries/0_stateless/01721_engine_file_truncate_on_insert.sql @@ -1,20 +1,21 @@ -INSERT INTO TABLE FUNCTION file('01718_file/test/data.TSV', 'TSV', 'id UInt32') VALUES ('file', 42); +DROP TABLE IF EXISTS test; + +INSERT INTO TABLE FUNCTION file('01718_file/test/data.TSV', 'TSV', 'id UInt32') VALUES (1); ATTACH TABLE test FROM '01718_file/test' (id UInt8) ENGINE=File(TSV); -CREATE TABLE file_engine_table (id UInt32) ENGINE=File(TabSeparated); - -INSERT INTO file_engine_table VALUES (1), (2), (3); -INSERT INTO file_engine_table VALUES (4); -SELECT * FROM file_engine_table; +INSERT INTO test VALUES (2), (3); +INSERT INTO test VALUES (4); +SELECT * FROM test; SET engine_file_truncate_on_insert=0; -INSERT INTO file_engine_table VALUES (5), (6); -SELECT * FROM file_engine_table; +INSERT INTO test VALUES (5), (6); +SELECT * FROM test; SET engine_file_truncate_on_insert=1; -INSERT INTO file_engine_table VALUES (0), (1), (2); -SELECT * FROM file_engine_table; +INSERT INTO test VALUES (0), (1), (2); +SELECT * FROM test; SET engine_file_truncate_on_insert=0; +DROP TABLE test; From 1ce9570fcb4919880c19b05986dd9f7691fefb6f Mon Sep 17 00:00:00 2001 From: M0r64n Date: Thu, 18 Feb 2021 07:50:15 +0000 Subject: [PATCH 196/381] Fix 01721_engine_file_truncate_on_insert.reference --- .../0_stateless/01721_engine_file_truncate_on_insert.reference | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/tests/queries/0_stateless/01721_engine_file_truncate_on_insert.reference b/tests/queries/0_stateless/01721_engine_file_truncate_on_insert.reference index a25fb4f0e7e..578661c9194 100644 --- a/tests/queries/0_stateless/01721_engine_file_truncate_on_insert.reference +++ b/tests/queries/0_stateless/01721_engine_file_truncate_on_insert.reference @@ -10,4 +10,4 @@ 6 0 1 -2 \ No newline at end of file +2 From 5b597fdf446bb2039ae45d722ad423445a063a96 Mon Sep 17 00:00:00 2001 From: alesapin Date: Thu, 18 Feb 2021 13:23:48 +0300 Subject: [PATCH 197/381] Force sync setting and ability to start with broken log --- src/Coordination/Changelog.cpp | 90 +++++++++++-------- src/Coordination/Changelog.h | 6 +- src/Coordination/CoordinationSettings.h | 3 +- src/Coordination/InMemoryStateManager.cpp | 6 +- src/Coordination/NuKeeperLogStore.cpp | 9 +- src/Coordination/NuKeeperLogStore.h | 3 +- src/Coordination/tests/gtest_for_build.cpp | 89 ++++++++++++------ tests/config/config.d/test_keeper_port.xml | 1 + .../configs/enable_test_keeper.xml | 1 + 9 files changed, 137 insertions(+), 71 deletions(-) diff --git a/src/Coordination/Changelog.cpp b/src/Coordination/Changelog.cpp index 2d1bbfb4440..4358fa062e8 100644 --- a/src/Coordination/Changelog.cpp +++ b/src/Coordination/Changelog.cpp @@ -6,6 +6,8 @@ #include #include #include +#include +#include namespace DB { @@ -37,7 +39,7 @@ ChangelogVersion fromString(const std::string & version_str) namespace { -static constexpr auto DEFAULT_PREFIX = "changelog"; +constexpr auto DEFAULT_PREFIX = "changelog"; std::string formatChangelogPath(const std::string & prefix, const ChangelogFileDescription & name) { @@ -151,39 +153,56 @@ public: size_t readChangelog(IndexToLogEntry & logs, size_t start_log_idx, IndexToOffset & index_to_offset) { size_t total_read = 0; - while (!read_buf.eof()) + try { - total_read += 1; - off_t pos = read_buf.count(); - ChangelogRecord record; - readIntBinary(record.header.version, read_buf); - readIntBinary(record.header.index, read_buf); - readIntBinary(record.header.term, read_buf); - readIntBinary(record.header.value_type, read_buf); - 
readIntBinary(record.header.blob_size, read_buf); - readIntBinary(record.header.blob_checksum, read_buf); - auto buffer = nuraft::buffer::alloc(record.header.blob_size); - auto buffer_begin = reinterpret_cast(buffer->data_begin()); - read_buf.readStrict(buffer_begin, record.header.blob_size); - index_to_offset[record.header.index] = pos; - - Checksum checksum = CityHash_v1_0_2::CityHash128(buffer_begin, record.header.blob_size); - if (checksum != record.header.blob_checksum) + while (!read_buf.eof()) { - throw Exception(ErrorCodes::CHECKSUM_DOESNT_MATCH, - "Checksums doesn't match for log {} (version {}), index {}, blob_size {}", - filepath, record.header.version, record.header.index, record.header.blob_size); - } - if (record.header.index < start_log_idx) - continue; + off_t pos = read_buf.count(); + ChangelogRecord record; + readIntBinary(record.header.version, read_buf); + readIntBinary(record.header.index, read_buf); + readIntBinary(record.header.term, read_buf); + readIntBinary(record.header.value_type, read_buf); + readIntBinary(record.header.blob_size, read_buf); + readIntBinary(record.header.blob_checksum, read_buf); + auto buffer = nuraft::buffer::alloc(record.header.blob_size); + auto buffer_begin = reinterpret_cast(buffer->data_begin()); + read_buf.readStrict(buffer_begin, record.header.blob_size); + index_to_offset[record.header.index] = pos; - auto log_entry = nuraft::cs_new(record.header.term, buffer, record.header.value_type); - if (!logs.try_emplace(record.header.index, log_entry).second) - throw Exception(ErrorCodes::CORRUPTED_DATA, "Duplicated index id {} in log {}", record.header.index, filepath); + Checksum checksum = CityHash_v1_0_2::CityHash128(buffer_begin, record.header.blob_size); + if (checksum != record.header.blob_checksum) + { + throw Exception(ErrorCodes::CHECKSUM_DOESNT_MATCH, + "Checksums doesn't match for log {} (version {}), index {}, blob_size {}", + filepath, record.header.version, record.header.index, record.header.blob_size); + } + + if (logs.count(record.header.index) != 0) + throw Exception(ErrorCodes::CORRUPTED_DATA, "Duplicated index id {} in log {}", record.header.index, filepath); + + total_read += 1; + + if (record.header.index < start_log_idx) + continue; + + auto log_entry = nuraft::cs_new(record.header.term, buffer, record.header.value_type); + + logs.emplace(record.header.index, log_entry); + } + } + catch (const Exception & ex) + { + LOG_WARNING(&Poco::Logger::get("RaftChangelog"), "Cannot completely read changelog on path {}, error: {}", filepath, ex.message()); + } + catch (...) 
+ { + tryLogCurrentException(&Poco::Logger::get("RaftChangelog")); } return total_read; } + private: std::string filepath; ReadBufferFromFile read_buf; @@ -239,11 +258,12 @@ void Changelog::readChangelogAndInitWriter(size_t from_log_idx) } } - if (existing_changelogs.size() > 0 && read_from_last < entries_in_last) + if (!existing_changelogs.empty() && read_from_last < entries_in_last) { auto description = existing_changelogs.rbegin()->second; current_writer = std::make_unique(description.path, WriteMode::Append, description.from_log_idx); current_writer->setEntriesWritten(read_from_last); + current_writer->truncateToLength(index_to_start_pos[read_from_last]); } else { @@ -287,7 +307,7 @@ ChangelogRecord Changelog::buildRecord(size_t index, nuraft::ptr log_entry) +void Changelog::appendEntry(size_t index, nuraft::ptr log_entry, bool force_sync) { if (!current_writer) throw Exception(ErrorCodes::LOGICAL_ERROR, "Changelog must be initialized before appending records"); @@ -298,14 +318,14 @@ void Changelog::appendEntry(size_t index, nuraft::ptr log_ent if (current_writer->getEntriesWritten() == rotate_interval) rotate(index); - auto offset = current_writer->appendRecord(buildRecord(index, log_entry), false); + auto offset = current_writer->appendRecord(buildRecord(index, log_entry), force_sync); if (!index_to_start_pos.try_emplace(index, offset).second) throw Exception(ErrorCodes::LOGICAL_ERROR, "Record with index {} already exists", index); logs[index] = makeClone(log_entry); } -void Changelog::writeAt(size_t index, nuraft::ptr log_entry) +void Changelog::writeAt(size_t index, nuraft::ptr log_entry, bool force_sync) { if (index_to_start_pos.count(index) == 0) throw Exception(ErrorCodes::LOGICAL_ERROR, "Cannot write at index {} because changelog doesn't contain it", index); @@ -347,7 +367,7 @@ void Changelog::writeAt(size_t index, nuraft::ptr log_entry) current_writer->setEntriesWritten(entries_written); - appendEntry(index, log_entry); + appendEntry(index, log_entry, force_sync); } void Changelog::compact(size_t up_to_log_idx) @@ -441,7 +461,7 @@ nuraft::ptr Changelog::serializeEntriesToBuffer(size_t index, in return buf_out; } -void Changelog::applyEntriesFromBuffer(size_t index, nuraft::buffer & buffer) +void Changelog::applyEntriesFromBuffer(size_t index, nuraft::buffer & buffer, bool force_sync) { buffer.pos(0); int num_logs = buffer.get_int(); @@ -456,9 +476,9 @@ void Changelog::applyEntriesFromBuffer(size_t index, nuraft::buffer & buffer) LogEntryPtr log_entry = nuraft::log_entry::deserialize(*buf_local); if (i == 0 && logs.count(cur_idx)) - writeAt(cur_idx, log_entry); + writeAt(cur_idx, log_entry, force_sync); else - appendEntry(cur_idx, log_entry); + appendEntry(cur_idx, log_entry, force_sync); } } diff --git a/src/Coordination/Changelog.h b/src/Coordination/Changelog.h index 5f38f68750e..38d83819da2 100644 --- a/src/Coordination/Changelog.h +++ b/src/Coordination/Changelog.h @@ -64,9 +64,9 @@ public: void readChangelogAndInitWriter(size_t from_log_idx); - void appendEntry(size_t index, LogEntryPtr log_entry); + void appendEntry(size_t index, LogEntryPtr log_entry, bool force_sync); - void writeAt(size_t index, LogEntryPtr log_entry); + void writeAt(size_t index, LogEntryPtr log_entry, bool force_sync); void compact(size_t up_to_log_idx); @@ -88,7 +88,7 @@ public: nuraft::ptr serializeEntriesToBuffer(size_t index, int32_t cnt); - void applyEntriesFromBuffer(size_t index, nuraft::buffer & buffer); + void applyEntriesFromBuffer(size_t index, nuraft::buffer & buffer, bool force_sync); 
void flush(); diff --git a/src/Coordination/CoordinationSettings.h b/src/Coordination/CoordinationSettings.h index 0f1afb3fffe..ba3d3a7141a 100644 --- a/src/Coordination/CoordinationSettings.h +++ b/src/Coordination/CoordinationSettings.h @@ -29,7 +29,8 @@ struct Settings; M(Milliseconds, shutdown_timeout, 5000, "How many time we will until RAFT shutdown", 0) \ M(Milliseconds, startup_timeout, 30000, "How many time we will until RAFT to start", 0) \ M(LogsLevel, raft_logs_level, LogsLevel::information, "Log internal RAFT logs into main server log level. Valid values: 'trace', 'debug', 'information', 'warning', 'error', 'fatal', 'none'", 0) \ - M(UInt64, rotate_log_storage_interval, 500000, "How many records will be stored in one log storage file", 0) + M(UInt64, rotate_log_storage_interval, 500000, "How many records will be stored in one log storage file", 0) \ + M(Bool, force_sync, true, " Call fsync on each change in RAFT changelog", 0) DECLARE_SETTINGS_TRAITS(CoordinationSettingsTraits, LIST_OF_COORDINATION_SETTINGS) diff --git a/src/Coordination/InMemoryStateManager.cpp b/src/Coordination/InMemoryStateManager.cpp index 0423d2466f2..084ab043d12 100644 --- a/src/Coordination/InMemoryStateManager.cpp +++ b/src/Coordination/InMemoryStateManager.cpp @@ -12,7 +12,7 @@ namespace ErrorCodes InMemoryStateManager::InMemoryStateManager(int server_id_, const std::string & host, int port, const std::string & logs_path) : my_server_id(server_id_) , my_port(port) - , log_store(nuraft::cs_new(logs_path, 5000)) + , log_store(nuraft::cs_new(logs_path, 5000, true)) , cluster_config(nuraft::cs_new()) { auto peer_config = nuraft::cs_new(my_server_id, host + ":" + std::to_string(port)); @@ -25,7 +25,9 @@ InMemoryStateManager::InMemoryStateManager( const Poco::Util::AbstractConfiguration & config, const CoordinationSettingsPtr & coordination_settings) : my_server_id(my_server_id_) - , log_store(nuraft::cs_new(config.getString(config_prefix + ".log_storage_path"), coordination_settings->rotate_log_storage_interval)) + , log_store(nuraft::cs_new( + config.getString(config_prefix + ".log_storage_path"), + coordination_settings->rotate_log_storage_interval, coordination_settings->force_sync)) , cluster_config(nuraft::cs_new()) { diff --git a/src/Coordination/NuKeeperLogStore.cpp b/src/Coordination/NuKeeperLogStore.cpp index fa8d6d6c299..8834bdc4d69 100644 --- a/src/Coordination/NuKeeperLogStore.cpp +++ b/src/Coordination/NuKeeperLogStore.cpp @@ -3,8 +3,9 @@ namespace DB { -NuKeeperLogStore::NuKeeperLogStore(const std::string & changelogs_path, size_t rotate_interval_) +NuKeeperLogStore::NuKeeperLogStore(const std::string & changelogs_path, size_t rotate_interval_, bool force_sync_) : changelog(changelogs_path, rotate_interval_) + , force_sync(force_sync_) { } @@ -36,7 +37,7 @@ size_t NuKeeperLogStore::append(nuraft::ptr & entry) { std::lock_guard lock(changelog_lock); size_t idx = changelog.getNextEntryIndex(); - changelog.appendEntry(idx, entry); + changelog.appendEntry(idx, entry, force_sync); return idx; } @@ -44,7 +45,7 @@ size_t NuKeeperLogStore::append(nuraft::ptr & entry) void NuKeeperLogStore::write_at(size_t index, nuraft::ptr & entry) { std::lock_guard lock(changelog_lock); - changelog.writeAt(index, entry); + changelog.writeAt(index, entry, force_sync); } nuraft::ptr>> NuKeeperLogStore::log_entries(size_t start, size_t end) @@ -91,7 +92,7 @@ bool NuKeeperLogStore::flush() void NuKeeperLogStore::apply_pack(size_t index, nuraft::buffer & pack) { std::lock_guard lock(changelog_lock); - 
changelog.applyEntriesFromBuffer(index, pack); + changelog.applyEntriesFromBuffer(index, pack, force_sync); } size_t NuKeeperLogStore::size() const diff --git a/src/Coordination/NuKeeperLogStore.h b/src/Coordination/NuKeeperLogStore.h index 49d5dbfdf7c..0ff92220316 100644 --- a/src/Coordination/NuKeeperLogStore.h +++ b/src/Coordination/NuKeeperLogStore.h @@ -11,7 +11,7 @@ namespace DB class NuKeeperLogStore : public nuraft::log_store { public: - NuKeeperLogStore(const std::string & changelogs_path, size_t rotate_interval_); + NuKeeperLogStore(const std::string & changelogs_path, size_t rotate_interval_, bool force_sync_); void init(size_t from_log_idx); @@ -44,6 +44,7 @@ public: private: mutable std::mutex changelog_lock; Changelog changelog; + bool force_sync; }; } diff --git a/src/Coordination/tests/gtest_for_build.cpp b/src/Coordination/tests/gtest_for_build.cpp index 81e1751c08c..3fd2db84e3e 100644 --- a/src/Coordination/tests/gtest_for_build.cpp +++ b/src/Coordination/tests/gtest_for_build.cpp @@ -6,7 +6,8 @@ #endif #if USE_NURAFT - +#include +#include #include #include #include @@ -20,6 +21,7 @@ #include #include #include +#include #include // Y_IGNORE #include #include @@ -372,7 +374,7 @@ DB::LogEntryPtr getLogEntry(const std::string & s, size_t term) TEST(CoordinationTest, ChangelogTestSimple) { ChangelogDirTest test("./logs"); - DB::NuKeeperLogStore changelog("./logs", 5); + DB::NuKeeperLogStore changelog("./logs", 5, true); changelog.init(1); auto entry = getLogEntry("hello world", 77); changelog.append(entry); @@ -386,7 +388,7 @@ TEST(CoordinationTest, ChangelogTestSimple) TEST(CoordinationTest, ChangelogTestFile) { ChangelogDirTest test("./logs"); - DB::NuKeeperLogStore changelog("./logs", 5); + DB::NuKeeperLogStore changelog("./logs", 5, true); changelog.init(1); auto entry = getLogEntry("hello world", 77); changelog.append(entry); @@ -407,7 +409,7 @@ TEST(CoordinationTest, ChangelogTestFile) TEST(CoordinationTest, ChangelogReadWrite) { ChangelogDirTest test("./logs"); - DB::NuKeeperLogStore changelog("./logs", 1000); + DB::NuKeeperLogStore changelog("./logs", 1000, true); changelog.init(1); for (size_t i = 0; i < 10; ++i) { @@ -415,7 +417,7 @@ TEST(CoordinationTest, ChangelogReadWrite) changelog.append(entry); } EXPECT_EQ(changelog.size(), 10); - DB::NuKeeperLogStore changelog_reader("./logs", 1000); + DB::NuKeeperLogStore changelog_reader("./logs", 1000, true); changelog_reader.init(1); EXPECT_EQ(changelog_reader.size(), 10); EXPECT_EQ(changelog_reader.last_entry()->get_term(), changelog.last_entry()->get_term()); @@ -434,7 +436,7 @@ TEST(CoordinationTest, ChangelogReadWrite) TEST(CoordinationTest, ChangelogWriteAt) { ChangelogDirTest test("./logs"); - DB::NuKeeperLogStore changelog("./logs", 1000); + DB::NuKeeperLogStore changelog("./logs", 1000, true); changelog.init(1); for (size_t i = 0; i < 10; ++i) { @@ -450,7 +452,7 @@ TEST(CoordinationTest, ChangelogWriteAt) EXPECT_EQ(changelog.entry_at(7)->get_term(), 77); EXPECT_EQ(changelog.next_slot(), 8); - DB::NuKeeperLogStore changelog_reader("./logs", 1000); + DB::NuKeeperLogStore changelog_reader("./logs", 1000, true); changelog_reader.init(1); EXPECT_EQ(changelog_reader.size(), changelog.size()); @@ -463,7 +465,7 @@ TEST(CoordinationTest, ChangelogWriteAt) TEST(CoordinationTest, ChangelogTestAppendAfterRead) { ChangelogDirTest test("./logs"); - DB::NuKeeperLogStore changelog("./logs", 5); + DB::NuKeeperLogStore changelog("./logs", 5, true); changelog.init(1); for (size_t i = 0; i < 7; ++i) { @@ -475,7 +477,7 @@ 
TEST(CoordinationTest, ChangelogTestAppendAfterRead) EXPECT_TRUE(fs::exists("./logs/changelog_1_5.bin")); EXPECT_TRUE(fs::exists("./logs/changelog_6_10.bin")); - DB::NuKeeperLogStore changelog_reader("./logs", 5); + DB::NuKeeperLogStore changelog_reader("./logs", 5, true); changelog_reader.init(1); EXPECT_EQ(changelog_reader.size(), 7); @@ -511,7 +513,7 @@ TEST(CoordinationTest, ChangelogTestAppendAfterRead) TEST(CoordinationTest, ChangelogTestCompaction) { ChangelogDirTest test("./logs"); - DB::NuKeeperLogStore changelog("./logs", 5); + DB::NuKeeperLogStore changelog("./logs", 5, true); changelog.init(1); for (size_t i = 0; i < 3; ++i) @@ -552,7 +554,7 @@ TEST(CoordinationTest, ChangelogTestCompaction) EXPECT_EQ(changelog.next_slot(), 8); EXPECT_EQ(changelog.last_entry()->get_term(), 60); /// And we able to read it - DB::NuKeeperLogStore changelog_reader("./logs", 5); + DB::NuKeeperLogStore changelog_reader("./logs", 5, true); changelog_reader.init(7); EXPECT_EQ(changelog_reader.size(), 1); EXPECT_EQ(changelog_reader.start_index(), 7); @@ -563,7 +565,7 @@ TEST(CoordinationTest, ChangelogTestCompaction) TEST(CoordinationTest, ChangelogTestBatchOperations) { ChangelogDirTest test("./logs"); - DB::NuKeeperLogStore changelog("./logs", 100); + DB::NuKeeperLogStore changelog("./logs", 100, true); changelog.init(1); for (size_t i = 0; i < 10; ++i) { @@ -575,7 +577,7 @@ TEST(CoordinationTest, ChangelogTestBatchOperations) auto entries = changelog.pack(1, 5); - DB::NuKeeperLogStore apply_changelog("./logs", 100); + DB::NuKeeperLogStore apply_changelog("./logs", 100, true); apply_changelog.init(1); for (size_t i = 0; i < 10; ++i) @@ -605,7 +607,7 @@ TEST(CoordinationTest, ChangelogTestBatchOperations) TEST(CoordinationTest, ChangelogTestBatchOperationsEmpty) { ChangelogDirTest test("./logs"); - DB::NuKeeperLogStore changelog("./logs", 100); + DB::NuKeeperLogStore changelog("./logs", 100, true); changelog.init(1); for (size_t i = 0; i < 10; ++i) { @@ -618,7 +620,7 @@ TEST(CoordinationTest, ChangelogTestBatchOperationsEmpty) auto entries = changelog.pack(5, 5); ChangelogDirTest test1("./logs1"); - DB::NuKeeperLogStore changelog_new("./logs1", 100); + DB::NuKeeperLogStore changelog_new("./logs1", 100, true); changelog_new.init(1); EXPECT_EQ(changelog_new.size(), 0); @@ -637,7 +639,7 @@ TEST(CoordinationTest, ChangelogTestBatchOperationsEmpty) EXPECT_EQ(changelog_new.start_index(), 5); EXPECT_EQ(changelog_new.next_slot(), 11); - DB::NuKeeperLogStore changelog_reader("./logs1", 100); + DB::NuKeeperLogStore changelog_reader("./logs1", 100, true); changelog_reader.init(5); } @@ -645,7 +647,7 @@ TEST(CoordinationTest, ChangelogTestBatchOperationsEmpty) TEST(CoordinationTest, ChangelogTestWriteAtPreviousFile) { ChangelogDirTest test("./logs"); - DB::NuKeeperLogStore changelog("./logs", 5); + DB::NuKeeperLogStore changelog("./logs", 5, true); changelog.init(1); for (size_t i = 0; i < 33; ++i) @@ -680,7 +682,7 @@ TEST(CoordinationTest, ChangelogTestWriteAtPreviousFile) EXPECT_FALSE(fs::exists("./logs/changelog_26_30.bin")); EXPECT_FALSE(fs::exists("./logs/changelog_31_35.bin")); - DB::NuKeeperLogStore changelog_read("./logs", 5); + DB::NuKeeperLogStore changelog_read("./logs", 5, true); changelog_read.init(1); EXPECT_EQ(changelog_read.size(), 7); EXPECT_EQ(changelog_read.start_index(), 1); @@ -691,7 +693,7 @@ TEST(CoordinationTest, ChangelogTestWriteAtPreviousFile) TEST(CoordinationTest, ChangelogTestWriteAtFileBorder) { ChangelogDirTest test("./logs"); - DB::NuKeeperLogStore changelog("./logs", 5); + 
DB::NuKeeperLogStore changelog("./logs", 5, true); changelog.init(1); for (size_t i = 0; i < 33; ++i) @@ -726,7 +728,7 @@ TEST(CoordinationTest, ChangelogTestWriteAtFileBorder) EXPECT_FALSE(fs::exists("./logs/changelog_26_30.bin")); EXPECT_FALSE(fs::exists("./logs/changelog_31_35.bin")); - DB::NuKeeperLogStore changelog_read("./logs", 5); + DB::NuKeeperLogStore changelog_read("./logs", 5, true); changelog_read.init(1); EXPECT_EQ(changelog_read.size(), 11); EXPECT_EQ(changelog_read.start_index(), 1); @@ -737,7 +739,7 @@ TEST(CoordinationTest, ChangelogTestWriteAtFileBorder) TEST(CoordinationTest, ChangelogTestWriteAtAllFiles) { ChangelogDirTest test("./logs"); - DB::NuKeeperLogStore changelog("./logs", 5); + DB::NuKeeperLogStore changelog("./logs", 5, true); changelog.init(1); for (size_t i = 0; i < 33; ++i) @@ -776,7 +778,7 @@ TEST(CoordinationTest, ChangelogTestWriteAtAllFiles) TEST(CoordinationTest, ChangelogTestStartNewLogAfterRead) { ChangelogDirTest test("./logs"); - DB::NuKeeperLogStore changelog("./logs", 5); + DB::NuKeeperLogStore changelog("./logs", 5, true); changelog.init(1); for (size_t i = 0; i < 35; ++i) @@ -795,7 +797,7 @@ TEST(CoordinationTest, ChangelogTestStartNewLogAfterRead) EXPECT_FALSE(fs::exists("./logs/changelog_36_40.bin")); - DB::NuKeeperLogStore changelog_reader("./logs", 5); + DB::NuKeeperLogStore changelog_reader("./logs", 5, true); changelog_reader.init(1); auto entry = getLogEntry("36_hello_world", 360); @@ -817,7 +819,7 @@ TEST(CoordinationTest, ChangelogTestReadAfterBrokenTruncate) { ChangelogDirTest test("./logs"); - DB::NuKeeperLogStore changelog("./logs", 5); + DB::NuKeeperLogStore changelog("./logs", 5, true); changelog.init(1); for (size_t i = 0; i < 35; ++i) @@ -837,7 +839,7 @@ TEST(CoordinationTest, ChangelogTestReadAfterBrokenTruncate) DB::WriteBufferFromFile plain_buf("./logs/changelog_11_15.bin", DBMS_DEFAULT_BUFFER_SIZE, O_APPEND | O_CREAT | O_WRONLY); plain_buf.truncate(0); - DB::NuKeeperLogStore changelog_reader("./logs", 5); + DB::NuKeeperLogStore changelog_reader("./logs", 5, true); changelog_reader.init(1); EXPECT_EQ(changelog_reader.size(), 10); @@ -867,4 +869,41 @@ TEST(CoordinationTest, ChangelogTestReadAfterBrokenTruncate) EXPECT_FALSE(fs::exists("./logs/changelog_31_35.bin")); } +TEST(CoordinationTest, ChangelogTestReadAfterBrokenTruncate2) +{ + ChangelogDirTest test("./logs"); + + DB::NuKeeperLogStore changelog("./logs", 20, true); + changelog.init(1); + + for (size_t i = 0; i < 35; ++i) + { + auto entry = getLogEntry(std::to_string(i) + "_hello_world", (i + 44) * 10); + changelog.append(entry); + } + + EXPECT_TRUE(fs::exists("./logs/changelog_1_20.bin")); + EXPECT_TRUE(fs::exists("./logs/changelog_21_40.bin")); + + DB::WriteBufferFromFile plain_buf("./logs/changelog_1_20.bin", DBMS_DEFAULT_BUFFER_SIZE, O_APPEND | O_CREAT | O_WRONLY); + plain_buf.truncate(140); + + DB::NuKeeperLogStore changelog_reader("./logs", 20, true); + changelog_reader.init(1); + + EXPECT_EQ(changelog_reader.size(), 2); + EXPECT_EQ(changelog_reader.last_entry()->get_term(), 450); + EXPECT_TRUE(fs::exists("./logs/changelog_1_20.bin")); + EXPECT_FALSE(fs::exists("./logs/changelog_21_40.bin")); +} + +int main(int argc, char ** argv) +{ + Poco::AutoPtr channel(new Poco::ConsoleChannel(std::cerr)); + Poco::Logger::root().setChannel(channel); + Poco::Logger::root().setLevel("trace"); + testing::InitGoogleTest(&argc, argv); + return RUN_ALL_TESTS(); +} + #endif diff --git a/tests/config/config.d/test_keeper_port.xml b/tests/config/config.d/test_keeper_port.xml index 
44123ffe9c1..88fbf027ce7 100644 --- a/tests/config/config.d/test_keeper_port.xml +++ b/tests/config/config.d/test_keeper_port.xml @@ -9,6 +9,7 @@ 30000 0 0 + false diff --git a/tests/integration/test_testkeeper_back_to_back/configs/enable_test_keeper.xml b/tests/integration/test_testkeeper_back_to_back/configs/enable_test_keeper.xml index a8b8991f959..2cf9f8022d1 100644 --- a/tests/integration/test_testkeeper_back_to_back/configs/enable_test_keeper.xml +++ b/tests/integration/test_testkeeper_back_to_back/configs/enable_test_keeper.xml @@ -8,6 +8,7 @@ 5000 10000 trace + false From 7f815325ba92e487712488e6a368ab12133421b7 Mon Sep 17 00:00:00 2001 From: alesapin Date: Thu, 18 Feb 2021 14:42:09 +0300 Subject: [PATCH 198/381] More tests for broken changelog read --- src/Coordination/Changelog.cpp | 42 +++++++++++++------ src/Coordination/tests/gtest_for_build.cpp | 15 +++++++ .../configs/use_test_keeper.xml | 2 +- 3 files changed, 45 insertions(+), 14 deletions(-) diff --git a/src/Coordination/Changelog.cpp b/src/Coordination/Changelog.cpp index 4358fa062e8..12943bd9272 100644 --- a/src/Coordination/Changelog.cpp +++ b/src/Coordination/Changelog.cpp @@ -142,6 +142,13 @@ private: size_t start_index; }; +struct ChangelogReadResult +{ + size_t entries_read; + off_t last_position; + bool error; +}; + class ChangelogReader { public: @@ -150,14 +157,15 @@ public: , read_buf(filepath) {} - size_t readChangelog(IndexToLogEntry & logs, size_t start_log_idx, IndexToOffset & index_to_offset) + ChangelogReadResult readChangelog(IndexToLogEntry & logs, size_t start_log_idx, IndexToOffset & index_to_offset) { - size_t total_read = 0; + size_t previous_index = 0; + ChangelogReadResult result{}; try { while (!read_buf.eof()) { - off_t pos = read_buf.count(); + result.last_position = read_buf.count(); ChangelogRecord record; readIntBinary(record.header.version, read_buf); readIntBinary(record.header.index, read_buf); @@ -168,7 +176,11 @@ public: auto buffer = nuraft::buffer::alloc(record.header.blob_size); auto buffer_begin = reinterpret_cast(buffer->data_begin()); read_buf.readStrict(buffer_begin, record.header.blob_size); - index_to_offset[record.header.index] = pos; + + if (previous_index != 0 && previous_index + 1 != record.header.index) + throw Exception(ErrorCodes::CORRUPTED_DATA, "Previous log entry {}, next log entry {}, seems like some entries skipped", previous_index, record.header.index); + + previous_index = record.header.index; Checksum checksum = CityHash_v1_0_2::CityHash128(buffer_begin, record.header.blob_size); if (checksum != record.header.blob_checksum) @@ -181,7 +193,7 @@ public: if (logs.count(record.header.index) != 0) throw Exception(ErrorCodes::CORRUPTED_DATA, "Duplicated index id {} in log {}", record.header.index, filepath); - total_read += 1; + result.entries_read += 1; if (record.header.index < start_log_idx) continue; @@ -189,18 +201,21 @@ public: auto log_entry = nuraft::cs_new(record.header.term, buffer, record.header.value_type); logs.emplace(record.header.index, log_entry); + index_to_offset[record.header.index] = result.last_position; } } catch (const Exception & ex) { + result.error = true; LOG_WARNING(&Poco::Logger::get("RaftChangelog"), "Cannot completely read changelog on path {}, error: {}", filepath, ex.message()); } catch (...) 
{ + result.error = true; tryLogCurrentException(&Poco::Logger::get("RaftChangelog")); } - return total_read; + return result; } private: @@ -225,11 +240,11 @@ Changelog::Changelog(const std::string & changelogs_dir_, size_t rotate_interval void Changelog::readChangelogAndInitWriter(size_t from_log_idx) { - size_t read_from_last = 0; start_index = from_log_idx == 0 ? 1 : from_log_idx; size_t total_read = 0; size_t entries_in_last = 0; size_t incomplete_log_idx = 0; + ChangelogReadResult result{}; for (const auto & [start_idx, changelog_description] : existing_changelogs) { entries_in_last = changelog_description.to_log_idx - changelog_description.from_log_idx + 1; @@ -237,11 +252,11 @@ void Changelog::readChangelogAndInitWriter(size_t from_log_idx) if (changelog_description.to_log_idx >= from_log_idx) { ChangelogReader reader(changelog_description.path); - read_from_last = reader.readChangelog(logs, from_log_idx, index_to_start_pos); - total_read += read_from_last; + result = reader.readChangelog(logs, from_log_idx, index_to_start_pos); + total_read += result.entries_read; /// May happen after truncate and crash - if (read_from_last < entries_in_last) + if (result.entries_read < entries_in_last) { incomplete_log_idx = start_idx; break; @@ -258,12 +273,13 @@ void Changelog::readChangelogAndInitWriter(size_t from_log_idx) } } - if (!existing_changelogs.empty() && read_from_last < entries_in_last) + if (!existing_changelogs.empty() && result.entries_read < entries_in_last) { auto description = existing_changelogs.rbegin()->second; current_writer = std::make_unique(description.path, WriteMode::Append, description.from_log_idx); - current_writer->setEntriesWritten(read_from_last); - current_writer->truncateToLength(index_to_start_pos[read_from_last]); + current_writer->setEntriesWritten(result.entries_read); + if (result.error) + current_writer->truncateToLength(result.last_position); } else { diff --git a/src/Coordination/tests/gtest_for_build.cpp b/src/Coordination/tests/gtest_for_build.cpp index 3fd2db84e3e..457d0dbc52a 100644 --- a/src/Coordination/tests/gtest_for_build.cpp +++ b/src/Coordination/tests/gtest_for_build.cpp @@ -867,6 +867,11 @@ TEST(CoordinationTest, ChangelogTestReadAfterBrokenTruncate) EXPECT_FALSE(fs::exists("./logs/changelog_21_25.bin")); EXPECT_FALSE(fs::exists("./logs/changelog_26_30.bin")); EXPECT_FALSE(fs::exists("./logs/changelog_31_35.bin")); + + DB::NuKeeperLogStore changelog_reader2("./logs", 5, true); + changelog_reader2.init(1); + EXPECT_EQ(changelog_reader2.size(), 11); + EXPECT_EQ(changelog_reader2.last_entry()->get_term(), 7777); } TEST(CoordinationTest, ChangelogTestReadAfterBrokenTruncate2) @@ -895,6 +900,16 @@ TEST(CoordinationTest, ChangelogTestReadAfterBrokenTruncate2) EXPECT_EQ(changelog_reader.last_entry()->get_term(), 450); EXPECT_TRUE(fs::exists("./logs/changelog_1_20.bin")); EXPECT_FALSE(fs::exists("./logs/changelog_21_40.bin")); + auto entry = getLogEntry("hello_world", 7777); + changelog_reader.append(entry); + EXPECT_EQ(changelog_reader.size(), 3); + EXPECT_EQ(changelog_reader.last_entry()->get_term(), 7777); + + + DB::NuKeeperLogStore changelog_reader2("./logs", 20, true); + changelog_reader2.init(1); + EXPECT_EQ(changelog_reader2.size(), 3); + EXPECT_EQ(changelog_reader2.last_entry()->get_term(), 7777); } int main(int argc, char ** argv) diff --git a/tests/integration/test_testkeeper_persistent_log/configs/use_test_keeper.xml b/tests/integration/test_testkeeper_persistent_log/configs/use_test_keeper.xml index 12dc7fd9447..2e48e91bca5 100644 --- 
a/tests/integration/test_testkeeper_persistent_log/configs/use_test_keeper.xml +++ b/tests/integration/test_testkeeper_persistent_log/configs/use_test_keeper.xml @@ -1,7 +1,7 @@ - node1 + node 9181 From 904b4754ccbd5a63b95402ae913c57ea2a260b5c Mon Sep 17 00:00:00 2001 From: alesapin Date: Thu, 18 Feb 2021 14:47:37 +0300 Subject: [PATCH 199/381] Fix tidy --- src/Coordination/Changelog.cpp | 2 +- src/Coordination/Changelog.h | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/src/Coordination/Changelog.cpp b/src/Coordination/Changelog.cpp index 12943bd9272..a332ce37a8c 100644 --- a/src/Coordination/Changelog.cpp +++ b/src/Coordination/Changelog.cpp @@ -302,7 +302,7 @@ void Changelog::rotate(size_t new_start_log_idx) current_writer = std::make_unique(new_description.path, WriteMode::Rewrite, new_start_log_idx); } -ChangelogRecord Changelog::buildRecord(size_t index, nuraft::ptr log_entry) const +ChangelogRecord Changelog::buildRecord(size_t index, nuraft::ptr log_entry) { ChangelogRecordHeader header; header.index = index; diff --git a/src/Coordination/Changelog.h b/src/Coordination/Changelog.h index 38d83819da2..779d057d285 100644 --- a/src/Coordination/Changelog.h +++ b/src/Coordination/Changelog.h @@ -103,7 +103,7 @@ private: void rotate(size_t new_start_log_idx); - ChangelogRecord buildRecord(size_t index, nuraft::ptr log_entry) const; + static ChangelogRecord buildRecord(size_t index, nuraft::ptr log_entry); private: std::string changelogs_dir; From 2aad067e7c092af8162f1048b93c80216ec2d8f9 Mon Sep 17 00:00:00 2001 From: kssenii Date: Thu, 18 Feb 2021 12:16:58 +0000 Subject: [PATCH 200/381] Support conversion for postgres numeric without precision and scale --- .../fetchPostgreSQLTableStructure.cpp | 35 ++++++++++++------- .../test_storage_postgresql/test.py | 8 ++--- 2 files changed, 27 insertions(+), 16 deletions(-) diff --git a/src/Databases/PostgreSQL/fetchPostgreSQLTableStructure.cpp b/src/Databases/PostgreSQL/fetchPostgreSQLTableStructure.cpp index ec23cfc8794..15ce9a1baed 100644 --- a/src/Databases/PostgreSQL/fetchPostgreSQLTableStructure.cpp +++ b/src/Databases/PostgreSQL/fetchPostgreSQLTableStructure.cpp @@ -54,19 +54,30 @@ static DataTypePtr convertPostgreSQLDataType(std::string & type, bool is_nullabl res = std::make_shared(); else if (type.starts_with("numeric")) { - /// Numeric and decimal will both end up here as numeric. - res = DataTypeFactory::instance().get(type); - uint32_t precision = getDecimalPrecision(*res); - uint32_t scale = getDecimalScale(*res); + /// Numeric and decimal will both end up here as numeric. 
If it has type and precision, + /// there will be Numeric(x, y), otherwise just Numeric + uint32_t precision, scale; + if (type.ends_with(")")) + { + res = DataTypeFactory::instance().get(type); + precision = getDecimalPrecision(*res); + scale = getDecimalScale(*res); + + if (precision <= DecimalUtils::maxPrecision()) + res = std::make_shared>(precision, scale); + else if (precision <= DecimalUtils::maxPrecision()) + res = std::make_shared>(precision, scale); + else if (precision <= DecimalUtils::maxPrecision()) + res = std::make_shared>(precision, scale); + else if (precision <= DecimalUtils::maxPrecision()) + res = std::make_shared>(precision, scale); + } + else + { + precision = DecimalUtils::maxPrecision(); + res = std::make_shared>(precision, precision); + } - if (precision <= DecimalUtils::maxPrecision()) - res = std::make_shared>(precision, scale); - else if (precision <= DecimalUtils::maxPrecision()) - res = std::make_shared>(precision, scale); - else if (precision <= DecimalUtils::maxPrecision()) - res = std::make_shared>(precision, scale); - else if (precision <= DecimalUtils::maxPrecision()) - res = std::make_shared>(precision, scale); } if (!res) diff --git a/tests/integration/test_storage_postgresql/test.py b/tests/integration/test_storage_postgresql/test.py index 4f567c19f2b..03af32a4803 100644 --- a/tests/integration/test_storage_postgresql/test.py +++ b/tests/integration/test_storage_postgresql/test.py @@ -63,13 +63,13 @@ def test_postgres_conversions(started_cluster): cursor.execute( '''CREATE TABLE IF NOT EXISTS test_types ( a smallint, b integer, c bigint, d real, e double precision, f serial, g bigserial, - h timestamp, i date, j numeric(5, 5), k decimal(5, 5))''') + h timestamp, i date, j decimal(5, 5), k numeric)''') node1.query(''' INSERT INTO TABLE FUNCTION postgresql('postgres1:5432', 'clickhouse', 'test_types', 'postgres', 'mysecretpassword') VALUES - (-32768, -2147483648, -9223372036854775808, 1.12345, 1.1234567890, 2147483647, 9223372036854775807, '2000-05-12 12:12:12', '2000-05-12', 0.2, 0.2)''') + (-32768, -2147483648, -9223372036854775808, 1.12345, 1.1234567890, 2147483647, 9223372036854775807, '2000-05-12 12:12:12', '2000-05-12', 0.22222, 0.22222)''') result = node1.query(''' - SELECT * FROM postgresql('postgres1:5432', 'clickhouse', 'test_types', 'postgres', 'mysecretpassword')''') - assert(result == '-32768\t-2147483648\t-9223372036854775808\t1.12345\t1.123456789\t2147483647\t9223372036854775807\t2000-05-12 12:12:12\t2000-05-12\t0.20000\t0.20000\n') + SELECT a, b, c, d, e, f, g, h, i, j, toDecimal32(k, 5) FROM postgresql('postgres1:5432', 'clickhouse', 'test_types', 'postgres', 'mysecretpassword')''') + assert(result == '-32768\t-2147483648\t-9223372036854775808\t1.12345\t1.123456789\t2147483647\t9223372036854775807\t2000-05-12 12:12:12\t2000-05-12\t0.22222\t0.22222\n') cursor.execute( '''CREATE TABLE IF NOT EXISTS test_array_dimensions From 77fd060665751fc6528dd9f77e0fdea41cbc23bc Mon Sep 17 00:00:00 2001 From: Amos Bird Date: Sun, 14 Feb 2021 19:09:36 +0800 Subject: [PATCH 201/381] Normalize function names --- .../AggregateFunctionFactory.cpp | 16 +++-- src/Common/IFactoryWithAliases.h | 14 ++++ src/Functions/FunctionFactory.cpp | 15 +++-- src/Functions/FunctionsRound.cpp | 2 +- src/Functions/extractAllGroupsVertical.cpp | 2 +- src/Interpreters/FunctionNameNormalizer.cpp | 18 +++++ src/Interpreters/FunctionNameNormalizer.h | 14 ++++ src/Interpreters/MutationsInterpreter.cpp | 4 +- src/Interpreters/TreeRewriter.cpp | 4 ++ 
src/Interpreters/addTypeConversionToAST.cpp | 2 +- src/Interpreters/inplaceBlockConversions.cpp | 2 +- .../Impl/ConstantExpressionTemplate.cpp | 2 +- tests/integration/test_mysql_protocol/test.py | 2 +- .../00597_push_down_predicate.reference | 2 +- .../01029_early_constant_folding.reference | 2 +- ...1611_constant_folding_subqueries.reference | 2 +- ..._case_insensitive_function_names.reference | 66 +++++++++++++++++++ ...malize_case_insensitive_function_names.sql | 1 + 18 files changed, 151 insertions(+), 19 deletions(-) create mode 100644 src/Interpreters/FunctionNameNormalizer.cpp create mode 100644 src/Interpreters/FunctionNameNormalizer.h create mode 100644 tests/queries/0_stateless/01705_normalize_case_insensitive_function_names.reference create mode 100644 tests/queries/0_stateless/01705_normalize_case_insensitive_function_names.sql diff --git a/src/AggregateFunctions/AggregateFunctionFactory.cpp b/src/AggregateFunctions/AggregateFunctionFactory.cpp index 5fc690d59f2..061077dd8fa 100644 --- a/src/AggregateFunctions/AggregateFunctionFactory.cpp +++ b/src/AggregateFunctions/AggregateFunctionFactory.cpp @@ -30,6 +30,10 @@ namespace ErrorCodes extern const int LOGICAL_ERROR; } +const String & getAggregateFunctionCanonicalNameIfAny(const String & name) +{ + return AggregateFunctionFactory::instance().getCanonicalNameIfAny(name); +} void AggregateFunctionFactory::registerFunction(const String & name, Value creator_with_properties, CaseSensitiveness case_sensitiveness) { @@ -41,10 +45,14 @@ void AggregateFunctionFactory::registerFunction(const String & name, Value creat throw Exception("AggregateFunctionFactory: the aggregate function name '" + name + "' is not unique", ErrorCodes::LOGICAL_ERROR); - if (case_sensitiveness == CaseInsensitive - && !case_insensitive_aggregate_functions.emplace(Poco::toLower(name), creator_with_properties).second) - throw Exception("AggregateFunctionFactory: the case insensitive aggregate function name '" + name + "' is not unique", - ErrorCodes::LOGICAL_ERROR); + if (case_sensitiveness == CaseInsensitive) + { + auto key = Poco::toLower(name); + if (!case_insensitive_aggregate_functions.emplace(key, creator_with_properties).second) + throw Exception("AggregateFunctionFactory: the case insensitive aggregate function name '" + name + "' is not unique", + ErrorCodes::LOGICAL_ERROR); + case_insensitive_name_mapping[key] = name; + } } static DataTypes convertLowCardinalityTypesToNested(const DataTypes & types) diff --git a/src/Common/IFactoryWithAliases.h b/src/Common/IFactoryWithAliases.h index 49c03049b92..5ef795c92d0 100644 --- a/src/Common/IFactoryWithAliases.h +++ b/src/Common/IFactoryWithAliases.h @@ -35,6 +35,8 @@ protected: return name; } + std::unordered_map case_insensitive_name_mapping; + public: /// For compatibility with SQL, it's possible to specify that certain function name is case insensitive. 
enum CaseSensitiveness @@ -68,9 +70,12 @@ public: factory_name + ": the alias name '" + alias_name + "' is already registered as real name", ErrorCodes::LOGICAL_ERROR); if (case_sensitiveness == CaseInsensitive) + { if (!case_insensitive_aliases.emplace(alias_name_lowercase, real_dict_name).second) throw Exception( factory_name + ": case insensitive alias name '" + alias_name + "' is not unique", ErrorCodes::LOGICAL_ERROR); + case_insensitive_name_mapping[alias_name_lowercase] = real_name; + } if (!aliases.emplace(alias_name, real_dict_name).second) throw Exception(factory_name + ": alias name '" + alias_name + "' is not unique", ErrorCodes::LOGICAL_ERROR); @@ -111,6 +116,15 @@ public: return getMap().count(name) || getCaseInsensitiveMap().count(name) || isAlias(name); } + /// Return the canonical name (the name used in registration) if it's different from `name`. + const String & getCanonicalNameIfAny(const String & name) const + { + auto it = case_insensitive_name_mapping.find(Poco::toLower(name)); + if (it != case_insensitive_name_mapping.end()) + return it->second; + return name; + } + virtual ~IFactoryWithAliases() override {} private: diff --git a/src/Functions/FunctionFactory.cpp b/src/Functions/FunctionFactory.cpp index 768f1cfe487..09fd360a925 100644 --- a/src/Functions/FunctionFactory.cpp +++ b/src/Functions/FunctionFactory.cpp @@ -21,6 +21,10 @@ namespace ErrorCodes extern const int LOGICAL_ERROR; } +const String & getFunctionCanonicalNameIfAny(const String & name) +{ + return FunctionFactory::instance().getCanonicalNameIfAny(name); +} void FunctionFactory::registerFunction(const std::string & name, @@ -36,10 +40,13 @@ void FunctionFactory::registerFunction(const throw Exception("FunctionFactory: the function name '" + name + "' is already registered as alias", ErrorCodes::LOGICAL_ERROR); - if (case_sensitiveness == CaseInsensitive - && !case_insensitive_functions.emplace(function_name_lowercase, creator).second) - throw Exception("FunctionFactory: the case insensitive function name '" + name + "' is not unique", - ErrorCodes::LOGICAL_ERROR); + if (case_sensitiveness == CaseInsensitive) + { + if (!case_insensitive_functions.emplace(function_name_lowercase, creator).second) + throw Exception("FunctionFactory: the case insensitive function name '" + name + "' is not unique", + ErrorCodes::LOGICAL_ERROR); + case_insensitive_name_mapping[function_name_lowercase] = name; + } } diff --git a/src/Functions/FunctionsRound.cpp b/src/Functions/FunctionsRound.cpp index b1349bd2164..c5ad27a0b90 100644 --- a/src/Functions/FunctionsRound.cpp +++ b/src/Functions/FunctionsRound.cpp @@ -8,7 +8,7 @@ namespace DB void registerFunctionsRound(FunctionFactory & factory) { factory.registerFunction("round", FunctionFactory::CaseInsensitive); - factory.registerFunction("roundBankers", FunctionFactory::CaseInsensitive); + factory.registerFunction("roundBankers", FunctionFactory::CaseSensitive); factory.registerFunction("floor", FunctionFactory::CaseInsensitive); factory.registerFunction("ceil", FunctionFactory::CaseInsensitive); factory.registerFunction("trunc", FunctionFactory::CaseInsensitive); diff --git a/src/Functions/extractAllGroupsVertical.cpp b/src/Functions/extractAllGroupsVertical.cpp index 9cbd148b016..bf33eef70f3 100644 --- a/src/Functions/extractAllGroupsVertical.cpp +++ b/src/Functions/extractAllGroupsVertical.cpp @@ -18,7 +18,7 @@ namespace DB void registerFunctionExtractAllGroupsVertical(FunctionFactory & factory) { factory.registerFunction>(); - factory.registerAlias("extractAllGroups", 
VerticalImpl::Name, FunctionFactory::CaseInsensitive); + factory.registerAlias("extractAllGroups", VerticalImpl::Name, FunctionFactory::CaseSensitive); } } diff --git a/src/Interpreters/FunctionNameNormalizer.cpp b/src/Interpreters/FunctionNameNormalizer.cpp new file mode 100644 index 00000000000..f22f72b5e03 --- /dev/null +++ b/src/Interpreters/FunctionNameNormalizer.cpp @@ -0,0 +1,18 @@ +#include + +namespace DB +{ + +const String & getFunctionCanonicalNameIfAny(const String & name); +const String & getAggregateFunctionCanonicalNameIfAny(const String & name); + +void FunctionNameNormalizer::visit(ASTPtr & ast) +{ + if (auto * node_func = ast->as()) + node_func->name = getAggregateFunctionCanonicalNameIfAny(getFunctionCanonicalNameIfAny(node_func->name)); + + for (auto & child : ast->children) + visit(child); +} + +} diff --git a/src/Interpreters/FunctionNameNormalizer.h b/src/Interpreters/FunctionNameNormalizer.h new file mode 100644 index 00000000000..2b20c28bce0 --- /dev/null +++ b/src/Interpreters/FunctionNameNormalizer.h @@ -0,0 +1,14 @@ +#pragma once + +#include +#include + +namespace DB +{ + +struct FunctionNameNormalizer +{ + static void visit(ASTPtr &); +}; + +} diff --git a/src/Interpreters/MutationsInterpreter.cpp b/src/Interpreters/MutationsInterpreter.cpp index 528b5ec6d8e..c393b214ee8 100644 --- a/src/Interpreters/MutationsInterpreter.cpp +++ b/src/Interpreters/MutationsInterpreter.cpp @@ -442,10 +442,10 @@ ASTPtr MutationsInterpreter::prepare(bool dry_run) auto type_literal = std::make_shared(columns_desc.getPhysical(column).type->getName()); const auto & update_expr = kv.second; - auto updated_column = makeASTFunction("cast", + auto updated_column = makeASTFunction("CAST", makeASTFunction("if", getPartitionAndPredicateExpressionForMutationCommand(command), - makeASTFunction("cast", + makeASTFunction("CAST", update_expr->clone(), type_literal), std::make_shared(column)), diff --git a/src/Interpreters/TreeRewriter.cpp b/src/Interpreters/TreeRewriter.cpp index fd87d86bf97..cf4db8f174e 100644 --- a/src/Interpreters/TreeRewriter.cpp +++ b/src/Interpreters/TreeRewriter.cpp @@ -8,6 +8,7 @@ #include #include #include +#include #include #include #include @@ -934,6 +935,9 @@ void TreeRewriter::normalize(ASTPtr & query, Aliases & aliases, const Settings & MarkTableIdentifiersVisitor::Data identifiers_data{aliases}; MarkTableIdentifiersVisitor(identifiers_data).visit(query); + /// Rewrite function names to their canonical ones. + FunctionNameNormalizer().visit(query); + /// Common subexpression elimination. Rewrite rules. 
QueryNormalizer::Data normalizer_data(aliases, settings); QueryNormalizer(normalizer_data).visit(query); diff --git a/src/Interpreters/addTypeConversionToAST.cpp b/src/Interpreters/addTypeConversionToAST.cpp index bb42ad79daa..18591fd732c 100644 --- a/src/Interpreters/addTypeConversionToAST.cpp +++ b/src/Interpreters/addTypeConversionToAST.cpp @@ -20,7 +20,7 @@ namespace ErrorCodes ASTPtr addTypeConversionToAST(ASTPtr && ast, const String & type_name) { - auto func = makeASTFunction("cast", ast, std::make_shared(type_name)); + auto func = makeASTFunction("CAST", ast, std::make_shared(type_name)); if (ASTWithAlias * ast_with_alias = dynamic_cast(ast.get())) { diff --git a/src/Interpreters/inplaceBlockConversions.cpp b/src/Interpreters/inplaceBlockConversions.cpp index eba03d7aa61..c9a96a81b48 100644 --- a/src/Interpreters/inplaceBlockConversions.cpp +++ b/src/Interpreters/inplaceBlockConversions.cpp @@ -43,7 +43,7 @@ void addDefaultRequiredExpressionsRecursively(const Block & block, const String RequiredSourceColumnsVisitor(columns_context).visit(column_default_expr); NameSet required_columns_names = columns_context.requiredColumns(); - auto cast_func = makeASTFunction("cast", column_default_expr, std::make_shared(columns.get(required_column).type->getName())); + auto cast_func = makeASTFunction("CAST", column_default_expr, std::make_shared(columns.get(required_column).type->getName())); default_expr_list_accum->children.emplace_back(setAlias(cast_func, required_column)); added_columns.emplace(required_column); diff --git a/src/Processors/Formats/Impl/ConstantExpressionTemplate.cpp b/src/Processors/Formats/Impl/ConstantExpressionTemplate.cpp index d7a65c2f15d..1685688f02d 100644 --- a/src/Processors/Formats/Impl/ConstantExpressionTemplate.cpp +++ b/src/Processors/Formats/Impl/ConstantExpressionTemplate.cpp @@ -626,7 +626,7 @@ void ConstantExpressionTemplate::TemplateStructure::addNodesToCastResult(const I expr = makeASTFunction("assumeNotNull", std::move(expr)); } - expr = makeASTFunction("cast", std::move(expr), std::make_shared(result_column_type.getName())); + expr = makeASTFunction("CAST", std::move(expr), std::make_shared(result_column_type.getName())); if (null_as_default) { diff --git a/tests/integration/test_mysql_protocol/test.py b/tests/integration/test_mysql_protocol/test.py index 9532d4b8ba2..7f7d59674bc 100644 --- a/tests/integration/test_mysql_protocol/test.py +++ b/tests/integration/test_mysql_protocol/test.py @@ -217,7 +217,7 @@ def test_mysql_replacement_query(mysql_client, server_address): --password=123 -e "select database();" '''.format(host=server_address, port=server_port), demux=True) assert code == 0 - assert stdout.decode() == 'database()\ndefault\n' + assert stdout.decode() == 'DATABASE()\ndefault\n' code, (stdout, stderr) = mysql_client.exec_run(''' mysql --protocol tcp -h {host} -P {port} default -u default diff --git a/tests/queries/0_stateless/00597_push_down_predicate.reference b/tests/queries/0_stateless/00597_push_down_predicate.reference index 794d9e7af5f..bd1c4791df4 100644 --- a/tests/queries/0_stateless/00597_push_down_predicate.reference +++ b/tests/queries/0_stateless/00597_push_down_predicate.reference @@ -114,7 +114,7 @@ FROM ( SELECT 1 AS id, - identity(cast(1, \'UInt8\')) AS subquery + identity(CAST(1, \'UInt8\')) AS subquery WHERE subquery = 1 ) WHERE subquery = 1 diff --git a/tests/queries/0_stateless/01029_early_constant_folding.reference b/tests/queries/0_stateless/01029_early_constant_folding.reference index 8a1d4cec388..8a2d7e6c61a 100644 --- 
a/tests/queries/0_stateless/01029_early_constant_folding.reference +++ b/tests/queries/0_stateless/01029_early_constant_folding.reference @@ -2,7 +2,7 @@ SELECT 1 WHERE 0 SELECT 1 SELECT 1 -WHERE (1 IN (0, 2)) AND (2 = (identity(cast(2, \'UInt8\')) AS subquery)) +WHERE (1 IN (0, 2)) AND (2 = (identity(CAST(2, \'UInt8\')) AS subquery)) SELECT 1 WHERE 1 IN ( ( diff --git a/tests/queries/0_stateless/01611_constant_folding_subqueries.reference b/tests/queries/0_stateless/01611_constant_folding_subqueries.reference index d10502c5860..e46fd479413 100644 --- a/tests/queries/0_stateless/01611_constant_folding_subqueries.reference +++ b/tests/queries/0_stateless/01611_constant_folding_subqueries.reference @@ -5,7 +5,7 @@ SELECT (SELECT * FROM system.numbers LIMIT 1 OFFSET 1) AS n, toUInt64(10 / n) FO 1,10 EXPLAIN SYNTAX SELECT (SELECT * FROM system.numbers LIMIT 1 OFFSET 1) AS n, toUInt64(10 / n); SELECT - identity(cast(0, \'UInt64\')) AS n, + identity(CAST(0, \'UInt64\')) AS n, toUInt64(10 / n) SELECT * FROM (WITH (SELECT * FROM system.numbers LIMIT 1 OFFSET 1) AS n, toUInt64(10 / n) as q SELECT * FROM system.one WHERE q > 0); 0 diff --git a/tests/queries/0_stateless/01705_normalize_case_insensitive_function_names.reference b/tests/queries/0_stateless/01705_normalize_case_insensitive_function_names.reference new file mode 100644 index 00000000000..5b0f7bdeb2d --- /dev/null +++ b/tests/queries/0_stateless/01705_normalize_case_insensitive_function_names.reference @@ -0,0 +1,66 @@ +SELECT + CAST(1, 'INT'), + ceil(1), + ceil(1), + char(49), + CHAR_LENGTH('1'), + CHARACTER_LENGTH('1'), + coalesce(1), + concat('1', '1'), + corr(1, 1), + cos(1), + count(), + covarPop(1, 1), + covarSamp(1, 1), + DATABASE(), + dateDiff('DAY', toDate('2020-10-24'), toDate('2019-10-24')), + exp(1), + arrayFlatten([[1]]), + floor(1), + FQDN(), + greatest(1), + 1, + ifNull(1, 1), + lower('A'), + least(1), + length('1'), + log(1), + position('1', '1'), + log(1), + log10(1), + log2(1), + lower('A'), + max(1), + substring('123', 1, 1), + min(1), + 1 % 1, + NOT 1, + now(), + now64(), + nullIf(1, 1), + pi(), + position('123', '2'), + pow(1, 1), + pow(1, 1), + rand(), + replaceAll('1', '1', '2'), + reverse('123'), + round(1), + sin(1), + sqrt(1), + stddevPop(1), + stddevSamp(1), + substring('123', 2), + substring('123', 2), + count(), + tan(1), + tanh(1), + trunc(1), + trunc(1), + upper('A'), + upper('A'), + currentUser(), + varPop(1), + varSamp(1), + toWeek(toDate('2020-10-24')), + toYearWeek(toDate('2020-10-24')) diff --git a/tests/queries/0_stateless/01705_normalize_case_insensitive_function_names.sql b/tests/queries/0_stateless/01705_normalize_case_insensitive_function_names.sql new file mode 100644 index 00000000000..9b35087182c --- /dev/null +++ b/tests/queries/0_stateless/01705_normalize_case_insensitive_function_names.sql @@ -0,0 +1 @@ +EXPLAIN SYNTAX SELECT CAST(1 AS INT), CEIL(1), CEILING(1), CHAR(49), CHAR_LENGTH('1'), CHARACTER_LENGTH('1'), COALESCE(1), CONCAT('1', '1'), CORR(1, 1), COS(1), COUNT(1), COVAR_POP(1, 1), COVAR_SAMP(1, 1), DATABASE(), DATEDIFF('DAY', toDate('2020-10-24'), toDate('2019-10-24')), EXP(1), FLATTEN([[1]]), FLOOR(1), FQDN(), GREATEST(1), IF(1, 1, 1), IFNULL(1, 1), LCASE('A'), LEAST(1), LENGTH('1'), LN(1), LOCATE('1', '1'), LOG(1), LOG10(1), LOG2(1), LOWER('A'), MAX(1), MID('123', 1, 1), MIN(1), MOD(1, 1), NOT(1), NOW(), NOW64(), NULLIF(1, 1), PI(), POSITION('123', '2'), POW(1, 1), POWER(1, 1), RAND(), REPLACE('1', '1', '2'), REVERSE('123'), ROUND(1), SIN(1), SQRT(1), STDDEV_POP(1), STDDEV_SAMP(1), 
SUBSTR('123', 2), SUBSTRING('123', 2), SUM(1), TAN(1), TANH(1), TRUNC(1), TRUNCATE(1), UCASE('A'), UPPER('A'), USER(), VAR_POP(1), VAR_SAMP(1), WEEK(toDate('2020-10-24')), YEARWEEK(toDate('2020-10-24')) format TSVRaw; From 2dc7ba160a3bdc61765b12336edf753a0100f923 Mon Sep 17 00:00:00 2001 From: Amos Bird Date: Sun, 14 Feb 2021 20:53:50 +0800 Subject: [PATCH 202/381] Better --- src/Interpreters/FunctionNameNormalizer.cpp | 27 +++++++++++++++++-- src/Interpreters/FunctionNameNormalizer.h | 2 +- src/Interpreters/InterpreterCreateQuery.cpp | 2 ++ ...OptimizeIfWithConstantConditionVisitor.cpp | 2 +- src/Interpreters/TreeRewriter.cpp | 2 +- src/Interpreters/inplaceBlockConversions.cpp | 2 +- src/Parsers/ExpressionElementParsers.cpp | 2 +- ...56_test_query_log_factories_info.reference | 2 +- 8 files changed, 33 insertions(+), 8 deletions(-) diff --git a/src/Interpreters/FunctionNameNormalizer.cpp b/src/Interpreters/FunctionNameNormalizer.cpp index f22f72b5e03..36ccc9340ea 100644 --- a/src/Interpreters/FunctionNameNormalizer.cpp +++ b/src/Interpreters/FunctionNameNormalizer.cpp @@ -1,18 +1,41 @@ #include +#include +#include + namespace DB { const String & getFunctionCanonicalNameIfAny(const String & name); const String & getAggregateFunctionCanonicalNameIfAny(const String & name); -void FunctionNameNormalizer::visit(ASTPtr & ast) +void FunctionNameNormalizer::visit(IAST * ast) { + if (!ast) + return; + + if (auto * node_storage = ast->as()) + { + visit(node_storage->partition_by); + visit(node_storage->primary_key); + visit(node_storage->order_by); + visit(node_storage->sample_by); + visit(node_storage->ttl_table); + return; + } + + if (auto * node_decl = ast->as()) + { + visit(node_decl->default_expression.get()); + visit(node_decl->ttl.get()); + return; + } + if (auto * node_func = ast->as()) node_func->name = getAggregateFunctionCanonicalNameIfAny(getFunctionCanonicalNameIfAny(node_func->name)); for (auto & child : ast->children) - visit(child); + visit(child.get()); } } diff --git a/src/Interpreters/FunctionNameNormalizer.h b/src/Interpreters/FunctionNameNormalizer.h index 2b20c28bce0..3f22bb2f627 100644 --- a/src/Interpreters/FunctionNameNormalizer.h +++ b/src/Interpreters/FunctionNameNormalizer.h @@ -8,7 +8,7 @@ namespace DB struct FunctionNameNormalizer { - static void visit(ASTPtr &); + static void visit(IAST *); }; } diff --git a/src/Interpreters/InterpreterCreateQuery.cpp b/src/Interpreters/InterpreterCreateQuery.cpp index e9a11b9eb0d..bc38d4e3821 100644 --- a/src/Interpreters/InterpreterCreateQuery.cpp +++ b/src/Interpreters/InterpreterCreateQuery.cpp @@ -56,6 +56,7 @@ #include #include #include +#include #include #include @@ -1118,6 +1119,7 @@ void InterpreterCreateQuery::prepareOnClusterQuery(ASTCreateQuery & create, cons BlockIO InterpreterCreateQuery::execute() { + FunctionNameNormalizer().visit(query_ptr.get()); auto & create = query_ptr->as(); if (!create.cluster.empty()) { diff --git a/src/Interpreters/OptimizeIfWithConstantConditionVisitor.cpp b/src/Interpreters/OptimizeIfWithConstantConditionVisitor.cpp index dee4c69118b..cdcf6f7dddd 100644 --- a/src/Interpreters/OptimizeIfWithConstantConditionVisitor.cpp +++ b/src/Interpreters/OptimizeIfWithConstantConditionVisitor.cpp @@ -29,7 +29,7 @@ static bool tryExtractConstValueFromCondition(const ASTPtr & condition, bool & v /// cast of numeric constant in condition to UInt8 if (const auto * function = condition->as()) { - if (function->name == "cast") + if (function->name == "CAST") { if (const auto * expr_list = 
function->arguments->as()) { diff --git a/src/Interpreters/TreeRewriter.cpp b/src/Interpreters/TreeRewriter.cpp index cf4db8f174e..7b1a960d435 100644 --- a/src/Interpreters/TreeRewriter.cpp +++ b/src/Interpreters/TreeRewriter.cpp @@ -936,7 +936,7 @@ void TreeRewriter::normalize(ASTPtr & query, Aliases & aliases, const Settings & MarkTableIdentifiersVisitor(identifiers_data).visit(query); /// Rewrite function names to their canonical ones. - FunctionNameNormalizer().visit(query); + FunctionNameNormalizer().visit(query.get()); /// Common subexpression elimination. Rewrite rules. QueryNormalizer::Data normalizer_data(aliases, settings); diff --git a/src/Interpreters/inplaceBlockConversions.cpp b/src/Interpreters/inplaceBlockConversions.cpp index c9a96a81b48..d06cde99425 100644 --- a/src/Interpreters/inplaceBlockConversions.cpp +++ b/src/Interpreters/inplaceBlockConversions.cpp @@ -79,7 +79,7 @@ ASTPtr convertRequiredExpressions(Block & block, const NamesAndTypesList & requi continue; auto cast_func = makeASTFunction( - "cast", std::make_shared(required_column.name), std::make_shared(required_column.type->getName())); + "CAST", std::make_shared(required_column.name), std::make_shared(required_column.type->getName())); conversion_expr_list->children.emplace_back(setAlias(cast_func, required_column.name)); diff --git a/src/Parsers/ExpressionElementParsers.cpp b/src/Parsers/ExpressionElementParsers.cpp index 3d868812304..7a426e7774d 100644 --- a/src/Parsers/ExpressionElementParsers.cpp +++ b/src/Parsers/ExpressionElementParsers.cpp @@ -864,7 +864,7 @@ bool ParserCastExpression::parseImpl(Pos & pos, ASTPtr & node, Expected & expect expr_list_args->children.push_back(std::move(type_literal)); auto func_node = std::make_shared(); - func_node->name = "cast"; + func_node->name = "CAST"; func_node->arguments = std::move(expr_list_args); func_node->children.push_back(func_node->arguments); diff --git a/tests/queries/0_stateless/01656_test_query_log_factories_info.reference b/tests/queries/0_stateless/01656_test_query_log_factories_info.reference index 3c93cd9ec26..324890c0a5a 100644 --- a/tests/queries/0_stateless/01656_test_query_log_factories_info.reference +++ b/tests/queries/0_stateless/01656_test_query_log_factories_info.reference @@ -11,7 +11,7 @@ arraySort(used_table_functions) ['numbers'] arraySort(used_functions) -['addDays','array','arrayFlatten','cast','modulo','plus','substring','toDate','toDayOfYear','toTypeName','toWeek'] +['CAST','addDays','array','arrayFlatten','modulo','plus','substring','toDate','toDayOfYear','toTypeName','toWeek'] arraySort(used_data_type_families) ['Array','Int32','Nullable','String'] From cac9c7fc079835b4e26cf2b5ff8ad776b1369c5d Mon Sep 17 00:00:00 2001 From: Amos Bird Date: Mon, 15 Feb 2021 00:00:47 +0800 Subject: [PATCH 203/381] Fix tests --- tests/queries/0_stateless/00642_cast.reference | 4 ++-- tests/queries/0_stateless/00643_cast_zookeeper.reference | 4 ++-- 2 files changed, 4 insertions(+), 4 deletions(-) diff --git a/tests/queries/0_stateless/00642_cast.reference b/tests/queries/0_stateless/00642_cast.reference index 3d5572932fb..7f5333f590e 100644 --- a/tests/queries/0_stateless/00642_cast.reference +++ b/tests/queries/0_stateless/00642_cast.reference @@ -10,11 +10,11 @@ hello CREATE TABLE default.cast ( `x` UInt8, - `e` Enum8('hello' = 1, 'world' = 2) DEFAULT cast(x, 'Enum8(\'hello\' = 1, \'world\' = 2)') + `e` Enum8('hello' = 1, 'world' = 2) DEFAULT CAST(x, 'Enum8(\'hello\' = 1, \'world\' = 2)') ) ENGINE = MergeTree ORDER BY e SETTINGS index_granularity = 
8192 x UInt8 -e Enum8(\'hello\' = 1, \'world\' = 2) DEFAULT cast(x, \'Enum8(\\\'hello\\\' = 1, \\\'world\\\' = 2)\') +e Enum8(\'hello\' = 1, \'world\' = 2) DEFAULT CAST(x, \'Enum8(\\\'hello\\\' = 1, \\\'world\\\' = 2)\') 1 hello diff --git a/tests/queries/0_stateless/00643_cast_zookeeper.reference b/tests/queries/0_stateless/00643_cast_zookeeper.reference index 658233be742..9123463de1a 100644 --- a/tests/queries/0_stateless/00643_cast_zookeeper.reference +++ b/tests/queries/0_stateless/00643_cast_zookeeper.reference @@ -1,12 +1,12 @@ CREATE TABLE default.cast1 ( `x` UInt8, - `e` Enum8('hello' = 1, 'world' = 2) DEFAULT cast(x, 'Enum8(\'hello\' = 1, \'world\' = 2)') + `e` Enum8('hello' = 1, 'world' = 2) DEFAULT CAST(x, 'Enum8(\'hello\' = 1, \'world\' = 2)') ) ENGINE = ReplicatedMergeTree('/clickhouse/tables/test_00643/cast', 'r1') ORDER BY e SETTINGS index_granularity = 8192 x UInt8 -e Enum8(\'hello\' = 1, \'world\' = 2) DEFAULT cast(x, \'Enum8(\\\'hello\\\' = 1, \\\'world\\\' = 2)\') +e Enum8(\'hello\' = 1, \'world\' = 2) DEFAULT CAST(x, \'Enum8(\\\'hello\\\' = 1, \\\'world\\\' = 2)\') 1 hello 1 hello From f402aa4057814078b7b7ef2e0175ab2753d2bced Mon Sep 17 00:00:00 2001 From: Amos Bird Date: Wed, 17 Feb 2021 23:36:37 +0800 Subject: [PATCH 204/381] Normalize constant expression --- src/Interpreters/evaluateConstantExpression.cpp | 2 ++ 1 file changed, 2 insertions(+) diff --git a/src/Interpreters/evaluateConstantExpression.cpp b/src/Interpreters/evaluateConstantExpression.cpp index 02ef3426483..70b9baa544f 100644 --- a/src/Interpreters/evaluateConstantExpression.cpp +++ b/src/Interpreters/evaluateConstantExpression.cpp @@ -15,6 +15,7 @@ #include #include #include +#include #include #include @@ -35,6 +36,7 @@ std::pair> evaluateConstantExpression(co auto ast = node->clone(); ReplaceQueryParameterVisitor param_visitor(context.getQueryParameters()); param_visitor.visit(ast); + FunctionNameNormalizer().visit(ast.get()); String name = ast->getColumnName(); auto syntax_result = TreeRewriter(context).analyze(ast, source_columns); ExpressionActionsPtr expr_for_constant_folding = ExpressionAnalyzer(ast, syntax_result, context).getConstActions(); From 2c4bc43014c510292340954647fbebf0f72620e9 Mon Sep 17 00:00:00 2001 From: Amos Bird Date: Thu, 18 Feb 2021 11:27:24 +0800 Subject: [PATCH 205/381] Backward compatible --- src/Core/Settings.h | 1 + src/Interpreters/TreeRewriter.cpp | 3 ++- src/Interpreters/evaluateConstantExpression.cpp | 5 ++++- src/Server/TCPHandler.cpp | 6 ++++++ 4 files changed, 13 insertions(+), 2 deletions(-) diff --git a/src/Core/Settings.h b/src/Core/Settings.h index 9bb9ad30f15..4c5fe93bb03 100644 --- a/src/Core/Settings.h +++ b/src/Core/Settings.h @@ -383,6 +383,7 @@ class IColumn; M(Bool, optimize_if_chain_to_multiif, false, "Replace if(cond1, then1, if(cond2, ...)) chains to multiIf. Currently it's not beneficial for numeric types.", 0) \ M(Bool, optimize_if_transform_strings_to_enum, false, "Replaces string-type arguments in If and Transform to enum. Disabled by default cause it could make inconsistent change in distributed query that would lead to its fail.", 0) \ M(Bool, optimize_monotonous_functions_in_order_by, true, "Replace monotonous function with its argument in ORDER BY", 0) \ + M(Bool, normalize_function_names, true, "Normalize function names to their canonical names", 0) \ M(Bool, allow_experimental_alter_materialized_view_structure, false, "Allow atomic alter on Materialized views. 
Work in progress.", 0) \ M(Bool, enable_early_constant_folding, true, "Enable query optimization where we analyze function and subqueries results and rewrite query if there're constants there", 0) \ M(Bool, deduplicate_blocks_in_dependent_materialized_views, false, "Should deduplicate blocks for materialized views if the block is not a duplicate for the table. Use true to always deduplicate in dependent tables.", 0) \ diff --git a/src/Interpreters/TreeRewriter.cpp b/src/Interpreters/TreeRewriter.cpp index 7b1a960d435..37f49874e0a 100644 --- a/src/Interpreters/TreeRewriter.cpp +++ b/src/Interpreters/TreeRewriter.cpp @@ -936,7 +936,8 @@ void TreeRewriter::normalize(ASTPtr & query, Aliases & aliases, const Settings & MarkTableIdentifiersVisitor(identifiers_data).visit(query); /// Rewrite function names to their canonical ones. - FunctionNameNormalizer().visit(query.get()); + if (settings.normalize_function_names) + FunctionNameNormalizer().visit(query.get()); /// Common subexpression elimination. Rewrite rules. QueryNormalizer::Data normalizer_data(aliases, settings); diff --git a/src/Interpreters/evaluateConstantExpression.cpp b/src/Interpreters/evaluateConstantExpression.cpp index 70b9baa544f..42e96bae07b 100644 --- a/src/Interpreters/evaluateConstantExpression.cpp +++ b/src/Interpreters/evaluateConstantExpression.cpp @@ -36,7 +36,10 @@ std::pair> evaluateConstantExpression(co auto ast = node->clone(); ReplaceQueryParameterVisitor param_visitor(context.getQueryParameters()); param_visitor.visit(ast); - FunctionNameNormalizer().visit(ast.get()); + + if (context.getSettingsRef().normalize_function_names) + FunctionNameNormalizer().visit(ast.get()); + String name = ast->getColumnName(); auto syntax_result = TreeRewriter(context).analyze(ast, source_columns); ExpressionActionsPtr expr_for_constant_folding = ExpressionAnalyzer(ast, syntax_result, context).getConstActions(); diff --git a/src/Server/TCPHandler.cpp b/src/Server/TCPHandler.cpp index c207d188a85..430a01bb97a 100644 --- a/src/Server/TCPHandler.cpp +++ b/src/Server/TCPHandler.cpp @@ -1133,6 +1133,12 @@ void TCPHandler::receiveQuery() } query_context->applySettingsChanges(settings_changes); + /// Disable function name normalization it's not an initial query. + if (client_info.query_kind != ClientInfo::QueryKind::INITIAL_QUERY) + { + query_context->setSetting("normalize_function_names", Field(0)); + } + // Use the received query id, or generate a random default. It is convenient // to also generate the default OpenTelemetry trace id at the same time, and // set the trace parent. 
From 556dc81ab990803f082dc6365656e5aac58a0a03 Mon Sep 17 00:00:00 2001 From: Vitaly Baranov Date: Thu, 18 Feb 2021 16:32:01 +0300 Subject: [PATCH 206/381] Fix undefined-behavior in ReservoirSamplerDeterministic.h --- .../ReservoirSamplerDeterministic.h | 40 +++++++++++++------ 1 file changed, 27 insertions(+), 13 deletions(-) diff --git a/src/AggregateFunctions/ReservoirSamplerDeterministic.h b/src/AggregateFunctions/ReservoirSamplerDeterministic.h index 3b7817e9308..3013a17e1ca 100644 --- a/src/AggregateFunctions/ReservoirSamplerDeterministic.h +++ b/src/AggregateFunctions/ReservoirSamplerDeterministic.h @@ -56,7 +56,7 @@ class ReservoirSamplerDeterministic { bool good(const UInt32 hash) { - return hash == ((hash >> skip_degree) << skip_degree); + return !(hash & skip_mask); } public: @@ -135,11 +135,8 @@ public: throw Poco::Exception("Cannot merge ReservoirSamplerDeterministic's with different max sample size"); sorted = false; - if (b.skip_degree > skip_degree) - { - skip_degree = b.skip_degree; - thinOut(); - } + if (skip_degree < b.skip_degree) + setSkipDegree(b.skip_degree); for (const auto & sample : b.samples) if (good(sample.second)) @@ -184,22 +181,39 @@ private: size_t total_values = 0; /// How many values were inserted (regardless if they remain in sample or not). bool sorted = false; Array samples; - UInt8 skip_degree = 0; /// The number N determining that we save only one per 2^N elements in average. + + /// The number N determining that we store only one per 2^N elements in average. + UInt8 skip_degree = 0; + + /// skip_mask is calculated as (2 ^ skip_degree - 1). We store an element only if (hash & skip_mask) == 0. + /// For example, if skip_degree==0 then skip_mask==0 means we store each element; + /// if skip_degree==1 then skip_mask==0b0001 means we store one per 2 elements in average; + /// if skip_degree==4 then skip_mask==0b1111 means we store one per 16 elements in average. + UInt32 skip_mask = 0; void insertImpl(const T & v, const UInt32 hash) { /// Make a room for plus one element. 
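        /// [Editor's aside, not part of the patch] Two details the changes in this hunk rely on:
        /// 1. good() keeps its behaviour: for skip_degree = k, the old test
        ///        hash == ((hash >> k) << k)
        ///    and the new test
        ///        !(hash & skip_mask), with skip_mask = (1 << k) - 1
        ///    both say "the low k bits of the hash are zero". For example, with k = 4,
        ///    hash = 0x30 (low nibble zero) passes both, while hash = 0x31 fails both.
        /// 2. setSkipDegree() below sets skip_mask to static_cast<UInt32>(-1) explicitly when
        ///    skip_degree reaches MAX_SKIP_DEGREE, which avoids taking a shift by the full word
        ///    width; presumably that shift is the undefined behaviour the patch title refers to.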
while (samples.size() >= max_sample_size) - { - ++skip_degree; - if (skip_degree > detail::MAX_SKIP_DEGREE) - throw DB::Exception{"skip_degree exceeds maximum value", DB::ErrorCodes::MEMORY_LIMIT_EXCEEDED}; - thinOut(); - } + setSkipDegree(skip_degree + 1); samples.emplace_back(v, hash); } + void setSkipDegree(UInt8 skip_degree_) + { + if (skip_degree_ == skip_degree) + return; + if (skip_degree_ > detail::MAX_SKIP_DEGREE) + throw DB::Exception{"skip_degree exceeds maximum value", DB::ErrorCodes::MEMORY_LIMIT_EXCEEDED}; + skip_degree = skip_degree_; + if (skip_degree == detail::MAX_SKIP_DEGREE) + skip_mask = static_cast(-1); + else + skip_mask = (1 << skip_degree) - 1; + thinOut(); + } + void thinOut() { samples.resize(std::distance(samples.begin(), From 0336764426a2e5950dcc6ce27b6d89de09209368 Mon Sep 17 00:00:00 2001 From: alesapin Date: Thu, 18 Feb 2021 18:51:16 +0300 Subject: [PATCH 207/381] Fix tidy one more time --- src/Coordination/Changelog.cpp | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/Coordination/Changelog.cpp b/src/Coordination/Changelog.cpp index a332ce37a8c..4a3955e23ab 100644 --- a/src/Coordination/Changelog.cpp +++ b/src/Coordination/Changelog.cpp @@ -174,7 +174,7 @@ public: readIntBinary(record.header.blob_size, read_buf); readIntBinary(record.header.blob_checksum, read_buf); auto buffer = nuraft::buffer::alloc(record.header.blob_size); - auto buffer_begin = reinterpret_cast(buffer->data_begin()); + auto * buffer_begin = reinterpret_cast(buffer->data_begin()); read_buf.readStrict(buffer_begin, record.header.blob_size); if (previous_index != 0 && previous_index + 1 != record.header.index) From 5cfe245e2203cf4ca62bc5e72897ebd358a64b5b Mon Sep 17 00:00:00 2001 From: sevirov <72220289+sevirov@users.noreply.github.com> Date: Thu, 18 Feb 2021 21:21:12 +0300 Subject: [PATCH 208/381] Update docs/ru/sql-reference/functions/type-conversion-functions.md Co-authored-by: olgarev <56617294+olgarev@users.noreply.github.com> --- docs/ru/sql-reference/functions/type-conversion-functions.md | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/docs/ru/sql-reference/functions/type-conversion-functions.md b/docs/ru/sql-reference/functions/type-conversion-functions.md index 92e674242df..1edebc26ccc 100644 --- a/docs/ru/sql-reference/functions/type-conversion-functions.md +++ b/docs/ru/sql-reference/functions/type-conversion-functions.md @@ -660,7 +660,7 @@ AS parseDateTimeBestEffortUS; ## parseDateTimeBestEffortUSOrNull {#parsedatetimebesteffortusornull} -Похожа на функцию [parseDateTimeBestEffortUS](#parsedatetimebesteffortUS), но разница состоит в том, что возвращает `NULL`, если входная строка не может быть преобразована в тип данных [DateTime](../../sql-reference/data-types/datetime.md). +Работает аналогично функции [parseDateTimeBestEffortUS](#parsedatetimebesteffortUS), но в отличие от нее возвращает `NULL`, если входная строка не может быть преобразована в тип данных [DateTime](../../sql-reference/data-types/datetime.md). 
**Синтаксис** From 1626833987b869c36096becebafbbb516939397d Mon Sep 17 00:00:00 2001 From: sevirov <72220289+sevirov@users.noreply.github.com> Date: Thu, 18 Feb 2021 21:21:25 +0300 Subject: [PATCH 209/381] Update docs/ru/sql-reference/functions/type-conversion-functions.md Co-authored-by: olgarev <56617294+olgarev@users.noreply.github.com> --- docs/ru/sql-reference/functions/type-conversion-functions.md | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/docs/ru/sql-reference/functions/type-conversion-functions.md b/docs/ru/sql-reference/functions/type-conversion-functions.md index 1edebc26ccc..80f24d53515 100644 --- a/docs/ru/sql-reference/functions/type-conversion-functions.md +++ b/docs/ru/sql-reference/functions/type-conversion-functions.md @@ -746,7 +746,7 @@ SELECT parseDateTimeBestEffortUSOrNull('10.2021') AS parseDateTimeBestEffortUSOr ## parseDateTimeBestEffortUSOrZero {#parsedatetimebesteffortusorzero} -Похожа на функцию [parseDateTimeBestEffortUS](#parsedatetimebesteffortUS), но разница в том, что возвращает нулевую дату или нулевую дату со временем, если входная строка не может быть преобразована в тип данных [DateTime](../../sql-reference/data-types/datetime.md). +Работает аналогично функции [parseDateTimeBestEffortUS](#parsedatetimebesteffortUS), но в отличие от нее возвращает нулевую дату или нулевую дату со временем, если входная строка не может быть преобразована в тип данных [DateTime](../../sql-reference/data-types/datetime.md). **Синтаксис** From 03640221a84828043770dd89e9fa2011af0ed126 Mon Sep 17 00:00:00 2001 From: Dmitriy Date: Thu, 18 Feb 2021 21:33:30 +0300 Subject: [PATCH 210/381] Add the zero date MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Добавил нулевую дату. --- docs/en/sql-reference/functions/type-conversion-functions.md | 2 +- docs/ru/sql-reference/functions/type-conversion-functions.md | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/docs/en/sql-reference/functions/type-conversion-functions.md b/docs/en/sql-reference/functions/type-conversion-functions.md index 11d54790ac2..def37cef366 100644 --- a/docs/en/sql-reference/functions/type-conversion-functions.md +++ b/docs/en/sql-reference/functions/type-conversion-functions.md @@ -777,7 +777,7 @@ Result: ## parseDateTimeBestEffortUSOrZero {#parsedatetimebesteffortusorzero} -Same as [parseDateTimeBestEffortUS](#parsedatetimebesteffortUS) function except that it returns zero date or zero date with time when it encounters a date format that cannot be processed. +Same as [parseDateTimeBestEffortUS](#parsedatetimebesteffortUS) function except that it returns zero date (`1970-01-01`) or zero date with time (`1970-01-01 00:00:00`) when it encounters a date format that cannot be processed. 
**Syntax** diff --git a/docs/ru/sql-reference/functions/type-conversion-functions.md b/docs/ru/sql-reference/functions/type-conversion-functions.md index 80f24d53515..4de2b5c6e3e 100644 --- a/docs/ru/sql-reference/functions/type-conversion-functions.md +++ b/docs/ru/sql-reference/functions/type-conversion-functions.md @@ -746,7 +746,7 @@ SELECT parseDateTimeBestEffortUSOrNull('10.2021') AS parseDateTimeBestEffortUSOr ## parseDateTimeBestEffortUSOrZero {#parsedatetimebesteffortusorzero} -Работает аналогично функции [parseDateTimeBestEffortUS](#parsedatetimebesteffortUS), но в отличие от нее возвращает нулевую дату или нулевую дату со временем, если входная строка не может быть преобразована в тип данных [DateTime](../../sql-reference/data-types/datetime.md). +Работает аналогично функции [parseDateTimeBestEffortUS](#parsedatetimebesteffortUS), но в отличие от нее возвращает нулевую дату (`1970-01-01`) или нулевую дату со временем (`1970-01-01 00:00:00`), если входная строка не может быть преобразована в тип данных [DateTime](../../sql-reference/data-types/datetime.md). **Синтаксис** From b854a7b7f8e80b9701b02e5218e37965631541f7 Mon Sep 17 00:00:00 2001 From: Azat Khuzhin Date: Thu, 18 Feb 2021 21:41:50 +0300 Subject: [PATCH 211/381] Add some details into comment for first_stage/second_stage Regardless distributed_group_by_no_merge=2/optimize_distributed_group_by_sharding_key --- src/Interpreters/InterpreterSelectQuery.cpp | 14 ++++++++++++-- 1 file changed, 12 insertions(+), 2 deletions(-) diff --git a/src/Interpreters/InterpreterSelectQuery.cpp b/src/Interpreters/InterpreterSelectQuery.cpp index 9f97160f77f..9f48a9a193b 100644 --- a/src/Interpreters/InterpreterSelectQuery.cpp +++ b/src/Interpreters/InterpreterSelectQuery.cpp @@ -561,10 +561,20 @@ Block InterpreterSelectQuery::getSampleBlockImpl() if (storage && !options.only_analyze) from_stage = storage->getQueryProcessingStage(*context, options.to_stage, query_info); - /// Do I need to perform the first part of the pipeline - running on remote servers during distributed processing. + /// Do I need to perform the first part of the pipeline? + /// Running on remote servers during distributed processing or if query is not distributed. + /// + /// Also note that with distributed_group_by_no_merge=1 or when there is + /// only one remote server, it is equal to local query in terms of query + /// stages (or when due to optimize_distributed_group_by_sharding_key the query was processed up to Complete stage). bool first_stage = from_stage < QueryProcessingStage::WithMergeableState && options.to_stage >= QueryProcessingStage::WithMergeableState; - /// Do I need to execute the second part of the pipeline - running on the initiating server during distributed processing. + /// Do I need to execute the second part of the pipeline? + /// Running on the initiating server during distributed processing or if query is not distributed. + /// + /// Also note that with distributed_group_by_no_merge=2 (i.e. when optimize_distributed_group_by_sharding_key takes place) + /// the query on the remote server will be processed up to WithMergeableStateAfterAggregation, + /// So it will do partial second stage (second_stage=true), and initiator will do the final part. 
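    /// [Editor's aside, illustrative] In the classic distributed path (no distributed_group_by_no_merge)
    /// the two flags work out to:
    ///  - a purely local query: from_stage == FetchColumns, to_stage == Complete, so both flags are true;
    ///  - the remote part running on a shard: from_stage == FetchColumns, to_stage == WithMergeableState,
    ///    so only first_stage is true there;
    ///  - the initiator reading from a Distributed table: from_stage == WithMergeableState,
    ///    to_stage == Complete, so only second_stage is true.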
bool second_stage = from_stage <= QueryProcessingStage::WithMergeableState && options.to_stage > QueryProcessingStage::WithMergeableState; From af660140c320ca45bca0edfd89000b3c6da8ee6a Mon Sep 17 00:00:00 2001 From: Azat Khuzhin Date: Thu, 18 Feb 2021 21:41:50 +0300 Subject: [PATCH 212/381] Do only merging of sorted blocks on initiator with distributed_group_by_no_merge=2 When distributed_group_by_no_merge=2 is used (or when optimize_distributed_group_by_sharding_key takes place), remote servers will do full ORDER BY, so initiator can skip this step and do only merge of ordered blocks. --- src/Interpreters/InterpreterSelectQuery.cpp | 8 +++++++- ...buted_group_by_no_merge_order_by.reference | 20 +++++++++++++++++++ ...distributed_group_by_no_merge_order_by.sql | 20 +++++++++++++++++++ 3 files changed, 47 insertions(+), 1 deletion(-) create mode 100644 tests/queries/0_stateless/01730_distributed_group_by_no_merge_order_by.reference create mode 100644 tests/queries/0_stateless/01730_distributed_group_by_no_merge_order_by.sql diff --git a/src/Interpreters/InterpreterSelectQuery.cpp b/src/Interpreters/InterpreterSelectQuery.cpp index 9f48a9a193b..3008c55973d 100644 --- a/src/Interpreters/InterpreterSelectQuery.cpp +++ b/src/Interpreters/InterpreterSelectQuery.cpp @@ -1103,9 +1103,15 @@ void InterpreterSelectQuery::executeImpl(QueryPlan & query_plan, const BlockInpu /** If there is an ORDER BY for distributed query processing, * but there is no aggregation, then on the remote servers ORDER BY was made * - therefore, we merge the sorted streams from remote servers. + * + * Also in case of remote servers was process the query up to WithMergeableStateAfterAggregation + * (distributed_group_by_no_merge=2 or optimize_distributed_group_by_sharding_key=1 takes place), + * then merge the sorted streams is enough, since remote servers already did full ORDER BY. */ - if (!expressions.first_stage && !expressions.need_aggregate && !(query.group_by_with_totals && !aggregate_final)) + if (from_aggregation_stage) + executeMergeSorted(query_plan, "for ORDER BY"); + else if (!expressions.first_stage && !expressions.need_aggregate && !(query.group_by_with_totals && !aggregate_final)) executeMergeSorted(query_plan, "for ORDER BY"); else /// Otherwise, just sort. 
executeOrder(query_plan, query_info.input_order_info); diff --git a/tests/queries/0_stateless/01730_distributed_group_by_no_merge_order_by.reference b/tests/queries/0_stateless/01730_distributed_group_by_no_merge_order_by.reference new file mode 100644 index 00000000000..02ae8a37e52 --- /dev/null +++ b/tests/queries/0_stateless/01730_distributed_group_by_no_merge_order_by.reference @@ -0,0 +1,20 @@ +0 +0 +0 +0 +0 +0 +0 +0 +0 +0 +1 +1 +1 +1 +1 +1 +1 +1 +1 +1 diff --git a/tests/queries/0_stateless/01730_distributed_group_by_no_merge_order_by.sql b/tests/queries/0_stateless/01730_distributed_group_by_no_merge_order_by.sql new file mode 100644 index 00000000000..e43b81dca48 --- /dev/null +++ b/tests/queries/0_stateless/01730_distributed_group_by_no_merge_order_by.sql @@ -0,0 +1,20 @@ +drop table if exists data_01730; + +-- does not use 127.1 due to prefer_localhost_replica + +select * from remote('127.{2..11}', view(select * from numbers(1e6))) group by number order by number limit 20 settings distributed_group_by_no_merge=0, max_memory_usage='100Mi'; -- { serverError 241 } +-- no memory limit error, because with distributed_group_by_no_merge=2 remote servers will do ORDER BY and will cut to the LIMIT +select * from remote('127.{2..11}', view(select * from numbers(1e6))) group by number order by number limit 20 settings distributed_group_by_no_merge=2, max_memory_usage='100Mi'; + +-- since the MergingSortedTransform will start processing only when all ports (remotes) will have some data, +-- and the query with GROUP BY on remote servers will first do GROUP BY and then send the block, +-- so the initiator will first receive all blocks from remotes and only after start merging, +-- and will hit the memory limit. +select * from remote('127.{2..11}', view(select * from numbers(1e6))) group by number order by number limit 1e6 settings distributed_group_by_no_merge=2, max_memory_usage='100Mi'; -- { serverError 241 } + +-- with optimize_aggregation_in_order=1 remote servers will produce blocks more frequently, +-- since they don't need to wait until the aggregation will be finished, +-- and so the query will not hit the memory limit error. +create table data_01730 engine=MergeTree() order by key as select number key from numbers(1e6); +select * from remote('127.{2..11}', currentDatabase(), data_01730) group by key order by key limit 1e6 settings distributed_group_by_no_merge=2, max_memory_usage='100Mi', optimize_aggregation_in_order=1 format Null; +drop table data_01730; From 9c01869090e873603b3bb7ec1cd17fbcf264bc4f Mon Sep 17 00:00:00 2001 From: Azat Khuzhin Date: Thu, 18 Feb 2021 21:28:42 +0300 Subject: [PATCH 213/381] Fix 'Empty task was returned from async task queue' on query cancellation --- src/Processors/Executors/PipelineExecutor.cpp | 5 +++++ .../01731_async_task_queue_wait.reference | 0 .../0_stateless/01731_async_task_queue_wait.sh | 12 ++++++++++++ 3 files changed, 17 insertions(+) create mode 100644 tests/queries/0_stateless/01731_async_task_queue_wait.reference create mode 100755 tests/queries/0_stateless/01731_async_task_queue_wait.sh diff --git a/src/Processors/Executors/PipelineExecutor.cpp b/src/Processors/Executors/PipelineExecutor.cpp index 6192828784f..a724f22ed31 100644 --- a/src/Processors/Executors/PipelineExecutor.cpp +++ b/src/Processors/Executors/PipelineExecutor.cpp @@ -540,7 +540,12 @@ void PipelineExecutor::executeStepImpl(size_t thread_num, size_t num_threads, st /// If we execute in single thread, wait for async tasks here. 
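            /// [Editor's aside, not part of the patch] The fix below is the usual shutdown idiom for a
            /// blocking queue: an empty result from wait() is a logical error only while the executor is
            /// still running; once `finished` has been set by query cancellation, the empty wakeup just
            /// means "stop waiting and leave the loop" rather than "throw".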
auto res = async_task_queue.wait(lock); if (!res) + { + /// The query had been cancelled (finished is also set) + if (finished) + break; throw Exception("Empty task was returned from async task queue", ErrorCodes::LOGICAL_ERROR); + } node = static_cast(res.data); break; diff --git a/tests/queries/0_stateless/01731_async_task_queue_wait.reference b/tests/queries/0_stateless/01731_async_task_queue_wait.reference new file mode 100644 index 00000000000..e69de29bb2d diff --git a/tests/queries/0_stateless/01731_async_task_queue_wait.sh b/tests/queries/0_stateless/01731_async_task_queue_wait.sh new file mode 100755 index 00000000000..eddbfdf5322 --- /dev/null +++ b/tests/queries/0_stateless/01731_async_task_queue_wait.sh @@ -0,0 +1,12 @@ +#!/usr/bin/env bash + +CURDIR=$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd) +# shellcheck source=../shell_config.sh +. "$CURDIR"/../shell_config.sh + +# regression for 'Empty task was returned from async task queue' during query +# cancellation with async_socket_for_remote=1 (that ignores +# max_distributed_connections) +timeout 5s ${CLICKHOUSE_CLIENT} --max_distributed_connections=1 --format Null -q "select * from remote('127.{2..11}', view(select * from numbers(1e9))) group by number format Null" +# timedout +test $? -eq 124 From 7bcfe92cd7ba75f7d2ee2d58be3ec51f627a807f Mon Sep 17 00:00:00 2001 From: Azat Khuzhin Date: Thu, 18 Feb 2021 23:29:38 +0300 Subject: [PATCH 214/381] Mark 01730_distributed_group_by_no_merge_order_by as long https://clickhouse-test-reports.s3.yandex.net/20882/af660140c320ca45bca0edfd89000b3c6da8ee6a/functional_stateless_tests_flaky_check_(address).html#fail1 --- ...> 01730_distributed_group_by_no_merge_order_by_long.reference} | 0 ....sql => 01730_distributed_group_by_no_merge_order_by_long.sql} | 0 2 files changed, 0 insertions(+), 0 deletions(-) rename tests/queries/0_stateless/{01730_distributed_group_by_no_merge_order_by.reference => 01730_distributed_group_by_no_merge_order_by_long.reference} (100%) rename tests/queries/0_stateless/{01730_distributed_group_by_no_merge_order_by.sql => 01730_distributed_group_by_no_merge_order_by_long.sql} (100%) diff --git a/tests/queries/0_stateless/01730_distributed_group_by_no_merge_order_by.reference b/tests/queries/0_stateless/01730_distributed_group_by_no_merge_order_by_long.reference similarity index 100% rename from tests/queries/0_stateless/01730_distributed_group_by_no_merge_order_by.reference rename to tests/queries/0_stateless/01730_distributed_group_by_no_merge_order_by_long.reference diff --git a/tests/queries/0_stateless/01730_distributed_group_by_no_merge_order_by.sql b/tests/queries/0_stateless/01730_distributed_group_by_no_merge_order_by_long.sql similarity index 100% rename from tests/queries/0_stateless/01730_distributed_group_by_no_merge_order_by.sql rename to tests/queries/0_stateless/01730_distributed_group_by_no_merge_order_by_long.sql From ee98b2a472aa05d28d36f859eefff0d359b45910 Mon Sep 17 00:00:00 2001 From: alesapin Date: Fri, 19 Feb 2021 00:03:16 +0300 Subject: [PATCH 215/381] Better list requests --- src/Coordination/NuKeeperStorage.cpp | 37 ++++++++++++++-------------- src/Coordination/NuKeeperStorage.h | 8 +++--- 2 files changed, 24 insertions(+), 21 deletions(-) diff --git a/src/Coordination/NuKeeperStorage.cpp b/src/Coordination/NuKeeperStorage.cpp index 631f975cddc..fa57b8141a7 100644 --- a/src/Coordination/NuKeeperStorage.cpp +++ b/src/Coordination/NuKeeperStorage.cpp @@ -25,10 +25,10 @@ static String parentPath(const String & path) return "/"; } -static String 
baseName(const String & path) +static std::string_view getBaseNameView(const String & path) { - auto rslash_pos = path.rfind('/'); - return path.substr(rslash_pos + 1); + size_t basename_start = path.rfind('/'); + return std::string_view{&path[basename_start + 1], path.length() - basename_start - 1}; } static NuKeeperStorage::ResponsesForSessions processWatchesImpl(const String & path, NuKeeperStorage::Watches & watches, NuKeeperStorage::Watches & list_watches, Coordination::Event event_type) @@ -167,14 +167,17 @@ struct NuKeeperStorageCreateRequest final : public NuKeeperStorageRequest /// Increment sequential number even if node is not sequential ++it->second.seq_num; - response.path_created = path_created; - container.emplace(path_created, std::move(created_node)); + + auto [child_itr, created] = container.emplace(path_created, std::move(created_node)); + + auto child_path_view = getBaseNameView(child_itr->first); + it->second.children.insert(child_path_view); if (request.is_ephemeral) ephemerals[session_id].emplace(path_created); - undo = [&container, &ephemerals, session_id, path_created, is_ephemeral = request.is_ephemeral, parent_path = it->first] + undo = [&container, &ephemerals, session_id, path_created, is_ephemeral = request.is_ephemeral, parent_path = it->first, child_path_view] { container.erase(path_created); if (is_ephemeral) @@ -183,6 +186,7 @@ struct NuKeeperStorageCreateRequest final : public NuKeeperStorageRequest --undo_parent.stat.cversion; --undo_parent.stat.numChildren; --undo_parent.seq_num; + undo_parent.children.erase(child_path_view); }; ++it->second.stat.cversion; @@ -250,21 +254,25 @@ struct NuKeeperStorageRemoveRequest final : public NuKeeperStorageRequest if (prev_node.is_ephemeral) ephemerals[session_id].erase(request.path); - container.erase(it); + auto child_basename_view = getBaseNameView(it->first); auto & parent = container.at(parentPath(request.path)); --parent.stat.numChildren; ++parent.stat.cversion; + parent.children.erase(child_basename_view); response.error = Coordination::Error::ZOK; + container.erase(it); + undo = [prev_node, &container, &ephemerals, session_id, path = request.path] { if (prev_node.is_ephemeral) ephemerals[session_id].emplace(path); - container.emplace(path, prev_node); + auto [itr, inserted] = container.emplace(path, prev_node); auto & undo_parent = container.at(parentPath(path)); ++undo_parent.stat.numChildren; --undo_parent.stat.cversion; + undo_parent.children.insert(getBaseNameView(itr->first)); }; } @@ -370,17 +378,10 @@ struct NuKeeperStorageListRequest final : public NuKeeperStorageRequest if (path_prefix.empty()) throw DB::Exception("Logical error: path cannot be empty", ErrorCodes::LOGICAL_ERROR); - if (path_prefix.back() != '/') - path_prefix += '/'; + for (const auto & name : it->second.children) + response.names.emplace_back(name); - /// Fairly inefficient. 
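[Editor's note, not part of the patch] The loop removed just below walked the global path -> node map by prefix, so every LIST request cost time proportional to the total number of nodes in the storage; the replacement keeps a per-node set of child base names, maintained by the create/remove handlers and their undo lambdas, so LIST only touches the node's own children. A rough standalone sketch of the new shape, with invented names rather than the ClickHouse code:

    #include <algorithm>
    #include <string>
    #include <unordered_map>
    #include <unordered_set>
    #include <vector>

    struct Node
    {
        std::unordered_set<std::string> children;  // base names only, e.g. "log", "replicas"
    };

    using Container = std::unordered_map<std::string, Node>;  // full path -> node

    std::vector<std::string> listChildren(const Container & container, const std::string & path)
    {
        std::vector<std::string> names;
        auto it = container.find(path);
        if (it == container.end())
            return names;                               // the real code reports ZNONODE here
        names.assign(it->second.children.begin(), it->second.children.end());
        std::sort(names.begin(), names.end());          // unordered_set has no deterministic order
        return names;
    }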
- for (auto child_it = container.upper_bound(path_prefix); - child_it != container.end() && startsWith(child_it->first, path_prefix); - ++child_it) - { - if (parentPath(child_it->first) == request.path) - response.names.emplace_back(baseName(child_it->first)); - } + std::sort(response.names.begin(), response.names.end()); response.stat = it->second.stat; response.error = Coordination::Error::ZOK; diff --git a/src/Coordination/NuKeeperStorage.h b/src/Coordination/NuKeeperStorage.h index 20ab1982b4e..bd1fc087d09 100644 --- a/src/Coordination/NuKeeperStorage.h +++ b/src/Coordination/NuKeeperStorage.h @@ -16,6 +16,7 @@ using namespace DB; struct NuKeeperStorageRequest; using NuKeeperStorageRequestPtr = std::shared_ptr; using ResponseCallback = std::function; +using ChildrenRefSet = std::unordered_set; class NuKeeperStorage { @@ -30,6 +31,7 @@ public: bool is_sequental = false; Coordination::Stat stat{}; int32_t seq_num = 0; + ChildrenRefSet children; }; struct ResponseForSession @@ -48,9 +50,9 @@ public: using RequestsForSessions = std::vector; - using Container = std::map; - using Ephemerals = std::unordered_map>; - using SessionAndWatcher = std::unordered_map>; + using Container = std::unordered_map; + using Ephemerals = std::unordered_map>; + using SessionAndWatcher = std::unordered_map>; using SessionAndTimeout = std::unordered_map; using SessionIDs = std::vector; From 839d6f7072d6de6b71cc497027ca40715968535e Mon Sep 17 00:00:00 2001 From: alesapin Date: Fri, 19 Feb 2021 00:09:41 +0300 Subject: [PATCH 216/381] Revert "Better list requests" This reverts commit ee98b2a472aa05d28d36f859eefff0d359b45910. --- src/Coordination/NuKeeperStorage.cpp | 37 ++++++++++++++-------------- src/Coordination/NuKeeperStorage.h | 8 +++--- 2 files changed, 21 insertions(+), 24 deletions(-) diff --git a/src/Coordination/NuKeeperStorage.cpp b/src/Coordination/NuKeeperStorage.cpp index fa57b8141a7..631f975cddc 100644 --- a/src/Coordination/NuKeeperStorage.cpp +++ b/src/Coordination/NuKeeperStorage.cpp @@ -25,10 +25,10 @@ static String parentPath(const String & path) return "/"; } -static std::string_view getBaseNameView(const String & path) +static String baseName(const String & path) { - size_t basename_start = path.rfind('/'); - return std::string_view{&path[basename_start + 1], path.length() - basename_start - 1}; + auto rslash_pos = path.rfind('/'); + return path.substr(rslash_pos + 1); } static NuKeeperStorage::ResponsesForSessions processWatchesImpl(const String & path, NuKeeperStorage::Watches & watches, NuKeeperStorage::Watches & list_watches, Coordination::Event event_type) @@ -167,17 +167,14 @@ struct NuKeeperStorageCreateRequest final : public NuKeeperStorageRequest /// Increment sequential number even if node is not sequential ++it->second.seq_num; + response.path_created = path_created; - - auto [child_itr, created] = container.emplace(path_created, std::move(created_node)); - - auto child_path_view = getBaseNameView(child_itr->first); - it->second.children.insert(child_path_view); + container.emplace(path_created, std::move(created_node)); if (request.is_ephemeral) ephemerals[session_id].emplace(path_created); - undo = [&container, &ephemerals, session_id, path_created, is_ephemeral = request.is_ephemeral, parent_path = it->first, child_path_view] + undo = [&container, &ephemerals, session_id, path_created, is_ephemeral = request.is_ephemeral, parent_path = it->first] { container.erase(path_created); if (is_ephemeral) @@ -186,7 +183,6 @@ struct NuKeeperStorageCreateRequest final : public 
NuKeeperStorageRequest --undo_parent.stat.cversion; --undo_parent.stat.numChildren; --undo_parent.seq_num; - undo_parent.children.erase(child_path_view); }; ++it->second.stat.cversion; @@ -254,25 +250,21 @@ struct NuKeeperStorageRemoveRequest final : public NuKeeperStorageRequest if (prev_node.is_ephemeral) ephemerals[session_id].erase(request.path); - auto child_basename_view = getBaseNameView(it->first); + container.erase(it); auto & parent = container.at(parentPath(request.path)); --parent.stat.numChildren; ++parent.stat.cversion; - parent.children.erase(child_basename_view); response.error = Coordination::Error::ZOK; - container.erase(it); - undo = [prev_node, &container, &ephemerals, session_id, path = request.path] { if (prev_node.is_ephemeral) ephemerals[session_id].emplace(path); - auto [itr, inserted] = container.emplace(path, prev_node); + container.emplace(path, prev_node); auto & undo_parent = container.at(parentPath(path)); ++undo_parent.stat.numChildren; --undo_parent.stat.cversion; - undo_parent.children.insert(getBaseNameView(itr->first)); }; } @@ -378,10 +370,17 @@ struct NuKeeperStorageListRequest final : public NuKeeperStorageRequest if (path_prefix.empty()) throw DB::Exception("Logical error: path cannot be empty", ErrorCodes::LOGICAL_ERROR); - for (const auto & name : it->second.children) - response.names.emplace_back(name); + if (path_prefix.back() != '/') + path_prefix += '/'; - std::sort(response.names.begin(), response.names.end()); + /// Fairly inefficient. + for (auto child_it = container.upper_bound(path_prefix); + child_it != container.end() && startsWith(child_it->first, path_prefix); + ++child_it) + { + if (parentPath(child_it->first) == request.path) + response.names.emplace_back(baseName(child_it->first)); + } response.stat = it->second.stat; response.error = Coordination::Error::ZOK; diff --git a/src/Coordination/NuKeeperStorage.h b/src/Coordination/NuKeeperStorage.h index bd1fc087d09..20ab1982b4e 100644 --- a/src/Coordination/NuKeeperStorage.h +++ b/src/Coordination/NuKeeperStorage.h @@ -16,7 +16,6 @@ using namespace DB; struct NuKeeperStorageRequest; using NuKeeperStorageRequestPtr = std::shared_ptr; using ResponseCallback = std::function; -using ChildrenRefSet = std::unordered_set; class NuKeeperStorage { @@ -31,7 +30,6 @@ public: bool is_sequental = false; Coordination::Stat stat{}; int32_t seq_num = 0; - ChildrenRefSet children; }; struct ResponseForSession @@ -50,9 +48,9 @@ public: using RequestsForSessions = std::vector; - using Container = std::unordered_map; - using Ephemerals = std::unordered_map>; - using SessionAndWatcher = std::unordered_map>; + using Container = std::map; + using Ephemerals = std::unordered_map>; + using SessionAndWatcher = std::unordered_map>; using SessionAndTimeout = std::unordered_map; using SessionIDs = std::vector; From fc185e5fb73dc0ac82ab8b0b7a79518832401379 Mon Sep 17 00:00:00 2001 From: Amos Bird Date: Fri, 19 Feb 2021 11:56:24 +0800 Subject: [PATCH 217/381] Another try --- src/Server/TCPHandler.cpp | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/src/Server/TCPHandler.cpp b/src/Server/TCPHandler.cpp index 430a01bb97a..9794a86d3e3 100644 --- a/src/Server/TCPHandler.cpp +++ b/src/Server/TCPHandler.cpp @@ -1133,8 +1133,8 @@ void TCPHandler::receiveQuery() } query_context->applySettingsChanges(settings_changes); - /// Disable function name normalization it's not an initial query. 
- if (client_info.query_kind != ClientInfo::QueryKind::INITIAL_QUERY) + /// Disable function name normalization it's a secondary query. + if (client_info.query_kind == ClientInfo::QueryKind::SECONDARY_QUERY) { query_context->setSetting("normalize_function_names", Field(0)); } From 88a6d4e206c362dcafc0d8751cb2a6a450178ee8 Mon Sep 17 00:00:00 2001 From: alesapin Date: Fri, 19 Feb 2021 10:05:52 +0300 Subject: [PATCH 218/381] Revert "Revert "Better list requests"" This reverts commit 839d6f7072d6de6b71cc497027ca40715968535e. --- src/Coordination/NuKeeperStorage.cpp | 37 ++++++++++++++-------------- src/Coordination/NuKeeperStorage.h | 8 +++--- 2 files changed, 24 insertions(+), 21 deletions(-) diff --git a/src/Coordination/NuKeeperStorage.cpp b/src/Coordination/NuKeeperStorage.cpp index 631f975cddc..fa57b8141a7 100644 --- a/src/Coordination/NuKeeperStorage.cpp +++ b/src/Coordination/NuKeeperStorage.cpp @@ -25,10 +25,10 @@ static String parentPath(const String & path) return "/"; } -static String baseName(const String & path) +static std::string_view getBaseNameView(const String & path) { - auto rslash_pos = path.rfind('/'); - return path.substr(rslash_pos + 1); + size_t basename_start = path.rfind('/'); + return std::string_view{&path[basename_start + 1], path.length() - basename_start - 1}; } static NuKeeperStorage::ResponsesForSessions processWatchesImpl(const String & path, NuKeeperStorage::Watches & watches, NuKeeperStorage::Watches & list_watches, Coordination::Event event_type) @@ -167,14 +167,17 @@ struct NuKeeperStorageCreateRequest final : public NuKeeperStorageRequest /// Increment sequential number even if node is not sequential ++it->second.seq_num; - response.path_created = path_created; - container.emplace(path_created, std::move(created_node)); + + auto [child_itr, created] = container.emplace(path_created, std::move(created_node)); + + auto child_path_view = getBaseNameView(child_itr->first); + it->second.children.insert(child_path_view); if (request.is_ephemeral) ephemerals[session_id].emplace(path_created); - undo = [&container, &ephemerals, session_id, path_created, is_ephemeral = request.is_ephemeral, parent_path = it->first] + undo = [&container, &ephemerals, session_id, path_created, is_ephemeral = request.is_ephemeral, parent_path = it->first, child_path_view] { container.erase(path_created); if (is_ephemeral) @@ -183,6 +186,7 @@ struct NuKeeperStorageCreateRequest final : public NuKeeperStorageRequest --undo_parent.stat.cversion; --undo_parent.stat.numChildren; --undo_parent.seq_num; + undo_parent.children.erase(child_path_view); }; ++it->second.stat.cversion; @@ -250,21 +254,25 @@ struct NuKeeperStorageRemoveRequest final : public NuKeeperStorageRequest if (prev_node.is_ephemeral) ephemerals[session_id].erase(request.path); - container.erase(it); + auto child_basename_view = getBaseNameView(it->first); auto & parent = container.at(parentPath(request.path)); --parent.stat.numChildren; ++parent.stat.cversion; + parent.children.erase(child_basename_view); response.error = Coordination::Error::ZOK; + container.erase(it); + undo = [prev_node, &container, &ephemerals, session_id, path = request.path] { if (prev_node.is_ephemeral) ephemerals[session_id].emplace(path); - container.emplace(path, prev_node); + auto [itr, inserted] = container.emplace(path, prev_node); auto & undo_parent = container.at(parentPath(path)); ++undo_parent.stat.numChildren; --undo_parent.stat.cversion; + undo_parent.children.insert(getBaseNameView(itr->first)); }; } @@ -370,17 +378,10 @@ struct 
NuKeeperStorageListRequest final : public NuKeeperStorageRequest if (path_prefix.empty()) throw DB::Exception("Logical error: path cannot be empty", ErrorCodes::LOGICAL_ERROR); - if (path_prefix.back() != '/') - path_prefix += '/'; + for (const auto & name : it->second.children) + response.names.emplace_back(name); - /// Fairly inefficient. - for (auto child_it = container.upper_bound(path_prefix); - child_it != container.end() && startsWith(child_it->first, path_prefix); - ++child_it) - { - if (parentPath(child_it->first) == request.path) - response.names.emplace_back(baseName(child_it->first)); - } + std::sort(response.names.begin(), response.names.end()); response.stat = it->second.stat; response.error = Coordination::Error::ZOK; diff --git a/src/Coordination/NuKeeperStorage.h b/src/Coordination/NuKeeperStorage.h index 20ab1982b4e..bd1fc087d09 100644 --- a/src/Coordination/NuKeeperStorage.h +++ b/src/Coordination/NuKeeperStorage.h @@ -16,6 +16,7 @@ using namespace DB; struct NuKeeperStorageRequest; using NuKeeperStorageRequestPtr = std::shared_ptr; using ResponseCallback = std::function; +using ChildrenRefSet = std::unordered_set; class NuKeeperStorage { @@ -30,6 +31,7 @@ public: bool is_sequental = false; Coordination::Stat stat{}; int32_t seq_num = 0; + ChildrenRefSet children; }; struct ResponseForSession @@ -48,9 +50,9 @@ public: using RequestsForSessions = std::vector; - using Container = std::map; - using Ephemerals = std::unordered_map>; - using SessionAndWatcher = std::unordered_map>; + using Container = std::unordered_map; + using Ephemerals = std::unordered_map>; + using SessionAndWatcher = std::unordered_map>; using SessionAndTimeout = std::unordered_map; using SessionIDs = std::vector; From b72b13bab05fc6f90396f335471023673c98c31f Mon Sep 17 00:00:00 2001 From: alesapin Date: Fri, 19 Feb 2021 10:25:55 +0300 Subject: [PATCH 219/381] Better list performance --- src/Coordination/NuKeeperStorage.cpp | 27 +++++++++++++-------------- src/Coordination/NuKeeperStorage.h | 4 ++-- 2 files changed, 15 insertions(+), 16 deletions(-) diff --git a/src/Coordination/NuKeeperStorage.cpp b/src/Coordination/NuKeeperStorage.cpp index fa57b8141a7..bb433474dc9 100644 --- a/src/Coordination/NuKeeperStorage.cpp +++ b/src/Coordination/NuKeeperStorage.cpp @@ -25,10 +25,10 @@ static String parentPath(const String & path) return "/"; } -static std::string_view getBaseNameView(const String & path) +static std::string getBaseName(const String & path) { size_t basename_start = path.rfind('/'); - return std::string_view{&path[basename_start + 1], path.length() - basename_start - 1}; + return std::string{&path[basename_start + 1], path.length() - basename_start - 1}; } static NuKeeperStorage::ResponsesForSessions processWatchesImpl(const String & path, NuKeeperStorage::Watches & watches, NuKeeperStorage::Watches & list_watches, Coordination::Event event_type) @@ -169,15 +169,15 @@ struct NuKeeperStorageCreateRequest final : public NuKeeperStorageRequest ++it->second.seq_num; response.path_created = path_created; - auto [child_itr, created] = container.emplace(path_created, std::move(created_node)); + container.emplace(path_created, std::move(created_node)); - auto child_path_view = getBaseNameView(child_itr->first); - it->second.children.insert(child_path_view); + auto child_path = getBaseName(path_created); + it->second.children.insert(child_path); if (request.is_ephemeral) ephemerals[session_id].emplace(path_created); - undo = [&container, &ephemerals, session_id, path_created, is_ephemeral = 
request.is_ephemeral, parent_path = it->first, child_path_view] + undo = [&container, &ephemerals, session_id, path_created, is_ephemeral = request.is_ephemeral, parent_path = it->first, child_path] { container.erase(path_created); if (is_ephemeral) @@ -186,7 +186,7 @@ struct NuKeeperStorageCreateRequest final : public NuKeeperStorageRequest --undo_parent.stat.cversion; --undo_parent.stat.numChildren; --undo_parent.seq_num; - undo_parent.children.erase(child_path_view); + undo_parent.children.erase(child_path); }; ++it->second.stat.cversion; @@ -254,25 +254,25 @@ struct NuKeeperStorageRemoveRequest final : public NuKeeperStorageRequest if (prev_node.is_ephemeral) ephemerals[session_id].erase(request.path); - auto child_basename_view = getBaseNameView(it->first); + auto child_basename = getBaseName(it->first); auto & parent = container.at(parentPath(request.path)); --parent.stat.numChildren; ++parent.stat.cversion; - parent.children.erase(child_basename_view); + parent.children.erase(child_basename); response.error = Coordination::Error::ZOK; container.erase(it); - undo = [prev_node, &container, &ephemerals, session_id, path = request.path] + undo = [prev_node, &container, &ephemerals, session_id, path = request.path, child_basename] { if (prev_node.is_ephemeral) ephemerals[session_id].emplace(path); - auto [itr, inserted] = container.emplace(path, prev_node); + container.emplace(path, prev_node); auto & undo_parent = container.at(parentPath(path)); ++undo_parent.stat.numChildren; --undo_parent.stat.cversion; - undo_parent.children.insert(getBaseNameView(itr->first)); + undo_parent.children.insert(child_basename); }; } @@ -378,8 +378,7 @@ struct NuKeeperStorageListRequest final : public NuKeeperStorageRequest if (path_prefix.empty()) throw DB::Exception("Logical error: path cannot be empty", ErrorCodes::LOGICAL_ERROR); - for (const auto & name : it->second.children) - response.names.emplace_back(name); + response.names.insert(response.names.end(), it->second.children.begin(), it->second.children.end()); std::sort(response.names.begin(), response.names.end()); diff --git a/src/Coordination/NuKeeperStorage.h b/src/Coordination/NuKeeperStorage.h index bd1fc087d09..299fad4eea0 100644 --- a/src/Coordination/NuKeeperStorage.h +++ b/src/Coordination/NuKeeperStorage.h @@ -16,7 +16,7 @@ using namespace DB; struct NuKeeperStorageRequest; using NuKeeperStorageRequestPtr = std::shared_ptr; using ResponseCallback = std::function; -using ChildrenRefSet = std::unordered_set; +using ChildrenSet = std::unordered_set; class NuKeeperStorage { @@ -31,7 +31,7 @@ public: bool is_sequental = false; Coordination::Stat stat{}; int32_t seq_num = 0; - ChildrenRefSet children; + ChildrenSet children; }; struct ResponseForSession From b9d6df9618c6a1b0efcd17c66cfa22aaa023d97a Mon Sep 17 00:00:00 2001 From: Nikolai Kochetov Date: Fri, 19 Feb 2021 11:49:41 +0300 Subject: [PATCH 220/381] Check for eintr in epoll_wait --- src/Client/PacketReceiver.h | 145 ++++++++++++++++++ .../RemoteQueryExecutorReadContext.cpp | 10 +- src/Processors/Executors/PollingQueue.cpp | 7 +- 3 files changed, 156 insertions(+), 6 deletions(-) create mode 100644 src/Client/PacketReceiver.h diff --git a/src/Client/PacketReceiver.h b/src/Client/PacketReceiver.h new file mode 100644 index 00000000000..c9475bafa71 --- /dev/null +++ b/src/Client/PacketReceiver.h @@ -0,0 +1,145 @@ +#pragma once + +#if defined(OS_LINUX) + +#include +#include +#include +#include +#include + +namespace DB +{ + +/// Class for nonblocking packet receiving. 
It runs connection->receivePacket +/// in fiber and sets special read callback which is called when +/// reading from socket blocks. When read callback is called, +/// socket and receive timeout are added in epoll and execution returns to the main program. +/// So, you can poll this epoll file descriptor to determine when to resume +/// packet receiving (beside polling epoll descriptor, you also need to check connection->hasPendingData(), +/// because small packet can be read in buffer with the previous one, so new packet will be ready in buffer, +/// but there is no data socket to poll). +class PacketReceiver +{ +public: + PacketReceiver(Connection * connection_) : connection(connection_) + { + epoll.add(receive_timeout.getDescriptor()); + epoll.add(connection->getSocket()->impl()->sockfd()); + fiber = boost::context::fiber(std::allocator_arg_t(), fiber_stack, Routine{*this}); + } + + /// Resume packet receiving. + void resume() + { + /// If there is no pending data, check receive timeout. + if (!connection->hasReadPendingData() && !checkReceiveTimeout()) + return; + + fiber = std::move(fiber).resume(); + if (exception) + std::rethrow_exception(std::move(exception)); + } + + void cancel() + { + Fiber to_destroy = std::move(fiber); + connection = nullptr; + } + + Packet getPacket() { return std::move(packet); } + + int getFileDescriptor() const { return epoll.getFileDescriptor(); } + + bool isPacketReady() const { return !is_read_in_process; } + + bool isReceiveTimeoutExpired() const { return is_receive_timeout_expired; } + +private: + /// When epoll file descriptor is ready, check if it's an expired timeout + bool checkReceiveTimeout() + { + bool is_socket_ready = false; + is_receive_timeout_expired = false; + + epoll_event events[2]; + events[0].data.fd = events[1].data.fd = -1; + size_t ready_count = epoll.getManyReady(2, events, true); + + for (size_t i = 0; i != ready_count; ++i) + { + if (events[i].data.fd == connection->getSocket()->impl()->sockfd()) + is_socket_ready = true; + if (events[i].data.fd == receive_timeout.getDescriptor()) + is_receive_timeout_expired = true; + } + + if (is_receive_timeout_expired && !is_socket_ready) + { + receive_timeout.reset(); + return false; + } + + return true; + } + + struct Routine + { + PacketReceiver & receiver; + + struct ReadCallback + { + PacketReceiver & receiver; + Fiber & sink; + + void operator()(int, const Poco::Timespan & timeout, const std::string &) + { + receiver.receive_timeout.setRelative(timeout); + receiver.is_read_in_process = true; + sink = std::move(sink).resume(); + receiver.is_read_in_process = false; + receiver.receive_timeout.reset(); + } + }; + + Fiber operator()(Fiber && sink) + { + try + { + AsyncCallbackSetter async_setter(receiver.connection, ReadCallback{receiver, sink}); + while (true) + { + receiver.packet = receiver.connection->receivePacket(); + sink = std::move(sink).resume(); + } + + } + catch (const boost::context::detail::forced_unwind &) + { + /// This exception is thrown by fiber implementation in case if fiber is being deleted but hasn't exited + /// It should not be caught or it will segfault. + /// Other exceptions must be caught + throw; + } + catch (...) 
+ { + receiver.exception = std::current_exception(); + } + + return std::move(sink); + } + }; + + Connection * connection; + TimerDescriptor receive_timeout; + Epoll epoll; + Fiber fiber; + FiberStack fiber_stack; + Packet packet; + bool is_read_in_process = false; + bool is_receive_timeout_expired = false; + std::exception_ptr exception; +}; + +} +#endif diff --git a/src/DataStreams/RemoteQueryExecutorReadContext.cpp b/src/DataStreams/RemoteQueryExecutorReadContext.cpp index bc47b049407..c79fffafcb1 100644 --- a/src/DataStreams/RemoteQueryExecutorReadContext.cpp +++ b/src/DataStreams/RemoteQueryExecutorReadContext.cpp @@ -146,9 +146,13 @@ bool RemoteQueryExecutorReadContext::checkTimeoutImpl() const events[0].data.fd = events[1].data.fd = events[2].data.fd = -1; /// Wait for epoll_fd will not block if it was polled externally. - int num_events = epoll_wait(epoll_fd, events, 3, 0); - if (num_events == -1) - throwFromErrno("Failed to epoll_wait", ErrorCodes::CANNOT_READ_FROM_SOCKET); + int num_events = 0; + while (num_events <= 0) + { + num_events = epoll_wait(epoll_fd, events, 3, 0); + if (num_events == -1 && errno != EINTR) + throwFromErrno("Failed to epoll_wait", ErrorCodes::CANNOT_READ_FROM_SOCKET); + } bool is_socket_ready = false; bool is_pipe_alarmed = false; diff --git a/src/Processors/Executors/PollingQueue.cpp b/src/Processors/Executors/PollingQueue.cpp index 93edfe53987..b9c7bdade2d 100644 --- a/src/Processors/Executors/PollingQueue.cpp +++ b/src/Processors/Executors/PollingQueue.cpp @@ -88,11 +88,12 @@ PollingQueue::TaskData PollingQueue::wait(std::unique_lock & lock) event.data.ptr = nullptr; int num_events = 0; - while (num_events == 0) + while (num_events <= 0) { num_events = epoll_wait(epoll_fd, &event, 1, 0); - if (num_events == -1) - throwFromErrno("Failed to epoll_wait", ErrorCodes::CANNOT_READ_FROM_SOCKET); + + if (num_events == -1 && errno != EINTR) + throwFromErrno("Failed to epoll_wait", ErrorCodes::CANNOT_READ_FROM_SOCKET); } lock.lock(); From 7d1119680e7881af7f5934773721cb48f40b35e7 Mon Sep 17 00:00:00 2001 From: Nikolai Kochetov Date: Fri, 19 Feb 2021 11:52:33 +0300 Subject: [PATCH 221/381] Remove not needed file. --- src/Client/PacketReceiver.h | 145 ------------------------------------ 1 file changed, 145 deletions(-) delete mode 100644 src/Client/PacketReceiver.h diff --git a/src/Client/PacketReceiver.h b/src/Client/PacketReceiver.h deleted file mode 100644 index c9475bafa71..00000000000 --- a/src/Client/PacketReceiver.h +++ /dev/null @@ -1,145 +0,0 @@ -#pragma once - -#if defined(OS_LINUX) - -#include -#include -#include -#include -#include - -namespace DB -{ - -/// Class for nonblocking packet receiving. It runs connection->receivePacket -/// in fiber and sets special read callback which is called when -/// reading from socket blocks. When read callback is called, -/// socket and receive timeout are added in epoll and execution returns to the main program. -/// So, you can poll this epoll file descriptor to determine when to resume -/// packet receiving (beside polling epoll descriptor, you also need to check connection->hasPendingData(), -/// because small packet can be read in buffer with the previous one, so new packet will be ready in buffer, -/// but there is no data socket to poll). 
-class PacketReceiver -{ -public: - PacketReceiver(Connection * connection_) : connection(connection_) - { - epoll.add(receive_timeout.getDescriptor()); - epoll.add(connection->getSocket()->impl()->sockfd()); - fiber = boost::context::fiber(std::allocator_arg_t(), fiber_stack, Routine{*this}); - } - - /// Resume packet receiving. - void resume() - { - /// If there is no pending data, check receive timeout. - if (!connection->hasReadPendingData() && !checkReceiveTimeout()) - return; - - fiber = std::move(fiber).resume(); - if (exception) - std::rethrow_exception(std::move(exception)); - } - - void cancel() - { - Fiber to_destroy = std::move(fiber); - connection = nullptr; - } - - Packet getPacket() { return std::move(packet); } - - int getFileDescriptor() const { return epoll.getFileDescriptor(); } - - bool isPacketReady() const { return !is_read_in_process; } - - bool isReceiveTimeoutExpired() const { return is_receive_timeout_expired; } - -private: - /// When epoll file descriptor is ready, check if it's an expired timeout - bool checkReceiveTimeout() - { - bool is_socket_ready = false; - is_receive_timeout_expired = false; - - epoll_event events[2]; - events[0].data.fd = events[1].data.fd = -1; - size_t ready_count = epoll.getManyReady(2, events, true); - - for (size_t i = 0; i != ready_count; ++i) - { - if (events[i].data.fd == connection->getSocket()->impl()->sockfd()) - is_socket_ready = true; - if (events[i].data.fd == receive_timeout.getDescriptor()) - is_receive_timeout_expired = true; - } - - if (is_receive_timeout_expired && !is_socket_ready) - { - receive_timeout.reset(); - return false; - } - - return true; - } - - struct Routine - { - PacketReceiver & receiver; - - struct ReadCallback - { - PacketReceiver & receiver; - Fiber & sink; - - void operator()(int, const Poco::Timespan & timeout, const std::string &) - { - receiver.receive_timeout.setRelative(timeout); - receiver.is_read_in_process = true; - sink = std::move(sink).resume(); - receiver.is_read_in_process = false; - receiver.receive_timeout.reset(); - } - }; - - Fiber operator()(Fiber && sink) - { - try - { - AsyncCallbackSetter async_setter(receiver.connection, ReadCallback{receiver, sink}); - while (true) - { - receiver.packet = receiver.connection->receivePacket(); - sink = std::move(sink).resume(); - } - - } - catch (const boost::context::detail::forced_unwind &) - { - /// This exception is thrown by fiber implementation in case if fiber is being deleted but hasn't exited - /// It should not be caught or it will segfault. - /// Other exceptions must be caught - throw; - } - catch (...) 
- { - receiver.exception = std::current_exception(); - } - - return std::move(sink); - } - }; - - Connection * connection; - TimerDescriptor receive_timeout; - Epoll epoll; - Fiber fiber; - FiberStack fiber_stack; - Packet packet; - bool is_read_in_process = false; - bool is_receive_timeout_expired = false; - std::exception_ptr exception; -}; - -} -#endif From 39f07d62a42288b83f8c5e46e026ebf9d051601d Mon Sep 17 00:00:00 2001 From: Alexey Milovidov Date: Fri, 19 Feb 2021 12:02:18 +0300 Subject: [PATCH 222/381] Disable in-memory compression by default --- src/Storages/MemorySettings.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/Storages/MemorySettings.h b/src/Storages/MemorySettings.h index 4a1ba57475f..5e3b5f81ba5 100644 --- a/src/Storages/MemorySettings.h +++ b/src/Storages/MemorySettings.h @@ -9,7 +9,7 @@ class ASTStorage; #define MEMORY_SETTINGS(M) \ - M(Bool, compress, true, "Compress data in memory", 0) \ + M(Bool, compress, false, "Compress data in memory", 0) \ DECLARE_SETTINGS_TRAITS(memorySettingsTraits, MEMORY_SETTINGS) From d438d7e390648d6be1c9718b58a18389d4d68650 Mon Sep 17 00:00:00 2001 From: Nikolai Kochetov Date: Fri, 19 Feb 2021 12:07:34 +0300 Subject: [PATCH 223/381] Fix timeout in epoll_wait for PollingQueue --- src/Processors/Executors/PollingQueue.cpp | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/Processors/Executors/PollingQueue.cpp b/src/Processors/Executors/PollingQueue.cpp index b9c7bdade2d..3636fa82f73 100644 --- a/src/Processors/Executors/PollingQueue.cpp +++ b/src/Processors/Executors/PollingQueue.cpp @@ -90,7 +90,7 @@ PollingQueue::TaskData PollingQueue::wait(std::unique_lock & lock) while (num_events <= 0) { - num_events = epoll_wait(epoll_fd, &event, 1, 0); + num_events = epoll_wait(epoll_fd, &event, 1, -1); if (num_events == -1 && errno != EINTR) throwFromErrno("Failed to epoll_wait", ErrorCodes::CANNOT_READ_FROM_SOCKET); From ed4697cffc83c3b4c34d11189e9e300c969da618 Mon Sep 17 00:00:00 2001 From: Nikolai Kochetov Date: Fri, 19 Feb 2021 12:20:24 +0300 Subject: [PATCH 224/381] Fix timeout in epoll_wait for RemoteQueryExecutorReadContext --- src/DataStreams/RemoteQueryExecutorReadContext.cpp | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/DataStreams/RemoteQueryExecutorReadContext.cpp b/src/DataStreams/RemoteQueryExecutorReadContext.cpp index c79fffafcb1..3cc24ad5056 100644 --- a/src/DataStreams/RemoteQueryExecutorReadContext.cpp +++ b/src/DataStreams/RemoteQueryExecutorReadContext.cpp @@ -149,7 +149,7 @@ bool RemoteQueryExecutorReadContext::checkTimeoutImpl() const int num_events = 0; while (num_events <= 0) { - num_events = epoll_wait(epoll_fd, events, 3, 0); + num_events = epoll_wait(epoll_fd, events, 3, -1); if (num_events == -1 && errno != EINTR) throwFromErrno("Failed to epoll_wait", ErrorCodes::CANNOT_READ_FROM_SOCKET); } From 86a74ca6b5cd3618d574431d0c94a44ebac93baf Mon Sep 17 00:00:00 2001 From: alesapin Date: Fri, 19 Feb 2021 12:24:50 +0300 Subject: [PATCH 225/381] Fix size deserialization --- src/Coordination/NuKeeperStorage.h | 2 +- src/Coordination/NuKeeperStorageSerializer.cpp | 5 ++++- 2 files changed, 5 insertions(+), 2 deletions(-) diff --git a/src/Coordination/NuKeeperStorage.h b/src/Coordination/NuKeeperStorage.h index 299fad4eea0..1a2e6202bf0 100644 --- a/src/Coordination/NuKeeperStorage.h +++ b/src/Coordination/NuKeeperStorage.h @@ -31,7 +31,7 @@ public: bool is_sequental = false; Coordination::Stat stat{}; int32_t seq_num = 0; - ChildrenSet children; + ChildrenSet 
children{}; }; struct ResponseForSession diff --git a/src/Coordination/NuKeeperStorageSerializer.cpp b/src/Coordination/NuKeeperStorageSerializer.cpp index 298df45cde0..c29d0d1f1fa 100644 --- a/src/Coordination/NuKeeperStorageSerializer.cpp +++ b/src/Coordination/NuKeeperStorageSerializer.cpp @@ -59,13 +59,16 @@ void NuKeeperStorageSerializer::deserialize(NuKeeperStorage & storage, ReadBuffe size_t container_size; Coordination::read(container_size, in); - while (storage.container.size() < container_size) + + size_t current_size = 0; + while (current_size < container_size) { std::string path; Coordination::read(path, in); NuKeeperStorage::Node node; readNode(node, in); storage.container[path] = node; + current_size++; } size_t ephemerals_size; Coordination::read(ephemerals_size, in); From fc1885ea9b01714290fba8ee8fbbe1a78894e573 Mon Sep 17 00:00:00 2001 From: Amos Bird Date: Fri, 19 Feb 2021 17:28:01 +0800 Subject: [PATCH 226/381] Try fixing flaky tests --- tests/queries/0_stateless/00643_cast_zookeeper.sql | 2 ++ .../queries/0_stateless/01656_test_query_log_factories_info.sql | 2 ++ 2 files changed, 4 insertions(+) diff --git a/tests/queries/0_stateless/00643_cast_zookeeper.sql b/tests/queries/0_stateless/00643_cast_zookeeper.sql index c52d44bd88b..c9760f00ca7 100644 --- a/tests/queries/0_stateless/00643_cast_zookeeper.sql +++ b/tests/queries/0_stateless/00643_cast_zookeeper.sql @@ -1,3 +1,5 @@ +SET database_atomic_wait_for_drop_and_detach_synchronously=1; + DROP TABLE IF EXISTS cast1; DROP TABLE IF EXISTS cast2; diff --git a/tests/queries/0_stateless/01656_test_query_log_factories_info.sql b/tests/queries/0_stateless/01656_test_query_log_factories_info.sql index 9f374def8b5..17657cf60f5 100644 --- a/tests/queries/0_stateless/01656_test_query_log_factories_info.sql +++ b/tests/queries/0_stateless/01656_test_query_log_factories_info.sql @@ -1,3 +1,5 @@ +SET database_atomic_wait_for_drop_and_detach_synchronously=1; + SELECT uniqArray([1, 1, 2]), SUBSTRING('Hello, world', 7, 5), flatten([[[BIT_AND(123)]], [[mod(3, 2)], [CAST('1' AS INTEGER)]]]), From 6c9322bb2e779067d005879592157b5dba5074ac Mon Sep 17 00:00:00 2001 From: alesapin Date: Fri, 19 Feb 2021 12:57:39 +0300 Subject: [PATCH 227/381] Sane constant while reading requests --- src/Server/NuKeeperTCPHandler.cpp | 5 +++++ 1 file changed, 5 insertions(+) diff --git a/src/Server/NuKeeperTCPHandler.cpp b/src/Server/NuKeeperTCPHandler.cpp index e855e2c68f7..f25ca4a42ce 100644 --- a/src/Server/NuKeeperTCPHandler.cpp +++ b/src/Server/NuKeeperTCPHandler.cpp @@ -342,6 +342,7 @@ void NuKeeperTCPHandler::runImpl() PollResult result = poll_wrapper->poll(session_timeout); if (result.has_requests && !close_received) { + size_t requests_read = 0; do { auto [received_op, received_xid] = receiveRequest(); @@ -358,6 +359,10 @@ void NuKeeperTCPHandler::runImpl() LOG_TRACE(log, "Received heartbeat for session #{}", session_id); session_stopwatch.restart(); } + + if (requests_read > 50) + break; + requests_read++; } while (in->available()); } From 8f8a4f64235e6df11717fb9cb91be55c0673b3f5 Mon Sep 17 00:00:00 2001 From: Nikolai Kochetov Date: Fri, 19 Feb 2021 13:59:38 +0300 Subject: [PATCH 228/381] Update 01731_async_task_queue_wait.sh --- tests/queries/0_stateless/01731_async_task_queue_wait.sh | 4 +--- 1 file changed, 1 insertion(+), 3 deletions(-) diff --git a/tests/queries/0_stateless/01731_async_task_queue_wait.sh b/tests/queries/0_stateless/01731_async_task_queue_wait.sh index eddbfdf5322..7545ad1e81a 100755 --- 
a/tests/queries/0_stateless/01731_async_task_queue_wait.sh +++ b/tests/queries/0_stateless/01731_async_task_queue_wait.sh @@ -7,6 +7,4 @@ CURDIR=$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd) # regression for 'Empty task was returned from async task queue' during query # cancellation with async_socket_for_remote=1 (that ignores # max_distributed_connections) -timeout 5s ${CLICKHOUSE_CLIENT} --max_distributed_connections=1 --format Null -q "select * from remote('127.{2..11}', view(select * from numbers(1e9))) group by number format Null" -# timedout -test $? -eq 124 +$(timeout --signal=SIGINT 1 clickhouse client --max_distributed_connections=1 --max_block_size=2 --interactive_delay=900000 -q "select x from remote('127.{2,3}', view(select number + sleep(0.3) as x from numbers(16))) settings max_block_size = 2") 2>&1 | grep "Empty task was returned from async task queue" From 3d954c43142b28c0643b504a7f4d9333142b3fe0 Mon Sep 17 00:00:00 2001 From: alesapin Date: Fri, 19 Feb 2021 14:06:17 +0300 Subject: [PATCH 229/381] Better request/response logic --- src/Server/NuKeeperTCPHandler.cpp | 70 ++++++++++++++----------------- 1 file changed, 32 insertions(+), 38 deletions(-) diff --git a/src/Server/NuKeeperTCPHandler.cpp b/src/Server/NuKeeperTCPHandler.cpp index f25ca4a42ce..081821504d3 100644 --- a/src/Server/NuKeeperTCPHandler.cpp +++ b/src/Server/NuKeeperTCPHandler.cpp @@ -40,7 +40,7 @@ namespace ErrorCodes struct PollResult { - size_t ready_responses_count{0}; + bool has_response{false}; bool has_requests{false}; bool error{false}; }; @@ -92,8 +92,22 @@ struct SocketInterruptablePollWrapper return pipe.fds_rw[1]; } - PollResult poll(Poco::Timespan remaining_time) + PollResult poll(Poco::Timespan remaining_time, const std::shared_ptr & in) { + PollResult result{}; + if (response_in.available() != 0) + { + UInt8 dummy; + readIntBinary(dummy, response_in); + result.has_response = true; + } + + if (in->available() != 0) + result.has_requests = true; + + if (result.has_response) + return result; + std::array outputs = {-1, -1}; #if defined(POCO_HAVE_FD_EPOLL) int rc; @@ -148,7 +162,6 @@ struct SocketInterruptablePollWrapper outputs[1] = pipe.fds_rw[0]; #endif - PollResult result{}; if (rc < 0) { result.error = true; @@ -169,16 +182,8 @@ struct SocketInterruptablePollWrapper else { UInt8 dummy; - do - { - /// All ready responses stored in responses queue, - /// but we have to count amount of ready responses in pipe - /// and process them only. Otherwise states of response_in - /// and response queue will be inconsistent and race condition is possible. 
- readIntBinary(dummy, response_in); - result.ready_responses_count++; - } - while (response_in.available()); + readIntBinary(dummy, response_in); + result.has_response = true; } } } @@ -339,42 +344,32 @@ void NuKeeperTCPHandler::runImpl() { using namespace std::chrono_literals; - PollResult result = poll_wrapper->poll(session_timeout); + PollResult result = poll_wrapper->poll(session_timeout, in); if (result.has_requests && !close_received) { - size_t requests_read = 0; - do + auto [received_op, received_xid] = receiveRequest(); + + if (received_op == Coordination::OpNum::Close) { - auto [received_op, received_xid] = receiveRequest(); - - if (received_op == Coordination::OpNum::Close) - { - LOG_DEBUG(log, "Received close event with xid {} for session id #{}", received_xid, session_id); - close_xid = received_xid; - close_received = true; - break; - } - else if (received_op == Coordination::OpNum::Heartbeat) - { - LOG_TRACE(log, "Received heartbeat for session #{}", session_id); - session_stopwatch.restart(); - } - - if (requests_read > 50) - break; - requests_read++; + LOG_DEBUG(log, "Received close event with xid {} for session id #{}", received_xid, session_id); + close_xid = received_xid; + close_received = true; + } + else if (received_op == Coordination::OpNum::Heartbeat) + { + LOG_TRACE(log, "Received heartbeat for session #{}", session_id); + session_stopwatch.restart(); } - while (in->available()); } /// Process exact amount of responses from pipe /// otherwise state of responses queue and signaling pipe /// became inconsistent and race condition is possible. - while (result.ready_responses_count != 0) + if (result.has_response) { Coordination::ZooKeeperResponsePtr response; if (!responses->tryPop(response)) - throw Exception(ErrorCodes::LOGICAL_ERROR, "We must have at least {} ready responses, but queue is empty. It's a bug.", result.ready_responses_count); + throw Exception(ErrorCodes::LOGICAL_ERROR, "We must have ready response, but queue is empty. 
It's a bug."); if (response->xid == close_xid) { @@ -388,7 +383,6 @@ void NuKeeperTCPHandler::runImpl() nu_keeper_storage_dispatcher->finishSession(session_id); return; } - result.ready_responses_count--; } if (result.error) From df1cf481cf118283c4d9b6afc6eaa419c5834d71 Mon Sep 17 00:00:00 2001 From: Nikolai Kochetov Date: Fri, 19 Feb 2021 14:14:31 +0300 Subject: [PATCH 230/381] Update 01731_async_task_queue_wait.sh --- tests/queries/0_stateless/01731_async_task_queue_wait.sh | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/tests/queries/0_stateless/01731_async_task_queue_wait.sh b/tests/queries/0_stateless/01731_async_task_queue_wait.sh index 7545ad1e81a..936f850791d 100755 --- a/tests/queries/0_stateless/01731_async_task_queue_wait.sh +++ b/tests/queries/0_stateless/01731_async_task_queue_wait.sh @@ -7,4 +7,4 @@ CURDIR=$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd) # regression for 'Empty task was returned from async task queue' during query # cancellation with async_socket_for_remote=1 (that ignores # max_distributed_connections) -$(timeout --signal=SIGINT 1 clickhouse client --max_distributed_connections=1 --max_block_size=2 --interactive_delay=900000 -q "select x from remote('127.{2,3}', view(select number + sleep(0.3) as x from numbers(16))) settings max_block_size = 2") 2>&1 | grep "Empty task was returned from async task queue" +$(timeout --signal=SIGINT 1 ${CLICKHOUSE_CLIENT} --max_distributed_connections=1 --max_block_size=2 --interactive_delay=900000 -q "select x from remote('127.{2,3}', view(select number + sleep(0.3) as x from numbers(16))) settings max_block_size = 2") 2>&1 | grep "Empty task was returned from async task queue" From 866dfaec793f764dc9ba167d3ac9f6521b9b3381 Mon Sep 17 00:00:00 2001 From: Nikolai Kochetov Date: Fri, 19 Feb 2021 15:25:22 +0300 Subject: [PATCH 231/381] Update 01731_async_task_queue_wait.sh --- tests/queries/0_stateless/01731_async_task_queue_wait.sh | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/tests/queries/0_stateless/01731_async_task_queue_wait.sh b/tests/queries/0_stateless/01731_async_task_queue_wait.sh index 936f850791d..89d8b63d745 100755 --- a/tests/queries/0_stateless/01731_async_task_queue_wait.sh +++ b/tests/queries/0_stateless/01731_async_task_queue_wait.sh @@ -7,4 +7,4 @@ CURDIR=$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd) # regression for 'Empty task was returned from async task queue' during query # cancellation with async_socket_for_remote=1 (that ignores # max_distributed_connections) -$(timeout --signal=SIGINT 1 ${CLICKHOUSE_CLIENT} --max_distributed_connections=1 --max_block_size=2 --interactive_delay=900000 -q "select x from remote('127.{2,3}', view(select number + sleep(0.3) as x from numbers(16))) settings max_block_size = 2") 2>&1 | grep "Empty task was returned from async task queue" +$(timeout --signal=SIGINT 1 ${CLICKHOUSE_CLIENT} --max_distributed_connections=1 --max_block_size=2 --interactive_delay=900000 -q "select x from remote('127.{2,3}', view(select number + sleep(0.3) as x from numbers(16))) settings max_block_size = 2") 2>&1 | grep "Empty task was returned from async task queue" || true From 5c9420c0779c648db5a42ecbb8f6db43cb98a76d Mon Sep 17 00:00:00 2001 From: alesapin Date: Fri, 19 Feb 2021 15:56:51 +0300 Subject: [PATCH 232/381] More correct epoll usage --- src/Server/NuKeeperTCPHandler.cpp | 46 ++++++++++++++++--------------- 1 file changed, 24 insertions(+), 22 deletions(-) diff --git a/src/Server/NuKeeperTCPHandler.cpp b/src/Server/NuKeeperTCPHandler.cpp index 
081821504d3..92c7f4b968f 100644 --- a/src/Server/NuKeeperTCPHandler.cpp +++ b/src/Server/NuKeeperTCPHandler.cpp @@ -70,14 +70,14 @@ struct SocketInterruptablePollWrapper if (epollfd < 0) throwFromErrno("Cannot epoll_create", ErrorCodes::SYSTEM_ERROR); - socket_event.events = EPOLLIN | EPOLLERR; + socket_event.events = EPOLLIN | EPOLLERR | EPOLLPRI; socket_event.data.fd = sockfd; if (epoll_ctl(epollfd, EPOLL_CTL_ADD, sockfd, &socket_event) < 0) { ::close(epollfd); throwFromErrno("Cannot insert socket into epoll queue", ErrorCodes::SYSTEM_ERROR); } - pipe_event.events = EPOLLIN | EPOLLERR; + pipe_event.events = EPOLLIN | EPOLLERR | EPOLLPRI; pipe_event.data.fd = pipe.fds_rw[0]; if (epoll_ctl(epollfd, EPOLL_CTL_ADD, pipe.fds_rw[0], &pipe_event) < 0) { @@ -108,11 +108,12 @@ struct SocketInterruptablePollWrapper if (result.has_response) return result; - std::array outputs = {-1, -1}; + bool socket_ready = false; + bool fd_ready = false; #if defined(POCO_HAVE_FD_EPOLL) int rc; epoll_event evout[2]; - memset(evout, 0, sizeof(evout)); + evout[0].data.fd = evout[1].data.fd = -1; do { Poco::Timestamp start; @@ -129,10 +130,13 @@ struct SocketInterruptablePollWrapper } while (rc < 0 && errno == EINTR); - if (rc >= 1 && evout[0].events & EPOLLIN) - outputs[0] = evout[0].data.fd; - if (rc == 2 && evout[1].events & EPOLLIN) - outputs[1] = evout[1].data.fd; + for (int i = 0; i < rc; ++i) + { + if (evout[i].data.fd == sockfd) + socket_ready = true; + if (evout[i].data.fd == pipe.fds_rw[0]) + fd_ready = true; + } #else pollfd poll_buf[2]; poll_buf[0].fd = sockfd; @@ -156,10 +160,11 @@ struct SocketInterruptablePollWrapper } } while (rc < 0 && errno == POCO_EINTR); + if (rc >= 1 && poll_buf[0].revents & POLLIN) - outputs[0] = sockfd; + socket_ready = true; if (rc == 2 && poll_buf[1].revents & POLLIN) - outputs[1] = pipe.fds_rw[0]; + fd_ready = true; #endif if (rc < 0) @@ -173,19 +178,15 @@ struct SocketInterruptablePollWrapper } else { - for (auto fd : outputs) + if (socket_ready) { - if (fd != -1) - { - if (fd == sockfd) - result.has_requests = true; - else - { - UInt8 dummy; - readIntBinary(dummy, response_in); - result.has_response = true; - } - } + result.has_requests = true; + } + if (fd_ready) + { + UInt8 dummy; + readIntBinary(dummy, response_in); + result.has_response = true; } } return result; @@ -368,6 +369,7 @@ void NuKeeperTCPHandler::runImpl() if (result.has_response) { Coordination::ZooKeeperResponsePtr response; + if (!responses->tryPop(response)) throw Exception(ErrorCodes::LOGICAL_ERROR, "We must have ready response, but queue is empty. It's a bug."); From 068c9cfbf7a58dd7e624b3d1557ccdbaf227bf34 Mon Sep 17 00:00:00 2001 From: alesapin Date: Fri, 19 Feb 2021 17:13:29 +0300 Subject: [PATCH 233/381] Fix logs level --- docker/test/fasttest/run.sh | 1 + src/Coordination/LoggerWrapper.h | 32 ++++++++++++++++++++++++-------- 2 files changed, 25 insertions(+), 8 deletions(-) diff --git a/docker/test/fasttest/run.sh b/docker/test/fasttest/run.sh index 53a0de21d5b..0ace1cd39da 100755 --- a/docker/test/fasttest/run.sh +++ b/docker/test/fasttest/run.sh @@ -70,6 +70,7 @@ function start_server --path "$FASTTEST_DATA" --user_files_path "$FASTTEST_DATA/user_files" --top_level_domains_path "$FASTTEST_DATA/top_level_domains" + --test_keeper_server.log_storage_path "$FASTTEST_DATA/coordination" ) clickhouse-server "${opts[@]}" &>> "$FASTTEST_OUTPUT/server.log" & server_pid=$! 
diff --git a/src/Coordination/LoggerWrapper.h b/src/Coordination/LoggerWrapper.h index 755b72c06cc..25a1969d2e9 100644 --- a/src/Coordination/LoggerWrapper.h +++ b/src/Coordination/LoggerWrapper.h @@ -9,12 +9,26 @@ namespace DB class LoggerWrapper : public nuraft::logger { +private: + + static inline const std::unordered_map LEVELS = + { + {LogsLevel::trace, Poco::Message::Priority::PRIO_TRACE}, + {LogsLevel::debug, Poco::Message::Priority::PRIO_DEBUG}, + {LogsLevel::information, Poco::Message::PRIO_INFORMATION}, + {LogsLevel::warning, Poco::Message::PRIO_WARNING}, + {LogsLevel::error, Poco::Message::PRIO_ERROR}, + {LogsLevel::fatal, Poco::Message::PRIO_FATAL} + }; + static inline const int LEVEL_MAX = static_cast(LogsLevel::trace); + static inline const int LEVEL_MIN = static_cast(LogsLevel::none); + public: LoggerWrapper(const std::string & name, LogsLevel level_) : log(&Poco::Logger::get(name)) - , level(static_cast(level_)) + , level(level_) { - log->setLevel(level); + log->setLevel(static_cast(LEVELS.at(level))); } void put_details( @@ -24,24 +38,26 @@ public: size_t /* line_number */, const std::string & msg) override { - LOG_IMPL(log, static_cast(level_), static_cast(level_), msg); + LogsLevel db_level = static_cast(level_); + LOG_IMPL(log, db_level, LEVELS.at(db_level), msg); } void set_level(int level_) override { - level_ = std::min(6, std::max(1, level_)); - log->setLevel(level_); - level = level_; + level_ = std::min(LEVEL_MAX, std::max(LEVEL_MIN, level_)); + level = static_cast(level_); + log->setLevel(static_cast(LEVELS.at(level))); } int get_level() override { - return level; + LogsLevel lvl = level; + return static_cast(lvl); } private: Poco::Logger * log; - std::atomic level; + std::atomic level; }; } From 12d05c27922eb1010eaede6fdf891995240dc644 Mon Sep 17 00:00:00 2001 From: alesapin Date: Fri, 19 Feb 2021 17:47:53 +0300 Subject: [PATCH 234/381] Better startup --- contrib/NuRaft | 2 +- src/Coordination/NuKeeperServer.cpp | 9 +-------- 2 files changed, 2 insertions(+), 9 deletions(-) diff --git a/contrib/NuRaft b/contrib/NuRaft index 7adf7ae33e7..c250d5ad58c 160000 --- a/contrib/NuRaft +++ b/contrib/NuRaft @@ -1 +1 @@ -Subproject commit 7adf7ae33e7d5c307342431b577c8ab1025ee793 +Subproject commit c250d5ad58c82e751264df40a94da682a2fc3519 diff --git a/src/Coordination/NuKeeperServer.cpp b/src/Coordination/NuKeeperServer.cpp index 8556fa85231..c2917e3ab76 100644 --- a/src/Coordination/NuKeeperServer.cpp +++ b/src/Coordination/NuKeeperServer.cpp @@ -161,7 +161,7 @@ bool NuKeeperServer::isLeaderAlive() const nuraft::cb_func::ReturnCode NuKeeperServer::callbackFunc(nuraft::cb_func::Type type, nuraft::cb_func::Param * /* param */) { - if (type == nuraft::cb_func::Type::BecomeFresh || type == nuraft::cb_func::Type::BecomeLeader) + if ((type == nuraft::cb_func::InitialBatchCommited && isLeader()) || type == nuraft::cb_func::BecomeFresh) { std::unique_lock lock(initialized_mutex); initialized_flag = true; @@ -176,13 +176,6 @@ void NuKeeperServer::waitInit() int64_t timeout = coordination_settings->startup_timeout.totalMilliseconds(); if (!initialized_cv.wait_for(lock, std::chrono::milliseconds(timeout), [&] { return initialized_flag; })) throw Exception(ErrorCodes::RAFT_ERROR, "Failed to wait RAFT initialization"); - - /// TODO FIXME somehow - while (isLeader() && raft_instance->get_committed_log_idx() != raft_instance->get_last_log_idx()) - { - LOG_WARNING(&Poco::Logger::get("NuKeeperServer"), "Loading from log store {}/{}", raft_instance->get_committed_log_idx(), 
raft_instance->get_last_log_idx()); - std::this_thread::sleep_for(std::chrono::milliseconds(100)); - } } std::unordered_set NuKeeperServer::getDeadSessions() From ad374ec0953926af32227aea9744fc9c09da65ca Mon Sep 17 00:00:00 2001 From: alesapin Date: Fri, 19 Feb 2021 19:05:26 +0300 Subject: [PATCH 235/381] Rename file --- src/Coordination/NuKeeperServer.cpp | 4 ++-- src/Coordination/NuKeeperServer.h | 4 ++-- ...ryStateManager.cpp => NuKeeperStateManager.cpp} | 14 +++++++------- ...MemoryStateManager.h => NuKeeperStateManager.h} | 6 +++--- src/Coordination/tests/gtest_for_build.cpp | 6 +++--- 5 files changed, 17 insertions(+), 17 deletions(-) rename src/Coordination/{InMemoryStateManager.cpp => NuKeeperStateManager.cpp} (88%) rename src/Coordination/{InMemoryStateManager.h => NuKeeperStateManager.h} (94%) diff --git a/src/Coordination/NuKeeperServer.cpp b/src/Coordination/NuKeeperServer.cpp index c2917e3ab76..c0dc3f85343 100644 --- a/src/Coordination/NuKeeperServer.cpp +++ b/src/Coordination/NuKeeperServer.cpp @@ -1,7 +1,7 @@ #include #include #include -#include +#include #include #include #include @@ -26,7 +26,7 @@ NuKeeperServer::NuKeeperServer( : server_id(server_id_) , coordination_settings(coordination_settings_) , state_machine(nuraft::cs_new(responses_queue_, coordination_settings)) - , state_manager(nuraft::cs_new(server_id, "test_keeper_server", config, coordination_settings)) + , state_manager(nuraft::cs_new(server_id, "test_keeper_server", config, coordination_settings)) , responses_queue(responses_queue_) { } diff --git a/src/Coordination/NuKeeperServer.h b/src/Coordination/NuKeeperServer.h index a8d269eb9eb..40f3efec76a 100644 --- a/src/Coordination/NuKeeperServer.h +++ b/src/Coordination/NuKeeperServer.h @@ -2,7 +2,7 @@ #include // Y_IGNORE #include -#include +#include #include #include #include @@ -20,7 +20,7 @@ private: nuraft::ptr state_machine; - nuraft::ptr state_manager; + nuraft::ptr state_manager; nuraft::raft_launcher launcher; diff --git a/src/Coordination/InMemoryStateManager.cpp b/src/Coordination/NuKeeperStateManager.cpp similarity index 88% rename from src/Coordination/InMemoryStateManager.cpp rename to src/Coordination/NuKeeperStateManager.cpp index 084ab043d12..14e8badd92f 100644 --- a/src/Coordination/InMemoryStateManager.cpp +++ b/src/Coordination/NuKeeperStateManager.cpp @@ -1,4 +1,4 @@ -#include +#include #include namespace DB @@ -9,7 +9,7 @@ namespace ErrorCodes extern const int RAFT_ERROR; } -InMemoryStateManager::InMemoryStateManager(int server_id_, const std::string & host, int port, const std::string & logs_path) +NuKeeperStateManager::NuKeeperStateManager(int server_id_, const std::string & host, int port, const std::string & logs_path) : my_server_id(server_id_) , my_port(port) , log_store(nuraft::cs_new(logs_path, 5000, true)) @@ -19,7 +19,7 @@ InMemoryStateManager::InMemoryStateManager(int server_id_, const std::string & h cluster_config->get_servers().push_back(peer_config); } -InMemoryStateManager::InMemoryStateManager( +NuKeeperStateManager::NuKeeperStateManager( int my_server_id_, const std::string & config_prefix, const Poco::Util::AbstractConfiguration & config, @@ -63,17 +63,17 @@ InMemoryStateManager::InMemoryStateManager( throw Exception(ErrorCodes::RAFT_ERROR, "At least one of servers should be able to start as leader (without )"); } -void InMemoryStateManager::loadLogStore(size_t start_log_index) +void NuKeeperStateManager::loadLogStore(size_t start_log_index) { log_store->init(start_log_index); } -void 
InMemoryStateManager::flushLogStore() +void NuKeeperStateManager::flushLogStore() { log_store->flush(); } -void InMemoryStateManager::save_config(const nuraft::cluster_config & config) +void NuKeeperStateManager::save_config(const nuraft::cluster_config & config) { // Just keep in memory in this example. // Need to write to disk here, if want to make it durable. @@ -81,7 +81,7 @@ void InMemoryStateManager::save_config(const nuraft::cluster_config & config) cluster_config = nuraft::cluster_config::deserialize(*buf); } -void InMemoryStateManager::save_state(const nuraft::srv_state & state) +void NuKeeperStateManager::save_state(const nuraft::srv_state & state) { // Just keep in memory in this example. // Need to write to disk here, if want to make it durable. diff --git a/src/Coordination/InMemoryStateManager.h b/src/Coordination/NuKeeperStateManager.h similarity index 94% rename from src/Coordination/InMemoryStateManager.h rename to src/Coordination/NuKeeperStateManager.h index c53f00702d4..66229a3b8d1 100644 --- a/src/Coordination/InMemoryStateManager.h +++ b/src/Coordination/NuKeeperStateManager.h @@ -10,16 +10,16 @@ namespace DB { -class InMemoryStateManager : public nuraft::state_mgr +class NuKeeperStateManager : public nuraft::state_mgr { public: - InMemoryStateManager( + NuKeeperStateManager( int server_id_, const std::string & config_prefix, const Poco::Util::AbstractConfiguration & config, const CoordinationSettingsPtr & coordination_settings); - InMemoryStateManager( + NuKeeperStateManager( int server_id_, const std::string & host, int port, diff --git a/src/Coordination/tests/gtest_for_build.cpp b/src/Coordination/tests/gtest_for_build.cpp index 457d0dbc52a..f871f39a906 100644 --- a/src/Coordination/tests/gtest_for_build.cpp +++ b/src/Coordination/tests/gtest_for_build.cpp @@ -9,7 +9,7 @@ #include #include #include -#include +#include #include #include #include @@ -100,7 +100,7 @@ struct SimpliestRaftServer , port(port_) , endpoint(hostname + ":" + std::to_string(port)) , state_machine(nuraft::cs_new()) - , state_manager(nuraft::cs_new(server_id, hostname, port, logs_path)) + , state_manager(nuraft::cs_new(server_id, hostname, port, logs_path)) { state_manager->loadLogStore(1); nuraft::raft_params params; @@ -151,7 +151,7 @@ struct SimpliestRaftServer nuraft::ptr state_machine; // State manager. - nuraft::ptr state_manager; + nuraft::ptr state_manager; // Raft launcher. 
nuraft::raft_launcher launcher; From b84112a6039589c9a5e2399d4b0efc14d4adf1fc Mon Sep 17 00:00:00 2001 From: Maksim Kita Date: Fri, 19 Feb 2021 19:25:50 +0300 Subject: [PATCH 236/381] Function sumMap decimal fix --- src/AggregateFunctions/AggregateFunctionSumMap.h | 7 ++++++- .../queries/0_stateless/00502_sum_map.reference | 2 ++ tests/queries/0_stateless/00502_sum_map.sql | 16 ++++++++++++++++ 3 files changed, 24 insertions(+), 1 deletion(-) diff --git a/src/AggregateFunctions/AggregateFunctionSumMap.h b/src/AggregateFunctions/AggregateFunctionSumMap.h index 3079da36cda..f88a1468732 100644 --- a/src/AggregateFunctions/AggregateFunctionSumMap.h +++ b/src/AggregateFunctions/AggregateFunctionSumMap.h @@ -115,7 +115,12 @@ public: "Values for {} are expected to be Numeric, Float or Decimal, passed type {}", getName(), value_type->getName()}; - result_type = value_type_without_nullable->promoteNumericType(); + WhichDataType value_type_to_check(value_type); + + if (value_type_to_check.isDecimal()) + result_type = value_type_without_nullable; + else + result_type = value_type_without_nullable->promoteNumericType(); } types.emplace_back(std::make_shared(result_type)); diff --git a/tests/queries/0_stateless/00502_sum_map.reference b/tests/queries/0_stateless/00502_sum_map.reference index 0002c43945a..c38fb2ec7d6 100644 --- a/tests/queries/0_stateless/00502_sum_map.reference +++ b/tests/queries/0_stateless/00502_sum_map.reference @@ -22,3 +22,5 @@ ([1.01],[1]) (['a','b'],[1,2]) (['a','ab','abc'],[3,2,1]) +([1,2,3,4,5,6,7,8],[1.00000,2.00000,6.00000,8.00000,10.00000,12.00000,7.00000,8.00000]) +([1,2,3,4,5,6,7,8],[1.00000,2.00000,6.00000,8.00000,10.00000,12.00000,7.00000,8.00000]) diff --git a/tests/queries/0_stateless/00502_sum_map.sql b/tests/queries/0_stateless/00502_sum_map.sql index 021aaf3cd3b..51007a9c78a 100644 --- a/tests/queries/0_stateless/00502_sum_map.sql +++ b/tests/queries/0_stateless/00502_sum_map.sql @@ -38,3 +38,19 @@ select sumMap(val, cnt) from ( SELECT [ CAST(1.01, 'Decimal(10,2)') ] as val, [1 select sumMap(val, cnt) from ( SELECT [ CAST('a', 'FixedString(1)'), CAST('b', 'FixedString(1)' ) ] as val, [1, 2] as cnt ); select sumMap(val, cnt) from ( SELECT [ CAST('abc', 'String'), CAST('ab', 'String'), CAST('a', 'String') ] as val, [1, 2, 3] as cnt ); + +DROP TABLE IF EXISTS sum_map_decimal; + +CREATE TABLE sum_map_decimal( + statusMap Nested( + goal_id UInt16, + revenue Decimal32(5) + ) +) ENGINE = Log; + +INSERT INTO sum_map_decimal VALUES ([1, 2, 3], [1.0, 2.0, 3.0]), ([3, 4, 5], [3.0, 4.0, 5.0]), ([4, 5, 6], [4.0, 5.0, 6.0]), ([6, 7, 8], [6.0, 7.0, 8.0]); + +SELECT sumMap(statusMap.goal_id, statusMap.revenue) FROM sum_map_decimal; +SELECT sumMapWithOverflow(statusMap.goal_id, statusMap.revenue) FROM sum_map_decimal; + +DROP TABLE sum_map_decimal; From 0b5213c80d52595eb66ce8a992381073ac290e9a Mon Sep 17 00:00:00 2001 From: Maksim Kita Date: Fri, 19 Feb 2021 19:49:19 +0300 Subject: [PATCH 237/381] Added comment --- src/AggregateFunctions/AggregateFunctionSumMap.h | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/src/AggregateFunctions/AggregateFunctionSumMap.h b/src/AggregateFunctions/AggregateFunctionSumMap.h index f88a1468732..9c2cdb41844 100644 --- a/src/AggregateFunctions/AggregateFunctionSumMap.h +++ b/src/AggregateFunctions/AggregateFunctionSumMap.h @@ -116,7 +116,9 @@ public: getName(), value_type->getName()}; WhichDataType value_type_to_check(value_type); - + + /// Do not promote decimal because of implementation issues of this function design + /// If 
we decide to make this function more efficient we should promote decimal type during summ if (value_type_to_check.isDecimal()) result_type = value_type_without_nullable; else From fc03c1013cc73094ebb592623c60037acd196410 Mon Sep 17 00:00:00 2001 From: Maksim Kita Date: Fri, 19 Feb 2021 20:42:51 +0300 Subject: [PATCH 238/381] Fixed style check --- src/AggregateFunctions/AggregateFunctionSumMap.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/AggregateFunctions/AggregateFunctionSumMap.h b/src/AggregateFunctions/AggregateFunctionSumMap.h index 9c2cdb41844..f6a473546f9 100644 --- a/src/AggregateFunctions/AggregateFunctionSumMap.h +++ b/src/AggregateFunctions/AggregateFunctionSumMap.h @@ -116,7 +116,7 @@ public: getName(), value_type->getName()}; WhichDataType value_type_to_check(value_type); - + /// Do not promote decimal because of implementation issues of this function design /// If we decide to make this function more efficient we should promote decimal type during summ if (value_type_to_check.isDecimal()) From 252bcccddaed5729e2a02fbd610209e0f7de5543 Mon Sep 17 00:00:00 2001 From: Alexey Milovidov Date: Fri, 19 Feb 2021 21:32:39 +0300 Subject: [PATCH 239/381] Just little better --- src/Interpreters/Aggregator.cpp | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/src/Interpreters/Aggregator.cpp b/src/Interpreters/Aggregator.cpp index 8040091256c..abff6f21acf 100644 --- a/src/Interpreters/Aggregator.cpp +++ b/src/Interpreters/Aggregator.cpp @@ -558,7 +558,7 @@ void NO_INLINE Aggregator::executeImplBatch( /// Generic case. - PODArray places(rows); + std::unique_ptr places(new AggregateDataPtr[rows]); /// For all rows. for (size_t i = 0; i < rows; ++i) @@ -589,9 +589,9 @@ void NO_INLINE Aggregator::executeImplBatch( for (AggregateFunctionInstruction * inst = aggregate_instructions; inst->that; ++inst) { if (inst->offsets) - inst->batch_that->addBatchArray(rows, places.data(), inst->state_offset, inst->batch_arguments, inst->offsets, aggregates_pool); + inst->batch_that->addBatchArray(rows, places.get(), inst->state_offset, inst->batch_arguments, inst->offsets, aggregates_pool); else - inst->batch_that->addBatch(rows, places.data(), inst->state_offset, inst->batch_arguments, aggregates_pool); + inst->batch_that->addBatch(rows, places.get(), inst->state_offset, inst->batch_arguments, aggregates_pool); } } From 66e775ef8811f1d1bba30a4369872b8ae04e0c54 Mon Sep 17 00:00:00 2001 From: Denis Zhuravlev Date: Fri, 19 Feb 2021 14:53:34 -0400 Subject: [PATCH 240/381] test for decimal ( p , s) in dictionaries --- .../01721_dictionary_decimal_p_s.reference | 10 +++ .../01721_dictionary_decimal_p_s.sql | 78 +++++++++++++++++++ 2 files changed, 88 insertions(+) create mode 100644 tests/queries/0_stateless/01721_dictionary_decimal_p_s.reference create mode 100644 tests/queries/0_stateless/01721_dictionary_decimal_p_s.sql diff --git a/tests/queries/0_stateless/01721_dictionary_decimal_p_s.reference b/tests/queries/0_stateless/01721_dictionary_decimal_p_s.reference new file mode 100644 index 00000000000..066b4bd1d97 --- /dev/null +++ b/tests/queries/0_stateless/01721_dictionary_decimal_p_s.reference @@ -0,0 +1,10 @@ +-------- 42 -------- +42 14.0000 14.00000000 14.00000000 14.0000000000000000618637523926765281280 +42 14.0000 14.00000000 14.00000000 +14.0000 14.00000000 14.00000000 +-------- 4999 -------- +4999 1666.3333 1666.33333333 1666.33333333 1633.3553612205046244471093725648757194800 +4999 1666.3333 1666.33333333 1666.33333333 +1666.3333 1666.33333333 
1666.33333333 +-------- 5000 -------- +0.1100 0.11000000 0.11000000 diff --git a/tests/queries/0_stateless/01721_dictionary_decimal_p_s.sql b/tests/queries/0_stateless/01721_dictionary_decimal_p_s.sql new file mode 100644 index 00000000000..0451d455009 --- /dev/null +++ b/tests/queries/0_stateless/01721_dictionary_decimal_p_s.sql @@ -0,0 +1,78 @@ +set allow_experimental_bigint_types=1; +drop database if exists db_01721; +drop table if exists db_01721.table_decimal_dict; +drop dictionary if exists db_01721.decimal_dict; + + +create database db_01721; + +CREATE TABLE db_01721.table_decimal_dict( +KeyField UInt64, +Decimal32_ Decimal(5,4), +Decimal64_ Decimal(18,8), +Decimal128_ Decimal(25,8), +Decimal256_ Decimal(76,37) +) +ENGINE = Memory; + +insert into db_01721.table_decimal_dict +select number, + number / 3, + number / 3, + number / 3, + number / 3 +from numbers(5000); + + +CREATE DICTIONARY IF NOT EXISTS db_01721.decimal_dict ( + KeyField UInt64 DEFAULT 9999999, + Decimal32_ Decimal(5,4) DEFAULT 0.11, + Decimal64_ Decimal(18,8) DEFAULT 0.11, + Decimal128_ Decimal(25,8) DEFAULT 0.11 +-- ,Decimal256_ Decimal256(37) DEFAULT 0.11 +) +PRIMARY KEY KeyField +SOURCE(CLICKHOUSE(HOST 'localhost' PORT tcpPort() USER 'default' TABLE 'table_decimal_dict' DB 'db_01721')) +LIFETIME(0) LAYOUT(SPARSE_HASHED); + +select '-------- 42 --------'; + +SELECT * from db_01721.table_decimal_dict where KeyField = 42; + +SELECT * from db_01721.decimal_dict where KeyField = 42; + +SELECT dictGet('db_01721.decimal_dict', 'Decimal32_', toUInt64(42)), + dictGet('db_01721.decimal_dict', 'Decimal64_', toUInt64(42)), + dictGet('db_01721.decimal_dict', 'Decimal128_', toUInt64(42)) + -- ,dictGet('db_01721.decimal_dict', 'Decimal256_', toUInt64(42)) +; + + +select '-------- 4999 --------'; + +SELECT * from db_01721.table_decimal_dict where KeyField = 4999; + +SELECT * from db_01721.decimal_dict where KeyField = 4999; + +SELECT dictGet('db_01721.decimal_dict', 'Decimal32_', toUInt64(4999)), + dictGet('db_01721.decimal_dict', 'Decimal64_', toUInt64(4999)), + dictGet('db_01721.decimal_dict', 'Decimal128_', toUInt64(4999)) + --,dictGet('db_01721.decimal_dict', 'Decimal256_', toUInt64(4999)) +; + +select '-------- 5000 --------'; + +SELECT * from db_01721.table_decimal_dict where KeyField = 5000; + +SELECT * from db_01721.decimal_dict where KeyField = 5000; + +SELECT dictGet('db_01721.decimal_dict', 'Decimal32_', toUInt64(5000)), + dictGet('db_01721.decimal_dict', 'Decimal64_', toUInt64(5000)), + dictGet('db_01721.decimal_dict', 'Decimal128_', toUInt64(5000)) + --,dictGet('db_01721.decimal_dict', 'Decimal256_', toUInt64(5000)) +; + +drop table if exists table_decimal_dict; +drop dictionary if exists cache_dict; +drop database if exists db_01721; + From fba1c7fcc165b1d84907a4a1ee37c809307cbf32 Mon Sep 17 00:00:00 2001 From: Azat Khuzhin Date: Fri, 19 Feb 2021 21:48:58 +0300 Subject: [PATCH 241/381] Fix uncaught exception when HTTP client goes away Even after #20464 it was still possible, for example [1]. 
2021.02.19 11:40:21.886191 [ 68373 ] {} DynamicQueryHandler: Request URI: /?database=test_ds2d6y&log_comment=/usr/share/clickhouse-test/queries/0_stateless/01302_aggregate_state_exception_memory_leak.sh&enable_http_compression=1&http_zlib_compression_level=1 2021.02.19 11:41:35.289940 [ 365 ] {} BaseDaemon: (version 21.3.1.6058, build id: 8D46D65205E2C8B7FE408A0B4EC76CA0483F9E92) (from thread 68373) Terminate called for uncaught exception: Code: 24, e.displayText() = DB::Exception: Cannot write to ostream at offset 262568, Stack trace (when copying this message, always include the lines below): 0. ./obj-x86_64-linux-gnu/../contrib/libcxx/include/exception:0: Poco::Exception::Exception(std::__1::basic_string, std::__1::allocator > const&, int) @ 0x15b3c7db in /usr/bin/clickhouse 1. ./obj-x86_64-linux-gnu/../src/Common/Exception.cpp:56: DB::Exception::Exception(std::__1::basic_string, std::__1::allocator > const&, int, bool) @ 0x8aba66e in /usr/bin/clickhouse 2. ./obj-x86_64-linux-gnu/../src/IO/WriteBufferFromOStream.cpp:0: DB::WriteBufferFromOStream::nextImpl() @ 0x8b8c105 in /usr/bin/clickhouse 3. ./obj-x86_64-linux-gnu/../src/IO/BufferBase.h:39: DB::WriteBufferFromOStream::~WriteBufferFromOStream() @ 0x8b8c537 in /usr/bin/clickhouse 4. ./obj-x86_64-linux-gnu/../src/IO/WriteBufferFromOStream.cpp:44: DB::Write [1]: https://clickhouse-test-reports.s3.yandex.net/16481/5d150cce4778dd14f58dcff67435bdec1efa155b/stress_test_(thread).html#fail1 And according to this partial stacktrace it seems that the dtor of WriteBufferFromOStream was called from WriteBufferFromHTTPServerResponse, since the class name starts from DB::Write* The problem is that if first time WriteBufferFromOStream::next() fails, it will reset position to make next write no-op, however WriteBufferFromHTTPServerResponse::next() will set position to available buffer back, and next() will throw again, but this time it can be from dtor. --- .../HTTP/WriteBufferFromHTTPServerResponse.cpp | 12 +++++++++--- 1 file changed, 9 insertions(+), 3 deletions(-) diff --git a/src/Server/HTTP/WriteBufferFromHTTPServerResponse.cpp b/src/Server/HTTP/WriteBufferFromHTTPServerResponse.cpp index 86133fc2ffe..81f8cc30468 100644 --- a/src/Server/HTTP/WriteBufferFromHTTPServerResponse.cpp +++ b/src/Server/HTTP/WriteBufferFromHTTPServerResponse.cpp @@ -168,12 +168,18 @@ void WriteBufferFromHTTPServerResponse::onProgress(const Progress & progress) void WriteBufferFromHTTPServerResponse::finalize() { - next(); - if (out) + try { - out->next(); + next(); out.reset(); } + catch (...) 
+ { + /// Avoid calling WriteBufferFromOStream::next() from dtor + /// (via WriteBufferFromHTTPServerResponse::next()) + out.reset(); + throw; + } if (!offset()) { From 0f77b6fd9585303162c5386a5b660d5448470d26 Mon Sep 17 00:00:00 2001 From: Alexey Milovidov Date: Fri, 19 Feb 2021 22:01:45 +0300 Subject: [PATCH 242/381] Even more better --- src/Interpreters/AggregationCommon.h | 8 ++++++-- 1 file changed, 6 insertions(+), 2 deletions(-) diff --git a/src/Interpreters/AggregationCommon.h b/src/Interpreters/AggregationCommon.h index aafec9a7929..e896b0e14df 100644 --- a/src/Interpreters/AggregationCommon.h +++ b/src/Interpreters/AggregationCommon.h @@ -271,9 +271,13 @@ static T inline packFixedShuffle( size_t idx, const uint8_t * __restrict masks) { - __m128i res{}; + assert(num_srcs > 0); - for (size_t i = 0; i < num_srcs; ++i) + __m128i res = _mm_shuffle_epi8( + _mm_loadu_si128(reinterpret_cast(srcs[0] + elem_sizes[0] * idx)), + _mm_loadu_si128(reinterpret_cast(masks))); + + for (size_t i = 1; i < num_srcs; ++i) { res = _mm_xor_si128(res, _mm_shuffle_epi8( From 7ee72dfd0c46f0884c446003dfd3676644f6b19e Mon Sep 17 00:00:00 2001 From: alesapin Date: Fri, 19 Feb 2021 22:24:20 +0300 Subject: [PATCH 243/381] Missed tests --- .../configs/use_test_keeper.xml | 8 ++ .../__init__.py | 1 + .../configs/enable_test_keeper1.xml | 39 ++++++++ .../configs/enable_test_keeper2.xml | 39 ++++++++ .../configs/enable_test_keeper3.xml | 39 ++++++++ .../configs/log_conf.xml | 12 +++ .../configs/use_test_keeper.xml | 16 +++ .../test.py | 98 +++++++++++++++++++ 8 files changed, 252 insertions(+) create mode 100644 tests/integration/test_testkeeper_back_to_back/configs/use_test_keeper.xml create mode 100644 tests/integration/test_testkeeper_persistent_log_multinode/__init__.py create mode 100644 tests/integration/test_testkeeper_persistent_log_multinode/configs/enable_test_keeper1.xml create mode 100644 tests/integration/test_testkeeper_persistent_log_multinode/configs/enable_test_keeper2.xml create mode 100644 tests/integration/test_testkeeper_persistent_log_multinode/configs/enable_test_keeper3.xml create mode 100644 tests/integration/test_testkeeper_persistent_log_multinode/configs/log_conf.xml create mode 100644 tests/integration/test_testkeeper_persistent_log_multinode/configs/use_test_keeper.xml create mode 100644 tests/integration/test_testkeeper_persistent_log_multinode/test.py diff --git a/tests/integration/test_testkeeper_back_to_back/configs/use_test_keeper.xml b/tests/integration/test_testkeeper_back_to_back/configs/use_test_keeper.xml new file mode 100644 index 00000000000..12dc7fd9447 --- /dev/null +++ b/tests/integration/test_testkeeper_back_to_back/configs/use_test_keeper.xml @@ -0,0 +1,8 @@ + + + + node1 + 9181 + + + diff --git a/tests/integration/test_testkeeper_persistent_log_multinode/__init__.py b/tests/integration/test_testkeeper_persistent_log_multinode/__init__.py new file mode 100644 index 00000000000..e5a0d9b4834 --- /dev/null +++ b/tests/integration/test_testkeeper_persistent_log_multinode/__init__.py @@ -0,0 +1 @@ +#!/usr/bin/env python3 diff --git a/tests/integration/test_testkeeper_persistent_log_multinode/configs/enable_test_keeper1.xml b/tests/integration/test_testkeeper_persistent_log_multinode/configs/enable_test_keeper1.xml new file mode 100644 index 00000000000..a47e5eae09a --- /dev/null +++ b/tests/integration/test_testkeeper_persistent_log_multinode/configs/enable_test_keeper1.xml @@ -0,0 +1,39 @@ + + + 9181 + 1 + /var/lib/clickhouse/coordination/log + + + 5000 + 10000 + trace + + 
+ + + 1 + node1 + 44444 + true + 3 + + + 2 + node2 + 44444 + true + true + 2 + + + 3 + node3 + 44444 + true + true + 1 + + + + diff --git a/tests/integration/test_testkeeper_persistent_log_multinode/configs/enable_test_keeper2.xml b/tests/integration/test_testkeeper_persistent_log_multinode/configs/enable_test_keeper2.xml new file mode 100644 index 00000000000..18681f0dc95 --- /dev/null +++ b/tests/integration/test_testkeeper_persistent_log_multinode/configs/enable_test_keeper2.xml @@ -0,0 +1,39 @@ + + + 9181 + 2 + /var/lib/clickhouse/coordination/log + + + 5000 + 10000 + trace + + + + + 1 + node1 + 44444 + true + 3 + + + 2 + node2 + 44444 + true + true + 2 + + + 3 + node3 + 44444 + true + true + 1 + + + + diff --git a/tests/integration/test_testkeeper_persistent_log_multinode/configs/enable_test_keeper3.xml b/tests/integration/test_testkeeper_persistent_log_multinode/configs/enable_test_keeper3.xml new file mode 100644 index 00000000000..184d3724219 --- /dev/null +++ b/tests/integration/test_testkeeper_persistent_log_multinode/configs/enable_test_keeper3.xml @@ -0,0 +1,39 @@ + + + 9181 + 3 + /var/lib/clickhouse/coordination/log + + + 5000 + 10000 + trace + + + + + 1 + node1 + 44444 + true + 3 + + + 2 + node2 + 44444 + true + true + 2 + + + 3 + node3 + 44444 + true + true + 1 + + + + diff --git a/tests/integration/test_testkeeper_persistent_log_multinode/configs/log_conf.xml b/tests/integration/test_testkeeper_persistent_log_multinode/configs/log_conf.xml new file mode 100644 index 00000000000..318a6bca95d --- /dev/null +++ b/tests/integration/test_testkeeper_persistent_log_multinode/configs/log_conf.xml @@ -0,0 +1,12 @@ + + 3 + + trace + /var/log/clickhouse-server/log.log + /var/log/clickhouse-server/log.err.log + 1000M + 10 + /var/log/clickhouse-server/stderr.log + /var/log/clickhouse-server/stdout.log + + diff --git a/tests/integration/test_testkeeper_persistent_log_multinode/configs/use_test_keeper.xml b/tests/integration/test_testkeeper_persistent_log_multinode/configs/use_test_keeper.xml new file mode 100644 index 00000000000..b6139005d2f --- /dev/null +++ b/tests/integration/test_testkeeper_persistent_log_multinode/configs/use_test_keeper.xml @@ -0,0 +1,16 @@ + + + + node1 + 9181 + + + node2 + 9181 + + + node3 + 9181 + + + diff --git a/tests/integration/test_testkeeper_persistent_log_multinode/test.py b/tests/integration/test_testkeeper_persistent_log_multinode/test.py new file mode 100644 index 00000000000..cb9cf5a59d1 --- /dev/null +++ b/tests/integration/test_testkeeper_persistent_log_multinode/test.py @@ -0,0 +1,98 @@ +#!/usr/bin/env python3 +import pytest +from helpers.cluster import ClickHouseCluster +import random +import string +import os +import time + +cluster = ClickHouseCluster(__file__) +node1 = cluster.add_instance('node1', main_configs=['configs/enable_test_keeper1.xml', 'configs/log_conf.xml', 'configs/use_test_keeper.xml'], stay_alive=True) +node2 = cluster.add_instance('node2', main_configs=['configs/enable_test_keeper2.xml', 'configs/log_conf.xml', 'configs/use_test_keeper.xml'], stay_alive=True) +node3 = cluster.add_instance('node3', main_configs=['configs/enable_test_keeper3.xml', 'configs/log_conf.xml', 'configs/use_test_keeper.xml'], stay_alive=True) + +from kazoo.client import KazooClient, KazooState + +@pytest.fixture(scope="module") +def started_cluster(): + try: + cluster.start() + + yield cluster + + finally: + cluster.shutdown() + +def get_fake_zk(nodename, timeout=30.0): + _fake_zk_instance = KazooClient(hosts=cluster.get_instance_ip(nodename) + 
":9181", timeout=timeout) + def reset_listener(state): + nonlocal _fake_zk_instance + print("Fake zk callback called for state", state) + if state != KazooState.CONNECTED: + _fake_zk_instance._reset() + + _fake_zk_instance.add_listener(reset_listener) + _fake_zk_instance.start() + return _fake_zk_instance + +def stop_zk(zk): + try: + if zk: + zk.stop() + zk.close() + except: + pass + +def test_restart_multinode(started_cluster): + try: + node1_zk = node2_zk = node3_zk = None + + node1_zk = get_fake_zk("node1") + node2_zk = get_fake_zk("node2") + node3_zk = get_fake_zk("node3") + + for i in range(100): + node1_zk.create("/test_read_write_multinode_node" + str(i), ("somedata" + str(i)).encode()) + + for i in range(100): + if i % 10 == 0: + node1_zk.delete("/test_read_write_multinode_node" + str(i)) + + node2_zk.sync("/test_read_write_multinode_node0") + node3_zk.sync("/test_read_write_multinode_node0") + + for i in range(100): + if i % 10 != 0: + assert node2_zk.get("/test_read_write_multinode_node" + str(i))[0] == ("somedata" + str(i)).encode() + assert node3_zk.get("/test_read_write_multinode_node" + str(i))[0] == ("somedata" + str(i)).encode() + else: + assert node2_zk.exists("/test_read_write_multinode_node" + str(i)) is None + assert node3_zk.exists("/test_read_write_multinode_node" + str(i)) is None + + finally: + for zk in [node1_zk, node2_zk, node3_zk]: + stop_zk(zk) + + node1.restart_clickhouse(kill=True) + node2.restart_clickhouse(kill=True) + node3.restart_clickhouse(kill=True) + for i in range(100): + try: + node1_zk = get_fake_zk("node1") + node2_zk = get_fake_zk("node2") + node3_zk = get_fake_zk("node3") + for i in range(100): + if i % 10 != 0: + assert node1_zk.get("/test_read_write_multinode_node" + str(i))[0] == ("somedata" + str(i)).encode() + assert node2_zk.get("/test_read_write_multinode_node" + str(i))[0] == ("somedata" + str(i)).encode() + assert node3_zk.get("/test_read_write_multinode_node" + str(i))[0] == ("somedata" + str(i)).encode() + else: + assert node1_zk.exists("/test_read_write_multinode_node" + str(i)) is None + assert node2_zk.exists("/test_read_write_multinode_node" + str(i)) is None + assert node3_zk.exists("/test_read_write_multinode_node" + str(i)) is None + break + except Exception as ex: + print("Got exception as ex", ex) + finally: + for zk in [node1_zk, node2_zk, node3_zk]: + stop_zk(zk) From 7474a7e3ca139f1a4e88e83af011b304ebdcaf3c Mon Sep 17 00:00:00 2001 From: Azat Khuzhin Date: Fri, 19 Feb 2021 22:42:40 +0300 Subject: [PATCH 244/381] Increase buffer for uncaught exception / std::terminate Use PIPE_BUF over some magic number 1024 in terminate_handler, since according to pipe(7): PIPE_BUF POSIX.1 says that write(2)s of less than PIPE_BUF bytes must be atomic Also note that 1024, is too small, especially for C++ stacktraces (and especially for debug builds, that contains lots of non-inlined helpers for various ptrs). 
--- base/daemon/BaseDaemon.cpp | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/base/daemon/BaseDaemon.cpp b/base/daemon/BaseDaemon.cpp index db7019d3572..248ffdd4d10 100644 --- a/base/daemon/BaseDaemon.cpp +++ b/base/daemon/BaseDaemon.cpp @@ -416,7 +416,7 @@ static void sanitizerDeathCallback() else log_message = "Terminate called without an active exception"; - static const size_t buf_size = 1024; + static const size_t buf_size = PIPE_BUF; if (log_message.size() > buf_size - 16) log_message.resize(buf_size - 16); From f5893778cbf6544cb1a6b2d92d21248674bc864a Mon Sep 17 00:00:00 2001 From: Azat Khuzhin Date: Sat, 20 Feb 2021 00:01:13 +0300 Subject: [PATCH 245/381] Do not use view() in 01731_async_task_queue_wait to fix ANTLR parser --- tests/queries/0_stateless/01731_async_task_queue_wait.sh | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/tests/queries/0_stateless/01731_async_task_queue_wait.sh b/tests/queries/0_stateless/01731_async_task_queue_wait.sh index 89d8b63d745..2f77628fc6d 100755 --- a/tests/queries/0_stateless/01731_async_task_queue_wait.sh +++ b/tests/queries/0_stateless/01731_async_task_queue_wait.sh @@ -7,4 +7,4 @@ CURDIR=$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd) # regression for 'Empty task was returned from async task queue' during query # cancellation with async_socket_for_remote=1 (that ignores # max_distributed_connections) -$(timeout --signal=SIGINT 1 ${CLICKHOUSE_CLIENT} --max_distributed_connections=1 --max_block_size=2 --interactive_delay=900000 -q "select x from remote('127.{2,3}', view(select number + sleep(0.3) as x from numbers(16))) settings max_block_size = 2") 2>&1 | grep "Empty task was returned from async task queue" || true +$(timeout --signal=SIGINT 1 ${CLICKHOUSE_CLIENT} --max_distributed_connections=1 --max_block_size=2 --interactive_delay=900000 -q "select number + sleep(0.3) as x from remote('127.{2,3}', system.numbers) settings max_block_size = 2") 2>&1 | grep "Empty task was returned from async task queue" || true From d0fe8900f980167530a0e1be56dd0cd219c6f08a Mon Sep 17 00:00:00 2001 From: Azat Khuzhin Date: Sat, 20 Feb 2021 00:04:28 +0300 Subject: [PATCH 246/381] Fix bash syntax in 01731_async_task_queue_wait --- tests/queries/0_stateless/01731_async_task_queue_wait.sh | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/tests/queries/0_stateless/01731_async_task_queue_wait.sh b/tests/queries/0_stateless/01731_async_task_queue_wait.sh index 2f77628fc6d..e0babf3c6ff 100755 --- a/tests/queries/0_stateless/01731_async_task_queue_wait.sh +++ b/tests/queries/0_stateless/01731_async_task_queue_wait.sh @@ -7,4 +7,4 @@ CURDIR=$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd) # regression for 'Empty task was returned from async task queue' during query # cancellation with async_socket_for_remote=1 (that ignores # max_distributed_connections) -$(timeout --signal=SIGINT 1 ${CLICKHOUSE_CLIENT} --max_distributed_connections=1 --max_block_size=2 --interactive_delay=900000 -q "select number + sleep(0.3) as x from remote('127.{2,3}', system.numbers) settings max_block_size = 2") 2>&1 | grep "Empty task was returned from async task queue" || true +timeout --signal=SIGINT 1 ${CLICKHOUSE_CLIENT} --max_distributed_connections=1 --max_block_size=2 --interactive_delay=900000 -q "select number + sleep(0.3) as x from remote('127.{2,3}', system.numbers) settings max_block_size = 2" 2>&1 | grep "Empty task was returned from async task queue" || true From 2f7d0ba92677f595b1d760af2a826cc6fa181802 Mon Sep 17 00:00:00 2001 From: 
M0r64n Date: Sat, 20 Feb 2021 03:27:23 +0400 Subject: [PATCH 247/381] Replace direct truncate with O_TRUNC flag --- src/Storages/StorageFile.cpp | 16 ++++++++++------ .../01721_engine_file_truncate_on_insert.sql | 4 ++-- 2 files changed, 12 insertions(+), 8 deletions(-) diff --git a/src/Storages/StorageFile.cpp b/src/Storages/StorageFile.cpp index 856d03ea2ce..5524569e1f0 100644 --- a/src/Storages/StorageFile.cpp +++ b/src/Storages/StorageFile.cpp @@ -475,7 +475,8 @@ public: std::unique_lock && lock_, const CompressionMethod compression_method, const Context & context, - const std::optional & format_settings) + const std::optional & format_settings, + int & flags) : storage(storage_) , metadata_snapshot(metadata_snapshot_) , lock(std::move(lock_)) @@ -491,13 +492,14 @@ public: * INSERT data; SELECT *; last SELECT returns only insert_data */ storage.table_fd_was_used = true; - naked_buffer = std::make_unique(storage.table_fd); + naked_buffer = std::make_unique(storage.table_fd, DBMS_DEFAULT_BUFFER_SIZE); } else { if (storage.paths.size() != 1) throw Exception("Table '" + storage.getStorageID().getNameForLogs() + "' is in readonly mode because of globs in filepath", ErrorCodes::DATABASE_ACCESS_DENIED); - naked_buffer = std::make_unique(storage.paths[0], DBMS_DEFAULT_BUFFER_SIZE, O_WRONLY | O_APPEND | O_CREAT); + flags |= O_WRONLY | O_APPEND | O_CREAT; + naked_buffer = std::make_unique(storage.paths[0], DBMS_DEFAULT_BUFFER_SIZE, flags); } /// In case of CSVWithNames we have already written prefix. @@ -552,10 +554,11 @@ BlockOutputStreamPtr StorageFile::write( if (format_name == "Distributed") throw Exception("Method write is not implemented for Distributed format", ErrorCodes::NOT_IMPLEMENTED); + int flags = 0; + std::string path; if (context.getSettingsRef().engine_file_truncate_on_insert) - if (0 != ::truncate(paths[0].c_str(), 0)) - throwFromErrnoWithPath("Cannot truncate file " + paths[0], paths[0], ErrorCodes::CANNOT_TRUNCATE_FILE); + flags |= O_TRUNC; if (!paths.empty()) { @@ -569,7 +572,8 @@ BlockOutputStreamPtr StorageFile::write( std::unique_lock{rwlock, getLockTimeout(context)}, chooseCompressionMethod(path, compression_method), context, - format_settings); + format_settings, + flags); } bool StorageFile::storesDataOnDisk() const diff --git a/tests/queries/0_stateless/01721_engine_file_truncate_on_insert.sql b/tests/queries/0_stateless/01721_engine_file_truncate_on_insert.sql index 42d935cc0dd..079b2546a20 100644 --- a/tests/queries/0_stateless/01721_engine_file_truncate_on_insert.sql +++ b/tests/queries/0_stateless/01721_engine_file_truncate_on_insert.sql @@ -1,7 +1,7 @@ DROP TABLE IF EXISTS test; -INSERT INTO TABLE FUNCTION file('01718_file/test/data.TSV', 'TSV', 'id UInt32') VALUES (1); -ATTACH TABLE test FROM '01718_file/test' (id UInt8) ENGINE=File(TSV); +INSERT INTO TABLE FUNCTION file('01721_file/test/data.TSV', 'TSV', 'id UInt32') VALUES (1); +ATTACH TABLE test FROM '01721_file/test' (id UInt8) ENGINE=File(TSV); INSERT INTO test VALUES (2), (3); INSERT INTO test VALUES (4); From 2a36d6cb55af14b0dcf87c1b806afbf5c7dec8be Mon Sep 17 00:00:00 2001 From: Alexander Tokmakov Date: Sat, 20 Feb 2021 02:41:58 +0300 Subject: [PATCH 248/381] review suggestions --- src/Common/ZooKeeper/ZooKeeper.h | 2 +- src/Databases/DatabaseAtomic.cpp | 11 ++- src/Databases/DatabaseFactory.cpp | 15 +++- src/Databases/DatabaseReplicated.cpp | 79 ++++++++++---------- src/Databases/DatabaseReplicated.h | 9 ++- src/Databases/DatabaseReplicatedSettings.cpp | 23 ++++++ 
src/Databases/DatabaseReplicatedSettings.h | 26 +++++++ src/Databases/DatabaseReplicatedWorker.cpp | 13 ++-- src/Databases/DatabaseReplicatedWorker.h | 12 +++ src/Databases/DatabaseWithDictionaries.cpp | 4 +- src/Databases/ya.make | 1 + src/Interpreters/Context.cpp | 4 +- src/Interpreters/Context.h | 12 +-- src/Interpreters/DDLTask.cpp | 26 +++---- src/Interpreters/DDLTask.h | 41 ++++++++-- src/Interpreters/DDLWorker.cpp | 47 ++++++------ src/Interpreters/DDLWorker.h | 4 +- src/Interpreters/InterpreterAlterQuery.cpp | 2 +- src/Interpreters/InterpreterCreateQuery.cpp | 4 +- src/Interpreters/InterpreterDropQuery.cpp | 4 +- src/Interpreters/InterpreterRenameQuery.cpp | 2 +- src/Storages/StorageMaterializedView.cpp | 4 +- src/Storages/StorageReplicatedMergeTree.cpp | 10 +-- tests/queries/skip_list.json | 1 + 24 files changed, 232 insertions(+), 124 deletions(-) create mode 100644 src/Databases/DatabaseReplicatedSettings.cpp create mode 100644 src/Databases/DatabaseReplicatedSettings.h diff --git a/src/Common/ZooKeeper/ZooKeeper.h b/src/Common/ZooKeeper/ZooKeeper.h index fbe1bede91a..5b37e4d6024 100644 --- a/src/Common/ZooKeeper/ZooKeeper.h +++ b/src/Common/ZooKeeper/ZooKeeper.h @@ -315,7 +315,7 @@ public: return std::make_shared(path, zookeeper, false, false, ""); } - void reset() + void setAlreadyRemoved() { need_remove = false; } diff --git a/src/Databases/DatabaseAtomic.cpp b/src/Databases/DatabaseAtomic.cpp index 2065e036863..71e0effb2d2 100644 --- a/src/Databases/DatabaseAtomic.cpp +++ b/src/Databases/DatabaseAtomic.cpp @@ -115,11 +115,14 @@ void DatabaseAtomic::dropTable(const Context & context, const String & table_nam std::unique_lock lock(mutex); table = getTableUnlocked(table_name, lock); table_metadata_path_drop = DatabaseCatalog::instance().getPathForDroppedMetadata(table->getStorageID()); - auto txn = context.getMetadataTransaction(); + auto txn = context.getZooKeeperMetadataTransaction(); if (txn && !context.isInternalSubquery()) txn->commit(); /// Commit point (a sort of) for Replicated database /// NOTE: replica will be lost if server crashes before the following rename + /// We apply changes in ZooKeeper before applying changes in local metadata file + /// to reduce probability of failures between these operations + /// (it's more likely to lost connection, than to fail before applying local changes). 
/// TODO better detection and recovery Poco::File(table_metadata_path).renameTo(table_metadata_path_drop); /// Mark table as dropped @@ -241,7 +244,7 @@ void DatabaseAtomic::renameTable(const Context & context, const String & table_n } /// Table renaming actually begins here - auto txn = context.getMetadataTransaction(); + auto txn = context.getZooKeeperMetadataTransaction(); if (txn && !context.isInternalSubquery()) txn->commit(); /// Commit point (a sort of) for Replicated database @@ -302,7 +305,7 @@ void DatabaseAtomic::commitCreateTable(const ASTCreateQuery & query, const Stora DatabaseCatalog::instance().addUUIDMapping(query.uuid); locked_uuid = true; - auto txn = query_context.getMetadataTransaction(); + auto txn = query_context.getZooKeeperMetadataTransaction(); if (txn && !query_context.isInternalSubquery()) txn->commit(); /// Commit point (a sort of) for Replicated database @@ -337,7 +340,7 @@ void DatabaseAtomic::commitAlterTable(const StorageID & table_id, const String & if (table_id.uuid != actual_table_id.uuid) throw Exception("Cannot alter table because it was renamed", ErrorCodes::CANNOT_ASSIGN_ALTER); - auto txn = query_context.getMetadataTransaction(); + auto txn = query_context.getZooKeeperMetadataTransaction(); if (txn && !query_context.isInternalSubquery()) txn->commit(); /// Commit point (a sort of) for Replicated database diff --git a/src/Databases/DatabaseFactory.cpp b/src/Databases/DatabaseFactory.cpp index ca2b9bb083e..cd0143556c9 100644 --- a/src/Databases/DatabaseFactory.cpp +++ b/src/Databases/DatabaseFactory.cpp @@ -103,8 +103,11 @@ DatabasePtr DatabaseFactory::getImpl(const ASTCreateQuery & create, const String if (engine_define->engine->arguments && !engine_may_have_arguments) throw Exception("Database engine " + engine_name + " cannot have arguments", ErrorCodes::BAD_ARGUMENTS); - if (engine_define->engine->parameters || engine_define->partition_by || engine_define->primary_key || engine_define->order_by || - engine_define->sample_by || (!endsWith(engine_name, "MySQL") && engine_define->settings)) + bool has_unexpected_element = engine_define->engine->parameters || engine_define->partition_by || + engine_define->primary_key || engine_define->order_by || + engine_define->sample_by; + bool may_have_settings = endsWith(engine_name, "MySQL") || engine_name == "Replicated"; + if (has_unexpected_element || (!may_have_settings && engine_define->settings)) throw Exception("Database engine " + engine_name + " cannot have parameters, primary_key, order_by, sample_by, settings", ErrorCodes::UNKNOWN_ELEMENT_IN_AST); @@ -205,7 +208,13 @@ DatabasePtr DatabaseFactory::getImpl(const ASTCreateQuery & create, const String shard_name = context.getMacros()->expand(shard_name); replica_name = context.getMacros()->expand(replica_name); - return std::make_shared(database_name, metadata_path, uuid, zookeeper_path, shard_name, replica_name, context); + DatabaseReplicatedSettings database_replicated_settings{}; + if (engine_define->settings) + database_replicated_settings.loadFromQuery(*engine_define); + + return std::make_shared(database_name, metadata_path, uuid, + zookeeper_path, shard_name, replica_name, + std::move(database_replicated_settings), context); } #if USE_LIBPQXX diff --git a/src/Databases/DatabaseReplicated.cpp b/src/Databases/DatabaseReplicated.cpp index 441880ae616..12cff3407d3 100644 --- a/src/Databases/DatabaseReplicated.cpp +++ b/src/Databases/DatabaseReplicated.cpp @@ -63,11 +63,13 @@ DatabaseReplicated::DatabaseReplicated( const String & zookeeper_path_, const 
String & shard_name_, const String & replica_name_, + DatabaseReplicatedSettings db_settings_, const Context & context_) : DatabaseAtomic(name_, metadata_path_, uuid, "DatabaseReplicated (" + name_ + ")", context_) , zookeeper_path(zookeeper_path_) , shard_name(shard_name_) , replica_name(replica_name_) + , db_settings(std::move(db_settings_)) { if (zookeeper_path.empty() || shard_name.empty() || replica_name.empty()) throw Exception("ZooKeeper path, shard and replica names must be non-empty", ErrorCodes::BAD_ARGUMENTS); @@ -141,7 +143,8 @@ ClusterPtr DatabaseReplicated::getCluster() const break; } if (!success) - throw Exception(ErrorCodes::ALL_CONNECTION_TRIES_FAILED, "Cannot get consistent cluster snapshot"); + throw Exception(ErrorCodes::ALL_CONNECTION_TRIES_FAILED, "Cannot get consistent cluster snapshot," + "because replicas are created or removed concurrently"); assert(!hosts.empty()); assert(hosts.size() == host_ids.size()); @@ -172,7 +175,7 @@ ClusterPtr DatabaseReplicated::getCluster() const return std::make_shared(global_context.getSettingsRef(), shards, username, password, global_context.getTCPPort(), false); } -void DatabaseReplicated::tryConnectToZooKeeper(bool force_attach) +void DatabaseReplicated::tryConnectToZooKeeperAndInitDatabase(bool force_attach) { try { @@ -228,6 +231,9 @@ bool DatabaseReplicated::createDatabaseNodesInZooKeeper(const zkutil::ZooKeeperP ops.emplace_back(zkutil::makeCreateRequest(zookeeper_path + "/log", "", zkutil::CreateMode::Persistent)); ops.emplace_back(zkutil::makeCreateRequest(zookeeper_path + "/replicas", "", zkutil::CreateMode::Persistent)); ops.emplace_back(zkutil::makeCreateRequest(zookeeper_path + "/counter", "", zkutil::CreateMode::Persistent)); + /// We create and remove counter/cnt- node to increment sequential number of counter/ node and make log entry numbers start from 1. + /// New replicas are created with log pointer equal to 0 and log pointer is a number of the last executed entry. + /// It means that we cannot have log entry with number 0. ops.emplace_back(zkutil::makeCreateRequest(zookeeper_path + "/counter/cnt-", "", zkutil::CreateMode::Persistent)); ops.emplace_back(zkutil::makeRemoveRequest(zookeeper_path + "/counter/cnt-", -1)); ops.emplace_back(zkutil::makeCreateRequest(zookeeper_path + "/metadata", "", zkutil::CreateMode::Persistent)); @@ -253,10 +259,7 @@ void DatabaseReplicated::createReplicaNodesInZooKeeper(const zkutil::ZooKeeperPt auto host_id = getHostID(global_context, db_uuid); /// On replica creation add empty entry to log. Can be used to trigger some actions on other replicas (e.g. update cluster info). 
- DDLLogEntry entry; - entry.hosts = {}; - entry.query = {}; - entry.initiator = {}; + DDLLogEntry entry{}; String query_path_prefix = zookeeper_path + "/log/query-"; String counter_prefix = zookeeper_path + "/counter/cnt-"; @@ -273,7 +276,7 @@ void DatabaseReplicated::createReplicaNodesInZooKeeper(const zkutil::ZooKeeperPt void DatabaseReplicated::loadStoredObjects(Context & context, bool has_force_restore_data_flag, bool force_attach) { - tryConnectToZooKeeper(force_attach); + tryConnectToZooKeeperAndInitDatabase(force_attach); DatabaseAtomic::loadStoredObjects(context, has_force_restore_data_flag, force_attach); @@ -281,7 +284,7 @@ void DatabaseReplicated::loadStoredObjects(Context & context, bool has_force_res ddl_worker->startup(); } -BlockIO DatabaseReplicated::propose(const ASTPtr & query, const Context & query_context) +BlockIO DatabaseReplicated::tryEnqueueReplicatedDDL(const ASTPtr & query, const Context & query_context) { if (is_readonly) throw Exception(ErrorCodes::NO_ZOOKEEPER, "Database is in readonly mode, because it cannot connect to ZooKeeper"); @@ -405,7 +408,7 @@ void DatabaseReplicated::recoverLostReplica(const ZooKeeperPtr & current_zookeep String db_name = getDatabaseName(); String to_db_name = getDatabaseName() + BROKEN_TABLES_SUFFIX; - if (total_tables < tables_to_detach.size() * 2) + if (total_tables * db_settings.max_broken_tables_ratio < tables_to_detach.size()) throw Exception(ErrorCodes::DATABASE_REPLICATION_FAILED, "Too many tables to recreate: {} of {}", tables_to_detach.size(), total_tables); else if (!tables_to_detach.empty()) { @@ -594,12 +597,12 @@ void DatabaseReplicated::shutdown() void DatabaseReplicated::dropTable(const Context & context, const String & table_name, bool no_delay) { - auto txn = context.getMetadataTransaction(); + auto txn = context.getZooKeeperMetadataTransaction(); assert(!ddl_worker->isCurrentlyActive() || txn); - if (txn && txn->is_initial_query) + if (txn && txn->isInitialQuery()) { String metadata_zk_path = zookeeper_path + "/metadata/" + escapeForFileName(table_name); - txn->ops.emplace_back(zkutil::makeRemoveRequest(metadata_zk_path, -1)); + txn->addOp(zkutil::makeRemoveRequest(metadata_zk_path, -1)); } DatabaseAtomic::dropTable(context, table_name, no_delay); } @@ -607,10 +610,10 @@ void DatabaseReplicated::dropTable(const Context & context, const String & table void DatabaseReplicated::renameTable(const Context & context, const String & table_name, IDatabase & to_database, const String & to_table_name, bool exchange, bool dictionary) { - auto txn = context.getMetadataTransaction(); + auto txn = context.getZooKeeperMetadataTransaction(); assert(txn); - if (txn->is_initial_query) + if (txn->isInitialQuery()) { if (this != &to_database) throw Exception(ErrorCodes::NOT_IMPLEMENTED, "Moving tables between databases is not supported for Replicated engine"); @@ -622,16 +625,16 @@ void DatabaseReplicated::renameTable(const Context & context, const String & tab throw Exception(ErrorCodes::UNKNOWN_TABLE, "Table {} does not exist", to_table_name); String statement = readMetadataFile(table_name); - String metadata_zk_path = txn->zookeeper_path + "/metadata/" + escapeForFileName(table_name); - String metadata_zk_path_to = txn->zookeeper_path + "/metadata/" + escapeForFileName(to_table_name); - txn->ops.emplace_back(zkutil::makeRemoveRequest(metadata_zk_path, -1)); + String metadata_zk_path = zookeeper_path + "/metadata/" + escapeForFileName(table_name); + String metadata_zk_path_to = zookeeper_path + "/metadata/" + 
escapeForFileName(to_table_name); + txn->addOp(zkutil::makeRemoveRequest(metadata_zk_path, -1)); if (exchange) { String statement_to = readMetadataFile(to_table_name); - txn->ops.emplace_back(zkutil::makeRemoveRequest(metadata_zk_path_to, -1)); - txn->ops.emplace_back(zkutil::makeCreateRequest(metadata_zk_path, statement_to, zkutil::CreateMode::Persistent)); + txn->addOp(zkutil::makeRemoveRequest(metadata_zk_path_to, -1)); + txn->addOp(zkutil::makeCreateRequest(metadata_zk_path, statement_to, zkutil::CreateMode::Persistent)); } - txn->ops.emplace_back(zkutil::makeCreateRequest(metadata_zk_path_to, statement, zkutil::CreateMode::Persistent)); + txn->addOp(zkutil::makeCreateRequest(metadata_zk_path_to, statement, zkutil::CreateMode::Persistent)); } DatabaseAtomic::renameTable(context, table_name, to_database, to_table_name, exchange, dictionary); @@ -641,14 +644,14 @@ void DatabaseReplicated::commitCreateTable(const ASTCreateQuery & query, const S const String & table_metadata_tmp_path, const String & table_metadata_path, const Context & query_context) { - auto txn = query_context.getMetadataTransaction(); + auto txn = query_context.getZooKeeperMetadataTransaction(); assert(!ddl_worker->isCurrentlyActive() || txn); - if (txn && txn->is_initial_query) + if (txn && txn->isInitialQuery()) { - String metadata_zk_path = txn->zookeeper_path + "/metadata/" + escapeForFileName(query.table); + String metadata_zk_path = zookeeper_path + "/metadata/" + escapeForFileName(query.table); String statement = getObjectDefinitionFromCreateQuery(query.clone()); /// zk::multi(...) will throw if `metadata_zk_path` exists - txn->ops.emplace_back(zkutil::makeCreateRequest(metadata_zk_path, statement, zkutil::CreateMode::Persistent)); + txn->addOp(zkutil::makeCreateRequest(metadata_zk_path, statement, zkutil::CreateMode::Persistent)); } DatabaseAtomic::commitCreateTable(query, table, table_metadata_tmp_path, table_metadata_path, query_context); } @@ -657,11 +660,11 @@ void DatabaseReplicated::commitAlterTable(const StorageID & table_id, const String & table_metadata_tmp_path, const String & table_metadata_path, const String & statement, const Context & query_context) { - auto txn = query_context.getMetadataTransaction(); - if (txn && txn->is_initial_query) + auto txn = query_context.getZooKeeperMetadataTransaction(); + if (txn && txn->isInitialQuery()) { - String metadata_zk_path = txn->zookeeper_path + "/metadata/" + escapeForFileName(table_id.table_name); - txn->ops.emplace_back(zkutil::makeSetRequest(metadata_zk_path, statement, -1)); + String metadata_zk_path = zookeeper_path + "/metadata/" + escapeForFileName(table_id.table_name); + txn->addOp(zkutil::makeSetRequest(metadata_zk_path, statement, -1)); } DatabaseAtomic::commitAlterTable(table_id, table_metadata_tmp_path, table_metadata_path, statement, query_context); } @@ -670,37 +673,37 @@ void DatabaseReplicated::createDictionary(const Context & context, const String & dictionary_name, const ASTPtr & query) { - auto txn = context.getMetadataTransaction(); + auto txn = context.getZooKeeperMetadataTransaction(); assert(!ddl_worker->isCurrentlyActive() || txn); - if (txn && txn->is_initial_query) + if (txn && txn->isInitialQuery()) { - String metadata_zk_path = txn->zookeeper_path + "/metadata/" + escapeForFileName(dictionary_name); + String metadata_zk_path = zookeeper_path + "/metadata/" + escapeForFileName(dictionary_name); String statement = getObjectDefinitionFromCreateQuery(query->clone()); - txn->ops.emplace_back(zkutil::makeCreateRequest(metadata_zk_path, 
statement, zkutil::CreateMode::Persistent)); + txn->addOp(zkutil::makeCreateRequest(metadata_zk_path, statement, zkutil::CreateMode::Persistent)); } DatabaseAtomic::createDictionary(context, dictionary_name, query); } void DatabaseReplicated::removeDictionary(const Context & context, const String & dictionary_name) { - auto txn = context.getMetadataTransaction(); + auto txn = context.getZooKeeperMetadataTransaction(); assert(!ddl_worker->isCurrentlyActive() || txn); - if (txn && txn->is_initial_query) + if (txn && txn->isInitialQuery()) { String metadata_zk_path = zookeeper_path + "/metadata/" + escapeForFileName(dictionary_name); - txn->ops.emplace_back(zkutil::makeRemoveRequest(metadata_zk_path, -1)); + txn->addOp(zkutil::makeRemoveRequest(metadata_zk_path, -1)); } DatabaseAtomic::removeDictionary(context, dictionary_name); } void DatabaseReplicated::detachTablePermanently(const Context & context, const String & table_name) { - auto txn = context.getMetadataTransaction(); + auto txn = context.getZooKeeperMetadataTransaction(); assert(!ddl_worker->isCurrentlyActive() || txn); - if (txn && txn->is_initial_query) + if (txn && txn->isInitialQuery()) { String metadata_zk_path = zookeeper_path + "/metadata/" + escapeForFileName(table_name); - txn->ops.emplace_back(zkutil::makeRemoveRequest(metadata_zk_path, -1)); + txn->addOp(zkutil::makeRemoveRequest(metadata_zk_path, -1)); } DatabaseAtomic::detachTablePermanently(context, table_name); } diff --git a/src/Databases/DatabaseReplicated.h b/src/Databases/DatabaseReplicated.h index a3a53e02ee4..fde53cf2c29 100644 --- a/src/Databases/DatabaseReplicated.h +++ b/src/Databases/DatabaseReplicated.h @@ -1,6 +1,7 @@ #pragma once #include +#include #include #include #include @@ -22,13 +23,14 @@ class DatabaseReplicated : public DatabaseAtomic public: DatabaseReplicated(const String & name_, const String & metadata_path_, UUID uuid, const String & zookeeper_path_, const String & shard_name_, const String & replica_name_, + DatabaseReplicatedSettings db_settings_, const Context & context); ~DatabaseReplicated() override; String getEngineName() const override { return "Replicated"; } - /// If current query is initial, then the following methods add metadata updating ZooKeeper operations to current MetadataTransaction. + /// If current query is initial, then the following methods add metadata updating ZooKeeper operations to current ZooKeeperMetadataTransaction. void dropTable(const Context &, const String & table_name, bool no_delay) override; void renameTable(const Context & context, const String & table_name, IDatabase & to_database, const String & to_table_name, bool exchange, bool dictionary) override; @@ -46,7 +48,7 @@ public: /// Try to execute DLL query on current host as initial query. If query is succeed, /// then it will be executed on all replicas. 
- BlockIO propose(const ASTPtr & query, const Context & query_context); + BlockIO tryEnqueueReplicatedDDL(const ASTPtr & query, const Context & query_context); void stopReplication(); @@ -64,7 +66,7 @@ public: friend struct DatabaseReplicatedTask; friend class DatabaseReplicatedDDLWorker; private: - void tryConnectToZooKeeper(bool force_attach); + void tryConnectToZooKeeperAndInitDatabase(bool force_attach); bool createDatabaseNodesInZooKeeper(const ZooKeeperPtr & current_zookeeper); void createReplicaNodesInZooKeeper(const ZooKeeperPtr & current_zookeeper); @@ -78,6 +80,7 @@ private: String shard_name; String replica_name; String replica_path; + DatabaseReplicatedSettings db_settings; zkutil::ZooKeeperPtr getZooKeeper() const; diff --git a/src/Databases/DatabaseReplicatedSettings.cpp b/src/Databases/DatabaseReplicatedSettings.cpp new file mode 100644 index 00000000000..61febcf2810 --- /dev/null +++ b/src/Databases/DatabaseReplicatedSettings.cpp @@ -0,0 +1,23 @@ +#include +#include +#include + +namespace DB +{ + +IMPLEMENT_SETTINGS_TRAITS(DatabaseReplicatedSettingsTraits, LIST_OF_DATABASE_REPLICATED_SETTINGS) + +void DatabaseReplicatedSettings::loadFromQuery(ASTStorage & storage_def) +{ + if (storage_def.settings) + { + applyChanges(storage_def.settings->changes); + return; + } + + auto settings_ast = std::make_shared(); + settings_ast->is_standalone = false; + storage_def.set(storage_def.settings, settings_ast); +} + +} diff --git a/src/Databases/DatabaseReplicatedSettings.h b/src/Databases/DatabaseReplicatedSettings.h new file mode 100644 index 00000000000..11d5b3820e4 --- /dev/null +++ b/src/Databases/DatabaseReplicatedSettings.h @@ -0,0 +1,26 @@ +#pragma once +#include +#include + +namespace DB +{ + +class ASTStorage; + +#define LIST_OF_DATABASE_REPLICATED_SETTINGS(M) \ + M(Float, max_broken_tables_ratio, 0.5, "Do not recover replica automatically if the ratio of staled tables to all tables is greater", 0) \ + M(UInt64, max_replication_lag_to_enqueue, 10, "Replica will throw exception on attempt to execute query if its replication lag greater", 0) \ + M(UInt64, wait_entry_commited_timeout_sec, 3600, "Replicas will try to cancel query if timeout exceed, but initiator host has not executed it yet", 0) \ + +DECLARE_SETTINGS_TRAITS(DatabaseReplicatedSettingsTraits, LIST_OF_DATABASE_REPLICATED_SETTINGS) + + +/** Settings for the MaterializeMySQL database engine. + * Could be loaded from a CREATE DATABASE query (SETTINGS clause). 
+ */ +struct DatabaseReplicatedSettings : public BaseSettings +{ + void loadFromQuery(ASTStorage & storage_def); +}; + +} diff --git a/src/Databases/DatabaseReplicatedWorker.cpp b/src/Databases/DatabaseReplicatedWorker.cpp index ff15878b136..e0c5717711c 100644 --- a/src/Databases/DatabaseReplicatedWorker.cpp +++ b/src/Databases/DatabaseReplicatedWorker.cpp @@ -30,7 +30,7 @@ void DatabaseReplicatedDDLWorker::initializeMainThread() { auto zookeeper = getAndSetZooKeeper(); if (database->is_readonly) - database->tryConnectToZooKeeper(false); + database->tryConnectToZooKeeperAndInitDatabase(false); initializeReplication(); initialized = true; return; @@ -98,8 +98,7 @@ String DatabaseReplicatedDDLWorker::tryEnqueueAndExecuteEntry(DDLLogEntry & entr UInt32 our_log_ptr = parse(zookeeper->get(database->replica_path + "/log_ptr")); UInt32 max_log_ptr = parse(zookeeper->get(database->zookeeper_path + "/max_log_ptr")); assert(our_log_ptr <= max_log_ptr); - constexpr UInt32 max_replication_lag = 16; - if (max_replication_lag < max_log_ptr - our_log_ptr) + if (database->db_settings.max_replication_lag_to_enqueue < max_log_ptr - our_log_ptr) throw Exception(ErrorCodes::NOT_A_LEADER, "Cannot enqueue query on this replica, " "because it has replication lag of {} queries. Try other replica.", max_log_ptr - our_log_ptr); @@ -131,7 +130,7 @@ String DatabaseReplicatedDDLWorker::tryEnqueueAndExecuteEntry(DDLLogEntry & entr if (zookeeper->expired() || stop_flag) throw Exception(ErrorCodes::DATABASE_REPLICATION_FAILED, "ZooKeeper session expired or replication stopped, try again"); - processTask(*task); + processTask(*task, zookeeper); if (!task->was_executed) { @@ -139,7 +138,7 @@ String DatabaseReplicatedDDLWorker::tryEnqueueAndExecuteEntry(DDLLogEntry & entr task->execution_status.code, task->execution_status.message); } - try_node->reset(); + try_node->setAlreadyRemoved(); return entry_path; } @@ -178,7 +177,7 @@ DDLTaskPtr DatabaseReplicatedDDLWorker::initAndCheckTask(const String & entry_na /// Query is not committed yet. We cannot just skip it and execute next one, because reordering may break replication. LOG_TRACE(log, "Waiting for initiator {} to commit or rollback entry {}", initiator_name, entry_path); constexpr size_t wait_time_ms = 1000; - constexpr size_t max_iterations = 3600; + size_t max_iterations = database->db_settings.wait_entry_commited_timeout_sec; size_t iteration = 0; while (!wait_committed_or_failed->tryWait(wait_time_ms)) @@ -194,7 +193,7 @@ DDLTaskPtr DatabaseReplicatedDDLWorker::initAndCheckTask(const String & entry_na if (max_iterations <= ++iteration) { /// What can we do if initiator hangs for some reason? Seems like we can remove /try node. - /// Initiator will fail to commit entry to ZK (including ops for replicated table) if /try does not exist. + /// Initiator will fail to commit ZooKeeperMetadataTransaction (including ops for replicated table) if /try does not exist. /// But it's questionable. /// We use tryRemove(...) because multiple hosts (including initiator) may try to do it concurrently. diff --git a/src/Databases/DatabaseReplicatedWorker.h b/src/Databases/DatabaseReplicatedWorker.h index 1eafe2489e7..6dd8dc408d7 100644 --- a/src/Databases/DatabaseReplicatedWorker.h +++ b/src/Databases/DatabaseReplicatedWorker.h @@ -6,6 +6,18 @@ namespace DB class DatabaseReplicated; +/// It's similar to DDLWorker, but has the following differences: +/// 1. 
DDL queue in ZooKeeper is not shared between multiple clusters and databases, +/// each DatabaseReplicated has its own queue in ZooKeeper and DatabaseReplicatedDDLWorker object. +/// 2. Shards and replicas are identified by shard_name and replica_name arguments of database engine, +/// not by address:port pairs. Cluster (of multiple database replicas) is identified by its zookeeper_path. +/// 3. After creation of an entry in DDL queue initiator tries to execute the entry locally +/// and other hosts wait for query to finish on initiator host. +/// If query succeed on initiator, then all hosts must execute it, so they will retry until query succeed. +/// We assume that cluster is homogenous, so if replicas are in consistent state and query succeed on one host, +/// then all hosts can execute it (maybe after several retries). +/// 4. Each database replica stores its log pointer in ZooKeeper. Cleanup thread removes old entry +/// if its number < max_log_ptr - logs_to_keep. class DatabaseReplicatedDDLWorker : public DDLWorker { public: diff --git a/src/Databases/DatabaseWithDictionaries.cpp b/src/Databases/DatabaseWithDictionaries.cpp index 7ce5de56b64..d92f0f1897e 100644 --- a/src/Databases/DatabaseWithDictionaries.cpp +++ b/src/Databases/DatabaseWithDictionaries.cpp @@ -194,7 +194,7 @@ void DatabaseWithDictionaries::createDictionary(const Context & context, const S detachDictionary(dictionary_name); }); - auto txn = context.getMetadataTransaction(); + auto txn = context.getZooKeeperMetadataTransaction(); if (txn && !context.isInternalSubquery()) txn->commit(); /// Commit point (a sort of) for Replicated database @@ -219,7 +219,7 @@ void DatabaseWithDictionaries::removeDictionary(const Context & context, const S { String dictionary_metadata_path = getObjectMetadataPath(dictionary_name); - auto txn = context.getMetadataTransaction(); + auto txn = context.getZooKeeperMetadataTransaction(); if (txn && !context.isInternalSubquery()) txn->commit(); /// Commit point (a sort of) for Replicated database diff --git a/src/Databases/ya.make b/src/Databases/ya.make index 38f79532080..8bd3f291a64 100644 --- a/src/Databases/ya.make +++ b/src/Databases/ya.make @@ -17,6 +17,7 @@ SRCS( DatabaseOnDisk.cpp DatabaseOrdinary.cpp DatabaseReplicated.cpp + DatabaseReplicatedSettings.cpp DatabaseReplicatedWorker.cpp DatabaseWithDictionaries.cpp DatabasesCommon.cpp diff --git a/src/Interpreters/Context.cpp b/src/Interpreters/Context.cpp index 766b14dea42..98e4a87fba3 100644 --- a/src/Interpreters/Context.cpp +++ b/src/Interpreters/Context.cpp @@ -2553,14 +2553,14 @@ StorageID Context::resolveStorageIDImpl(StorageID storage_id, StorageNamespace w return StorageID::createEmpty(); } -void Context::initMetadataTransaction(MetadataTransactionPtr txn, [[maybe_unused]] bool attach_existing) +void Context::initZooKeeperMetadataTransaction(ZooKeeperMetadataTransactionPtr txn, [[maybe_unused]] bool attach_existing) { assert(!metadata_transaction); assert(attach_existing || query_context == this); metadata_transaction = std::move(txn); } -MetadataTransactionPtr Context::getMetadataTransaction() const +ZooKeeperMetadataTransactionPtr Context::getZooKeeperMetadataTransaction() const { assert(!metadata_transaction || hasQueryContext()); return metadata_transaction; diff --git a/src/Interpreters/Context.h b/src/Interpreters/Context.h index 24d0eb4b0de..563fb172488 100644 --- a/src/Interpreters/Context.h +++ b/src/Interpreters/Context.h @@ -117,8 +117,8 @@ using VolumePtr = std::shared_ptr; struct NamedSession; struct 
BackgroundTaskSchedulingSettings; -struct MetadataTransaction; -using MetadataTransactionPtr = std::shared_ptr; +class ZooKeeperMetadataTransaction; +using ZooKeeperMetadataTransactionPtr = std::shared_ptr; #if USE_EMBEDDED_COMPILER class CompiledExpressionCache; @@ -281,7 +281,7 @@ private: /// to be customized in HTTP and TCP servers by overloading the customizeContext(DB::Context&) /// methods. - MetadataTransactionPtr metadata_transaction; /// Distributed DDL context. I'm not sure if it's a suitable place for this, + ZooKeeperMetadataTransactionPtr metadata_transaction; /// Distributed DDL context. I'm not sure if it's a suitable place for this, /// but it's the easiest way to pass this through the whole stack from executeQuery(...) /// to DatabaseOnDisk::commitCreateTable(...) or IStorage::alter(...) without changing /// thousands of signatures. @@ -746,8 +746,10 @@ public: IHostContextPtr & getHostContext(); const IHostContextPtr & getHostContext() const; - void initMetadataTransaction(MetadataTransactionPtr txn, bool attach_existing = false); - MetadataTransactionPtr getMetadataTransaction() const; + /// Initialize context of distributed DDL query with Replicated database. + void initZooKeeperMetadataTransaction(ZooKeeperMetadataTransactionPtr txn, bool attach_existing = false); + /// Returns context of current distributed DDL query or nullptr. + ZooKeeperMetadataTransactionPtr getZooKeeperMetadataTransaction() const; struct MySQLWireContext { diff --git a/src/Interpreters/DDLTask.cpp b/src/Interpreters/DDLTask.cpp index 7f47f0a6659..4be465d3de4 100644 --- a/src/Interpreters/DDLTask.cpp +++ b/src/Interpreters/DDLTask.cpp @@ -96,7 +96,7 @@ void DDLTaskBase::parseQueryFromEntry(const Context & context) query = parseQuery(parser_query, begin, end, description, 0, context.getSettingsRef().max_parser_depth); } -std::unique_ptr DDLTaskBase::makeQueryContext(Context & from_context) +std::unique_ptr DDLTaskBase::makeQueryContext(Context & from_context, const ZooKeeperPtr & /*zookeeper*/) { auto query_context = std::make_unique(from_context); query_context->makeQueryContext(); @@ -293,28 +293,26 @@ String DatabaseReplicatedTask::getShardID() const return database->shard_name; } -std::unique_ptr DatabaseReplicatedTask::makeQueryContext(Context & from_context) +std::unique_ptr DatabaseReplicatedTask::makeQueryContext(Context & from_context, const ZooKeeperPtr & zookeeper) { - auto query_context = DDLTaskBase::makeQueryContext(from_context); + auto query_context = DDLTaskBase::makeQueryContext(from_context, zookeeper); query_context->getClientInfo().query_kind = ClientInfo::QueryKind::SECONDARY_QUERY; query_context->setCurrentDatabase(database->getDatabaseName()); - auto txn = std::make_shared(); - query_context->initMetadataTransaction(txn); - txn->current_zookeeper = from_context.getZooKeeper(); - txn->zookeeper_path = database->zookeeper_path; - txn->is_initial_query = is_initial_query; + auto txn = std::make_shared(zookeeper, database->zookeeper_path, is_initial_query); + query_context->initZooKeeperMetadataTransaction(txn); if (is_initial_query) { - txn->ops.emplace_back(zkutil::makeRemoveRequest(entry_path + "/try", -1)); - txn->ops.emplace_back(zkutil::makeCreateRequest(entry_path + "/committed", host_id_str, zkutil::CreateMode::Persistent)); - txn->ops.emplace_back(zkutil::makeSetRequest(database->zookeeper_path + "/max_log_ptr", toString(getLogEntryNumber(entry_name)), -1)); + txn->addOp(zkutil::makeRemoveRequest(entry_path + "/try", -1)); + 
txn->addOp(zkutil::makeCreateRequest(entry_path + "/committed", host_id_str, zkutil::CreateMode::Persistent)); + txn->addOp(zkutil::makeSetRequest(database->zookeeper_path + "/max_log_ptr", toString(getLogEntryNumber(entry_name)), -1)); } - txn->ops.emplace_back(zkutil::makeSetRequest(database->replica_path + "/log_ptr", toString(getLogEntryNumber(entry_name)), -1)); + txn->addOp(zkutil::makeSetRequest(database->replica_path + "/log_ptr", toString(getLogEntryNumber(entry_name)), -1)); - std::move(ops.begin(), ops.end(), std::back_inserter(txn->ops)); + for (auto & op : ops) + txn->addOp(std::move(op)); ops.clear(); return query_context; @@ -335,7 +333,7 @@ UInt32 DDLTaskBase::getLogEntryNumber(const String & log_entry_name) return parse(log_entry_name.substr(strlen(name))); } -void MetadataTransaction::commit() +void ZooKeeperMetadataTransaction::commit() { assert(state == CREATED); state = FAILED; diff --git a/src/Interpreters/DDLTask.h b/src/Interpreters/DDLTask.h index f02e17103aa..18c1f4c80cd 100644 --- a/src/Interpreters/DDLTask.h +++ b/src/Interpreters/DDLTask.h @@ -20,8 +20,8 @@ class ASTQueryWithOnCluster; using ZooKeeperPtr = std::shared_ptr; class DatabaseReplicated; -struct MetadataTransaction; -using MetadataTransactionPtr = std::shared_ptr; +class ZooKeeperMetadataTransaction; +using ZooKeeperMetadataTransactionPtr = std::shared_ptr; struct HostID { @@ -95,7 +95,7 @@ struct DDLTaskBase virtual String getShardID() const = 0; - virtual std::unique_ptr makeQueryContext(Context & from_context); + virtual std::unique_ptr makeQueryContext(Context & from_context, const ZooKeeperPtr & zookeeper); inline String getActiveNodePath() const { return entry_path + "/active/" + host_id_str; } inline String getFinishedNodePath() const { return entry_path + "/finished/" + host_id_str; } @@ -132,13 +132,19 @@ struct DatabaseReplicatedTask : public DDLTaskBase DatabaseReplicatedTask(const String & name, const String & path, DatabaseReplicated * database_); String getShardID() const override; - std::unique_ptr makeQueryContext(Context & from_context) override; + std::unique_ptr makeQueryContext(Context & from_context, const ZooKeeperPtr & zookeeper) override; DatabaseReplicated * database; }; - -struct MetadataTransaction +/// The main purpose of ZooKeeperMetadataTransaction is to execute all zookeeper operation related to query +/// in a single transaction when we performed all required checks and ready to "commit" changes. +/// For example, create ALTER_METADATA entry in ReplicatedMergeTree log, +/// create path/to/entry/finished/host_id node in distributed DDL queue to mark query as executed and +/// update metadata in path/to/replicated_database/metadata/table_name +/// It's used for DatabaseReplicated. 
+/// TODO we can also use it for ordinary ON CLUSTER queries +class ZooKeeperMetadataTransaction { enum State { @@ -153,8 +159,29 @@ struct MetadataTransaction bool is_initial_query; Coordination::Requests ops; +public: + ZooKeeperMetadataTransaction(const ZooKeeperPtr & current_zookeeper_, const String & zookeeper_path_, bool is_initial_query_) + : current_zookeeper(current_zookeeper_) + , zookeeper_path(zookeeper_path_) + , is_initial_query(is_initial_query_) + { + } + + bool isInitialQuery() const { return is_initial_query; } + + bool isExecuted() const { return state != CREATED; } + + String getDatabaseZooKeeperPath() const { return zookeeper_path; } + + void addOp(Coordination::RequestPtr && op) + { + assert(!isExecuted()); + ops.emplace_back(op); + } + void moveOpsTo(Coordination::Requests & other_ops) { + assert(!isExecuted()); std::move(ops.begin(), ops.end(), std::back_inserter(other_ops)); ops.clear(); state = COMMITTED; @@ -162,7 +189,7 @@ struct MetadataTransaction void commit(); - ~MetadataTransaction() { assert(state != CREATED || std::uncaught_exception()); } + ~ZooKeeperMetadataTransaction() { assert(isExecuted() || std::uncaught_exception()); } }; } diff --git a/src/Interpreters/DDLWorker.cpp b/src/Interpreters/DDLWorker.cpp index 12fd03b3b70..67f716c235c 100644 --- a/src/Interpreters/DDLWorker.cpp +++ b/src/Interpreters/DDLWorker.cpp @@ -195,16 +195,15 @@ void DDLWorker::startup() void DDLWorker::shutdown() { - stop_flag = true; - queue_updated_event->set(); - cleanup_event->set(); - - if (main_thread.joinable()) + bool prev_stop_flag = stop_flag.exchange(true); + if (!prev_stop_flag) + { + queue_updated_event->set(); + cleanup_event->set(); main_thread.join(); - if (cleanup_thread.joinable()) cleanup_thread.join(); - - worker_pool.reset(); + worker_pool.reset(); + } } DDLWorker::~DDLWorker() @@ -267,6 +266,8 @@ DDLTaskPtr DDLWorker::initAndCheckTask(const String & entry_name, String & out_r } /// Stage 2: resolve host_id and check if we should execute query or not + /// Multiple clusters can use single DDL queue path in ZooKeeper, + /// So we should skip task if we cannot find current host in cluster hosts list. 
if (!task->findCurrentHostID(context, log)) { out_reason = "There is no a local address in host list"; @@ -317,7 +318,7 @@ void DDLWorker::scheduleTasks() bool status_written = zookeeper->exists(task->getFinishedNodePath()); if (task->was_executed && !status_written && task_still_exists) { - processTask(*task); + processTask(*task, zookeeper); } } @@ -364,15 +365,15 @@ void DDLWorker::scheduleTasks() if (worker_pool) { - worker_pool->scheduleOrThrowOnError([this, &saved_task]() + worker_pool->scheduleOrThrowOnError([this, &saved_task, &zookeeper]() { setThreadName("DDLWorkerExec"); - processTask(saved_task); + processTask(saved_task, zookeeper); }); } else { - processTask(saved_task); + processTask(saved_task, zookeeper); } } } @@ -385,7 +386,7 @@ DDLTaskBase & DDLWorker::saveTask(DDLTaskPtr && task) return *current_tasks.back(); } -bool DDLWorker::tryExecuteQuery(const String & query, DDLTaskBase & task) +bool DDLWorker::tryExecuteQuery(const String & query, DDLTaskBase & task, const ZooKeeperPtr & zookeeper) { /// Add special comment at the start of query to easily identify DDL-produced queries in query_log String query_prefix = "/* ddl_entry=" + task.entry_name + " */ "; @@ -398,14 +399,16 @@ bool DDLWorker::tryExecuteQuery(const String & query, DDLTaskBase & task) try { - auto query_context = task.makeQueryContext(context); + auto query_context = task.makeQueryContext(context, zookeeper); if (!task.is_initial_query) query_scope.emplace(*query_context); executeQuery(istr, ostr, !task.is_initial_query, *query_context, {}); - if (auto txn = query_context->getMetadataTransaction()) + if (auto txn = query_context->getZooKeeperMetadataTransaction()) { - if (txn->state == MetadataTransaction::CREATED) + /// Most queries commit changes to ZooKeeper right before applying local changes, + /// but some queries does not support it, so we have to do it here. + if (!txn->isExecuted()) txn->commit(); } } @@ -463,10 +466,8 @@ void DDLWorker::updateMaxDDLEntryID(const String & entry_name) } } -void DDLWorker::processTask(DDLTaskBase & task) +void DDLWorker::processTask(DDLTaskBase & task, const ZooKeeperPtr & zookeeper) { - auto zookeeper = tryGetZooKeeper(); - LOG_DEBUG(log, "Processing task {} ({})", task.entry_name, task.entry.query); String active_node_path = task.getActiveNodePath(); @@ -541,7 +542,7 @@ void DDLWorker::processTask(DDLTaskBase & task) else { storage.reset(); - tryExecuteQuery(rewritten_query, task); + tryExecuteQuery(rewritten_query, task, zookeeper); } } catch (const Coordination::Exception &) @@ -565,7 +566,7 @@ void DDLWorker::processTask(DDLTaskBase & task) } else { - /// task.ops where not executed by table or database engine, se DDLWorker is responsible for + /// task.ops where not executed by table or database engine, so DDLWorker is responsible for /// writing query execution status into ZooKeeper. 
task.ops.emplace_back(zkutil::makeSetRequest(finished_node_path, task.execution_status.serializeText(), -1)); } @@ -589,7 +590,7 @@ void DDLWorker::processTask(DDLTaskBase & task) } /// Active node was removed in multi ops - active_node->reset(); + active_node->setAlreadyRemoved(); task.completely_processed = true; } @@ -712,7 +713,7 @@ bool DDLWorker::tryExecuteQueryOnLeaderReplica( /// If the leader will unexpectedly changed this method will return false /// and on the next iteration new leader will take lock - if (tryExecuteQuery(rewritten_query, task)) + if (tryExecuteQuery(rewritten_query, task, zookeeper)) { executed_by_us = true; break; diff --git a/src/Interpreters/DDLWorker.h b/src/Interpreters/DDLWorker.h index c39a832c098..8b0a8f038a0 100644 --- a/src/Interpreters/DDLWorker.h +++ b/src/Interpreters/DDLWorker.h @@ -77,7 +77,7 @@ protected: /// Returns non-empty DDLTaskPtr if entry parsed and the check is passed virtual DDLTaskPtr initAndCheckTask(const String & entry_name, String & out_reason, const ZooKeeperPtr & zookeeper); - void processTask(DDLTaskBase & task); + void processTask(DDLTaskBase & task, const ZooKeeperPtr & zookeeper); void updateMaxDDLEntryID(const String & entry_name); /// Check that query should be executed on leader replica only @@ -95,7 +95,7 @@ protected: const String & node_path, const ZooKeeperPtr & zookeeper); - bool tryExecuteQuery(const String & query, DDLTaskBase & task); + bool tryExecuteQuery(const String & query, DDLTaskBase & task, const ZooKeeperPtr & zookeeper); /// Checks and cleanups queue's nodes void cleanupQueue(Int64 current_time_seconds, const ZooKeeperPtr & zookeeper); diff --git a/src/Interpreters/InterpreterAlterQuery.cpp b/src/Interpreters/InterpreterAlterQuery.cpp index 402f05895bc..bf624507574 100644 --- a/src/Interpreters/InterpreterAlterQuery.cpp +++ b/src/Interpreters/InterpreterAlterQuery.cpp @@ -54,7 +54,7 @@ BlockIO InterpreterAlterQuery::execute() { auto guard = DatabaseCatalog::instance().getDDLGuard(table_id.database_name, table_id.table_name); guard->releaseTableLock(); - return typeid_cast(database.get())->propose(query_ptr, context); + return typeid_cast(database.get())->tryEnqueueReplicatedDDL(query_ptr, context); } StoragePtr table = DatabaseCatalog::instance().getTable(table_id, context); diff --git a/src/Interpreters/InterpreterCreateQuery.cpp b/src/Interpreters/InterpreterCreateQuery.cpp index 2021c1f1d60..2b1dddde78c 100644 --- a/src/Interpreters/InterpreterCreateQuery.cpp +++ b/src/Interpreters/InterpreterCreateQuery.cpp @@ -880,7 +880,7 @@ BlockIO InterpreterCreateQuery::createTable(ASTCreateQuery & create) { assertOrSetUUID(create, database); guard->releaseTableLock(); - return typeid_cast(database.get())->propose(query_ptr, context); + return typeid_cast(database.get())->tryEnqueueReplicatedDDL(query_ptr, context); } } @@ -1092,7 +1092,7 @@ BlockIO InterpreterCreateQuery::createDictionary(ASTCreateQuery & create) if (!create.attach) assertOrSetUUID(create, database); guard->releaseTableLock(); - return typeid_cast(database.get())->propose(query_ptr, context); + return typeid_cast(database.get())->tryEnqueueReplicatedDDL(query_ptr, context); } if (database->isDictionaryExist(dictionary_name)) diff --git a/src/Interpreters/InterpreterDropQuery.cpp b/src/Interpreters/InterpreterDropQuery.cpp index 9e63c647f71..33e93a79c41 100644 --- a/src/Interpreters/InterpreterDropQuery.cpp +++ b/src/Interpreters/InterpreterDropQuery.cpp @@ -146,7 +146,7 @@ BlockIO InterpreterDropQuery::executeToTableImpl(const ASTDropQuery & 
query, Dat ddl_guard->releaseTableLock(); table.reset(); - return typeid_cast(database.get())->propose(query.clone(), context); + return typeid_cast(database.get())->tryEnqueueReplicatedDDL(query.clone(), context); } if (query.kind == ASTDropQuery::Kind::Detach) @@ -231,7 +231,7 @@ BlockIO InterpreterDropQuery::executeToDictionary( context.checkAccess(AccessType::DROP_DICTIONARY, database_name, dictionary_name); ddl_guard->releaseTableLock(); - return typeid_cast(database.get())->propose(query_ptr, context); + return typeid_cast(database.get())->tryEnqueueReplicatedDDL(query_ptr, context); } if (!database || !database->isDictionaryExist(dictionary_name)) diff --git a/src/Interpreters/InterpreterRenameQuery.cpp b/src/Interpreters/InterpreterRenameQuery.cpp index b9d7faac73c..923a342d9ea 100644 --- a/src/Interpreters/InterpreterRenameQuery.cpp +++ b/src/Interpreters/InterpreterRenameQuery.cpp @@ -90,7 +90,7 @@ BlockIO InterpreterRenameQuery::executeToTables(const ASTRenameQuery & rename, c UniqueTableName to(elem.to_database_name, elem.to_table_name); ddl_guards[from]->releaseTableLock(); ddl_guards[to]->releaseTableLock(); - return typeid_cast(database.get())->propose(query_ptr, context); + return typeid_cast(database.get())->tryEnqueueReplicatedDDL(query_ptr, context); } else { diff --git a/src/Storages/StorageMaterializedView.cpp b/src/Storages/StorageMaterializedView.cpp index 32317968fe5..325bf3d2f74 100644 --- a/src/Storages/StorageMaterializedView.cpp +++ b/src/Storages/StorageMaterializedView.cpp @@ -212,11 +212,11 @@ static void executeDropQuery(ASTDropQuery::Kind kind, const Context & global_con /// looks like expected behaviour and we have tests for it. auto drop_context = Context(global_context); drop_context.getClientInfo().query_kind = ClientInfo::QueryKind::SECONDARY_QUERY; - if (auto txn = current_context.getMetadataTransaction()) + if (auto txn = current_context.getZooKeeperMetadataTransaction()) { /// For Replicated database drop_context.setQueryContext(const_cast(current_context)); - drop_context.initMetadataTransaction(txn, true); + drop_context.initZooKeeperMetadataTransaction(txn, true); } InterpreterDropQuery drop_interpreter(ast_drop_query, drop_context); drop_interpreter.execute(); diff --git a/src/Storages/StorageReplicatedMergeTree.cpp b/src/Storages/StorageReplicatedMergeTree.cpp index ff39bf91fbb..f2c88cdedd9 100644 --- a/src/Storages/StorageReplicatedMergeTree.cpp +++ b/src/Storages/StorageReplicatedMergeTree.cpp @@ -4282,12 +4282,12 @@ void StorageReplicatedMergeTree::alter( zkutil::makeCreateRequest(mutations_path + "/", mutation_entry.toString(), zkutil::CreateMode::PersistentSequential)); } - if (auto txn = query_context.getMetadataTransaction()) + if (auto txn = query_context.getZooKeeperMetadataTransaction()) { txn->moveOpsTo(ops); /// NOTE: IDatabase::alterTable(...) is called when executing ALTER_METADATA queue entry without query context, /// so we have to update metadata of DatabaseReplicated here. 
- String metadata_zk_path = txn->zookeeper_path + "/metadata/" + escapeForFileName(table_id.table_name); + String metadata_zk_path = txn->getDatabaseZooKeeperPath() + "/metadata/" + escapeForFileName(table_id.table_name); auto ast = DatabaseCatalog::instance().getDatabase(table_id.database_name)->getCreateTableQuery(table_id.table_name, query_context); applyMetadataChangesToCreateQuery(ast, future_metadata); ops.emplace_back(zkutil::makeSetRequest(metadata_zk_path, getObjectDefinitionFromCreateQuery(ast), -1)); @@ -5262,7 +5262,7 @@ void StorageReplicatedMergeTree::mutate(const MutationCommands & commands, const requests.emplace_back(zkutil::makeCreateRequest( mutations_path + "/", mutation_entry.toString(), zkutil::CreateMode::PersistentSequential)); - if (auto txn = query_context.getMetadataTransaction()) + if (auto txn = query_context.getZooKeeperMetadataTransaction()) txn->moveOpsTo(requests); Coordination::Responses responses; @@ -5766,7 +5766,7 @@ void StorageReplicatedMergeTree::replacePartitionFrom( } } - if (auto txn = context.getMetadataTransaction()) + if (auto txn = context.getZooKeeperMetadataTransaction()) txn->moveOpsTo(ops); ops.emplace_back(zkutil::makeSetRequest(zookeeper_path + "/log", "", -1)); /// Just update version @@ -6269,7 +6269,7 @@ bool StorageReplicatedMergeTree::dropAllPartsInPartition( Coordination::Requests ops; ops.emplace_back(zkutil::makeCreateRequest(zookeeper_path + "/log/log-", entry.toString(), zkutil::CreateMode::PersistentSequential)); ops.emplace_back(zkutil::makeSetRequest(zookeeper_path + "/log", "", -1)); /// Just update version. - if (auto txn = query_context.getMetadataTransaction()) + if (auto txn = query_context.getZooKeeperMetadataTransaction()) txn->moveOpsTo(ops); Coordination::Responses responses = zookeeper.multi(ops); diff --git a/tests/queries/skip_list.json b/tests/queries/skip_list.json index f08a41e32b8..e6bb3747fb0 100644 --- a/tests/queries/skip_list.json +++ b/tests/queries/skip_list.json @@ -108,6 +108,7 @@ "memory_tracking", "memory_usage", "live_view", + "00825_protobuf_format_map", "00152_insert_different_granularity", "01715_background_checker_blather_zookeeper", "01714_alter_drop_version", From e8583ddfe2f03b20d86e9ce85a8215e7ee46d0f4 Mon Sep 17 00:00:00 2001 From: alexey-milovidov Date: Sat, 20 Feb 2021 09:10:15 +0300 Subject: [PATCH 249/381] Update BaseDaemon.cpp --- base/daemon/BaseDaemon.cpp | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/base/daemon/BaseDaemon.cpp b/base/daemon/BaseDaemon.cpp index 248ffdd4d10..83384038b7c 100644 --- a/base/daemon/BaseDaemon.cpp +++ b/base/daemon/BaseDaemon.cpp @@ -416,7 +416,9 @@ static void sanitizerDeathCallback() else log_message = "Terminate called without an active exception"; - static const size_t buf_size = PIPE_BUF; + /// POSIX.1 says that write(2)s of less than PIPE_BUF bytes must be atomic - man 7 pipe + /// And the buffer should not be too small because our exception messages can be large. 
+ static constexpr size_t buf_size = PIPE_BUF; if (log_message.size() > buf_size - 16) log_message.resize(buf_size - 16); From 487fb09ff670a379deddc953b2bd1f52d3c77a39 Mon Sep 17 00:00:00 2001 From: Amos Bird Date: Sat, 20 Feb 2021 14:11:01 +0800 Subject: [PATCH 250/381] Suppress signed overflow in AggregateFunctionGroupArrayMoving 2 --- src/AggregateFunctions/AggregateFunctionGroupArrayMoving.h | 6 +++--- tests/queries/0_stateless/01177_group_array_moving.sql | 2 ++ 2 files changed, 5 insertions(+), 3 deletions(-) diff --git a/src/AggregateFunctions/AggregateFunctionGroupArrayMoving.h b/src/AggregateFunctions/AggregateFunctionGroupArrayMoving.h index 2a713f3aed2..3bab831d316 100644 --- a/src/AggregateFunctions/AggregateFunctionGroupArrayMoving.h +++ b/src/AggregateFunctions/AggregateFunctionGroupArrayMoving.h @@ -52,7 +52,7 @@ struct MovingSumData : public MovingData { static constexpr auto name = "groupArrayMovingSum"; - T get(size_t idx, UInt64 window_size) const + T NO_SANITIZE_UNDEFINED get(size_t idx, UInt64 window_size) const { if (idx < window_size) return this->value[idx]; @@ -66,7 +66,7 @@ struct MovingAvgData : public MovingData { static constexpr auto name = "groupArrayMovingAvg"; - T get(size_t idx, UInt64 window_size) const + T NO_SANITIZE_UNDEFINED get(size_t idx, UInt64 window_size) const { if (idx < window_size) return this->value[idx] / window_size; @@ -114,7 +114,7 @@ public: return std::make_shared(std::make_shared()); } - void add(AggregateDataPtr __restrict place, const IColumn ** columns, size_t row_num, Arena * arena) const override + void NO_SANITIZE_UNDEFINED add(AggregateDataPtr __restrict place, const IColumn ** columns, size_t row_num, Arena * arena) const override { auto value = static_cast(*columns[0]).getData()[row_num]; this->data(place).add(static_cast(value), arena); diff --git a/tests/queries/0_stateless/01177_group_array_moving.sql b/tests/queries/0_stateless/01177_group_array_moving.sql index b1969e204fc..5689cd95f75 100644 --- a/tests/queries/0_stateless/01177_group_array_moving.sql +++ b/tests/queries/0_stateless/01177_group_array_moving.sql @@ -1,2 +1,4 @@ SELECT groupArrayMovingSum(257)(-9223372036854775808), groupArrayMovingSum(1048575)(18446744073709551615), groupArrayMovingSum(9223372036854775807)(number * 9223372036854775807) FROM remote('127.0.0.{1..2}', numbers(3)); SELECT groupArrayMovingAvg(257)(-9223372036854775808), groupArrayMovingAvg(1048575)(18446744073709551615), groupArrayMovingAvg(9223372036854775807)(number * 9223372036854775807) FROM remote('127.0.0.{1..2}', numbers(3)); + +SELECT groupArrayMovingSum(257)(-9223372036854775808), groupArrayMovingSum(1)(10.000100135803223, [NULL, NULL], NULL), groupArrayMovingSum(NULL)(NULL) FROM numbers(1023) FORMAT Null; From 7c04f15c8031a63f20573b9948dd18005f860f26 Mon Sep 17 00:00:00 2001 From: Azat Khuzhin Date: Sat, 20 Feb 2021 09:11:42 +0300 Subject: [PATCH 251/381] Add log message when stacktrace cannot be obtained for thread This is to provide better diagnostics for 01051_system_stack_trace failure [1]. 
[1]: https://clickhouse-test-reports.s3.yandex.net/20881/866dfaec793f764dc9ba167d3ac9f6521b9b3381/functional_stateless_tests_(release,_wide_parts_enabled).html#fail1 --- src/Storages/System/StorageSystemStackTrace.cpp | 4 ++++ src/Storages/System/StorageSystemStackTrace.h | 6 ++++++ 2 files changed, 10 insertions(+) diff --git a/src/Storages/System/StorageSystemStackTrace.cpp b/src/Storages/System/StorageSystemStackTrace.cpp index abb2fdf54ed..e74d56108ad 100644 --- a/src/Storages/System/StorageSystemStackTrace.cpp +++ b/src/Storages/System/StorageSystemStackTrace.cpp @@ -16,6 +16,7 @@ #include #include #include +#include namespace DB @@ -150,6 +151,7 @@ namespace StorageSystemStackTrace::StorageSystemStackTrace(const StorageID & table_id_) : IStorageSystemOneBlock(table_id_) + , log(&Poco::Logger::get("StorageSystemStackTrace")) { notification_pipe.open(); @@ -229,6 +231,8 @@ void StorageSystemStackTrace::fillData(MutableColumns & res_columns, const Conte } else { + LOG_DEBUG(log, "Cannot obtain a stack trace for thread {}", tid); + /// Cannot obtain a stack trace. But create a record in result nevertheless. res_columns[0]->insert(tid); diff --git a/src/Storages/System/StorageSystemStackTrace.h b/src/Storages/System/StorageSystemStackTrace.h index a389f02eb09..582618d2ecd 100644 --- a/src/Storages/System/StorageSystemStackTrace.h +++ b/src/Storages/System/StorageSystemStackTrace.h @@ -6,6 +6,10 @@ #include #include +namespace Poco +{ +class Logger; +} namespace DB { @@ -30,6 +34,8 @@ protected: void fillData(MutableColumns & res_columns, const Context & context, const SelectQueryInfo & query_info) const override; mutable std::mutex mutex; + + Poco::Logger * log; }; } From 4390cb3d73f8672269fe030a709899ca119909a9 Mon Sep 17 00:00:00 2001 From: alexey-milovidov Date: Sat, 20 Feb 2021 09:49:02 +0300 Subject: [PATCH 252/381] Update config.xml --- programs/server/config.xml | 7 +++---- 1 file changed, 3 insertions(+), 4 deletions(-) diff --git a/programs/server/config.xml b/programs/server/config.xml index fe2a068787b..ba9b8b04b05 100644 --- a/programs/server/config.xml +++ b/programs/server/config.xml @@ -285,10 +285,9 @@ Cache is used when 'use_uncompressed_cache' user setting turned on (off by default). Uncompressed cache is advantageous only for very short queries and in rare cases. - Note: uncompressed cache is pointless for lz4, because memory bandwidth is slower than multi-core decompression. - Enabling it will only make queries slower. - If number of CPU cores is in order of 100 and memory bandwidth is in range of 100-200 GB/sec, - there is a chance it is also being pointless for zstd. + Note: uncompressed cache can be pointless for lz4, because memory bandwidth + is slower than multi-core decompression on some server configurations. + Enabling it can sometimes paradoxically make queries slower. 
--> 8589934592 From f820047cc841fa2b129e3f3d20ebcc0c28d1940c Mon Sep 17 00:00:00 2001 From: kssenii Date: Fri, 19 Feb 2021 15:48:48 +0000 Subject: [PATCH 253/381] Fix --- .../PostgreSQL/fetchPostgreSQLTableStructure.cpp | 7 +++++-- tests/integration/test_storage_postgresql/test.py | 8 ++++---- 2 files changed, 9 insertions(+), 6 deletions(-) diff --git a/src/Databases/PostgreSQL/fetchPostgreSQLTableStructure.cpp b/src/Databases/PostgreSQL/fetchPostgreSQLTableStructure.cpp index 15ce9a1baed..e065a497115 100644 --- a/src/Databases/PostgreSQL/fetchPostgreSQLTableStructure.cpp +++ b/src/Databases/PostgreSQL/fetchPostgreSQLTableStructure.cpp @@ -56,7 +56,7 @@ static DataTypePtr convertPostgreSQLDataType(std::string & type, bool is_nullabl { /// Numeric and decimal will both end up here as numeric. If it has type and precision, /// there will be Numeric(x, y), otherwise just Numeric - uint32_t precision, scale; + UInt32 precision, scale; if (type.ends_with(")")) { res = DataTypeFactory::instance().get(type); @@ -71,11 +71,14 @@ static DataTypePtr convertPostgreSQLDataType(std::string & type, bool is_nullabl res = std::make_shared>(precision, scale); else if (precision <= DecimalUtils::maxPrecision()) res = std::make_shared>(precision, scale); + else + throw Exception(ErrorCodes::BAD_ARGUMENTS, "Presicion {} and scale {} are too big and not supported", precision, scale); } else { precision = DecimalUtils::maxPrecision(); - res = std::make_shared>(precision, precision); + scale = precision >> 1; + res = std::make_shared>(precision, scale); } } diff --git a/tests/integration/test_storage_postgresql/test.py b/tests/integration/test_storage_postgresql/test.py index 03af32a4803..cee495438a2 100644 --- a/tests/integration/test_storage_postgresql/test.py +++ b/tests/integration/test_storage_postgresql/test.py @@ -63,13 +63,13 @@ def test_postgres_conversions(started_cluster): cursor.execute( '''CREATE TABLE IF NOT EXISTS test_types ( a smallint, b integer, c bigint, d real, e double precision, f serial, g bigserial, - h timestamp, i date, j decimal(5, 5), k numeric)''') + h timestamp, i date, j decimal(5, 3), k numeric)''') node1.query(''' INSERT INTO TABLE FUNCTION postgresql('postgres1:5432', 'clickhouse', 'test_types', 'postgres', 'mysecretpassword') VALUES - (-32768, -2147483648, -9223372036854775808, 1.12345, 1.1234567890, 2147483647, 9223372036854775807, '2000-05-12 12:12:12', '2000-05-12', 0.22222, 0.22222)''') + (-32768, -2147483648, -9223372036854775808, 1.12345, 1.1234567890, 2147483647, 9223372036854775807, '2000-05-12 12:12:12', '2000-05-12', 22.222, 22.222)''') result = node1.query(''' - SELECT a, b, c, d, e, f, g, h, i, j, toDecimal32(k, 5) FROM postgresql('postgres1:5432', 'clickhouse', 'test_types', 'postgres', 'mysecretpassword')''') - assert(result == '-32768\t-2147483648\t-9223372036854775808\t1.12345\t1.123456789\t2147483647\t9223372036854775807\t2000-05-12 12:12:12\t2000-05-12\t0.22222\t0.22222\n') + SELECT a, b, c, d, e, f, g, h, i, j, toDecimal128(k, 3) FROM postgresql('postgres1:5432', 'clickhouse', 'test_types', 'postgres', 'mysecretpassword')''') + assert(result == '-32768\t-2147483648\t-9223372036854775808\t1.12345\t1.123456789\t2147483647\t9223372036854775807\t2000-05-12 12:12:12\t2000-05-12\t22.222\t22.222\n') cursor.execute( '''CREATE TABLE IF NOT EXISTS test_array_dimensions From 5d36ceaaee50c1442dfef55a3d98c240ee2f7bd6 Mon Sep 17 00:00:00 2001 From: Azat Khuzhin Date: Sat, 20 Feb 2021 08:31:05 +0300 Subject: [PATCH 254/381] Fix WriteBufferFromHTTPServerResponse usage in 
odbc-bridge --- programs/odbc-bridge/ColumnInfoHandler.cpp | 10 ++++++++- .../odbc-bridge/IdentifierQuoteHandler.cpp | 10 ++++++++- programs/odbc-bridge/MainHandler.cpp | 22 +++++++++++++++++-- programs/odbc-bridge/SchemaAllowedHandler.cpp | 10 ++++++++- 4 files changed, 47 insertions(+), 5 deletions(-) diff --git a/programs/odbc-bridge/ColumnInfoHandler.cpp b/programs/odbc-bridge/ColumnInfoHandler.cpp index 5aef7f1ac38..14fa734f246 100644 --- a/programs/odbc-bridge/ColumnInfoHandler.cpp +++ b/programs/odbc-bridge/ColumnInfoHandler.cpp @@ -160,7 +160,15 @@ void ODBCColumnsInfoHandler::handleRequest(HTTPServerRequest & request, HTTPServ } WriteBufferFromHTTPServerResponse out(response, request.getMethod() == Poco::Net::HTTPRequest::HTTP_HEAD, keep_alive_timeout); - writeStringBinary(columns.toString(), out); + try + { + writeStringBinary(columns.toString(), out); + out.finalize(); + } + catch (...) + { + out.finalize(); + } } catch (...) { diff --git a/programs/odbc-bridge/IdentifierQuoteHandler.cpp b/programs/odbc-bridge/IdentifierQuoteHandler.cpp index ec4e4493d61..5060d37c479 100644 --- a/programs/odbc-bridge/IdentifierQuoteHandler.cpp +++ b/programs/odbc-bridge/IdentifierQuoteHandler.cpp @@ -50,7 +50,15 @@ void IdentifierQuoteHandler::handleRequest(HTTPServerRequest & request, HTTPServ auto identifier = getIdentifierQuote(hdbc); WriteBufferFromHTTPServerResponse out(response, request.getMethod() == Poco::Net::HTTPRequest::HTTP_HEAD, keep_alive_timeout); - writeStringBinary(identifier, out); + try + { + writeStringBinary(identifier, out); + out.finalize(); + } + catch (...) + { + out.finalize(); + } } catch (...) { diff --git a/programs/odbc-bridge/MainHandler.cpp b/programs/odbc-bridge/MainHandler.cpp index b9670397878..4fcc9deea6a 100644 --- a/programs/odbc-bridge/MainHandler.cpp +++ b/programs/odbc-bridge/MainHandler.cpp @@ -187,9 +187,27 @@ void ODBCHandler::handleRequest(HTTPServerRequest & request, HTTPServerResponse auto message = getCurrentExceptionMessage(true); response.setStatusAndReason( Poco::Net::HTTPResponse::HTTP_INTERNAL_SERVER_ERROR); // can't call process_error, because of too soon response sending - writeStringBinary(message, out); - tryLogCurrentException(log); + try + { + writeStringBinary(message, out); + out.finalize(); + } + catch (...) + { + tryLogCurrentException(log); + } + + tryLogCurrentException(log); + } + + try + { + out.finalize(); + } + catch (...) + { + tryLogCurrentException(log); } } diff --git a/programs/odbc-bridge/SchemaAllowedHandler.cpp b/programs/odbc-bridge/SchemaAllowedHandler.cpp index 48744b6d2ca..d4a70db61f4 100644 --- a/programs/odbc-bridge/SchemaAllowedHandler.cpp +++ b/programs/odbc-bridge/SchemaAllowedHandler.cpp @@ -61,7 +61,15 @@ void SchemaAllowedHandler::handleRequest(HTTPServerRequest & request, HTTPServer bool result = isSchemaAllowed(hdbc); WriteBufferFromHTTPServerResponse out(response, request.getMethod() == Poco::Net::HTTPRequest::HTTP_HEAD, keep_alive_timeout); - writeBoolText(result, out); + try + { + writeBoolText(result, out); + out.finalize(); + } + catch (...) + { + out.finalize(); + } } catch (...) { From 1ccb333ac50e1e62d9507e424c3daeee465e14f9 Mon Sep 17 00:00:00 2001 From: Azat Khuzhin Date: Sat, 20 Feb 2021 08:28:47 +0300 Subject: [PATCH 255/381] Fix WriteBufferFromHTTPServerResponse usage in other places (add missing finalize()) Since I saw the following: 0. DB::WriteBufferFromOStream::nextImpl() 1. DB::WriteBufferFromHTTPServerResponse::nextImpl() 2. DB::WriteBufferFromHTTPServerResponse::finalize() 3. 
DB::WriteBufferFromHTTPServerResponse::~WriteBufferFromHTTPServerResponse() 4. DB::StaticRequestHandler::handleRequest(Poco::Net::HTTPServerRequest&, Poco::Net::HTTPServerResponse&) 5. Poco::Net::HTTPServerConnection::run() 6. Poco::Net::TCPServerConnection::start() --- src/Server/InterserverIOHTTPHandler.cpp | 26 +++++++++++++++++++------ src/Server/PrometheusRequestHandler.cpp | 13 ++++++++++--- src/Server/StaticRequestHandler.cpp | 2 ++ 3 files changed, 32 insertions(+), 9 deletions(-) diff --git a/src/Server/InterserverIOHTTPHandler.cpp b/src/Server/InterserverIOHTTPHandler.cpp index 3296da94578..740072e8e9f 100644 --- a/src/Server/InterserverIOHTTPHandler.cpp +++ b/src/Server/InterserverIOHTTPHandler.cpp @@ -94,6 +94,23 @@ void InterserverIOHTTPHandler::handleRequest(HTTPServerRequest & request, HTTPSe used_output.out = std::make_shared( response, request.getMethod() == Poco::Net::HTTPRequest::HTTP_HEAD, keep_alive_timeout); + auto write_response = [&](const std::string & message) + { + if (response.sent()) + return; + + auto & out = *used_output.out; + try + { + writeString(message, out); + out.finalize(); + } + catch (...) + { + out.finalize(); + } + }; + try { if (auto [message, success] = checkAuthentication(request); success) @@ -104,8 +121,7 @@ void InterserverIOHTTPHandler::handleRequest(HTTPServerRequest & request, HTTPSe else { response.setStatusAndReason(HTTPServerResponse::HTTP_UNAUTHORIZED); - if (!response.sent()) - writeString(message, *used_output.out); + write_response(message); LOG_WARNING(log, "Query processing failed request: '{}' authentication failed", request.getURI()); } } @@ -120,8 +136,7 @@ void InterserverIOHTTPHandler::handleRequest(HTTPServerRequest & request, HTTPSe bool is_real_error = e.code() != ErrorCodes::ABORTED; std::string message = getCurrentExceptionMessage(is_real_error); - if (!response.sent()) - writeString(message, *used_output.out); + write_response(message); if (is_real_error) LOG_ERROR(log, message); @@ -132,8 +147,7 @@ void InterserverIOHTTPHandler::handleRequest(HTTPServerRequest & request, HTTPSe { response.setStatusAndReason(Poco::Net::HTTPResponse::HTTP_INTERNAL_SERVER_ERROR); std::string message = getCurrentExceptionMessage(false); - if (!response.sent()) - writeString(message, *used_output.out); + write_response(message); LOG_ERROR(log, message); } diff --git a/src/Server/PrometheusRequestHandler.cpp b/src/Server/PrometheusRequestHandler.cpp index 83cb8e85a9e..bf78a37166a 100644 --- a/src/Server/PrometheusRequestHandler.cpp +++ b/src/Server/PrometheusRequestHandler.cpp @@ -24,9 +24,16 @@ void PrometheusRequestHandler::handleRequest(HTTPServerRequest & request, HTTPSe response.setContentType("text/plain; version=0.0.4; charset=UTF-8"); - auto wb = WriteBufferFromHTTPServerResponse(response, request.getMethod() == Poco::Net::HTTPRequest::HTTP_HEAD, keep_alive_timeout); - metrics_writer.write(wb); - wb.finalize(); + WriteBufferFromHTTPServerResponse wb(response, request.getMethod() == Poco::Net::HTTPRequest::HTTP_HEAD, keep_alive_timeout); + try + { + metrics_writer.write(wb); + wb.finalize(); + } + catch (...) + { + wb.finalize(); + } } catch (...) 
{ diff --git a/src/Server/StaticRequestHandler.cpp b/src/Server/StaticRequestHandler.cpp index f3f564c1cf8..9f959239be9 100644 --- a/src/Server/StaticRequestHandler.cpp +++ b/src/Server/StaticRequestHandler.cpp @@ -126,6 +126,8 @@ void StaticRequestHandler::handleRequest(HTTPServerRequest & request, HTTPServer std::string exception_message = getCurrentExceptionMessage(false, true); trySendExceptionToClient(exception_message, exception_code, request, response, *out); } + + out->finalize(); } void StaticRequestHandler::writeResponse(WriteBuffer & out) From 2ab37d025a62f650d4b90f5fafa23f4076ab3844 Mon Sep 17 00:00:00 2001 From: Amos Bird Date: Sat, 20 Feb 2021 16:14:38 +0800 Subject: [PATCH 256/381] Skip non-parallel tests --- tests/queries/skip_list.json | 8 +++++--- 1 file changed, 5 insertions(+), 3 deletions(-) diff --git a/tests/queries/skip_list.json b/tests/queries/skip_list.json index fdb845b7e72..1164d7b0004 100644 --- a/tests/queries/skip_list.json +++ b/tests/queries/skip_list.json @@ -364,6 +364,7 @@ "00626_replace_partition_from_table", "00626_replace_partition_from_table_zookeeper", "00633_materialized_view_and_too_many_parts_zookeeper", + "00643_cast_zookeeper", "00652_mergetree_mutations", "00652_replicated_mutations_zookeeper", "00682_empty_parts_merge", @@ -577,10 +578,11 @@ "01602_show_create_view", "01603_rename_overwrite_bug", "01646_system_restart_replicas_smoke", // system restart replicas is a global query - "01676_dictget_in_default_expression", - "01715_background_checker_blather_zookeeper", - "01700_system_zookeeper_path_in", + "01656_test_query_log_factories_info", "01669_columns_declaration_serde", + "01676_dictget_in_default_expression", + "01700_system_zookeeper_path_in", + "01715_background_checker_blather_zookeeper", "attach", "ddl_dictionaries", "dictionary", From d947dbc185beee7a78bf73ba2aceeb81e664e013 Mon Sep 17 00:00:00 2001 From: Alexey Milovidov Date: Sat, 20 Feb 2021 11:44:35 +0300 Subject: [PATCH 257/381] Add test to skip list --- tests/queries/0_stateless/arcadia_skip_list.txt | 1 + tests/queries/skip_list.json | 9 ++++++--- 2 files changed, 7 insertions(+), 3 deletions(-) diff --git a/tests/queries/0_stateless/arcadia_skip_list.txt b/tests/queries/0_stateless/arcadia_skip_list.txt index 5466fb4bfb8..4e523545938 100644 --- a/tests/queries/0_stateless/arcadia_skip_list.txt +++ b/tests/queries/0_stateless/arcadia_skip_list.txt @@ -206,3 +206,4 @@ 01683_dist_INSERT_block_structure_mismatch 01702_bitmap_native_integers 01686_event_time_microseconds_part_log +01017_uniqCombined_memory_usage diff --git a/tests/queries/skip_list.json b/tests/queries/skip_list.json index fdb845b7e72..70963190125 100644 --- a/tests/queries/skip_list.json +++ b/tests/queries/skip_list.json @@ -17,7 +17,8 @@ "functions_bad_arguments", /// Too long for TSan "01603_read_with_backoff_bug", /// Too long for TSan "01646_system_restart_replicas_smoke", /// RESTART REPLICAS can acquire too much locks, while only 64 is possible from one thread under TSan - "01641_memory_tracking_insert_optimize" /// INSERT lots of rows is too heavy for TSan + "01641_memory_tracking_insert_optimize", /// INSERT lots of rows is too heavy for TSan + "01017_uniqCombined_memory_usage" /// Fine thresholds on memory usage ], "address-sanitizer": [ "00877", @@ -27,7 +28,8 @@ "01103_check_cpu_instructions_at_startup", "01473_event_time_microseconds", "01526_max_untracked_memory", /// requires TraceCollector, does not available under sanitizers - "01193_metadata_loading" + "01193_metadata_loading", + 
"01017_uniqCombined_memory_usage" /// Fine thresholds on memory usage ], "ub-sanitizer": [ "capnproto", @@ -48,7 +50,8 @@ "00877_memory_limit_for_new_delete", /// memory limits don't work correctly under msan because it replaces malloc/free "01473_event_time_microseconds", "01526_max_untracked_memory", /// requires TraceCollector, does not available under sanitizers - "01193_metadata_loading" + "01193_metadata_loading", + "01017_uniqCombined_memory_usage" /// Fine thresholds on memory usage ], "debug-build": [ "query_profiler", From f37631830f8139a68c42111c11584956f992630a Mon Sep 17 00:00:00 2001 From: Amos Bird Date: Sat, 20 Feb 2021 16:45:25 +0800 Subject: [PATCH 258/381] Comments --- src/Interpreters/FunctionNameNormalizer.cpp | 4 ++++ src/Interpreters/ya.make | 1 + src/Server/TCPHandler.cpp | 4 +++- 3 files changed, 8 insertions(+), 1 deletion(-) diff --git a/src/Interpreters/FunctionNameNormalizer.cpp b/src/Interpreters/FunctionNameNormalizer.cpp index 36ccc9340ea..255f4d8c6bb 100644 --- a/src/Interpreters/FunctionNameNormalizer.cpp +++ b/src/Interpreters/FunctionNameNormalizer.cpp @@ -14,6 +14,8 @@ void FunctionNameNormalizer::visit(IAST * ast) if (!ast) return; + // Normalize only selected children. Avoid normalizing engine clause because some engine might + // have the same name as function, e.g. Log. if (auto * node_storage = ast->as()) { visit(node_storage->partition_by); @@ -24,6 +26,8 @@ void FunctionNameNormalizer::visit(IAST * ast) return; } + // Normalize only selected children. Avoid normalizing type clause because some type might + // have the same name as function, e.g. Date. if (auto * node_decl = ast->as()) { visit(node_decl->default_expression.get()); diff --git a/src/Interpreters/ya.make b/src/Interpreters/ya.make index cd4980927e4..e7882ec8d98 100644 --- a/src/Interpreters/ya.make +++ b/src/Interpreters/ya.make @@ -58,6 +58,7 @@ SRCS( ExternalModelsLoader.cpp ExtractExpressionInfoVisitor.cpp FillingRow.cpp + FunctionNameNormalizer.cpp HashJoin.cpp IExternalLoadable.cpp IInterpreter.cpp diff --git a/src/Server/TCPHandler.cpp b/src/Server/TCPHandler.cpp index 9794a86d3e3..d2ce2a409a9 100644 --- a/src/Server/TCPHandler.cpp +++ b/src/Server/TCPHandler.cpp @@ -1133,7 +1133,9 @@ void TCPHandler::receiveQuery() } query_context->applySettingsChanges(settings_changes); - /// Disable function name normalization it's a secondary query. + /// Disable function name normalization when it's a secondary query, because queries are either + /// already normalized on initiator node, or not normalized and should remain unnormalized for + /// compatibility. 
if (client_info.query_kind == ClientInfo::QueryKind::SECONDARY_QUERY) { query_context->setSetting("normalize_function_names", Field(0)); From a38a31c954aa03251767f769f8c6b5584165b2dd Mon Sep 17 00:00:00 2001 From: kssenii Date: Sat, 20 Feb 2021 09:58:24 +0000 Subject: [PATCH 259/381] Fix typos check --- src/Databases/PostgreSQL/fetchPostgreSQLTableStructure.cpp | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/Databases/PostgreSQL/fetchPostgreSQLTableStructure.cpp b/src/Databases/PostgreSQL/fetchPostgreSQLTableStructure.cpp index e065a497115..d3a42ead3f6 100644 --- a/src/Databases/PostgreSQL/fetchPostgreSQLTableStructure.cpp +++ b/src/Databases/PostgreSQL/fetchPostgreSQLTableStructure.cpp @@ -72,7 +72,7 @@ static DataTypePtr convertPostgreSQLDataType(std::string & type, bool is_nullabl else if (precision <= DecimalUtils::maxPrecision()) res = std::make_shared>(precision, scale); else - throw Exception(ErrorCodes::BAD_ARGUMENTS, "Presicion {} and scale {} are too big and not supported", precision, scale); + throw Exception(ErrorCodes::BAD_ARGUMENTS, "Precision {} and scale {} are too big and not supported", precision, scale); } else { From 89dd15a91df89a3975e68ad3f6d4651f517e33ba Mon Sep 17 00:00:00 2001 From: alesapin Date: Sat, 20 Feb 2021 14:04:38 +0300 Subject: [PATCH 260/381] Checksum for header and logging, better names --- src/Coordination/Changelog.cpp | 239 +++++++++++++------------- src/Coordination/Changelog.h | 61 ++++--- src/Coordination/NuKeeperLogStore.cpp | 3 +- src/Coordination/NuKeeperLogStore.h | 2 + 4 files changed, 168 insertions(+), 137 deletions(-) diff --git a/src/Coordination/Changelog.cpp b/src/Coordination/Changelog.cpp index 4a3955e23ab..3d3c1ad230d 100644 --- a/src/Coordination/Changelog.cpp +++ b/src/Coordination/Changelog.cpp @@ -20,22 +20,6 @@ namespace ErrorCodes extern const int LOGICAL_ERROR; } -std::string toString(const ChangelogVersion & version) -{ - if (version == ChangelogVersion::V0) - return "V0"; - - throw Exception(ErrorCodes::UNKNOWN_FORMAT_VERSION, "Unknown chagelog version {}", static_cast(version)); -} - -ChangelogVersion fromString(const std::string & version_str) -{ - if (version_str == "V0") - return ChangelogVersion::V0; - - throw Exception(ErrorCodes::UNKNOWN_FORMAT_VERSION, "Unknown chagelog version {}", version_str); -} - namespace { @@ -44,11 +28,10 @@ constexpr auto DEFAULT_PREFIX = "changelog"; std::string formatChangelogPath(const std::string & prefix, const ChangelogFileDescription & name) { std::filesystem::path path(prefix); - path /= std::filesystem::path(name.prefix + "_" + std::to_string(name.from_log_idx) + "_" + std::to_string(name.to_log_idx) + ".bin"); + path /= std::filesystem::path(name.prefix + "_" + std::to_string(name.from_log_index) + "_" + std::to_string(name.to_log_index) + ".bin"); return path; } - ChangelogFileDescription getChangelogFileDescription(const std::string & path_str) { std::filesystem::path path(path_str); @@ -60,8 +43,8 @@ ChangelogFileDescription getChangelogFileDescription(const std::string & path_st ChangelogFileDescription result; result.prefix = filename_parts[0]; - result.from_log_idx = parse(filename_parts[1]); - result.to_log_idx = parse(filename_parts[2]); + result.from_log_index = parse(filename_parts[1]); + result.to_log_index = parse(filename_parts[2]); result.path = path_str; return result; } @@ -71,6 +54,17 @@ LogEntryPtr makeClone(const LogEntryPtr & entry) return cs_new(entry->get_term(), nuraft::buffer::clone(entry->get_buf()), entry->get_val_type()); } +Checksum 
computeRecordChecksum(const ChangelogRecord & record) +{ + const auto * header_start = reinterpret_cast(&record.header); + auto sum = CityHash_v1_0_2::CityHash128(header_start, sizeof(record.header)); + + if (record.header.blob_size != 0) + sum = CityHash_v1_0_2::CityHash128WithSeed(reinterpret_cast(record.blob->data_begin()), record.header.blob_size, sum); + + return sum; +} + } class ChangelogWriter @@ -86,12 +80,9 @@ public: off_t appendRecord(ChangelogRecord && record, bool sync) { off_t result = plain_buf.count(); - writeIntBinary(record.header.version, plain_buf); - writeIntBinary(record.header.index, plain_buf); - writeIntBinary(record.header.term, plain_buf); - writeIntBinary(record.header.value_type, plain_buf); - writeIntBinary(record.header.blob_size, plain_buf); - writeIntBinary(record.header.blob_checksum, plain_buf); + writeIntBinary(computeRecordChecksum(record), plain_buf); + + writePODBinary(record.header, plain_buf); if (record.header.blob_size != 0) plain_buf.write(reinterpret_cast(record.blob->data_begin()), record.blob->size()); @@ -157,7 +148,7 @@ public: , read_buf(filepath) {} - ChangelogReadResult readChangelog(IndexToLogEntry & logs, size_t start_log_idx, IndexToOffset & index_to_offset) + ChangelogReadResult readChangelog(IndexToLogEntry & logs, size_t start_log_index, IndexToOffset & index_to_offset, Poco::Logger * log) { size_t previous_index = 0; ChangelogReadResult result{}; @@ -166,24 +157,31 @@ public: while (!read_buf.eof()) { result.last_position = read_buf.count(); + Checksum record_checksum; + readIntBinary(record_checksum, read_buf); + ChangelogRecord record; - readIntBinary(record.header.version, read_buf); - readIntBinary(record.header.index, read_buf); - readIntBinary(record.header.term, read_buf); - readIntBinary(record.header.value_type, read_buf); - readIntBinary(record.header.blob_size, read_buf); - readIntBinary(record.header.blob_checksum, read_buf); - auto buffer = nuraft::buffer::alloc(record.header.blob_size); - auto * buffer_begin = reinterpret_cast(buffer->data_begin()); - read_buf.readStrict(buffer_begin, record.header.blob_size); + readPODBinary(record.header, read_buf); + if (record.header.version > CURRENT_CHANGELOG_VERSION) + throw Exception(ErrorCodes::UNKNOWN_FORMAT_VERSION, "Unsupported changelog version {} on path {}", record.header.version, filepath); + + if (record.header.blob_size != 0) + { + auto buffer = nuraft::buffer::alloc(record.header.blob_size); + auto * buffer_begin = reinterpret_cast(buffer->data_begin()); + read_buf.readStrict(buffer_begin, record.header.blob_size); + record.blob = buffer; + } + else + record.blob = nullptr; if (previous_index != 0 && previous_index + 1 != record.header.index) throw Exception(ErrorCodes::CORRUPTED_DATA, "Previous log entry {}, next log entry {}, seems like some entries skipped", previous_index, record.header.index); previous_index = record.header.index; - Checksum checksum = CityHash_v1_0_2::CityHash128(buffer_begin, record.header.blob_size); - if (checksum != record.header.blob_checksum) + Checksum checksum = computeRecordChecksum(record); + if (checksum != record_checksum) { throw Exception(ErrorCodes::CHECKSUM_DOESNT_MATCH, "Checksums doesn't match for log {} (version {}), index {}, blob_size {}", @@ -195,10 +193,10 @@ public: result.entries_read += 1; - if (record.header.index < start_log_idx) + if (record.header.index < start_log_index) continue; - auto log_entry = nuraft::cs_new(record.header.term, buffer, record.header.value_type); + auto log_entry = 
nuraft::cs_new(record.header.term, record.blob, record.header.value_type); logs.emplace(record.header.index, log_entry); index_to_offset[record.header.index] = result.last_position; @@ -206,13 +204,16 @@ public: } catch (const Exception & ex) { + if (ex.code() == ErrorCodes::UNKNOWN_FORMAT_VERSION) + throw ex; + result.error = true; - LOG_WARNING(&Poco::Logger::get("RaftChangelog"), "Cannot completely read changelog on path {}, error: {}", filepath, ex.message()); + LOG_WARNING(log, "Cannot completely read changelog on path {}, error: {}", filepath, ex.message()); } catch (...) { result.error = true; - tryLogCurrentException(&Poco::Logger::get("RaftChangelog")); + tryLogCurrentException(log); } return result; @@ -223,9 +224,10 @@ private: ReadBufferFromFile read_buf; }; -Changelog::Changelog(const std::string & changelogs_dir_, size_t rotate_interval_) +Changelog::Changelog(const std::string & changelogs_dir_, size_t rotate_interval_, Poco::Logger * log_) : changelogs_dir(changelogs_dir_) , rotate_interval(rotate_interval_) + , log(log_) { namespace fs = std::filesystem; if (!fs::exists(changelogs_dir)) @@ -234,96 +236,104 @@ Changelog::Changelog(const std::string & changelogs_dir_, size_t rotate_interval for (const auto & p : fs::directory_iterator(changelogs_dir)) { auto file_description = getChangelogFileDescription(p.path()); - existing_changelogs[file_description.from_log_idx] = file_description; + existing_changelogs[file_description.from_log_index] = file_description; } } -void Changelog::readChangelogAndInitWriter(size_t from_log_idx) +void Changelog::readChangelogAndInitWriter(size_t from_log_index) { - start_index = from_log_idx == 0 ? 1 : from_log_idx; + start_index = from_log_index == 0 ? 1 : from_log_index; size_t total_read = 0; size_t entries_in_last = 0; - size_t incomplete_log_idx = 0; + size_t incomplete_log_index = 0; ChangelogReadResult result{}; - for (const auto & [start_idx, changelog_description] : existing_changelogs) - { - entries_in_last = changelog_description.to_log_idx - changelog_description.from_log_idx + 1; - if (changelog_description.to_log_idx >= from_log_idx) + for (const auto & [start_index, changelog_description] : existing_changelogs) + { + entries_in_last = changelog_description.to_log_index - changelog_description.from_log_index + 1; + + if (changelog_description.to_log_index >= from_log_index) { ChangelogReader reader(changelog_description.path); - result = reader.readChangelog(logs, from_log_idx, index_to_start_pos); + result = reader.readChangelog(logs, from_log_index, index_to_start_pos, log); total_read += result.entries_read; - /// May happen after truncate and crash + /// May happen after truncate, crash or simply unfinished log if (result.entries_read < entries_in_last) { - incomplete_log_idx = start_idx; + incomplete_log_index = start_index; break; } } } - if (incomplete_log_idx != 0) + if (incomplete_log_index != 0) { - for (auto itr = existing_changelogs.upper_bound(incomplete_log_idx); itr != existing_changelogs.end();) + /// All subsequent logs shouldn't exist. But they may exist if we crashed after writeAt started. Remove them. 
+ for (auto itr = existing_changelogs.upper_bound(incomplete_log_index); itr != existing_changelogs.end();) { + LOG_WARNING(log, "Removing changelog {}, beacuse it's goes after broken changelog entry", itr->second.path); std::filesystem::remove(itr->second.path); itr = existing_changelogs.erase(itr); } + + /// Continue to write into existing log + if (!existing_changelogs.empty()) + { + auto description = existing_changelogs.rbegin()->second; + LOG_TRACE(log, "Continue to write into {}", description.path); + current_writer = std::make_unique(description.path, WriteMode::Append, description.from_log_index); + current_writer->setEntriesWritten(result.entries_read); + + /// Truncate all broken entries from log + if (result.error) + { + LOG_WARNING(log, "Read finished with error, truncating all broken log entries"); + current_writer->truncateToLength(result.last_position); + } + } } - if (!existing_changelogs.empty() && result.entries_read < entries_in_last) - { - auto description = existing_changelogs.rbegin()->second; - current_writer = std::make_unique(description.path, WriteMode::Append, description.from_log_idx); - current_writer->setEntriesWritten(result.entries_read); - if (result.error) - current_writer->truncateToLength(result.last_position); - } - else - { + /// Start new log if we don't initialize writer from previous log + if (!current_writer) rotate(start_index + total_read); - } } -void Changelog::rotate(size_t new_start_log_idx) +void Changelog::rotate(size_t new_start_log_index) { + //// doesn't exist on init if (current_writer) current_writer->flush(); ChangelogFileDescription new_description; new_description.prefix = DEFAULT_PREFIX; - new_description.from_log_idx = new_start_log_idx; - new_description.to_log_idx = new_start_log_idx + rotate_interval - 1; + new_description.from_log_index = new_start_log_index; + new_description.to_log_index = new_start_log_index + rotate_interval - 1; new_description.path = formatChangelogPath(changelogs_dir, new_description); - existing_changelogs[new_start_log_idx] = new_description; - current_writer = std::make_unique(new_description.path, WriteMode::Rewrite, new_start_log_idx); + + LOG_TRACE(log, "Starting new changelog {}", new_description.path); + existing_changelogs[new_start_log_index] = new_description; + current_writer = std::make_unique(new_description.path, WriteMode::Rewrite, new_start_log_index); } -ChangelogRecord Changelog::buildRecord(size_t index, nuraft::ptr log_entry) +ChangelogRecord Changelog::buildRecord(size_t index, const LogEntryPtr & log_entry) { ChangelogRecordHeader header; + header.version = ChangelogVersion::V0; header.index = index; header.term = log_entry->get_term(); header.value_type = log_entry->get_val_type(); auto buffer = log_entry->get_buf_ptr(); if (buffer) - { header.blob_size = buffer->size(); - header.blob_checksum = CityHash_v1_0_2::CityHash128(reinterpret_cast(buffer->data_begin()), buffer->size()); - } else - { header.blob_size = 0; - header.blob_checksum = std::make_pair(0, 0); - } return ChangelogRecord{header, buffer}; } -void Changelog::appendEntry(size_t index, nuraft::ptr log_entry, bool force_sync) +void Changelog::appendEntry(size_t index, const LogEntryPtr & log_entry, bool force_sync) { if (!current_writer) throw Exception(ErrorCodes::LOGICAL_ERROR, "Changelog must be initialized before appending records"); @@ -341,13 +351,13 @@ void Changelog::appendEntry(size_t index, nuraft::ptr log_ent logs[index] = makeClone(log_entry); } -void Changelog::writeAt(size_t index, nuraft::ptr log_entry, 
bool force_sync) +void Changelog::writeAt(size_t index, const LogEntryPtr & log_entry, bool force_sync) { if (index_to_start_pos.count(index) == 0) throw Exception(ErrorCodes::LOGICAL_ERROR, "Cannot write at index {} because changelog doesn't contain it", index); - bool need_rollback = index < current_writer->getStartIndex(); - if (need_rollback) + bool go_to_previous_file = index < current_writer->getStartIndex(); + if (go_to_previous_file) { auto index_changelog = existing_changelogs.lower_bound(index); ChangelogFileDescription description; @@ -357,14 +367,15 @@ void Changelog::writeAt(size_t index, nuraft::ptr log_entry, description = std::prev(index_changelog)->second; current_writer = std::make_unique(description.path, WriteMode::Append, index_changelog->first); - current_writer->setEntriesWritten(description.to_log_idx - description.from_log_idx + 1); + current_writer->setEntriesWritten(description.to_log_index - description.from_log_index + 1); } auto entries_written = current_writer->getEntriesWritten(); current_writer->truncateToLength(index_to_start_pos[index]); - if (need_rollback) + if (go_to_previous_file) { + /// Remove all subsequent files auto to_remove_itr = existing_changelogs.upper_bound(index); for (auto itr = to_remove_itr; itr != existing_changelogs.end();) { @@ -373,11 +384,14 @@ void Changelog::writeAt(size_t index, nuraft::ptr log_entry, } } - /// Rollback in memory state - for (auto itr = logs.lower_bound(index); itr != logs.end();) + /// Remove redundant logs from memory + for (size_t i = index; ; ++i) { - index_to_start_pos.erase(itr->first); - itr = logs.erase(itr); + auto log_itr = logs.find(i); + if (log_itr == logs.end()) + break; + logs.erase(log_itr); + index_to_start_pos.erase(i); entries_written--; } @@ -386,37 +400,32 @@ void Changelog::writeAt(size_t index, nuraft::ptr log_entry, appendEntry(index, log_entry, force_sync); } -void Changelog::compact(size_t up_to_log_idx) +void Changelog::compact(size_t up_to_log_index) { for (auto itr = existing_changelogs.begin(); itr != existing_changelogs.end();) { - if (itr->second.to_log_idx <= up_to_log_idx) + /// Remove all completely outdated changelog files + if (itr->second.to_log_index <= up_to_log_index) { - for (size_t idx = itr->second.from_log_idx; idx <= itr->second.to_log_idx; ++idx) - { - auto index_pos = index_to_start_pos.find(idx); - if (index_pos == index_to_start_pos.end()) - break; - index_to_start_pos.erase(index_pos); - } + + LOG_INFO(log, "Removing changelog {} because of compaction", itr->second.path); + std::erase_if(index_to_start_pos, [right_index = itr->second.to_log_index] (const auto & item) { return item.first <= right_index; }); std::filesystem::remove(itr->second.path); itr = existing_changelogs.erase(itr); } - else + else /// Files are ordered, so all subsequent should exist break; } - auto start = logs.begin(); - auto end = logs.upper_bound(up_to_log_idx); - logs.erase(start, end); - start_index = up_to_log_idx + 1; + start_index = up_to_log_index + 1; + std::erase_if(logs, [up_to_log_index] (const auto & item) { return item.first <= up_to_log_index; }); } LogEntryPtr Changelog::getLastEntry() const { static LogEntryPtr fake_entry = nuraft::cs_new(0, nuraft::buffer::alloc(sizeof(size_t))); - size_t next_idx = getNextEntryIndex() - 1; - auto entry = logs.find(next_idx); + size_t next_index = getNextEntryIndex() - 1; + auto entry = logs.find(next_index); if (entry == logs.end()) return fake_entry; @@ -437,10 +446,10 @@ LogEntriesPtr Changelog::getLogEntriesBetween(size_t start, 
size_t end) return ret; } -LogEntryPtr Changelog::entryAt(size_t idx) +LogEntryPtr Changelog::entryAt(size_t index) { nuraft::ptr src = nullptr; - auto entry = logs.find(idx); + auto entry = logs.find(index); if (entry == logs.end()) return nullptr; @@ -448,12 +457,12 @@ LogEntryPtr Changelog::entryAt(size_t idx) return makeClone(src); } -nuraft::ptr Changelog::serializeEntriesToBuffer(size_t index, int32_t cnt) +nuraft::ptr Changelog::serializeEntriesToBuffer(size_t index, int32_t count) { std::vector> returned_logs; size_t size_total = 0; - for (size_t i = index; i < index + cnt; ++i) + for (size_t i = index; i < index + count; ++i) { auto entry = logs.find(i); if (entry == logs.end()) @@ -464,9 +473,9 @@ nuraft::ptr Changelog::serializeEntriesToBuffer(size_t index, in returned_logs.push_back(buf); } - nuraft::ptr buf_out = nuraft::buffer::alloc(sizeof(int32_t) + cnt * sizeof(int32_t) + size_total); + nuraft::ptr buf_out = nuraft::buffer::alloc(sizeof(int32_t) + count * sizeof(int32_t) + size_total); buf_out->pos(0); - buf_out->put(static_cast(cnt)); + buf_out->put(static_cast(count)); for (auto & entry : returned_logs) { @@ -484,17 +493,17 @@ void Changelog::applyEntriesFromBuffer(size_t index, nuraft::buffer & buffer, bo for (int i = 0; i < num_logs; ++i) { - size_t cur_idx = index + i; + size_t cur_index = index + i; int buf_size = buffer.get_int(); nuraft::ptr buf_local = nuraft::buffer::alloc(buf_size); buffer.get(buf_local); LogEntryPtr log_entry = nuraft::log_entry::deserialize(*buf_local); - if (i == 0 && logs.count(cur_idx)) - writeAt(cur_idx, log_entry, force_sync); + if (i == 0 && logs.count(cur_index)) + writeAt(cur_index, log_entry, force_sync); else - appendEntry(cur_idx, log_entry, force_sync); + appendEntry(cur_index, log_entry, force_sync); } } diff --git a/src/Coordination/Changelog.h b/src/Coordination/Changelog.h index 779d057d285..38679d604de 100644 --- a/src/Coordination/Changelog.h +++ b/src/Coordination/Changelog.h @@ -15,60 +15,68 @@ using Checksum = CityHash_v1_0_2::uint128; using LogEntryPtr = nuraft::ptr; using LogEntries = std::vector; using LogEntriesPtr = nuraft::ptr; +using BufferPtr = nuraft::ptr; using IndexToOffset = std::unordered_map; -using IndexToLogEntry = std::map; +using IndexToLogEntry = std::unordered_map; enum class ChangelogVersion : uint8_t { V0 = 0, }; -std::string toString(const ChangelogVersion & version); -ChangelogVersion fromString(const std::string & version_str); - static constexpr auto CURRENT_CHANGELOG_VERSION = ChangelogVersion::V0; -struct ChangelogRecordHeader +struct __attribute__((__packed__)) ChangelogRecordHeader { ChangelogVersion version = CURRENT_CHANGELOG_VERSION; - size_t index; + size_t index; /// entry log number size_t term; nuraft::log_val_type value_type; size_t blob_size; - Checksum blob_checksum; }; +/// Changelog record on disk struct ChangelogRecord { ChangelogRecordHeader header; nuraft::ptr blob; }; +/// changelog_fromindex_toindex.bin +/// [fromindex, toindex] <- inclusive struct ChangelogFileDescription { std::string prefix; - size_t from_log_idx; - size_t to_log_idx; + size_t from_log_index; + size_t to_log_index; std::string path; }; class ChangelogWriter; +/// Simpliest changelog with files rotation. +/// No compression, no metadata, just entries with headers one by one +/// Able to read broken files/entries and discard them. 
class Changelog { public: - Changelog(const std::string & changelogs_dir_, size_t rotate_interval_); + Changelog(const std::string & changelogs_dir_, size_t rotate_interval_, Poco::Logger * log_); - void readChangelogAndInitWriter(size_t from_log_idx); + /// Read changelog from files on changelogs_dir_ skipping all entries before from_log_index + /// Truncate broken entries, remove files after broken entries. + void readChangelogAndInitWriter(size_t from_log_index); - void appendEntry(size_t index, LogEntryPtr log_entry, bool force_sync); + /// Add entry to log with index. Call fsync if force_sync true. + void appendEntry(size_t index, const LogEntryPtr & log_entry, bool force_sync); - void writeAt(size_t index, LogEntryPtr log_entry, bool force_sync); + /// Write entry at index and truncate all subsequent entries. + void writeAt(size_t index, const LogEntryPtr & log_entry, bool force_sync); - void compact(size_t up_to_log_idx); + /// Remove log files with to_log_index <= up_to_log_index. + void compact(size_t up_to_log_index); size_t getNextEntryIndex() const { @@ -80,16 +88,22 @@ public: return start_index; } + /// Last entry in log, or fake entry with term 0 if log is empty LogEntryPtr getLastEntry() const; - LogEntriesPtr getLogEntriesBetween(size_t start_index, size_t end_idx); + /// Return log entries between [start, end) + LogEntriesPtr getLogEntriesBetween(size_t start_index, size_t end_index); - LogEntryPtr entryAt(size_t idx); + /// Return entry at position index + LogEntryPtr entryAt(size_t index); - nuraft::ptr serializeEntriesToBuffer(size_t index, int32_t cnt); + /// Serialize entries from index into buffer + BufferPtr serializeEntriesToBuffer(size_t index, int32_t count); + /// Apply entries from buffer overriding existing entries void applyEntriesFromBuffer(size_t index, nuraft::buffer & buffer, bool force_sync); + /// Fsync log to disk void flush(); size_t size() const @@ -97,20 +111,25 @@ public: return logs.size(); } + /// Fsync log to disk ~Changelog(); private: - void rotate(size_t new_start_log_idx); + /// Starts new file [new_start_log_index, new_start_log_index + rotate_interval] + void rotate(size_t new_start_log_index); - static ChangelogRecord buildRecord(size_t index, nuraft::ptr log_entry); + /// Pack log_entry into changelog record + static ChangelogRecord buildRecord(size_t index, const LogEntryPtr & log_entry); private: - std::string changelogs_dir; + const std::string changelogs_dir; + const size_t rotate_interval; + Poco::Logger * log; + std::map existing_changelogs; std::unique_ptr current_writer; IndexToOffset index_to_start_pos; - const size_t rotate_interval; IndexToLogEntry logs; size_t start_index = 0; }; diff --git a/src/Coordination/NuKeeperLogStore.cpp b/src/Coordination/NuKeeperLogStore.cpp index 8834bdc4d69..6aba078bb80 100644 --- a/src/Coordination/NuKeeperLogStore.cpp +++ b/src/Coordination/NuKeeperLogStore.cpp @@ -4,7 +4,8 @@ namespace DB { NuKeeperLogStore::NuKeeperLogStore(const std::string & changelogs_path, size_t rotate_interval_, bool force_sync_) - : changelog(changelogs_path, rotate_interval_) + : log(&Poco::Logger::get("NuKeeperLogStore")) + , changelog(changelogs_path, rotate_interval_, log) , force_sync(force_sync_) { } diff --git a/src/Coordination/NuKeeperLogStore.h b/src/Coordination/NuKeeperLogStore.h index 0ff92220316..a94b662fda4 100644 --- a/src/Coordination/NuKeeperLogStore.h +++ b/src/Coordination/NuKeeperLogStore.h @@ -4,6 +4,7 @@ #include #include #include +#include namespace DB { @@ -43,6 +44,7 @@ public: private: 
mutable std::mutex changelog_lock; + Poco::Logger * log; Changelog changelog; bool force_sync; }; From e7f792c94d2835676f82fd7942f6f8a591fe7e4d Mon Sep 17 00:00:00 2001 From: alesapin Date: Sat, 20 Feb 2021 14:28:39 +0300 Subject: [PATCH 261/381] Fix typos --- src/Coordination/Changelog.cpp | 2 +- src/Coordination/Changelog.h | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/src/Coordination/Changelog.cpp b/src/Coordination/Changelog.cpp index 3d3c1ad230d..efb0f2798e2 100644 --- a/src/Coordination/Changelog.cpp +++ b/src/Coordination/Changelog.cpp @@ -272,7 +272,7 @@ void Changelog::readChangelogAndInitWriter(size_t from_log_index) /// All subsequent logs shouldn't exist. But they may exist if we crashed after writeAt started. Remove them. for (auto itr = existing_changelogs.upper_bound(incomplete_log_index); itr != existing_changelogs.end();) { - LOG_WARNING(log, "Removing changelog {}, beacuse it's goes after broken changelog entry", itr->second.path); + LOG_WARNING(log, "Removing changelog {}, because it's goes after broken changelog entry", itr->second.path); std::filesystem::remove(itr->second.path); itr = existing_changelogs.erase(itr); } diff --git a/src/Coordination/Changelog.h b/src/Coordination/Changelog.h index 38679d604de..f758edc27ed 100644 --- a/src/Coordination/Changelog.h +++ b/src/Coordination/Changelog.h @@ -56,7 +56,7 @@ struct ChangelogFileDescription class ChangelogWriter; -/// Simpliest changelog with files rotation. +/// Simplest changelog with files rotation. /// No compression, no metadata, just entries with headers one by one /// Able to read broken files/entries and discard them. class Changelog From 863c0992540c68b781b393a35d8c8f47dddbdd20 Mon Sep 17 00:00:00 2001 From: Alexander Tokmakov Date: Sat, 20 Feb 2021 15:56:28 +0300 Subject: [PATCH 262/381] fix --- docker/test/fasttest/run.sh | 2 +- src/Databases/DatabaseReplicatedWorker.h | 2 +- ...ference => 01541_max_memory_usage_for_user_long.reference} | 0 ...ge_for_user.sh => 01541_max_memory_usage_for_user_long.sh} | 0 tests/queries/skip_list.json | 4 ++-- 5 files changed, 4 insertions(+), 4 deletions(-) rename tests/queries/0_stateless/{01541_max_memory_usage_for_user.reference => 01541_max_memory_usage_for_user_long.reference} (100%) rename tests/queries/0_stateless/{01541_max_memory_usage_for_user.sh => 01541_max_memory_usage_for_user_long.sh} (100%) diff --git a/docker/test/fasttest/run.sh b/docker/test/fasttest/run.sh index 1c5f62a9e46..c9c8cb1382d 100755 --- a/docker/test/fasttest/run.sh +++ b/docker/test/fasttest/run.sh @@ -326,7 +326,7 @@ function run_tests # Look at DistributedFilesToInsert, so cannot run in parallel. 01460_DistributedFilesToInsert - 01541_max_memory_usage_for_user + 01541_max_memory_usage_for_user_long # Require python libraries like scipy, pandas and numpy 01322_ttest_scipy diff --git a/src/Databases/DatabaseReplicatedWorker.h b/src/Databases/DatabaseReplicatedWorker.h index 6dd8dc408d7..6ba46a98bca 100644 --- a/src/Databases/DatabaseReplicatedWorker.h +++ b/src/Databases/DatabaseReplicatedWorker.h @@ -14,7 +14,7 @@ class DatabaseReplicated; /// 3. After creation of an entry in DDL queue initiator tries to execute the entry locally /// and other hosts wait for query to finish on initiator host. /// If query succeed on initiator, then all hosts must execute it, so they will retry until query succeed. 
-/// We assume that cluster is homogenous, so if replicas are in consistent state and query succeed on one host, +/// We assume that cluster is homogeneous, so if replicas are in consistent state and query succeed on one host, /// then all hosts can execute it (maybe after several retries). /// 4. Each database replica stores its log pointer in ZooKeeper. Cleanup thread removes old entry /// if its number < max_log_ptr - logs_to_keep. diff --git a/tests/queries/0_stateless/01541_max_memory_usage_for_user.reference b/tests/queries/0_stateless/01541_max_memory_usage_for_user_long.reference similarity index 100% rename from tests/queries/0_stateless/01541_max_memory_usage_for_user.reference rename to tests/queries/0_stateless/01541_max_memory_usage_for_user_long.reference diff --git a/tests/queries/0_stateless/01541_max_memory_usage_for_user.sh b/tests/queries/0_stateless/01541_max_memory_usage_for_user_long.sh similarity index 100% rename from tests/queries/0_stateless/01541_max_memory_usage_for_user.sh rename to tests/queries/0_stateless/01541_max_memory_usage_for_user_long.sh diff --git a/tests/queries/skip_list.json b/tests/queries/skip_list.json index e6bb3747fb0..77c4d487082 100644 --- a/tests/queries/skip_list.json +++ b/tests/queries/skip_list.json @@ -440,7 +440,7 @@ "01530_drop_database_atomic_sync", "01532_execute_merges_on_single_replica", "01532_primary_key_without_order_by_zookeeper", - "01541_max_memory_usage_for_user", + "01541_max_memory_usage_for_user_long", "01551_mergetree_read_in_order_spread", "01552_dict_fixedstring", "01554_bloom_filter_index_big_integer_uuid", @@ -717,7 +717,7 @@ "01527_clickhouse_local_optimize", "01527_dist_sharding_key_dictGet_reload", "01530_drop_database_atomic_sync", - "01541_max_memory_usage_for_user", + "01541_max_memory_usage_for_user_long", "01542_dictionary_load_exception_race", "01575_disable_detach_table_of_dictionary", "01593_concurrent_alter_mutations_kill", From d723f25fbd6474675d4e846c8a187418a540153a Mon Sep 17 00:00:00 2001 From: Evgeniia Sudarikova Date: Sat, 20 Feb 2021 16:51:31 +0300 Subject: [PATCH 263/381] delete extra text --- docs/en/sql-reference/functions/array-functions.md | 1 - 1 file changed, 1 deletion(-) diff --git a/docs/en/sql-reference/functions/array-functions.md b/docs/en/sql-reference/functions/array-functions.md index 531c5e5be49..c9c418d57a4 100644 --- a/docs/en/sql-reference/functions/array-functions.md +++ b/docs/en/sql-reference/functions/array-functions.md @@ -1378,7 +1378,6 @@ SELECT arrayMax([1, 2, 4]) AS res; Result: -``` text ```text ┌─res─┐ │ 4 │ From 6cc2fb5e9f32517ebd29104c56f3ee07517f462d Mon Sep 17 00:00:00 2001 From: vdimir Date: Sat, 20 Feb 2021 18:00:59 +0300 Subject: [PATCH 264/381] Try to fix race in storage join: block parralel inserts --- src/Functions/FunctionJoinGet.cpp | 8 ++++---- src/Functions/FunctionJoinGet.h | 15 +++++++-------- src/Interpreters/HashJoin.cpp | 11 +---------- src/Interpreters/HashJoin.h | 9 ++++++++- src/Storages/StorageJoin.cpp | 10 +++++++--- src/Storages/StorageJoin.h | 18 ++++++++++++++---- 6 files changed, 41 insertions(+), 30 deletions(-) diff --git a/src/Functions/FunctionJoinGet.cpp b/src/Functions/FunctionJoinGet.cpp index 6b15bf821b2..3a2649c11a8 100644 --- a/src/Functions/FunctionJoinGet.cpp +++ b/src/Functions/FunctionJoinGet.cpp @@ -25,7 +25,7 @@ ColumnPtr ExecutableFunctionJoinGet::execute(const ColumnsWithTypeAndNa auto key = arguments[i]; keys.emplace_back(std::move(key)); } - return join->joinGet(keys, result_columns).column; + return 
join->join->joinGet(keys, result_columns).column; } template @@ -87,13 +87,13 @@ FunctionBaseImplPtr JoinGetOverloadResolver::build(const ColumnsWithTyp + ", should be greater or equal to 3", ErrorCodes::NUMBER_OF_ARGUMENTS_DOESNT_MATCH); auto [storage_join, attr_name] = getJoin(arguments, context); - auto join = storage_join->getJoin(); + auto join_holder = storage_join->getJoin(); DataTypes data_types(arguments.size() - 2); for (size_t i = 2; i < arguments.size(); ++i) data_types[i - 2] = arguments[i].type; - auto return_type = join->joinGetCheckAndGetReturnType(data_types, attr_name, or_null); + auto return_type = join_holder->join->joinGetCheckAndGetReturnType(data_types, attr_name, or_null); auto table_lock = storage_join->lockForShare(context.getInitialQueryId(), context.getSettingsRef().lock_acquire_timeout); - return std::make_unique>(table_lock, storage_join, join, attr_name, data_types, return_type); + return std::make_unique>(table_lock, join_holder, attr_name, data_types, return_type); } void registerFunctionJoinGet(FunctionFactory & factory) diff --git a/src/Functions/FunctionJoinGet.h b/src/Functions/FunctionJoinGet.h index 27f348e9698..820c6cd3fa2 100644 --- a/src/Functions/FunctionJoinGet.h +++ b/src/Functions/FunctionJoinGet.h @@ -9,13 +9,14 @@ namespace DB class Context; class HashJoin; +class HashJoinHolder; using HashJoinPtr = std::shared_ptr; template class ExecutableFunctionJoinGet final : public IExecutableFunctionImpl { public: - ExecutableFunctionJoinGet(HashJoinPtr join_, const DB::Block & result_columns_) + ExecutableFunctionJoinGet(std::shared_ptr join_, const DB::Block & result_columns_) : join(std::move(join_)), result_columns(result_columns_) {} static constexpr auto name = or_null ? "joinGetOrNull" : "joinGet"; @@ -29,7 +30,7 @@ public: String getName() const override { return name; } private: - HashJoinPtr join; + std::shared_ptr join; DB::Block result_columns; }; @@ -39,12 +40,11 @@ class FunctionJoinGet final : public IFunctionBaseImpl public: static constexpr auto name = or_null ? 
"joinGetOrNull" : "joinGet"; - FunctionJoinGet(TableLockHolder table_lock_, StoragePtr storage_join_, - HashJoinPtr join_, String attr_name_, + FunctionJoinGet(TableLockHolder table_lock_, + std::shared_ptr join_, String attr_name_, DataTypes argument_types_, DataTypePtr return_type_) : table_lock(std::move(table_lock_)) - , storage_join(std::move(storage_join_)) - , join(std::move(join_)) + , join(join_) , attr_name(std::move(attr_name_)) , argument_types(std::move(argument_types_)) , return_type(std::move(return_type_)) @@ -60,8 +60,7 @@ public: private: TableLockHolder table_lock; - StoragePtr storage_join; - HashJoinPtr join; + std::shared_ptr join; const String attr_name; DataTypes argument_types; DataTypePtr return_type; diff --git a/src/Interpreters/HashJoin.cpp b/src/Interpreters/HashJoin.cpp index 5c50b53e2ca..cd158241860 100644 --- a/src/Interpreters/HashJoin.cpp +++ b/src/Interpreters/HashJoin.cpp @@ -423,19 +423,16 @@ bool HashJoin::empty() const size_t HashJoin::getTotalByteCount() const { - std::shared_lock lock(data->rwlock); return getTotalByteCountLocked(); } size_t HashJoin::getTotalRowCount() const { - std::shared_lock lock(data->rwlock); return getTotalRowCountLocked(); } bool HashJoin::alwaysReturnsEmptySet() const { - std::shared_lock lock(data->rwlock); return isInnerOrRight(getKind()) && data->empty && !overDictionary(); } @@ -652,7 +649,7 @@ bool HashJoin::addJoinedBlock(const Block & source_block, bool check_limits) size_t total_bytes = 0; { - std::unique_lock lock(data->rwlock); + assert(storage_join_lock.mutex() == nullptr); data->blocks.emplace_back(std::move(structured_block)); Block * stored_block = &data->blocks.back(); @@ -1219,8 +1216,6 @@ void HashJoin::joinBlockImplCross(Block & block, ExtraBlockPtr & not_processed) DataTypePtr HashJoin::joinGetCheckAndGetReturnType(const DataTypes & data_types, const String & column_name, bool or_null) const { - std::shared_lock lock(data->rwlock); - size_t num_keys = data_types.size(); if (right_table_keys.columns() != num_keys) throw Exception( @@ -1273,8 +1268,6 @@ ColumnWithTypeAndName HashJoin::joinGetImpl(const Block & block, const Block & b // TODO: return array of values when strictness == ASTTableJoin::Strictness::All ColumnWithTypeAndName HashJoin::joinGet(const Block & block, const Block & block_with_columns_to_add) const { - std::shared_lock lock(data->rwlock); - if ((strictness == ASTTableJoin::Strictness::Any || strictness == ASTTableJoin::Strictness::RightAny) && kind == ASTTableJoin::Kind::Left) { @@ -1287,8 +1280,6 @@ ColumnWithTypeAndName HashJoin::joinGet(const Block & block, const Block & block void HashJoin::joinBlock(Block & block, ExtraBlockPtr & not_processed) { - std::shared_lock lock(data->rwlock); - const Names & key_names_left = table_join->keyNamesLeft(); JoinCommon::checkTypesOfKeys(block, key_names_left, right_table_keys, key_names_right); diff --git a/src/Interpreters/HashJoin.h b/src/Interpreters/HashJoin.h index 06ce7559f31..06e07dc10dd 100644 --- a/src/Interpreters/HashJoin.h +++ b/src/Interpreters/HashJoin.h @@ -308,7 +308,7 @@ public: { /// Protect state for concurrent use in insertFromBlock and joinBlock. /// @note that these methods could be called simultaneously only while use of StorageJoin. 
- mutable std::shared_mutex rwlock; +// mutable std::shared_mutex rwlock; Type type = Type::EMPTY; bool empty = true; @@ -322,6 +322,11 @@ public: Arena pool; }; + void setLock(std::shared_mutex & rwlock) + { + storage_join_lock = std::shared_lock(rwlock); + } + void reuseJoinedData(const HashJoin & join); std::shared_ptr getJoinedData() const @@ -371,6 +376,8 @@ private: Block totals; + std::shared_lock storage_join_lock; + void init(Type type_); const Block & savedBlockSample() const { return data->sample_block; } diff --git a/src/Storages/StorageJoin.cpp b/src/Storages/StorageJoin.cpp index 8d4f0b3b3be..f130316566f 100644 --- a/src/Storages/StorageJoin.cpp +++ b/src/Storages/StorageJoin.cpp @@ -97,11 +97,17 @@ HashJoinPtr StorageJoin::getJoin(std::shared_ptr analyzed_join) const HashJoinPtr join_clone = std::make_shared(analyzed_join, metadata_snapshot->getSampleBlock().sortColumns()); join_clone->reuseJoinedData(*join); + join_clone->setLock(rwlock); + return join_clone; } -void StorageJoin::insertBlock(const Block & block) { join->addJoinedBlock(block, true); } +void StorageJoin::insertBlock(const Block & block) +{ + std::unique_lock lock(rwlock); + join->addJoinedBlock(block, true); +} size_t StorageJoin::getSize() const { return join->getTotalRowCount(); } std::optional StorageJoin::totalRows(const Settings &) const { return join->getTotalRowCount(); } @@ -267,7 +273,6 @@ public: JoinSource(const HashJoin & parent_, UInt64 max_block_size_, Block sample_block_) : SourceWithProgress(sample_block_) , parent(parent_) - , lock(parent.data->rwlock) , max_block_size(max_block_size_) , sample_block(std::move(sample_block_)) { @@ -312,7 +317,6 @@ protected: private: const HashJoin & parent; - std::shared_lock lock; UInt64 max_block_size; Block sample_block; Block restored_block; /// sample_block with parent column types diff --git a/src/Storages/StorageJoin.h b/src/Storages/StorageJoin.h index c453c036b65..6d3ec2710c9 100644 --- a/src/Storages/StorageJoin.h +++ b/src/Storages/StorageJoin.h @@ -14,6 +14,18 @@ class TableJoin; class HashJoin; using HashJoinPtr = std::shared_ptr; +class HashJoinHolder +{ + std::shared_lock lock; +public: + HashJoinPtr join; + + HashJoinHolder(std::shared_mutex & rwlock, HashJoinPtr join_) + : lock(rwlock) + , join(join_) + { + } +}; /** Allows you save the state for later use on the right side of the JOIN. * When inserted into a table, the data will be inserted into the state, @@ -31,12 +43,9 @@ public: void truncate(const ASTPtr &, const StorageMetadataPtr & metadata_snapshot, const Context &, TableExclusiveLockHolder &) override; /// Access the innards. - HashJoinPtr & getJoin() { return join; } + std::shared_ptr getJoin() { return std::make_shared(rwlock, join); } HashJoinPtr getJoin(std::shared_ptr analyzed_join) const; - /// Verify that the data structure is suitable for implementing this type of JOIN. 
- void assertCompatible(ASTTableJoin::Kind kind_, ASTTableJoin::Strictness strictness_) const; - Pipe read( const Names & column_names, const StorageMetadataPtr & /*metadata_snapshot*/, @@ -60,6 +69,7 @@ private: std::shared_ptr table_join; HashJoinPtr join; + mutable std::shared_mutex rwlock; void insertBlock(const Block & block) override; void finishInsert() override {} From 0c2cf3cf30b707fdf46c88760c931c194a086d2d Mon Sep 17 00:00:00 2001 From: alesapin Date: Sat, 20 Feb 2021 18:36:56 +0300 Subject: [PATCH 265/381] Calculate checksum with siphash --- src/Coordination/Changelog.cpp | 51 ++++++++++++++++++++++------------ src/Coordination/Changelog.h | 9 +++--- 2 files changed, 38 insertions(+), 22 deletions(-) diff --git a/src/Coordination/Changelog.cpp b/src/Coordination/Changelog.cpp index efb0f2798e2..adf367c565d 100644 --- a/src/Coordination/Changelog.cpp +++ b/src/Coordination/Changelog.cpp @@ -7,6 +7,7 @@ #include #include #include +#include #include namespace DB @@ -56,13 +57,15 @@ LogEntryPtr makeClone(const LogEntryPtr & entry) Checksum computeRecordChecksum(const ChangelogRecord & record) { - const auto * header_start = reinterpret_cast(&record.header); - auto sum = CityHash_v1_0_2::CityHash128(header_start, sizeof(record.header)); - + SipHash hash; + hash.update(record.header.version); + hash.update(record.header.index); + hash.update(record.header.term); + hash.update(record.header.value_type); + hash.update(record.header.blob_size); if (record.header.blob_size != 0) - sum = CityHash_v1_0_2::CityHash128WithSeed(reinterpret_cast(record.blob->data_begin()), record.header.blob_size, sum); - - return sum; + hash.update(reinterpret_cast(record.blob->data_begin()), record.blob->size()); + return hash.get64(); } } @@ -82,7 +85,11 @@ public: off_t result = plain_buf.count(); writeIntBinary(computeRecordChecksum(record), plain_buf); - writePODBinary(record.header, plain_buf); + writeIntBinary(record.header.version, plain_buf); + writeIntBinary(record.header.index, plain_buf); + writeIntBinary(record.header.term, plain_buf); + writeIntBinary(record.header.value_type, plain_buf); + writeIntBinary(record.header.blob_size, plain_buf); if (record.header.blob_size != 0) plain_buf.write(reinterpret_cast(record.blob->data_begin()), record.blob->size()); @@ -160,8 +167,14 @@ public: Checksum record_checksum; readIntBinary(record_checksum, read_buf); + /// Initialization is required, otherwise checksums may fail ChangelogRecord record; - readPODBinary(record.header, read_buf); + readIntBinary(record.header.version, read_buf); + readIntBinary(record.header.index, read_buf); + readIntBinary(record.header.term, read_buf); + readIntBinary(record.header.value_type, read_buf); + readIntBinary(record.header.blob_size, read_buf); + if (record.header.version > CURRENT_CHANGELOG_VERSION) throw Exception(ErrorCodes::UNKNOWN_FORMAT_VERSION, "Unsupported changelog version {} on path {}", record.header.version, filepath); @@ -248,7 +261,7 @@ void Changelog::readChangelogAndInitWriter(size_t from_log_index) size_t incomplete_log_index = 0; ChangelogReadResult result{}; - for (const auto & [start_index, changelog_description] : existing_changelogs) + for (const auto & [changelog_start_index, changelog_description] : existing_changelogs) { entries_in_last = changelog_description.to_log_index - changelog_description.from_log_index + 1; @@ -261,7 +274,7 @@ void Changelog::readChangelogAndInitWriter(size_t from_log_index) /// May happen after truncate, crash or simply unfinished log if (result.entries_read < 
entries_in_last) { - incomplete_log_index = start_index; + incomplete_log_index = changelog_start_index; break; } } @@ -319,18 +332,20 @@ void Changelog::rotate(size_t new_start_log_index) ChangelogRecord Changelog::buildRecord(size_t index, const LogEntryPtr & log_entry) { - ChangelogRecordHeader header; - header.version = ChangelogVersion::V0; - header.index = index; - header.term = log_entry->get_term(); - header.value_type = log_entry->get_val_type(); + ChangelogRecord record; + record.header.version = ChangelogVersion::V0; + record.header.index = index; + record.header.term = log_entry->get_term(); + record.header.value_type = log_entry->get_val_type(); auto buffer = log_entry->get_buf_ptr(); if (buffer) - header.blob_size = buffer->size(); + record.header.blob_size = buffer->size(); else - header.blob_size = 0; + record.header.blob_size = 0; - return ChangelogRecord{header, buffer}; + record.blob = buffer; + + return record; } void Changelog::appendEntry(size_t index, const LogEntryPtr & log_entry, bool force_sync) diff --git a/src/Coordination/Changelog.h b/src/Coordination/Changelog.h index f758edc27ed..0f67c2a9a7d 100644 --- a/src/Coordination/Changelog.h +++ b/src/Coordination/Changelog.h @@ -10,7 +10,7 @@ namespace DB { -using Checksum = CityHash_v1_0_2::uint128; +using Checksum = UInt64; using LogEntryPtr = nuraft::ptr; using LogEntries = std::vector; @@ -27,7 +27,7 @@ enum class ChangelogVersion : uint8_t static constexpr auto CURRENT_CHANGELOG_VERSION = ChangelogVersion::V0; -struct __attribute__((__packed__)) ChangelogRecordHeader +struct ChangelogRecordHeader { ChangelogVersion version = CURRENT_CHANGELOG_VERSION; size_t index; /// entry log number @@ -115,12 +115,13 @@ public: ~Changelog(); private: + /// Pack log_entry into changelog record + static ChangelogRecord buildRecord(size_t index, const LogEntryPtr & log_entry); /// Starts new file [new_start_log_index, new_start_log_index + rotate_interval] void rotate(size_t new_start_log_index); - /// Pack log_entry into changelog record - static ChangelogRecord buildRecord(size_t index, const LogEntryPtr & log_entry); + private: const std::string changelogs_dir; From 9f520f42c117e33ad107f9ea33465e11e2cf26e3 Mon Sep 17 00:00:00 2001 From: alesapin Date: Sat, 20 Feb 2021 18:56:55 +0300 Subject: [PATCH 266/381] Fix style --- src/Coordination/Changelog.h | 2 -- 1 file changed, 2 deletions(-) diff --git a/src/Coordination/Changelog.h b/src/Coordination/Changelog.h index 0f67c2a9a7d..be38915066d 100644 --- a/src/Coordination/Changelog.h +++ b/src/Coordination/Changelog.h @@ -121,8 +121,6 @@ private: /// Starts new file [new_start_log_index, new_start_log_index + rotate_interval] void rotate(size_t new_start_log_index); - - private: const std::string changelogs_dir; const size_t rotate_interval; From 48e188681c88b88c11924f98976993d500fbb1d4 Mon Sep 17 00:00:00 2001 From: Anton Popov Date: Sat, 20 Feb 2021 16:05:33 +0300 Subject: [PATCH 267/381] do not start mutation for alters with wrong type conversion --- src/Storages/MergeTree/MergeTreeData.cpp | 23 ++++++++++++++----- .../01732_alters_bad_conversions.reference | 4 ++++ .../01732_alters_bad_conversions.sql | 17 ++++++++++++++ 3 files changed, 38 insertions(+), 6 deletions(-) create mode 100644 tests/queries/0_stateless/01732_alters_bad_conversions.reference create mode 100644 tests/queries/0_stateless/01732_alters_bad_conversions.sql diff --git a/src/Storages/MergeTree/MergeTreeData.cpp b/src/Storages/MergeTree/MergeTreeData.cpp index a0d23b8ab22..b09f068f509 100644 --- 
a/src/Storages/MergeTree/MergeTreeData.cpp +++ b/src/Storages/MergeTree/MergeTreeData.cpp @@ -21,6 +21,7 @@ #include #include #include +#include #include #include #include @@ -1482,6 +1483,7 @@ void MergeTreeData::checkAlterIsPossible(const AlterCommands & commands, const S for (const auto & column : old_metadata.getColumns().getAllPhysical()) old_types.emplace(column.name, column.type.get()); + NamesAndTypesList columns_to_check_conversion; for (const AlterCommand & command : commands) { /// Just validate partition expression @@ -1571,9 +1573,9 @@ void MergeTreeData::checkAlterIsPossible(const AlterCommands & commands, const S throw Exception("ALTER of key column " + backQuoteIfNeed(command.column_name) + " is forbidden", ErrorCodes::ALTER_OF_COLUMN_IS_FORBIDDEN); - if (columns_alter_type_check_safe_for_partition.count(command.column_name)) + if (command.type == AlterCommand::MODIFY_COLUMN) { - if (command.type == AlterCommand::MODIFY_COLUMN) + if (columns_alter_type_check_safe_for_partition.count(command.column_name)) { auto it = old_types.find(command.column_name); @@ -1584,11 +1586,8 @@ void MergeTreeData::checkAlterIsPossible(const AlterCommands & commands, const S + " is not safe because it can change the representation of partition key", ErrorCodes::ALTER_OF_COLUMN_IS_FORBIDDEN); } - } - if (columns_alter_type_metadata_only.count(command.column_name)) - { - if (command.type == AlterCommand::MODIFY_COLUMN) + if (columns_alter_type_metadata_only.count(command.column_name)) { auto it = old_types.find(command.column_name); assert(it != old_types.end()); @@ -1598,6 +1597,12 @@ void MergeTreeData::checkAlterIsPossible(const AlterCommands & commands, const S + " is not safe because it can change the representation of primary key", ErrorCodes::ALTER_OF_COLUMN_IS_FORBIDDEN); } + + if (old_metadata.getColumns().has(command.column_name)) + { + columns_to_check_conversion.push_back( + new_metadata.getColumns().getPhysical(command.column_name)); + } } } } @@ -1605,6 +1610,12 @@ void MergeTreeData::checkAlterIsPossible(const AlterCommands & commands, const S checkProperties(new_metadata, old_metadata); checkTTLExpressions(new_metadata, old_metadata); + if (!columns_to_check_conversion.empty()) + { + auto old_header = old_metadata.getSampleBlock(); + performRequiredConversions(old_header, columns_to_check_conversion, global_context); + } + if (old_metadata.hasSettingsChanges()) { const auto current_changes = old_metadata.getSettingsChanges()->as().changes; diff --git a/tests/queries/0_stateless/01732_alters_bad_conversions.reference b/tests/queries/0_stateless/01732_alters_bad_conversions.reference new file mode 100644 index 00000000000..5f570c78579 --- /dev/null +++ b/tests/queries/0_stateless/01732_alters_bad_conversions.reference @@ -0,0 +1,4 @@ +CREATE TABLE default.bad_conversions\n(\n `a` UInt32\n)\nENGINE = MergeTree\nORDER BY tuple()\nSETTINGS index_granularity = 8192 +0 +CREATE TABLE default.bad_conversions_2\n(\n `e` Enum8(\'foo\' = 1, \'bar\' = 2)\n)\nENGINE = MergeTree\nORDER BY tuple()\nSETTINGS index_granularity = 8192 +0 diff --git a/tests/queries/0_stateless/01732_alters_bad_conversions.sql b/tests/queries/0_stateless/01732_alters_bad_conversions.sql new file mode 100644 index 00000000000..27da5242368 --- /dev/null +++ b/tests/queries/0_stateless/01732_alters_bad_conversions.sql @@ -0,0 +1,17 @@ +DROP TABLE IF EXISTS bad_conversions; +DROP TABLE IF EXISTS bad_conversions_2; + +CREATE TABLE bad_conversions (a UInt32) ENGINE = MergeTree ORDER BY tuple(); +INSERT INTO bad_conversions 
VALUES (1); +ALTER TABLE bad_conversions MODIFY COLUMN a Array(String); -- { serverError 53 } +SHOW CREATE TABLE bad_conversions; +SELECT count() FROM system.mutations WHERE table = 'bad_conversions' AND database = currentDatabase(); + +CREATE TABLE bad_conversions_2 (e Enum('foo' = 1, 'bar' = 2)) ENGINE = MergeTree ORDER BY tuple(); +INSERT INTO bad_conversions_2 VALUES (1); +ALTER TABLE bad_conversions_2 MODIFY COLUMN e Enum('bar' = 1, 'foo' = 2); -- { serverError 70 } +SHOW CREATE TABLE bad_conversions_2; +SELECT count() FROM system.mutations WHERE table = 'bad_conversions_2' AND database = currentDatabase(); + +DROP TABLE IF EXISTS bad_conversions; +DROP TABLE IF EXISTS bad_conversions_2; From fe159de141bd47ae1915fea24ad520d71ae6a9a3 Mon Sep 17 00:00:00 2001 From: robot-clickhouse Date: Sat, 20 Feb 2021 19:30:27 +0300 Subject: [PATCH 268/381] Update version_date.tsv after release 21.2.4.6 --- utils/list-versions/version_date.tsv | 1 + 1 file changed, 1 insertion(+) diff --git a/utils/list-versions/version_date.tsv b/utils/list-versions/version_date.tsv index d0d782e77ec..f7035ebb506 100644 --- a/utils/list-versions/version_date.tsv +++ b/utils/list-versions/version_date.tsv @@ -1,3 +1,4 @@ +v21.2.4.6-stable 2021-02-20 v21.2.3.15-stable 2021-02-14 v21.2.2.8-stable 2021-02-07 v21.1.4.46-stable 2021-02-14 From 4fa822dd287cb699e170da2941effb3c89c7f0ea Mon Sep 17 00:00:00 2001 From: robot-clickhouse Date: Sat, 20 Feb 2021 20:21:55 +0300 Subject: [PATCH 269/381] Update version_date.tsv after release 21.1.5.4 --- utils/list-versions/version_date.tsv | 1 + 1 file changed, 1 insertion(+) diff --git a/utils/list-versions/version_date.tsv b/utils/list-versions/version_date.tsv index f7035ebb506..1ccf3c66580 100644 --- a/utils/list-versions/version_date.tsv +++ b/utils/list-versions/version_date.tsv @@ -1,6 +1,7 @@ v21.2.4.6-stable 2021-02-20 v21.2.3.15-stable 2021-02-14 v21.2.2.8-stable 2021-02-07 +v21.1.5.4-stable 2021-02-20 v21.1.4.46-stable 2021-02-14 v21.1.3.32-stable 2021-02-03 v21.1.2.15-stable 2021-01-18 From e49d90405cac621c35698443d69b8a2de887a9da Mon Sep 17 00:00:00 2001 From: robot-clickhouse Date: Sat, 20 Feb 2021 20:39:18 +0300 Subject: [PATCH 270/381] Update version_date.tsv after release 20.12.7.3 --- utils/list-versions/version_date.tsv | 1 + 1 file changed, 1 insertion(+) diff --git a/utils/list-versions/version_date.tsv b/utils/list-versions/version_date.tsv index 1ccf3c66580..b0abdaab087 100644 --- a/utils/list-versions/version_date.tsv +++ b/utils/list-versions/version_date.tsv @@ -5,6 +5,7 @@ v21.1.5.4-stable 2021-02-20 v21.1.4.46-stable 2021-02-14 v21.1.3.32-stable 2021-02-03 v21.1.2.15-stable 2021-01-18 +v20.12.7.3-stable 2021-02-20 v20.12.6.29-stable 2021-02-14 v20.12.5.18-stable 2021-02-03 v20.12.5.14-stable 2020-12-28 From 64e76a4a8da87adb374ffeb571fe76eac4850ae8 Mon Sep 17 00:00:00 2001 From: Alexey Milovidov Date: Sat, 20 Feb 2021 21:13:36 +0300 Subject: [PATCH 271/381] Minor changes in Decimal --- src/Core/DecimalComparison.h | 2 +- src/Core/DecimalFunctions.h | 24 +++++++++---------- src/Core/MySQL/MySQLReplication.cpp | 6 ++--- src/DataTypes/DataTypeDateTime64.cpp | 4 ++-- src/DataTypes/DataTypeDecimalBase.h | 10 ++++---- src/DataTypes/DataTypesDecimal.cpp | 2 +- src/DataTypes/DataTypesDecimal.h | 2 +- src/DataTypes/convertMySQLDataType.cpp | 6 ++--- .../fetchPostgreSQLTableStructure.cpp | 8 +++---- src/Functions/array/arrayAggregation.cpp | 2 +- src/Functions/array/arrayCumSum.cpp | 2 +- .../array/arrayCumSumNonNegative.cpp | 2 +- 
src/Functions/isDecimalOverflow.cpp | 2 +- src/IO/WriteHelpers.h | 20 ++++++++-------- 14 files changed, 46 insertions(+), 46 deletions(-) diff --git a/src/Core/DecimalComparison.h b/src/Core/DecimalComparison.h index 8279d01d35a..486c2c1f8f4 100644 --- a/src/Core/DecimalComparison.h +++ b/src/Core/DecimalComparison.h @@ -78,7 +78,7 @@ public: static bool compare(A a, B b, UInt32 scale_a, UInt32 scale_b) { - static const UInt32 max_scale = DecimalUtils::maxPrecision(); + static const UInt32 max_scale = DecimalUtils::max_precision; if (scale_a > max_scale || scale_b > max_scale) throw Exception("Bad scale of decimal field", ErrorCodes::DECIMAL_OVERFLOW); diff --git a/src/Core/DecimalFunctions.h b/src/Core/DecimalFunctions.h index 2b916cbf538..355cf1d378a 100644 --- a/src/Core/DecimalFunctions.h +++ b/src/Core/DecimalFunctions.h @@ -24,13 +24,13 @@ namespace ErrorCodes namespace DecimalUtils { -static constexpr size_t minPrecision() { return 1; } -template static constexpr size_t maxPrecision() { return 0; } -template <> constexpr size_t maxPrecision() { return 9; } -template <> constexpr size_t maxPrecision() { return 18; } -template <> constexpr size_t maxPrecision() { return 18; } -template <> constexpr size_t maxPrecision() { return 38; } -template <> constexpr size_t maxPrecision() { return 76; } +inline constexpr size_t min_precision = 1; +template inline constexpr size_t max_precision = 0; +template <> inline constexpr size_t max_precision = 9; +template <> inline constexpr size_t max_precision = 18; +template <> inline constexpr size_t max_precision = 18; +template <> inline constexpr size_t max_precision = 38; +template <> inline constexpr size_t max_precision = 76; template inline auto scaleMultiplier(UInt32 scale) @@ -87,7 +87,7 @@ struct DataTypeDecimalTrait * * Sign of `whole` controls sign of result: negative whole => negative result, positive whole => positive result. * Sign of `fractional` is expected to be positive, otherwise result is undefined. - * If `scale` is to big (scale > maxPrecision), result is undefined. + * If `scale` is to big (scale > max_precision), result is undefined. */ template inline DecimalType decimalFromComponentsWithMultiplier( @@ -287,21 +287,21 @@ inline auto binaryOpResult(const DecimalType & tx, const DecimalType & ty) scale = (tx.getScale() > ty.getScale() ? 
tx.getScale() : ty.getScale()); if constexpr (sizeof(T) < sizeof(U)) - return DataTypeDecimalTrait(DecimalUtils::maxPrecision(), scale); + return DataTypeDecimalTrait(DecimalUtils::max_precision, scale); else - return DataTypeDecimalTrait(DecimalUtils::maxPrecision(), scale); + return DataTypeDecimalTrait(DecimalUtils::max_precision, scale); } template typename DecimalType> inline const DataTypeDecimalTrait binaryOpResult(const DecimalType & tx, const DataTypeNumber &) { - return DataTypeDecimalTrait(DecimalUtils::maxPrecision(), tx.getScale()); + return DataTypeDecimalTrait(DecimalUtils::max_precision, tx.getScale()); } template typename DecimalType> inline const DataTypeDecimalTrait binaryOpResult(const DataTypeNumber &, const DecimalType & ty) { - return DataTypeDecimalTrait(DecimalUtils::maxPrecision(), ty.getScale()); + return DataTypeDecimalTrait(DecimalUtils::max_precision, ty.getScale()); } } diff --git a/src/Core/MySQL/MySQLReplication.cpp b/src/Core/MySQL/MySQLReplication.cpp index 8fdf337c849..1b202c4edb4 100644 --- a/src/Core/MySQL/MySQLReplication.cpp +++ b/src/Core/MySQL/MySQLReplication.cpp @@ -475,11 +475,11 @@ namespace MySQLReplication { const auto & dispatch = [](const size_t & precision, const size_t & scale, const auto & function) -> Field { - if (precision <= DecimalUtils::maxPrecision()) + if (precision <= DecimalUtils::max_precision) return Field(function(precision, scale, Decimal32())); - else if (precision <= DecimalUtils::maxPrecision()) + else if (precision <= DecimalUtils::max_precision) return Field(function(precision, scale, Decimal64())); - else if (precision <= DecimalUtils::maxPrecision()) + else if (precision <= DecimalUtils::max_precision) return Field(function(precision, scale, Decimal128())); return Field(function(precision, scale, Decimal256())); diff --git a/src/DataTypes/DataTypeDateTime64.cpp b/src/DataTypes/DataTypeDateTime64.cpp index 09e39c2de1a..17b94e871bf 100644 --- a/src/DataTypes/DataTypeDateTime64.cpp +++ b/src/DataTypes/DataTypeDateTime64.cpp @@ -28,7 +28,7 @@ namespace ErrorCodes static constexpr UInt32 max_scale = 9; DataTypeDateTime64::DataTypeDateTime64(UInt32 scale_, const std::string & time_zone_name) - : DataTypeDecimalBase(DecimalUtils::maxPrecision(), scale_), + : DataTypeDecimalBase(DecimalUtils::max_precision, scale_), TimezoneMixin(time_zone_name) { if (scale > max_scale) @@ -37,7 +37,7 @@ DataTypeDateTime64::DataTypeDateTime64(UInt32 scale_, const std::string & time_z } DataTypeDateTime64::DataTypeDateTime64(UInt32 scale_, const TimezoneMixin & time_zone_info) - : DataTypeDecimalBase(DecimalUtils::maxPrecision(), scale_), + : DataTypeDecimalBase(DecimalUtils::max_precision, scale_), TimezoneMixin(time_zone_info) { if (scale > max_scale) diff --git a/src/DataTypes/DataTypeDecimalBase.h b/src/DataTypes/DataTypeDecimalBase.h index c861b3bcac0..d9079166fa7 100644 --- a/src/DataTypes/DataTypeDecimalBase.h +++ b/src/DataTypes/DataTypeDecimalBase.h @@ -65,7 +65,7 @@ public: static constexpr bool is_parametric = true; - static constexpr size_t maxPrecision() { return DecimalUtils::maxPrecision(); } + static constexpr size_t maxPrecision() { return DecimalUtils::max_precision; } DataTypeDecimalBase(UInt32 precision_, UInt32 scale_) : precision(precision_), @@ -197,17 +197,17 @@ inline const DecimalType decimalResultType(const DataTypeNumber & tx, cons template